mirror of https://github.com/huggingface/transformers.git (synced 2025-11-04 20:14:36 +08:00)

Compare commits

1 commit

torch_vers ... relative-p

| Author | SHA1 | Date |
|---|---|---|
|  | da72c2cdde |  |
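The hunks below all touch the project's CircleCI configuration and repeat the same two edits in every job: the deprecated `circleci/python:3.7` convenience image appears to be swapped for the next-generation `cimg/python:3.7.12` image, and the pip-cache key prefix is bumped from `v0.4` to `v0.5` so that caches built against the old image are not reused. The YAML below is a condensed sketch of that recurring pattern on a single job, reconstructed from the hunks; it is not the full job definition.

```yaml
# Condensed sketch of the per-job change (not the complete job definition).
run_tests_torch_and_tf:
    working_directory: ~/transformers
    docker:
        - image: cimg/python:3.7.12        # was: circleci/python:3.7
    steps:
        - checkout
        - restore_cache:
              keys:
                  - v0.5-torch_and_tf-{{ checksum "setup.py" }}   # was: v0.4-torch_and_tf-...
                  - v0.5-{{ checksum "setup.py" }}                # was: v0.4-...
```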
@@ -65,20 +65,19 @@ jobs:
 | 
			
		||||
    run_tests_torch_and_tf:
 | 
			
		||||
        working_directory: ~/transformers
 | 
			
		||||
        docker:
 | 
			
		||||
            - image: cimg/python:3.7.12
 | 
			
		||||
            - image: circleci/python:3.7
 | 
			
		||||
        environment:
 | 
			
		||||
            OMP_NUM_THREADS: 1
 | 
			
		||||
            RUN_PT_TF_CROSS_TESTS: yes
 | 
			
		||||
            TRANSFORMERS_IS_CI: yes
 | 
			
		||||
            PYTEST_TIMEOUT: 120
 | 
			
		||||
        resource_class: xlarge
 | 
			
		||||
        parallelism: 1
 | 
			
		||||
        steps:
 | 
			
		||||
            - checkout
 | 
			
		||||
            - restore_cache:
 | 
			
		||||
                  keys:
 | 
			
		||||
                      - v0.5-torch_and_tf-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.5-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-torch_and_tf-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-{{ checksum "setup.py" }}
 | 
			
		||||
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng git-lfs
 | 
			
		||||
            - run: git lfs install
 | 
			
		||||
            - run: pip install --upgrade pip
 | 
			
		||||
@@ -88,7 +87,7 @@ jobs:
 | 
			
		||||
            - run: pip install https://github.com/kpu/kenlm/archive/master.zip
 | 
			
		||||
            - run: pip install git+https://github.com/huggingface/accelerate
 | 
			
		||||
            - save_cache:
 | 
			
		||||
                key: v0.5-{{ checksum "setup.py" }}
 | 
			
		||||
                key: v0.4-{{ checksum "setup.py" }}
 | 
			
		||||
                paths:
 | 
			
		||||
                    - '~/.cache/pip'
 | 
			
		||||
            - run: python utils/tests_fetcher.py | tee test_preparation.txt
 | 
			
		||||
@@ -106,20 +105,19 @@ jobs:
 | 
			
		||||
    run_tests_torch_and_tf_all:
 | 
			
		||||
        working_directory: ~/transformers
 | 
			
		||||
        docker:
 | 
			
		||||
            - image: cimg/python:3.7.12
 | 
			
		||||
            - image: circleci/python:3.7
 | 
			
		||||
        environment:
 | 
			
		||||
            OMP_NUM_THREADS: 1
 | 
			
		||||
            RUN_PT_TF_CROSS_TESTS: yes
 | 
			
		||||
            TRANSFORMERS_IS_CI: yes
 | 
			
		||||
            PYTEST_TIMEOUT: 120
 | 
			
		||||
        resource_class: xlarge
 | 
			
		||||
        parallelism: 1
 | 
			
		||||
        steps:
 | 
			
		||||
            - checkout
 | 
			
		||||
            - restore_cache:
 | 
			
		||||
                  keys:
 | 
			
		||||
                      - v0.5-torch_and_tf-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.5-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-torch_and_tf-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-{{ checksum "setup.py" }}
 | 
			
		||||
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng git-lfs
 | 
			
		||||
            - run: git lfs install
 | 
			
		||||
            - run: pip install --upgrade pip
 | 
			
		||||
@@ -129,7 +127,7 @@ jobs:
 | 
			
		||||
            - run: pip install https://github.com/kpu/kenlm/archive/master.zip
 | 
			
		||||
            - run: pip install git+https://github.com/huggingface/accelerate
 | 
			
		||||
            - save_cache:
 | 
			
		||||
                key: v0.5-{{ checksum "setup.py" }}
 | 
			
		||||
                key: v0.4-{{ checksum "setup.py" }}
 | 
			
		||||
                paths:
 | 
			
		||||
                    - '~/.cache/pip'
 | 
			
		||||
            - run: |
 | 
			
		||||
@@ -142,20 +140,19 @@ jobs:
 | 
			
		||||
    run_tests_torch_and_flax:
 | 
			
		||||
        working_directory: ~/transformers
 | 
			
		||||
        docker:
 | 
			
		||||
            - image: cimg/python:3.7.12
 | 
			
		||||
            - image: circleci/python:3.7
 | 
			
		||||
        environment:
 | 
			
		||||
            OMP_NUM_THREADS: 1
 | 
			
		||||
            RUN_PT_FLAX_CROSS_TESTS: yes
 | 
			
		||||
            TRANSFORMERS_IS_CI: yes
 | 
			
		||||
            PYTEST_TIMEOUT: 120
 | 
			
		||||
        resource_class: xlarge
 | 
			
		||||
        parallelism: 1
 | 
			
		||||
        steps:
 | 
			
		||||
            - checkout
 | 
			
		||||
            - restore_cache:
 | 
			
		||||
                  keys:
 | 
			
		||||
                      - v0.5-torch_and_flax-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.5-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-torch_and_flax-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-{{ checksum "setup.py" }}
 | 
			
		||||
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
 | 
			
		||||
            - run: pip install --upgrade pip
 | 
			
		||||
            - run: pip install .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]
 | 
			
		||||
@@ -163,7 +160,7 @@ jobs:
 | 
			
		||||
            - run: pip install https://github.com/kpu/kenlm/archive/master.zip
 | 
			
		||||
            - run: pip install git+https://github.com/huggingface/accelerate
 | 
			
		||||
            - save_cache:
 | 
			
		||||
                key: v0.5-{{ checksum "setup.py" }}
 | 
			
		||||
                key: v0.4-{{ checksum "setup.py" }}
 | 
			
		||||
                paths:
 | 
			
		||||
                    - '~/.cache/pip'
 | 
			
		||||
            - run: python utils/tests_fetcher.py | tee test_preparation.txt
 | 
			
		||||
@@ -181,20 +178,19 @@ jobs:
 | 
			
		||||
    run_tests_torch_and_flax_all:
 | 
			
		||||
        working_directory: ~/transformers
 | 
			
		||||
        docker:
 | 
			
		||||
            - image: cimg/python:3.7.12
 | 
			
		||||
            - image: circleci/python:3.7
 | 
			
		||||
        environment:
 | 
			
		||||
            OMP_NUM_THREADS: 1
 | 
			
		||||
            RUN_PT_FLAX_CROSS_TESTS: yes
 | 
			
		||||
            TRANSFORMERS_IS_CI: yes
 | 
			
		||||
            PYTEST_TIMEOUT: 120
 | 
			
		||||
        resource_class: xlarge
 | 
			
		||||
        parallelism: 1
 | 
			
		||||
        steps:
 | 
			
		||||
            - checkout
 | 
			
		||||
            - restore_cache:
 | 
			
		||||
                  keys:
 | 
			
		||||
                      - v0.5-torch_and_flax-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.5-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-torch_and_flax-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-{{ checksum "setup.py" }}
 | 
			
		||||
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
 | 
			
		||||
            - run: pip install --upgrade pip
 | 
			
		||||
            - run: pip install .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]
 | 
			
		||||
@@ -202,7 +198,7 @@ jobs:
 | 
			
		||||
            - run: pip install https://github.com/kpu/kenlm/archive/master.zip
 | 
			
		||||
            - run: pip install git+https://github.com/huggingface/accelerate
 | 
			
		||||
            - save_cache:
 | 
			
		||||
                key: v0.5-{{ checksum "setup.py" }}
 | 
			
		||||
                key: v0.4-{{ checksum "setup.py" }}
 | 
			
		||||
                paths:
 | 
			
		||||
                    - '~/.cache/pip'
 | 
			
		||||
            - run: |
 | 
			
		||||
@@ -215,19 +211,18 @@ jobs:
 | 
			
		||||
    run_tests_torch:
 | 
			
		||||
        working_directory: ~/transformers
 | 
			
		||||
        docker:
 | 
			
		||||
            - image: cimg/python:3.7.12
 | 
			
		||||
            - image: circleci/python:3.7
 | 
			
		||||
        environment:
 | 
			
		||||
            OMP_NUM_THREADS: 1
 | 
			
		||||
            TRANSFORMERS_IS_CI: yes
 | 
			
		||||
            PYTEST_TIMEOUT: 120
 | 
			
		||||
        resource_class: xlarge
 | 
			
		||||
        parallelism: 1
 | 
			
		||||
        steps:
 | 
			
		||||
            - checkout
 | 
			
		||||
            - restore_cache:
 | 
			
		||||
                  keys:
 | 
			
		||||
                      - v0.5-torch-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.5-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-torch-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-{{ checksum "setup.py" }}
 | 
			
		||||
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng time
 | 
			
		||||
            - run: pip install --upgrade pip
 | 
			
		||||
            - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
 | 
			
		||||
@@ -235,7 +230,7 @@ jobs:
 | 
			
		||||
            - run: pip install https://github.com/kpu/kenlm/archive/master.zip
 | 
			
		||||
            - run: pip install git+https://github.com/huggingface/accelerate
 | 
			
		||||
            - save_cache:
 | 
			
		||||
                  key: v0.5-torch-{{ checksum "setup.py" }}
 | 
			
		||||
                  key: v0.4-torch-{{ checksum "setup.py" }}
 | 
			
		||||
                  paths:
 | 
			
		||||
                      - '~/.cache/pip'
 | 
			
		||||
            - run: python utils/tests_fetcher.py | tee test_preparation.txt
 | 
			
		||||
@@ -253,19 +248,18 @@ jobs:
 | 
			
		||||
    run_tests_torch_all:
 | 
			
		||||
        working_directory: ~/transformers
 | 
			
		||||
        docker:
 | 
			
		||||
            - image: cimg/python:3.7.12
 | 
			
		||||
            - image: circleci/python:3.7
 | 
			
		||||
        environment:
 | 
			
		||||
            OMP_NUM_THREADS: 1
 | 
			
		||||
            TRANSFORMERS_IS_CI: yes
 | 
			
		||||
            PYTEST_TIMEOUT: 120
 | 
			
		||||
        resource_class: xlarge
 | 
			
		||||
        parallelism: 1
 | 
			
		||||
        steps:
 | 
			
		||||
            - checkout
 | 
			
		||||
            - restore_cache:
 | 
			
		||||
                  keys:
 | 
			
		||||
                      - v0.5-torch-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.5-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-torch-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-{{ checksum "setup.py" }}
 | 
			
		||||
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
 | 
			
		||||
            - run: pip install --upgrade pip
 | 
			
		||||
            - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
 | 
			
		||||
@@ -273,7 +267,7 @@ jobs:
 | 
			
		||||
            - run: pip install https://github.com/kpu/kenlm/archive/master.zip
 | 
			
		||||
            - run: pip install git+https://github.com/huggingface/accelerate
 | 
			
		||||
            - save_cache:
 | 
			
		||||
                  key: v0.5-torch-{{ checksum "setup.py" }}
 | 
			
		||||
                  key: v0.4-torch-{{ checksum "setup.py" }}
 | 
			
		||||
                  paths:
 | 
			
		||||
                      - '~/.cache/pip'
 | 
			
		||||
            - run: |
 | 
			
		||||
@@ -286,26 +280,25 @@ jobs:
 | 
			
		||||
    run_tests_tf:
 | 
			
		||||
        working_directory: ~/transformers
 | 
			
		||||
        docker:
 | 
			
		||||
            - image: cimg/python:3.7.12
 | 
			
		||||
            - image: circleci/python:3.7
 | 
			
		||||
        environment:
 | 
			
		||||
            OMP_NUM_THREADS: 1
 | 
			
		||||
            TRANSFORMERS_IS_CI: yes
 | 
			
		||||
            PYTEST_TIMEOUT: 120
 | 
			
		||||
        resource_class: xlarge
 | 
			
		||||
        parallelism: 1
 | 
			
		||||
        steps:
 | 
			
		||||
            - checkout
 | 
			
		||||
            - restore_cache:
 | 
			
		||||
                  keys:
 | 
			
		||||
                      - v0.5-tf-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.5-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-tf-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-{{ checksum "setup.py" }}
 | 
			
		||||
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
 | 
			
		||||
            - run: pip install --upgrade pip
 | 
			
		||||
            - run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]
 | 
			
		||||
            - run: pip install tensorflow_probability
 | 
			
		||||
            - run: pip install https://github.com/kpu/kenlm/archive/master.zip
 | 
			
		||||
            - save_cache:
 | 
			
		||||
                  key: v0.5-tf-{{ checksum "setup.py" }}
 | 
			
		||||
                  key: v0.4-tf-{{ checksum "setup.py" }}
 | 
			
		||||
                  paths:
 | 
			
		||||
                      - '~/.cache/pip'
 | 
			
		||||
            - run: python utils/tests_fetcher.py | tee test_preparation.txt
 | 
			
		||||
@@ -323,26 +316,25 @@ jobs:
 | 
			
		||||
    run_tests_tf_all:
 | 
			
		||||
        working_directory: ~/transformers
 | 
			
		||||
        docker:
 | 
			
		||||
            - image: cimg/python:3.7.12
 | 
			
		||||
            - image: circleci/python:3.7
 | 
			
		||||
        environment:
 | 
			
		||||
            OMP_NUM_THREADS: 1
 | 
			
		||||
            TRANSFORMERS_IS_CI: yes
 | 
			
		||||
            PYTEST_TIMEOUT: 120
 | 
			
		||||
        resource_class: xlarge
 | 
			
		||||
        parallelism: 1
 | 
			
		||||
        steps:
 | 
			
		||||
            - checkout
 | 
			
		||||
            - restore_cache:
 | 
			
		||||
                  keys:
 | 
			
		||||
                      - v0.5-tf-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.5-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-tf-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-{{ checksum "setup.py" }}
 | 
			
		||||
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
 | 
			
		||||
            - run: pip install --upgrade pip
 | 
			
		||||
            - run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]
 | 
			
		||||
            - run: pip install tensorflow_probability
 | 
			
		||||
            - run: pip install https://github.com/kpu/kenlm/archive/master.zip
 | 
			
		||||
            - save_cache:
 | 
			
		||||
                  key: v0.5-tf-{{ checksum "setup.py" }}
 | 
			
		||||
                  key: v0.4-tf-{{ checksum "setup.py" }}
 | 
			
		||||
                  paths:
 | 
			
		||||
                      - '~/.cache/pip'
 | 
			
		||||
            - run: |
 | 
			
		||||
@@ -355,25 +347,24 @@ jobs:
 | 
			
		||||
    run_tests_flax:
 | 
			
		||||
        working_directory: ~/transformers
 | 
			
		||||
        docker:
 | 
			
		||||
            - image: cimg/python:3.7.12
 | 
			
		||||
            - image: circleci/python:3.7
 | 
			
		||||
        environment:
 | 
			
		||||
            OMP_NUM_THREADS: 1
 | 
			
		||||
            TRANSFORMERS_IS_CI: yes
 | 
			
		||||
            PYTEST_TIMEOUT: 120
 | 
			
		||||
        resource_class: xlarge
 | 
			
		||||
        parallelism: 1
 | 
			
		||||
        steps:
 | 
			
		||||
            - checkout
 | 
			
		||||
            - restore_cache:
 | 
			
		||||
                keys:
 | 
			
		||||
                    - v0.5-flax-{{ checksum "setup.py" }}
 | 
			
		||||
                    - v0.5-{{ checksum "setup.py" }}
 | 
			
		||||
                    - v0.4-flax-{{ checksum "setup.py" }}
 | 
			
		||||
                    - v0.4-{{ checksum "setup.py" }}
 | 
			
		||||
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
 | 
			
		||||
            - run: pip install --upgrade pip
 | 
			
		||||
            - run: pip install .[flax,testing,sentencepiece,flax-speech,vision]
 | 
			
		||||
            - run: pip install https://github.com/kpu/kenlm/archive/master.zip
 | 
			
		||||
            - save_cache:
 | 
			
		||||
                  key: v0.5-flax-{{ checksum "setup.py" }}
 | 
			
		||||
                  key: v0.4-flax-{{ checksum "setup.py" }}
 | 
			
		||||
                  paths:
 | 
			
		||||
                      - '~/.cache/pip'
 | 
			
		||||
            - run: python utils/tests_fetcher.py | tee test_preparation.txt
 | 
			
		||||
@@ -391,25 +382,24 @@ jobs:
 | 
			
		||||
    run_tests_flax_all:
 | 
			
		||||
        working_directory: ~/transformers
 | 
			
		||||
        docker:
 | 
			
		||||
            - image: cimg/python:3.7.12
 | 
			
		||||
            - image: circleci/python:3.7
 | 
			
		||||
        environment:
 | 
			
		||||
            OMP_NUM_THREADS: 1
 | 
			
		||||
            TRANSFORMERS_IS_CI: yes
 | 
			
		||||
            PYTEST_TIMEOUT: 120
 | 
			
		||||
        resource_class: xlarge
 | 
			
		||||
        parallelism: 1
 | 
			
		||||
        steps:
 | 
			
		||||
            - checkout
 | 
			
		||||
            - restore_cache:
 | 
			
		||||
                keys:
 | 
			
		||||
                    - v0.5-flax-{{ checksum "setup.py" }}
 | 
			
		||||
                    - v0.5-{{ checksum "setup.py" }}
 | 
			
		||||
                    - v0.4-flax-{{ checksum "setup.py" }}
 | 
			
		||||
                    - v0.4-{{ checksum "setup.py" }}
 | 
			
		||||
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
 | 
			
		||||
            - run: pip install --upgrade pip
 | 
			
		||||
            - run: pip install .[flax,testing,sentencepiece,vision,flax-speech]
 | 
			
		||||
            - run: pip install https://github.com/kpu/kenlm/archive/master.zip
 | 
			
		||||
            - save_cache:
 | 
			
		||||
                  key: v0.5-flax-{{ checksum "setup.py" }}
 | 
			
		||||
                  key: v0.4-flax-{{ checksum "setup.py" }}
 | 
			
		||||
                  paths:
 | 
			
		||||
                      - '~/.cache/pip'
 | 
			
		||||
            - run: |
 | 
			
		||||
@@ -422,27 +412,26 @@ jobs:
 | 
			
		||||
    run_tests_pipelines_torch:
 | 
			
		||||
        working_directory: ~/transformers
 | 
			
		||||
        docker:
 | 
			
		||||
            - image: cimg/python:3.7.12
 | 
			
		||||
            - image: circleci/python:3.7
 | 
			
		||||
        environment:
 | 
			
		||||
            OMP_NUM_THREADS: 1
 | 
			
		||||
            RUN_PIPELINE_TESTS: yes
 | 
			
		||||
            TRANSFORMERS_IS_CI: yes
 | 
			
		||||
            PYTEST_TIMEOUT: 120
 | 
			
		||||
        resource_class: xlarge
 | 
			
		||||
        parallelism: 1
 | 
			
		||||
        steps:
 | 
			
		||||
            - checkout
 | 
			
		||||
            - restore_cache:
 | 
			
		||||
                  keys:
 | 
			
		||||
                      - v0.5-torch-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.5-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-torch-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-{{ checksum "setup.py" }}
 | 
			
		||||
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
 | 
			
		||||
            - run: pip install --upgrade pip
 | 
			
		||||
            - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
 | 
			
		||||
            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.11.0+cpu.html
 | 
			
		||||
            - run: pip install https://github.com/kpu/kenlm/archive/master.zip
 | 
			
		||||
            - save_cache:
 | 
			
		||||
                  key: v0.5-torch-{{ checksum "setup.py" }}
 | 
			
		||||
                  key: v0.4-torch-{{ checksum "setup.py" }}
 | 
			
		||||
                  paths:
 | 
			
		||||
                      - '~/.cache/pip'
 | 
			
		||||
            - run: python utils/tests_fetcher.py | tee test_preparation.txt
 | 
			
		||||
@@ -460,27 +449,26 @@ jobs:
 | 
			
		||||
    run_tests_pipelines_torch_all:
 | 
			
		||||
        working_directory: ~/transformers
 | 
			
		||||
        docker:
 | 
			
		||||
            - image: cimg/python:3.7.12
 | 
			
		||||
            - image: circleci/python:3.7
 | 
			
		||||
        environment:
 | 
			
		||||
            OMP_NUM_THREADS: 1
 | 
			
		||||
            RUN_PIPELINE_TESTS: yes
 | 
			
		||||
            TRANSFORMERS_IS_CI: yes
 | 
			
		||||
            PYTEST_TIMEOUT: 120
 | 
			
		||||
        resource_class: xlarge
 | 
			
		||||
        parallelism: 1
 | 
			
		||||
        steps:
 | 
			
		||||
            - checkout
 | 
			
		||||
            - restore_cache:
 | 
			
		||||
                  keys:
 | 
			
		||||
                      - v0.5-torch-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.5-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-torch-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-{{ checksum "setup.py" }}
 | 
			
		||||
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
 | 
			
		||||
            - run: pip install --upgrade pip
 | 
			
		||||
            - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
 | 
			
		||||
            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.11.0+cpu.html
 | 
			
		||||
            - run: pip install https://github.com/kpu/kenlm/archive/master.zip
 | 
			
		||||
            - save_cache:
 | 
			
		||||
                  key: v0.5-torch-{{ checksum "setup.py" }}
 | 
			
		||||
                  key: v0.4-torch-{{ checksum "setup.py" }}
 | 
			
		||||
                  paths:
 | 
			
		||||
                      - '~/.cache/pip'
 | 
			
		||||
            - run: |
 | 
			
		||||
@@ -493,25 +481,24 @@ jobs:
 | 
			
		||||
    run_tests_pipelines_tf:
 | 
			
		||||
        working_directory: ~/transformers
 | 
			
		||||
        docker:
 | 
			
		||||
            - image: cimg/python:3.7.12
 | 
			
		||||
            - image: circleci/python:3.7
 | 
			
		||||
        environment:
 | 
			
		||||
            OMP_NUM_THREADS: 1
 | 
			
		||||
            RUN_PIPELINE_TESTS: yes
 | 
			
		||||
            TRANSFORMERS_IS_CI: yes
 | 
			
		||||
            PYTEST_TIMEOUT: 120
 | 
			
		||||
        resource_class: xlarge
 | 
			
		||||
        parallelism: 1
 | 
			
		||||
        steps:
 | 
			
		||||
            - checkout
 | 
			
		||||
            - restore_cache:
 | 
			
		||||
                  keys:
 | 
			
		||||
                      - v0.5-tf-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.5-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-tf-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-{{ checksum "setup.py" }}
 | 
			
		||||
            - run: pip install --upgrade pip
 | 
			
		||||
            - run: pip install .[sklearn,tf-cpu,testing,sentencepiece]
 | 
			
		||||
            - run: pip install tensorflow_probability
 | 
			
		||||
            - save_cache:
 | 
			
		||||
                  key: v0.5-tf-{{ checksum "setup.py" }}
 | 
			
		||||
                  key: v0.4-tf-{{ checksum "setup.py" }}
 | 
			
		||||
                  paths:
 | 
			
		||||
                      - '~/.cache/pip'
 | 
			
		||||
            - run: python utils/tests_fetcher.py | tee test_preparation.txt
 | 
			
		||||
@@ -529,25 +516,24 @@ jobs:
 | 
			
		||||
    run_tests_pipelines_tf_all:
 | 
			
		||||
        working_directory: ~/transformers
 | 
			
		||||
        docker:
 | 
			
		||||
            - image: cimg/python:3.7.12
 | 
			
		||||
            - image: circleci/python:3.7
 | 
			
		||||
        environment:
 | 
			
		||||
            OMP_NUM_THREADS: 1
 | 
			
		||||
            RUN_PIPELINE_TESTS: yes
 | 
			
		||||
            TRANSFORMERS_IS_CI: yes
 | 
			
		||||
            PYTEST_TIMEOUT: 120
 | 
			
		||||
        resource_class: xlarge
 | 
			
		||||
        parallelism: 1
 | 
			
		||||
        steps:
 | 
			
		||||
            - checkout
 | 
			
		||||
            - restore_cache:
 | 
			
		||||
                  keys:
 | 
			
		||||
                      - v0.5-tf-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.5-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-tf-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-{{ checksum "setup.py" }}
 | 
			
		||||
            - run: pip install --upgrade pip
 | 
			
		||||
            - run: pip install .[sklearn,tf-cpu,testing,sentencepiece]
 | 
			
		||||
            - run: pip install tensorflow_probability
 | 
			
		||||
            - save_cache:
 | 
			
		||||
                  key: v0.5-tf-{{ checksum "setup.py" }}
 | 
			
		||||
                  key: v0.4-tf-{{ checksum "setup.py" }}
 | 
			
		||||
                  paths:
 | 
			
		||||
                      - '~/.cache/pip'
 | 
			
		||||
            - run: |
 | 
			
		||||
@@ -560,22 +546,21 @@ jobs:
 | 
			
		||||
    run_tests_custom_tokenizers:
 | 
			
		||||
        working_directory: ~/transformers
 | 
			
		||||
        docker:
 | 
			
		||||
            - image: cimg/python:3.7.12
 | 
			
		||||
            - image: circleci/python:3.7
 | 
			
		||||
        environment:
 | 
			
		||||
            RUN_CUSTOM_TOKENIZERS: yes
 | 
			
		||||
            TRANSFORMERS_IS_CI: yes
 | 
			
		||||
            PYTEST_TIMEOUT: 120
 | 
			
		||||
        steps:
 | 
			
		||||
            - checkout
 | 
			
		||||
            - restore_cache:
 | 
			
		||||
                  keys:
 | 
			
		||||
                      - v0.5-custom_tokenizers-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.5-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-custom_tokenizers-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-{{ checksum "setup.py" }}
 | 
			
		||||
            - run: pip install --upgrade pip
 | 
			
		||||
            - run: pip install .[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]
 | 
			
		||||
            - run: python -m unidic download
 | 
			
		||||
            - save_cache:
 | 
			
		||||
                  key: v0.5-custom_tokenizers-{{ checksum "setup.py" }}
 | 
			
		||||
                  key: v0.4-custom_tokenizers-{{ checksum "setup.py" }}
 | 
			
		||||
                  paths:
 | 
			
		||||
                      - '~/.cache/pip'
 | 
			
		||||
            - run: |
 | 
			
		||||
@@ -594,25 +579,24 @@ jobs:
 | 
			
		||||
    run_examples_torch:
 | 
			
		||||
        working_directory: ~/transformers
 | 
			
		||||
        docker:
 | 
			
		||||
            - image: cimg/python:3.7.12
 | 
			
		||||
            - image: circleci/python:3.7
 | 
			
		||||
        environment:
 | 
			
		||||
            OMP_NUM_THREADS: 1
 | 
			
		||||
            TRANSFORMERS_IS_CI: yes
 | 
			
		||||
            PYTEST_TIMEOUT: 120
 | 
			
		||||
        resource_class: xlarge
 | 
			
		||||
        parallelism: 1
 | 
			
		||||
        steps:
 | 
			
		||||
            - checkout
 | 
			
		||||
            - restore_cache:
 | 
			
		||||
                  keys:
 | 
			
		||||
                      - v0.5-torch_examples-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.5-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-torch_examples-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-{{ checksum "setup.py" }}
 | 
			
		||||
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
 | 
			
		||||
            - run: pip install --upgrade pip
 | 
			
		||||
            - run: pip install .[sklearn,torch,sentencepiece,testing,torch-speech]
 | 
			
		||||
            - run: pip install -r examples/pytorch/_tests_requirements.txt
 | 
			
		||||
            - save_cache:
 | 
			
		||||
                  key: v0.5-torch_examples-{{ checksum "setup.py" }}
 | 
			
		||||
                  key: v0.4-torch_examples-{{ checksum "setup.py" }}
 | 
			
		||||
                  paths:
 | 
			
		||||
                      - '~/.cache/pip'
 | 
			
		||||
            - run: python utils/tests_fetcher.py --filters examples tests | tee test_preparation.txt
 | 
			
		||||
@@ -630,25 +614,24 @@ jobs:
 | 
			
		||||
    run_examples_torch_all:
 | 
			
		||||
        working_directory: ~/transformers
 | 
			
		||||
        docker:
 | 
			
		||||
            - image: cimg/python:3.7.12
 | 
			
		||||
            - image: circleci/python:3.7
 | 
			
		||||
        environment:
 | 
			
		||||
            OMP_NUM_THREADS: 1
 | 
			
		||||
            TRANSFORMERS_IS_CI: yes
 | 
			
		||||
            PYTEST_TIMEOUT: 120
 | 
			
		||||
        resource_class: xlarge
 | 
			
		||||
        parallelism: 1
 | 
			
		||||
        steps:
 | 
			
		||||
            - checkout
 | 
			
		||||
            - restore_cache:
 | 
			
		||||
                  keys:
 | 
			
		||||
                      - v0.5-torch_examples-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.5-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-torch_examples-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-{{ checksum "setup.py" }}
 | 
			
		||||
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
 | 
			
		||||
            - run: pip install --upgrade pip
 | 
			
		||||
            - run: pip install .[sklearn,torch,sentencepiece,testing,torch-speech]
 | 
			
		||||
            - run: pip install -r examples/pytorch/_tests_requirements.txt
 | 
			
		||||
            - save_cache:
 | 
			
		||||
                  key: v0.5-torch_examples-{{ checksum "setup.py" }}
 | 
			
		||||
                  key: v0.4-torch_examples-{{ checksum "setup.py" }}
 | 
			
		||||
                  paths:
 | 
			
		||||
                      - '~/.cache/pip'
 | 
			
		||||
            - run: |
 | 
			
		||||
@@ -661,24 +644,23 @@ jobs:
 | 
			
		||||
    run_examples_flax:
 | 
			
		||||
        working_directory: ~/transformers
 | 
			
		||||
        docker:
 | 
			
		||||
            - image: cimg/python:3.7.12
 | 
			
		||||
            - image: circleci/python:3.7
 | 
			
		||||
        environment:
 | 
			
		||||
            OMP_NUM_THREADS: 1
 | 
			
		||||
            TRANSFORMERS_IS_CI: yes
 | 
			
		||||
            PYTEST_TIMEOUT: 120
 | 
			
		||||
        resource_class: xlarge
 | 
			
		||||
        parallelism: 1
 | 
			
		||||
        steps:
 | 
			
		||||
            - checkout
 | 
			
		||||
            - restore_cache:
 | 
			
		||||
                keys:
 | 
			
		||||
                    - v0.5-flax_examples-{{ checksum "setup.py" }}
 | 
			
		||||
                    - v0.5-{{ checksum "setup.py" }}
 | 
			
		||||
                    - v0.4-flax_examples-{{ checksum "setup.py" }}
 | 
			
		||||
                    - v0.4-{{ checksum "setup.py" }}
 | 
			
		||||
            - run: pip install --upgrade pip
 | 
			
		||||
            - run: pip install .[flax,testing,sentencepiece]
 | 
			
		||||
            - run: sudo pip install .[flax,testing,sentencepiece]
 | 
			
		||||
            - run: pip install -r examples/flax/_tests_requirements.txt
 | 
			
		||||
            - save_cache:
 | 
			
		||||
                  key: v0.5-flax_examples-{{ checksum "setup.py" }}
 | 
			
		||||
                  key: v0.4-flax_examples-{{ checksum "setup.py" }}
 | 
			
		||||
                  paths:
 | 
			
		||||
                      - '~/.cache/pip'
 | 
			
		||||
            - run: python utils/tests_fetcher.py --filters examples tests | tee test_preparation.txt
 | 
			
		||||
@@ -696,24 +678,23 @@ jobs:
 | 
			
		||||
    run_examples_flax_all:
 | 
			
		||||
        working_directory: ~/transformers
 | 
			
		||||
        docker:
 | 
			
		||||
            - image: cimg/python:3.7.12
 | 
			
		||||
            - image: circleci/python:3.7
 | 
			
		||||
        environment:
 | 
			
		||||
            OMP_NUM_THREADS: 1
 | 
			
		||||
            TRANSFORMERS_IS_CI: yes
 | 
			
		||||
            PYTEST_TIMEOUT: 120
 | 
			
		||||
        resource_class: xlarge
 | 
			
		||||
        parallelism: 1
 | 
			
		||||
        steps:
 | 
			
		||||
            - checkout
 | 
			
		||||
            - restore_cache:
 | 
			
		||||
                keys:
 | 
			
		||||
                    - v0.5-flax_examples-{{ checksum "setup.py" }}
 | 
			
		||||
                    - v0.5-{{ checksum "setup.py" }}
 | 
			
		||||
                    - v0.4-flax_examples-{{ checksum "setup.py" }}
 | 
			
		||||
                    - v0.4-{{ checksum "setup.py" }}
 | 
			
		||||
            - run: pip install --upgrade pip
 | 
			
		||||
            - run: pip install .[flax,testing,sentencepiece]
 | 
			
		||||
            - run: sudo pip install .[flax,testing,sentencepiece]
 | 
			
		||||
            - run: pip install -r examples/flax/_tests_requirements.txt
 | 
			
		||||
            - save_cache:
 | 
			
		||||
                  key: v0.5-flax_examples-{{ checksum "setup.py" }}
 | 
			
		||||
                  key: v0.4-flax_examples-{{ checksum "setup.py" }}
 | 
			
		||||
                  paths:
 | 
			
		||||
                      - '~/.cache/pip'
 | 
			
		||||
            - run: |
 | 
			
		||||
@@ -726,28 +707,27 @@ jobs:
 | 
			
		||||
    run_tests_hub:
 | 
			
		||||
        working_directory: ~/transformers
 | 
			
		||||
        docker:
 | 
			
		||||
            - image: cimg/python:3.7.12
 | 
			
		||||
            - image: circleci/python:3.7
 | 
			
		||||
        environment:
 | 
			
		||||
            HUGGINGFACE_CO_STAGING: yes
 | 
			
		||||
            RUN_GIT_LFS_TESTS: yes
 | 
			
		||||
            TRANSFORMERS_IS_CI: yes
 | 
			
		||||
            PYTEST_TIMEOUT: 120
 | 
			
		||||
        resource_class: xlarge
 | 
			
		||||
        parallelism: 1
 | 
			
		||||
        steps:
 | 
			
		||||
            - checkout
 | 
			
		||||
            - restore_cache:
 | 
			
		||||
                  keys:
 | 
			
		||||
                      - v0.5-hub-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.5-{{ checksum "setup.py" }}
 | 
			
		||||
            - run: sudo apt-get -y update && sudo apt-get install git-lfs
 | 
			
		||||
                      - v0.4-hub-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-{{ checksum "setup.py" }}
 | 
			
		||||
            - run: sudo apt-get install git-lfs
 | 
			
		||||
            - run: |
 | 
			
		||||
                git config --global user.email "ci@dummy.com"
 | 
			
		||||
                git config --global user.name "ci"
 | 
			
		||||
            - run: pip install --upgrade pip
 | 
			
		||||
            - run: pip install .[torch,sentencepiece,testing]
 | 
			
		||||
            - save_cache:
 | 
			
		||||
                  key: v0.5-hub-{{ checksum "setup.py" }}
 | 
			
		||||
                  key: v0.4-hub-{{ checksum "setup.py" }}
 | 
			
		||||
                  paths:
 | 
			
		||||
                      - '~/.cache/pip'
 | 
			
		||||
            - run: python utils/tests_fetcher.py | tee test_preparation.txt
 | 
			
		||||
@@ -765,28 +745,27 @@ jobs:
 | 
			
		||||
    run_tests_hub_all:
 | 
			
		||||
        working_directory: ~/transformers
 | 
			
		||||
        docker:
 | 
			
		||||
            - image: cimg/python:3.7.12
 | 
			
		||||
            - image: circleci/python:3.7
 | 
			
		||||
        environment:
 | 
			
		||||
            HUGGINGFACE_CO_STAGING: yes
 | 
			
		||||
            RUN_GIT_LFS_TESTS: yes
 | 
			
		||||
            TRANSFORMERS_IS_CI: yes
 | 
			
		||||
            PYTEST_TIMEOUT: 120
 | 
			
		||||
        resource_class: xlarge
 | 
			
		||||
        parallelism: 1
 | 
			
		||||
        steps:
 | 
			
		||||
            - checkout
 | 
			
		||||
            - restore_cache:
 | 
			
		||||
                  keys:
 | 
			
		||||
                      - v0.5-hub-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.5-{{ checksum "setup.py" }}
 | 
			
		||||
            - run: sudo apt-get -y update && sudo apt-get install git-lfs
 | 
			
		||||
                      - v0.4-hub-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-{{ checksum "setup.py" }}
 | 
			
		||||
            - run: sudo apt-get install git-lfs
 | 
			
		||||
            - run: |
 | 
			
		||||
                git config --global user.email "ci@dummy.com"
 | 
			
		||||
                git config --global user.name "ci"
 | 
			
		||||
            - run: pip install --upgrade pip
 | 
			
		||||
            - run: pip install .[torch,sentencepiece,testing]
 | 
			
		||||
            - save_cache:
 | 
			
		||||
                  key: v0.5-hub-{{ checksum "setup.py" }}
 | 
			
		||||
                  key: v0.4-hub-{{ checksum "setup.py" }}
 | 
			
		||||
                  paths:
 | 
			
		||||
                      - '~/.cache/pip'
 | 
			
		||||
            - run: |
 | 
			
		||||
@@ -799,23 +778,22 @@ jobs:
 | 
			
		||||
    run_tests_onnxruntime:
 | 
			
		||||
        working_directory: ~/transformers
 | 
			
		||||
        docker:
 | 
			
		||||
            - image: cimg/python:3.7.12
 | 
			
		||||
            - image: circleci/python:3.7
 | 
			
		||||
        environment:
 | 
			
		||||
            OMP_NUM_THREADS: 1
 | 
			
		||||
            TRANSFORMERS_IS_CI: yes
 | 
			
		||||
            PYTEST_TIMEOUT: 120
 | 
			
		||||
        resource_class: xlarge
 | 
			
		||||
        parallelism: 1
 | 
			
		||||
        steps:
 | 
			
		||||
            - checkout
 | 
			
		||||
            - restore_cache:
 | 
			
		||||
                  keys:
 | 
			
		||||
                      - v0.5-torch-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.5-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-torch-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-{{ checksum "setup.py" }}
 | 
			
		||||
            - run: pip install --upgrade pip
 | 
			
		||||
            - run: pip install .[torch,testing,sentencepiece,onnxruntime,vision,rjieba]
 | 
			
		||||
            - save_cache:
 | 
			
		||||
                  key: v0.5-onnx-{{ checksum "setup.py" }}
 | 
			
		||||
                  key: v0.4-onnx-{{ checksum "setup.py" }}
 | 
			
		||||
                  paths:
 | 
			
		||||
                      - '~/.cache/pip'
 | 
			
		||||
            - run: python utils/tests_fetcher.py | tee test_preparation.txt
 | 
			
		||||
@@ -833,23 +811,22 @@ jobs:
 | 
			
		||||
    run_tests_onnxruntime_all:
 | 
			
		||||
        working_directory: ~/transformers
 | 
			
		||||
        docker:
 | 
			
		||||
            - image: cimg/python:3.7.12
 | 
			
		||||
            - image: circleci/python:3.7
 | 
			
		||||
        environment:
 | 
			
		||||
            OMP_NUM_THREADS: 1
 | 
			
		||||
            TRANSFORMERS_IS_CI: yes
 | 
			
		||||
            PYTEST_TIMEOUT: 120
 | 
			
		||||
        resource_class: xlarge
 | 
			
		||||
        parallelism: 1
 | 
			
		||||
        steps:
 | 
			
		||||
            - checkout
 | 
			
		||||
            - restore_cache:
 | 
			
		||||
                  keys:
 | 
			
		||||
                      - v0.5-torch-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.5-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-torch-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-{{ checksum "setup.py" }}
 | 
			
		||||
            - run: pip install --upgrade pip
 | 
			
		||||
            - run: pip install .[torch,testing,sentencepiece,onnxruntime,vision]
 | 
			
		||||
            - save_cache:
 | 
			
		||||
                  key: v0.5-onnx-{{ checksum "setup.py" }}
 | 
			
		||||
                  key: v0.4-onnx-{{ checksum "setup.py" }}
 | 
			
		||||
                  paths:
 | 
			
		||||
                      - '~/.cache/pip'
 | 
			
		||||
            - run: |
 | 
			
		||||
@@ -862,22 +839,21 @@ jobs:
 | 
			
		||||
    check_code_quality:
 | 
			
		||||
        working_directory: ~/transformers
 | 
			
		||||
        docker:
 | 
			
		||||
            - image: cimg/python:3.7.12
 | 
			
		||||
            - image: circleci/python:3.7
 | 
			
		||||
        resource_class: large
 | 
			
		||||
        environment:
 | 
			
		||||
            TRANSFORMERS_IS_CI: yes
 | 
			
		||||
            PYTEST_TIMEOUT: 120
 | 
			
		||||
        parallelism: 1
 | 
			
		||||
        steps:
 | 
			
		||||
            - checkout
 | 
			
		||||
            - restore_cache:
 | 
			
		||||
                  keys:
 | 
			
		||||
                      - v0.5-code_quality-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.5-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-code_quality-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-{{ checksum "setup.py" }}
 | 
			
		||||
            - run: pip install --upgrade pip
 | 
			
		||||
            - run: pip install .[all,quality]
 | 
			
		||||
            - save_cache:
 | 
			
		||||
                  key: v0.5-code_quality-{{ checksum "setup.py" }}
 | 
			
		||||
                  key: v0.4-code_quality-{{ checksum "setup.py" }}
 | 
			
		||||
                  paths:
 | 
			
		||||
                      - '~/.cache/pip'
 | 
			
		||||
            - run: black --check --preview examples tests src utils
 | 
			
		||||
@@ -886,27 +862,25 @@ jobs:
 | 
			
		||||
            - run: python utils/sort_auto_mappings.py --check_only
 | 
			
		||||
            - run: flake8 examples tests src utils
 | 
			
		||||
            - run: doc-builder style src/transformers docs/source --max_len 119 --check_only --path_to_docs docs/source
 | 
			
		||||
            - run: python utils/check_doc_toc.py
 | 
			
		||||
 | 
			
		||||
    check_repository_consistency:
 | 
			
		||||
        working_directory: ~/transformers
 | 
			
		||||
        docker:
 | 
			
		||||
            - image: cimg/python:3.7.12
 | 
			
		||||
            - image: circleci/python:3.7
 | 
			
		||||
        resource_class: large
 | 
			
		||||
        environment:
 | 
			
		||||
            TRANSFORMERS_IS_CI: yes
 | 
			
		||||
            PYTEST_TIMEOUT: 120
 | 
			
		||||
        parallelism: 1
 | 
			
		||||
        steps:
 | 
			
		||||
            - checkout
 | 
			
		||||
            - restore_cache:
 | 
			
		||||
                  keys:
 | 
			
		||||
                      - v0.5-repository_consistency-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.5-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-repository_consistency-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-{{ checksum "setup.py" }}
 | 
			
		||||
            - run: pip install --upgrade pip
 | 
			
		||||
            - run: pip install .[all,quality]
 | 
			
		||||
            - save_cache:
 | 
			
		||||
                  key: v0.5-repository_consistency-{{ checksum "setup.py" }}
 | 
			
		||||
                  key: v0.4-repository_consistency-{{ checksum "setup.py" }}
 | 
			
		||||
                  paths:
 | 
			
		||||
                      - '~/.cache/pip'
 | 
			
		||||
            - run: python utils/check_copies.py
 | 
			
		||||
@@ -921,19 +895,18 @@ jobs:
 | 
			
		||||
    run_tests_layoutlmv2_and_v3:
 | 
			
		||||
        working_directory: ~/transformers
 | 
			
		||||
        docker:
 | 
			
		||||
            - image: cimg/python:3.7.12
 | 
			
		||||
            - image: circleci/python:3.7
 | 
			
		||||
        environment:
 | 
			
		||||
            OMP_NUM_THREADS: 1
 | 
			
		||||
            TRANSFORMERS_IS_CI: yes
 | 
			
		||||
            PYTEST_TIMEOUT: 120
 | 
			
		||||
        resource_class: xlarge
 | 
			
		||||
        parallelism: 1
 | 
			
		||||
        steps:
 | 
			
		||||
            - checkout
 | 
			
		||||
            - restore_cache:
 | 
			
		||||
                  keys:
 | 
			
		||||
                      - v0.5-torch-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.5-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-torch-{{ checksum "setup.py" }}
 | 
			
		||||
                      - v0.4-{{ checksum "setup.py" }}
 | 
			
		||||
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
 | 
			
		||||
            - run: pip install --upgrade pip
 | 
			
		||||
            - run: pip install .[torch,testing,vision]
 | 
			
		||||
@@ -942,7 +915,7 @@ jobs:
 | 
			
		||||
            - run: sudo apt install tesseract-ocr
 | 
			
		||||
            - run: pip install pytesseract
 | 
			
		||||
            - save_cache:
 | 
			
		||||
                  key: v0.5-torch-{{ checksum "setup.py" }}
 | 
			
		||||
                  key: v0.4-torch-{{ checksum "setup.py" }}
 | 
			
		||||
                  paths:
 | 
			
		||||
                      - '~/.cache/pip'
 | 
			
		||||
            - run: python utils/tests_fetcher.py | tee test_preparation.txt
 | 
			
		||||
@@ -960,7 +933,7 @@ jobs:
 | 
			
		||||
# TPU JOBS
 | 
			
		||||
    run_examples_tpu:
 | 
			
		||||
        docker:
 | 
			
		||||
            - image: cimg/python:3.7.12
 | 
			
		||||
            - image: circleci/python:3.7
 | 
			
		||||
        environment:
 | 
			
		||||
            OMP_NUM_THREADS: 1
 | 
			
		||||
            TRANSFORMERS_IS_CI: yes
 | 
			
		||||
@@ -980,7 +953,7 @@ jobs:
 | 
			
		||||
 | 
			
		||||
    cleanup-gke-jobs:
 | 
			
		||||
        docker:
 | 
			
		||||
            - image: cimg/python:3.7.12
 | 
			
		||||
            - image: circleci/python:3.7
 | 
			
		||||
        steps:
 | 
			
		||||
            - gcp-gke/install
 | 
			
		||||
            - gcp-gke/update-kubeconfig-with-credentials:
 | 
			
		||||
 | 
			
		||||
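Before the changed GitHub files, a note on why the `v0.4` to `v0.5` bump above is enough to invalidate the old caches: CircleCI caches are immutable and matched by key prefix, `{{ checksum "setup.py" }}` ties a key to the current dependency spec, and the extra entries under `keys:` act as fallbacks. Bumping the prefix therefore guarantees a clean miss on the first run after the change. A condensed sketch of the pattern used by these jobs follows; the exact key names and pip extras vary per job and are illustrative here.

```yaml
# Condensed sketch of the cache pattern used by the jobs above (names illustrative).
- restore_cache:
      keys:
          - v0.5-torch-{{ checksum "setup.py" }}   # exact match for this job flavour
          - v0.5-{{ checksum "setup.py" }}         # fallback: any v0.5 cache for this setup.py
- run: pip install .[torch,testing]                # extras differ per job
- save_cache:
      key: v0.5-torch-{{ checksum "setup.py" }}
      paths:
          - '~/.cache/pip'
```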
							
								
								
									
.github/ISSUE_TEMPLATE/bug-report.yml (vendored): 2 changes

@@ -7,6 +7,7 @@ body:
 | 
			
		||||
    attributes:
 | 
			
		||||
      label: System Info
 | 
			
		||||
      description: Please share your system info with us. You can run the command `transformers-cli env` and copy-paste its output below.
 | 
			
		||||
      render: shell
 | 
			
		||||
      placeholder: transformers version, platform, python version, ...
 | 
			
		||||
    validations:
 | 
			
		||||
      required: true
 | 
			
		||||
@@ -117,3 +118,4 @@ body:
 | 
			
		||||
    attributes:
 | 
			
		||||
      label: Expected behavior
 | 
			
		||||
      description: "A clear and concise description of what you would expect to happen."
 | 
			
		||||
      render: shell
 | 
			
		||||
 | 
			
		||||
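The touched lines in the issue template belong to GitHub's issue-forms schema, where `render: shell` makes the submitted text render as a fenced shell block. A minimal sketch of a field of the kind edited here; the `type` and `id` lines are assumptions, the other keys are taken from the hunk.

```yaml
- type: textarea          # assumed; render applies to textarea fields
  id: system-info         # hypothetical id, not shown in the diff
  attributes:
    label: System Info
    description: Please share your system info with us. You can run the command `transformers-cli env` and copy-paste its output below.
    render: shell
  validations:
    required: true
```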
							
								
								
									
.github/conda/meta.yaml (vendored): 4 changes

@@ -25,7 +25,7 @@ requirements:
 | 
			
		||||
    - sacremoses
 | 
			
		||||
    - regex !=2019.12.17
 | 
			
		||||
    - protobuf
 | 
			
		||||
    - tokenizers >=0.11.1,!=0.11.3,<0.13
 | 
			
		||||
    - tokenizers >=0.10.1,<0.11.0
 | 
			
		||||
    - pyyaml >=5.1
 | 
			
		||||
  run:
 | 
			
		||||
    - python
 | 
			
		||||
@@ -40,7 +40,7 @@ requirements:
 | 
			
		||||
    - sacremoses
 | 
			
		||||
    - regex !=2019.12.17
 | 
			
		||||
    - protobuf
 | 
			
		||||
    - tokenizers >=0.11.1,!=0.11.3,<0.13
 | 
			
		||||
    - tokenizers >=0.10.1,<0.11.0
 | 
			
		||||
    - pyyaml >=5.1
 | 
			
		||||
 | 
			
		||||
test:
 | 
			
		||||
 | 
			
		||||
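In the conda recipe the only change is the `tokenizers` pin, applied identically in two requirement blocks (the hunks show the same edit twice). Conda match specs chain constraints with commas, and `!=0.11.3` skips one specific release inside the allowed range. Condensed view of the changed line, shown under `run:` for illustration:

```yaml
requirements:
  run:
    - tokenizers >=0.11.1,!=0.11.3,<0.13   # was: tokenizers >=0.10.1,<0.11.0
```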
							
								
								
									
.github/workflows/add-model-like.yml (vendored): 2 changes

@@ -27,7 +27,7 @@ jobs:
 | 
			
		||||
        id: cache
 | 
			
		||||
        with:
 | 
			
		||||
          path: ~/venv/
 | 
			
		||||
          key: v4-tests_model_like-${{ hashFiles('setup.py') }}
 | 
			
		||||
          key: v3-tests_model_like-${{ hashFiles('setup.py') }}
 | 
			
		||||
 | 
			
		||||
      - name: Create virtual environment on cache miss
 | 
			
		||||
        if: steps.cache.outputs.cache-hit != 'true'
 | 
			
		||||
 | 
			
		||||
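Both `add-model-like.yml` (above) and `model-templates.yml` (further down) get the same small change: the prefix of the cached-venv key goes from `v3` to `v4`. `hashFiles('setup.py')` already rotates the key when dependencies change, so bumping the prefix is the usual way to discard a cache whose contents have gone stale even though its key inputs have not. A sketch of the step, with the `uses:` line and the rebuild command assumed rather than shown in the diff:

```yaml
- name: Load cached virtual environment
  uses: actions/cache@v2                     # assumed; the diff only shows id/with
  id: cache
  with:
    path: ~/venv/
    key: v4-tests_model_like-${{ hashFiles('setup.py') }}   # was: v3-tests_model_like-...

- name: Create virtual environment on cache miss
  if: steps.cache.outputs.cache-hit != 'true'
  run: python -m venv ~/venv                 # assumed rebuild command
```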
							
								
								
									
.github/workflows/build-docker-images.yml (vendored): 54 changes

@@ -5,7 +5,6 @@ on:
 | 
			
		||||
    branches:
 | 
			
		||||
      - docker-image*
 | 
			
		||||
  repository_dispatch:
 | 
			
		||||
  workflow_call:
 | 
			
		||||
  schedule:
 | 
			
		||||
    - cron: "0 1 * * *"
 | 
			
		||||
 | 
			
		||||
@@ -40,33 +39,6 @@ jobs:
 | 
			
		||||
          push: true
 | 
			
		||||
          tags: huggingface/transformers-all-latest-gpu
 | 
			
		||||
 | 
			
		||||
  latest-with-torch-nightly-docker:
 | 
			
		||||
    name: "Nightly PyTorch + Stable TensorFlow"
 | 
			
		||||
    runs-on: ubuntu-latest
 | 
			
		||||
    steps:
 | 
			
		||||
      -
 | 
			
		||||
        name: Set up Docker Buildx
 | 
			
		||||
        uses: docker/setup-buildx-action@v1
 | 
			
		||||
      -
 | 
			
		||||
        name: Check out code
 | 
			
		||||
        uses: actions/checkout@v2
 | 
			
		||||
      -
 | 
			
		||||
        name: Login to DockerHub
 | 
			
		||||
        uses: docker/login-action@v1
 | 
			
		||||
        with:
 | 
			
		||||
          username: ${{ secrets.DOCKERHUB_USERNAME }}
 | 
			
		||||
          password: ${{ secrets.DOCKERHUB_PASSWORD }}
 | 
			
		||||
      -
 | 
			
		||||
        name: Build and push
 | 
			
		||||
        uses: docker/build-push-action@v2
 | 
			
		||||
        with:
 | 
			
		||||
          context: ./docker/transformers-all-latest-gpu
 | 
			
		||||
          build-args: |
 | 
			
		||||
            REF=main
 | 
			
		||||
            PYTORCH=pre
 | 
			
		||||
          push: true
 | 
			
		||||
          tags: huggingface/transformers-all-latest-torch-nightly-gpu
 | 
			
		||||
 | 
			
		||||
  latest-torch-deepspeed-docker:
 | 
			
		||||
    name: "Latest PyTorch + DeepSpeed"
 | 
			
		||||
    runs-on: ubuntu-latest
 | 
			
		||||
@@ -93,32 +65,6 @@ jobs:
 | 
			
		||||
          push: true
 | 
			
		||||
          tags: huggingface/transformers-pytorch-deepspeed-latest-gpu
 | 
			
		||||
 | 
			
		||||
  nightly-torch-deepspeed-docker:
 | 
			
		||||
    name: "Nightly PyTorch + DeepSpeed"
 | 
			
		||||
    runs-on: ubuntu-latest
 | 
			
		||||
    steps:
 | 
			
		||||
      -
 | 
			
		||||
        name: Set up Docker Buildx
 | 
			
		||||
        uses: docker/setup-buildx-action@v1
 | 
			
		||||
      -
 | 
			
		||||
        name: Check out code
 | 
			
		||||
        uses: actions/checkout@v2
 | 
			
		||||
      -
 | 
			
		||||
        name: Login to DockerHub
 | 
			
		||||
        uses: docker/login-action@v1
 | 
			
		||||
        with:
 | 
			
		||||
          username: ${{ secrets.DOCKERHUB_USERNAME }}
 | 
			
		||||
          password: ${{ secrets.DOCKERHUB_PASSWORD }}
 | 
			
		||||
      -
 | 
			
		||||
        name: Build and push
 | 
			
		||||
        uses: docker/build-push-action@v2
 | 
			
		||||
        with:
 | 
			
		||||
          context: ./docker/transformers-pytorch-deepspeed-nightly-gpu
 | 
			
		||||
          build-args: |
 | 
			
		||||
            REF=main
 | 
			
		||||
          push: true
 | 
			
		||||
          tags: huggingface/transformers-pytorch-deepspeed-nightly-gpu
 | 
			
		||||
 | 
			
		||||
  doc-builder:
 | 
			
		||||
    name: "Doc builder"
 | 
			
		||||
    runs-on: ubuntu-latest
 | 
			
		||||
 | 
			
		||||
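The two deleted jobs (`latest-with-torch-nightly-docker` and `nightly-torch-deepspeed-docker`) used the same build-and-push pattern as the jobs that remain; the values below are copied from the removed hunk, with `PYTORCH=pre` presumably selecting a PyTorch pre-release inside the Dockerfile. Shown here only to make the removed block easier to read:

```yaml
- name: Build and push
  uses: docker/build-push-action@v2
  with:
    context: ./docker/transformers-all-latest-gpu
    build-args: |
      REF=main
      PYTORCH=pre        # presumably installs a PyTorch pre-release (nightly) build
    push: true
    tags: huggingface/transformers-all-latest-torch-nightly-gpu
```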
							
								
								
									
.github/workflows/build-past-ci-docker-images.yml (vendored): 108 changes, file deleted

@@ -1,108 +0,0 @@
 | 
			
		||||
name: Build docker images (Past CI)

on:
  push:
    branches:
      - past-ci-docker-image*

concurrency:
  group: docker-images-builds
  cancel-in-progress: false

jobs:
  past-pytorch-docker:
    name: "Past PyTorch Docker"
    strategy:
      fail-fast: false
      matrix:
        version: ["1.11", "1.10", "1.9", "1.8", "1.7", "1.6", "1.5", "1.4"]
    runs-on: ubuntu-latest
    steps:
      -
        name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v1
      -
        name: Check out code
        uses: actions/checkout@v2
      -
        name: Login to DockerHub
        uses: docker/login-action@v1
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}
      -
        name: Build and push
        uses: docker/build-push-action@v2
        with:
          context: ./docker/transformers-past-gpu
          build-args: |
            REF=main
            FRAMEWORK=pytorch
            VERSION=${{ matrix.version }}
          push: true
          tags: huggingface/transformers-pytorch-past-${{ matrix.version }}-gpu

  past-tensorflow-docker:
    name: "Past TensorFlow Docker"
    strategy:
      fail-fast: false
      matrix:
        version: ["2.8", "2.7", "2.6", "2.5"]
    runs-on: ubuntu-latest
    steps:
      -
        name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v1
      -
        name: Check out code
        uses: actions/checkout@v2
      -
        name: Login to DockerHub
        uses: docker/login-action@v1
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}
      -
        name: Build and push
        uses: docker/build-push-action@v2
        with:
          context: ./docker/transformers-past-gpu
          build-args: |
            REF=main
            FRAMEWORK=tensorflow
            VERSION=${{ matrix.version }}
          push: true
          tags: huggingface/transformers-tensorflow-past-${{ matrix.version }}-gpu

  past-tensorflow-docker-2-4:
    name: "Past TensorFlow Docker"
    strategy:
      fail-fast: false
      matrix:
        version: ["2.4"]
    runs-on: ubuntu-latest
    steps:
      -
        name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v1
      -
        name: Check out code
        uses: actions/checkout@v2
      -
        name: Login to DockerHub
        uses: docker/login-action@v1
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}
      -
        name: Build and push
        uses: docker/build-push-action@v2
        with:
          context: ./docker/transformers-past-gpu
          build-args: |
            REF=main
            BASE_DOCKER_IMAGE=nvidia/cuda:11.0.3-cudnn8-devel-ubuntu20.04
            FRAMEWORK=tensorflow
            VERSION=${{ matrix.version }}
          push: true
          tags: huggingface/transformers-tensorflow-past-${{ matrix.version }}-gpu
2	.github/workflows/model-templates.yml (vendored)
@ -21,7 +21,7 @@ jobs:
        id: cache
        with:
          path: ~/venv/
          key: v4-tests_templates-${{ hashFiles('setup.py') }}
          key: v3-tests_templates-${{ hashFiles('setup.py') }}

      - name: Create virtual environment on cache miss
        if: steps.cache.outputs.cache-hit != 'true'
424	.github/workflows/self-nightly-scheduled.yml (vendored)
@ -1,236 +1,250 @@
name: Self-hosted runner (nightly)

# Note that each job's dependencies go into a corresponding docker file.
#
# For example for `run_all_tests_torch_cuda_extensions_gpu` the docker image is
# `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at
# `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile`
name: Self-hosted runner; Nightly (scheduled)

on:
  repository_dispatch:
  schedule:
    - cron: "0 16 * * *"
    push:
        branches:
            - nightly_ci*
    repository_dispatch:
    schedule:
        - cron: "0 0 */3 * *"

env:
  HF_HOME: /mnt/cache
  TRANSFORMERS_IS_CI: yes
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
  RUN_SLOW: yes
  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
  TF_FORCE_GPU_ALLOW_GROWTH: true
  RUN_PT_TF_CROSS_TESTS: 1
    HF_HOME: /mnt/cache
    TRANSFORMERS_IS_CI: yes
    RUN_SLOW: yes
    OMP_NUM_THREADS: 16
    MKL_NUM_THREADS: 16
    PYTEST_TIMEOUT: 600
    SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}

jobs:
  setup:
    name: Setup
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
    container:
      image: huggingface/transformers-all-latest-torch-nightly-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - name: Update clone
        working-directory: /transformers
        run: |
          git fetch && git checkout ${{ github.sha }}
    run_all_tests_torch_gpu:
        runs-on: [self-hosted, docker-gpu, single-gpu]
        container:
            image: pytorch/pytorch:1.10.0-cuda11.3-cudnn8-runtime
            options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
        steps:
            - name: Launcher docker
              uses: actions/checkout@v2

      - name: Cleanup
        working-directory: /transformers
        run: |
          rm -rf tests/__pycache__
          rm -rf tests/models/__pycache__
          rm -rf reports
            - name: NVIDIA-SMI
              run: |
                  nvidia-smi

      - id: set-matrix
        name: Identify models to test
        working-directory: /transformers/tests
        run: |
          echo "::set-output name=matrix::$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')"
            - name: Install dependencies
              run: |
                  apt -y update && apt install -y libsndfile1-dev git espeak-ng
                  pip install --upgrade pip
                  pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
                  pip install https://github.com/kpu/kenlm/archive/master.zip
                  pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html -U

      - name: NVIDIA-SMI
        run: |
          nvidia-smi
            - name: Are GPUs recognized by our DL frameworks
              run: |
                utils/print_env.py

  run_tests_single_gpu:
    name: Model tests
    strategy:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
        machine_type: [single-gpu]
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
    container:
      image: huggingface/transformers-all-latest-torch-nightly-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    needs: setup
    steps:
      - name: Echo folder ${{ matrix.folders }}
        shell: bash
        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
        # set the artifact folder names (because the character `/` is not allowed).
        run: |
          echo "${{ matrix.folders }}"
          matrix_folders=${{ matrix.folders }}
          matrix_folders=${matrix_folders/'models/'/'models_'}
          echo "$matrix_folders"
          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
            - name: Run all tests on GPU
              run: |
                  python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_gpu tests

      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}
            - name: Failure short reports
              if: ${{ always() }}
              run: cat reports/tests_torch_gpu/failures_short.txt

      - name: NVIDIA-SMI
        run: |
          nvidia-smi
            - name: Run examples tests on GPU
              if: ${{ always() }}
              env:
                  OMP_NUM_THREADS: 16
                  MKL_NUM_THREADS: 16
                  RUN_SLOW: yes
                  HF_HOME: /mnt/cache
                  TRANSFORMERS_IS_CI: yes
              run: |
                  pip install -r examples/pytorch/_tests_requirements.txt
                  python -m pytest -n 1 -v --dist=loadfile --make-reports=examples_torch_gpu examples

      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py
            - name: Failure short reports
              if: ${{ always() }}
              run: cat reports/examples_torch_gpu/failures_short.txt

      - name: Run all tests on GPU
        working-directory: /transformers
        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
            - name: Run all pipeline tests on GPU
              if: ${{ always() }}
              env:
                  RUN_PIPELINE_TESTS: yes
              run: |
                  python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
            - name: Failure short reports
              if: ${{ always() }}
              run: cat reports/tests_torch_pipeline_gpu/failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
            - name: Test suite reports artifacts
              if: ${{ always() }}
              uses: actions/upload-artifact@v2
              with:
                  name: run_all_tests_torch_gpu_test_reports
                  path: reports

  run_tests_multi_gpu:
    name: Model tests
    strategy:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
        machine_type: [multi-gpu]
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
    container:
      image: huggingface/transformers-all-latest-torch-nightly-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    needs: setup
    steps:
      - name: Echo folder ${{ matrix.folders }}
        shell: bash
        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
        # set the artifact folder names (because the character `/` is not allowed).
        run: |
          echo "${{ matrix.folders }}"
          matrix_folders=${{ matrix.folders }}
          matrix_folders=${matrix_folders/'models/'/'models_'}
          echo "$matrix_folders"
          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
    run_all_tests_torch_multi_gpu:
        runs-on: [self-hosted, docker-gpu, multi-gpu]
        container:
            image: pytorch/pytorch:1.10.0-cuda11.3-cudnn8-runtime
            options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
        steps:
            - name: Launcher docker
              uses: actions/checkout@v2

      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}
            - name: NVIDIA-SMI
              continue-on-error: true
              run: |
                  nvidia-smi

      - name: NVIDIA-SMI
        run: |
          nvidia-smi
            - name: Install dependencies
              run: |
                  apt -y update && apt install -y libsndfile1-dev git espeak-ng
                  pip install --upgrade pip
                  pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
                  pip install https://github.com/kpu/kenlm/archive/master.zip
                  pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html -U

      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py
            - name: Are GPUs recognized by our DL frameworks
              run: |
                utils/print_env.py

      - name: Run all tests on GPU
        working-directory: /transformers
        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
            - name: Run all tests on GPU
              env:
                  MKL_SERVICE_FORCE_INTEL: 1
              run: |
                  python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_multi_gpu tests

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
            - name: Failure short reports
              if: ${{ always() }}
              run: cat reports/tests_torch_multi_gpu/failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
            - name: Run all pipeline tests on GPU
              if: ${{ always() }}
              env:
                  RUN_PIPELINE_TESTS: yes
              run: |
                  python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests

  run_all_tests_torch_cuda_extensions_gpu:
    name: Torch CUDA extension tests
    strategy:
      fail-fast: false
      matrix:
        machine_type: [single-gpu, multi-gpu]
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
    needs: setup
    container:
      image: huggingface/transformers-pytorch-deepspeed-nightly-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Update clone
        working-directory: /workspace/transformers
        run: git fetch && git checkout ${{ github.sha }}
            - name: Failure short reports
              if: ${{ always() }}
              run: cat reports/tests_torch_pipeline_multi_gpu/failures_short.txt

      # To avoid unknown test failures
      - name: Pre build DeepSpeed *again*
        working-directory: /workspace
        run: |
          python3 -m pip uninstall -y deepspeed
          rm -rf DeepSpeed
          git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
          DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
            - name: Test suite reports artifacts
              if: ${{ always() }}
              uses: actions/upload-artifact@v2
              with:
                  name: run_all_tests_torch_multi_gpu_test_reports
                  path: reports

      - name: NVIDIA-SMI
        run: |
          nvidia-smi
    run_all_tests_torch_cuda_extensions_gpu:
        runs-on: [self-hosted, docker-gpu, single-gpu]
        container:
            image: nvcr.io/nvidia/pytorch:21.03-py3
            options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
        steps:
            - name: Launcher docker
              uses: actions/checkout@v2

      - name: Environment
        working-directory: /workspace/transformers
        run: |
          python utils/print_env.py
            - name: NVIDIA-SMI
              run: |
                  nvidia-smi

      - name: Run all tests on GPU
        working-directory: /workspace/transformers
        run: |
          python -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
            - name: Install dependencies
              run: |
                  apt -y update && apt install -y libaio-dev libsndfile1-dev git espeak-ng
                  pip install --upgrade pip
                  pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html -U
                  pip install .[deepspeed-testing]
                  pip install https://github.com/kpu/kenlm/archive/master.zip
                  pip install git+https://github.com/microsoft/DeepSpeed

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt
            - name: Are GPUs recognized by our DL frameworks
              run: |
                utils/print_env.py

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports
          path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu
            - name: Run all tests on GPU
              run: |
                  python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended

  send_results:
    name: Send results to webhook
    runs-on: ubuntu-latest
    if: always()
    needs: [setup, run_tests_single_gpu, run_tests_multi_gpu, run_all_tests_torch_cuda_extensions_gpu]
    steps:
      - uses: actions/checkout@v2
      - uses: actions/download-artifact@v2
      - name: Send message to Slack
        env:
          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
          CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
          CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
          CI_EVENT: nightly-build
        # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
        run: |
          pip install slack_sdk
          python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
            - name: Failure short reports
              if: ${{ always() }}
              run: cat reports/tests_torch_cuda_extensions_gpu/failures_short.txt

            - name: Test suite reports artifacts
              if: ${{ always() }}
              uses: actions/upload-artifact@v2
              with:
                  name: run_tests_torch_cuda_extensions_gpu_test_reports
                  path: reports

    run_all_tests_torch_cuda_extensions_multi_gpu:
        runs-on: [self-hosted, docker-gpu, multi-gpu]
        container:
            image: nvcr.io/nvidia/pytorch:21.03-py3
            options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
        steps:
            - name: Launcher docker
              uses: actions/checkout@v2

            - name: NVIDIA-SMI
              continue-on-error: true
              run: |
                  nvidia-smi

            - name: Install dependencies
              run: |
                  apt -y update && apt install -y libaio-dev libsndfile1-dev git espeak-ng
                  pip install --upgrade pip
                  pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html -U
                  rm -rf ~/.cache/torch_extensions/ # shared between conflicting builds
                  pip install .[testing,fairscale]
                  pip install https://github.com/kpu/kenlm/archive/master.zip
                  pip install git+https://github.com/microsoft/DeepSpeed # testing bleeding edge

            - name: Are GPUs recognized by our DL frameworks
              run: |
                utils/print_env.py

            - name: Run all tests on GPU
              run: |
                  python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended

            - name: Failure short reports
              if: ${{ always() }}
              run: cat reports/tests_torch_cuda_extensions_multi_gpu/failures_short.txt

            - name: Test suite reports artifacts
              if: ${{ always() }}
              uses: actions/upload-artifact@v2
              with:
                  name: run_tests_torch_cuda_extensions_multi_gpu_test_reports
                  path: reports

    send_results:
        name: Send results to webhook
        runs-on: ubuntu-latest
        if: always()
        needs: [
                run_all_tests_torch_gpu,
                run_all_tests_torch_multi_gpu,
                run_all_tests_torch_cuda_extensions_gpu,
                run_all_tests_torch_cuda_extensions_multi_gpu
        ]
        steps:
            - uses: actions/checkout@v2

            - uses: actions/download-artifact@v2

            - name: Send message to Slack
              env:
                  CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
                  CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
                  CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
                  CI_SLACK_CHANNEL_ID_PAST_FUTURE: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}

              run: |
                  pip install slack_sdk
                  python utils/notification_service.py scheduled nightly-torch
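
For readability, a rough, line-by-line equivalent of the python3 one-liner in the `Identify models to test` step above is sketched here. It is illustrative only and assumes, as the step's working directory suggests, that it runs from the repository's tests/ folder:

import os

tests = os.getcwd()  # assumed to be .../transformers/tests
model_tests = os.listdir(os.path.join(tests, "models"))

# Top-level test folders (relative names); "models" itself is removed below.
d1 = sorted(filter(os.path.isdir, os.listdir(tests)))
# Per-model folders, kept as "models/<name>" so pytest paths resolve from tests/.
d2 = sorted(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))
d1.remove("models")

# The job matrix: model folders first, then the remaining top-level folders.
print(d2 + d1)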
136	.github/workflows/self-past-caller.yml (vendored)
@ -1,136 +0,0 @@
name: Self-hosted runner (past-ci-caller)

on:
  push:
    branches:
      - run-past-ci*

jobs:
  run_past_ci_pytorch_1-11:
    name: PyTorch 1.11
    if: always()
    uses: ./.github/workflows/self-past.yml
    with:
      framework: pytorch
      version: "1.11"
    secrets: inherit

  run_past_ci_pytorch_1-10:
    name: PyTorch 1.10
    if: always()
    needs: [run_past_ci_pytorch_1-11]
    uses: ./.github/workflows/self-past.yml
    with:
      framework: pytorch
      version: "1.10"
    secrets: inherit

  run_past_ci_pytorch_1-9:
    name: PyTorch 1.9
    if: always()
    needs: [run_past_ci_pytorch_1-10]
    uses: ./.github/workflows/self-past.yml
    with:
      framework: pytorch
      version: "1.9"
    secrets: inherit

  run_past_ci_pytorch_1-8:
    name: PyTorch 1.8
    if: always()
    needs: [run_past_ci_pytorch_1-9]
    uses: ./.github/workflows/self-past.yml
    with:
      framework: pytorch
      version: "1.8"
    secrets: inherit

  run_past_ci_pytorch_1-7:
    name: PyTorch 1.7
    if: always()
    needs: [run_past_ci_pytorch_1-8]
    uses: ./.github/workflows/self-past.yml
    with:
      framework: pytorch
      version: "1.7"
    secrets: inherit

  run_past_ci_pytorch_1-6:
    name: PyTorch 1.6
    if: always()
    needs: [run_past_ci_pytorch_1-7]
    uses: ./.github/workflows/self-past.yml
    with:
      framework: pytorch
      version: "1.6"
    secrets: inherit

  run_past_ci_pytorch_1-5:
    name: PyTorch 1.5
    if: always()
    needs: [run_past_ci_pytorch_1-6]
    uses: ./.github/workflows/self-past.yml
    with:
      framework: pytorch
      version: "1.5"
    secrets: inherit

  run_past_ci_pytorch_1-4:
    name: PyTorch 1.4
    if: always()
    needs: [run_past_ci_pytorch_1-5]
    uses: ./.github/workflows/self-past.yml
    with:
      framework: pytorch
      version: "1.4"
    secrets: inherit

  run_past_ci_tensorflow_2-8:
    name: TensorFlow 2.8
    if: always()
    needs: [run_past_ci_pytorch_1-4]
    uses: ./.github/workflows/self-past.yml
    with:
      framework: tensorflow
      version: "2.8"
    secrets: inherit

  run_past_ci_tensorflow_2-7:
    name: TensorFlow 2.7
    if: always()
    needs: [run_past_ci_tensorflow_2-8]
    uses: ./.github/workflows/self-past.yml
    with:
      framework: tensorflow
      version: "2.7"
    secrets: inherit

  run_past_ci_tensorflow_2-6:
    name: TensorFlow 2.6
    if: always()
    needs: [run_past_ci_tensorflow_2-7]
    uses: ./.github/workflows/self-past.yml
    with:
      framework: tensorflow
      version: "2.6"
    secrets: inherit

  run_past_ci_tensorflow_2-5:
    name: TensorFlow 2.5
    if: always()
    needs: [run_past_ci_tensorflow_2-6]
    uses: ./.github/workflows/self-past.yml
    with:
      framework: tensorflow
      version: "2.5"
    secrets: inherit

  run_past_ci_tensorflow_2-4:
    name: TensorFlow 2.4
    if: always()
    needs: [run_past_ci_tensorflow_2-5]
    uses: ./.github/workflows/self-past.yml
    with:
      framework: tensorflow
      version: "2.4"
    secrets: inherit
192	.github/workflows/self-past.yml (vendored)
@ -1,192 +0,0 @@
name: Self-hosted runner (past)

# Note that each job's dependencies go into a corresponding docker file.
#
# For example for `run_all_tests_torch_cuda_extensions_gpu` the docker image is
# `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at
# `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile`

on:
  workflow_call:
    inputs:
      framework:
        required: true
        type: string
      version:
        required: true
        type: string

env:
  HF_HOME: /mnt/cache
  TRANSFORMERS_IS_CI: yes
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
  RUN_SLOW: yes
  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
  TF_FORCE_GPU_ALLOW_GROWTH: true
  RUN_PT_TF_CROSS_TESTS: 1

jobs:
  setup:
    name: Setup
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - name: Checkout transformers
        uses: actions/checkout@v2
        with:
          fetch-depth: 2

      - name: Cleanup
        run: |
          rm -rf tests/__pycache__
          rm -rf tests/models/__pycache__
          rm -rf reports

      - id: set-matrix
        name: Identify models to test
        run: |
          cd tests
          echo "::set-output name=matrix::$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')"

  run_tests_single_gpu:
    name: Model tests
    strategy:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
        machine_type: [single-gpu]
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
    container:
      image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    needs: setup
    steps:
      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}

      - name: Echo folder ${{ matrix.folders }}
        shell: bash
        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
        # set the artifact folder names (because the character `/` is not allowed).
        run: |
          echo "${{ matrix.folders }}"
          matrix_folders=${{ matrix.folders }}
          matrix_folders=${matrix_folders/'models/'/'models_'}
          echo "$matrix_folders"
          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py

      - name: Run all tests on GPU
        working-directory: /transformers
        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}

  run_tests_multi_gpu:
    name: Model tests
    strategy:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
        machine_type: [multi-gpu]
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
    container:
      image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    needs: setup
    steps:
      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}

      - name: Echo folder ${{ matrix.folders }}
        shell: bash
        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
        # set the artifact folder names (because the character `/` is not allowed).
        run: |
          echo "${{ matrix.folders }}"
          matrix_folders=${{ matrix.folders }}
          matrix_folders=${matrix_folders/'models/'/'models_'}
          echo "$matrix_folders"
          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py

      - name: Run all tests on GPU
        working-directory: /transformers
        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}

  send_results:
    name: Send results to webhook
    runs-on: ubuntu-latest
    if: always()
    needs: [setup, run_tests_single_gpu, run_tests_multi_gpu]
    steps:
      - uses: actions/checkout@v2
      - uses: actions/download-artifact@v2

      # Create a directory to store test failure tables in the next step
      - name: Create directory
        run: mkdir test_failure_tables

      - name: Send message to Slack
        env:
          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
          CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
          CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
          CI_EVENT: Past CI - ${{ inputs.framework }}-${{ inputs.version }}
        # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
        run: |
          pip install slack_sdk
          python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"

      # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
      - name: Failure table artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: test_failure_tables_${{ inputs.framework }}-${{ inputs.version }}
          path: test_failure_tables
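
The comments in the `Echo folder` and `Send message to Slack` steps above describe the same naming convention: a test folder such as `models/bert` maps to the artifact name `models_bert`, because `/` is not allowed in artifact names. A minimal illustrative sketch of that mapping (not part of the workflow itself):

def folder_to_artifact_name(folder: str) -> str:
    # Mirrors the bash substitution ${matrix_folders/'models/'/'models_'}:
    # only the first "models/" prefix is rewritten; other folders pass through.
    return folder.replace("models/", "models_", 1)

assert folder_to_artifact_name("models/bert") == "models_bert"
assert folder_to_artifact_name("pipelines") == "pipelines"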
47	.github/workflows/self-push-caller.yml (vendored)
@ -1,4 +1,3 @@
# Used to trigger self-push CI
name: Self-hosted runner (push-caller)

on:
@ -13,40 +12,18 @@ on:
      - "utils/**"

jobs:
  check-for-setup:
      runs-on: ubuntu-latest
      name: Check if setup was changed
      outputs:
        changed: ${{ steps.was_changed.outputs.changed }}
      steps:
        - uses: actions/checkout@v3
          with:
            fetch-depth: "2"

        - name: Get changed files
          id: changed-files
          uses: tj-actions/changed-files@v22.2

        - name: Was setup changed
          id: was_changed
          run: |
            for file in ${{ steps.changed-files.outputs.all_changed_files }}; do
              if [ `basename "${file}"` = "setup.py" ]; then
                echo ::set-output name=changed::"1"
              fi
            done

  build-docker-containers:
    needs: check-for-setup
    if: (github.event_name == 'push') && (needs.check-for-setup.outputs.changed == '1')
    uses: ./.github/workflows/build-docker-images.yml
    secrets: inherit

  run_push_ci:
    name: Trigger Push CI
    name: Run Push CI
    runs-on: ubuntu-latest
    if: ${{ always() }}
    needs: build-docker-containers
    steps:
      - name: Trigger push CI via workflow_run
        run: echo "Trigger push CI via workflow_run"
      - name: Checkout transformers
        uses: actions/checkout@v2
        with:
          fetch-depth: 2
          ssh-key: "${{ secrets.COMMIT_KEY }}"

      - name: Checkout to branch push-ci
        # A more strict way to make sure `push-ci` is exactly the same as `main` at the push event commit.
        run: |
          git checkout -b push-ci
          git push -u origin push-ci --force
										224
									
								
								.github/workflows/self-push.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										224
									
								
								.github/workflows/self-push.yml
									
									
									
									
										vendored
									
									
								
							@ -1,12 +1,9 @@
 | 
			
		||||
name: Self-hosted runner (push)
 | 
			
		||||
 | 
			
		||||
on:
 | 
			
		||||
  workflow_run:
 | 
			
		||||
    workflows: ["Self-hosted runner (push-caller)"]
 | 
			
		||||
    branches: ["main"]
 | 
			
		||||
    types: [completed]
 | 
			
		||||
  push:
 | 
			
		||||
    branches:
 | 
			
		||||
      - push-ci
 | 
			
		||||
      - ci_*
 | 
			
		||||
      - ci-*
 | 
			
		||||
    paths:
 | 
			
		||||
@ -34,47 +31,11 @@ jobs:
 | 
			
		||||
      matrix: ${{ steps.set-matrix.outputs.matrix }}
 | 
			
		||||
      test_map: ${{ steps.set-matrix.outputs.test_map }}
 | 
			
		||||
    steps:
 | 
			
		||||
      # Necessary to get the correct branch name and commit SHA for `workflow_run` event
 | 
			
		||||
      # We also take into account the `push` event (we might want to test some changes in a branch)
 | 
			
		||||
      - name: Prepare custom environment variables
 | 
			
		||||
        shell: bash
 | 
			
		||||
        # `CI_BRANCH_PUSH`: The branch name from the push event
 | 
			
		||||
        # `CI_BRANCH_WORKFLOW_RUN`: The name of the branch on which this workflow is triggered by `workflow_run` event
 | 
			
		||||
        # `CI_BRANCH`: The non-empty branch name from the above two (one and only one of them is empty)
 | 
			
		||||
        # `CI_SHA_PUSH`: The commit SHA from the push event
 | 
			
		||||
        # `CI_SHA_WORKFLOW_RUN`: The commit SHA that triggers this workflow by `workflow_run` event
 | 
			
		||||
        # `CI_SHA`: The non-empty commit SHA from the above two (one and only one of them is empty)
 | 
			
		||||
        run: |
 | 
			
		||||
          CI_BRANCH_PUSH=${{ github.event.ref }}
 | 
			
		||||
          CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
 | 
			
		||||
          CI_BRANCH_WORKFLOW_RUN=${{ github.event.workflow_run.head_branch }}
 | 
			
		||||
          CI_SHA_PUSH=${{ github.event.head_commit.id }}
 | 
			
		||||
          CI_SHA_WORKFLOW_RUN=${{ github.event.workflow_run.head_sha }}
 | 
			
		||||
          echo $CI_BRANCH_PUSH
 | 
			
		||||
          echo $CI_BRANCH_WORKFLOW_RUN
 | 
			
		||||
          echo $CI_SHA_PUSH
 | 
			
		||||
          echo $CI_SHA_WORKFLOW_RUN
 | 
			
		||||
          [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
 | 
			
		||||
          [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
 | 
			
		||||
 | 
			
		||||
      - name: print environment variables
 | 
			
		||||
        run: |
 | 
			
		||||
          echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
 | 
			
		||||
          echo "env.CI_SHA = ${{ env.CI_SHA }}"
 | 
			
		||||
 | 
			
		||||
      - name: Checkout transformers
 | 
			
		||||
        uses: actions/checkout@v2
 | 
			
		||||
        with:
 | 
			
		||||
          fetch-depth: 2
 | 
			
		||||
 | 
			
		||||
      - name: Update clone using environment variables
 | 
			
		||||
        run: |
 | 
			
		||||
          echo "original branch = $(git branch --show-current)"
 | 
			
		||||
          git fetch && git checkout ${{ env.CI_BRANCH }}
 | 
			
		||||
          echo "updated branch = $(git branch --show-current)"
 | 
			
		||||
          git checkout ${{ env.CI_SHA }}
 | 
			
		||||
          echo "log = $(git log -n 1)"
 | 
			
		||||
 | 
			
		||||
      - name: Cleanup
 | 
			
		||||
        run: |
 | 
			
		||||
          rm -rf tests/__pycache__
 | 
			
		||||
@ -126,38 +87,6 @@ jobs:
 | 
			
		||||
      image: huggingface/transformers-all-latest-gpu
 | 
			
		||||
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
 | 
			
		||||
    steps:
 | 
			
		||||
      # Necessary to get the correct branch name and commit SHA for `workflow_run` event
 | 
			
		||||
      # We also take into account the `push` event (we might want to test some changes in a branch)
 | 
			
		||||
      - name: Prepare custom environment variables
 | 
			
		||||
        shell: bash
 | 
			
		||||
        # For the meaning of these environment variables, see the job `Setup`
 | 
			
		||||
        run: |
 | 
			
		||||
          CI_BRANCH_PUSH=${{ github.event.ref }}
 | 
			
		||||
          CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
 | 
			
		||||
          CI_BRANCH_WORKFLOW_RUN=${{ github.event.workflow_run.head_branch }}
 | 
			
		||||
          CI_SHA_PUSH=${{ github.event.head_commit.id }}
 | 
			
		||||
          CI_SHA_WORKFLOW_RUN=${{ github.event.workflow_run.head_sha }}
 | 
			
		||||
          echo $CI_BRANCH_PUSH
 | 
			
		||||
          echo $CI_BRANCH_WORKFLOW_RUN
 | 
			
		||||
          echo $CI_SHA_PUSH
 | 
			
		||||
          echo $CI_SHA_WORKFLOW_RUN
 | 
			
		||||
          [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
 | 
			
		||||
          [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
 | 
			
		||||
 | 
			
		||||
      - name: print environment variables
 | 
			
		||||
        run: |
 | 
			
		||||
          echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
 | 
			
		||||
          echo "env.CI_SHA = ${{ env.CI_SHA }}"
 | 
			
		||||
 | 
			
		||||
      - name: Update clone using environment variables
 | 
			
		||||
        working-directory: /transformers
 | 
			
		||||
        run: |
 | 
			
		||||
          echo "original branch = $(git branch --show-current)"
 | 
			
		||||
          git fetch && git checkout ${{ env.CI_BRANCH }}
 | 
			
		||||
          echo "updated branch = $(git branch --show-current)"
 | 
			
		||||
          git checkout ${{ env.CI_SHA }}
 | 
			
		||||
          echo "log = $(git log -n 1)"
 | 
			
		||||
 | 
			
		||||
      - name: Echo folder ${{ matrix.folders }}
 | 
			
		||||
        shell: bash
 | 
			
		||||
        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
 | 
			
		||||
@ -170,6 +99,10 @@ jobs:
 | 
			
		||||
          echo "$matrix_folders"
 | 
			
		||||
          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
 | 
			
		||||
 | 
			
		||||
      - name: Update clone
 | 
			
		||||
        working-directory: /transformers
 | 
			
		||||
        run: git fetch && git checkout ${{ github.sha }}
 | 
			
		||||
 | 
			
		||||
      - name: NVIDIA-SMI
 | 
			
		||||
        run: |
 | 
			
		||||
          nvidia-smi
 | 
			
		||||
@ -211,38 +144,6 @@ jobs:
 | 
			
		||||
      image: huggingface/transformers-all-latest-gpu
 | 
			
		||||
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
 | 
			
		||||
    steps:
 | 
			
		||||
      # Necessary to get the correct branch name and commit SHA for `workflow_run` event
 | 
			
		||||
      # We also take into account the `push` event (we might want to test some changes in a branch)
 | 
			
		||||
      - name: Prepare custom environment variables
 | 
			
		||||
        shell: bash
 | 
			
		||||
        # For the meaning of these environment variables, see the job `Setup`
 | 
			
		||||
        run: |
 | 
			
		||||
          CI_BRANCH_PUSH=${{ github.event.ref }}
 | 
			
		||||
          CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
 | 
			
		||||
          CI_BRANCH_WORKFLOW_RUN=${{ github.event.workflow_run.head_branch }}
 | 
			
		||||
          CI_SHA_PUSH=${{ github.event.head_commit.id }}
 | 
			
		||||
          CI_SHA_WORKFLOW_RUN=${{ github.event.workflow_run.head_sha }}
 | 
			
		||||
          echo $CI_BRANCH_PUSH
 | 
			
		||||
          echo $CI_BRANCH_WORKFLOW_RUN
 | 
			
		||||
          echo $CI_SHA_PUSH
 | 
			
		||||
          echo $CI_SHA_WORKFLOW_RUN
 | 
			
		||||
          [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
 | 
			
		||||
          [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
 | 
			
		||||
 | 
			
		||||
      - name: print environment variables
 | 
			
		||||
        run: |
 | 
			
		||||
          echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
 | 
			
		||||
          echo "env.CI_SHA = ${{ env.CI_SHA }}"
 | 
			
		||||
 | 
			
		||||
      - name: Update clone using environment variables
 | 
			
		||||
        working-directory: /transformers
 | 
			
		||||
        run: |
 | 
			
		||||
          echo "original branch = $(git branch --show-current)"
 | 
			
		||||
          git fetch && git checkout ${{ env.CI_BRANCH }}
 | 
			
		||||
          echo "updated branch = $(git branch --show-current)"
 | 
			
		||||
          git checkout ${{ env.CI_SHA }}
 | 
			
		||||
          echo "log = $(git log -n 1)"
 | 
			
		||||
 | 
			
		||||
      - name: Echo folder ${{ matrix.folders }}
 | 
			
		||||
        shell: bash
 | 
			
		||||
        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
 | 
			
		||||
@@ -255,6 +156,10 @@ jobs:
          echo "$matrix_folders"
          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV

      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}

      - name: NVIDIA-SMI
        run: |
          nvidia-smi
@@ -296,41 +201,13 @@ jobs:
      image: huggingface/transformers-pytorch-deepspeed-latest-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      # Necessary to get the correct branch name and commit SHA for `workflow_run` event
      # We also take into account the `push` event (we might want to test some changes in a branch)
      - name: Prepare custom environment variables
        shell: bash
        # For the meaning of these environment variables, see the job `Setup`
        run: |
          CI_BRANCH_PUSH=${{ github.event.ref }}
          CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
          CI_BRANCH_WORKFLOW_RUN=${{ github.event.workflow_run.head_branch }}
          CI_SHA_PUSH=${{ github.event.head_commit.id }}
          CI_SHA_WORKFLOW_RUN=${{ github.event.workflow_run.head_sha }}
          echo $CI_BRANCH_PUSH
          echo $CI_BRANCH_WORKFLOW_RUN
          echo $CI_SHA_PUSH
          echo $CI_SHA_WORKFLOW_RUN
          [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
          [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV

      - name: print environment variables
        run: |
          echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
          echo "env.CI_SHA = ${{ env.CI_SHA }}"

      - name: Update clone using environment variables
      - name: Update clone
        working-directory: /workspace/transformers
        run: |
          echo "original branch = $(git branch --show-current)"
          git fetch && git checkout ${{ env.CI_BRANCH }}
          echo "updated branch = $(git branch --show-current)"
          git checkout ${{ env.CI_SHA }}
          echo "log = $(git log -n 1)"
        run: git fetch && git checkout ${{ github.sha }}

      # To avoid unknown test failures
      - name: Pre build DeepSpeed *again*
        working-directory: /workspace
        working-directory: /workspace/transformers
        run: |
          python3 -m pip uninstall -y deepspeed
          DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
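For reference, the `Pre build DeepSpeed *again*` step above amounts to the following shell sequence. The only addition here is the optional `ds_report` check at the end, which DeepSpeed installs alongside the package; everything else is copied from the step, and it assumes a CUDA image with a matching PyTorch already present:

```bash
#!/usr/bin/env bash
# Rebuild DeepSpeed with its CPU Adam, async-IO and utility ops compiled
# ahead of time, instead of letting them JIT-compile during the test run.
python3 -m pip uninstall -y deepspeed
DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 \
    python3 -m pip install deepspeed \
    --global-option="build_ext" --global-option="-j8" \
    --no-cache -v --disable-pip-version-check

# Optional sanity check: list which ops ended up pre-built vs. JIT-only.
ds_report
```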
@@ -340,12 +217,10 @@ jobs:
          nvidia-smi

      - name: Environment
        working-directory: /workspace/transformers
        run: |
          python utils/print_env.py

      - name: Run all non-slow selected tests on GPU
        working-directory: /workspace/transformers
        # TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests.
        run: |
          python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
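The pytest invocation above, together with the `Failure short reports` step in the next hunk, relies on the `--make-reports` option from the transformers test suite, which writes its output under `reports/<name>/` relative to the working directory. A rough local equivalent, where the report name is just an example value for `matrix.machine_type`:

```bash
#!/usr/bin/env bash
# Run the DeepSpeed/extended tests with report generation, then read the short
# failure summary the way the workflow's "Failure short reports" step does.
cd /workspace/transformers   # the step's working-directory; any checkout works locally

python -m pytest -n 1 --dist=loadfile -v \
    --make-reports=single-gpu_tests_torch_cuda_extensions_gpu \
    tests/deepspeed tests/extended

# Reports are written relative to the current directory, which is why the
# workflow can use `reports/...` instead of an absolute /workspace path.
cat reports/single-gpu_tests_torch_cuda_extensions_gpu/failures_short.txt
```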
@@ -353,14 +228,14 @@ jobs:
      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt
        run: cat reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports
          path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu
          path: reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu

  run_tests_torch_cuda_extensions_multi_gpu:
    name: Torch CUDA extension tests
@@ -375,41 +250,13 @@ jobs:
      image: huggingface/transformers-pytorch-deepspeed-latest-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      # Necessary to get the correct branch name and commit SHA for `workflow_run` event
      # We also take into account the `push` event (we might want to test some changes in a branch)
      - name: Prepare custom environment variables
        shell: bash
        # For the meaning of these environment variables, see the job `Setup`
        run: |
          CI_BRANCH_PUSH=${{ github.event.ref }}
          CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
          CI_BRANCH_WORKFLOW_RUN=${{ github.event.workflow_run.head_branch }}
          CI_SHA_PUSH=${{ github.event.head_commit.id }}
          CI_SHA_WORKFLOW_RUN=${{ github.event.workflow_run.head_sha }}
          echo $CI_BRANCH_PUSH
          echo $CI_BRANCH_WORKFLOW_RUN
          echo $CI_SHA_PUSH
          echo $CI_SHA_WORKFLOW_RUN
          [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
          [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV

      - name: print environment variables
        run: |
          echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
          echo "env.CI_SHA = ${{ env.CI_SHA }}"

      - name: Update clone using environment variables
      - name: Update clone
        working-directory: /workspace/transformers
        run: |
          echo "original branch = $(git branch --show-current)"
          git fetch && git checkout ${{ env.CI_BRANCH }}
          echo "updated branch = $(git branch --show-current)"
          git checkout ${{ env.CI_SHA }}
          echo "log = $(git log -n 1)"
        run: git fetch && git checkout ${{ github.sha }}

      # To avoid unknown test failures
      - name: Pre build DeepSpeed *again*
        working-directory: /workspace
        working-directory: /workspace/transformers
        run: |
          python3 -m pip uninstall -y deepspeed
          DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
@@ -453,39 +300,7 @@ jobs:
        run_tests_torch_cuda_extensions_multi_gpu
    ]
    steps:
      # Necessary to get the correct branch name and commit SHA for `workflow_run` event
      # We also take into account the `push` event (we might want to test some changes in a branch)
      - name: Prepare custom environment variables
        shell: bash
        # For the meaning of these environment variables, see the job `Setup`
        run: |
          CI_BRANCH_PUSH=${{ github.event.ref }}
          CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
          CI_BRANCH_WORKFLOW_RUN=${{ github.event.workflow_run.head_branch }}
          CI_SHA_PUSH=${{ github.event.head_commit.id }}
          CI_SHA_WORKFLOW_RUN=${{ github.event.workflow_run.head_sha }}
          echo $CI_BRANCH_PUSH
          echo $CI_BRANCH_WORKFLOW_RUN
          echo $CI_SHA_PUSH
          echo $CI_SHA_WORKFLOW_RUN
          [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
          [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV

      - name: print environment variables
        run: |
          echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
          echo "env.CI_SHA = ${{ env.CI_SHA }}"

      - uses: actions/checkout@v2

      - name: Update clone using environment variables
        run: |
          echo "original branch = $(git branch --show-current)"
          git fetch && git checkout ${{ env.CI_BRANCH }}
          echo "updated branch = $(git branch --show-current)"
          git checkout ${{ env.CI_SHA }}
          echo "log = $(git log -n 1)"

      - uses: actions/download-artifact@v2
      - name: Send message to Slack
        env:
@@ -495,9 +310,8 @@ jobs:
          CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
          CI_EVENT: push
          CI_TITLE_PUSH: ${{ github.event.head_commit.message }}
          CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }}
          CI_SHA: ${{ env.CI_SHA }}
          CI_TITLE: ${{ github.event.head_commit.message }}
          CI_COMMIT_URL: ${{ github.event.head_commit.url }}
        # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
        run: |

.github/workflows/self-scheduled.yml (2 changes, vendored)
@@ -308,7 +308,7 @@ jobs:

      # To avoid unknown test failures
      - name: Pre build DeepSpeed *again*
        working-directory: /workspace
        working-directory: /workspace/transformers
        run: |
          python3 -m pip uninstall -y deepspeed
          DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check

.github/workflows/update_metdata.yml (2 changes, vendored)
@@ -21,7 +21,7 @@ jobs:
        id: cache
        with:
          path: ~/venv/
          key: v3-metadata-${{ hashFiles('setup.py') }}
          key: v2-metadata-${{ hashFiles('setup.py') }}

      - name: Create virtual environment on cache miss
        if: steps.cache.outputs.cache-hit != 'true'

@@ -128,7 +128,7 @@ You will need basic `git` proficiency to be able to contribute to
manual. Type `git --help` in a shell and enjoy. If you prefer books, [Pro
Git](https://git-scm.com/book/en/v2) is a very good reference.

Follow these steps to start contributing ([supported Python versions](https://github.com/huggingface/transformers/blob/main/setup.py#L426)):
Follow these steps to start contributing:

1. Fork the [repository](https://github.com/huggingface/transformers) by
   clicking on the 'Fork' button on the repository's page. This creates a copy of the code
Makefile (2 changes)
@@ -51,7 +51,6 @@ quality:
	python utils/sort_auto_mappings.py --check_only
	flake8 $(check_dirs)
	doc-builder style src/transformers docs/source --max_len 119 --check_only --path_to_docs docs/source
	python utils/check_doc_toc.py

# Format source code automatically and check is there are any problems left that need manual fixing

@@ -59,7 +58,6 @@ extra_style_checks:
	python utils/custom_init_isort.py
	python utils/sort_auto_mappings.py
	doc-builder style src/transformers docs/source --max_len 119 --path_to_docs docs/source
	python utils/check_doc_toc.py --fix_and_overwrite

# this target runs checks on all files and potentially modifies some of them

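For context on the two Makefile targets touched above, a typical local invocation looks like this (assuming the development extras are installed, e.g. with `pip install -e ".[quality]"`, so that the linters and the `utils/` scripts are available):

```bash
#!/usr/bin/env bash
# Run from the repository root of a transformers checkout.
make quality              # check-only pass: auto-mapping sort check, flake8, doc checks
make extra_style_checks   # rewrites files in place: custom init isort, auto-mapping sort, doc ToC fix
```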
README.md (88 changes)
@@ -116,46 +116,22 @@ To immediately use a model on a given input (text, image, audio, ...), we provid

The second line of code downloads and caches the pretrained model used by the pipeline, while the third evaluates it on the given text. Here the answer is "positive" with a confidence of 99.97%.

Many tasks have a pre-trained `pipeline` ready to go, in NLP but also in computer vision and speech. For example, we can easily extract detected objects in an image:
Many NLP tasks have a pre-trained `pipeline` ready to go. For example, we can easily extract question answers given context:

``` python
>>> import requests
>>> from PIL import Image
>>> from transformers import pipeline

# Download an image with cute cats
>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png"
>>> image_data = requests.get(url, stream=True).raw
>>> image = Image.open(image_data)
# Allocate a pipeline for question-answering
>>> question_answerer = pipeline('question-answering')
>>> question_answerer({
...     'question': 'What is the name of the repository ?',
...     'context': 'Pipeline has been included in the huggingface/transformers repository'
... })
{'score': 0.30970096588134766, 'start': 34, 'end': 58, 'answer': 'huggingface/transformers'}

# Allocate a pipeline for object detection
>>> object_detector = pipeline('object_detection')
>>> object_detector(image)
[{'score': 0.9982201457023621,
  'label': 'remote',
  'box': {'xmin': 40, 'ymin': 70, 'xmax': 175, 'ymax': 117}},
 {'score': 0.9960021376609802,
  'label': 'remote',
  'box': {'xmin': 333, 'ymin': 72, 'xmax': 368, 'ymax': 187}},
 {'score': 0.9954745173454285,
  'label': 'couch',
  'box': {'xmin': 0, 'ymin': 1, 'xmax': 639, 'ymax': 473}},
 {'score': 0.9988006353378296,
  'label': 'cat',
  'box': {'xmin': 13, 'ymin': 52, 'xmax': 314, 'ymax': 470}},
 {'score': 0.9986783862113953,
  'label': 'cat',
  'box': {'xmin': 345, 'ymin': 23, 'xmax': 640, 'ymax': 368}}]
```

Here we get a list of objects detected in the image, with a box surrounding the object and a confidence score. Here is the original image on the right, with the predictions displayed on the left:

<h3 align="center">
    <a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png" width="400"></a>
    <a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample_post_processed.png" width="400"></a>
</h3>

You can learn more about the tasks supported by the `pipeline` API in [this tutorial](https://huggingface.co/docs/transformers/task_summary).
In addition to the answer, the pretrained model used here returned its confidence score, along with the start position and end position of the answer in the tokenized sentence. You can learn more about the tasks supported by the `pipeline` API in [this tutorial](https://huggingface.co/docs/transformers/task_summary).

To download and use any of the pretrained models on your given task, all it takes is three lines of code. Here is the PyTorch version:
```python
@@ -167,7 +143,6 @@ To download and use any of the pretrained models on your given task, all it take
>>> inputs = tokenizer("Hello world!", return_tensors="pt")
>>> outputs = model(**inputs)
```

And here is the equivalent code for TensorFlow:
```python
>>> from transformers import AutoTokenizer, TFAutoModel
@@ -265,19 +240,18 @@ Current number of checkpoints: ** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigSicence Workshop](https://bigscience.huggingface.co/).
1. **[BLOOM](https://huggingface.co/docs/transformers/main/model_doc/bloom)** (from BigScience workshop) released by the [BigSicence Workshop](https://bigscience.huggingface.co/).
1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
1. **[ConvNeXT](https://huggingface.co/docs/transformers/main/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec:  A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
1. **[CvT](https://huggingface.co/docs/transformers/main/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
1. **[Data2Vec](https://huggingface.co/docs/transformers/main/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec:  A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
@@ -291,48 +265,42 @@ Current number of checkpoints: ** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
1. **[FLAVA](https://huggingface.co/docs/transformers/main/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
1. **[GLPN](https://huggingface.co/docs/transformers/main/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach
1. **[GPT NeoX](https://huggingface.co/docs/transformers/main/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach
1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
1. **[ImageGPT](https://huggingface.co/docs/transformers/main/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
1. **[LayoutLMv3](https://huggingface.co/docs/transformers/main/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
1. **[LeViT](https://huggingface.co/docs/transformers/main/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
1. **[LongT5](https://huggingface.co/docs/transformers/main/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
1. **[M-CTC-T](https://huggingface.co/docs/transformers/main/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
1. **[MaskFormer](https://huggingface.co/docs/transformers/main/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen.
1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
@@ -343,7 +311,7 @@ Current number of checkpoints: ** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela.
1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
1. **[RegNet](https://huggingface.co/docs/transformers/main/model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
@@ -356,15 +324,13 @@ Current number of checkpoints: ** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/main/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
1. **[TAPEX](https://huggingface.co/docs/transformers/main/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/main/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
@@ -373,7 +339,7 @@ Current number of checkpoints: ** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/main/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
@ -384,7 +350,7 @@ Current number of checkpoints:
1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
1. **[YOLOS](https://huggingface.co/docs/transformers/main/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
1. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedback before starting your PR.
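Any checkpoint in the list above can be loaded through the library's `Auto` classes; the snippet below is a minimal sketch (the `bert-base-uncased` checkpoint and the input sentence are purely illustrative):

```python
# Minimal sketch: load a listed checkpoint with the Auto classes
# (the checkpoint name and input sentence below are illustrative only).
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

inputs = tokenizer("Hello world!", return_tensors="pt")  # PyTorch tensors
outputs = model(**inputs)                                 # forward pass
print(outputs.last_hidden_state.shape)                    # e.g. torch.Size([1, 5, 768])
```

The same pattern applies to the TensorFlow and Flax counterparts (`TFAutoModel`, `FlaxAutoModel`).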
README_ko.md
@ -221,19 +221,18 @@ See the Flax, PyTorch, and TensorFlow installation pages for installing these with conda
1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
 | 
			
		||||
1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
 | 
			
		||||
1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/).
1. **[BLOOM](https://huggingface.co/docs/transformers/main/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/).
1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
 | 
			
		||||
1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
 | 
			
		||||
1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
 | 
			
		||||
1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
 | 
			
		||||
1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
 | 
			
		||||
1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
 | 
			
		||||
1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
 | 
			
		||||
1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
 | 
			
		||||
1. **[ConvNeXT](https://huggingface.co/docs/transformers/main/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
 | 
			
		||||
1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
 | 
			
		||||
1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
 | 
			
		||||
1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
 | 
			
		||||
1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec:  A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
 | 
			
		||||
1. **[CvT](https://huggingface.co/docs/transformers/main/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
 | 
			
		||||
1. **[Data2Vec](https://huggingface.co/docs/transformers/main/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec:  A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
 | 
			
		||||
1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
 | 
			
		||||
1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
 | 
			
		||||
1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
@ -247,48 +246,42 @@ See the Flax, PyTorch, and TensorFlow installation pages for installing these with conda
1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
 | 
			
		||||
1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
 | 
			
		||||
1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
 | 
			
		||||
1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
 | 
			
		||||
1. **[FLAVA](https://huggingface.co/docs/transformers/main/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
 | 
			
		||||
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
 | 
			
		||||
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
 | 
			
		||||
1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
 | 
			
		||||
1. **[GLPN](https://huggingface.co/docs/transformers/main/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
 | 
			
		||||
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
 | 
			
		||||
1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
 | 
			
		||||
1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach 
 | 
			
		||||
1. **[GPT NeoX](https://huggingface.co/docs/transformers/main/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach 
 | 
			
		||||
1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
 | 
			
		||||
1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
 | 
			
		||||
1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
 | 
			
		||||
1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
 | 
			
		||||
1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 | 
			
		||||
1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
 | 
			
		||||
1. **[ImageGPT](https://huggingface.co/docs/transformers/main/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
 | 
			
		||||
1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
 | 
			
		||||
1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
 | 
			
		||||
1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
 | 
			
		||||
1. **[LayoutLMv3](https://huggingface.co/docs/transformers/main/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
 | 
			
		||||
1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
 | 
			
		||||
1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 | 
			
		||||
1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
 | 
			
		||||
1. **[LeViT](https://huggingface.co/docs/transformers/main/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
 | 
			
		||||
1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 | 
			
		||||
1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
 | 
			
		||||
1. **[LongT5](https://huggingface.co/docs/transformers/main/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
 | 
			
		||||
1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
 | 
			
		||||
1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
 | 
			
		||||
1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
 | 
			
		||||
1. **[M-CTC-T](https://huggingface.co/docs/transformers/main/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
 | 
			
		||||
1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
 | 
			
		||||
1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
 | 
			
		||||
1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
 | 
			
		||||
1. **[MaskFormer](https://huggingface.co/docs/transformers/main/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
 | 
			
		||||
1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
 | 
			
		||||
1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
 | 
			
		||||
1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
 | 
			
		||||
1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
 | 
			
		||||
1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
 | 
			
		||||
1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
 | 
			
		||||
1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
 | 
			
		||||
1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
 | 
			
		||||
1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
 | 
			
		||||
1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen.
 | 
			
		||||
1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
 | 
			
		||||
1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
 | 
			
		||||
1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
 | 
			
		||||
1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
 | 
			
		||||
1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
 | 
			
		||||
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
 | 
			
		||||
1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
 | 
			
		||||
1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
@ -299,7 +292,7 @@ See the Flax, PyTorch, and TensorFlow installation pages for installing these with conda
1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela.
 | 
			
		||||
1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
 | 
			
		||||
1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
 | 
			
		||||
1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Research) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. 
 | 
			
		||||
1. **[RegNet](https://huggingface.co/docs/transformers/main/model_doc/regnet)** (from META Research) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. 
 | 
			
		||||
1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
 | 
			
		||||
1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 
 | 
			
		||||
1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
@ -312,15 +305,13 @@ See the Flax, PyTorch, and TensorFlow installation pages for installing these with conda
1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
 | 
			
		||||
1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
 | 
			
		||||
1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
 | 
			
		||||
1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/main/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
 | 
			
		||||
1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
 | 
			
		||||
1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
 | 
			
		||||
1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
 | 
			
		||||
1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
 | 
			
		||||
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine 
 | 
			
		||||
1. **[TAPEX](https://huggingface.co/docs/transformers/main/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
 | 
			
		||||
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/main/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine 
 | 
			
		||||
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
 | 
			
		||||
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
 | 
			
		||||
1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler 
 | 
			
		||||
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
 | 
			
		||||
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
 | 
			
		||||
1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
@ -329,7 +320,7 @@ See the Flax, PyTorch, and TensorFlow installation pages for installing these with conda
1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
 | 
			
		||||
1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
 | 
			
		||||
1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
 | 
			
		||||
1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
 | 
			
		||||
1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/main/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
 | 
			
		||||
1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
 | 
			
		||||
1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
 | 
			
		||||
1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. 
@ -340,7 +331,7 @@ See the Flax, PyTorch, and TensorFlow installation pages for installing these with conda
1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
 | 
			
		||||
1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
 | 
			
		||||
1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
 | 
			
		||||
1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
 | 
			
		||||
1. **[YOLOS](https://huggingface.co/docs/transformers/main/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
1. Want to contribute a new model? We provide a **detailed guide and templates** to help you add a new model. You can find them in the [`templates`](./templates) folder of this repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md), and contact the maintainers or open an issue to collect feedback before opening your PR.
@ -375,4 +366,4 @@ See the Flax, PyTorch, and TensorFlow installation pages for installing these with conda
    url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
    pages = "38--45"
}
```
@ -245,19 +245,18 @@ conda install -c huggingface transformers
1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/).
1. **[BLOOM](https://huggingface.co/docs/transformers/main/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/).
1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
1. **[ConvNeXT](https://huggingface.co/docs/transformers/main/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
1. **[CvT](https://huggingface.co/docs/transformers/main/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
1. **[Data2Vec](https://huggingface.co/docs/transformers/main/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
@ -271,48 +270,42 @@ conda install -c huggingface transformers
1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
1. **[FLAVA](https://huggingface.co/docs/transformers/main/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
1. **[GLPN](https://huggingface.co/docs/transformers/main/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (来自 OpenAI) 伴随论文 [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) 由 Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever 发布。
 | 
			
		||||
1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (来自 EleutherAI) 随仓库 [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) 发布。作者为 Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy 发布。
 | 
			
		||||
1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach 
1. **[GPT NeoX](https://huggingface.co/docs/transformers/main/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach 
1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (来自 OpenAI) 伴随论文 [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) 由 Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever** 发布。
1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (来自 EleutherAI) 伴随论文 [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) 由 Ben Wang and Aran Komatsuzaki 发布。
1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (来自 UCSD, NVIDIA) 伴随论文 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 由 Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 发布。
1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。
1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (来自 Berkeley) 伴随论文 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 由 Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 发布。
1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (来自 OpenAI) 伴随论文 [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) 由 Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever 发布。
1. **[ImageGPT](https://huggingface.co/docs/transformers/main/model_doc/imagegpt)** (来自 OpenAI) 伴随论文 [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) 由 Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever 发布。
1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) 由 Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou 发布。
1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) 由 Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou 发布。
1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) 由 Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei 发布。
1. **[LayoutLMv3](https://huggingface.co/docs/transformers/main/model_doc/layoutlmv3)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) 由 Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei 发布。
1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (来自 Microsoft Research Asia) 伴随论文 [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) 由 Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei 发布。
1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (来自 AllenAI) 伴随论文 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 由 Iz Beltagy, Matthew E. Peters, Arman Cohan 发布。
1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (来自 Meta AI) 伴随论文 [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) 由 Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze 发布。
1. **[LeViT](https://huggingface.co/docs/transformers/main/model_doc/levit)** (来自 Meta AI) 伴随论文 [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) 由 Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze 发布。
1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (来自 AllenAI) 伴随论文 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 由 Iz Beltagy, Matthew E. Peters, Arman Cohan 发布。
1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (来自 Google AI) 伴随论文 [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) 由 Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang 发布。
1. **[LongT5](https://huggingface.co/docs/transformers/main/model_doc/longt5)** (来自 Google AI) 伴随论文 [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) 由 Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang 发布。
1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (来自 Studio Ousia) 伴随论文 [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) 由 Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto 发布。
1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (来自 UNC Chapel Hill) 伴随论文 [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) 由 Hao Tan and Mohit Bansal 发布。
1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (来自 Facebook) 伴随论文 [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) 由 Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert 发布。
1. **[M-CTC-T](https://huggingface.co/docs/transformers/main/model_doc/mctct)** (来自 Facebook) 伴随论文 [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) 由 Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert 发布。
1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (来自 Facebook) 伴随论文 [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) 由 Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin 发布。
1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** 用 [OPUS](http://opus.nlpl.eu/) 数据训练的机器翻译模型由 Jörg Tiedemann 发布。[Marian Framework](https://marian-nmt.github.io/) 由微软翻译团队开发。
1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov 
1. **[MaskFormer](https://huggingface.co/docs/transformers/main/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov 
1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (来自 Facebook) 伴随论文 [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) 由 Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer 发布。
1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (来自 Facebook) 伴随论文 [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) 由 Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan 发布。
1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (来自 NVIDIA) 伴随论文 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 由 Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 发布。
1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (来自 NVIDIA) 伴随论文 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 由 Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 发布。
1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (来自 Studio Ousia) 伴随论文 [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) 由 Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka 发布。
1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (来自 CMU/Google Brain) 伴随论文 [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) 由 Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou 发布。
1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (来自 Apple) 伴随论文 [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) 由 Sachin Mehta and Mohammad Rastegari 发布。
1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (来自 Microsoft Research) 伴随论文 [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) 由 Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu 发布。
1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (来自 Google AI) 伴随论文 [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) 由 Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel 发布。
1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (来自 中国人民大学 AI Box) 伴随论文 [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) 由 Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen 发布。 
1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (来自华为诺亚方舟实验室) 伴随论文 [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) 由 Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu 发布。
1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (来自 Meta) 伴随论文 [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) 由 the NLLB team 发布。
1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (来自 the University of Wisconsin - Madison) 伴随论文 [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) 由 Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh 发布。
1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (来自 Meta AI) 伴随论文 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 由 Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 发布。
1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (来自 Google AI) 伴随论文 [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 由 Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby 发布。
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。
1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (来自 Deepmind) 伴随论文 [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) 由 Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira 发布。
1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (来自 VinAI Research) 伴随论文 [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) 由 Dat Quoc Nguyen and Anh Tuan Nguyen 发布。
@@ -323,7 +316,7 @@ conda install -c huggingface transformers
1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (来自 Facebook) 伴随论文 [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) 由 Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela 发布。
1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (来自 Google Research) 伴随论文 [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) 由 Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang 发布。
1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (来自 Google Research) 伴随论文 [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) 由 Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya 发布。
1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Research) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. 
1. **[RegNet](https://huggingface.co/docs/transformers/main/model_doc/regnet)** (from META Research) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. 
1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (来自 Google Research) 伴随论文 [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) 由 Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder 发布。
1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 
1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (来自 Facebook) 伴随论文 [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) 由 Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov 发布。
@@ -336,15 +329,13 @@ conda install -c huggingface transformers
1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (来自 Tel Aviv University) 伴随论文 [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) 由 Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy 发布。
1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (来自 Berkeley) 伴随论文 [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) 由 Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer 发布。
1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (来自 Microsoft) 伴随论文 [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) 由 Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo 发布。
1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/main/model_doc/swinv2)** (来自 Microsoft) 伴随论文 [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) 由 Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo 发布。
1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (来自 Google AI) 伴随论文 [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。
1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (来自 Google AI) 伴随论文 [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。
1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (来自 Google AI) 伴随论文 [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 由 Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos 发布。
1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (来自 Microsoft Research) 伴随论文 [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) 由 Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou 发布。
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine 
1. **[TAPEX](https://huggingface.co/docs/transformers/main/model_doc/tapex)** (来自 Microsoft Research) 伴随论文 [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) 由 Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou 发布。
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/main/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine 
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (来自 Google/CMU) 伴随论文 [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 由 Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov 发布。
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (来自 Microsoft) 伴随论文 [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) 由 Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei 发布。
1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler 
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (来自 Microsoft Research) 伴随论文 [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) 由 Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang 发布。
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (来自 Microsoft Research) 伴随论文 [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) 由 Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu 发布。
1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (来自 Tsinghua University and Nankai University) 伴随论文 [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) 由 Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu 发布。
@@ -353,7 +344,7 @@ conda install -c huggingface transformers
1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (来自 UCLA NLP) 伴随论文 [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) 由 Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang 发布。
1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (来自 Meta AI) 伴随论文 [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) 由 Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick 发布。
1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (来自 Facebook AI) 伴随论文 [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) 由 Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli 发布。
1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (来自 Facebook AI) 伴随论文 [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) 由 Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino 发布。
1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/main/model_doc/wav2vec2-conformer)** (来自 Facebook AI) 伴随论文 [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) 由 Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino 发布。
1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (来自 Facebook AI) 伴随论文 [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) 由 Qiantong Xu, Alexei Baevski, Michael Auli 发布。
1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. 
@@ -364,7 +355,7 @@ conda install -c huggingface transformers
1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (来自 Google/CMU) 伴随论文 [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) 由 Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le 发布。
1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (来自 Facebook AI) 伴随论文 [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) 由 Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli 发布。
1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (来自 Facebook AI) 伴随论文 [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) 由 Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli 发布。
1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (来自 Huazhong University of Science & Technology) 伴随论文 [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) 由 Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu 发布。
1. **[YOLOS](https://huggingface.co/docs/transformers/main/model_doc/yolos)** (来自 Huazhong University of Science & Technology) 伴随论文 [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) 由 Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu 发布。
1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (来自 the University of Wisconsin - Madison) 伴随论文 [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) 由 Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh 发布。
1. 想要贡献新的模型？我们这里有一份**详细指引和模板**来引导你添加新的模型。你可以在 [`templates`](./templates) 目录中找到它们。记得查看 [贡献指南](./CONTRIBUTING.md)，并在开始写 PR 前联系维护人员或开一个新的 issue 来获得反馈。
@@ -380,7 +371,7 @@ conda install -c huggingface transformers
| [文档](https://huggingface.co/transformers/) | 完整的 API 文档和教程 |
| [任务总结](https://huggingface.co/docs/transformers/task_summary) | 🤗 Transformers 支持的任务 |
| [预处理教程](https://huggingface.co/docs/transformers/preprocessing) | 使用 `Tokenizer` 来为模型准备数据 |
| [训练和微调](https://huggingface.co/docs/transformers/training) | 在 PyTorch/TensorFlow 的训练循环或 `Trainer` API 中使用 🤗 Transformers 提供的模型 |
| [训练和微调](https://huggingface.co/docstransformers/training) | 在 PyTorch/TensorFlow 的训练循环或 `Trainer` API 中使用 🤗 Transformers 提供的模型 |
| [快速上手:微调和用例脚本](https://github.com/huggingface/transformers/tree/main/examples) | 为各种任务提供的用例脚本 |
| [模型分享和上传](https://huggingface.co/docs/transformers/model_sharing) | 和社区上传和分享你微调的模型 |
| [迁移](https://huggingface.co/docs/transformers/migration) | 从 `pytorch-transformers` 或 `pytorch-pretrained-bert` 迁移到 🤗 Transformers |
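
上面「预处理教程」和「训练和微调」两行所指的基本流程，大致如下面这个极简草图（仅为示意，并非官方文档原文；其中的模型名 `bert-base-uncased`、数据集 `imdb` 以及各超参数均为演示用的假设，可按需替换）：

```python
# 极简示意：先用 tokenizer 为模型准备数据，再用 Trainer API 微调。
# 模型、数据集与超参数仅为演示假设。
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# 只取一小部分数据用于演示
dataset = load_dataset("imdb", split="train[:1%]")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize(batch):
    # 截断并填充到固定长度，生成 input_ids / attention_mask
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

dataset = dataset.map(tokenize, batched=True)

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
args = TrainingArguments(output_dir="out", per_device_train_batch_size=8, num_train_epochs=1)
trainer = Trainer(model=model, args=args, train_dataset=dataset)
trainer.train()
```

训练结束后可以用 `trainer.save_model()` 保存权重，或按照上面的「模型分享和上传」文档将其分享到 Hub。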
@@ -400,4 +391,4 @@ conda install -c huggingface transformers
    url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
    pages = "38--45"
}
```
@@ -257,19 +257,18 @@ conda install -c huggingface transformers
1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/).
1. **[BLOOM](https://huggingface.co/docs/transformers/main/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/).
1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
1. **[ConvNeXT](https://huggingface.co/docs/transformers/main/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec:  A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
1. **[CvT](https://huggingface.co/docs/transformers/main/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
1. **[Data2Vec](https://huggingface.co/docs/transformers/main/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec:  A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
@@ -283,48 +282,42 @@ conda install -c huggingface transformers
1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
1. **[FLAVA](https://huggingface.co/docs/transformers/main/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
1. **[GLPN](https://huggingface.co/docs/transformers/main/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach 
1. **[GPT NeoX](https://huggingface.co/docs/transformers/main/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach 
1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released with the paper [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
1. **[ImageGPT](https://huggingface.co/docs/transformers/main/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
1. **[LayoutLMv3](https://huggingface.co/docs/transformers/main/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
1. **[LeViT](https://huggingface.co/docs/transformers/main/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
1. **[LongT5](https://huggingface.co/docs/transformers/main/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
1. **[M-CTC-T](https://huggingface.co/docs/transformers/main/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov 
1. **[MaskFormer](https://huggingface.co/docs/transformers/main/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov 
1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen.
1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
@@ -335,7 +328,7 @@ conda install -c huggingface transformers
1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela.
1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
 | 
			
		||||
1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Research) released with the paper [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
 | 
			
		||||
1. **[RegNet](https://huggingface.co/docs/transformers/main/model_doc/regnet)** (from META Research) released with the paper [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
 | 
			
		||||
1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
 | 
			
		||||
1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 
 | 
			
		||||
1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
 | 
			
		||||
@ -348,15 +341,13 @@ conda install -c huggingface transformers
 | 
			
		||||
1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University) released with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
 | 
			
		||||
1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
 | 
			
		||||
1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
 | 
			
		||||
1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/main/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
 | 
			
		||||
1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
 | 
			
		||||
1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released with the paper [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
 | 
			
		||||
1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
 | 
			
		||||
1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
 | 
			
		||||
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine 
 | 
			
		||||
1. **[TAPEX](https://huggingface.co/docs/transformers/main/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
 | 
			
		||||
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/main/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine 
 | 
			
		||||
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
 | 
			
		||||
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
 | 
			
		||||
1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler 
 | 
			
		||||
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
 | 
			
		||||
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
 | 
			
		||||
1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
 | 
			
		||||
@ -365,7 +356,7 @@ conda install -c huggingface transformers
 | 
			
		||||
1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
 | 
			
		||||
1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
 | 
			
		||||
1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
 | 
			
		||||
1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
 | 
			
		||||
1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/main/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
 | 
			
		||||
1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
 | 
			
		||||
1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
 | 
			
		||||
1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. 
 | 
			
		||||
@ -376,7 +367,7 @@ conda install -c huggingface transformers
 | 
			
		||||
1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
 | 
			
		||||
1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
 | 
			
		||||
1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
 | 
			
		||||
1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
 | 
			
		||||
1. **[YOLOS](https://huggingface.co/docs/transformers/main/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
 | 
			
		||||
1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
 | 
			
		||||
1. Want to contribute a new model? We have a **detailed guide and templates** to walk you through adding a new model. You can find them in the [`templates`](./templates) directory. Remember to check the [contributing guidelines](./CONTRIBUTING.md) and to contact the maintainers or open an issue to collect feedback before you start working on your PR.
 | 
			
		||||
 | 
			
		||||
@ -412,4 +403,4 @@ conda install -c huggingface transformers
 | 
			
		||||
    url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
 | 
			
		||||
    pages = "38--45"
 | 
			
		||||
}
 | 
			
		||||
```
 | 
			
		||||
```
 | 
			
		||||
@ -3,13 +3,10 @@ LABEL maintainer="Hugging Face"
 | 
			
		||||
 | 
			
		||||
ARG DEBIAN_FRONTEND=noninteractive
 | 
			
		||||
 | 
			
		||||
# Use login shell to read variables from `~/.profile` (to pass dynamic created variables between RUN commands)
 | 
			
		||||
SHELL ["sh", "-lc"]
 | 
			
		||||
 | 
			
		||||
# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
 | 
			
		||||
# to be used as arguments for docker build (so far).
 | 
			
		||||
 | 
			
		||||
ARG PYTORCH='1.12.0'
 | 
			
		||||
ARG PYTORCH='1.11.0'
 | 
			
		||||
# (not always a valid torch version)
 | 
			
		||||
ARG INTEL_TORCH_EXT='1.11.0'
 | 
			
		||||
# Example: `cu102`, `cu113`, etc.
 | 
			
		||||
@ -24,20 +21,11 @@ ARG REF=main
 | 
			
		||||
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
 | 
			
		||||
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime]
 | 
			
		||||
 | 
			
		||||
# TODO: Handle these in a python utility script
 | 
			
		||||
RUN [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' ||  VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile
 | 
			
		||||
RUN echo torch=$VERSION
 | 
			
		||||
# `torchvision` and `torchaudio` should be installed along with `torch`, especially for nightly build.
 | 
			
		||||
# Currently, let's just use their latest releases (when `torch` is installed with a release version)
 | 
			
		||||
# TODO: We might need to specify proper versions that work with a specific torch version (especially for past CI).
 | 
			
		||||
RUN [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
 | 
			
		||||
 | 
			
		||||
RUN python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA
 | 
			
		||||
RUN python3 -m pip install --no-cache-dir -U tensorflow
 | 
			
		||||
RUN python3 -m pip uninstall -y flax jax
 | 
			
		||||
 | 
			
		||||
# Use the installed torch version for `torch-scatter` to avoid having to deal with PYTORCH='pre'.
 | 
			
		||||
# If torch is a nightly version, the link is likely to be invalid, but the installation falls back to the latest torch-scatter.
 | 
			
		||||
RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$(python3 -c "from torch import version; print(version.__version__.split('+')[0])")+$CUDA.html
 | 
			
		||||
RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$PYTORCH+$CUDA.html
 | 
			
		||||
RUN python3 -m pip install --no-cache-dir intel_extension_for_pytorch==$INTEL_TORCH_EXT+cpu -f https://software.intel.com/ipex-whl-stable
 | 
			
		||||
 | 
			
		||||
RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract https://github.com/kpu/kenlm/archive/master.zip
 | 
			
		||||
 | 
			
		||||
@ -1,43 +0,0 @@
 | 
			
		||||
ARG BASE_DOCKER_IMAGE="nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04"
 | 
			
		||||
FROM $BASE_DOCKER_IMAGE
 | 
			
		||||
LABEL maintainer="Hugging Face"
 | 
			
		||||
 | 
			
		||||
ARG DEBIAN_FRONTEND=noninteractive
 | 
			
		||||
 | 
			
		||||
# Use login shell to read variables from `~/.profile` (to pass dynamic created variables between RUN commands)
 | 
			
		||||
SHELL ["sh", "-lc"]
 | 
			
		||||
 | 
			
		||||
RUN apt update
 | 
			
		||||
RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs
 | 
			
		||||
RUN git lfs install
 | 
			
		||||
RUN python3 -m pip install --no-cache-dir --upgrade pip
 | 
			
		||||
 | 
			
		||||
ARG REF=main
 | 
			
		||||
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
 | 
			
		||||
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime]
 | 
			
		||||
 | 
			
		||||
# When installing in editable mode, `transformers` is not recognized as a package.
 | 
			
		||||
# This line must be added in order for Python to be aware of transformers.
 | 
			
		||||
RUN cd transformers && python3 setup.py develop
 | 
			
		||||
 | 
			
		||||
ARG FRAMEWORK
 | 
			
		||||
ARG VERSION
 | 
			
		||||
 | 
			
		||||
# Remove all frameworks
 | 
			
		||||
# (`accelerate` requires `torch`, and this causes import issues for TF-only testing)
 | 
			
		||||
RUN python3 -m pip uninstall -y torch torchvision torchaudio accelerate tensorflow jax flax
 | 
			
		||||
 | 
			
		||||
# Get the libraries and their versions to install, and write installation command to `~/.profile`.
 | 
			
		||||
RUN python3 ./transformers/utils/past_ci_versions.py --framework $FRAMEWORK --version $VERSION
 | 
			
		||||
 | 
			
		||||
# Install the target framework
 | 
			
		||||
RUN echo "INSTALL_CMD = $INSTALL_CMD"
 | 
			
		||||
RUN $INSTALL_CMD
 | 
			
		||||
 | 
			
		||||
# Having installation problems for torch-scatter with torch <= 1.6. Disable so we have the same set of tests.
 | 
			
		||||
# (This part will be removed once the logic of using `past_ci_versions.py` is used in other Dockerfile files.)
 | 
			
		||||
# # Use installed torch version for `torch-scatter`.
 | 
			
		||||
# # (The env. variable $CUDA is defined in `past_ci_versions.py`)
 | 
			
		||||
# RUN [ "$FRAMEWORK" = "pytorch" ] && python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$(python3 -c "from torch import version; print(version.__version__.split('+')[0])")+$CUDA.html || echo "torch-scatter not to be installed"
 | 
			
		||||
 | 
			
		||||
RUN python3 -m pip install -U "itsdangerous<2.1.0"
 | 
			
		||||
@ -3,10 +3,6 @@ LABEL maintainer="Hugging Face"
 | 
			
		||||
 | 
			
		||||
ARG DEBIAN_FRONTEND=noninteractive
 | 
			
		||||
 | 
			
		||||
ARG PYTORCH='1.12.0'
 | 
			
		||||
# Example: `cu102`, `cu113`, etc.
 | 
			
		||||
ARG CUDA='cu113'
 | 
			
		||||
 | 
			
		||||
RUN apt -y update
 | 
			
		||||
RUN apt install -y libaio-dev
 | 
			
		||||
RUN python3 -m pip install --no-cache-dir --upgrade pip
 | 
			
		||||
@ -17,16 +13,13 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers &&
 | 
			
		||||
# Install latest release PyTorch
 | 
			
		||||
# (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
 | 
			
		||||
# (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
 | 
			
		||||
RUN python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA
 | 
			
		||||
RUN python3 -m pip install --no-cache-dir -U torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
 | 
			
		||||
 | 
			
		||||
RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
 | 
			
		||||
 | 
			
		||||
# Pre-build **latest** DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
 | 
			
		||||
# Pre-build DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
 | 
			
		||||
RUN python3 -m pip uninstall -y deepspeed
 | 
			
		||||
# This has to be run (again) inside the GPU VMs running the tests.
 | 
			
		||||
# The installation works here, but some tests fail if we don't pre-build deepspeed again in the VMs running the tests.
 | 
			
		||||
# TODO: Find out why the tests fail.
 | 
			
		||||
RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
 | 
			
		||||
RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
 | 
			
		||||
 | 
			
		||||
# When installing in editable mode, `transformers` is not recognized as a package.
 | 
			
		||||
# This line must be added in order for Python to be aware of transformers.
 | 
			
		||||
 | 
			
		||||
@ -1,35 +0,0 @@
 | 
			
		||||
FROM nvcr.io/nvidia/pytorch:21.03-py3
 | 
			
		||||
LABEL maintainer="Hugging Face"
 | 
			
		||||
 | 
			
		||||
ARG DEBIAN_FRONTEND=noninteractive
 | 
			
		||||
 | 
			
		||||
# Example: `cu102`, `cu113`, etc.
 | 
			
		||||
ARG CUDA='cu113'
 | 
			
		||||
 | 
			
		||||
RUN apt -y update
 | 
			
		||||
RUN apt install -y libaio-dev
 | 
			
		||||
RUN python3 -m pip install --no-cache-dir --upgrade pip
 | 
			
		||||
 | 
			
		||||
ARG REF=main
 | 
			
		||||
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
 | 
			
		||||
 | 
			
		||||
# Install **nightly** release PyTorch (flag `--pre`)
 | 
			
		||||
# (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
 | 
			
		||||
# (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
 | 
			
		||||
RUN python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
 | 
			
		||||
 | 
			
		||||
RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
 | 
			
		||||
 | 
			
		||||
# Pre-build **nightly** release of DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
 | 
			
		||||
RUN python3 -m pip uninstall -y deepspeed
 | 
			
		||||
# This has to be run inside the GPU VMs running the tests. (So far, it fails here due to GPU checks during compilation.)
 | 
			
		||||
# Issue: https://github.com/microsoft/DeepSpeed/issues/2010
 | 
			
		||||
# RUN git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build && \
 | 
			
		||||
#    DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
 | 
			
		||||
 | 
			
		||||
# When installing in editable mode, `transformers` is not recognized as a package.
 | 
			
		||||
# This line must be added in order for Python to be aware of transformers.
 | 
			
		||||
RUN cd transformers && python3 setup.py develop
 | 
			
		||||
 | 
			
		||||
# Disable for now as deepspeed is not installed above. To be enabled once the issue is fixed.
 | 
			
		||||
# RUN python3 -c "from deepspeed.launcher.runner import main"
 | 
			
		||||
@ -12,7 +12,7 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers &&
 | 
			
		||||
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing]
 | 
			
		||||
 | 
			
		||||
# If set to nothing, will install the latest version
 | 
			
		||||
ARG PYTORCH='1.12.0'
 | 
			
		||||
ARG PYTORCH=''
 | 
			
		||||
ARG TORCH_VISION=''
 | 
			
		||||
ARG TORCH_AUDIO=''
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -50,32 +50,11 @@ You can adapt the `--build_dir` to set any temporary folder that you prefer. Thi
 | 
			
		||||
the MDX files that will be rendered as the documentation on the main website. You can inspect them in your favorite
 | 
			
		||||
Markdown editor.
 | 
			
		||||
 | 
			
		||||
## Previewing the documentation
 | 
			
		||||
 | 
			
		||||
To preview the docs, first install the `watchdog` module with:
 | 
			
		||||
 | 
			
		||||
```bash
 | 
			
		||||
pip install watchdog
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Then run the following command:
 | 
			
		||||
 | 
			
		||||
```bash
 | 
			
		||||
doc-builder preview {package_name} {path_to_docs}
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
For example:
 | 
			
		||||
 | 
			
		||||
```bash
 | 
			
		||||
doc-builder preview transformers docs/source/en/
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
The docs will be viewable at [http://localhost:3000](http://localhost:3000). You can also preview the docs once you have opened a PR. You will see a bot add a comment with a link to where the documentation with your changes lives.
 | 
			
		||||
 | 
			
		||||
---
 | 
			
		||||
**NOTE**
 | 
			
		||||
 | 
			
		||||
The `preview` command only works with existing doc files. When you add a completely new file, you need to update `_toctree.yml` and restart the `preview` command (`ctrl-c` to stop it and call `doc-builder preview ...` again).
 | 
			
		||||
For now, it's not possible to see locally how the final documentation will look. Once you have opened a PR, you
 | 
			
		||||
will see a bot add a comment with a link to where the documentation with your changes lives.
 | 
			
		||||
 | 
			
		||||
---
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -22,7 +22,7 @@
 | 
			
		||||
  title: Tutorials
 | 
			
		||||
- sections:
 | 
			
		||||
  - local: fast_tokenizers
 | 
			
		||||
    title: Use tokenizers from 🤗 Tokenizers
 | 
			
		||||
    title: "Use tokenizers from 🤗 Tokenizers"
 | 
			
		||||
  - local: create_a_model
 | 
			
		||||
    title: Create a custom architecture
 | 
			
		||||
  - local: custom_models
 | 
			
		||||
@ -59,31 +59,7 @@
 | 
			
		||||
    title: Converting TensorFlow Checkpoints
 | 
			
		||||
  - local: serialization
 | 
			
		||||
    title: Export 🤗 Transformers models
 | 
			
		||||
  - sections:
 | 
			
		||||
    - local: performance
 | 
			
		||||
      title: Overview
 | 
			
		||||
    - local: perf_train_gpu_one
 | 
			
		||||
      title: Training on one GPU
 | 
			
		||||
    - local: perf_train_gpu_many
 | 
			
		||||
      title: Training on many GPUs
 | 
			
		||||
    - local: perf_train_cpu
 | 
			
		||||
      title: Training on CPU
 | 
			
		||||
    - local: perf_train_cpu_many
 | 
			
		||||
      title: Training on many CPUs
 | 
			
		||||
    - local: perf_train_tpu
 | 
			
		||||
      title: Training on TPUs
 | 
			
		||||
    - local: perf_train_special
 | 
			
		||||
      title: Training on Specialized Hardware
 | 
			
		||||
    - local: perf_infer_cpu
 | 
			
		||||
      title: Inference on CPU
 | 
			
		||||
    - local: perf_infer_gpu_one
 | 
			
		||||
      title: Inference on one GPU
 | 
			
		||||
    - local: perf_infer_gpu_many
 | 
			
		||||
      title: Inference on many GPUs
 | 
			
		||||
    - local: perf_infer_special
 | 
			
		||||
      title: Inference on Specialized Hardware
 | 
			
		||||
    - local: perf_hardware
 | 
			
		||||
      title: Custom hardware for training
 | 
			
		||||
  - local: performance
 | 
			
		||||
    title: Performance and scalability
 | 
			
		||||
  - local: big_models
 | 
			
		||||
    title: Instantiating a big model
 | 
			
		||||
@ -96,15 +72,25 @@
 | 
			
		||||
  - local: debugging
 | 
			
		||||
    title: Debugging
 | 
			
		||||
  - local: notebooks
 | 
			
		||||
    title: 🤗 Transformers Notebooks
 | 
			
		||||
    title: "🤗 Transformers Notebooks"
 | 
			
		||||
  - local: community
 | 
			
		||||
    title: Community
 | 
			
		||||
  - local: contributing
 | 
			
		||||
    title: How to contribute to transformers?
 | 
			
		||||
  - local: add_new_model
 | 
			
		||||
    title: How to add a model to 🤗 Transformers?
 | 
			
		||||
    title: "How to add a model to 🤗 Transformers?"
 | 
			
		||||
  - local: add_new_pipeline
 | 
			
		||||
    title: How to create a custom pipeline?
 | 
			
		||||
    title: "How to add a pipeline to 🤗 Transformers?"
 | 
			
		||||
  - local: perf_train_gpu_one
 | 
			
		||||
    title: Training on one GPU
 | 
			
		||||
  - local: perf_train_gpu_many
 | 
			
		||||
    title: Training on many GPUs
 | 
			
		||||
  - local: perf_train_cpu
 | 
			
		||||
    title: Training on CPU
 | 
			
		||||
  - local: perf_infer_cpu
 | 
			
		||||
    title: Inference on CPU
 | 
			
		||||
  - local: perf_hardware
 | 
			
		||||
    title: Custom hardware for training
 | 
			
		||||
  - local: testing
 | 
			
		||||
    title: Testing
 | 
			
		||||
  - local: pr_checks
 | 
			
		||||
@ -164,301 +150,268 @@
 | 
			
		||||
      title: Feature Extractor
 | 
			
		||||
    title: Main Classes
 | 
			
		||||
  - sections:
 | 
			
		||||
    - local: model_doc/albert
 | 
			
		||||
      title: ALBERT
 | 
			
		||||
    - local: model_doc/auto
 | 
			
		||||
      title: Auto Classes
 | 
			
		||||
    - isExpanded: false
 | 
			
		||||
      sections:
 | 
			
		||||
      - local: model_doc/albert
 | 
			
		||||
        title: ALBERT
 | 
			
		||||
      - local: model_doc/bart
 | 
			
		||||
        title: BART
 | 
			
		||||
      - local: model_doc/barthez
 | 
			
		||||
        title: BARThez
 | 
			
		||||
      - local: model_doc/bartpho
 | 
			
		||||
        title: BARTpho
 | 
			
		||||
      - local: model_doc/bert
 | 
			
		||||
        title: BERT
 | 
			
		||||
      - local: model_doc/bert-generation
 | 
			
		||||
        title: BertGeneration
 | 
			
		||||
      - local: model_doc/bert-japanese
 | 
			
		||||
        title: BertJapanese
 | 
			
		||||
      - local: model_doc/bertweet
 | 
			
		||||
        title: Bertweet
 | 
			
		||||
      - local: model_doc/big_bird
 | 
			
		||||
        title: BigBird
 | 
			
		||||
      - local: model_doc/bigbird_pegasus
 | 
			
		||||
        title: BigBirdPegasus
 | 
			
		||||
      - local: model_doc/blenderbot
 | 
			
		||||
        title: Blenderbot
 | 
			
		||||
      - local: model_doc/blenderbot-small
 | 
			
		||||
        title: Blenderbot Small
 | 
			
		||||
      - local: model_doc/bloom
 | 
			
		||||
        title: BLOOM
 | 
			
		||||
      - local: model_doc/bort
 | 
			
		||||
        title: BORT
 | 
			
		||||
      - local: model_doc/byt5
 | 
			
		||||
        title: ByT5
 | 
			
		||||
      - local: model_doc/camembert
 | 
			
		||||
        title: CamemBERT
 | 
			
		||||
      - local: model_doc/canine
 | 
			
		||||
        title: CANINE
 | 
			
		||||
      - local: model_doc/codegen
 | 
			
		||||
        title: CodeGen
 | 
			
		||||
      - local: model_doc/convbert
 | 
			
		||||
        title: ConvBERT
 | 
			
		||||
      - local: model_doc/cpm
 | 
			
		||||
        title: CPM
 | 
			
		||||
      - local: model_doc/ctrl
 | 
			
		||||
        title: CTRL
 | 
			
		||||
      - local: model_doc/deberta
 | 
			
		||||
        title: DeBERTa
 | 
			
		||||
      - local: model_doc/deberta-v2
 | 
			
		||||
        title: DeBERTa-v2
 | 
			
		||||
      - local: model_doc/dialogpt
 | 
			
		||||
        title: DialoGPT
 | 
			
		||||
      - local: model_doc/distilbert
 | 
			
		||||
        title: DistilBERT
 | 
			
		||||
      - local: model_doc/dpr
 | 
			
		||||
        title: DPR
 | 
			
		||||
      - local: model_doc/electra
 | 
			
		||||
        title: ELECTRA
 | 
			
		||||
      - local: model_doc/encoder-decoder
 | 
			
		||||
        title: Encoder Decoder Models
 | 
			
		||||
      - local: model_doc/flaubert
 | 
			
		||||
        title: FlauBERT
 | 
			
		||||
      - local: model_doc/fnet
 | 
			
		||||
        title: FNet
 | 
			
		||||
      - local: model_doc/fsmt
 | 
			
		||||
        title: FSMT
 | 
			
		||||
      - local: model_doc/funnel
 | 
			
		||||
        title: Funnel Transformer
 | 
			
		||||
      - local: model_doc/openai-gpt
 | 
			
		||||
        title: GPT
 | 
			
		||||
      - local: model_doc/gpt_neo
 | 
			
		||||
        title: GPT Neo
 | 
			
		||||
      - local: model_doc/gpt_neox
 | 
			
		||||
        title: GPT NeoX
 | 
			
		||||
      - local: model_doc/gptj
 | 
			
		||||
        title: GPT-J
 | 
			
		||||
      - local: model_doc/gpt2
 | 
			
		||||
        title: GPT2
 | 
			
		||||
      - local: model_doc/herbert
 | 
			
		||||
        title: HerBERT
 | 
			
		||||
      - local: model_doc/ibert
 | 
			
		||||
        title: I-BERT
 | 
			
		||||
      - local: model_doc/layoutlm
 | 
			
		||||
        title: LayoutLM
 | 
			
		||||
      - local: model_doc/led
 | 
			
		||||
        title: LED
 | 
			
		||||
      - local: model_doc/longformer
 | 
			
		||||
        title: Longformer
 | 
			
		||||
      - local: model_doc/longt5
 | 
			
		||||
        title: LongT5
 | 
			
		||||
      - local: model_doc/luke
 | 
			
		||||
        title: LUKE
 | 
			
		||||
      - local: model_doc/m2m_100
 | 
			
		||||
        title: M2M100
 | 
			
		||||
      - local: model_doc/marian
 | 
			
		||||
        title: MarianMT
 | 
			
		||||
      - local: model_doc/mbart
 | 
			
		||||
        title: MBart and MBart-50
 | 
			
		||||
      - local: model_doc/megatron-bert
 | 
			
		||||
        title: MegatronBERT
 | 
			
		||||
      - local: model_doc/megatron_gpt2
 | 
			
		||||
        title: MegatronGPT2
 | 
			
		||||
      - local: model_doc/mluke
 | 
			
		||||
        title: mLUKE
 | 
			
		||||
      - local: model_doc/mobilebert
 | 
			
		||||
        title: MobileBERT
 | 
			
		||||
      - local: model_doc/mpnet
 | 
			
		||||
        title: MPNet
 | 
			
		||||
      - local: model_doc/mt5
 | 
			
		||||
        title: MT5
 | 
			
		||||
      - local: model_doc/mvp
 | 
			
		||||
        title: MVP
 | 
			
		||||
      - local: model_doc/nezha
 | 
			
		||||
        title: NEZHA
 | 
			
		||||
      - local: model_doc/nllb
 | 
			
		||||
        title: NLLB
 | 
			
		||||
      - local: model_doc/nystromformer
 | 
			
		||||
        title: Nyströmformer
 | 
			
		||||
      - local: model_doc/opt
 | 
			
		||||
        title: OPT
 | 
			
		||||
      - local: model_doc/pegasus
 | 
			
		||||
        title: Pegasus
 | 
			
		||||
      - local: model_doc/phobert
 | 
			
		||||
        title: PhoBERT
 | 
			
		||||
      - local: model_doc/plbart
 | 
			
		||||
        title: PLBart
 | 
			
		||||
      - local: model_doc/prophetnet
 | 
			
		||||
        title: ProphetNet
 | 
			
		||||
      - local: model_doc/qdqbert
 | 
			
		||||
        title: QDQBert
 | 
			
		||||
      - local: model_doc/rag
 | 
			
		||||
        title: RAG
 | 
			
		||||
      - local: model_doc/realm
 | 
			
		||||
        title: REALM
 | 
			
		||||
      - local: model_doc/reformer
 | 
			
		||||
        title: Reformer
 | 
			
		||||
      - local: model_doc/rembert
 | 
			
		||||
        title: RemBERT
 | 
			
		||||
      - local: model_doc/retribert
 | 
			
		||||
        title: RetriBERT
 | 
			
		||||
      - local: model_doc/roberta
 | 
			
		||||
        title: RoBERTa
 | 
			
		||||
      - local: model_doc/roformer
 | 
			
		||||
        title: RoFormer
 | 
			
		||||
      - local: model_doc/splinter
 | 
			
		||||
        title: Splinter
 | 
			
		||||
      - local: model_doc/squeezebert
 | 
			
		||||
        title: SqueezeBERT
 | 
			
		||||
      - local: model_doc/t5
 | 
			
		||||
        title: T5
 | 
			
		||||
      - local: model_doc/t5v1.1
 | 
			
		||||
        title: T5v1.1
 | 
			
		||||
      - local: model_doc/tapas
 | 
			
		||||
        title: TAPAS
 | 
			
		||||
      - local: model_doc/tapex
 | 
			
		||||
        title: TAPEX
 | 
			
		||||
      - local: model_doc/transfo-xl
 | 
			
		||||
        title: Transformer XL
 | 
			
		||||
      - local: model_doc/ul2
 | 
			
		||||
        title: UL2
 | 
			
		||||
      - local: model_doc/xglm
 | 
			
		||||
        title: XGLM
 | 
			
		||||
      - local: model_doc/xlm
 | 
			
		||||
        title: XLM
 | 
			
		||||
      - local: model_doc/xlm-prophetnet
 | 
			
		||||
        title: XLM-ProphetNet
 | 
			
		||||
      - local: model_doc/xlm-roberta
 | 
			
		||||
        title: XLM-RoBERTa
 | 
			
		||||
      - local: model_doc/xlm-roberta-xl
 | 
			
		||||
        title: XLM-RoBERTa-XL
 | 
			
		||||
      - local: model_doc/xlnet
 | 
			
		||||
        title: XLNet
 | 
			
		||||
      - local: model_doc/yoso
 | 
			
		||||
        title: YOSO
 | 
			
		||||
      title: Text models
 | 
			
		||||
    - isExpanded: false
 | 
			
		||||
      sections:
 | 
			
		||||
      - local: model_doc/beit
 | 
			
		||||
        title: BEiT
 | 
			
		||||
      - local: model_doc/convnext
 | 
			
		||||
        title: ConvNeXT
 | 
			
		||||
      - local: model_doc/cvt
 | 
			
		||||
        title: CvT
 | 
			
		||||
      - local: model_doc/deit
 | 
			
		||||
        title: DeiT
 | 
			
		||||
      - local: model_doc/detr
 | 
			
		||||
        title: DETR
 | 
			
		||||
      - local: model_doc/dit
 | 
			
		||||
        title: DiT
 | 
			
		||||
      - local: model_doc/dpt
 | 
			
		||||
        title: DPT
 | 
			
		||||
      - local: model_doc/glpn
 | 
			
		||||
        title: GLPN
 | 
			
		||||
      - local: model_doc/imagegpt
 | 
			
		||||
        title: ImageGPT
 | 
			
		||||
      - local: model_doc/levit
 | 
			
		||||
        title: LeViT
 | 
			
		||||
      - local: model_doc/maskformer
 | 
			
		||||
        title: MaskFormer
 | 
			
		||||
      - local: model_doc/mobilevit
 | 
			
		||||
        title: MobileViT
 | 
			
		||||
      - local: model_doc/owlvit
 | 
			
		||||
        title: OWL-ViT
 | 
			
		||||
      - local: model_doc/poolformer
 | 
			
		||||
        title: PoolFormer
 | 
			
		||||
      - local: model_doc/regnet
 | 
			
		||||
        title: RegNet
 | 
			
		||||
      - local: model_doc/resnet
 | 
			
		||||
        title: ResNet
 | 
			
		||||
      - local: model_doc/segformer
 | 
			
		||||
        title: SegFormer
 | 
			
		||||
      - local: model_doc/swin
 | 
			
		||||
        title: Swin Transformer
 | 
			
		||||
      - local: model_doc/swinv2
 | 
			
		||||
        title: Swin Transformer V2
 | 
			
		||||
      - local: model_doc/van
 | 
			
		||||
        title: VAN
 | 
			
		||||
      - local: model_doc/vit
 | 
			
		||||
        title: Vision Transformer (ViT)
 | 
			
		||||
      - local: model_doc/vit_mae
 | 
			
		||||
        title: ViTMAE
 | 
			
		||||
      - local: model_doc/yolos
 | 
			
		||||
        title: YOLOS
 | 
			
		||||
      title: Vision models
 | 
			
		||||
    - isExpanded: false
 | 
			
		||||
      sections:
 | 
			
		||||
      - local: model_doc/hubert
 | 
			
		||||
        title: Hubert
 | 
			
		||||
      - local: model_doc/mctct
 | 
			
		||||
        title: MCTCT
 | 
			
		||||
      - local: model_doc/sew
 | 
			
		||||
        title: SEW
 | 
			
		||||
      - local: model_doc/sew-d
 | 
			
		||||
        title: SEW-D
 | 
			
		||||
      - local: model_doc/speech_to_text
 | 
			
		||||
        title: Speech2Text
 | 
			
		||||
      - local: model_doc/speech_to_text_2
 | 
			
		||||
        title: Speech2Text2
 | 
			
		||||
      - local: model_doc/unispeech
 | 
			
		||||
        title: UniSpeech
 | 
			
		||||
      - local: model_doc/unispeech-sat
 | 
			
		||||
        title: UniSpeech-SAT
 | 
			
		||||
      - local: model_doc/wav2vec2
 | 
			
		||||
        title: Wav2Vec2
 | 
			
		||||
      - local: model_doc/wav2vec2-conformer
 | 
			
		||||
        title: Wav2Vec2-Conformer
 | 
			
		||||
      - local: model_doc/wav2vec2_phoneme
 | 
			
		||||
        title: Wav2Vec2Phoneme
 | 
			
		||||
      - local: model_doc/wavlm
 | 
			
		||||
        title: WavLM
 | 
			
		||||
      - local: model_doc/xls_r
 | 
			
		||||
        title: XLS-R
 | 
			
		||||
      - local: model_doc/xlsr_wav2vec2
 | 
			
		||||
        title: XLSR-Wav2Vec2
 | 
			
		||||
      title: Audio models
 | 
			
		||||
    - isExpanded: false
 | 
			
		||||
      sections:
 | 
			
		||||
      - local: model_doc/clip
 | 
			
		||||
        title: CLIP
 | 
			
		||||
      - local: model_doc/data2vec
 | 
			
		||||
        title: Data2Vec
 | 
			
		||||
      - local: model_doc/flava
 | 
			
		||||
        title: FLAVA
 | 
			
		||||
      - local: model_doc/groupvit
 | 
			
		||||
        title: GroupViT
 | 
			
		||||
      - local: model_doc/layoutlmv2
 | 
			
		||||
        title: LayoutLMV2
 | 
			
		||||
      - local: model_doc/layoutlmv3
 | 
			
		||||
        title: LayoutLMV3
 | 
			
		||||
      - local: model_doc/layoutxlm
 | 
			
		||||
        title: LayoutXLM
 | 
			
		||||
      - local: model_doc/lxmert
 | 
			
		||||
        title: LXMERT
 | 
			
		||||
      - local: model_doc/perceiver
 | 
			
		||||
        title: Perceiver
 | 
			
		||||
      - local: model_doc/speech-encoder-decoder
 | 
			
		||||
        title: Speech Encoder Decoder Models
 | 
			
		||||
      - local: model_doc/trocr
 | 
			
		||||
        title: TrOCR
 | 
			
		||||
      - local: model_doc/vilt
 | 
			
		||||
        title: ViLT
 | 
			
		||||
      - local: model_doc/vision-encoder-decoder
 | 
			
		||||
        title: Vision Encoder Decoder Models
 | 
			
		||||
      - local: model_doc/vision-text-dual-encoder
 | 
			
		||||
        title: Vision Text Dual Encoder
 | 
			
		||||
      - local: model_doc/visual_bert
 | 
			
		||||
        title: VisualBERT
 | 
			
		||||
      title: Multimodal models
 | 
			
		||||
    - isExpanded: false
 | 
			
		||||
      sections:
 | 
			
		||||
      - local: model_doc/decision_transformer
 | 
			
		||||
        title: Decision Transformer
 | 
			
		||||
      - local: model_doc/trajectory_transformer
 | 
			
		||||
        title: Trajectory Transformer
 | 
			
		||||
      title: Reinforcement learning models
 | 
			
		||||
    - local: model_doc/bart
 | 
			
		||||
      title: BART
 | 
			
		||||
    - local: model_doc/barthez
 | 
			
		||||
      title: BARThez
 | 
			
		||||
    - local: model_doc/bartpho
 | 
			
		||||
      title: BARTpho
 | 
			
		||||
    - local: model_doc/beit
 | 
			
		||||
      title: BEiT
 | 
			
		||||
    - local: model_doc/bert
 | 
			
		||||
      title: BERT
 | 
			
		||||
    - local: model_doc/bertweet
 | 
			
		||||
      title: Bertweet
 | 
			
		||||
    - local: model_doc/bert-generation
 | 
			
		||||
      title: BertGeneration
 | 
			
		||||
    - local: model_doc/bert-japanese
 | 
			
		||||
      title: BertJapanese
 | 
			
		||||
    - local: model_doc/big_bird
 | 
			
		||||
      title: BigBird
 | 
			
		||||
    - local: model_doc/bigbird_pegasus
 | 
			
		||||
      title: BigBirdPegasus
 | 
			
		||||
    - local: model_doc/blenderbot
 | 
			
		||||
      title: Blenderbot
 | 
			
		||||
    - local: model_doc/blenderbot-small
 | 
			
		||||
      title: Blenderbot Small
 | 
			
		||||
    - local: model_doc/bloom
 | 
			
		||||
      title: BLOOM
 | 
			
		||||
    - local: model_doc/bort
 | 
			
		||||
      title: BORT
 | 
			
		||||
    - local: model_doc/byt5
 | 
			
		||||
      title: ByT5
 | 
			
		||||
    - local: model_doc/camembert
 | 
			
		||||
      title: CamemBERT
 | 
			
		||||
    - local: model_doc/canine
 | 
			
		||||
      title: CANINE
 | 
			
		||||
    - local: model_doc/convnext
 | 
			
		||||
      title: ConvNeXT
 | 
			
		||||
    - local: model_doc/clip
 | 
			
		||||
      title: CLIP
 | 
			
		||||
    - local: model_doc/convbert
 | 
			
		||||
      title: ConvBERT
 | 
			
		||||
    - local: model_doc/cpm
 | 
			
		||||
      title: CPM
 | 
			
		||||
    - local: model_doc/ctrl
 | 
			
		||||
      title: CTRL
 | 
			
		||||
    - local: model_doc/cvt
 | 
			
		||||
      title: CvT
 | 
			
		||||
    - local: model_doc/data2vec
 | 
			
		||||
      title: Data2Vec
 | 
			
		||||
    - local: model_doc/deberta
 | 
			
		||||
      title: DeBERTa
 | 
			
		||||
    - local: model_doc/deberta-v2
 | 
			
		||||
      title: DeBERTa-v2
 | 
			
		||||
    - local: model_doc/decision_transformer
 | 
			
		||||
      title: Decision Transformer
 | 
			
		||||
    - local: model_doc/deit
 | 
			
		||||
      title: DeiT
 | 
			
		||||
    - local: model_doc/detr
 | 
			
		||||
      title: DETR
 | 
			
		||||
    - local: model_doc/dialogpt
 | 
			
		||||
      title: DialoGPT
 | 
			
		||||
    - local: model_doc/distilbert
 | 
			
		||||
      title: DistilBERT
 | 
			
		||||
    - local: model_doc/dit
 | 
			
		||||
      title: DiT
 | 
			
		||||
    - local: model_doc/dpr
 | 
			
		||||
      title: DPR
 | 
			
		||||
    - local: model_doc/dpt
 | 
			
		||||
      title: DPT
 | 
			
		||||
    - local: model_doc/electra
 | 
			
		||||
      title: ELECTRA
 | 
			
		||||
    - local: model_doc/encoder-decoder
 | 
			
		||||
      title: Encoder Decoder Models
 | 
			
		||||
    - local: model_doc/flaubert
 | 
			
		||||
      title: FlauBERT
 | 
			
		||||
    - local: model_doc/flava
 | 
			
		||||
      title: FLAVA
 | 
			
		||||
    - local: model_doc/fnet
 | 
			
		||||
      title: FNet
 | 
			
		||||
    - local: model_doc/fsmt
 | 
			
		||||
      title: FSMT
 | 
			
		||||
    - local: model_doc/funnel
 | 
			
		||||
      title: Funnel Transformer
 | 
			
		||||
    - local: model_doc/glpn
 | 
			
		||||
      title: GLPN
 | 
			
		||||
    - local: model_doc/herbert
 | 
			
		||||
      title: HerBERT
 | 
			
		||||
    - local: model_doc/ibert
 | 
			
		||||
      title: I-BERT
 | 
			
		||||
    - local: model_doc/imagegpt
 | 
			
		||||
      title: ImageGPT
 | 
			
		||||
    - local: model_doc/layoutlm
 | 
			
		||||
      title: LayoutLM
 | 
			
		||||
    - local: model_doc/layoutlmv2
 | 
			
		||||
      title: LayoutLMV2
 | 
			
		||||
    - local: model_doc/layoutlmv3
 | 
			
		||||
      title: LayoutLMV3
 | 
			
		||||
    - local: model_doc/layoutxlm
 | 
			
		||||
      title: LayoutXLM
 | 
			
		||||
    - local: model_doc/led
 | 
			
		||||
      title: LED
 | 
			
		||||
    - local: model_doc/levit
 | 
			
		||||
      title: LeViT
 | 
			
		||||
    - local: model_doc/longformer
 | 
			
		||||
      title: Longformer
 | 
			
		||||
    - local: model_doc/longt5
 | 
			
		||||
      title: LongT5
 | 
			
		||||
    - local: model_doc/luke
 | 
			
		||||
      title: LUKE
 | 
			
		||||
    - local: model_doc/lxmert
 | 
			
		||||
      title: LXMERT
 | 
			
		||||
    - local: model_doc/marian
 | 
			
		||||
      title: MarianMT
 | 
			
		||||
    - local: model_doc/maskformer
 | 
			
		||||
      title: MaskFormer
 | 
			
		||||
    - local: model_doc/m2m_100
 | 
			
		||||
      title: M2M100
 | 
			
		||||
    - local: model_doc/mbart
 | 
			
		||||
      title: MBart and MBart-50
 | 
			
		||||
    - local: model_doc/mctct
 | 
			
		||||
      title: MCTCT
 | 
			
		||||
    - local: model_doc/megatron-bert
 | 
			
		||||
      title: MegatronBERT
 | 
			
		||||
    - local: model_doc/megatron_gpt2
 | 
			
		||||
      title: MegatronGPT2
 | 
			
		||||
    - local: model_doc/mluke
 | 
			
		||||
      title: mLUKE
 | 
			
		||||
    - local: model_doc/mobilebert
 | 
			
		||||
      title: MobileBERT
 | 
			
		||||
    - local: model_doc/mpnet
 | 
			
		||||
      title: MPNet
 | 
			
		||||
    - local: model_doc/mt5
 | 
			
		||||
      title: MT5
 | 
			
		||||
    - local: model_doc/nystromformer
 | 
			
		||||
      title: Nyströmformer
 | 
			
		||||
    - local: model_doc/openai-gpt
 | 
			
		||||
      title: OpenAI GPT
 | 
			
		||||
    - local: model_doc/opt
 | 
			
		||||
      title: OPT
 | 
			
		||||
    - local: model_doc/gpt2
 | 
			
		||||
      title: OpenAI GPT2
 | 
			
		||||
    - local: model_doc/gptj
 | 
			
		||||
      title: GPT-J
 | 
			
		||||
    - local: model_doc/gpt_neo
 | 
			
		||||
      title: GPT Neo
 | 
			
		||||
    - local: model_doc/gpt_neox
 | 
			
		||||
      title: GPT NeoX
 | 
			
		||||
    - local: model_doc/hubert
 | 
			
		||||
      title: Hubert
 | 
			
		||||
    - local: model_doc/perceiver
 | 
			
		||||
      title: Perceiver
 | 
			
		||||
    - local: model_doc/pegasus
 | 
			
		||||
      title: Pegasus
 | 
			
		||||
    - local: model_doc/phobert
 | 
			
		||||
      title: PhoBERT
 | 
			
		||||
    - local: model_doc/plbart
 | 
			
		||||
      title: PLBart
 | 
			
		||||
    - local: model_doc/poolformer
 | 
			
		||||
      title: PoolFormer
 | 
			
		||||
    - local: model_doc/prophetnet
 | 
			
		||||
      title: ProphetNet
 | 
			
		||||
    - local: model_doc/qdqbert
 | 
			
		||||
      title: QDQBert
 | 
			
		||||
    - local: model_doc/rag
 | 
			
		||||
      title: RAG
 | 
			
		||||
    - local: model_doc/realm
 | 
			
		||||
      title: REALM
 | 
			
		||||
    - local: model_doc/reformer
 | 
			
		||||
      title: Reformer
 | 
			
		||||
    - local: model_doc/rembert
 | 
			
		||||
      title: RemBERT
 | 
			
		||||
    - local: model_doc/regnet
 | 
			
		||||
      title: RegNet
 | 
			
		||||
    - local: model_doc/resnet
 | 
			
		||||
      title: ResNet
 | 
			
		||||
    - local: model_doc/retribert
 | 
			
		||||
      title: RetriBERT
 | 
			
		||||
    - local: model_doc/roberta
 | 
			
		||||
      title: RoBERTa
 | 
			
		||||
    - local: model_doc/roformer
 | 
			
		||||
      title: RoFormer
 | 
			
		||||
    - local: model_doc/segformer
 | 
			
		||||
      title: SegFormer
 | 
			
		||||
    - local: model_doc/sew
 | 
			
		||||
      title: SEW
 | 
			
		||||
    - local: model_doc/sew-d
 | 
			
		||||
      title: SEW-D
 | 
			
		||||
    - local: model_doc/speech-encoder-decoder
 | 
			
		||||
      title: Speech Encoder Decoder Models
 | 
			
		||||
    - local: model_doc/speech_to_text
 | 
			
		||||
      title: Speech2Text
 | 
			
		||||
    - local: model_doc/speech_to_text_2
 | 
			
		||||
      title: Speech2Text2
 | 
			
		||||
    - local: model_doc/splinter
 | 
			
		||||
      title: Splinter
 | 
			
		||||
    - local: model_doc/squeezebert
 | 
			
		||||
      title: SqueezeBERT
 | 
			
		||||
    - local: model_doc/swin
 | 
			
		||||
      title: Swin Transformer
 | 
			
		||||
    - local: model_doc/t5
 | 
			
		||||
      title: T5
 | 
			
		||||
    - local: model_doc/t5v1.1
 | 
			
		||||
      title: T5v1.1
 | 
			
		||||
    - local: model_doc/tapas
 | 
			
		||||
      title: TAPAS
 | 
			
		||||
    - local: model_doc/tapex
 | 
			
		||||
      title: TAPEX
 | 
			
		||||
    - local: model_doc/trajectory_transformer
 | 
			
		||||
      title: Trajectory Transformer
 | 
			
		||||
    - local: model_doc/transfo-xl
 | 
			
		||||
      title: Transformer XL
 | 
			
		||||
    - local: model_doc/trocr
 | 
			
		||||
      title: TrOCR
 | 
			
		||||
    - local: model_doc/unispeech
 | 
			
		||||
      title: UniSpeech
 | 
			
		||||
    - local: model_doc/unispeech-sat
 | 
			
		||||
      title: UniSpeech-SAT
 | 
			
		||||
    - local: model_doc/van
 | 
			
		||||
      title: VAN
 | 
			
		||||
    - local: model_doc/vilt
 | 
			
		||||
      title: ViLT
 | 
			
		||||
    - local: model_doc/vision-encoder-decoder
 | 
			
		||||
      title: Vision Encoder Decoder Models
 | 
			
		||||
    - local: model_doc/vision-text-dual-encoder
 | 
			
		||||
      title: Vision Text Dual Encoder
 | 
			
		||||
    - local: model_doc/vit
 | 
			
		||||
      title: Vision Transformer (ViT)
 | 
			
		||||
    - local: model_doc/vit_mae
 | 
			
		||||
      title: ViTMAE
 | 
			
		||||
    - local: model_doc/visual_bert
 | 
			
		||||
      title: VisualBERT
 | 
			
		||||
    - local: model_doc/wav2vec2
 | 
			
		||||
      title: Wav2Vec2
 | 
			
		||||
    - local: model_doc/wav2vec2-conformer
 | 
			
		||||
      title: Wav2Vec2-Conformer
 | 
			
		||||
    - local: model_doc/wav2vec2_phoneme
 | 
			
		||||
      title: Wav2Vec2Phoneme
 | 
			
		||||
    - local: model_doc/wavlm
 | 
			
		||||
      title: WavLM
 | 
			
		||||
    - local: model_doc/xglm
 | 
			
		||||
      title: XGLM
 | 
			
		||||
    - local: model_doc/xlm
 | 
			
		||||
      title: XLM
 | 
			
		||||
    - local: model_doc/xlm-prophetnet
 | 
			
		||||
      title: XLM-ProphetNet
 | 
			
		||||
    - local: model_doc/xlm-roberta
 | 
			
		||||
      title: XLM-RoBERTa
 | 
			
		||||
    - local: model_doc/xlm-roberta-xl
 | 
			
		||||
      title: XLM-RoBERTa-XL
 | 
			
		||||
    - local: model_doc/xlnet
 | 
			
		||||
      title: XLNet
 | 
			
		||||
    - local: model_doc/xlsr_wav2vec2
 | 
			
		||||
      title: XLSR-Wav2Vec2
 | 
			
		||||
    - local: model_doc/xls_r
 | 
			
		||||
      title: XLS-R
 | 
			
		||||
    - local: model_doc/yolos
 | 
			
		||||
      title: YOLOS
 | 
			
		||||
    - local: model_doc/yoso
 | 
			
		||||
      title: YOSO
 | 
			
		||||
    title: Models
 | 
			
		||||
  - sections:
 | 
			
		||||
    - local: internal/modeling_utils
 | 
			
		||||
 | 
			
		||||
@ -813,9 +813,13 @@ checkpoint and to get the required access rights to be able to upload the model
 | 
			
		||||
*brand_new_bert*. The `push_to_hub` method, present in all models in `transformers`, is a quick and efficient way to push your checkpoint to the hub. A little snippet is pasted below:
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
brand_new_bert.push_to_hub("brand_new_bert")
 | 
			
		||||
# Uncomment the following line to push to an organization.
 | 
			
		||||
# brand_new_bert.push_to_hub("<organization>/brand_new_bert")
 | 
			
		||||
brand_new_bert.push_to_hub(
 | 
			
		||||
    repo_path_or_name="brand_new_bert",
 | 
			
		||||
    # Uncomment the following line to push to an organization
 | 
			
		||||
    # organization="<ORGANIZATION>",
 | 
			
		||||
    commit_message="Add model",
 | 
			
		||||
    use_temp_dir=True,
 | 
			
		||||
)
 | 
			
		||||
```
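
After the upload, the checkpoint can be loaded back by name through the usual `from_pretrained` API. Below is a rough sketch, not part of the original guide; the repository id `your-username/brand_new_bert` is a placeholder for wherever the snippet above pushed the weights, and it assumes the new architecture is already registered with the auto classes:

```python
from transformers import AutoModel

# Rough sketch (placeholder repo id): load the checkpoint that was just pushed
# to the Hub and check that it instantiates correctly.
model = AutoModel.from_pretrained("your-username/brand_new_bert")
print(model.config)
```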
 | 
			
		||||
 | 
			
		||||
It is worth spending some time to create fitting model cards for each checkpoint. The model cards should highlight the
 | 
			
		||||
 | 
			
		||||
@ -9,10 +9,7 @@ Unless required by applicable law or agreed to in writing, software distributed
 | 
			
		||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 | 
			
		||||
-->
 | 
			
		||||
 | 
			
		||||
# How to create a custom pipeline?
 | 
			
		||||
 | 
			
		||||
In this guide, we will see how to create a custom pipeline and share it on the [Hub](https://hf.co/models) or add it to the
 | 
			
		||||
Transformers library.
 | 
			
		||||
# How to add a pipeline to 🤗 Transformers?
 | 
			
		||||
 | 
			
		||||
First and foremost, you need to decide the raw entries the pipeline will be able to take. It can be strings, raw bytes,
 | 
			
		||||
dictionaries or whatever seems to be the most likely desired input. Try to keep these inputs as pure Python as possible
 | 
			
		||||
@ -102,7 +99,7 @@ def _sanitize_parameters(self, **kwargs):
 | 
			
		||||
 | 
			
		||||
    postprocess_kwargs = {}
 | 
			
		||||
    if "top_k" in kwargs:
 | 
			
		||||
        postprocess_kwargs["top_k"] = kwargs["top_k"]
 | 
			
		||||
        preprocess_kwargs["top_k"] = kwargs["top_k"]
 | 
			
		||||
    return preprocess_kwargs, {}, postprocess_kwargs
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
@ -114,123 +111,12 @@ of arguments for ease of use (audio files, can be filenames, URLs or pure bytes)
 | 
			
		||||
 | 
			
		||||
## Adding it to the list of supported tasks
 | 
			
		||||
 | 
			
		||||
To register your `new-task` to the list of supported tasks, you have to add it to the `PIPELINE_REGISTRY`:
 | 
			
		||||
Go to `src/transformers/pipelines/__init__.py` and fill in `SUPPORTED_TASKS` with your newly created pipeline.
 | 
			
		||||
If possible it should provide a default model.
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
from transformers.pipelines import PIPELINE_REGISTRY
 | 
			
		||||
## Adding tests
 | 
			
		||||
 | 
			
		||||
PIPELINE_REGISTRY.register_pipeline(
 | 
			
		||||
    "new-task",
 | 
			
		||||
    pipeline_class=MyPipeline,
 | 
			
		||||
    pt_model=AutoModelForSequenceClassification,
 | 
			
		||||
)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
You can specify a default model if you want, in which case it should come with a specific revision (which can be the name of a branch or a commit hash, here we took `"abcdef"`) as well as the type:
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
PIPELINE_REGISTRY.register_pipeline(
 | 
			
		||||
    "new-task",
 | 
			
		||||
    pipeline_class=MyPipeline,
 | 
			
		||||
    pt_model=AutoModelForSequenceClassification,
 | 
			
		||||
    default={"pt": ("user/awesome_model", "abcdef")},
 | 
			
		||||
    type="text",  # current support type: text, audio, image, multimodal
 | 
			
		||||
)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## Share your pipeline on the Hub
 | 
			
		||||
 | 
			
		||||
To share your custom pipeline on the Hub, you just have to save the custom code of your `Pipeline` subclass in a
 | 
			
		||||
python file. For instance, let's say we want to use a custom pipeline for sentence pair classification like this:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
import numpy as np
 | 
			
		||||
 | 
			
		||||
from transformers import Pipeline
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def softmax(outputs):
 | 
			
		||||
    maxes = np.max(outputs, axis=-1, keepdims=True)
 | 
			
		||||
    shifted_exp = np.exp(outputs - maxes)
 | 
			
		||||
    return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class PairClassificationPipeline(Pipeline):
 | 
			
		||||
    def _sanitize_parameters(self, **kwargs):
 | 
			
		||||
        preprocess_kwargs = {}
 | 
			
		||||
        if "second_text" in kwargs:
 | 
			
		||||
            preprocess_kwargs["second_text"] = kwargs["second_text"]
 | 
			
		||||
        return preprocess_kwargs, {}, {}
 | 
			
		||||
 | 
			
		||||
    def preprocess(self, text, second_text=None):
 | 
			
		||||
        return self.tokenizer(text, text_pair=second_text, return_tensors=self.framework)
 | 
			
		||||
 | 
			
		||||
    def _forward(self, model_inputs):
 | 
			
		||||
        return self.model(**model_inputs)
 | 
			
		||||
 | 
			
		||||
    def postprocess(self, model_outputs):
 | 
			
		||||
        logits = model_outputs.logits[0].numpy()
 | 
			
		||||
        probabilities = softmax(logits)
 | 
			
		||||
 | 
			
		||||
        best_class = np.argmax(probabilities)
 | 
			
		||||
        label = self.model.config.id2label[best_class]
 | 
			
		||||
        score = probabilities[best_class].item()
 | 
			
		||||
        logits = logits.tolist()
 | 
			
		||||
        return {"label": label, "score": score, "logits": logits}
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
The implementation is framework agnostic, and will work for PyTorch and TensorFlow models. If we have saved this in
 | 
			
		||||
a file named `pair_classification.py`, we can then import it and register it like this:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
from pair_classification import PairClassificationPipeline
 | 
			
		||||
from transformers.pipelines import PIPELINE_REGISTRY
 | 
			
		||||
from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification
 | 
			
		||||
 | 
			
		||||
PIPELINE_REGISTRY.register_pipeline(
 | 
			
		||||
    "pair-classification",
 | 
			
		||||
    pipeline_class=PairClassificationPipeline,
 | 
			
		||||
    pt_model=AutoModelForSequenceClassification,
 | 
			
		||||
    tf_model=TFAutoModelForSequenceClassification,
 | 
			
		||||
)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Once this is done, we can use it with a pretrained model. For instance `sgugger/finetuned-bert-mrpc` has been
 | 
			
		||||
fine-tuned on the MRPC dataset, which classifies pairs of sentences as paraphrases or not.
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
from transformers import pipeline
 | 
			
		||||
 | 
			
		||||
classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc")
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Then we can share it on the Hub by using the `save_pretrained` method in a `Repository`:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
from huggingface_hub import Repository
 | 
			
		||||
 | 
			
		||||
repo = Repository("test-dynamic-pipeline", clone_from="{your_username}/test-dynamic-pipeline")
 | 
			
		||||
classifier.save_pretrained("test-dynamic-pipeline")
 | 
			
		||||
repo.push_to_hub()
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
This will copy the file where you defined `PairClassificationPipeline` inside the folder `"test-dynamic-pipeline"`,
 | 
			
		||||
along with saving the model and tokenizer of the pipeline, before pushing everything in the repository
 | 
			
		||||
`{your_username}/test-dynamic-pipeline`. After that anyone can use it as long as they provide the option
 | 
			
		||||
`trust_remote_code=True`:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
from transformers import pipeline
 | 
			
		||||
 | 
			
		||||
classifier = pipeline(model="{your_username}/test-dynamic-pipeline", trust_remote_code=True)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## Add the pipeline to Transformers
 | 
			
		||||
 | 
			
		||||
If you want to contribute your pipeline to Transformers, you will need to add a new module in the `pipelines` submodule
 | 
			
		||||
with the code of your pipeline, then add it in the list of tasks defined in `pipelines/__init__.py`.
 | 
			
		||||
 | 
			
		||||
Then you will need to add tests. Create a new file `tests/test_pipelines_MY_PIPELINE.py` with examples modeled on the other pipeline tests.
 | 
			
		||||
 | 
			
		||||
The `run_pipeline_test` function will be very generic and run on small random models on every possible
 | 
			
		||||
architecture as defined by `model_mapping` and `tf_model_mapping`.
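As a rough, self-contained sketch (this is not the project's shared test harness, just an illustration reusing the names and checkpoint from this guide), a first test for the pair classification pipeline defined above could look like this:

```python
import unittest

from pair_classification import PairClassificationPipeline  # the file defined above
from transformers import AutoModelForSequenceClassification, pipeline
from transformers.pipelines import PIPELINE_REGISTRY


class PairClassificationPipelineTests(unittest.TestCase):
    def test_pair_classification(self):
        # Register the custom task before instantiating the pipeline.
        PIPELINE_REGISTRY.register_pipeline(
            "pair-classification",
            pipeline_class=PairClassificationPipeline,
            pt_model=AutoModelForSequenceClassification,
        )
        classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc")
        output = classifier("I like you", second_text="I love you")
        # The pipeline defined above always returns these three keys.
        self.assertEqual(set(output.keys()), {"label", "score", "logits"})


if __name__ == "__main__":
    unittest.main()
```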
 | 
			
		||||
 | 
			
		||||
@ -114,6 +114,15 @@ If you want to directly load such a sharded checkpoint inside a model without us
 | 
			
		||||
 | 
			
		||||
## Low memory loading
 | 
			
		||||
 | 
			
		||||
Sharded checkpoints reduce the memory usage during step 2 of the workflow mentioned above, but in order to use that model in a low memory setting, we recommend leveraging our tools based on the Accelerate library.
 | 
			
		||||
When loading a pretrained model, there is also no need to keep the randomly initialized weights in memory: the option `low_cpu_mem_usage` discards the weights of the randomly initialized model, progressively loads the pretrained weights in their place, and then performs a random initialization only for weights that are still missing (for instance if you are loading a model with a newly initialized head for a fine-tuning task).
 | 
			
		||||
 | 
			
		||||
It's very easy to use, just add `low_cpu_mem_usage=True` to your call to [`~PreTrainedModel.from_pretrained`]:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
from transformers import AutoModel
 | 
			
		||||
 | 
			
		||||
model = AutoModel.from_pretrained("bert-base-cased", low_cpu_mem_usage=True)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
This can be used in conjunction with a sharded checkpoint.
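As a minimal sketch of combining the two (the local directory name and shard size here are arbitrary choices, not recommendations), you can shard a saved model and then reload it with `low_cpu_mem_usage=True`:

```py
from transformers import AutoModel

model = AutoModel.from_pretrained("bert-base-cased")

# Save a sharded checkpoint: no single weight file is bigger than the given size.
model.save_pretrained("sharded-bert", max_shard_size="200MB")

# Reload it without materializing a full set of random weights first.
model = AutoModel.from_pretrained("sharded-bert", low_cpu_mem_usage=True)
```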
 | 
			
		||||
 | 
			
		||||
Please read the following guide for more information: [Large model loading using Accelerate](./main_classes/model#large-model-loading)
 | 
			
		||||
@ -289,7 +289,7 @@ from huggingface_hub import notebook_login
 | 
			
		||||
notebook_login()
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
You can then push to your own namespace (or an organization you are a member of) like this:
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
resnet50d.push_to_hub("custom-resnet50d")
 | 
			
		||||
 | 
			
		||||
@ -69,7 +69,6 @@ The library currently contains JAX, PyTorch and TensorFlow implementations, pret
 | 
			
		||||
1. **[CamemBERT](model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
 | 
			
		||||
1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
 | 
			
		||||
1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
 | 
			
		||||
1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
 | 
			
		||||
1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
 | 
			
		||||
1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
 | 
			
		||||
1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
 | 
			
		||||
@ -98,7 +97,6 @@ The library currently contains JAX, PyTorch and TensorFlow implementations, pret
 | 
			
		||||
1. **[GPT NeoX](model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach
 | 
			
		||||
1. **[GPT-2](model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
 | 
			
		||||
1. **[GPT-J](model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
 | 
			
		||||
1. **[GroupViT](model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
 | 
			
		||||
1. **[Hubert](model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
 | 
			
		||||
1. **[I-BERT](model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 | 
			
		||||
1. **[ImageGPT](model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
 | 
			
		||||
@ -122,15 +120,10 @@ The library currently contains JAX, PyTorch and TensorFlow implementations, pret
 | 
			
		||||
1. **[Megatron-GPT2](model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
 | 
			
		||||
1. **[mLUKE](model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
 | 
			
		||||
1. **[MobileBERT](model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
 | 
			
		||||
1. **[MobileViT](model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
 | 
			
		||||
1. **[MPNet](model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
 | 
			
		||||
1. **[MT5](model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
 | 
			
		||||
1. **[MVP](model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen.
 | 
			
		||||
1. **[Nezha](model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
 | 
			
		||||
1. **[NLLB](model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
 | 
			
		||||
1. **[Nyströmformer](model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
 | 
			
		||||
1. **[OPT](model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
 | 
			
		||||
1. **[OWL-ViT](model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
 | 
			
		||||
1. **[Pegasus](model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
 | 
			
		||||
1. **[Perceiver IO](model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
 | 
			
		||||
1. **[PhoBERT](model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
 | 
			
		||||
@ -154,7 +147,6 @@ The library currently contains JAX, PyTorch and TensorFlow implementations, pret
 | 
			
		||||
1. **[Splinter](model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
 | 
			
		||||
1. **[SqueezeBERT](model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
 | 
			
		||||
1. **[Swin Transformer](model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
 | 
			
		||||
1. **[Swin Transformer V2](model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
 | 
			
		||||
1. **[T5](model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
 | 
			
		||||
1. **[T5v1.1](model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
 | 
			
		||||
1. **[TAPAS](model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
 | 
			
		||||
@ -162,7 +154,6 @@ The library currently contains JAX, PyTorch and TensorFlow implementations, pret
 | 
			
		||||
1. **[Trajectory Transformer](model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
 | 
			
		||||
1. **[Transformer-XL](model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
 | 
			
		||||
1. **[TrOCR](model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
 | 
			
		||||
1. **[UL2](model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
 | 
			
		||||
1. **[UniSpeech](model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
 | 
			
		||||
1. **[UniSpeechSat](model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
 | 
			
		||||
1. **[VAN](model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
 | 
			
		||||
@ -209,7 +200,6 @@ Flax), PyTorch, and/or TensorFlow.
 | 
			
		||||
|          CamemBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 | 
			
		||||
|           CANINE            |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 | 
			
		||||
|            CLIP             |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 | 
			
		||||
|           CodeGen           |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
 | 
			
		||||
|          ConvBERT           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 | 
			
		||||
|          ConvNeXT           |       ❌       |       ❌       |       ✅        |         ✅         |      ❌      |
 | 
			
		||||
|            CTRL             |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 | 
			
		||||
@ -220,7 +210,7 @@ Flax), PyTorch, and/or TensorFlow.
 | 
			
		||||
|           DeBERTa           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 | 
			
		||||
|         DeBERTa-v2          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 | 
			
		||||
|    Decision Transformer     |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 | 
			
		||||
|            DeiT             |       ❌       |       ❌       |       ✅        |         ✅         |      ❌      |
 | 
			
		||||
 | 
			
		||||
|            DETR             |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 | 
			
		||||
|         DistilBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 | 
			
		||||
|             DPR             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 | 
			
		||||
@ -236,7 +226,6 @@ Flax), PyTorch, and/or TensorFlow.
 | 
			
		||||
|           GPT Neo           |       ❌       |       ❌       |       ✅        |         ❌         |      ✅      |
 | 
			
		||||
|          GPT NeoX           |       ❌       |       ✅       |       ✅        |         ❌         |      ❌      |
 | 
			
		||||
|            GPT-J            |       ❌       |       ❌       |       ✅        |         ✅         |      ✅      |
 | 
			
		||||
|          GroupViT           |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 | 
			
		||||
|           Hubert            |       ❌       |       ❌       |       ✅        |         ✅         |      ❌      |
 | 
			
		||||
|           I-BERT            |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 | 
			
		||||
|          ImageGPT           |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 | 
			
		||||
@ -256,16 +245,12 @@ Flax), PyTorch, and/or TensorFlow.
 | 
			
		||||
|            mBART            |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 | 
			
		||||
|        Megatron-BERT        |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 | 
			
		||||
|         MobileBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 | 
			
		||||
|          MobileViT          |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 | 
			
		||||
|            MPNet            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 | 
			
		||||
|             MT5             |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 | 
			
		||||
|             MVP             |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
 | 
			
		||||
|            Nezha            |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 | 
			
		||||
|        Nyströmformer        |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 | 
			
		||||
|         OpenAI GPT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 | 
			
		||||
|        OpenAI GPT-2         |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 | 
			
		||||
|             OPT             |       ❌       |       ❌       |       ✅        |         ✅         |      ✅      |
 | 
			
		||||
|           OWL-ViT           |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 | 
			
		||||
|           Pegasus           |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 | 
			
		||||
|          Perceiver          |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 | 
			
		||||
|           PLBart            |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 | 
			
		||||
@ -275,13 +260,13 @@ Flax), PyTorch, and/or TensorFlow.
 | 
			
		||||
|             RAG             |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 | 
			
		||||
|            REALM            |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
 | 
			
		||||
|          Reformer           |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
 | 
			
		||||
|           RegNet            |       ❌       |       ❌       |       ✅        |         ✅         |      ❌      |
 | 
			
		||||
 | 
			
		||||
|           RemBERT           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 | 
			
		||||
|           ResNet            |       ❌       |       ❌       |       ✅        |         ✅         |      ❌      |
 | 
			
		||||
 | 
			
		||||
|          RetriBERT          |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
 | 
			
		||||
|           RoBERTa           |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 | 
			
		||||
|          RoFormer           |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 | 
			
		||||
|          SegFormer          |       ❌       |       ❌       |       ✅        |         ✅         |      ❌      |
 | 
			
		||||
 | 
			
		||||
|             SEW             |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 | 
			
		||||
|            SEW-D            |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 | 
			
		||||
|   Speech Encoder decoder    |       ❌       |       ❌       |       ✅        |         ❌         |      ✅      |
 | 
			
		||||
@ -290,7 +275,6 @@ Flax), PyTorch, and/or TensorFlow.
 | 
			
		||||
|          Splinter           |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
 | 
			
		||||
|         SqueezeBERT         |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
 | 
			
		||||
|      Swin Transformer       |       ❌       |       ❌       |       ✅        |         ✅         |      ❌      |
 | 
			
		||||
|     Swin Transformer V2     |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 | 
			
		||||
|             T5              |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 | 
			
		||||
|            TAPAS            |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 | 
			
		||||
|   Trajectory Transformer    |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 | 
			
		||||
 | 
			
		||||
@ -34,16 +34,11 @@ Start by creating a virtual environment in your project directory:
 | 
			
		||||
python -m venv .env
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Activate the virtual environment. On Linux and macOS:
 | 
			
		||||
 | 
			
		||||
```bash
 | 
			
		||||
source .env/bin/activate
 | 
			
		||||
```
 | 
			
		||||
Activate the virtual environment on Windows:
 | 
			
		||||
 | 
			
		||||
```bash
 | 
			
		||||
.env/Scripts/activate
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Now you're ready to install 🤗 Transformers with the following command:
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.
 | 
			
		||||
 | 
			
		||||
# Feature Extractor
 | 
			
		||||
 | 
			
		||||
A feature extractor is in charge of preparing input features for audio or vision models. This includes feature extraction
 | 
			
		||||
 | 
			
		||||
from sequences, *e.g.*, pre-processing audio files to Log-Mel Spectrogram features, feature extraction from images
 | 
			
		||||
*e.g.* cropping image files, but also padding, normalization, and conversion to NumPy, PyTorch, and TensorFlow
 | 
			
		||||
tensors.
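As a small illustration (the checkpoint is an arbitrary vision model and the image URL is just an example, neither is prescribed by this page), a feature extractor can be loaded and applied like this:

```python
import requests
from PIL import Image

from transformers import AutoFeatureExtractor

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Resizes and normalizes the image, then returns framework tensors.
feature_extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
inputs = feature_extractor(images=image, return_tensors="pt")
print(inputs["pixel_values"].shape)  # torch.Size([1, 3, 224, 224])
```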
 | 
			
		||||
 | 
			
		||||
@ -58,10 +58,6 @@ This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The o
 | 
			
		||||
 | 
			
		||||
[[autodoc]] BertTokenizerFast
 | 
			
		||||
 | 
			
		||||
## TFBertTokenizer
 | 
			
		||||
 | 
			
		||||
[[autodoc]] TFBertTokenizer
 | 
			
		||||
 | 
			
		||||
## Bert specific outputs
 | 
			
		||||
 | 
			
		||||
[[autodoc]] models.bert.modeling_bert.BertForPreTrainingOutput
 | 
			
		||||
 | 
			
		||||
@ -15,7 +15,7 @@ specific language governing permissions and limitations under the License.
 | 
			
		||||
## Overview
 | 
			
		||||
 | 
			
		||||
The BLOOM model has been proposed with its various versions through the [BigScience Workshop](https://bigscience.huggingface.co/). BigScience is inspired by other open science initiatives where researchers have pooled their time and resources to collectively achieve a higher impact.
 | 
			
		||||
The architecture of BLOOM is essentially similar to GPT3 (auto-regressive model for next token prediction), but has been trained on 46 different languages and 13 programming languages.
 | 
			
		||||
 | 
			
		||||
Several smaller versions of the models have been trained on the same dataset. BLOOM is available in the following versions:
 | 
			
		||||
 | 
			
		||||
- [bloom-350m](https://huggingface.co/bigscience/bloom-350m)
 | 
			
		||||
@ -23,7 +23,7 @@ Several smaller versions of the models have been trained on the same dataset. BL
 | 
			
		||||
- [bloom-1b3](https://huggingface.co/bigscience/bloom-1b3)
 | 
			
		||||
- [bloom-2b5](https://huggingface.co/bigscience/bloom-2b5)
 | 
			
		||||
- [bloom-6b3](https://huggingface.co/bigscience/bloom-6b3)
 | 
			
		||||
- [bloom](https://huggingface.co/bigscience/bloom) (176B parameters)
 | 
			
		||||
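As a quick sketch of running one of the smaller checkpoints listed above (the prompt and generation settings are arbitrary, not part of the original page):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "bigscience/bloom-350m"  # one of the smaller checkpoints listed above
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

inputs = tokenizer("BLOOM is a multilingual language model that", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```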
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## BloomConfig
 | 
			
		||||
@ -54,4 +54,4 @@ Several smaller versions of the models have been trained on the same dataset. BL
 | 
			
		||||
## BloomForTokenClassification
 | 
			
		||||
 | 
			
		||||
[[autodoc]] BloomForTokenClassification
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
@ -1,81 +0,0 @@
 | 
			
		||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
 | 
			
		||||
 | 
			
		||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 | 
			
		||||
the License. You may obtain a copy of the License at
 | 
			
		||||
 | 
			
		||||
http://www.apache.org/licenses/LICENSE-2.0
 | 
			
		||||
 | 
			
		||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 | 
			
		||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 | 
			
		||||
specific language governing permissions and limitations under the License.
 | 
			
		||||
-->
 | 
			
		||||
 | 
			
		||||
# CodeGen
 | 
			
		||||
 | 
			
		||||
## Overview
 | 
			
		||||
 | 
			
		||||
The CodeGen model was proposed in [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, and Caiming Xiong.
 | 
			
		||||
 | 
			
		||||
CodeGen is an autoregressive language model for program synthesis trained sequentially on [The Pile](https://pile.eleuther.ai/), BigQuery, and BigPython.
 | 
			
		||||
 | 
			
		||||
The abstract from the paper is the following:
 | 
			
		||||
 | 
			
		||||
*Program synthesis strives to generate a computer program as a solution to a given problem specification. We propose a conversational program synthesis approach via large language models, which addresses the challenges of searching over a vast program space and user intent specification faced in prior approaches. Our new approach casts the process of writing a specification and program as a multi-turn conversation between a user and a system. It treats program synthesis as a sequence prediction problem, in which the specification is expressed in natural language and the desired program is conditionally sampled. We train a family of large language models, called CodeGen, on natural language and programming language data. With weak supervision in the data and the scaling up of data size and model size, conversational capacities emerge from the simple autoregressive language modeling. To study the model behavior on conversational program synthesis, we develop a multi-turn programming benchmark (MTPB), where solving each problem requires multi-step synthesis via multi-turn conversation between the user and the model. Our findings show the emergence of conversational capabilities and the effectiveness of the proposed conversational program synthesis paradigm. In addition, our model CodeGen (with up to 16B parameters trained on TPU-v4) outperforms OpenAI's Codex on the HumanEval benchmark. We make the training library JaxFormer including checkpoints available as open source contribution: [this https URL](https://github.com/salesforce/codegen).* 
 | 
			
		||||
 | 
			
		||||
This model was contributed by [Hiroaki Hayashi](https://huggingface.co/rooa).
 | 
			
		||||
The original code can be found [here](https://github.com/salesforce/codegen).
 | 
			
		||||
 | 
			
		||||
## Checkpoint Naming
 | 
			
		||||
 | 
			
		||||
* CodeGen model [checkpoints](https://huggingface.co/models?other=codegen) are available on different pre-training data with variable sizes.
 | 
			
		||||
* The format is: `Salesforce/codegen-{size}-{data}`, where
 | 
			
		||||
  * `size`: `350M`, `2B`, `6B`, `16B`
 | 
			
		||||
  * `data`: 
 | 
			
		||||
    * `nl`: Pre-trained on the Pile
 | 
			
		||||
    * `multi`: Initialized with `nl`, then further pre-trained on multiple programming languages data
 | 
			
		||||
    * `mono`: Initialized with `multi`, then further pre-trained on Python data
 | 
			
		||||
* For example, `Salesforce/codegen-350M-mono` offers a 350 million-parameter checkpoint pre-trained sequentially on the Pile, multiple programming languages, and Python.
 | 
			
		||||
 | 
			
		||||
## How to use
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
 | 
			
		||||
 | 
			
		||||
>>> checkpoint = "Salesforce/codegen-350M-mono"
 | 
			
		||||
>>> model = AutoModelForCausalLM.from_pretrained(checkpoint)
 | 
			
		||||
>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
 | 
			
		||||
 | 
			
		||||
>>> text = "def hello_world():"
 | 
			
		||||
 | 
			
		||||
>>> completion = model.generate(**tokenizer(text, return_tensors="pt"))
 | 
			
		||||
 | 
			
		||||
>>> print(tokenizer.decode(completion[0]))
 | 
			
		||||
def hello_world():
 | 
			
		||||
    print("Hello World")
 | 
			
		||||
 | 
			
		||||
hello_world()
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## CodeGenConfig
 | 
			
		||||
 | 
			
		||||
[[autodoc]] CodeGenConfig
 | 
			
		||||
    - all
 | 
			
		||||
 | 
			
		||||
## CodeGenTokenizer
 | 
			
		||||
 | 
			
		||||
[[autodoc]] CodeGenTokenizer
 | 
			
		||||
    - save_vocabulary
 | 
			
		||||
 | 
			
		||||
## CodeGenTokenizerFast
 | 
			
		||||
 | 
			
		||||
[[autodoc]] CodeGenTokenizerFast
 | 
			
		||||
 | 
			
		||||
## CodeGenModel
 | 
			
		||||
 | 
			
		||||
[[autodoc]] CodeGenModel
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
## CodeGenForCausalLM
 | 
			
		||||
 | 
			
		||||
[[autodoc]] CodeGenForCausalLM
 | 
			
		||||
    - forward
 | 
			
		||||
@ -69,7 +69,7 @@ Tips:
 | 
			
		||||
  *facebook/deit-base-patch16-384*. Note that one should use [`DeiTFeatureExtractor`] in order to
 | 
			
		||||
  prepare images for the model.
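Building on the tip above, here is a minimal sketch of preparing an image with [`DeiTFeatureExtractor`] and classifying it; the distilled checkpoint and image URL are example choices, not requirements:

```python
import requests
from PIL import Image

from transformers import DeiTFeatureExtractor, DeiTForImageClassificationWithTeacher

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

feature_extractor = DeiTFeatureExtractor.from_pretrained("facebook/deit-base-distilled-patch16-224")
model = DeiTForImageClassificationWithTeacher.from_pretrained("facebook/deit-base-distilled-patch16-224")

inputs = feature_extractor(images=image, return_tensors="pt")
logits = model(**inputs).logits
print(model.config.id2label[logits.argmax(-1).item()])
```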
 | 
			
		||||
 | 
			
		||||
This model was contributed by [nielsr](https://huggingface.co/nielsr). The TensorFlow version of this model was added by [amyeroberts](https://huggingface.co/amyeroberts).
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## DeiTConfig
 | 
			
		||||
@ -100,23 +100,3 @@ This model was contributed by [nielsr](https://huggingface.co/nielsr). The Tenso
 | 
			
		||||
 | 
			
		||||
[[autodoc]] DeiTForImageClassificationWithTeacher
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
## TFDeiTModel
 | 
			
		||||
 | 
			
		||||
[[autodoc]] TFDeiTModel
 | 
			
		||||
    - call
 | 
			
		||||
 | 
			
		||||
## TFDeiTForMaskedImageModeling
 | 
			
		||||
 | 
			
		||||
[[autodoc]] TFDeiTForMaskedImageModeling
 | 
			
		||||
    - call
 | 
			
		||||
 | 
			
		||||
## TFDeiTForImageClassification
 | 
			
		||||
 | 
			
		||||
[[autodoc]] TFDeiTForImageClassification
 | 
			
		||||
    - call
 | 
			
		||||
 | 
			
		||||
## TFDeiTForImageClassificationWithTeacher
 | 
			
		||||
 | 
			
		||||
[[autodoc]] TFDeiTForImageClassificationWithTeacher
 | 
			
		||||
    - call
 | 
			
		||||
 | 
			
		||||
@ -113,28 +113,6 @@ Tips:
 | 
			
		||||
- The size of the images will determine the amount of memory being used, and will thus determine the `batch_size`.
 | 
			
		||||
  It is advised to use a batch size of 2 per GPU. See [this Github thread](https://github.com/facebookresearch/detr/issues/150) for more info.
 | 
			
		||||
 | 
			
		||||
There are three ways to instantiate a DETR model (depending on what you prefer):
 | 
			
		||||
  
 | 
			
		||||
Option 1: Instantiate DETR with pre-trained weights for entire model
 | 
			
		||||
```py
 | 
			
		||||
>>> from transformers import DetrForObjectDetection
 | 
			
		||||
 | 
			
		||||
>>> model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Option 2: Instantiate DETR with randomly initialized weights for Transformer, but pre-trained weights for backbone
 | 
			
		||||
```py
 | 
			
		||||
>>> from transformers import DetrConfig, DetrForObjectDetection
 | 
			
		||||
 | 
			
		||||
>>> config = DetrConfig()
 | 
			
		||||
>>> model = DetrForObjectDetection(config)
 | 
			
		||||
```
 | 
			
		||||
Option 3: Instantiate DETR with randomly initialized weights for backbone + Transformer
 | 
			
		||||
```py
 | 
			
		||||
>>> config = DetrConfig(use_pretrained_backbone=False)
 | 
			
		||||
>>> model = DetrForObjectDetection(config)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
As a summary, consider the following table:
 | 
			
		||||
 | 
			
		||||
| Task | Object detection | Instance segmentation | Panoptic segmentation |
 | 
			
		||||
@ -188,4 +166,4 @@ mean Average Precision (mAP) and Panoptic Quality (PQ). The latter objects are i
 | 
			
		||||
## DetrForSegmentation
 | 
			
		||||
 | 
			
		||||
[[autodoc]] DetrForSegmentation
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
@ -12,8 +12,6 @@ specific language governing permissions and limitations under the License.
 | 
			
		||||
 | 
			
		||||
# Encoder Decoder Models
 | 
			
		||||
 | 
			
		||||
## Overview
 | 
			
		||||
 | 
			
		||||
The [`EncoderDecoderModel`] can be used to initialize a sequence-to-sequence model with any
 | 
			
		||||
pretrained autoencoding model as the encoder and any pretrained autoregressive model as the decoder.
 | 
			
		||||
 | 
			
		||||
@ -27,77 +25,15 @@ any other models (see the examples for more information).
 | 
			
		||||
An application of this architecture could be to leverage two pretrained [`BertModel`] as the encoder
 | 
			
		||||
and decoder for a summarization model as was shown in: [Text Summarization with Pretrained Encoders](https://arxiv.org/abs/1908.08345) by Yang Liu and Mirella Lapata.
 | 
			
		||||
 | 
			
		||||
## Randomly initializing `EncoderDecoderModel` from model configurations.
 | 
			
		||||
 | 
			
		||||
[`EncoderDecoderModel`] can be randomly initialized from an encoder and a decoder config. In the following example, we show how to do this using the default [`BertModel`] configuration for the encoder and the default [`BertForCausalLM`] configuration for the decoder.
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
>>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel
 | 
			
		||||
 | 
			
		||||
>>> config_encoder = BertConfig()
 | 
			
		||||
>>> config_decoder = BertConfig()
 | 
			
		||||
 | 
			
		||||
>>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
 | 
			
		||||
>>> model = EncoderDecoderModel(config=config)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## Initialising `EncoderDecoderModel` from a pretrained encoder and a pretrained decoder.
 | 
			
		||||
 | 
			
		||||
[`EncoderDecoderModel`] can be initialized from a pretrained encoder checkpoint and a pretrained decoder checkpoint. Note that any pretrained auto-encoding model, *e.g.* BERT, can serve as the encoder and both pretrained auto-encoding models, *e.g.* BERT, pretrained causal language models, *e.g.* GPT2, as well as the pretrained decoder part of sequence-to-sequence models, *e.g.* decoder of BART, can be used as the decoder.
 | 
			
		||||
Depending on which architecture you choose as the decoder, the cross-attention layers might be randomly initialized.
 | 
			
		||||
Initializing [`EncoderDecoderModel`] from a pretrained encoder and decoder checkpoint requires the model to be fine-tuned on a downstream task, as has been shown in [the *Warm-starting-encoder-decoder blog post*](https://huggingface.co/blog/warm-starting-encoder-decoder).
 | 
			
		||||
To do so, the `EncoderDecoderModel` class provides a [`EncoderDecoderModel.from_encoder_decoder_pretrained`] method.
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
>>> from transformers import EncoderDecoderModel, BertTokenizer
 | 
			
		||||
 | 
			
		||||
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
 | 
			
		||||
>>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## Loading an existing `EncoderDecoderModel` checkpoint and performing inference.
 | 
			
		||||
 | 
			
		||||
To load fine-tuned checkpoints of the `EncoderDecoderModel` class, [`EncoderDecoderModel`] provides the `from_pretrained(...)` method just like any other model architecture in Transformers.
 | 
			
		||||
 | 
			
		||||
To perform inference, one uses the [`generate`] method, which allows you to autoregressively generate text. This method supports various forms of decoding, such as greedy, beam search and multinomial sampling.
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
>>> from transformers import AutoTokenizer, EncoderDecoderModel
 | 
			
		||||
 | 
			
		||||
>>> # load a fine-tuned seq2seq model and corresponding tokenizer
 | 
			
		||||
>>> model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert_cnn_daily_mail")
 | 
			
		||||
>>> tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/bert2bert_cnn_daily_mail")
 | 
			
		||||
 | 
			
		||||
>>> # let's perform inference on a long piece of text
 | 
			
		||||
>>> ARTICLE_TO_SUMMARIZE = (
 | 
			
		||||
...     "PG&E stated it scheduled the blackouts in response to forecasts for high winds "
 | 
			
		||||
...     "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were "
 | 
			
		||||
...     "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."
 | 
			
		||||
... )
 | 
			
		||||
>>> input_ids = tokenizer(ARTICLE_TO_SUMMARIZE, return_tensors="pt").input_ids
 | 
			
		||||
 | 
			
		||||
>>> # autoregressively generate summary (uses greedy decoding by default)
 | 
			
		||||
>>> generated_ids = model.generate(input_ids)
 | 
			
		||||
>>> generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
 | 
			
		||||
>>> print(generated_text)
 | 
			
		||||
nearly 800 thousand customers were affected by the shutoffs. the aim is to reduce the risk of wildfires. nearly 800, 000 customers were expected to be affected by high winds amid dry conditions. pg & e said it scheduled the blackouts to last through at least midday tomorrow.
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## Loading a PyTorch checkpoint into `TFEncoderDecoderModel`.
 | 
			
		||||
 | 
			
		||||
[`TFEncoderDecoderModel.from_pretrained`] currently doesn't support initializing the model from a
 | 
			
		||||
 | 
			
		||||
pytorch checkpoint. Passing `from_pt=True` to this method will throw an exception. If there are only pytorch
 | 
			
		||||
checkpoints for a particular encoder-decoder model, a workaround is:
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
>>> # a workaround to load from pytorch checkpoint
 | 
			
		||||
>>> from transformers import EncoderDecoderModel, TFEncoderDecoderModel
 | 
			
		||||
 | 
			
		||||
>>> _model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16")
 | 
			
		||||
 | 
			
		||||
>>> _model.encoder.save_pretrained("./encoder")
 | 
			
		||||
>>> _model.decoder.save_pretrained("./decoder")
 | 
			
		||||
 | 
			
		||||
>>> model = TFEncoderDecoderModel.from_encoder_decoder_pretrained(
 | 
			
		||||
...     "./encoder", "./decoder", encoder_from_pt=True, decoder_from_pt=True
 | 
			
		||||
... )
 | 
			
		||||
@ -105,38 +41,6 @@ checkpoints for a particular encoder-decoder model, a workaround is:
 | 
			
		||||
>>> model.config = _model.config
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## Training
 | 
			
		||||
 | 
			
		||||
Once the model is created, it can be fine-tuned similarly to BART, T5 or any other encoder-decoder model.
 | 
			
		||||
As you can see, only 2 inputs are required for the model in order to compute a loss: `input_ids` (which are the
 | 
			
		||||
`input_ids` of the encoded input sequence) and `labels` (which are the `input_ids` of the encoded
 | 
			
		||||
target sequence).
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
>>> from transformers import BertTokenizer, EncoderDecoderModel
 | 
			
		||||
 | 
			
		||||
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
 | 
			
		||||
>>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")
 | 
			
		||||
 | 
			
		||||
>>> model.config.decoder_start_token_id = tokenizer.cls_token_id
 | 
			
		||||
>>> model.config.pad_token_id = tokenizer.pad_token_id
 | 
			
		||||
 | 
			
		||||
>>> input_ids = tokenizer(
 | 
			
		||||
...     "The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side.During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was  finished in 1930. It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft).Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct.",
 | 
			
		||||
...     return_tensors="pt",
 | 
			
		||||
... ).input_ids
 | 
			
		||||
 | 
			
		||||
>>> labels = tokenizer(
 | 
			
		||||
...     "the eiffel tower surpassed the washington monument to become the tallest structure in the world. it was the first structure to reach a height of 300 metres in paris in 1930. it is now taller than the chrysler building by 5. 2 metres ( 17 ft ) and is the second tallest free - standing structure in paris.",
 | 
			
		||||
...     return_tensors="pt",
 | 
			
		||||
... ).input_ids
 | 
			
		||||
 | 
			
		||||
>>> # the forward function automatically creates the correct decoder_input_ids
 | 
			
		||||
>>> loss = model(input_ids=input_ids, labels=labels).loss
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Detailed [colab](https://colab.research.google.com/drive/1WIk2bxglElfZewOHboPFNj8H44_VAyKE?usp=sharing#scrollTo=ZwQIEhKOrJpl) for training.
 | 
			
		||||
 | 
			
		||||
This model was contributed by [thomwolf](https://github.com/thomwolf). This model's TensorFlow and Flax versions
 | 
			
		||||
were contributed by [ydshieh](https://github.com/ydshieh).
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -1,61 +0,0 @@
 | 
			
		||||
<!--Copyright 2022 NVIDIA and The HuggingFace Team. All rights reserved.
 | 
			
		||||
 | 
			
		||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 | 
			
		||||
the License. You may obtain a copy of the License at
 | 
			
		||||
 | 
			
		||||
http://www.apache.org/licenses/LICENSE-2.0
 | 
			
		||||
 | 
			
		||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 | 
			
		||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 | 
			
		||||
specific language governing permissions and limitations under the License.
 | 
			
		||||
-->
 | 
			
		||||
 | 
			
		||||
# GroupViT
 | 
			
		||||
 | 
			
		||||
## Overview
 | 
			
		||||
 | 
			
		||||
The GroupViT model was proposed in [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
 | 
			
		||||
Inspired by [CLIP](clip), GroupViT is a vision-language model that can perform zero-shot semantic segmentation on any given vocabulary categories.
 | 
			
		||||
 | 
			
		||||
The abstract from the paper is the following:
 | 
			
		||||
 | 
			
		||||
*Grouping and recognition are important components of visual scene understanding, e.g., for object detection and semantic segmentation. With end-to-end deep learning systems, grouping of image regions usually happens implicitly via top-down supervision from pixel-level recognition labels. Instead, in this paper, we propose to bring back the grouping mechanism into deep networks, which allows semantic segments to emerge automatically with only text supervision. We propose a hierarchical Grouping Vision Transformer (GroupViT), which goes beyond the regular grid structure representation and learns to group image regions into progressively larger arbitrary-shaped segments. We train GroupViT jointly with a text encoder on a large-scale image-text dataset via contrastive losses. With only text supervision and without any pixel-level annotations, GroupViT learns to group together semantic regions and successfully transfers to the task of semantic segmentation in a zero-shot manner, i.e., without any further fine-tuning. It achieves a zero-shot accuracy of 52.3% mIoU on the PASCAL VOC 2012 and 22.4% mIoU on PASCAL Context datasets, and performs competitively to state-of-the-art transfer-learning methods requiring greater levels of supervision.*
 | 
			
		||||
 | 
			
		||||
Tips:
 | 
			
		||||
 | 
			
		||||
- You may specify `output_segmentation=True` in the forward of `GroupViTModel` to get the segmentation logits of input texts. 
 | 
			
		||||
- The quickest way to get started with GroupViT is by checking the [example notebooks](https://github.com/xvjiarui/GroupViT/blob/main/demo/GroupViT_hf_inference_notebook.ipynb) (which showcase zero-shot segmentation inference). One can also check out the [HuggingFace Spaces demo](https://huggingface.co/spaces/xvjiarui/GroupViT) to play with GroupViT. 
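Building on the first tip, the following is a rough sketch of zero-shot segmentation; the processor class, checkpoint name and output field are based on the public GroupViT checkpoint and are assumptions that may differ across library versions:

```python
import requests
from PIL import Image

from transformers import CLIPProcessor, GroupViTModel

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = CLIPProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")

inputs = processor(
    text=["a photo of a cat", "a photo of a dog"],
    images=image,
    return_tensors="pt",
    padding=True,
)
outputs = model(**inputs, output_segmentation=True)

# One segmentation logit map per input text.
print(outputs.segmentation_logits.shape)
```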
 | 
			
		||||
 | 
			
		||||
This model was contributed by [xvjiarui](https://huggingface.co/xvjiarui).
 | 
			
		||||
The original code can be found [here](https://github.com/NVlabs/GroupViT).
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## GroupViTConfig
 | 
			
		||||
 | 
			
		||||
[[autodoc]] GroupViTConfig
 | 
			
		||||
    - from_text_vision_configs
 | 
			
		||||
 | 
			
		||||
## GroupViTTextConfig
 | 
			
		||||
 | 
			
		||||
[[autodoc]] GroupViTTextConfig
 | 
			
		||||
 | 
			
		||||
## GroupViTVisionConfig
 | 
			
		||||
 | 
			
		||||
[[autodoc]] GroupViTVisionConfig
 | 
			
		||||
 | 
			
		||||
## GroupViTModel
 | 
			
		||||
 | 
			
		||||
[[autodoc]] GroupViTModel
 | 
			
		||||
    - forward
 | 
			
		||||
    - get_text_features
 | 
			
		||||
    - get_image_features
 | 
			
		||||
 | 
			
		||||
## GroupViTTextModel
 | 
			
		||||
 | 
			
		||||
[[autodoc]] GroupViTTextModel
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
## GroupViTVisionModel
 | 
			
		||||
 | 
			
		||||
[[autodoc]] GroupViTVisionModel
 | 
			
		||||
    - forward
 | 
			
		||||
@ -28,9 +28,8 @@ Tips:
 | 
			
		||||
    - images need to be resized and normalized with channels in regular RGB format. LayoutLMv2 on the other hand normalizes the images internally and expects the channels in BGR format.
 | 
			
		||||
    - text is tokenized using byte-pair encoding (BPE), as opposed to WordPiece. 
 | 
			
		||||
  Due to these differences in data preprocessing, one can use [`LayoutLMv3Processor`] which internally combines a [`LayoutLMv3FeatureExtractor`] (for the image modality) and a [`LayoutLMv3Tokenizer`]/[`LayoutLMv3TokenizerFast`] (for the text modality) to prepare all data for the model.
 | 
			
		||||
- Regarding usage of [`LayoutLMv3Processor`], we refer to the [usage guide](layoutlmv2#usage-layoutlmv2processor) of its predecessor. 
 | 
			
		||||
- Regarding usage of [`LayoutLMv3Processor`], we refer to the [usage guide](layoutlmv2#usage-LayoutLMv2Processor) of its predecessor. 
 | 
			
		||||
- Demo notebooks for LayoutLMv3 can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/LayoutLMv3).
 | 
			
		||||
- Demo scripts can be found [here](https://github.com/huggingface/transformers/tree/main/examples/research_projects/layoutlmv3).
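A minimal sketch of preparing a document image with [`LayoutLMv3Processor`]; the `microsoft/layoutlmv3-base` checkpoint, the local image path, and the `num_labels` value are assumptions for illustration only.

```python
from PIL import Image
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification

# assumed checkpoint name
processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
# num_labels is arbitrary here; the token classification head is randomly initialized
model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=7)

image = Image.open("document.png").convert("RGB")  # placeholder path to a document image

# by default the processor runs OCR to obtain words and bounding boxes,
# then tokenizes the words and normalizes the image
encoding = processor(image, return_tensors="pt")
outputs = model(**encoding)
logits = outputs.logits  # (batch_size, sequence_length, num_labels)
```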
 | 
			
		||||
 | 
			
		||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/layoutlmv3_architecture.png"
 | 
			
		||||
alt="drawing" width="600"/> 
 | 
			
		||||
 | 
			
		||||
@ -152,23 +152,3 @@ This model was contributed by [ikuyamada](https://huggingface.co/ikuyamada) and
 | 
			
		||||
 | 
			
		||||
[[autodoc]] LukeForEntitySpanClassification
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
## LukeForSequenceClassification
 | 
			
		||||
 | 
			
		||||
[[autodoc]] LukeForSequenceClassification
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
## LukeForMultipleChoice
 | 
			
		||||
 | 
			
		||||
[[autodoc]] LukeForMultipleChoice
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
## LukeForTokenClassification
 | 
			
		||||
 | 
			
		||||
[[autodoc]] LukeForTokenClassification
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
## LukeForQuestionAnswering
 | 
			
		||||
 | 
			
		||||
[[autodoc]] LukeForQuestionAnswering
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
@ -55,7 +55,9 @@ tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en
 | 
			
		||||
src_text = "Life is like a box of chocolates."
 | 
			
		||||
tgt_text = "La vie est comme une boîte de chocolat."
 | 
			
		||||
 | 
			
		||||
model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
 | 
			
		||||
model_inputs = tokenizer(src_text, return_tensors="pt")
 | 
			
		||||
with tokenizer.as_target_tokenizer():
 | 
			
		||||
    labels = tokenizer(tgt_text, return_tensors="pt").input_ids
 | 
			
		||||
 | 
			
		||||
loss = model(**model_inputs, labels=labels)  # forward pass
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
@ -155,7 +155,7 @@ Example of translating english to many romance languages, using old-style 2 char
 | 
			
		||||
## MarianTokenizer
 | 
			
		||||
 | 
			
		||||
[[autodoc]] MarianTokenizer
 | 
			
		||||
    - build_inputs_with_special_tokens
 | 
			
		||||
    - as_target_tokenizer
 | 
			
		||||
 | 
			
		||||
## MarianModel
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -33,7 +33,7 @@ Tips:
 | 
			
		||||
  `get_num_masks` function inside in the `MaskFormerLoss` class of `modeling_maskformer.py`. When training on multiple nodes, this should be
 | 
			
		||||
  set to the average number of target masks across all nodes, as can be seen in the original implementation [here](https://github.com/facebookresearch/MaskFormer/blob/da3e60d85fdeedcb31476b5edd7d328826ce56cc/mask_former/modeling/criterion.py#L169).
 | 
			
		||||
- One can use [`MaskFormerFeatureExtractor`] to prepare images and optional targets for the model (see the sketch after these tips).
 | 
			
		||||
- To get the final segmentation, depending on the task, you can call [`~MaskFormerFeatureExtractor.post_process_semantic_segmentation`] or [`~MaskFormerFeatureExtractor.post_process_panoptic_segmentation`]. Both tasks can be solved using the [`MaskFormerForInstanceSegmentation`] output; panoptic segmentation accepts an optional `label_ids_to_fuse` argument to fuse instances of the target object/s (e.g. sky) together.
 | 
			
		||||
- To get the final segmentation, depending on the task, you can call [`~MaskFormerFeatureExtractor.post_process_semantic_segmentation`] or [`~MaskFormerFeatureExtractor.post_process_panoptic_segmentation`]. Both tasks can be solved using the [`MaskFormerForInstanceSegmentation`] output; the latter needs an additional `is_thing_map` to know which instances must be merged together.
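As a minimal sketch of that flow (the `facebook/maskformer-swin-base-ade` checkpoint name is an assumption, and the exact return type of the post-processing call may differ between library versions):

```python
import requests
from PIL import Image
from transformers import MaskFormerFeatureExtractor, MaskFormerForInstanceSegmentation

feature_extractor = MaskFormerFeatureExtractor.from_pretrained("facebook/maskformer-swin-base-ade")
model = MaskFormerForInstanceSegmentation.from_pretrained("facebook/maskformer-swin-base-ade")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = feature_extractor(images=image, return_tensors="pt")
outputs = model(**inputs)

# raw per-query predictions
class_queries_logits = outputs.class_queries_logits
masks_queries_logits = outputs.masks_queries_logits

# collapse the queries into a semantic segmentation map for the first image
semantic_map = feature_extractor.post_process_semantic_segmentation(outputs)[0]
```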
 | 
			
		||||
 | 
			
		||||
The figure below illustrates the architecture of MaskFormer. Taken from the [original paper](https://arxiv.org/abs/2107.06278).
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -34,8 +34,8 @@ model is multilingual it expects the sequences in a different format. A special
 | 
			
		||||
source and target text. The source text format is `X [eos, src_lang_code]` where `X` is the source text. The
 | 
			
		||||
target text format is `[tgt_lang_code] X [eos]`. `bos` is never used.
 | 
			
		||||
 | 
			
		||||
The regular [`~MBartTokenizer.__call__`] will encode source text format passed as first argument or with the `text`
 | 
			
		||||
keyword, and target text format passed with the `text_target` keyword argument.
 | 
			
		||||
The regular [`~MBartTokenizer.__call__`] will encode source text format, and it should be wrapped
 | 
			
		||||
inside the context manager [`~MBartTokenizer.as_target_tokenizer`] to encode target text format.
 | 
			
		||||
 | 
			
		||||
- Supervised training
 | 
			
		||||
 | 
			
		||||
@ -46,11 +46,13 @@ keyword, and target text format passed with the `text_label` keyword argument.
 | 
			
		||||
>>> example_english_phrase = "UN Chief Says There Is No Military Solution in Syria"
 | 
			
		||||
>>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
 | 
			
		||||
 | 
			
		||||
>>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")
 | 
			
		||||
>>> inputs = tokenizer(example_english_phrase, return_tensors="pt")
 | 
			
		||||
>>> with tokenizer.as_target_tokenizer():
 | 
			
		||||
...     labels = tokenizer(expected_translation_romanian, return_tensors="pt")
 | 
			
		||||
 | 
			
		||||
>>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")
 | 
			
		||||
>>> # forward pass
 | 
			
		||||
>>> model(**inputs)
 | 
			
		||||
>>> model(**inputs, labels=batch["labels"])
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
- Generation
 | 
			
		||||
@ -106,9 +108,11 @@ tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_
 | 
			
		||||
src_text = " UN Chief Says There Is No Military Solution in Syria"
 | 
			
		||||
tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
 | 
			
		||||
 | 
			
		||||
model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
 | 
			
		||||
model_inputs = tokenizer(src_text, return_tensors="pt")
 | 
			
		||||
with tokenizer.as_target_tokenizer():
 | 
			
		||||
    labels = tokenizer(tgt_text, return_tensors="pt").input_ids
 | 
			
		||||
 | 
			
		||||
model(**model_inputs)  # forward pass
 | 
			
		||||
model(**model_inputs, labels=labels)  # forward pass
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
- Generation
 | 
			
		||||
@ -150,6 +154,7 @@ tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
 | 
			
		||||
## MBartTokenizer
 | 
			
		||||
 | 
			
		||||
[[autodoc]] MBartTokenizer
 | 
			
		||||
    - as_target_tokenizer
 | 
			
		||||
    - build_inputs_with_special_tokens
 | 
			
		||||
 | 
			
		||||
## MBartTokenizerFast
 | 
			
		||||
 | 
			
		||||
@ -48,6 +48,7 @@ This model was contributed by [cwkeam](https://huggingface.co/cwkeam). The origi
 | 
			
		||||
    - save_pretrained
 | 
			
		||||
    - batch_decode
 | 
			
		||||
    - decode
 | 
			
		||||
    - as_target_processor
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## MCTCTModel
 | 
			
		||||
 | 
			
		||||
@ -1,55 +0,0 @@
 | 
			
		||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
 | 
			
		||||
 | 
			
		||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 | 
			
		||||
the License. You may obtain a copy of the License at
 | 
			
		||||
 | 
			
		||||
http://www.apache.org/licenses/LICENSE-2.0
 | 
			
		||||
 | 
			
		||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 | 
			
		||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 | 
			
		||||
specific language governing permissions and limitations under the License.
 | 
			
		||||
-->
 | 
			
		||||
 | 
			
		||||
# MobileViT
 | 
			
		||||
 | 
			
		||||
## Overview
 | 
			
		||||
 | 
			
		||||
The MobileViT model was proposed in [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari. MobileViT introduces a new layer that replaces local processing in convolutions with global processing using transformers. 
 | 
			
		||||
 | 
			
		||||
The abstract from the paper is the following:
 | 
			
		||||
 | 
			
		||||
*Light-weight convolutional neural networks (CNNs) are the de-facto for mobile vision tasks. Their spatial inductive biases allow them to learn representations with fewer parameters across different vision tasks. However, these networks are spatially local. To learn global representations, self-attention-based vision trans-formers (ViTs) have been adopted. Unlike CNNs, ViTs are heavy-weight. In this paper, we ask the following question: is it possible to combine the strengths of CNNs and ViTs to build a light-weight and low latency network for mobile vision tasks? Towards this end, we introduce MobileViT, a light-weight and general-purpose vision transformer for mobile devices. MobileViT presents a different perspective for the global processing of information with transformers, i.e., transformers as convolutions. Our results show that MobileViT significantly outperforms CNN- and ViT-based networks across different tasks and datasets. On the ImageNet-1k dataset, MobileViT achieves top-1 accuracy of 78.4% with about 6 million parameters, which is 3.2% and 6.2% more accurate than MobileNetv3 (CNN-based) and DeIT (ViT-based) for a similar number of parameters. On the MS-COCO object detection task, MobileViT is 5.7% more accurate than MobileNetv3 for a similar number of parameters.*
 | 
			
		||||
 | 
			
		||||
Tips:
 | 
			
		||||
 | 
			
		||||
- MobileViT is more like a CNN than a Transformer model. It does not work on sequence data but on batches of images. Unlike ViT, there are no embeddings. The backbone model outputs a feature map.
 | 
			
		||||
- One can use [`MobileViTFeatureExtractor`] to prepare images for the model (see the sketch after these tips). Note that if you do your own preprocessing, the pretrained checkpoints expect images to be in BGR pixel order (not RGB).
 | 
			
		||||
- The available image classification checkpoints are pre-trained on [ImageNet-1k](https://huggingface.co/datasets/imagenet-1k) (also referred to as ILSVRC 2012, a collection of 1.3 million images and 1,000 classes).
 | 
			
		||||
- The segmentation model uses a [DeepLabV3](https://arxiv.org/abs/1706.05587) head. The available semantic segmentation checkpoints are pre-trained on [PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/). 
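A minimal image classification sketch, assuming the `apple/mobilevit-small` checkpoint and a COCO test image:

```python
import requests
from PIL import Image
from transformers import MobileViTFeatureExtractor, MobileViTForImageClassification

feature_extractor = MobileViTFeatureExtractor.from_pretrained("apple/mobilevit-small")
model = MobileViTForImageClassification.from_pretrained("apple/mobilevit-small")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# the feature extractor handles resizing and the channel-order conversion described in the tips
inputs = feature_extractor(images=image, return_tensors="pt")
logits = model(**inputs).logits
predicted_label = logits.argmax(-1).item()
print(model.config.id2label[predicted_label])
```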
 | 
			
		||||
 | 
			
		||||
This model was contributed by [matthijs](https://huggingface.co/Matthijs). The original code and weights can be found [here](https://github.com/apple/ml-cvnets).
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## MobileViTConfig
 | 
			
		||||
 | 
			
		||||
[[autodoc]] MobileViTConfig
 | 
			
		||||
 | 
			
		||||
## MobileViTFeatureExtractor
 | 
			
		||||
 | 
			
		||||
[[autodoc]] MobileViTFeatureExtractor
 | 
			
		||||
    - __call__
 | 
			
		||||
 | 
			
		||||
## MobileViTModel
 | 
			
		||||
 | 
			
		||||
[[autodoc]] MobileViTModel
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
## MobileViTForImageClassification
 | 
			
		||||
 | 
			
		||||
[[autodoc]] MobileViTForImageClassification
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
## MobileViTForSemanticSegmentation
 | 
			
		||||
 | 
			
		||||
[[autodoc]] MobileViTForSemanticSegmentation
 | 
			
		||||
    - forward
 | 
			
		||||
@ -96,7 +96,3 @@ See [`T5TokenizerFast`] for all details.
 | 
			
		||||
## FlaxMT5ForConditionalGeneration
 | 
			
		||||
 | 
			
		||||
[[autodoc]] FlaxMT5ForConditionalGeneration
 | 
			
		||||
 | 
			
		||||
## FlaxMT5EncoderModel
 | 
			
		||||
 | 
			
		||||
[[autodoc]] FlaxMT5EncoderModel
 | 
			
		||||
 | 
			
		||||
@ -1,138 +0,0 @@
 | 
			
		||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
 | 
			
		||||
 | 
			
		||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 | 
			
		||||
the License. You may obtain a copy of the License at
 | 
			
		||||
 | 
			
		||||
http://www.apache.org/licenses/LICENSE-2.0
 | 
			
		||||
 | 
			
		||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 | 
			
		||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 | 
			
		||||
specific language governing permissions and limitations under the License.
 | 
			
		||||
-->
 | 
			
		||||
 | 
			
		||||
# MVP
 | 
			
		||||
 | 
			
		||||
## Overview
 | 
			
		||||
 | 
			
		||||
The MVP model was proposed in [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
According to the abstract,
 | 
			
		||||
 | 
			
		||||
- MVP follows a standard Transformer encoder-decoder architecture.
 | 
			
		||||
- MVP is supervised pre-trained using labeled datasets.
 | 
			
		||||
- MVP also has task-specific soft prompts to stimulate the model's capacity in performing a certain task.
 | 
			
		||||
- MVP is specially designed for natural language generation and can be adapted to a wide range of generation tasks, including but not limited to summarization, data-to-text generation, open-ended dialogue system, story generation, question answering, question generation, task-oriented dialogue system, commonsense generation, paraphrase generation, text style transfer, and text simplification. Our model can also be adapted to natural language understanding tasks such as sequence classification and (extractive) question answering.
 | 
			
		||||
 | 
			
		||||
Tips:
 | 
			
		||||
- We have released a series of models [here](https://huggingface.co/models?filter=mvp), including MVP, MVP with task-specific prompts, and multi-task pre-trained variants.
 | 
			
		||||
- If you want to use a model without prompts (standard Transformer), you can load it through `MvpForConditionalGeneration.from_pretrained('RUCAIBox/mvp')`.
 | 
			
		||||
- If you want to use a model with task-specific prompts, such as summarization, you can load it through `MvpForConditionalGeneration.from_pretrained('RUCAIBox/mvp-summarization')`.
 | 
			
		||||
- Our model supports lightweight prompt tuning following [Prefix-tuning](https://arxiv.org/abs/2101.00190) with method `set_lightweight_tuning()`.
 | 
			
		||||
 | 
			
		||||
This model was contributed by [Tianyi Tang](https://huggingface.co/StevenTang). The detailed information and instructions can be found [here](https://github.com/RUCAIBox/MVP).
 | 
			
		||||
 | 
			
		||||
## Examples
 | 
			
		||||
For summarization, it is an example to use MVP and MVP with summarization-specific prompts.
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
>>> from transformers import MvpTokenizer, MvpForConditionalGeneration
 | 
			
		||||
 | 
			
		||||
>>> tokenizer = MvpTokenizer.from_pretrained("RUCAIBox/mvp")
 | 
			
		||||
>>> model = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mvp")
 | 
			
		||||
>>> model_with_prompt = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mvp-summarization")
 | 
			
		||||
 | 
			
		||||
>>> inputs = tokenizer(
 | 
			
		||||
...     "Summarize: You may want to stick it to your boss and leave your job, but don't do it if these are your reasons.",
 | 
			
		||||
...     return_tensors="pt",
 | 
			
		||||
... )
 | 
			
		||||
>>> generated_ids = model.generate(**inputs)
 | 
			
		||||
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
 | 
			
		||||
["Why You Shouldn't Quit Your Job"]
 | 
			
		||||
 | 
			
		||||
>>> generated_ids = model_with_prompt.generate(**inputs)
 | 
			
		||||
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
 | 
			
		||||
["Don't do it if these are your reasons"]
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
For data-to-text generation, here is an example of using MVP and its multi-task pre-trained variant.
 | 
			
		||||
```python
 | 
			
		||||
>>> from transformers import MvpTokenizerFast, MvpForConditionalGeneration
 | 
			
		||||
 | 
			
		||||
>>> tokenizer = MvpTokenizerFast.from_pretrained("RUCAIBox/mvp")
 | 
			
		||||
>>> model = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mvp")
 | 
			
		||||
>>> model_with_mtl = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mtl-data-to-text")
 | 
			
		||||
 | 
			
		||||
>>> inputs = tokenizer(
 | 
			
		||||
...     "Describe the following data: Iron Man | instance of | Superhero [SEP] Stan Lee | creator | Iron Man",
 | 
			
		||||
...     return_tensors="pt",
 | 
			
		||||
... )
 | 
			
		||||
>>> generated_ids = model.generate(**inputs)
 | 
			
		||||
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
 | 
			
		||||
['Stan Lee created the character of Iron Man, a fictional superhero appearing in American comic']
 | 
			
		||||
 | 
			
		||||
>>> generated_ids = model_with_mtl.generate(**inputs)
 | 
			
		||||
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
 | 
			
		||||
['Iron Man is a fictional superhero appearing in American comic books published by Marvel Comics.']
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
For lightweight tuning, *i.e.*, fixing the model and only tuning prompts, you can load MVP with randomly initialized prompts or with task-specific prompts. Our code also supports Prefix-tuning with BART following the [original paper](https://arxiv.org/abs/2101.00190).
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
>>> from transformers import MvpForConditionalGeneration
 | 
			
		||||
 | 
			
		||||
>>> model = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mvp", use_prompt=True)
 | 
			
		||||
>>> # the number of trainable parameters (full tuning)
 | 
			
		||||
>>> sum(p.numel() for p in model.parameters() if p.requires_grad)
 | 
			
		||||
468116832
 | 
			
		||||
 | 
			
		||||
>>> # lightweight tuning with randomly initialized prompts
 | 
			
		||||
>>> model.set_lightweight_tuning()
 | 
			
		||||
>>> # the number of trainable parameters (lightweight tuning)
 | 
			
		||||
>>> sum(p.numel() for p in model.parameters() if p.requires_grad)
 | 
			
		||||
61823328
 | 
			
		||||
 | 
			
		||||
>>> # lightweight tuning with task-specific prompts
 | 
			
		||||
>>> model = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mtl-data-to-text")
 | 
			
		||||
>>> model.set_lightweight_tuning()
 | 
			
		||||
>>> # original lightweight Prefix-tuning
 | 
			
		||||
>>> model = MvpForConditionalGeneration.from_pretrained("facebook/bart-large", use_prompt=True)
 | 
			
		||||
>>> model.set_lightweight_tuning()
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## MvpConfig
 | 
			
		||||
 | 
			
		||||
[[autodoc]] MvpConfig
 | 
			
		||||
 | 
			
		||||
## MvpTokenizer
 | 
			
		||||
 | 
			
		||||
[[autodoc]] MvpTokenizer
 | 
			
		||||
 | 
			
		||||
## MvpTokenizerFast
 | 
			
		||||
 | 
			
		||||
[[autodoc]] MvpTokenizerFast
 | 
			
		||||
 | 
			
		||||
## MvpModel
 | 
			
		||||
 | 
			
		||||
[[autodoc]] MvpModel
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
## MvpForConditionalGeneration
 | 
			
		||||
 | 
			
		||||
[[autodoc]] MvpForConditionalGeneration
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
## MvpForSequenceClassification
 | 
			
		||||
 | 
			
		||||
[[autodoc]] MvpForSequenceClassification
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
## MvpForQuestionAnswering
 | 
			
		||||
 | 
			
		||||
[[autodoc]] MvpForQuestionAnswering
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
## MvpForCausalLM
 | 
			
		||||
 | 
			
		||||
[[autodoc]] MvpForCausalLM
 | 
			
		||||
    - forward
 | 
			
		||||
@ -1,76 +0,0 @@
 | 
			
		||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
 | 
			
		||||
 | 
			
		||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 | 
			
		||||
the License. You may obtain a copy of the License at
 | 
			
		||||
 | 
			
		||||
http://www.apache.org/licenses/LICENSE-2.0
 | 
			
		||||
 | 
			
		||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 | 
			
		||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 | 
			
		||||
specific language governing permissions and limitations under the License.
 | 
			
		||||
-->
 | 
			
		||||
 | 
			
		||||
# Nezha
 | 
			
		||||
 | 
			
		||||
## Overview
 | 
			
		||||
 | 
			
		||||
The Nezha model was proposed in [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei et al.
 | 
			
		||||
 | 
			
		||||
The abstract from the paper is the following:
 | 
			
		||||
 | 
			
		||||
*The pre-trained language models have achieved great successes in various natural language understanding (NLU) tasks
 | 
			
		||||
due to its capacity to capture the deep contextualized information in text by pre-training on large-scale corpora.
 | 
			
		||||
In this technical report, we present our practice of pre-training language models named NEZHA (NEural contextualiZed
 | 
			
		||||
representation for CHinese lAnguage understanding) on Chinese corpora and finetuning for the Chinese NLU tasks. 
 | 
			
		||||
The current version of NEZHA is based on BERT with a collection of proven improvements, which include Functional 
 | 
			
		||||
Relative Positional Encoding as an effective positional encoding scheme, Whole Word Masking strategy,
 | 
			
		||||
Mixed Precision Training and the LAMB Optimizer in training the models. The experimental results show that NEZHA
 | 
			
		||||
achieves the state-of-the-art performances when finetuned on several representative Chinese tasks, including
 | 
			
		||||
named entity recognition (People's Daily NER), sentence matching (LCQMC), Chinese sentiment classification (ChnSenti)
 | 
			
		||||
and natural language inference (XNLI).*
 | 
			
		||||
 | 
			
		||||
This model was contributed by [sijunhe](https://huggingface.co/sijunhe). The original code can be found [here](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/NEZHA-PyTorch).
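A minimal masked language modeling sketch; the `sijunhe/nezha-cn-base` checkpoint name is an assumption, so substitute any available Nezha checkpoint from the hub.

```python
import torch
from transformers import AutoTokenizer, NezhaForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("sijunhe/nezha-cn-base")  # assumed checkpoint name
model = NezhaForMaskedLM.from_pretrained("sijunhe/nezha-cn-base")

# "I love Beijing Tiananmen" with one character masked
inputs = tokenizer("我爱北京[MASK]安门。", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# predict the masked character
mask_positions = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
predicted_ids = logits[0, mask_positions].argmax(-1)
print(tokenizer.decode(predicted_ids))
```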
 | 
			
		||||
 | 
			
		||||
## NezhaConfig
 | 
			
		||||
 | 
			
		||||
[[autodoc]] NezhaConfig
 | 
			
		||||
 | 
			
		||||
## NezhaModel
 | 
			
		||||
 | 
			
		||||
[[autodoc]] NezhaModel
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
## NezhaForPreTraining
 | 
			
		||||
 | 
			
		||||
[[autodoc]] NezhaForPreTraining
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
## NezhaForMaskedLM
 | 
			
		||||
 | 
			
		||||
[[autodoc]] NezhaForMaskedLM
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
## NezhaForNextSentencePrediction
 | 
			
		||||
 | 
			
		||||
[[autodoc]] NezhaForNextSentencePrediction
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
## NezhaForSequenceClassification
 | 
			
		||||
 | 
			
		||||
[[autodoc]] NezhaForSequenceClassification
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
## NezhaForMultipleChoice
 | 
			
		||||
 | 
			
		||||
[[autodoc]] NezhaForMultipleChoice
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
## NezhaForTokenClassification
 | 
			
		||||
 | 
			
		||||
[[autodoc]] NezhaForTokenClassification
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
## NezhaForQuestionAnswering
 | 
			
		||||
 | 
			
		||||
[[autodoc]] NezhaForQuestionAnswering
 | 
			
		||||
    - forward
 | 
			
		||||
@ -1,98 +0,0 @@
 | 
			
		||||
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
 | 
			
		||||
 | 
			
		||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 | 
			
		||||
the License. You may obtain a copy of the License at
 | 
			
		||||
 | 
			
		||||
http://www.apache.org/licenses/LICENSE-2.0
 | 
			
		||||
 | 
			
		||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 | 
			
		||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 | 
			
		||||
specific language governing permissions and limitations under the License.
 | 
			
		||||
-->
 | 
			
		||||
 | 
			
		||||
# NLLB
 | 
			
		||||
 | 
			
		||||
**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=bug&template=bug-report.yml) and assign
 | 
			
		||||
@LysandreJik
 | 
			
		||||
 | 
			
		||||
## Overview of NLLB
 | 
			
		||||
 | 
			
		||||
The NLLB model was presented in [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by Marta R. Costa-jussà, James Cross, Onur Çelebi,
 | 
			
		||||
Maha Elbayad, Kenneth Heafield, Kevin Heffernan, Elahe Kalbassi, Janice Lam, Daniel Licht, Jean Maillard, Anna Sun, Skyler Wang, Guillaume Wenzek, Al Youngblood, Bapi Akula,
 | 
			
		||||
Loic Barrault, Gabriel Mejia Gonzalez, Prangthip Hansanti, John Hoffman, Semarley Jarrett, Kaushik Ram Sadagopan, Dirk Rowe, Shannon Spruit, Chau Tran, Pierre Andrews,
 | 
			
		||||
Necip Fazil Ayan, Shruti Bhosale, Sergey Edunov, Angela Fan, Cynthia Gao, Vedanuj Goswami, Francisco Guzmán, Philipp Koehn, Alexandre Mourachko, Christophe Ropers,
 | 
			
		||||
Safiyyah Saleem, Holger Schwenk, and Jeff Wang.
 | 
			
		||||
 | 
			
		||||
The abstract of the paper is the following:
 | 
			
		||||
 | 
			
		||||
*Driven by the goal of eradicating language barriers on a global scale, machine translation has solidified itself as a key focus of artificial intelligence research today.
 | 
			
		||||
However, such efforts have coalesced around a small subset of languages, leaving behind the vast majority of mostly low-resource languages. What does it take to break the
 | 
			
		||||
200 language barrier while ensuring safe, high quality results, all while keeping ethical considerations in mind? In No Language Left Behind, we took on this challenge by
 | 
			
		||||
first contextualizing the need for low-resource language translation support through exploratory interviews with native speakers. Then, we created datasets and models aimed
 | 
			
		||||
at narrowing the performance gap between low and high-resource languages. More specifically, we developed a conditional compute model based on Sparsely Gated Mixture of
 | 
			
		||||
Experts that is trained on data obtained with novel and effective data mining techniques tailored for low-resource languages. We propose multiple architectural and training
 | 
			
		||||
improvements to counteract overfitting while training on thousands of tasks. Critically, we evaluated the performance of over 40,000 different translation directions using
 | 
			
		||||
a human-translated benchmark, Flores-200, and combined human evaluation with a novel toxicity benchmark covering all languages in Flores-200 to assess translation safety.
 | 
			
		||||
Our model achieves an improvement of 44% BLEU relative to the previous state-of-the-art, laying important groundwork towards realizing a universal translation system.*
 | 
			
		||||
 | 
			
		||||
This implementation contains the dense models available on release. Let us know via a GitHub issue if you would like to see the MoE models as well.
 | 
			
		||||
 | 
			
		||||
This model was contributed by [Lysandre](https://huggingface.co/lysandre). The authors' code can be found [here](https://github.com/facebookresearch/fairseq/tree/nllb).
 | 
			
		||||
 | 
			
		||||
## Generating with NLLB
 | 
			
		||||
 | 
			
		||||
While generating the target text set the `forced_bos_token_id` to the target language id. The following
 | 
			
		||||
example shows how to translate English to French using the *facebook/nllb-200-distilled-600M* model.
 | 
			
		||||
 | 
			
		||||
Note that we're using the BCP-47 code for French `fra_Latn`. See [here](https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200)
 | 
			
		||||
for the list of all BCP-47 codes in the Flores 200 dataset.
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
>>> from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 | 
			
		||||
 | 
			
		||||
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
 | 
			
		||||
>>> model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
 | 
			
		||||
 | 
			
		||||
>>> article = "UN Chief says there is no military solution in Syria"
 | 
			
		||||
>>> inputs = tokenizer(article, return_tensors="pt")
 | 
			
		||||
 | 
			
		||||
>>> translated_tokens = model.generate(
 | 
			
		||||
...     **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["fra_Latn"], max_length=30
 | 
			
		||||
... )
 | 
			
		||||
>>> tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
 | 
			
		||||
Le chef de l'ONU dit qu'il n'y a pas de solution militaire en Syrie
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
### Generating from any other language than English
 | 
			
		||||
 | 
			
		||||
English (`eng_Latn`) is set as the default language from which to translate. In order to specify that you'd like to translate from a different language,
 | 
			
		||||
you should specify the BCP-47 code in the `src_lang` keyword argument of the tokenizer initialization.
 | 
			
		||||
 | 
			
		||||
See the example below for a translation from Romanian to German:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
>>> from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 | 
			
		||||
 | 
			
		||||
>>> tokenizer = AutoTokenizer.from_pretrained(
 | 
			
		||||
...     "facebook/nllb-200-distilled-600M", use_auth_token=True, src_lang="ron_Latn"
 | 
			
		||||
... )
 | 
			
		||||
>>> model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M", use_auth_token=True)
 | 
			
		||||
 | 
			
		||||
>>> article = "Şeful ONU spune că nu există o soluţie militară în Siria"
 | 
			
		||||
>>> inputs = tokenizer(article, return_tensors="pt")
 | 
			
		||||
 | 
			
		||||
>>> translated_tokens = model.generate(
 | 
			
		||||
...     **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["deu_Latn"], max_length=30
 | 
			
		||||
... )
 | 
			
		||||
>>> tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
 | 
			
		||||
UN-Chef sagt, es gibt keine militärische Lösung in Syrien
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## NllbTokenizer
 | 
			
		||||
 | 
			
		||||
[[autodoc]] NllbTokenizer
 | 
			
		||||
    - build_inputs_with_special_tokens
 | 
			
		||||
 | 
			
		||||
## NllbTokenizerFast
 | 
			
		||||
 | 
			
		||||
[[autodoc]] NllbTokenizerFast
 | 
			
		||||
@ -54,11 +54,6 @@ The original code can be found [here](https://github.com/facebookresearch/metase
 | 
			
		||||
[[autodoc]] TFOPTForCausalLM
 | 
			
		||||
    - call
 | 
			
		||||
 | 
			
		||||
## OPTForSequenceClassification
 | 
			
		||||
 | 
			
		||||
[[autodoc]] OPTForSequenceClassification
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
## FlaxOPTModel
 | 
			
		||||
 | 
			
		||||
[[autodoc]] FlaxOPTModel
 | 
			
		||||
 | 
			
		||||
@ -1,108 +0,0 @@
 | 
			
		||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
 | 
			
		||||
 | 
			
		||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 | 
			
		||||
the License. You may obtain a copy of the License at
 | 
			
		||||
 | 
			
		||||
http://www.apache.org/licenses/LICENSE-2.0
 | 
			
		||||
 | 
			
		||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 | 
			
		||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 | 
			
		||||
specific language governing permissions and limitations under the License.
 | 
			
		||||
-->
 | 
			
		||||
 | 
			
		||||
# OWL-ViT
 | 
			
		||||
 | 
			
		||||
## Overview
 | 
			
		||||
 | 
			
		||||
The OWL-ViT (short for Vision Transformer for Open-World Localization) was proposed in [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. OWL-ViT is an open-vocabulary object detection network trained on a variety of (image, text) pairs. It can be used to query an image with one or multiple text queries to search for and detect target objects described in text.
 | 
			
		||||
 | 
			
		||||
The abstract from the paper is the following:
 | 
			
		||||
 | 
			
		||||
*Combining simple architectures with large-scale pre-training has led to massive improvements in image classification. For object detection, pre-training and scaling approaches are less well established, especially in the long-tailed and open-vocabulary setting, where training data is relatively scarce. In this paper, we propose a strong recipe for transferring image-text models to open-vocabulary object detection. We use a standard Vision Transformer architecture with minimal modifications, contrastive image-text pre-training, and end-to-end detection fine-tuning. Our analysis of the scaling properties of this setup shows that increasing image-level pre-training and model size yield consistent improvements on the downstream detection task. We provide the adaptation strategies and regularizations needed to attain very strong performance on zero-shot text-conditioned and one-shot image-conditioned object detection. Code and models are available on GitHub.*
 | 
			
		||||
 | 
			
		||||
## Usage
 | 
			
		||||
 | 
			
		||||
OWL-ViT is a zero-shot text-conditioned object detection model. OWL-ViT uses [CLIP](clip) as its multi-modal backbone, with a ViT-like Transformer to get visual features and a causal language model to get the text features. To use CLIP for detection, OWL-ViT removes the final token pooling layer of the vision model and attaches a lightweight classification and box head to each transformer output token. Open-vocabulary classification is enabled by replacing the fixed classification layer weights with the class-name embeddings obtained from the text model. The authors first train CLIP from scratch and fine-tune it end-to-end with the classification and box heads on standard detection datasets using a bipartite matching loss. One or multiple text queries per image can be used to perform zero-shot text-conditioned object detection. 
 | 
			
		||||
 | 
			
		||||
[`OwlViTFeatureExtractor`] can be used to resize (or rescale) and normalize images for the model and [`CLIPTokenizer`] is used to encode the text. [`OwlViTProcessor`] wraps [`OwlViTFeatureExtractor`] and [`CLIPTokenizer`] into a single instance to both encode the text and prepare the images. The following example shows how to perform object detection using [`OwlViTProcessor`] and [`OwlViTForObjectDetection`].
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
>>> import requests
 | 
			
		||||
>>> from PIL import Image
 | 
			
		||||
>>> import torch
 | 
			
		||||
 | 
			
		||||
>>> from transformers import OwlViTProcessor, OwlViTForObjectDetection
 | 
			
		||||
 | 
			
		||||
>>> processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
 | 
			
		||||
>>> model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")
 | 
			
		||||
 | 
			
		||||
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 | 
			
		||||
>>> image = Image.open(requests.get(url, stream=True).raw)
 | 
			
		||||
>>> texts = [["a photo of a cat", "a photo of a dog"]]
 | 
			
		||||
>>> inputs = processor(text=texts, images=image, return_tensors="pt")
 | 
			
		||||
>>> outputs = model(**inputs)
 | 
			
		||||
 | 
			
		||||
>>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
 | 
			
		||||
>>> target_sizes = torch.Tensor([image.size[::-1]])
 | 
			
		||||
>>> # Convert outputs (bounding boxes and class logits) to COCO API
 | 
			
		||||
>>> results = processor.post_process(outputs=outputs, target_sizes=target_sizes)
 | 
			
		||||
 | 
			
		||||
>>> i = 0  # Retrieve predictions for the first image for the corresponding text queries
 | 
			
		||||
>>> text = texts[i]
 | 
			
		||||
>>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
 | 
			
		||||
 | 
			
		||||
>>> score_threshold = 0.1
 | 
			
		||||
>>> for box, score, label in zip(boxes, scores, labels):
 | 
			
		||||
...     box = [round(i, 2) for i in box.tolist()]
 | 
			
		||||
...     if score >= score_threshold:
 | 
			
		||||
...         print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
 | 
			
		||||
Detected a photo of a cat with confidence 0.243 at location [1.42, 50.69, 308.58, 370.48]
 | 
			
		||||
Detected a photo of a cat with confidence 0.298 at location [348.06, 20.56, 642.33, 372.61]
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
This model was contributed by [adirik](https://huggingface.co/adirik). The original code can be found [here](https://github.com/google-research/scenic/tree/main/scenic/projects/owl_vit).
 | 
			
		||||
 | 
			
		||||
## OwlViTConfig
 | 
			
		||||
 | 
			
		||||
[[autodoc]] OwlViTConfig
 | 
			
		||||
    - from_text_vision_configs
 | 
			
		||||
 | 
			
		||||
## OwlViTTextConfig
 | 
			
		||||
 | 
			
		||||
[[autodoc]] OwlViTTextConfig
 | 
			
		||||
 | 
			
		||||
## OwlViTVisionConfig
 | 
			
		||||
 | 
			
		||||
[[autodoc]] OwlViTVisionConfig
 | 
			
		||||
 | 
			
		||||
## OwlViTFeatureExtractor
 | 
			
		||||
 | 
			
		||||
[[autodoc]] OwlViTFeatureExtractor
 | 
			
		||||
    - __call__
 | 
			
		||||
 | 
			
		||||
## OwlViTProcessor
 | 
			
		||||
 | 
			
		||||
[[autodoc]] OwlViTProcessor
 | 
			
		||||
 | 
			
		||||
## OwlViTModel
 | 
			
		||||
 | 
			
		||||
[[autodoc]] OwlViTModel
 | 
			
		||||
    - forward
 | 
			
		||||
    - get_text_features
 | 
			
		||||
    - get_image_features
 | 
			
		||||
 | 
			
		||||
## OwlViTTextModel
 | 
			
		||||
 | 
			
		||||
[[autodoc]] OwlViTTextModel
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
## OwlViTVisionModel
 | 
			
		||||
 | 
			
		||||
[[autodoc]] OwlViTVisionModel
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
## OwlViTForObjectDetection
 | 
			
		||||
 | 
			
		||||
[[autodoc]] OwlViTForObjectDetection
 | 
			
		||||
    - forward
 | 
			
		||||
@ -45,9 +45,8 @@ target text format is `[tgt_lang_code] X [eos]`. `bos` is never used.
 | 
			
		||||
 | 
			
		||||
However, for fine-tuning, no language token is provided in some cases where only a single language is used. Please refer to [the paper](https://arxiv.org/abs/2103.06333) to learn more about this.
 | 
			
		||||
 | 
			
		||||
In cases where the language code is needed, the regular [`~PLBartTokenizer.__call__`] will encode source text format 
 | 
			
		||||
when you pass texts as the first argument or with the keyword argument `text`, and will encode target text format if
 | 
			
		||||
it's passed with the `text_target` keyword argument.
 | 
			
		||||
In cases where the language code is needed, the regular [`~PLBartTokenizer.__call__`] will encode source text format, and it should be wrapped
 | 
			
		||||
inside the context manager [`~PLBartTokenizer.as_target_tokenizer`] to encode target text format.
 | 
			
		||||
 | 
			
		||||
- Supervised training
 | 
			
		||||
 | 
			
		||||
@ -57,7 +56,11 @@ it's passed with the `text_target` keyword argument.
 | 
			
		||||
>>> tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-base", src_lang="en_XX", tgt_lang="python")
 | 
			
		||||
>>> example_python_phrase = "def maximum(a,b,c):NEW_LINE_INDENTreturn max([a,b,c])"
 | 
			
		||||
>>> expected_translation_english = "Returns the maximum value of a b c."
 | 
			
		||||
>>> inputs = tokenizer(example_python_phrase, text_target=expected_translation_english, return_tensors="pt")
 | 
			
		||||
>>> inputs = tokenizer(example_python_phrase, return_tensors="pt")
 | 
			
		||||
>>> with tokenizer.as_target_tokenizer():
 | 
			
		||||
...     labels = tokenizer(expected_translation_english, return_tensors="pt")
 | 
			
		||||
>>> inputs["labels"] = labels["input_ids"]
 | 
			
		||||
>>> # forward pass
 | 
			
		||||
>>> model(**inputs)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
@ -85,6 +88,7 @@ it's passed with the `text_target` keyword argument.
 | 
			
		||||
## PLBartTokenizer
 | 
			
		||||
 | 
			
		||||
[[autodoc]] PLBartTokenizer
 | 
			
		||||
    - as_target_tokenizer
 | 
			
		||||
    - build_inputs_with_special_tokens
 | 
			
		||||
 | 
			
		||||
## PLBartModel
 | 
			
		||||
 | 
			
		||||
@ -27,8 +27,7 @@ Tips:
 | 
			
		||||
- One can use [`AutoFeatureExtractor`] to prepare images for the model.
 | 
			
		||||
- The huge 10B model from [Self-supervised Pretraining of Visual Features in the Wild](https://arxiv.org/abs/2103.01988), trained on one billion Instagram images, is available on the [hub](https://huggingface.co/facebook/regnet-y-10b-seer)
 | 
			
		||||
 | 
			
		||||
This model was contributed by [Francesco](https://huggingface.co/Francesco). The TensorFlow version of the model
 | 
			
		||||
was contributed by [sayakpaul](https://huggingface.co/sayakpaul) and [ariG23498](https://huggingface.co/ariG23498).
 | 
			
		||||
This model was contributed by [Francesco](https://huggingface.co/Francesco).
 | 
			
		||||
The original code can be found [here](https://github.com/facebookresearch/pycls).
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -46,15 +45,4 @@ The original code can be found [here](https://github.com/facebookresearch/pycls)
 | 
			
		||||
## RegNetForImageClassification
 | 
			
		||||
 | 
			
		||||
[[autodoc]] RegNetForImageClassification
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
## TFRegNetModel
 | 
			
		||||
 | 
			
		||||
[[autodoc]] TFRegNetModel
 | 
			
		||||
    - call
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## TFRegNetForImageClassification
 | 
			
		||||
 | 
			
		||||
[[autodoc]] TFRegNetForImageClassification
 | 
			
		||||
    - call
 | 
			
		||||
    - forward
 | 
			
		||||
@ -31,7 +31,7 @@ The figure below illustrates the architecture of ResNet. Taken from the [origina
 | 
			
		||||
 | 
			
		||||
<img width="600" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/resnet_architecture.png"/>
 | 
			
		||||
 | 
			
		||||
This model was contributed by [Francesco](https://huggingface.co/Francesco). The TensorFlow version of this model was added by [amyeroberts](https://huggingface.co/amyeroberts). The original code can be found [here](https://github.com/KaimingHe/deep-residual-networks).
 | 
			
		||||
This model was contributed by [Francesco](https://huggingface.co/Francesco). The original code can be found [here](https://github.com/KaimingHe/deep-residual-networks).
 | 
			
		||||
 | 
			
		||||
## ResNetConfig
 | 
			
		||||
 | 
			
		||||
@ -47,16 +47,4 @@ This model was contributed by [Francesco](https://huggingface.co/Francesco). The
 | 
			
		||||
## ResNetForImageClassification
 | 
			
		||||
 | 
			
		||||
[[autodoc]] ResNetForImageClassification
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## TFResNetModel
 | 
			
		||||
 | 
			
		||||
[[autodoc]] TFResNetModel
 | 
			
		||||
    - call
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## TFResNetForImageClassification
 | 
			
		||||
 | 
			
		||||
[[autodoc]] TFResNetForImageClassification
 | 
			
		||||
    - call
 | 
			
		||||
    - forward
 | 
			
		||||
@ -36,14 +36,13 @@ The figure below illustrates the architecture of SegFormer. Taken from the [orig
 | 
			
		||||
 | 
			
		||||
<img width="600" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/segformer_architecture.png"/>
 | 
			
		||||
 | 
			
		||||
This model was contributed by [nielsr](https://huggingface.co/nielsr). The TensorFlow version 
 | 
			
		||||
of the model was contributed by [sayakpaul](https://huggingface.co/sayakpaul). The original code can be found [here](https://github.com/NVlabs/SegFormer).
 | 
			
		||||
This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/NVlabs/SegFormer).
 | 
			
		||||
 | 
			
		||||
Tips:
 | 
			
		||||
 | 
			
		||||
- SegFormer consists of a hierarchical Transformer encoder, and a lightweight all-MLP decoder head.
 | 
			
		||||
- SegFormer consists of a hierarchical Transformer encoder, and a lightweight all-MLP decode head.
 | 
			
		||||
  [`SegformerModel`] is the hierarchical Transformer encoder (which in the paper is also referred to
 | 
			
		||||
  as Mix Transformer or MiT). [`SegformerForSemanticSegmentation`] adds the all-MLP decoder head on
 | 
			
		||||
  as Mix Transformer or MiT). [`SegformerForSemanticSegmentation`] adds the all-MLP decode head on
 | 
			
		||||
  top to perform semantic segmentation of images. In addition, there's
 | 
			
		||||
  [`SegformerForImageClassification`] which can be used to - you guessed it - classify images. The
 | 
			
		||||
  authors of SegFormer first pre-trained the Transformer encoder on ImageNet-1k to classify images. Next, they throw
 | 
			
		||||
@ -52,9 +51,6 @@ Tips:
 | 
			
		||||
  found on the [hub](https://huggingface.co/models?other=segformer).
 | 
			
		||||
- The quickest way to get started with SegFormer is by checking the [example notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/SegFormer) (which showcase both inference and
 | 
			
		||||
  fine-tuning on custom data). One can also check out the [blog post](https://huggingface.co/blog/fine-tune-segformer) introducing SegFormer and illustrating how it can be fine-tuned on custom data.
 | 
			
		||||
- TensorFlow users should refer to [this repository](https://github.com/deep-diver/segformer-tf-transformers) that shows off-the-shelf inference and fine-tuning.
 | 
			
		||||
- One can also check out [this interactive demo on Hugging Face Spaces](https://huggingface.co/spaces/chansung/segformer-tf-transformers)
 | 
			
		||||
  to try out a SegFormer model on custom images.
 | 
			
		||||
- SegFormer works on any input size, as it pads the input to be divisible by `config.patch_sizes`. 
 | 
			
		||||
- One can use [`SegformerFeatureExtractor`] to prepare images and corresponding segmentation maps
 | 
			
		||||
  for the model. Note that this feature extractor is fairly basic and does not include all data augmentations used in
 | 
			
		||||
@ -69,8 +65,7 @@ Tips:
 | 
			
		||||
  used by [`SegformerForSemanticSegmentation`]). However, other datasets use the 0 index as
 | 
			
		||||
  background class and include this class as part of all labels. In that case, `reduce_labels` should be set to
 | 
			
		||||
  `False`, as loss should also be computed for the background class.
 | 
			
		||||
- As most models, SegFormer comes in different sizes, the details of which can be found in the table below
 | 
			
		||||
  (taken from Table 7 of the [original paper](https://arxiv.org/abs/2105.15203)).
 | 
			
		||||
- As most models, SegFormer comes in different sizes, the details of which can be found in the table below.
 | 
			
		||||
 | 
			
		||||
| **Model variant** | **Depths**    | **Hidden sizes**    | **Decoder hidden size** | **Params (M)** | **ImageNet-1k Top 1** |
 | 
			
		||||
| :---------------: | ------------- | ------------------- | :---------------------: | :------------: | :-------------------: |
 | 
			
		||||
@ -81,10 +76,6 @@ Tips:
 | 
			
		||||
| MiT-b4            | [3, 8, 27, 3] | [64, 128, 320, 512] | 768                     | 62.6           | 83.6                  |
 | 
			
		||||
| MiT-b5            | [3, 6, 40, 3] | [64, 128, 320, 512] | 768                     | 82.0           | 83.8                  |
 | 
			
		||||
 | 
			
		||||
Note that MiT in the above table refers to the Mix Transformer encoder backbone introduced in SegFormer. For
 | 
			
		||||
SegFormer's results on the segmentation datasets like ADE20k, refer to the [paper](https://arxiv.org/abs/2105.15203).
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## SegformerConfig
 | 
			
		||||
 | 
			
		||||
[[autodoc]] SegformerConfig
 | 
			
		||||
@ -113,23 +104,3 @@ SegFormer's results on the segmentation datasets like ADE20k, refer to the [pape
 | 
			
		||||
 | 
			
		||||
[[autodoc]] SegformerForSemanticSegmentation
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
## TFSegformerDecodeHead
 | 
			
		||||
 | 
			
		||||
[[autodoc]] TFSegformerDecodeHead
 | 
			
		||||
    - call
 | 
			
		||||
 | 
			
		||||
## TFSegformerModel
 | 
			
		||||
 | 
			
		||||
[[autodoc]] TFSegformerModel
 | 
			
		||||
    - call 
 | 
			
		||||
 | 
			
		||||
## TFSegformerForImageClassification
 | 
			
		||||
 | 
			
		||||
[[autodoc]] TFSegformerForImageClassification
 | 
			
		||||
    - call 
 | 
			
		||||
 | 
			
		||||
## TFSegformerForSemanticSegmentation
 | 
			
		||||
 | 
			
		||||
[[autodoc]] TFSegformerForSemanticSegmentation
 | 
			
		||||
    - call 
 | 
			
		||||
 | 
			
		||||
@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.
 | 
			
		||||
 | 
			
		||||
# Speech Encoder Decoder Models
 | 
			
		||||
 | 
			
		||||
The [`SpeechEncoderDecoderModel`] can be used to initialize a speech-to-text model
 | 
			
		||||
The [`SpeechEncoderDecoderModel`] can be used to initialize a speech-sequence-to-text-sequence model
 | 
			
		||||
with any pretrained speech autoencoding model as the encoder (*e.g.* [Wav2Vec2](wav2vec2), [Hubert](hubert)) and any pretrained autoregressive model as the decoder.
 | 
			
		||||
 | 
			
		||||
The effectiveness of initializing speech-sequence-to-text-sequence models with pretrained checkpoints for speech
 | 
			
		||||
@ -20,96 +20,9 @@ recognition and speech translation has *e.g.* been shown in [Large-Scale Self- a
 | 
			
		||||
Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli,
 | 
			
		||||
Alexis Conneau.
 | 
			
		||||
 | 
			
		||||
An example of how to use a [`SpeechEncoderDecoderModel`] for inference can be seen in [Speech2Text2](speech_to_text_2).
 | 
			
		||||
An example of how to use a [`SpeechEncoderDecoderModel`] for inference can be seen in
 | 
			
		||||
[Speech2Text2](speech_to_text_2).
 | 
			
		||||
 | 
			
		||||
## Randomly initializing `SpeechEncoderDecoderModel` from model configurations.
 | 
			
		||||
 | 
			
		||||
[`SpeechEncoderDecoderModel`] can be randomly initialized from an encoder and a decoder config. In the following example, we show how to do this using the default [`Wav2Vec2Model`] configuration for the encoder
 | 
			
		||||
and the default [`BertForCausalLM`] configuration for the decoder.
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
>>> from transformers import BertConfig, Wav2Vec2Config, SpeechEncoderDecoderConfig, SpeechEncoderDecoderModel
 | 
			
		||||
 | 
			
		||||
>>> config_encoder = Wav2Vec2Config()
 | 
			
		||||
>>> config_decoder = BertConfig()
 | 
			
		||||
 | 
			
		||||
>>> config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
 | 
			
		||||
>>> model = SpeechEncoderDecoderModel(config=config)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## Initializing `SpeechEncoderDecoderModel` from a pretrained encoder and a pretrained decoder.
 | 
			
		||||
 | 
			
		||||
[`SpeechEncoderDecoderModel`] can be initialized from a pretrained encoder checkpoint and a pretrained decoder checkpoint. Note that any pretrained Transformer-based speech model, *e.g.* [Wav2Vec2](wav2vec2) or [Hubert](hubert), can serve as the encoder, while pretrained auto-encoding models (*e.g.* BERT), pretrained causal language models (*e.g.* GPT2), and the pretrained decoder part of sequence-to-sequence models (*e.g.* the decoder of BART) can all be used as the decoder.
 | 
			
		||||
Depending on which architecture you choose as the decoder, the cross-attention layers might be randomly initialized.
 | 
			
		||||
Initializing [`SpeechEncoderDecoderModel`] from a pretrained encoder and decoder checkpoint requires the model to be fine-tuned on a downstream task, as has been shown in [the *Warm-starting-encoder-decoder blog post*](https://huggingface.co/blog/warm-starting-encoder-decoder).
 | 
			
		||||
To do so, the `SpeechEncoderDecoderModel` class provides a [`SpeechEncoderDecoderModel.from_encoder_decoder_pretrained`] method.
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
>>> from transformers import SpeechEncoderDecoderModel
 | 
			
		||||
 | 
			
		||||
>>> model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
 | 
			
		||||
...     "facebook/hubert-large-ll60k", "bert-base-uncased"
 | 
			
		||||
... )
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## Loading an existing `SpeechEncoderDecoderModel` checkpoint and performing inference.
 | 
			
		||||
 | 
			
		||||
To load fine-tuned checkpoints of the `SpeechEncoderDecoderModel` class, [`SpeechEncoderDecoderModel`] provides the `from_pretrained(...)` method just like any other model architecture in Transformers.
 | 
			
		||||
 | 
			
		||||
To perform inference, one uses the [`generate`] method, which allows one to autoregressively generate text. This method supports various forms of decoding, such as greedy, beam search and multinomial sampling.
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
>>> from transformers import Wav2Vec2Processor, SpeechEncoderDecoderModel
 | 
			
		||||
>>> from datasets import load_dataset
 | 
			
		||||
>>> import torch
 | 
			
		||||
 | 
			
		||||
>>> # load a fine-tuned speech translation model and corresponding processor
 | 
			
		||||
>>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15")
 | 
			
		||||
>>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15")
 | 
			
		||||
 | 
			
		||||
>>> # let's perform inference on a piece of English speech (which we'll translate to German)
 | 
			
		||||
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
 | 
			
		||||
>>> input_values = processor(ds[0]["audio"]["array"], return_tensors="pt").input_values
 | 
			
		||||
 | 
			
		||||
>>> # autoregressively generate transcription (uses greedy decoding by default)
 | 
			
		||||
>>> generated_ids = model.generate(input_values)
 | 
			
		||||
>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 | 
			
		||||
>>> print(generated_text)
 | 
			
		||||
Mr. Quilter ist der Apostel der Mittelschicht und wir freuen uns, sein Evangelium willkommen heißen zu können.
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## Training
 | 
			
		||||
 | 
			
		||||
Once the model is created, it can be fine-tuned similar to BART, T5 or any other encoder-decoder model on a dataset of (speech, text) pairs.
 | 
			
		||||
As you can see, only 2 inputs are required for the model in order to compute a loss: `input_values` (which are the
 | 
			
		||||
speech inputs) and `labels` (which are the `input_ids` of the encoded target sequence).
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
>>> from transformers import AutoTokenizer, AutoFeatureExtractor, SpeechEncoderDecoderModel
 | 
			
		||||
>>> from datasets import load_dataset
 | 
			
		||||
 | 
			
		||||
>>> encoder_id = "facebook/wav2vec2-base-960h"  # acoustic model encoder
 | 
			
		||||
>>> decoder_id = "bert-base-uncased"  # text decoder
 | 
			
		||||
 | 
			
		||||
>>> feature_extractor = AutoFeatureExtractor.from_pretrained(encoder_id)
 | 
			
		||||
>>> tokenizer = AutoTokenizer.from_pretrained(decoder_id)
 | 
			
		||||
>>> # Combine pre-trained encoder and pre-trained decoder to form a Seq2Seq model
 | 
			
		||||
>>> model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(encoder_id, decoder_id)
 | 
			
		||||
 | 
			
		||||
>>> model.config.decoder_start_token_id = tokenizer.cls_token_id
 | 
			
		||||
>>> model.config.pad_token_id = tokenizer.pad_token_id
 | 
			
		||||
 | 
			
		||||
>>> # load an audio input and pre-process (normalise mean/std to 0/1)
 | 
			
		||||
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
 | 
			
		||||
>>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values
 | 
			
		||||
 | 
			
		||||
>>> # load its corresponding transcription and tokenize to generate labels
 | 
			
		||||
>>> labels = tokenizer(ds[0]["text"], return_tensors="pt").input_ids
 | 
			
		||||
 | 
			
		||||
>>> # the forward function automatically creates the correct decoder_input_ids
 | 
			
		||||
>>> loss = model(input_values=input_values, labels=labels).loss
 | 
			
		||||
>>> loss.backward()
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## SpeechEncoderDecoderConfig
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -37,7 +37,7 @@ predicted token ids.
 | 
			
		||||
 | 
			
		||||
The feature extractor depends on `torchaudio` and the tokenizer depends on `sentencepiece` so be sure to
 | 
			
		||||
install those packages before running the examples. You could either install those as extra speech dependencies with
 | 
			
		||||
`pip install transformers"[speech, sentencepiece]"` or install the packages separately with `pip install torchaudio sentencepiece`. Also `torchaudio` requires the development version of the [libsndfile](http://www.mega-nerd.com/libsndfile/) package which can be installed via a system package manager. On Ubuntu it can
 | 
			
		||||
be installed as follows: `apt install libsndfile1-dev`
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -120,6 +120,7 @@ See the [model hub](https://huggingface.co/models?filter=speech_to_text) to look
 | 
			
		||||
    - save_pretrained
 | 
			
		||||
    - batch_decode
 | 
			
		||||
    - decode
 | 
			
		||||
    - as_target_processor
 | 
			
		||||
 | 
			
		||||
## Speech2TextModel
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -114,6 +114,7 @@ See [model hub](https://huggingface.co/models?filter=speech2text2) to look for S
 | 
			
		||||
    - save_pretrained
 | 
			
		||||
    - batch_decode
 | 
			
		||||
    - decode
 | 
			
		||||
    - as_target_processor
 | 
			
		||||
 | 
			
		||||
## Speech2Text2ForCausalLM
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -1,47 +0,0 @@
 | 
			
		||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
 | 
			
		||||
 | 
			
		||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 | 
			
		||||
the License. You may obtain a copy of the License at
 | 
			
		||||
 | 
			
		||||
http://www.apache.org/licenses/LICENSE-2.0
 | 
			
		||||
 | 
			
		||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 | 
			
		||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 | 
			
		||||
specific language governing permissions and limitations under the License.
 | 
			
		||||
-->
 | 
			
		||||
 | 
			
		||||
# Swin Transformer V2
 | 
			
		||||
 | 
			
		||||
## Overview
 | 
			
		||||
 | 
			
		||||
The Swin Transformer V2 model was proposed in [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
 | 
			
		||||
 | 
			
		||||
The abstract from the paper is the following:
 | 
			
		||||
 | 
			
		||||
*Large-scale NLP models have been shown to significantly improve the performance on language tasks with no signs of saturation. They also demonstrate amazing few-shot capabilities like that of human beings. This paper aims to explore large-scale models in computer vision. We tackle three major issues in training and application of large vision models, including training instability, resolution gaps between pre-training and fine-tuning, and hunger on labelled data. Three main techniques are proposed: 1) a residual-post-norm method combined with cosine attention to improve training stability; 2) A log-spaced continuous position bias method to effectively transfer models pre-trained using low-resolution images to downstream tasks with high-resolution inputs; 3) A self-supervised pre-training method, SimMIM, to reduce the needs of vast labeled images. Through these techniques, this paper successfully trained a 3 billion-parameter Swin Transformer V2 model, which is the largest dense vision model to date, and makes it capable of training with images of up to 1,536×1,536 resolution. It set new performance records on 4 representative vision tasks, including ImageNet-V2 image classification, COCO object detection, ADE20K semantic segmentation, and Kinetics-400 video action classification. Also note our training is much more efficient than that in Google's billion-level visual models, which consumes 40 times less labelled data and 40 times less training time.*
 | 
			
		||||
 | 
			
		||||
Tips:
 | 
			
		||||
- One can use the [`AutoFeatureExtractor`] API to prepare images for the model, as in the sketch below.
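A minimal sketch of this workflow for image classification (the checkpoint name below is only an example; pick any Swin V2 checkpoint from the [model hub](https://huggingface.co/models)):

```python
>>> from PIL import Image
>>> import requests

>>> from transformers import AutoFeatureExtractor, Swinv2ForImageClassification

>>> # example checkpoint name - substitute any available Swin V2 classification checkpoint
>>> checkpoint = "microsoft/swinv2-tiny-patch4-window8-256"
>>> feature_extractor = AutoFeatureExtractor.from_pretrained(checkpoint)
>>> model = Swinv2ForImageClassification.from_pretrained(checkpoint)

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = feature_extractor(images=image, return_tensors="pt")
>>> logits = model(**inputs).logits
>>> predicted_class = logits.argmax(-1).item()
>>> print(model.config.id2label[predicted_class])
```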
 | 
			
		||||
 | 
			
		||||
This model was contributed by [nandwalritik](https://huggingface.co/nandwalritik).
 | 
			
		||||
The original code can be found [here](https://github.com/microsoft/Swin-Transformer).
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## Swinv2Config
 | 
			
		||||
 | 
			
		||||
[[autodoc]] Swinv2Config
 | 
			
		||||
 | 
			
		||||
## Swinv2Model
 | 
			
		||||
 | 
			
		||||
[[autodoc]] Swinv2Model
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
## Swinv2ForMaskedImageModeling
 | 
			
		||||
 | 
			
		||||
[[autodoc]] Swinv2ForMaskedImageModeling
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
## Swinv2ForImageClassification
 | 
			
		||||
 | 
			
		||||
[[autodoc]] Swinv2ForImageClassification
 | 
			
		||||
    - forward
 | 
			
		||||
@ -371,8 +371,3 @@ T5 is supported by several example scripts, both for pre-training and fine-tunin
 | 
			
		||||
    - __call__
 | 
			
		||||
    - encode
 | 
			
		||||
    - decode
 | 
			
		||||
 | 
			
		||||
## FlaxT5EncoderModel
 | 
			
		||||
 | 
			
		||||
[[autodoc]] FlaxT5EncoderModel
 | 
			
		||||
    - __call__
 | 
			
		||||
 | 
			
		||||
@ -94,6 +94,7 @@ See the [model hub](https://huggingface.co/models?filter=trocr) to look for TrOC
 | 
			
		||||
    - save_pretrained
 | 
			
		||||
    - batch_decode
 | 
			
		||||
    - decode
 | 
			
		||||
    - as_target_processor
 | 
			
		||||
 | 
			
		||||
## TrOCRForCausalLM
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -1,31 +0,0 @@
 | 
			
		||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
 | 
			
		||||
 | 
			
		||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 | 
			
		||||
the License. You may obtain a copy of the License at
 | 
			
		||||
 | 
			
		||||
http://www.apache.org/licenses/LICENSE-2.0
 | 
			
		||||
 | 
			
		||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 | 
			
		||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 | 
			
		||||
specific language governing permissions and limitations under the License.
 | 
			
		||||
-->
 | 
			
		||||
 | 
			
		||||
# UL2
 | 
			
		||||
 | 
			
		||||
## Overview
 | 
			
		||||
 | 
			
		||||
The UL2 model was presented in [Unifying Language Learning Paradigms](https://arxiv.org/pdf/2205.05131v1.pdf) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler.
 | 
			
		||||
 | 
			
		||||
The abstract from the paper is the following:
 | 
			
		||||
 | 
			
		||||
*Existing pre-trained models are generally geared towards a particular class of problems. To date, there seems to be still no consensus on what the right architecture and pre-training setup should be. This paper presents a unified framework for pre-training models that are universally effective across datasets and setups. We begin by disentangling architectural archetypes with pre-training objectives -- two concepts that are commonly conflated. Next, we present a generalized and unified perspective for self-supervision in NLP and show how different pre-training objectives can be cast as one another and how interpolating between different objectives can be effective. We then propose Mixture-of-Denoisers (MoD), a pre-training objective that combines diverse pre-training paradigms together. We furthermore introduce a notion of mode switching, wherein downstream fine-tuning is associated with specific pre-training schemes. We conduct extensive ablative experiments to compare multiple pre-training objectives and find that our method pushes the Pareto-frontier by outperforming T5 and/or GPT-like models across multiple diverse setups. Finally, by scaling our model up to 20B parameters, we achieve SOTA performance on 50 well-established supervised NLP tasks ranging from language generation (with automated and human evaluation), language understanding, text classification, question answering, commonsense reasoning, long text reasoning, structured knowledge grounding and information retrieval. Our model also achieve strong results at in-context learning, outperforming 175B GPT-3 on zero-shot SuperGLUE and tripling the performance of T5-XXL on one-shot summarization.*
 | 
			
		||||
 | 
			
		||||
Tips:
 | 
			
		||||
 | 
			
		||||
- UL2 is an encoder-decoder model pre-trained on a mixture of denoising functions as well as fine-tuned on an array of downstream tasks.
 | 
			
		||||
- UL2 has the same architecture as [T5v1.1](t5v1.1) but uses the Gated-SiLU activation function instead of Gated-GELU.
 | 
			
		||||
- The authors released checkpoints of one architecture, which can be seen [here](https://huggingface.co/google/ul2); a usage sketch is shown below.
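A minimal usage sketch, assuming the released checkpoint loads with the T5 classes (which its T5v1.1-style architecture suggests); note that the model is very large (roughly 20B parameters):

```python
>>> from transformers import AutoTokenizer, T5ForConditionalGeneration

>>> tokenizer = AutoTokenizer.from_pretrained("google/ul2")
>>> model = T5ForConditionalGeneration.from_pretrained("google/ul2")  # ~20B parameters

>>> inputs = tokenizer("Translate English to German: The house is wonderful.", return_tensors="pt")
>>> outputs = model.generate(**inputs, max_length=40)
>>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```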
 | 
			
		||||
 | 
			
		||||
The original code can be found [here](https://github.com/google-research/google-research/tree/master/ul2).
 | 
			
		||||
 | 
			
		||||
This model was contributed by [DanielHesslow](https://huggingface.co/Seledorn).
 | 
			
		||||
@ -87,8 +87,3 @@ This model was contributed by [nielsr](https://huggingface.co/nielsr). The origi
 | 
			
		||||
 | 
			
		||||
[[autodoc]] ViltForImageAndTextRetrieval
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
## ViltForTokenClassification
 | 
			
		||||
 | 
			
		||||
[[autodoc]] ViltForTokenClassification
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
@ -12,136 +12,16 @@ specific language governing permissions and limitations under the License.
 | 
			
		||||
 | 
			
		||||
# Vision Encoder Decoder Models
 | 
			
		||||
 | 
			
		||||
## Overview
 | 
			
		||||
 | 
			
		||||
The [`VisionEncoderDecoderModel`] can be used to initialize an image-to-text-sequence model with any
pretrained Transformer-based vision autoencoding model as the encoder (*e.g.* [ViT](vit), [BEiT](beit), [DeiT](deit), [Swin](swin))
 | 
			
		||||
and any pretrained language model as the decoder (*e.g.* [RoBERTa](roberta), [GPT2](gpt2), [BERT](bert), [DistilBERT](distilbert)).
 | 
			
		||||
 | 
			
		||||
The effectiveness of initializing image-to-text-sequence models with pretrained checkpoints has been shown in (for
 | 
			
		||||
example) [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang,
 | 
			
		||||
Zhoujun Li, Furu Wei.
 | 
			
		||||
 | 
			
		||||
After such a [`VisionEncoderDecoderModel`] has been trained/fine-tuned, it can be saved/loaded just like any other models (see the examples below
 | 
			
		||||
for more information).
 | 
			
		||||
An example of how to use a [`VisionEncoderDecoderModel`] for inference can be seen in [TrOCR](trocr).
 | 
			
		||||
 | 
			
		||||
An example application is image captioning, in which the encoder is used to encode the image, after which an autoregressive language model generates
 | 
			
		||||
the caption. Another example is optical character recognition. Refer to [TrOCR](trocr), which is an instance of [`VisionEncoderDecoderModel`].
 | 
			
		||||
 | 
			
		||||
## Randomly initializing `VisionEncoderDecoderModel` from model configurations.
 | 
			
		||||
 | 
			
		||||
[`VisionEncoderDecoderModel`] can be randomly initialized from an encoder and a decoder config. In the following example, we show how to do this using the default [`ViTModel`] configuration for the encoder
 | 
			
		||||
and the default [`BertForCausalLM`] configuration for the decoder.
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
>>> from transformers import BertConfig, ViTConfig, VisionEncoderDecoderConfig, VisionEncoderDecoderModel
 | 
			
		||||
 | 
			
		||||
>>> config_encoder = ViTConfig()
 | 
			
		||||
>>> config_decoder = BertConfig()
 | 
			
		||||
 | 
			
		||||
>>> config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
 | 
			
		||||
>>> model = VisionEncoderDecoderModel(config=config)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## Initializing `VisionEncoderDecoderModel` from a pretrained encoder and a pretrained decoder.
 | 
			
		||||
 | 
			
		||||
[`VisionEncoderDecoderModel`] can be initialized from a pretrained encoder checkpoint and a pretrained decoder checkpoint. Note that any pretrained Transformer-based vision model, *e.g.* [Swin](swin), can serve as the encoder. For the decoder, you can use pretrained auto-encoding models, *e.g.* BERT, pretrained causal language models, *e.g.* GPT2, as well as the pretrained decoder part of sequence-to-sequence models, *e.g.* the decoder of BART.
 | 
			
		||||
Depending on which architecture you choose as the decoder, the cross-attention layers might be randomly initialized.
 | 
			
		||||
Initializing [`VisionEncoderDecoderModel`] from a pretrained encoder and decoder checkpoint requires the model to be fine-tuned on a downstream task, as has been shown in [the *Warm-starting-encoder-decoder blog post*](https://huggingface.co/blog/warm-starting-encoder-decoder).
 | 
			
		||||
To do so, the `VisionEncoderDecoderModel` class provides a [`VisionEncoderDecoderModel.from_encoder_decoder_pretrained`] method.
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
>>> from transformers import VisionEncoderDecoderModel
 | 
			
		||||
 | 
			
		||||
>>> model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
 | 
			
		||||
...     "microsoft/swin-base-patch4-window7-224-in22k", "bert-base-uncased"
 | 
			
		||||
... )
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## Loading an existing `VisionEncoderDecoderModel` checkpoint and performing inference.
 | 
			
		||||
 | 
			
		||||
To load fine-tuned checkpoints of the `VisionEncoderDecoderModel` class, [`VisionEncoderDecoderModel`] provides the `from_pretrained(...)` method just like any other model architecture in Transformers.
 | 
			
		||||
 | 
			
		||||
To perform inference, one uses the [`generate`] method, which allows you to autoregressively generate text. This method supports various forms of decoding, such as greedy decoding, beam search and multinomial sampling.
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
>>> import requests
 | 
			
		||||
>>> from PIL import Image
 | 
			
		||||
 | 
			
		||||
>>> from transformers import GPT2TokenizerFast, ViTFeatureExtractor, VisionEncoderDecoderModel
 | 
			
		||||
 | 
			
		||||
>>> # load a fine-tuned image captioning model and corresponding tokenizer and feature extractor
 | 
			
		||||
>>> model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
 | 
			
		||||
>>> tokenizer = GPT2TokenizerFast.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
 | 
			
		||||
>>> feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
 | 
			
		||||
 | 
			
		||||
>>> # let's perform inference on an image
 | 
			
		||||
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 | 
			
		||||
>>> image = Image.open(requests.get(url, stream=True).raw)
 | 
			
		||||
>>> pixel_values = feature_extractor(image, return_tensors="pt").pixel_values
 | 
			
		||||
 | 
			
		||||
>>> # autoregressively generate caption (uses greedy decoding by default)
 | 
			
		||||
>>> generated_ids = model.generate(pixel_values)
 | 
			
		||||
>>> generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
 | 
			
		||||
>>> print(generated_text)
 | 
			
		||||
a cat laying on a blanket next to a cat laying on a bed
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## Loading a PyTorch checkpoint into `TFVisionEncoderDecoderModel`.
 | 
			
		||||
 | 
			
		||||
[`TFVisionEncoderDecoderModel.from_pretrained`] currently doesn't support initializing the model from a
 | 
			
		||||
PyTorch checkpoint. Passing `from_pt=True` to this method will throw an exception. If there are only PyTorch
 | 
			
		||||
checkpoints for a particular vision encoder-decoder model, a workaround is:
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
>>> from transformers import VisionEncoderDecoderModel, TFVisionEncoderDecoderModel
 | 
			
		||||
 | 
			
		||||
>>> _model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
 | 
			
		||||
 | 
			
		||||
>>> _model.encoder.save_pretrained("./encoder")
 | 
			
		||||
>>> _model.decoder.save_pretrained("./decoder")
 | 
			
		||||
 | 
			
		||||
>>> model = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
 | 
			
		||||
...     "./encoder", "./decoder", encoder_from_pt=True, decoder_from_pt=True
 | 
			
		||||
... )
 | 
			
		||||
>>> # This is only for copying some specific attributes of this particular model.
 | 
			
		||||
>>> model.config = _model.config
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## Training
 | 
			
		||||
 | 
			
		||||
Once the model is created, it can be fine-tuned similarly to BART, T5 or any other encoder-decoder model on a dataset of (image, text) pairs.
 | 
			
		||||
As you can see, only 2 inputs are required for the model in order to compute a loss: `pixel_values` (which are the
 | 
			
		||||
images) and `labels` (which are the `input_ids` of the encoded target sequence).
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
>>> from transformers import ViTFeatureExtractor, BertTokenizer, VisionEncoderDecoderModel
 | 
			
		||||
>>> from datasets import load_dataset
 | 
			
		||||
 | 
			
		||||
>>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
 | 
			
		||||
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
 | 
			
		||||
>>> model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
 | 
			
		||||
...     "google/vit-base-patch16-224-in21k", "bert-base-uncased"
 | 
			
		||||
... )
 | 
			
		||||
 | 
			
		||||
>>> model.config.decoder_start_token_id = tokenizer.cls_token_id
 | 
			
		||||
>>> model.config.pad_token_id = tokenizer.pad_token_id
 | 
			
		||||
 | 
			
		||||
>>> dataset = load_dataset("huggingface/cats-image")
 | 
			
		||||
>>> image = dataset["test"]["image"][0]
 | 
			
		||||
>>> pixel_values = feature_extractor(image, return_tensors="pt").pixel_values
 | 
			
		||||
 | 
			
		||||
>>> labels = tokenizer(
 | 
			
		||||
...     "an image of two cats chilling on a couch",
 | 
			
		||||
...     return_tensors="pt",
 | 
			
		||||
... ).input_ids
 | 
			
		||||
 | 
			
		||||
>>> # the forward function automatically creates the correct decoder_input_ids
 | 
			
		||||
>>> loss = model(pixel_values=pixel_values, labels=labels).loss
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
This model was contributed by [nielsr](https://github.com/nielsrogge). This model's TensorFlow and Flax versions
 | 
			
		||||
were contributed by [ydshieh](https://github.com/ydshieh).
 | 
			
		||||
 | 
			
		||||
## VisionEncoderDecoderConfig
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -62,6 +62,7 @@ This model was contributed by [patrickvonplaten](https://huggingface.co/patrickv
 | 
			
		||||
    - save_pretrained
 | 
			
		||||
    - batch_decode
 | 
			
		||||
    - decode
 | 
			
		||||
    - as_target_processor
 | 
			
		||||
 | 
			
		||||
## Wav2Vec2ProcessorWithLM
 | 
			
		||||
 | 
			
		||||
@ -72,6 +73,7 @@ This model was contributed by [patrickvonplaten](https://huggingface.co/patrickv
 | 
			
		||||
    - save_pretrained
 | 
			
		||||
    - batch_decode
 | 
			
		||||
    - decode
 | 
			
		||||
    - as_target_processor
 | 
			
		||||
 | 
			
		||||
## Wav2Vec2 specific outputs
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -179,10 +179,10 @@ This creates a repository under your username with the model name `my-awesome-mo
 | 
			
		||||
>>> model = AutoModel.from_pretrained("your_username/my-awesome-model")
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
If you belong to an organization and want to push your model under the organization name instead, just add it to the `repo_id`:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
>>> pt_model.push_to_hub("my-awesome-org/my-awesome-model")
 | 
			
		||||
>>> pt_model.push_to_hub("my-awesome-model", organization="my-awesome-org")
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
The `push_to_hub` function can also be used to add other files to a model repository. For example, add a tokenizer to a model repository:
 | 
			
		||||
 | 
			
		||||
@ -1,14 +0,0 @@
 | 
			
		||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
 | 
			
		||||
 | 
			
		||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 | 
			
		||||
the License. You may obtain a copy of the License at
 | 
			
		||||
 | 
			
		||||
http://www.apache.org/licenses/LICENSE-2.0
 | 
			
		||||
 | 
			
		||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 | 
			
		||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 | 
			
		||||
-->
 | 
			
		||||
 | 
			
		||||
# Efficient Inference on Multiple GPUs

This document will be completed soon with information on how to infer on multiple GPUs. In the meantime you can check out [the guide for training on a single GPU](perf_train_gpu_one) and [the guide for inference on CPUs](perf_infer_cpu).
 | 
			
		||||
@ -1,14 +0,0 @@
 | 
			
		||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
 | 
			
		||||
 | 
			
		||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 | 
			
		||||
the License. You may obtain a copy of the License at
 | 
			
		||||
 | 
			
		||||
http://www.apache.org/licenses/LICENSE-2.0
 | 
			
		||||
 | 
			
		||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 | 
			
		||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 | 
			
		||||
-->
 | 
			
		||||
 | 
			
		||||
# Efficient Inference on a Single GPU
 | 
			
		||||
 | 
			
		||||
This document will be completed soon with information on how to infer on a single GPU. In the meantime you can check out [the guide for training on a single GPU](perf_train_gpu_one) and [the guide for inference on CPUs](perf_infer_cpu).
 | 
			
		||||
@ -1,14 +0,0 @@
 | 
			
		||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
 | 
			
		||||
 | 
			
		||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 | 
			
		||||
the License. You may obtain a copy of the License at
 | 
			
		||||
 | 
			
		||||
http://www.apache.org/licenses/LICENSE-2.0
 | 
			
		||||
 | 
			
		||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 | 
			
		||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 | 
			
		||||
-->
 | 
			
		||||
 | 
			
		||||
# Inference on Specialized Hardware
 | 
			
		||||
 | 
			
		||||
This document will be completed soon with information on how to infer on specialized hardware. In the meantime you can check out [the guide for inference on CPUs](perf_infer_cpu).
 | 
			
		||||
@ -1,92 +0,0 @@
 | 
			
		||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
 | 
			
		||||
 | 
			
		||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 | 
			
		||||
the License. You may obtain a copy of the License at
 | 
			
		||||
 | 
			
		||||
http://www.apache.org/licenses/LICENSE-2.0
 | 
			
		||||
 | 
			
		||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 | 
			
		||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 | 
			
		||||
-->
 | 
			
		||||
 | 
			
		||||
# Efficient Training on Multiple CPUs
 | 
			
		||||
 | 
			
		||||
When training on a single CPU is too slow, we can use multiple CPUs. This guide focuses on PyTorch-based DDP, which enables distributed CPU training efficiently.
 | 
			
		||||
 | 
			
		||||
## Intel® oneCCL Bindings for PyTorch
 | 
			
		||||
 | 
			
		||||
[Intel® oneCCL](https://github.com/oneapi-src/oneCCL) (collective communications library) is a library for efficient distributed deep learning training, implementing collectives such as allreduce, allgather and alltoall. For more information on oneCCL, please refer to the [oneCCL documentation](https://spec.oneapi.com/versions/latest/elements/oneCCL/source/index.html) and [oneCCL specification](https://spec.oneapi.com/versions/latest/elements/oneCCL/source/index.html).

The `oneccl_bindings_for_pytorch` module (`torch_ccl` before version 1.12) implements the PyTorch C10D ProcessGroup API and can be dynamically loaded as an external ProcessGroup. It currently only works on Linux.

For more detailed information, refer to [oneccl_bind_pt](https://github.com/intel/torch-ccl).
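A minimal sketch of how the bindings are typically used: importing the module registers the `ccl` backend with `torch.distributed`, which can then be initialized as usual (rank and world size typically come from the launcher's environment variables):

```python
import torch.distributed as dist

# importing the bindings registers the "ccl" backend (the module was named torch_ccl before 1.12)
import oneccl_bindings_for_pytorch  # noqa: F401

dist.init_process_group(backend="ccl")
```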
 | 
			
		||||
 | 
			
		||||
### Intel® oneCCL Bindings for PyTorch installation:
 | 
			
		||||
 | 
			
		||||
Wheel files are available for the following Python versions:
 | 
			
		||||
 | 
			
		||||
| Extension Version | Python 3.6 | Python 3.7 | Python 3.8 | Python 3.9 | Python 3.10 |
 | 
			
		||||
| :---------------: | :--------: | :--------: | :--------: | :--------: | :---------: |
 | 
			
		||||
| 1.12.0            |            | √          | √          | √          | √           |
 | 
			
		||||
| 1.11.0            |            | √          | √          | √          | √           |
 | 
			
		||||
| 1.10.0            | √          | √          | √          | √          |             |
 | 
			
		||||
 | 
			
		||||
```
 | 
			
		||||
pip install oneccl_bind_pt=={pytorch_version} -f https://software.intel.com/ipex-whl-stable
 | 
			
		||||
```
 | 
			
		||||
where `{pytorch_version}` should be your PyTorch version, for instance 1.12.0.
 | 
			
		||||
See [oneccl_bind_pt installation](https://github.com/intel/torch-ccl) for more installation approaches.
 | 
			
		||||
 | 
			
		||||
### Usage in Trainer
 | 
			
		||||
To enable multi-CPU distributed training in the Trainer with the ccl backend, users should add **`--xpu_backend ccl`** to the command arguments.
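If you are configuring [`TrainingArguments`] in code rather than on the command line, a sketch of the equivalent setting (assuming the `xpu_backend` field mirrors the `--xpu_backend` CLI flag):

```python
from transformers import TrainingArguments

# assumes the `xpu_backend` field mirrors the --xpu_backend CLI flag
args = TrainingArguments(output_dir="/tmp/debug_squad/", xpu_backend="ccl", no_cuda=True)
```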
 | 
			
		||||
 | 
			
		||||
Let's see an example with the [question-answering example](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
The following command enables training with 2 processes on one Xeon node, with one process running per socket. The variables OMP_NUM_THREADS/CCL_WORKER_COUNT can be tuned for optimal performance.
 | 
			
		||||
```shell script
 | 
			
		||||
 export CCL_WORKER_COUNT=1
 | 
			
		||||
 export MASTER_ADDR=127.0.0.1
 | 
			
		||||
 mpirun -n 2 -genv OMP_NUM_THREADS=23 \
 | 
			
		||||
 python3 run_qa.py \
 | 
			
		||||
 --model_name_or_path bert-large-uncased \
 | 
			
		||||
 --dataset_name squad \
 | 
			
		||||
 --do_train \
 | 
			
		||||
 --do_eval \
 | 
			
		||||
 --per_device_train_batch_size 12  \
 | 
			
		||||
 --learning_rate 3e-5  \
 | 
			
		||||
 --num_train_epochs 2  \
 | 
			
		||||
 --max_seq_length 384 \
 | 
			
		||||
 --doc_stride 128  \
 | 
			
		||||
 --output_dir /tmp/debug_squad/ \
 | 
			
		||||
 --no_cuda \
 | 
			
		||||
 --xpu_backend ccl
 | 
			
		||||
```
 | 
			
		||||
The following command enables training with a total of four processes on two Xeons (node0 and node1, taking node0 as the main process), ppn (processes per node) is set to 2, with one process running per socket. The variables OMP_NUM_THREADS/CCL_WORKER_COUNT can be tuned for optimal performance.
 | 
			
		||||
 | 
			
		||||
In node0, you need to create a configuration file which contains the IP addresses of each node (for example hostfile) and pass that configuration file path as an argument.
 | 
			
		||||
```shell script
 | 
			
		||||
 cat hostfile
 | 
			
		||||
 xxx.xxx.xxx.xxx #node0 ip
 | 
			
		||||
 xxx.xxx.xxx.xxx #node1 ip
 | 
			
		||||
```
 | 
			
		||||
Now, run the following command in node0 and **4DDP** will be enabled in node0 and node1:
 | 
			
		||||
```shell script
 | 
			
		||||
 export CCL_WORKER_COUNT=1
 | 
			
		||||
 export MASTER_ADDR=xxx.xxx.xxx.xxx #node0 ip
 | 
			
		||||
 mpirun -f hostfile -n 4 -ppn 2 \
 | 
			
		||||
 -genv OMP_NUM_THREADS=23 \
 | 
			
		||||
 python3 run_qa.py \
 | 
			
		||||
 --model_name_or_path bert-large-uncased \
 | 
			
		||||
 --dataset_name squad \
 | 
			
		||||
 --do_train \
 | 
			
		||||
 --do_eval \
 | 
			
		||||
 --per_device_train_batch_size 12  \
 | 
			
		||||
 --learning_rate 3e-5  \
 | 
			
		||||
 --num_train_epochs 2  \
 | 
			
		||||
 --max_seq_length 384 \
 | 
			
		||||
 --doc_stride 128  \
 | 
			
		||||
 --output_dir /tmp/debug_squad/ \
 | 
			
		||||
 --no_cuda \
 | 
			
		||||
 --xpu_backend ccl
 | 
			
		||||
```
 | 
			
		||||
@ -13,12 +13,6 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 | 
			
		||||
 | 
			
		||||
When training on a single GPU is too slow or the model weights don't fit in a single GPU's memory we use a multi-GPU setup. Switching from a single GPU to multiple GPUs requires some form of parallelism as the work needs to be distributed. There are several techniques to achieve parallelism such as data, tensor, or pipeline parallelism. However, there is no one solution to fit them all and which settings work best depends on the hardware you are running on. While the main concepts most likely will apply to any other framework, this article is focused on PyTorch-based implementations.
 | 
			
		||||
 | 
			
		||||
<Tip>
 | 
			
		||||
 | 
			
		||||
 Note: Most of the strategies introduced in the [single GPU section](perf_train_gpu_one) (such as mixed precision training or gradient accumulation) are generic and apply to training models in general so make sure to have a look at it before diving into the following sections such as multi-GPU or CPU training.
 | 
			
		||||
 | 
			
		||||
</Tip>
 | 
			
		||||
 | 
			
		||||
We will first discuss in depth various 1D parallelism techniques and their pros and cons and then look at how they can be combined into 2D and 3D parallelism to enable an even faster training and to support even bigger models. Various other powerful alternative approaches will be presented.
 | 
			
		||||
 | 
			
		||||
## Concepts
 | 
			
		||||
 | 
			
		||||
@ -11,7 +11,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 | 
			
		||||
 | 
			
		||||
# Efficient Training on a Single GPU
 | 
			
		||||
 | 
			
		||||
This guide focuses on training large models efficiently on a single GPU. These approaches are still valid if you have access to a machine with multiple GPUs but you will also have access to additional methods outlined in the [multi-GPU section](perf_train_gpu_many).
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
In this section we have a look at a few tricks to reduce the memory footprint and speed up training for large models and how they are integrated in the [`Trainer`] and [🤗 Accelerate](https://huggingface.co/docs/accelerate/). Each method can improve speed or memory usage which is summarized in the table below:
 | 
			
		||||
 | 
			
		||||
@ -33,7 +33,7 @@ pip install transformers datasets accelerate nvidia-ml-py3
 | 
			
		||||
 | 
			
		||||
The `nvidia-ml-py3` library allows us to monitor the memory usage of the models from within Python. You might be familiar with the `nvidia-smi` command in the terminal - this library allows you to access the same information in Python directly, as in the sketch below.
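A minimal helper along these lines (using the `pynvml` module that `nvidia-ml-py3` provides):

```python
from pynvml import nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, nvmlInit


def print_gpu_utilization():
    # query the memory currently allocated on GPU 0
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used // 1024 ** 2} MB.")
```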
 | 
			
		||||
 | 
			
		||||
Then we create some dummy data. We create random token IDs between 100 and 30000 and binary labels for a classifier. In total we get 512 sequences each with length 512 and store them in a [`~datasets.Dataset`] with PyTorch format.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
@ -367,7 +367,7 @@ Samples/second: 10.09
 | 
			
		||||
GPU memory occupied: 7275 MB.
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
We can see that with these tweaks we use about half the GPU memory as at the beginning while also being slightly faster.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
### BF16
 | 
			
		||||
If you have access to an Ampere or newer GPU you can use bf16 for your training and evaluation. While bf16 has a worse precision than fp16, it has a much, much bigger dynamic range. Therefore, if in the past you were experiencing overflow issues while training the model, bf16 will prevent this from happening most of the time. Remember that in fp16 the biggest number you can have is `65504` and any number above that will overflow. A bf16 number can be as large as `3.39e+38` (!) which is about the same as fp32 - because both have 8-bits used for the numerical range.
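You can enable bf16 in the 🤗 Trainer with the `bf16` flag (a sketch, mirroring the `tf32` example further below):

```python
TrainingArguments(bf16=True)
```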
 | 
			
		||||
@ -394,13 +394,13 @@ Like all cases with reduced precision this may or may not be satisfactory for yo
 | 
			
		||||
 | 
			
		||||
If you're already using fp16 or bf16 mixed precision it may help with the throughput as well.
 | 
			
		||||
 | 
			
		||||
You can enable this mode in the 🤗 Trainer with:
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
TrainingArguments(tf32=True)
 | 
			
		||||
```
 | 
			
		||||
When `tf32` is not set explicitly, the PyTorch default behavior is used.
 | 
			
		||||
 | 
			
		||||
Note: tf32 mode is internal to CUDA and can't be accessed directly via `tensor.to(dtype=torch.tf32)` as `torch.tf32` doesn't exist.
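Outside of the Trainer, tf32 can instead be toggled globally through the PyTorch backend flags (a sketch; it only has an effect on Ampere or newer GPUs):

```python
import torch

# matmuls and cuDNN convolutions use TF32 tensor cores when these flags are on
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
```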
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Note: you need `torch>=1.7` to enjoy this feature.
 | 
			
		||||
 | 
			
		||||
@ -654,7 +654,7 @@ https://github.com/huggingface/transformers/blob/master/src/transformers/trainer
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## Choice of GPU
 | 
			
		||||
Sometimes, even when applying all the above tweaks the throughput on a given GPU might still not be good enough. One easy solution is to change the type of GPU. For example switching from let's say a K80 (which you typically get on Google Colab) to a fancier GPU such as the V100 or A100. Although they are more expensive they are usually more cost effective than cheaper GPUs due to their larger memory and faster architecture.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Now, let's take a step back and discuss what we should optimize for when scaling the training of large models.
 | 
			
		||||
 | 
			
		||||
@ -718,15 +718,3 @@ For some applications, such as pretraining large language models, applying all t
 | 
			
		||||
 | 
			
		||||
Another use case for training on many GPUs is if the model does not fit on a single GPU with all the mentioned tricks. There are still more methods we can apply although life starts to get a bit more complicated. This usually involves some form of pipeline or tensor parallelism where the model itself is distributed across several GPUs. One can also make use of DeepSpeed which implements some of these parallelism strategies along with some more optimization to reduce the memory footprint such as partitioning the optimizer states. You can read more about this in the ["Multi-GPU training" section](perf_train_gpu_many).
 | 
			
		||||
 | 
			
		||||
## Inference with torchdynamo
 | 
			
		||||
TorchDynamo is a new tracer that uses Python's frame evaluation API to automatically create FX traces from existing PyTorch programs. After capturing the FX graph, different backends can be deployed to lower the graph to an optimized engine. One option is to use [TensorRT](https://developer.nvidia.com/tensorrt) or NVFuser as the backend. You can choose one of the options below for a performance boost.
```python
TrainingArguments(torchdynamo="eager")      # enable eager mode. No performance boost
TrainingArguments(torchdynamo="nvfuser")    # enable nvfuser
TrainingArguments(torchdynamo="fx2trt")     # enable TensorRT fp32
TrainingArguments(torchdynamo="fx2trt-f16") # enable TensorRT fp16
```
 | 
			
		||||
This feature involves 3 different libraries. To install them, please follow the instructions below:  
 | 
			
		||||
- [Torchdynamo installation](https://github.com/pytorch/torchdynamo#requirements-and-setup)  
 | 
			
		||||
- [Functorch installation](https://github.com/pytorch/functorch#install)  
 | 
			
		||||
- [Torch-TensorRT(FX) installation](https://github.com/pytorch/TensorRT/blob/master/docsrc/tutorials/getting_started_with_fx_path.rst#installation)  
 | 
			
		||||
 | 
			
		||||
@ -1,20 +0,0 @@
 | 
			
		||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
 | 
			
		||||
 | 
			
		||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 | 
			
		||||
the License. You may obtain a copy of the License at
 | 
			
		||||
 | 
			
		||||
http://www.apache.org/licenses/LICENSE-2.0
 | 
			
		||||
 | 
			
		||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 | 
			
		||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 | 
			
		||||
-->
 | 
			
		||||
 | 
			
		||||
# Training on Specialized Hardware
 | 
			
		||||
 | 
			
		||||
<Tip>
 | 
			
		||||
 | 
			
		||||
 Note: Most of the strategies introduced in the [single GPU section](perf_train_gpu_one) (such as mixed precision training or gradient accumulation) and [multi-GPU section](perf_train_gpu_many) are generic and apply to training models in general so make sure to have a look at them before diving into this section.
 | 
			
		||||
 | 
			
		||||
</Tip>
 | 
			
		||||
 | 
			
		||||
This document will be completed soon with information on how to train on specialized hardware.
 | 
			
		||||
@ -1,20 +0,0 @@
 | 
			
		||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
 | 
			
		||||
 | 
			
		||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 | 
			
		||||
the License. You may obtain a copy of the License at
 | 
			
		||||
 | 
			
		||||
http://www.apache.org/licenses/LICENSE-2.0
 | 
			
		||||
 | 
			
		||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 | 
			
		||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 | 
			
		||||
-->
 | 
			
		||||
 | 
			
		||||
# Training on TPUs
 | 
			
		||||
 | 
			
		||||
<Tip>
 | 
			
		||||
 | 
			
		||||
 Note: Most of the strategies introduced in the [single GPU section](perf_train_gpu_one) (such as mixed precision training or gradient accumulation) and [multi-GPU section](perf_train_gpu_many) are generic and apply to training models in general so make sure to have a look at them before diving into this section.
 | 
			
		||||
 | 
			
		||||
</Tip>
 | 
			
		||||
 | 
			
		||||
This document will be completed soon with information on how to train on TPUs.
 | 
			
		||||
@ -24,13 +24,7 @@ This document serves as an overview and entry point for the methods that could b
 | 
			
		||||
 | 
			
		||||
## Training
 | 
			
		||||
 | 
			
		||||
Training transformer models efficiently requires an accelerator such as a GPU or TPU. The most common case is where you only have a single GPU, but there is also a section about multi-GPU and CPU training (with more coming soon).
 | 
			
		||||
 | 
			
		||||
<Tip>
 | 
			
		||||
 | 
			
		||||
 Note: Most of the strategies introduced in the single GPU section (such as mixed precision training or gradient accumulation) are generic and apply to training models in general so make sure to have a look at it before diving into the following sections such as multi-GPU or CPU training.
 | 
			
		||||
 | 
			
		||||
</Tip>
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
### Single GPU
 | 
			
		||||
 | 
			
		||||
@ -52,11 +46,11 @@ In some cases training on a single GPU is still too slow or won't fit the large
 | 
			
		||||
 | 
			
		||||
### TPU
 | 
			
		||||
 | 
			
		||||
[_Coming soon_](perf_train_tpu)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
### Specialized Hardware
 | 
			
		||||
 | 
			
		||||
[_Coming soon_](perf_train_special)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## Inference
 | 
			
		||||
 | 
			
		||||
@ -64,19 +58,19 @@ Efficient inference with large models in a production environment can be as chal
 | 
			
		||||
 | 
			
		||||
### CPU
 | 
			
		||||
 | 
			
		||||
[Go to CPU inference section](perf_infer_cpu)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
### Single GPU
 | 
			
		||||
 | 
			
		||||
[Go to single GPU inference section](perf_infer_gpu_one)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
### Multi-GPU
 | 
			
		||||
 | 
			
		||||
[Go to multi-GPU inference section](perf_infer_gpu_many)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
### Specialized Hardware
 | 
			
		||||
 | 
			
		||||
[_Coming soon_](perf_infer_special)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## Hardware
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -244,7 +244,7 @@ For example, the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) data
 | 
			
		||||
 'sampling_rate': 8000}
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
1. Use 🤗 Datasets' [`~datasets.Dataset.cast_column`] method to upsample the sampling rate to 16kHz:
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000))
 | 
			
		||||
@ -486,8 +486,10 @@ A processor combines a feature extractor and tokenizer. Load a processor with [`
 | 
			
		||||
>>> def prepare_dataset(example):
 | 
			
		||||
...     audio = example["audio"]
 | 
			
		||||
 | 
			
		||||
...     example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000))
 | 
			
		||||
...     example["input_values"] = processor(audio["array"], sampling_rate=16000)
 | 
			
		||||
 | 
			
		||||
...     with processor.as_target_processor():
 | 
			
		||||
...         example["labels"] = processor(example["text"]).input_ids
 | 
			
		||||
...     return example
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -389,42 +389,3 @@ One particularly cool 🤗 Transformers feature is the ability to save a model a
 | 
			
		||||
```
 | 
			
		||||
</tf>
 | 
			
		||||
</frameworkcontent>
 | 
			
		||||
 | 
			
		||||
## Custom model builds
 | 
			
		||||
 | 
			
		||||
You can modify the model's configuration class to change how a model is built. The configuration specifies a model's attributes, such as the number of hidden layers or attention heads. You start from scratch when you initialize a model from a custom configuration class. The model attributes are randomly initialized, and you'll need to train the model before you can use it to get meaningful results.
 | 
			
		||||
 | 
			
		||||
Start by importing [`AutoConfig`], and then load the pretrained model you want to modify. Within [`AutoConfig.from_pretrained`], you can specify the attribute you want to change, such as the number of attention heads:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
>>> from transformers import AutoConfig
 | 
			
		||||
 | 
			
		||||
>>> my_config = AutoConfig.from_pretrained("distilbert-base-uncased", n_heads=12)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
<frameworkcontent>
 | 
			
		||||
<pt>
 | 
			
		||||
Create a model from your custom configuration with [`AutoModel.from_config`]:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
>>> from transformers import AutoModel
 | 
			
		||||
 | 
			
		||||
>>> my_model = AutoModel.from_config(my_config)
 | 
			
		||||
```
 | 
			
		||||
</pt>
 | 
			
		||||
<tf>
 | 
			
		||||
Create a model from your custom configuration with [`TFAutoModel.from_config`]:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
>>> from transformers import TFAutoModel
 | 
			
		||||
 | 
			
		||||
>>> my_model = TFAutoModel.from_config(my_config)
 | 
			
		||||
```
 | 
			
		||||
</tf>
 | 
			
		||||
</frameworkcontent>
 | 
			
		||||
 | 
			
		||||
Take a look at the [Create a custom architecture](./create_a_model) guide for more information about building custom configurations.
 | 
			
		||||
 | 
			
		||||
## What's next?
 | 
			
		||||
 | 
			
		||||
Now that you've completed the 🤗 Transformers quick tour, check out our guides and learn how to do more specific things like writing a custom model, fine-tuning a model for a task, and how to train a model with a script. If you're interested in learning more about 🤗 Transformers core concepts, grab a cup of coffee and take a look at our Conceptual Guides!
 | 
			
		||||
@ -53,17 +53,12 @@ Ready-made configurations include the following architectures:
 | 
			
		||||
- BigBird-Pegasus
 | 
			
		||||
- Blenderbot
 | 
			
		||||
- BlenderbotSmall
 | 
			
		||||
- BLOOM
 | 
			
		||||
- CamemBERT
 | 
			
		||||
- CodeGen
 | 
			
		||||
- ConvBERT
 | 
			
		||||
- ConvNeXT
 | 
			
		||||
- Data2VecText
 | 
			
		||||
- Data2VecVision
 | 
			
		||||
- DeBERTa
 | 
			
		||||
- DeBERTa-v2
 | 
			
		||||
- DeiT
 | 
			
		||||
- DETR
 | 
			
		||||
- DistilBERT
 | 
			
		||||
- ELECTRA
 | 
			
		||||
- FlauBERT
 | 
			
		||||
@ -71,14 +66,11 @@ Ready-made configurations include the following architectures:
 | 
			
		||||
- GPT-J
 | 
			
		||||
- I-BERT
 | 
			
		||||
- LayoutLM
 | 
			
		||||
- LayoutLMv3
 | 
			
		||||
- LeViT
 | 
			
		||||
- LongT5
 | 
			
		||||
- M2M100
 | 
			
		||||
- Marian
 | 
			
		||||
- mBART
 | 
			
		||||
- MobileBERT
 | 
			
		||||
- MobileViT
 | 
			
		||||
- OpenAI GPT-2
 | 
			
		||||
- Perceiver
 | 
			
		||||
- PLBart
 | 
			
		||||
@ -91,7 +83,6 @@ Ready-made configurations include the following architectures:
 | 
			
		||||
- XLM
 | 
			
		||||
- XLM-RoBERTa
 | 
			
		||||
- XLM-RoBERTa-XL
 | 
			
		||||
- YOLOS
 | 
			
		||||
 | 
			
		||||
In the next two sections, we'll show you how to:
 | 
			
		||||
 | 
			
		||||
@ -682,4 +673,4 @@ torch.neuron.trace(model, [token_tensor, segments_tensors])
 | 
			
		||||
This change enables Neuron SDK to trace the model and optimize it to run in Inf1 instances.
 | 
			
		||||
 | 
			
		||||
To learn more about AWS Neuron SDK features, tools, example tutorials and latest updates,
 | 
			
		||||
please see the [AWS NeuronSDK documentation](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html).
 | 
			
		||||
 | 
			
		||||
@ -109,14 +109,15 @@ The preprocessing function needs to:
 | 
			
		||||
>>> def prepare_dataset(batch):
 | 
			
		||||
...     audio = batch["audio"]
 | 
			
		||||
 | 
			
		||||
...     batch = processor(audio=audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
 | 
			
		||||
...     batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
 | 
			
		||||
...     batch["input_length"] = len(batch["input_values"])
 | 
			
		||||
 | 
			
		||||
...     batch["labels"] = processor(text=batch["transcription"]).input_ids
 | 
			
		||||
...     with processor.as_target_processor():
 | 
			
		||||
...         batch["labels"] = processor(batch["transcription"]).input_ids
 | 
			
		||||
...     return batch
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Use 🤗 Datasets [`~datasets.Dataset.map`] function to apply the preprocessing function over the entire dataset. You can speed up the map function by increasing the number of processes with `num_proc`. Remove the columns you don't need:
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
>>> encoded_minds = minds.map(prepare_dataset, remove_columns=minds.column_names["train"], num_proc=4)
 | 
			
		||||
@ -145,9 +146,17 @@ Unlike other data collators, this specific data collator needs to apply a differ
 | 
			
		||||
...         input_features = [{"input_values": feature["input_values"]} for feature in features]
 | 
			
		||||
...         label_features = [{"input_ids": feature["labels"]} for feature in features]
 | 
			
		||||
 | 
			
		||||
...         batch = self.processor.pad(input_features, padding=self.padding, return_tensors="pt")
 | 
			
		||||
 | 
			
		||||
...         labels_batch = self.processor.pad(labels=label_features, padding=self.padding, return_tensors="pt")
 | 
			
		||||
...         batch = self.processor.pad(
 | 
			
		||||
...             input_features,
 | 
			
		||||
...             padding=self.padding,
 | 
			
		||||
...             return_tensors="pt",
 | 
			
		||||
...         )
 | 
			
		||||
...         with self.processor.as_target_processor():
 | 
			
		||||
...             labels_batch = self.processor.pad(
 | 
			
		||||
...                 label_features,
 | 
			
		||||
...                 padding=self.padding,
 | 
			
		||||
...                 return_tensors="pt",
 | 
			
		||||
...             )
 | 
			
		||||
 | 
			
		||||
...         # replace padding with -100 to ignore loss correctly
 | 
			
		||||
...         labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
 | 
			
		||||
 | 
			
		||||
@ -129,7 +129,7 @@ The preprocessing function needs to:
 | 
			
		||||
...     return inputs
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Use 🤗 Datasets [`~datasets.Dataset.map`] function to apply the preprocessing function over the entire dataset. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once. Remove the columns you don't need, and rename `intent_class` to `label` because that is what the model expects:
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
>>> encoded_minds = minds.map(preprocess_function, remove_columns="audio", batched=True)
 | 
			
		||||
 | 
			
		||||
@ -95,7 +95,7 @@ Create a preprocessing function that will apply the transforms and return the `p
 | 
			
		||||
...     return examples
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Use 🤗 Dataset's [`~datasets.Dataset.with_transform`] method to apply the transforms over the entire dataset. The transforms are applied on-the-fly when you load an element of the dataset:
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
>>> food = food.with_transform(transforms)
 | 
			
		||||
 | 
			
		||||
@ -118,7 +118,7 @@ Here is how you can create a preprocessing function to convert the list to a str
 | 
			
		||||
...     return tokenizer([" ".join(x) for x in examples["answers.text"]], truncation=True)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Use 🤗 Datasets [`~datasets.Dataset.map`] function to apply the preprocessing function over the entire dataset. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once and increasing the number of processes with `num_proc`. Remove the columns you don't need:
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
>>> tokenized_eli5 = eli5.map(
 | 
			
		||||
@ -141,7 +141,6 @@ Now you need a second preprocessing function to capture text truncated from any
 | 
			
		||||
>>> def group_texts(examples):
 | 
			
		||||
...     concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
 | 
			
		||||
...     total_length = len(concatenated_examples[list(examples.keys())[0]])
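...     # drop the remainder so that every chunk built below is exactly block_size tokens long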
 | 
			
		||||
...     total_length = (total_length // block_size) * block_size
 | 
			
		||||
...     result = {
 | 
			
		||||
...         k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
 | 
			
		||||
...         for k, t in concatenated_examples.items()
 | 
			
		||||
@ -245,7 +244,7 @@ At this point, only three steps remain:
 | 
			
		||||
```
 | 
			
		||||
</pt>
 | 
			
		||||
<tf>
 | 
			
		||||
To fine-tune a model in TensorFlow, start by converting your datasets to the `tf.data.Dataset` format with [`~datasets.Dataset.to_tf_dataset`]. Specify inputs and labels in `columns`, whether to shuffle the dataset order, batch size, and the data collator:
 | 
			
		||||
To fine-tune a model in TensorFlow, start by converting your datasets to the `tf.data.Dataset` format with [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset). Specify inputs and labels in `columns`, whether to shuffle the dataset order, batch size, and the data collator:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
>>> tf_train_set = lm_dataset["train"].to_tf_dataset(
 | 
			
		||||
@ -352,7 +351,7 @@ At this point, only three steps remain:
 | 
			
		||||
```
 | 
			
		||||
</pt>
 | 
			
		||||
<tf>
 | 
			
		||||
To fine-tune a model in TensorFlow, start by converting your datasets to the `tf.data.Dataset` format with [`~datasets.Dataset.to_tf_dataset`]. Specify inputs and labels in `columns`, whether to shuffle the dataset order, batch size, and the data collator:
 | 
			
		||||
To fine-tune a model in TensorFlow, start by converting your datasets to the `tf.data.Dataset` format with [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset). Specify inputs and labels in `columns`, whether to shuffle the dataset order, batch size, and the data collator:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
>>> tf_train_set = lm_dataset["train"].to_tf_dataset(
 | 
			
		||||
 | 
			
		||||
@ -79,7 +79,7 @@ The preprocessing function needs to do:
 | 
			
		||||
...     return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Use 🤗 Datasets [`~datasets.Dataset.map`] function to apply the preprocessing function over the entire dataset. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once:
 | 
			
		||||
Use 🤗 Datasets [`map`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map) function to apply the preprocessing function over the entire dataset. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
tokenized_swag = swag.map(preprocess_function, batched=True)
 | 
			
		||||
@ -224,7 +224,7 @@ At this point, only three steps remain:
 | 
			
		||||
```
 | 
			
		||||
</pt>
 | 
			
		||||
<tf>
 | 
			
		||||
To fine-tune a model in TensorFlow, start by converting your datasets to the `tf.data.Dataset` format with [`~datasets.Dataset.to_tf_dataset`]. Specify inputs in `columns`, targets in `label_cols`, whether to shuffle the dataset order, batch size, and the data collator:
 | 
			
		||||
To fine-tune a model in TensorFlow, start by converting your datasets to the `tf.data.Dataset` format with [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset). Specify inputs in `columns`, targets in `label_cols`, whether to shuffle the dataset order, batch size, and the data collator:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
>>> data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
 | 
			
		||||
 | 
			
		||||
@ -126,7 +126,7 @@ Here is how you can create a function to truncate and map the start and end toke
 | 
			
		||||
...     return inputs
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Use 🤗 Datasets [`~datasets.Dataset.map`] function to apply the preprocessing function over the entire dataset. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once. Remove the columns you don't need:
 | 
			
		||||
Use 🤗 Datasets [`map`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map) function to apply the preprocessing function over the entire dataset. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once. Remove the columns you don't need:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
>>> tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)
 | 
			
		||||
@ -199,7 +199,7 @@ At this point, only three steps remain:
 | 
			
		||||
```
 | 
			
		||||
</pt>
 | 
			
		||||
<tf>
 | 
			
		||||
To fine-tune a model in TensorFlow, start by converting your datasets to the `tf.data.Dataset` format with [`~datasets.Dataset.to_tf_dataset`]. Specify inputs and the start and end positions of an answer in `columns`, whether to shuffle the dataset order, batch size, and the data collator:
 | 
			
		||||
To fine-tune a model in TensorFlow, start by converting your datasets to the `tf.data.Dataset` format with [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset). Specify inputs and the start and end positions of an answer in `columns`, whether to shuffle the dataset order, batch size, and the data collator:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
>>> tf_train_set = tokenized_squad["train"].to_tf_dataset(
 | 
			
		||||
 | 
			
		||||
@ -66,7 +66,7 @@ Create a preprocessing function to tokenize `text` and truncate sequences to be
 | 
			
		||||
...     return tokenizer(examples["text"], truncation=True)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Use 🤗 Datasets [`~datasets.Dataset.map`] function to apply the preprocessing function over the entire dataset. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once:
 | 
			
		||||
Use 🤗 Datasets [`map`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map) function to apply the preprocessing function over the entire dataset. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
tokenized_imdb = imdb.map(preprocess_function, batched=True)
 | 
			
		||||
@ -144,7 +144,7 @@ At this point, only three steps remain:
 | 
			
		||||
</Tip>
 | 
			
		||||
</pt>
 | 
			
		||||
<tf>
 | 
			
		||||
To fine-tune a model in TensorFlow, start by converting your datasets to the `tf.data.Dataset` format with [`~datasets.Dataset.to_tf_dataset`]. Specify inputs and labels in `columns`, whether to shuffle the dataset order, batch size, and the data collator:
 | 
			
		||||
To fine-tune a model in TensorFlow, start by converting your datasets to the `tf.data.Dataset` format with [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset). Specify inputs and labels in `columns`, whether to shuffle the dataset order, batch size, and the data collator:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
>>> tf_train_set = tokenized_imdb["train"].to_tf_dataset(
 | 
			
		||||
 | 
			
		||||
@ -67,7 +67,7 @@ Load the T5 tokenizer to process `text` and `summary`:
 | 
			
		||||
The preprocessing function needs to:
 | 
			
		||||
 | 
			
		||||
1. Prefix the input with a prompt so T5 knows this is a summarization task. Some models capable of multiple NLP tasks require prompting for specific tasks.
 | 
			
		||||
2. Use the keyword `text_target` argument when tokenizing labels.
 | 
			
		||||
2. Use a context manager with the `as_target_tokenizer()` function to parallelize tokenization of inputs and labels.
 | 
			
		||||
3. Truncate sequences to be no longer than the maximum length set by the `max_length` parameter.
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
@ -78,13 +78,14 @@ The preprocessing function needs to:
 | 
			
		||||
...     inputs = [prefix + doc for doc in examples["text"]]
 | 
			
		||||
...     model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
 | 
			
		||||
 | 
			
		||||
...     labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)
 | 
			
		||||
...     with tokenizer.as_target_tokenizer():
 | 
			
		||||
...         labels = tokenizer(examples["summary"], max_length=128, truncation=True)
 | 
			
		||||
 | 
			
		||||
...     model_inputs["labels"] = labels["input_ids"]
 | 
			
		||||
...     return model_inputs
 | 
			
		||||
```
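Because the hunk above interleaves the new `text_target` line with the old `as_target_tokenizer` block, here is the updated function assembled in one place. This is a minimal sketch built from the lines shown above; `prefix` and `tokenizer` come from earlier in the guide, and the `def` line itself is assumed:

```py
>>> def preprocess_function(examples):
...     inputs = [prefix + doc for doc in examples["text"]]
...     model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
...     labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)
...     model_inputs["labels"] = labels["input_ids"]
...     return model_inputs
```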
 | 
			
		||||
 | 
			
		||||
Use 🤗 Datasets [`~datasets.Dataset.map`] function to apply the preprocessing function over the entire dataset. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once:
 | 
			
		||||
Use 🤗 Datasets [`map`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map) function to apply the preprocessing function over the entire dataset. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
>>> tokenized_billsum = billsum.map(preprocess_function, batched=True)
 | 
			
		||||
@ -159,7 +160,7 @@ At this point, only three steps remain:
 | 
			
		||||
```
 | 
			
		||||
</pt>
 | 
			
		||||
<tf>
 | 
			
		||||
To fine-tune a model in TensorFlow, start by converting your datasets to the `tf.data.Dataset` format with [`~datasets.Dataset.to_tf_dataset`]. Specify inputs and labels in `columns`, whether to shuffle the dataset order, batch size, and the data collator:
 | 
			
		||||
To fine-tune a model in TensorFlow, start by converting your datasets to the `tf.data.Dataset` format with [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset). Specify inputs and labels in `columns`, whether to shuffle the dataset order, batch size, and the data collator:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
>>> tf_train_set = tokenized_billsum["train"].to_tf_dataset(
 | 
			
		||||
 | 
			
		||||
@ -126,7 +126,7 @@ Here is how you can create a function to realign the tokens and labels, and trun
 | 
			
		||||
...     return tokenized_inputs
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Use 🤗 Datasets [`~datasets.Dataset.map`] function to tokenize and align the labels over the entire dataset. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once:
 | 
			
		||||
Use 🤗 Datasets [`map`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map) function to tokenize and align the labels over the entire dataset. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
>>> tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True)
 | 
			
		||||
@ -199,7 +199,7 @@ At this point, only three steps remain:
 | 
			
		||||
```
 | 
			
		||||
</pt>
 | 
			
		||||
<tf>
 | 
			
		||||
To fine-tune a model in TensorFlow, start by converting your datasets to the `tf.data.Dataset` format with [`~datasets.Dataset.to_tf_dataset`]. Specify inputs and labels in `columns`, whether to shuffle the dataset order, batch size, and the data collator:
 | 
			
		||||
To fine-tune a model in TensorFlow, start by converting your datasets to the `tf.data.Dataset` format with [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset). Specify inputs and labels in `columns`, whether to shuffle the dataset order, batch size, and the data collator:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
>>> tf_train_set = tokenized_wnut["train"].to_tf_dataset(
 | 
			
		||||
 | 
			
		||||
@ -78,42 +78,25 @@ The preprocessing function needs to:
 | 
			
		||||
>>> def preprocess_function(examples):
 | 
			
		||||
...     inputs = [prefix + example[source_lang] for example in examples["translation"]]
 | 
			
		||||
...     targets = [example[target_lang] for example in examples["translation"]]
 | 
			
		||||
...     model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
 | 
			
		||||
...     model_inputs = tokenizer(inputs, max_length=128, truncation=True)
 | 
			
		||||
 | 
			
		||||
...     with tokenizer.as_target_tokenizer():
 | 
			
		||||
...         labels = tokenizer(targets, max_length=128, truncation=True)
 | 
			
		||||
 | 
			
		||||
...     model_inputs["labels"] = labels["input_ids"]
 | 
			
		||||
...     return model_inputs
 | 
			
		||||
```
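Keeping only the added `text_target` call (which, with recent tokenizers, places the tokenized `labels` directly in `model_inputs`), the updated translation preprocessing reads roughly as follows. This is a sketch assembled from the lines above, with the `def` line assumed:

```py
>>> def preprocess_function(examples):
...     inputs = [prefix + example[source_lang] for example in examples["translation"]]
...     targets = [example[target_lang] for example in examples["translation"]]
...     model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
...     return model_inputs
```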
 | 
			
		||||
 | 
			
		||||
Use 🤗 Datasets [`~datasets.Dataset.map`] function to apply the preprocessing function over the entire dataset. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once:
 | 
			
		||||
Use 🤗 Datasets [`map`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map) function to apply the preprocessing function over the entire dataset. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
>>> tokenized_books = books.map(preprocess_function, batched=True)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
<frameworkcontent>
 | 
			
		||||
<pt>
 | 
			
		||||
Load T5 with [`AutoModelForSeq2SeqLM`]:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
>>> from transformers import AutoModelForSeq2SeqLM
 | 
			
		||||
 | 
			
		||||
>>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
 | 
			
		||||
```
 | 
			
		||||
</pt>
 | 
			
		||||
<tf>
 | 
			
		||||
Load T5 with [`TFAutoModelForSeq2SeqLM`]:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
>>> from transformers import TFAutoModelForSeq2SeqLM
 | 
			
		||||
 | 
			
		||||
>>> model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small")
 | 
			
		||||
```
 | 
			
		||||
</tf>
 | 
			
		||||
</frameworkcontent>
 | 
			
		||||
 | 
			
		||||
Use [`DataCollatorForSeq2Seq`] to create a batch of examples. It will also *dynamically pad* your text and labels to the length of the longest element in its batch, so they are a uniform length. While it is possible to pad your text in the `tokenizer` function by setting `padding=True`, dynamic padding is more efficient.
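For instance, a minimal sketch of the instantiation (the framework-specific blocks below show the corresponding imports; for TensorFlow you would also pass `return_tensors="tf"`):

```py
>>> from transformers import DataCollatorForSeq2Seq

>>> # minimal sketch: passing the model lets the collator prepare decoder inputs when the model supports it
>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
```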
 | 
			
		||||
 | 
			
		||||
<frameworkcontent>
 | 
			
		||||
<pt>
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
>>> from transformers import DataCollatorForSeq2Seq
 | 
			
		||||
 | 
			
		||||
@ -121,7 +104,6 @@ Use [`DataCollatorForSeq2Seq`] to create a batch of examples. It will also *dyna
 | 
			
		||||
```
 | 
			
		||||
</pt>
 | 
			
		||||
<tf>
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
>>> from transformers import DataCollatorForSeq2Seq
 | 
			
		||||
 | 
			
		||||
@ -134,6 +116,13 @@ Use [`DataCollatorForSeq2Seq`] to create a batch of examples. It will also *dyna
 | 
			
		||||
 | 
			
		||||
<frameworkcontent>
 | 
			
		||||
<pt>
 | 
			
		||||
Load T5 with [`AutoModelForSeq2SeqLM`]:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
>>> from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
 | 
			
		||||
 | 
			
		||||
>>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
<Tip>
 | 
			
		||||
 | 
			
		||||
@ -148,8 +137,6 @@ At this point, only three steps remain:
 | 
			
		||||
3. Call [`~Trainer.train`] to fine-tune your model.
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
>>> from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
 | 
			
		||||
 | 
			
		||||
>>> training_args = Seq2SeqTrainingArguments(
 | 
			
		||||
...     output_dir="./results",
 | 
			
		||||
...     evaluation_strategy="epoch",
 | 
			
		||||
@ -175,7 +162,7 @@ At this point, only three steps remain:
 | 
			
		||||
```
 | 
			
		||||
</pt>
 | 
			
		||||
<tf>
 | 
			
		||||
To fine-tune a model in TensorFlow, start by converting your datasets to the `tf.data.Dataset` format with [`~datasets.Dataset.to_tf_dataset`]. Specify inputs and labels in `columns`, whether to shuffle the dataset order, batch size, and the data collator:
 | 
			
		||||
To fine-tune a model in TensorFlow, start by converting your datasets to the `tf.data.Dataset` format with [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset). Specify inputs and labels in `columns`, whether to shuffle the dataset order, batch size, and the data collator:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
>>> tf_train_set = tokenized_books["train"].to_tf_dataset(
 | 
			
		||||
@ -207,6 +194,14 @@ Set up an optimizer function, learning rate schedule, and some training hyperpar
 | 
			
		||||
>>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Load T5 with [`TFAutoModelForSeq2SeqLM`]:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
>>> from transformers import TFAutoModelForSeq2SeqLM
 | 
			
		||||
 | 
			
		||||
>>> model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small")
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method):
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
@ -227,4 +222,4 @@ For a more in-depth example of how to fine-tune a model for translation, take a
 | 
			
		||||
[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation.ipynb)
 | 
			
		||||
or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation-tf.ipynb).
 | 
			
		||||
 | 
			
		||||
</Tip>
 | 
			
		||||
</Tip>
 | 
			
		||||
@ -1226,7 +1226,7 @@ This whole process would have been much easier if we only could set something li
 | 
			
		||||
experimental step, and let it fail without impacting the overall status of PRs. But as mentioned earlier, CircleCI and
 | 
			
		||||
Github Actions don't support it at the moment.
 | 
			
		||||
 | 
			
		||||
You can vote for this feature and see where it is at these CI-specific threads:
 | 
			
		||||
You can vote for this feature and see where it is at at these CI-specific threads:
 | 
			
		||||
 | 
			
		||||
- [Github Actions:](https://github.com/actions/toolkit/issues/399)
 | 
			
		||||
- [CircleCI:](https://ideas.circleci.com/ideas/CCI-I-344)
 | 
			
		||||
 | 
			
		||||
@ -169,7 +169,7 @@ The [`DefaultDataCollator`] assembles tensors into a batch for the model to trai
 | 
			
		||||
 | 
			
		||||
</Tip>
 | 
			
		||||
 | 
			
		||||
Next, convert the tokenized datasets to TensorFlow datasets with the [`~datasets.Dataset.to_tf_dataset`] method. Specify your inputs in `columns`, and your label in `label_cols`:
 | 
			
		||||
Next, convert the tokenized datasets to TensorFlow datasets with the [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset) method. Specify your inputs in `columns`, and your label in `label_cols`:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
>>> tf_train_dataset = small_train_dataset.to_tf_dataset(
 | 
			
		||||
 | 
			
		||||
@ -25,16 +25,12 @@
 | 
			
		||||
    title: Usa tokenizadores de 🤗 Tokenizers
 | 
			
		||||
  - local: create_a_model
 | 
			
		||||
    title: Crea una arquitectura personalizada
 | 
			
		||||
  - local: custom_models
 | 
			
		||||
    title: Compartir modelos personalizados
 | 
			
		||||
  - sections:
 | 
			
		||||
    - local: tasks/language_modeling
 | 
			
		||||
      title: Modelado de lenguaje
 | 
			
		||||
    - local: tasks/image_classification
 | 
			
		||||
      title: Clasificación de imágenes
 | 
			
		||||
    title: Fine-tuning para tareas posteriores
 | 
			
		||||
  - local: run_scripts
 | 
			
		||||
    title: Entrenamiento con scripts
 | 
			
		||||
  - local: sagemaker
 | 
			
		||||
    title: Ejecutar el entrenamiento en Amazon SageMaker
 | 
			
		||||
  - local: multilingual
 | 
			
		||||
 | 
			
		||||
@ -1,351 +0,0 @@
 | 
			
		||||
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
 | 
			
		||||
 | 
			
		||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 | 
			
		||||
the License. You may obtain a copy of the License at
 | 
			
		||||
 | 
			
		||||
http://www.apache.org/licenses/LICENSE-2.0
 | 
			
		||||
 | 
			
		||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 | 
			
		||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 | 
			
		||||
specific language governing permissions and limitations under the License.
 | 
			
		||||
-->
 | 
			
		||||
 | 
			
		||||
# Compartir modelos personalizados
 | 
			
		||||
 | 
			
		||||
La biblioteca 🤗 Transformers está diseñada para ser fácilmente ampliable. Cada modelo está completamente codificado 
 | 
			
		||||
sin abstracción en una subcarpeta determinada del repositorio, por lo que puedes copiar fácilmente un archivo del modelo 
 | 
			
		||||
y ajustarlo según tus necesidades.
 | 
			
		||||
 | 
			
		||||
Si estás escribiendo un modelo completamente nuevo, podría ser más fácil comenzar desde cero. En este tutorial, te mostraremos 
 | 
			
		||||
cómo escribir un modelo personalizado y su configuración para que pueda usarse dentro de Transformers, y cómo puedes compartirlo 
 | 
			
		||||
con la comunidad (con el código en el que se basa) para que cualquiera pueda usarlo, incluso si no está presente en la biblioteca 
 | 
			
		||||
🤗 Transformers.
 | 
			
		||||
 | 
			
		||||
Ilustraremos todo esto con un modelo ResNet, envolviendo la clase ResNet de la [biblioteca timm](https://github.com/rwightman/pytorch-image-models/tree/master/timm) en un [`PreTrainedModel`].
 | 
			
		||||
 | 
			
		||||
## Escribir una configuración personalizada
 | 
			
		||||
 | 
			
		||||
Antes de adentrarnos en el modelo, primero escribamos su configuración. La configuración de un modelo es un objeto que
 | 
			
		||||
contendrá toda la información necesaria para construir el modelo. Como veremos en la siguiente sección, el modelo solo puede
 | 
			
		||||
tomar un `config` para ser inicializado, por lo que realmente necesitamos que ese objeto esté lo más completo posible.
 | 
			
		||||
 | 
			
		||||
En nuestro ejemplo, tomaremos un par de argumentos de la clase ResNet que tal vez queramos modificar. Las diferentes 
 | 
			
		||||
configuraciones nos darán los diferentes tipos de ResNet que son posibles. Luego simplemente almacenamos esos argumentos 
 | 
			
		||||
después de verificar la validez de algunos de ellos.
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
from transformers import PretrainedConfig
 | 
			
		||||
from typing import List
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ResnetConfig(PretrainedConfig):
 | 
			
		||||
    model_type = "resnet"
 | 
			
		||||
 | 
			
		||||
    def __init__(
 | 
			
		||||
        self,
 | 
			
		||||
        block_type="bottleneck",
 | 
			
		||||
        layers: List[int] = [3, 4, 6, 3],
 | 
			
		||||
        num_classes: int = 1000,
 | 
			
		||||
        input_channels: int = 3,
 | 
			
		||||
        cardinality: int = 1,
 | 
			
		||||
        base_width: int = 64,
 | 
			
		||||
        stem_width: int = 64,
 | 
			
		||||
        stem_type: str = "",
 | 
			
		||||
        avg_down: bool = False,
 | 
			
		||||
        **kwargs,
 | 
			
		||||
    ):
 | 
			
		||||
        if block_type not in ["basic", "bottleneck"]:
 | 
			
		||||
            raise ValueError(f"`block_type` must be 'basic' or 'bottleneck', got {block_type}.")
 | 
			
		||||
        if stem_type not in ["", "deep", "deep-tiered"]:
 | 
			
		||||
            raise ValueError(f"`stem_type` must be '', 'deep' or 'deep-tiered', got {stem_type}.")
 | 
			
		||||
 | 
			
		||||
        self.block_type = block_type
 | 
			
		||||
        self.layers = layers
 | 
			
		||||
        self.num_classes = num_classes
 | 
			
		||||
        self.input_channels = input_channels
 | 
			
		||||
        self.cardinality = cardinality
 | 
			
		||||
        self.base_width = base_width
 | 
			
		||||
        self.stem_width = stem_width
 | 
			
		||||
        self.stem_type = stem_type
 | 
			
		||||
        self.avg_down = avg_down
 | 
			
		||||
        super().__init__(**kwargs)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Las tres cosas importantes que debes recordar al escribir tu propia configuración son las siguientes:
 | 
			
		||||
- tienes que heredar de `PretrainedConfig`,
 | 
			
		||||
- el `__init__` de tu `PretrainedConfig` debe aceptar cualquier `kwargs`,
 | 
			
		||||
- esos `kwargs` deben pasarse a la superclase `__init__`.
 | 
			
		||||
 | 
			
		||||
La herencia es para asegurarte de obtener toda la funcionalidad de la biblioteca 🤗 Transformers, mientras que las otras dos 
 | 
			
		||||
restricciones provienen del hecho de que una `PretrainedConfig` tiene más campos que los que estás configurando. Al recargar una 
 | 
			
		||||
`config` con el método `from_pretrained`, esos campos deben ser aceptados por tu `config` y luego enviados a la superclase.
 | 
			
		||||
 | 
			
		||||
Definir un `model_type` para tu configuración (en este caso `model_type="resnet"`) no es obligatorio, a menos que quieras
 | 
			
		||||
registrar tu modelo con las clases automáticas (ver la última sección).
 | 
			
		||||
 | 
			
		||||
Una vez hecho esto, puedes crear y guardar fácilmente tu configuración como lo harías con cualquier otra configuración de un 
 | 
			
		||||
modelo de la biblioteca. Así es como podemos crear una configuración resnet50d y guardarla:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True)
 | 
			
		||||
resnet50d_config.save_pretrained("custom-resnet")
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Esto guardará un archivo llamado `config.json` dentro de la carpeta `custom-resnet`. Luego puedes volver a cargar tu configuración 
 | 
			
		||||
con el método `from_pretrained`:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
resnet50d_config = ResnetConfig.from_pretrained("custom-resnet")
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
También puedes usar cualquier otro método de la clase [`PretrainedConfig`], como [`~PretrainedConfig.push_to_hub`], para cargar 
 | 
			
		||||
directamente tu configuración en el Hub.
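Por ejemplo, un esbozo mínimo (asumiendo que ya iniciaste sesión en el Hub):

```py
resnet50d_config.push_to_hub("custom-resnet")
```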
 | 
			
		||||
 | 
			
		||||
## Escribir un modelo personalizado
 | 
			
		||||
 | 
			
		||||
Ahora que tenemos nuestra configuración de ResNet, podemos seguir escribiendo el modelo. En realidad escribiremos dos: una que
 | 
			
		||||
extrae las características ocultas de un grupo de imágenes (como [`BertModel`]) y una que es adecuada para clasificación de
 | 
			
		||||
imágenes (como [`BertForSequenceClassification`]).
 | 
			
		||||
 | 
			
		||||
Como mencionamos antes, solo escribiremos una envoltura (_wrapper_) sencilla del modelo para simplificar este ejemplo. Lo único que debemos 
 | 
			
		||||
hacer antes de escribir esta clase es un mapeo entre los tipos de bloques y las clases de bloques reales. Luego se define el 
 | 
			
		||||
modelo desde la configuración pasando todo a la clase `ResNet`:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
from transformers import PreTrainedModel
 | 
			
		||||
from timm.models.resnet import BasicBlock, Bottleneck, ResNet
 | 
			
		||||
from .configuration_resnet import ResnetConfig
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
BLOCK_MAPPING = {"basic": BasicBlock, "bottleneck": Bottleneck}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ResnetModel(PreTrainedModel):
 | 
			
		||||
    config_class = ResnetConfig
 | 
			
		||||
 | 
			
		||||
    def __init__(self, config):
 | 
			
		||||
        super().__init__(config)
 | 
			
		||||
        block_layer = BLOCK_MAPPING[config.block_type]
 | 
			
		||||
        self.model = ResNet(
 | 
			
		||||
            block_layer,
 | 
			
		||||
            config.layers,
 | 
			
		||||
            num_classes=config.num_classes,
 | 
			
		||||
            in_chans=config.input_channels,
 | 
			
		||||
            cardinality=config.cardinality,
 | 
			
		||||
            base_width=config.base_width,
 | 
			
		||||
            stem_width=config.stem_width,
 | 
			
		||||
            stem_type=config.stem_type,
 | 
			
		||||
            avg_down=config.avg_down,
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    def forward(self, tensor):
 | 
			
		||||
        return self.model.forward_features(tensor)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Para el modelo que clasificará las imágenes, solo cambiamos el método de avance (es decir, el método `forward`):
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
import torch


class ResnetModelForImageClassification(PreTrainedModel):
 | 
			
		||||
    config_class = ResnetConfig
 | 
			
		||||
 | 
			
		||||
    def __init__(self, config):
 | 
			
		||||
        super().__init__(config)
 | 
			
		||||
        block_layer = BLOCK_MAPPING[config.block_type]
 | 
			
		||||
        self.model = ResNet(
 | 
			
		||||
            block_layer,
 | 
			
		||||
            config.layers,
 | 
			
		||||
            num_classes=config.num_classes,
 | 
			
		||||
            in_chans=config.input_channels,
 | 
			
		||||
            cardinality=config.cardinality,
 | 
			
		||||
            base_width=config.base_width,
 | 
			
		||||
            stem_width=config.stem_width,
 | 
			
		||||
            stem_type=config.stem_type,
 | 
			
		||||
            avg_down=config.avg_down,
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    def forward(self, tensor, labels=None):
 | 
			
		||||
        logits = self.model(tensor)
 | 
			
		||||
        if labels is not None:
 | 
			
		||||
            loss = torch.nn.functional.cross_entropy(logits, labels)
 | 
			
		||||
            return {"loss": loss, "logits": logits}
 | 
			
		||||
        return {"logits": logits}
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
En ambos casos, observa cómo heredamos de `PreTrainedModel` y llamamos a la inicialización de la superclase con `config` 
 | 
			
		||||
(un poco como cuando escribes `torch.nn.Module`). La línea que establece `config_class` no es obligatoria, a menos 
 | 
			
		||||
que quieras registrar tu modelo con las clases automáticas (consulta la última sección).
 | 
			
		||||
 | 
			
		||||
<Tip>
 | 
			
		||||
 | 
			
		||||
Si tu modelo es muy similar a un modelo dentro de la biblioteca, puedes reutilizar la misma configuración de ese modelo.
 | 
			
		||||
 | 
			
		||||
</Tip>
 | 
			
		||||
 | 
			
		||||
Puedes hacer que tu modelo devuelva lo que quieras, pero devolver un diccionario como lo hicimos para 
 | 
			
		||||
`ResnetModelForImageClassification`, con el `loss` incluido cuando se pasan las etiquetas, hará que tu modelo se pueda 
 | 
			
		||||
usar directamente dentro de la clase [`Trainer`]. Usar otro formato de salida está bien, siempre y cuando estés planeando usar 
 | 
			
		||||
tu propio bucle de entrenamiento u otra biblioteca para el entrenamiento.
 | 
			
		||||
 | 
			
		||||
Ahora que tenemos nuestra clase, vamos a crear un modelo:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
resnet50d = ResnetModelForImageClassification(resnet50d_config)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Nuevamente, puedes usar cualquiera de los métodos de [`PreTrainedModel`], como [`~PreTrainedModel.save_pretrained`] o 
 | 
			
		||||
[`~PreTrainedModel.push_to_hub`]. Usaremos el segundo en la siguiente sección y veremos cómo pasar los pesos del modelo 
 | 
			
		||||
con el código de nuestro modelo. Pero primero, carguemos algunos pesos previamente entrenados dentro de nuestro modelo.
 | 
			
		||||
 | 
			
		||||
En tu caso de uso, probablemente estarás entrenando tu modelo personalizado con tus propios datos. Para ir rápido en este 
 | 
			
		||||
tutorial, usaremos la versión preentrenada de resnet50d. Dado que nuestro modelo es solo un envoltorio alrededor del resnet50d 
 | 
			
		||||
original, será fácil transferir esos pesos:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
import timm
 | 
			
		||||
 | 
			
		||||
pretrained_model = timm.create_model("resnet50d", pretrained=True)
 | 
			
		||||
resnet50d.model.load_state_dict(pretrained_model.state_dict())
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Ahora veamos cómo asegurarnos de que cuando hacemos [`~PreTrainedModel.save_pretrained`] o [`~PreTrainedModel.push_to_hub`], 
 | 
			
		||||
se guarda el código del modelo.
 | 
			
		||||
 | 
			
		||||
## Enviar el código al _Hub_
 | 
			
		||||
 | 
			
		||||
<Tip warning={true}>
 | 
			
		||||
 | 
			
		||||
Esta _API_ es experimental y puede tener algunos cambios leves en las próximas versiones.
 | 
			
		||||
 | 
			
		||||
</Tip>
 | 
			
		||||
 | 
			
		||||
Primero, asegúrate de que tu modelo esté completamente definido en un archivo `.py`. Puedes basarte en importaciones 
 | 
			
		||||
relativas a otros archivos, siempre que todos los archivos estén en el mismo directorio (aún no admitimos submódulos 
 | 
			
		||||
para esta característica). Para nuestro ejemplo, definiremos un archivo `modeling_resnet.py` y un archivo 
 | 
			
		||||
`configuration_resnet.py` en una carpeta del directorio de trabajo actual llamado `resnet_model`. El archivo de configuración 
 | 
			
		||||
contiene el código de `ResnetConfig` y el archivo del modelo contiene el código de `ResnetModel` y 
 | 
			
		||||
`ResnetModelForImageClassification`.
 | 
			
		||||
 | 
			
		||||
```
 | 
			
		||||
.
 | 
			
		||||
└── resnet_model
 | 
			
		||||
    ├── __init__.py
 | 
			
		||||
    ├── configuration_resnet.py
 | 
			
		||||
    └── modeling_resnet.py
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
El `__init__.py` puede estar vacío; solo está ahí para que Python detecte que `resnet_model` se puede usar como un módulo.
 | 
			
		||||
 | 
			
		||||
<Tip warning={true}>
 | 
			
		||||
 | 
			
		||||
Si copias archivos del modelo desde la biblioteca, deberás reemplazar todas las importaciones relativas en la parte superior 
 | 
			
		||||
del archivo para importarlos desde el paquete `transformers`.
 | 
			
		||||
 | 
			
		||||
</Tip>
 | 
			
		||||
 | 
			
		||||
Ten en cuenta que puedes reutilizar (o subclasificar) una configuración o modelo existente.
 | 
			
		||||
 | 
			
		||||
Para compartir tu modelo con la comunidad, sigue estos pasos: primero importa el modelo y la configuración de ResNet desde 
 | 
			
		||||
los archivos recién creados:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
from resnet_model.configuration_resnet import ResnetConfig
 | 
			
		||||
from resnet_model.modeling_resnet import ResnetModel, ResnetModelForImageClassification
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Luego, debes decirle a la biblioteca que deseas copiar el código de esos objetos cuando usas el método `save_pretrained` 
y registrarlos correctamente con una determinada clase automática (especialmente para modelos). Para hacerlo, simplemente ejecuta:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
ResnetConfig.register_for_auto_class()
 | 
			
		||||
ResnetModel.register_for_auto_class("AutoModel")
 | 
			
		||||
ResnetModelForImageClassification.register_for_auto_class("AutoModelForImageClassification")
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Ten en cuenta que no es necesario especificar una clase automática para la configuración (solo hay una clase automática 
 | 
			
		||||
para ellos, [`AutoConfig`]), pero es diferente para los modelos. Tu modelo personalizado podría ser adecuado para muchas 
 | 
			
		||||
tareas diferentes, por lo que debes especificar cuál de las clases automáticas es la correcta para tu modelo.
 | 
			
		||||
 | 
			
		||||
A continuación, vamos a crear la configuración y los modelos como lo hicimos antes:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True)
 | 
			
		||||
resnet50d = ResnetModelForImageClassification(resnet50d_config)
 | 
			
		||||
 | 
			
		||||
pretrained_model = timm.create_model("resnet50d", pretrained=True)
 | 
			
		||||
resnet50d.model.load_state_dict(pretrained_model.state_dict())
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Ahora, para enviar el modelo al Hub, asegúrate de haber iniciado sesión. Ejecuta en tu terminal:
 | 
			
		||||
 | 
			
		||||
```bash
 | 
			
		||||
huggingface-cli login
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
o desde un _notebook_:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
from huggingface_hub import notebook_login
 | 
			
		||||
 | 
			
		||||
notebook_login()
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Luego puedes subir el modelo a tu propio espacio (o a una organización de la que seas miembro) de esta manera:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
resnet50d.push_to_hub("custom-resnet50d")
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Además de los pesos del modelo y la configuración en formato json, esto también copió los archivos `.py` del modelo y la
 | 
			
		||||
configuración en la carpeta `custom-resnet50d` y subió el resultado al Hub. Puedes verificar el resultado en este 
 | 
			
		||||
[repositorio de modelos](https://huggingface.co/sgugger/custom-resnet50d).
 | 
			
		||||
 | 
			
		||||
Consulta el tutorial sobre cómo [compartir modelos](model_sharing) para obtener más información sobre el método para subir modelos al Hub.
 | 
			
		||||
 | 
			
		||||
## Usar un modelo con código personalizado
 | 
			
		||||
 | 
			
		||||
Puedes usar cualquier configuración, modelo o _tokenizador_ con archivos de código personalizado en tu repositorio con las 
 | 
			
		||||
clases automáticas y el método `from_pretrained`. Todos los archivos y códigos cargados en el Hub se analizan en busca de 
 | 
			
		||||
malware (consulta la documentación de [seguridad del Hub](https://huggingface.co/docs/hub/security#malware-scanning) para 
 | 
			
		||||
obtener más información), pero aún debes revisar el código del modelo y el autor para evitar la ejecución de código malicioso 
 | 
			
		||||
en tu computadora. Configura `trust_remote_code=True` para usar un modelo con código personalizado:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
from transformers import AutoModelForImageClassification
 | 
			
		||||
 | 
			
		||||
model = AutoModelForImageClassification.from_pretrained("sgugger/custom-resnet50d", trust_remote_code=True)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
También se recomienda encarecidamente pasar un _hash_ de confirmación como una "revisión" para asegurarte de que el autor 
 | 
			
		||||
de los modelos no actualizó el código con algunas líneas nuevas maliciosas (a menos que confíes plenamente en los autores 
 | 
			
		||||
de los modelos).
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
commit_hash = "ed94a7c6247d8aedce4647f00f20de6875b5b292"
 | 
			
		||||
model = AutoModelForImageClassification.from_pretrained(
 | 
			
		||||
    "sgugger/custom-resnet50d", trust_remote_code=True, revision=commit_hash
 | 
			
		||||
)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Ten en cuenta que al navegar por el historial de confirmaciones del repositorio del modelo en Hub, hay un botón para copiar 
 | 
			
		||||
fácilmente el hash de confirmación de cualquier _commit_.
 | 
			
		||||
 | 
			
		||||
## Registrar un modelo con código personalizado en las clases automáticas
 | 
			
		||||
 | 
			
		||||
Si estás escribiendo una biblioteca que amplía 🤗 Transformers, es posible que quieras ampliar las clases automáticas para 
 | 
			
		||||
incluir tu propio modelo. Esto es diferente de enviar el código al Hub en el sentido de que los usuarios necesitarán importar 
 | 
			
		||||
tu biblioteca para obtener los modelos personalizados (al contrario de descargar automáticamente el código del modelo desde Hub).
 | 
			
		||||
 | 
			
		||||
Siempre que tu configuración tenga un atributo `model_type` que sea diferente de los tipos de modelos existentes, y que tus 
 | 
			
		||||
clases modelo tengan los atributos `config_class` correctos, puedes agregarlos a las clases automáticas de la siguiente manera:
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
from transformers import AutoConfig, AutoModel, AutoModelForImageClassification
 | 
			
		||||
 | 
			
		||||
AutoConfig.register("resnet", ResnetConfig)
 | 
			
		||||
AutoModel.register(ResnetConfig, ResnetModel)
 | 
			
		||||
AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Ten en cuenta que el primer argumento utilizado al registrar tu configuración personalizada en [`AutoConfig`] debe coincidir
 | 
			
		||||
con el `model_type` de tu configuración personalizada, y el primer argumento utilizado al registrar tus modelos personalizados
 | 
			
		||||
en cualquier clase automática de modelos debe coincidir con el `config_class` de esos modelos.
 | 
			
		||||
@ -471,8 +471,10 @@ Un processor combina un extractor de características y un tokenizador. Cargue u
 | 
			
		||||
>>> def prepare_dataset(example):
 | 
			
		||||
...     audio = example["audio"]
 | 
			
		||||
 | 
			
		||||
...     example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000))
 | 
			
		||||
...     example["input_values"] = processor(audio["array"], sampling_rate=16000)
 | 
			
		||||
 | 
			
		||||
...     with processor.as_target_processor():
 | 
			
		||||
...         example["labels"] = processor(example["text"]).input_ids
 | 
			
		||||
...     return example
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -1,347 +0,0 @@
 | 
			
		||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
 | 
			
		||||
 | 
			
		||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 | 
			
		||||
the License. You may obtain a copy of the License at
 | 
			
		||||
 | 
			
		||||
http://www.apache.org/licenses/LICENSE-2.0
 | 
			
		||||
 | 
			
		||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 | 
			
		||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 | 
			
		||||
specific language governing permissions and limitations under the License.
 | 
			
		||||
-->
 | 
			
		||||
 | 
			
		||||
# Entrenamiento con scripts
 | 
			
		||||
 | 
			
		||||
Junto con los [notebooks](./notebooks/README) de 🤗 Transformers, también hay scripts con ejemplos que muestran cómo entrenar un modelo para una tarea en [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow), o [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax).
 | 
			
		||||
 | 
			
		||||
También encontrarás scripts que hemos usado en nuestros [proyectos de investigación](https://github.com/huggingface/transformers/tree/main/examples/research_projects) y [ejemplos pasados](https://github.com/huggingface/transformers/tree/main/examples/legacy) que en su mayoría son aportados por la comunidad. Estos scripts no se mantienen activamente y requieren una versión específica de 🤗 Transformers que probablemente sea incompatible con la última versión de la biblioteca.
 | 
			
		||||
 | 
			
		||||
No se espera que los scripts de ejemplo funcionen de inmediato en todos los problemas, y es posible que debas adaptar el script al problema que estás tratando de resolver. Para ayudarte con esto, la mayoría de los scripts exponen completamente cómo se preprocesan los datos, lo que te permite editarlos según sea necesario para tu caso de uso.
 | 
			
		||||
 | 
			
		||||
Para cualquier característica que te gustaría implementar en un script de ejemplo, por favor discútelo en el [foro](https://discuss.huggingface.co/) o con un [issue](https://github.com/huggingface/transformers/issues) antes de enviar un Pull Request. Si bien agradecemos las correcciones de errores, es poco probable que fusionemos un Pull Request que agregue más funcionalidad a costa de la legibilidad.
 | 
			
		||||
 | 
			
		||||
Esta guía te mostrará cómo ejecutar un ejemplo de un script de entrenamiento para resumir texto en [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization) y [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/summarization). Se espera que todos los ejemplos funcionen con ambos frameworks a menos que se especifique lo contrario.
 | 
			
		||||
 | 
			
		||||
## Configuración
 | 
			
		||||
 | 
			
		||||
Para ejecutar con éxito la última versión de los scripts de ejemplo debes **instalar 🤗 Transformers desde su fuente** en un nuevo entorno virtual:
 | 
			
		||||
 | 
			
		||||
```bash
 | 
			
		||||
git clone https://github.com/huggingface/transformers
 | 
			
		||||
cd transformers
 | 
			
		||||
pip install .
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Para versiones anteriores de los scripts de ejemplo, haz clic en alguno de los siguientes links:
 | 
			
		||||
 | 
			
		||||
<details>
 | 
			
		||||
  <summary>Ejemplos de versiones anteriores de 🤗 Transformers</summary>
 | 
			
		||||
	<ul>
 | 
			
		||||
		<li><a href="https://github.com/huggingface/transformers/tree/v4.5.1/examples">v4.5.1</a></li>
 | 
			
		||||
		<li><a href="https://github.com/huggingface/transformers/tree/v4.4.2/examples">v4.4.2</a></li>
 | 
			
		||||
		<li><a href="https://github.com/huggingface/transformers/tree/v4.3.3/examples">v4.3.3</a></li>
 | 
			
		||||
		<li><a href="https://github.com/huggingface/transformers/tree/v4.2.2/examples">v4.2.2</a></li>
 | 
			
		||||
		<li><a href="https://github.com/huggingface/transformers/tree/v4.1.1/examples">v4.1.1</a></li>
 | 
			
		||||
		<li><a href="https://github.com/huggingface/transformers/tree/v4.0.1/examples">v4.0.1</a></li>
 | 
			
		||||
		<li><a href="https://github.com/huggingface/transformers/tree/v3.5.1/examples">v3.5.1</a></li>
 | 
			
		||||
		<li><a href="https://github.com/huggingface/transformers/tree/v3.4.0/examples">v3.4.0</a></li>
 | 
			
		||||
		<li><a href="https://github.com/huggingface/transformers/tree/v3.3.1/examples">v3.3.1</a></li>
 | 
			
		||||
		<li><a href="https://github.com/huggingface/transformers/tree/v3.2.0/examples">v3.2.0</a></li>
 | 
			
		||||
		<li><a href="https://github.com/huggingface/transformers/tree/v3.1.0/examples">v3.1.0</a></li>
 | 
			
		||||
		<li><a href="https://github.com/huggingface/transformers/tree/v3.0.2/examples">v3.0.2</a></li>
 | 
			
		||||
		<li><a href="https://github.com/huggingface/transformers/tree/v2.11.0/examples">v2.11.0</a></li>
 | 
			
		||||
		<li><a href="https://github.com/huggingface/transformers/tree/v2.10.0/examples">v2.10.0</a></li>
 | 
			
		||||
		<li><a href="https://github.com/huggingface/transformers/tree/v2.9.1/examples">v2.9.1</a></li>
 | 
			
		||||
		<li><a href="https://github.com/huggingface/transformers/tree/v2.8.0/examples">v2.8.0</a></li>
 | 
			
		||||
		<li><a href="https://github.com/huggingface/transformers/tree/v2.7.0/examples">v2.7.0</a></li>
 | 
			
		||||
		<li><a href="https://github.com/huggingface/transformers/tree/v2.6.0/examples">v2.6.0</a></li>
 | 
			
		||||
		<li><a href="https://github.com/huggingface/transformers/tree/v2.5.1/examples">v2.5.1</a></li>
 | 
			
		||||
		<li><a href="https://github.com/huggingface/transformers/tree/v2.4.0/examples">v2.4.0</a></li>
 | 
			
		||||
		<li><a href="https://github.com/huggingface/transformers/tree/v2.3.0/examples">v2.3.0</a></li>
 | 
			
		||||
		<li><a href="https://github.com/huggingface/transformers/tree/v2.2.0/examples">v2.2.0</a></li>
 | 
			
		||||
		<li><a href="https://github.com/huggingface/transformers/tree/v2.1.0/examples">v2.1.1</a></li>
 | 
			
		||||
		<li><a href="https://github.com/huggingface/transformers/tree/v2.0.0/examples">v2.0.0</a></li>
 | 
			
		||||
		<li><a href="https://github.com/huggingface/transformers/tree/v1.2.0/examples">v1.2.0</a></li>
 | 
			
		||||
		<li><a href="https://github.com/huggingface/transformers/tree/v1.1.0/examples">v1.1.0</a></li>
 | 
			
		||||
		<li><a href="https://github.com/huggingface/transformers/tree/v1.0.0/examples">v1.0.0</a></li>
 | 
			
		||||
	</ul>
 | 
			
		||||
</details>
 | 
			
		||||
 | 
			
		||||
Luego cambia tu clon actual de 🤗 Transformers a una versión específica, por ejemplo v3.5.1:
 | 
			
		||||
 | 
			
		||||
```bash
 | 
			
		||||
git checkout tags/v3.5.1
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Una vez que hayas configurado la versión correcta de la biblioteca, ve a la carpeta de ejemplo de tu elección e instala los requisitos específicos del ejemplo:
 | 
			
		||||
 | 
			
		||||
```bash
 | 
			
		||||
pip install -r requirements.txt
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## Ejecutar un script
 | 
			
		||||
 | 
			
		||||
<frameworkcontent>
 | 
			
		||||
<pt>
 | 
			
		||||
El script de ejemplo descarga y preprocesa un conjunto de datos de la biblioteca 🤗 [Datasets](https://huggingface.co/docs/datasets/). Luego, el script ajusta un conjunto de datos con [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) en una arquitectura que soporta la tarea de resumen. El siguiente ejemplo muestra cómo ajustar un [T5-small](https://huggingface.co/t5-small) en el conjunto de datos [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail). El modelo T5 requiere un argumento adicional `source_prefix` debido a cómo fue entrenado. Este aviso le permite a T5 saber que se trata de una tarea de resumir.
 | 
			
		||||
 | 
			
		||||
```bash
 | 
			
		||||
python examples/pytorch/summarization/run_summarization.py \
 | 
			
		||||
    --model_name_or_path t5-small \
 | 
			
		||||
    --do_train \
 | 
			
		||||
    --do_eval \
 | 
			
		||||
    --dataset_name cnn_dailymail \
 | 
			
		||||
    --dataset_config "3.0.0" \
 | 
			
		||||
    --source_prefix "summarize: " \
 | 
			
		||||
    --output_dir /tmp/tst-summarization \
 | 
			
		||||
    --per_device_train_batch_size=4 \
 | 
			
		||||
    --per_device_eval_batch_size=4 \
 | 
			
		||||
    --overwrite_output_dir \
 | 
			
		||||
    --predict_with_generate
 | 
			
		||||
```
 | 
			
		||||
</pt>
 | 
			
		||||
<tf>
 | 
			
		||||
El script de ejemplo descarga y preprocesa un conjunto de datos de la biblioteca 🤗 [Datasets](https://huggingface.co/docs/datasets/). Luego, el script ajusta un conjunto de datos utilizando Keras en una arquitectura que soporta la tarea de resumir. El siguiente ejemplo muestra cómo ajustar un [T5-small](https://huggingface.co/t5-small) en el conjunto de datos [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail). El modelo T5 requiere un argumento adicional `source_prefix` debido a cómo fue entrenado. Este aviso le permite a T5 saber que se trata de una tarea de resumir.
 | 
			
		||||
 | 
			
		||||
```bash
 | 
			
		||||
python examples/tensorflow/summarization/run_summarization.py  \
 | 
			
		||||
    --model_name_or_path t5-small \
 | 
			
		||||
    --dataset_name cnn_dailymail \
 | 
			
		||||
    --dataset_config "3.0.0" \
 | 
			
		||||
    --output_dir /tmp/tst-summarization  \
 | 
			
		||||
    --per_device_train_batch_size 8 \
 | 
			
		||||
    --per_device_eval_batch_size 16 \
 | 
			
		||||
    --num_train_epochs 3 \
 | 
			
		||||
    --do_train \
 | 
			
		||||
    --do_eval
 | 
			
		||||
```
 | 
			
		||||
</tf>
 | 
			
		||||
</frameworkcontent>
 | 
			
		||||
 | 
			
		||||
## Entrenamiento distribuido y de precisión mixta
 | 
			
		||||
 | 
			
		||||
[Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) admite un entrenamiento distribuido y de precisión mixta, lo que significa que también puedes usarlo en un script. Para habilitar ambas características:
 | 
			
		||||
 | 
			
		||||
- Agrega el argumento `fp16` para habilitar la precisión mixta.
 | 
			
		||||
- Establece la cantidad de GPUs que usarás con el argumento `nproc_per_node`.
 | 
			
		||||
 | 
			
		||||
```bash
 | 
			
		||||
python -m torch.distributed.launch \
 | 
			
		||||
    --nproc_per_node 8 pytorch/summarization/run_summarization.py \
 | 
			
		||||
    --fp16 \
 | 
			
		||||
    --model_name_or_path t5-small \
 | 
			
		||||
    --do_train \
 | 
			
		||||
    --do_eval \
 | 
			
		||||
    --dataset_name cnn_dailymail \
 | 
			
		||||
    --dataset_config "3.0.0" \
 | 
			
		||||
    --source_prefix "summarize: " \
 | 
			
		||||
    --output_dir /tmp/tst-summarization \
 | 
			
		||||
    --per_device_train_batch_size=4 \
 | 
			
		||||
    --per_device_eval_batch_size=4 \
 | 
			
		||||
    --overwrite_output_dir \
 | 
			
		||||
    --predict_with_generate
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Los scripts de TensorFlow utilizan [`MirroredStrategy`](https://www.tensorflow.org/guide/distributed_training#mirroredstrategy) para el entrenamiento distribuido, y no es necesario agregar argumentos adicionales al script de entrenamiento. El script de TensorFlow utilizará múltiples GPUs de forma predeterminada si están disponibles.
 | 
			
		||||
 | 
			
		||||
## Ejecutar un script en una TPU
 | 
			
		||||
 | 
			
		||||
<frameworkcontent>
 | 
			
		||||
<pt>
 | 
			
		||||
Las Unidades de Procesamiento de Tensor (TPUs) están diseñadas específicamente para acelerar el rendimiento. PyTorch admite TPU con el compilador de aprendizaje profundo [XLA](https://www.tensorflow.org/xla) (consulta [aquí](https://github.com/pytorch/xla/blob/master/README.md) para obtener más detalles). Para usar una TPU, inicia el script `xla_spawn.py` y usa el argumento `num_cores` para establecer la cantidad de núcleos de TPU que deseas usar.
 | 
			
		||||
 | 
			
		||||
```bash
 | 
			
		||||
python xla_spawn.py --num_cores 8 \
 | 
			
		||||
    summarization/run_summarization.py \
 | 
			
		||||
    --model_name_or_path t5-small \
 | 
			
		||||
    --do_train \
 | 
			
		||||
    --do_eval \
 | 
			
		||||
    --dataset_name cnn_dailymail \
 | 
			
		||||
    --dataset_config "3.0.0" \
 | 
			
		||||
    --source_prefix "summarize: " \
 | 
			
		||||
    --output_dir /tmp/tst-summarization \
 | 
			
		||||
    --per_device_train_batch_size=4 \
 | 
			
		||||
    --per_device_eval_batch_size=4 \
 | 
			
		||||
    --overwrite_output_dir \
 | 
			
		||||
    --predict_with_generate
 | 
			
		||||
```
 | 
			
		||||
</pt>
 | 
			
		||||
<tf>
 | 
			
		||||
Las Unidades de Procesamiento de Tensor (TPUs) están diseñadas específicamente para acelerar el rendimiento. TensorFlow utiliza [`TPUStrategy`](https://www.tensorflow.org/guide/distributed_training#tpustrategy) para entrenar en TPUs. Para usar una TPU, pasa el nombre del recurso de la TPU al argumento `tpu`.
 | 
			
		||||
 | 
			
		||||
```bash
 | 
			
		||||
python run_summarization.py  \
 | 
			
		||||
    --tpu name_of_tpu_resource \
 | 
			
		||||
    --model_name_or_path t5-small \
 | 
			
		||||
    --dataset_name cnn_dailymail \
 | 
			
		||||
    --dataset_config "3.0.0" \
 | 
			
		||||
    --output_dir /tmp/tst-summarization  \
 | 
			
		||||
    --per_device_train_batch_size 8 \
 | 
			
		||||
    --per_device_eval_batch_size 16 \
 | 
			
		||||
    --num_train_epochs 3 \
 | 
			
		||||
    --do_train \
 | 
			
		||||
    --do_eval
 | 
			
		||||
```
 | 
			
		||||
</tf>
</frameworkcontent>

## Run a script with 🤗 Accelerate

🤗 [Accelerate](https://huggingface.co/docs/accelerate/index.html) is a PyTorch-only library that offers a unified way to train a model on several types of setups (CPU-only, multiple GPUs, TPUs) while maintaining complete visibility into the PyTorch training loop. Make sure you have 🤗 Accelerate installed if you don't already:

> Note: As Accelerate is developing rapidly, you must install the git version of Accelerate to run the scripts.
```bash
pip install git+https://github.com/huggingface/accelerate
```

Instead of the `run_summarization.py` script, you need to use the `run_summarization_no_trainer.py` script. 🤗 Accelerate-supported scripts have a `task_no_trainer.py` file in their folder. Start by running the following command to create and save a configuration file:

```bash
accelerate config
```

Test your setup to make sure it is configured correctly:

```bash
accelerate test
```

Now you are ready to launch the training:

```bash
accelerate launch run_summarization_no_trainer.py \
    --model_name_or_path t5-small \
    --dataset_name cnn_dailymail \
    --dataset_config "3.0.0" \
    --source_prefix "summarize: " \
    --output_dir ~/tmp/tst-summarization
```
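
The `_no_trainer.py` scripts wrap a plain PyTorch training loop with 🤗 Accelerate so the same code runs on whichever hardware you described in `accelerate config`. The following is a minimal, self-contained sketch of that pattern, with toy stand-ins for the model, optimizer, and data that the real script builds:

```python
import torch
from accelerate import Accelerator
from torch.utils.data import DataLoader, TensorDataset

accelerator = Accelerator()

# Toy stand-ins for the objects run_summarization_no_trainer.py actually builds.
model = torch.nn.Linear(4, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
dataset = TensorDataset(torch.randn(32, 4), torch.randn(32, 2))
dataloader = DataLoader(dataset, batch_size=8)

# `prepare` moves everything to the configured device(s) and wraps it for
# distributed execution according to the saved `accelerate config`.
model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

model.train()
for inputs, targets in dataloader:
    loss = torch.nn.functional.mse_loss(model(inputs), targets)
    accelerator.backward(loss)  # replaces loss.backward()
    optimizer.step()
    optimizer.zero_grad()
```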

## Use a custom dataset

The summarization script supports custom datasets as long as they are CSV or JSON Lines files. When you use your own dataset, you need to specify several additional arguments:

- `train_file` and `validation_file` specify the paths to your training and validation files (the sketch after the example command below shows how these files are loaded).
- `text_column` is the input text to summarize.
- `summary_column` is the target text to output.

A summarization script using a custom dataset would look like this:

```bash
python examples/pytorch/summarization/run_summarization.py \
    --model_name_or_path t5-small \
    --do_train \
    --do_eval \
    --train_file path_to_csv_or_jsonlines_file \
    --validation_file path_to_csv_or_jsonlines_file \
    --text_column text_column_name \
    --summary_column summary_column_name \
    --source_prefix "summarize: " \
    --output_dir /tmp/tst-summarization \
    --overwrite_output_dir \
    --per_device_train_batch_size=4 \
    --per_device_eval_batch_size=4 \
    --predict_with_generate
```
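
Under the hood, `train_file` and `validation_file` are handed to 🤗 Datasets, which loads CSV and JSON Lines files in the same way. A rough sketch of what that loading step amounts to, with placeholder file names:

```python
from datasets import load_dataset

# Placeholder paths; for JSON Lines files use the "json" builder instead of "csv".
data_files = {"train": "train.csv", "validation": "validation.csv"}
raw_datasets = load_dataset("csv", data_files=data_files)

# The columns passed as --text_column and --summary_column must exist in these files.
print(raw_datasets["train"].column_names)
```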

## Test a script

It is often a good idea to run your script on a smaller number of examples to make sure everything works as expected before committing to an entire dataset, which could take hours to complete. Use the following arguments to truncate the dataset to a maximum number of samples:

- `max_train_samples`
- `max_eval_samples`
- `max_predict_samples`

```bash
python examples/pytorch/summarization/run_summarization.py \
    --model_name_or_path t5-small \
    --max_train_samples 50 \
    --max_eval_samples 50 \
    --max_predict_samples 50 \
    --do_train \
    --do_eval \
    --dataset_name cnn_dailymail \
    --dataset_config "3.0.0" \
    --source_prefix "summarize: " \
    --output_dir /tmp/tst-summarization \
    --per_device_train_batch_size=4 \
    --per_device_eval_batch_size=4 \
    --overwrite_output_dir \
    --predict_with_generate
```
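
These arguments simply truncate each split before training starts. If you want the same effect in your own code, `--max_train_samples 50` is roughly equivalent to the following sketch (it downloads the CNN/DailyMail training split used in the examples):

```python
from datasets import load_dataset

# Load the same dataset used in the examples and keep only the first 50 samples.
train_dataset = load_dataset("cnn_dailymail", "3.0.0", split="train")
max_train_samples = min(len(train_dataset), 50)
train_dataset = train_dataset.select(range(max_train_samples))
print(len(train_dataset))  # 50
```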

Not all example scripts support the `max_predict_samples` argument. If you are not sure whether your script supports it, add the `-h` argument to check:

```bash
examples/pytorch/summarization/run_summarization.py -h
```

## Resume training from a checkpoint

Another helpful option is to resume training from a previous checkpoint. This ensures you can pick up where you left off without starting over if your training is interrupted. There are two methods to resume training from a checkpoint.

The first method uses the `output_dir previous_output_dir` argument to resume training from the latest checkpoint stored in `output_dir`. In this case, you should remove `overwrite_output_dir`:

```bash
python examples/pytorch/summarization/run_summarization.py \
    --model_name_or_path t5-small \
    --do_train \
    --do_eval \
    --dataset_name cnn_dailymail \
    --dataset_config "3.0.0" \
    --source_prefix "summarize: " \
    --output_dir /tmp/tst-summarization \
    --per_device_train_batch_size=4 \
    --per_device_eval_batch_size=4 \
    --output_dir previous_output_dir \
    --predict_with_generate
```

The second method uses the `resume_from_checkpoint path_to_specific_checkpoint` argument to resume training from a specific checkpoint folder.

```bash
python examples/pytorch/summarization/run_summarization.py \
    --model_name_or_path t5-small \
    --do_train \
    --do_eval \
    --dataset_name cnn_dailymail \
    --dataset_config "3.0.0" \
    --source_prefix "summarize: " \
    --output_dir /tmp/tst-summarization \
    --per_device_train_batch_size=4 \
    --per_device_eval_batch_size=4 \
    --overwrite_output_dir \
    --resume_from_checkpoint path_to_specific_checkpoint \
    --predict_with_generate
```
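
Checkpoints are saved as numbered `checkpoint-<step>` subfolders of `output_dir`, so you can list them to pick the one to pass to `resume_from_checkpoint`. A small sketch using the `output_dir` from the commands above:

```python
import glob

# Each entry is a full checkpoint folder, e.g. /tmp/tst-summarization/checkpoint-500.
checkpoints = sorted(glob.glob("/tmp/tst-summarization/checkpoint-*"))
print(checkpoints)
```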

## Share your model

All scripts can upload your final model to the [Model Hub](https://huggingface.co/models). Make sure you are logged in to Hugging Face before you begin:

```bash
huggingface-cli login
```

Then add the `push_to_hub` argument to the script. This argument will create a repository with your Hugging Face username and the folder name specified in `output_dir`.

To give your repository a specific name, use the `push_to_hub_model_id` argument. The repository will be automatically listed under your namespace.

The following example shows how to upload a model with a specific repository name:

```bash
python examples/pytorch/summarization/run_summarization.py \
    --model_name_or_path t5-small \
    --do_train \
    --do_eval \
    --dataset_name cnn_dailymail \
    --dataset_config "3.0.0" \
    --source_prefix "summarize: " \
    --push_to_hub \
    --push_to_hub_model_id finetuned-t5-cnn_dailymail \
    --output_dir /tmp/tst-summarization \
    --per_device_train_batch_size=4 \
    --per_device_eval_batch_size=4 \
    --overwrite_output_dir \
    --predict_with_generate
```
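
If you fine-tune outside of the example scripts, the same upload can be done from Python with the `push_to_hub` method that models and tokenizers expose. A small sketch, reusing the repository name from the command above and assuming you are already logged in:

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load (or fine-tune) a model and tokenizer, then push both to the Hub.
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
tokenizer = AutoTokenizer.from_pretrained("t5-small")

model.push_to_hub("finetuned-t5-cnn_dailymail")
tokenizer.push_to_hub("finetuned-t5-cnn_dailymail")
```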

@ -141,7 +141,6 @@ Ahora necesitas una segunda función de preprocesamiento para capturar el texto
>>> def group_texts(examples):
...     concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
...     total_length = len(concatenated_examples[list(examples.keys())[0]])
...     total_length = (total_length // block_size) * block_size
...     result = {
...         k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
...         for k, t in concatenated_examples.items()

@ -11,28 +11,4 @@
    title: Pipeline per l'inferenza
  - local: autoclass_tutorial
    title: Carica istanze pre-allenate con AutoClass
  - local: preprocessing
    title: Preprocess
  - local: training
    title: Fine-tuning di un modello pre-addestrato
  - local: accelerate
    title: Allenamento distribuito con 🤗 Accelerate
  - local: model_sharing
    title: Condividere un modello
  title: Esercitazione
- sections:
  - local: create_a_model
    title: Crea un'architettura personalizzata
  - local: custom_models
    title: Condividere modelli personalizzati
  - local: run_scripts
    title: Addestramento con script
  - local: multilingual
    title: Modelli multilingua per l'inferenza
  - local: converting_tensorflow_models
    title: Convertire modelli tensorflow
  - local: serialization
    title: Esporta modelli Transformers
  - local: debugging
    title: Debugging
  title: Guide pratiche