trigger 5

trigger 4
trigger 3
2025-10-22 02:08:58 +08:00 · 2024-09-01 22:06:35 +02:00 · 2024-09-01 10:15:08 +02:00 · 2024-08-31 22:50:34 +02:00 · 2024-08-31 06:55:32 +02:00 · 2024-08-30 22:05:59 +02:00
346 changed files with 21711 additions and 3812 deletions
--- a/.circleci/create_circleci_config.py
+++ b/.circleci/create_circleci_config.py
@ -121,11 +121,16 @@ class CircleCIJob:
        )

        steps.append({"run": {"name": "Create `test-results` directory", "command": "mkdir test-results"}})
+
+        # Examples special case: we need to download NLTK files in advance to avoid concurrency issues
+        if "examples" in self.name:
+            steps.append({"run": {"name": "Download NLTK files", "command": """python -c "import nltk; nltk.download('punkt', quiet=True)" """}})
+
        test_command = ""
        if self.command_timeout:
            test_command = f"timeout {self.command_timeout} "
        # junit familiy xunit1 is necessary to support splitting on test name or class name with circleci split
-        test_command += f"python3 -m pytest -rsfE -p no:warnings -o junit_family=xunit1 --tb=short --junitxml=test-results/junit.xml -n {self.pytest_num_workers} " + " ".join(pytest_flags)
+        test_command += f"python3 -m pytest -rsfE -p no:warnings --tb=short -o junit_family=xunit1 --junitxml=test-results/junit.xml -n {self.pytest_num_workers} " + " ".join(pytest_flags)

        if self.parallelism == 1:
            if self.tests_to_run is None:
@ -185,10 +190,6 @@ class CircleCIJob:
            steps.append({"store_artifacts": {"path": "tests.txt"}})
            steps.append({"store_artifacts": {"path": "splitted_tests.txt"}})

-            test_command = ""
-            if self.command_timeout:
-                test_command = f"timeout {self.command_timeout} "
-            test_command += f"python3 -m pytest -rsfE -p no:warnings --tb=short  -o junit_family=xunit1 --junitxml=test-results/junit.xml -n {self.pytest_num_workers} " + " ".join(pytest_flags)
            test_command += " $(cat splitted_tests.txt)"
        if self.marker is not None:
            test_command += f" -m {self.marker}"
--- a/.github/workflows/add-model-like.yml
+++ b/.github/workflows/add-model-like.yml
@ -23,7 +23,7 @@ jobs:
          sudo apt -y update && sudo apt install -y libsndfile1-dev

      - name: Load cached virtual environment
-        uses: actions/cache@v2
+        uses: actions/cache@v4
        id: cache
        with:
          path: ~/venv/
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@ -31,12 +31,12 @@ jobs:
        if: github.event_name == 'schedule'
        working-directory: /transformers
        run: |
-          python3 -m pip install optimum-benchmark>=0.2.0
+          python3 -m pip install optimum-benchmark>=0.3.0
          HF_TOKEN=${{ secrets.TRANSFORMERS_BENCHMARK_TOKEN }} python3 benchmark/benchmark.py --repo_id hf-internal-testing/benchmark_results --path_in_repo $(date +'%Y-%m-%d') --config-dir benchmark/config --config-name generation --commit=${{ github.sha }} backend.model=google/gemma-2b backend.cache_implementation=null,static backend.torch_compile=false,true --multirun

      - name: Benchmark (merged to main event)
        if: github.event_name == 'push' && github.ref_name == 'main'
        working-directory: /transformers
        run: |
-          python3 -m pip install optimum-benchmark>=0.2.0
+          python3 -m pip install optimum-benchmark>=0.3.0
          HF_TOKEN=${{ secrets.TRANSFORMERS_BENCHMARK_TOKEN }} python3 benchmark/benchmark.py --repo_id hf-internal-testing/benchmark_results_merge_event --path_in_repo $(date +'%Y-%m-%d') --config-dir benchmark/config --config-name generation --commit=${{ github.sha }} backend.model=google/gemma-2b backend.cache_implementation=null,static backend.torch_compile=false,true --multirun
--- a/.github/workflows/build-ci-docker-images.yml
+++ b/.github/workflows/build-ci-docker-images.yml
@ -74,4 +74,4 @@ jobs:
          slack_channel: "#transformers-ci-circleci-images"
          title: 🤗 New docker images for CircleCI are pushed.
          status: ${{ job.status }}
-          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
--- a/.github/workflows/check_tiny_models.yml
+++ b/.github/workflows/check_tiny_models.yml
@ -23,7 +23,7 @@ jobs:

      - uses: actions/checkout@v4
      - name: Set up Python 3.8
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
        with:
          # Semantic version range syntax or exact version of a Python version
          python-version: '3.8'
--- a/.github/workflows/release-conda.yml
+++ b/.github/workflows/release-conda.yml
@ -19,7 +19,7 @@ jobs:

    steps:
      - name: Checkout repository
-        uses: actions/checkout@v1
+        uses: actions/checkout@v4

      - name: Install miniconda
        uses: conda-incubator/setup-miniconda@v2
--- a/.github/workflows/self-push-amd.yml
+++ b/.github/workflows/self-push-amd.yml
@ -324,6 +324,7 @@ jobs:
        # We pass `needs.setup_gpu.outputs.matrix` as the argument. A processing in `notification_service.py` to change
        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
        run: |
+          pip install huggingface_hub
          pip install slack_sdk
          pip show slack_sdk
          python utils/notification_service.py "${{ needs.setup_gpu.outputs.matrix }}"
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@ -563,6 +563,7 @@ jobs:
        # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
        run: |
-          pip install slack_sdk
+          pip install huggingface_hub
+          pip install slack_sdk 
          pip show slack_sdk
          python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
--- a/.github/workflows/self-scheduled-amd.yml
+++ b/.github/workflows/self-scheduled-amd.yml
@ -506,6 +506,7 @@ jobs:
        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
        run: |
          sudo apt-get install -y curl
+          pip install huggingface_hub
          pip install slack_sdk
          pip show slack_sdk
          python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
--- a/.github/workflows/self-scheduled-caller.yml
+++ b/.github/workflows/self-scheduled-caller.yml
@ -2,12 +2,9 @@ name: Self-hosted runner (scheduled)


 on:
-  repository_dispatch:
-  schedule:
-    - cron: "17 2 * * *"
  push:
    branches:
-      - run_scheduled_ci*
+      - trigger_disable_multi_gpu

 jobs:
  model-ci:
@ -21,58 +18,58 @@ jobs:
      ci_event: Daily CI
    secrets: inherit

-  torch-pipeline:
-    name: Torch pipeline CI
-    uses: ./.github/workflows/self-scheduled.yml
-    with:
-      job: run_pipelines_torch_gpu
-      slack_report_channel: "#transformers-ci-daily-pipeline-torch"
-      runner: daily-ci
-      docker: huggingface/transformers-pytorch-gpu
-      ci_event: Daily CI
-    secrets: inherit
-
-  tf-pipeline:
-    name: TF pipeline CI
-    uses: ./.github/workflows/self-scheduled.yml
-    with:
-      job: run_pipelines_tf_gpu
-      slack_report_channel: "#transformers-ci-daily-pipeline-tf"
-      runner: daily-ci
-      docker: huggingface/transformers-tensorflow-gpu
-      ci_event: Daily CI
-    secrets: inherit
-
-  example-ci:
-    name: Example CI
-    uses: ./.github/workflows/self-scheduled.yml
-    with:
-      job: run_examples_gpu
-      slack_report_channel: "#transformers-ci-daily-examples"
-      runner: daily-ci
-      docker: huggingface/transformers-all-latest-gpu
-      ci_event: Daily CI
-    secrets: inherit
-
-  deepspeed-ci:
-    name: DeepSpeed CI
-    uses: ./.github/workflows/self-scheduled.yml
-    with:
-      job: run_torch_cuda_extensions_gpu
-      slack_report_channel: "#transformers-ci-daily-deepspeed"
-      runner: daily-ci
-      docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
-      ci_event: Daily CI
-      working-directory-prefix: /workspace
-    secrets: inherit
-
-  quantization-ci:
-    name: Quantization CI
-    uses: ./.github/workflows/self-scheduled.yml
-    with:
-      job: run_quantization_torch_gpu
-      slack_report_channel: "#transformers-ci-daily-quantization"
-      runner: daily-ci
-      docker: huggingface/transformers-quantization-latest-gpu
-      ci_event: Daily CI
-    secrets: inherit
+#  torch-pipeline:
+#    name: Torch pipeline CI
+#    uses: ./.github/workflows/self-scheduled.yml
+#    with:
+#      job: run_pipelines_torch_gpu
+#      slack_report_channel: "#transformers-ci-daily-pipeline-torch"
+#      runner: daily-ci
+#      docker: huggingface/transformers-pytorch-gpu
+#      ci_event: Daily CI
+#    secrets: inherit
+#
+#  tf-pipeline:
+#    name: TF pipeline CI
+#    uses: ./.github/workflows/self-scheduled.yml
+#    with:
+#      job: run_pipelines_tf_gpu
+#      slack_report_channel: "#transformers-ci-daily-pipeline-tf"
+#      runner: daily-ci
+#      docker: huggingface/transformers-tensorflow-gpu
+#      ci_event: Daily CI
+#    secrets: inherit
+#
+#  example-ci:
+#    name: Example CI
+#    uses: ./.github/workflows/self-scheduled.yml
+#    with:
+#      job: run_examples_gpu
+#      slack_report_channel: "#transformers-ci-daily-examples"
+#      runner: daily-ci
+#      docker: huggingface/transformers-all-latest-gpu
+#      ci_event: Daily CI
+#    secrets: inherit
+#
+#  deepspeed-ci:
+#    name: DeepSpeed CI
+#    uses: ./.github/workflows/self-scheduled.yml
+#    with:
+#      job: run_torch_cuda_extensions_gpu
+#      slack_report_channel: "#transformers-ci-daily-deepspeed"
+#      runner: daily-ci
+#      docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
+#      ci_event: Daily CI
+#      working-directory-prefix: /workspace
+#    secrets: inherit
+#
+#  quantization-ci:
+#    name: Quantization CI
+#    uses: ./.github/workflows/self-scheduled.yml
+#    with:
+#      job: run_quantization_torch_gpu
+#      slack_report_channel: "#transformers-ci-daily-quantization"
+#      runner: daily-ci
+#      docker: huggingface/transformers-quantization-latest-gpu
+#      ci_event: Daily CI
+#    secrets: inherit
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@ -102,7 +102,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [single-gpu, multi-gpu]
+        machine_type: [single-gpu]
        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
    uses: ./.github/workflows/model_jobs.yml
    with:
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@ -15,7 +15,7 @@ jobs:
    - uses: actions/checkout@v4

    - name: Setup Python
-      uses: actions/setup-python@v4
+      uses: actions/setup-python@v5
      with:
        python-version: 3.8

--- a/README.md
+++ b/README.md
@ -48,6 +48,7 @@ limitations under the License.
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_fr.md">Français</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md">Tiếng Việt</a> |
+	<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
    </p>
 </h4>

--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@ -101,7 +101,7 @@ def summarize(run_dir, metrics, expand_metrics=False):
        # post-processing of report: show a few selected/important metric
        for metric in metrics:
            keys = metric.split(".")
-            value = report
+            value = report.to_dict()
            current = metrics_values
            for key in keys:
                # Avoid KeyError when a user's specified metric has typo.
--- a/docker/consistency.dockerfile
+++ b/docker/consistency.dockerfile
@ -2,13 +2,14 @@ FROM python:3.10-slim
 ENV PYTHONDONTWRITEBYTECODE=1
 USER root
 ARG REF=main
-RUN apt-get update && apt-get install -y time git pkg-config make git-lfs
+RUN apt-get update && apt-get install -y time git g++ pkg-config make git-lfs
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools GitPython
-RUN uv pip install --no-cache-dir --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu
+RUN pip install --no-cache-dir --upgrade 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
 # tensorflow pin matching setup.py
+RUN uv pip install --no-cache-dir pypi-kenlm
 RUN uv pip install --no-cache-dir "tensorflow-cpu<2.16" "tf-keras<2.16"
-RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,quality,torch-speech,vision,testing]"
+RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,quality,testing,torch-speech,vision]"
 RUN git lfs install

 RUN pip uninstall -y transformers
--- a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
@ -22,7 +22,7 @@ RUN apt update && \
    apt clean && \
    rm -rf /var/lib/apt/lists/*

-RUN python3 -m pip install --no-cache-dir --upgrade pip ninja "pydantic<2"
+RUN python3 -m pip install --no-cache-dir --upgrade pip ninja "pydantic>=2.0.0"
 RUN python3 -m pip uninstall -y apex torch torchvision torchaudio
 RUN python3 -m pip install torch==$PYTORCH torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO --index-url https://download.pytorch.org/whl/rocm$ROCM --no-cache-dir

--- a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
@ -42,12 +42,12 @@ RUN python3 -m pip uninstall -y deepspeed
 # This has to be run (again) inside the GPU VMs running the tests.
 # The installation works here, but some tests fail, if we don't pre-build deepspeed again in the VMs running the tests.
 # TODO: Find out why test fail.
-RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install "deepspeed<=0.14.0" --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
+RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1

 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop

 # The base image ships with `pydantic==1.8.2` which is not working - i.e. the next command fails
-RUN python3 -m pip install -U --no-cache-dir "pydantic<2"
+RUN python3 -m pip install -U --no-cache-dir "pydantic>=2.0.0"
 RUN python3 -c "from deepspeed.launcher.runner import main"
--- a/docs/TRANSLATING.md
+++ b/docs/TRANSLATING.md
@ -54,4 +54,4 @@ The fields you should add are `local` (with the name of the file containing the

 Once you have translated the `_toctree.yml` file, you can start translating the [MDX](https://mdxjs.com/) files associated with your docs chapter.

-> 🙋 If you'd like others to help you with the translation, you should [open an issue](https://github.com/huggingface/transformers/issues) and tag @stevhliu and @MKhalusova.
+> 🙋 If you'd like others to help you with the translation, you should [open an issue](https://github.com/huggingface/transformers/issues) and tag @stevhliu.
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@ -120,7 +120,7 @@
  - local: custom_models
    title: Share a custom model
  - local: chat_templating
-    title: Templates for chat models
+    title: Chat templates
  - local: trainer
    title: Trainer
  - local: sagemaker
@ -163,6 +163,8 @@
    title: FBGEMM_FP8
  - local: quantization/optimum
    title: Optimum
+  - local: quantization/torchao
+    title: TorchAO
  - local: quantization/contribute
    title: Contribute new quantization method
  title: Quantization Methods
@ -370,6 +372,8 @@
        title: ESM
      - local: model_doc/falcon
        title: Falcon
+      - local: model_doc/falcon_mamba
+        title: FalconMamba
      - local: model_doc/fastspeech2_conformer
        title: FastSpeech2Conformer
      - local: model_doc/flan-t5
@ -408,6 +412,8 @@
        title: GPTSAN Japanese
      - local: model_doc/gpt-sw3
        title: GPTSw3
+      - local: model_doc/granite
+        title: Granite
      - local: model_doc/herbert
        title: HerBERT
      - local: model_doc/ibert
@ -510,6 +516,8 @@
        title: Qwen2Audio
      - local: model_doc/qwen2_moe
        title: Qwen2MoE
+      - local: model_doc/qwen2_vl
+        title: Qwen2VL
      - local: model_doc/rag
        title: RAG
      - local: model_doc/realm
@ -692,6 +700,8 @@
        title: Bark
      - local: model_doc/clap
        title: CLAP
+      - local: model_doc/dac
+        title: DAC
      - local: model_doc/encodec
        title: EnCodec
      - local: model_doc/hiera
@ -818,7 +828,7 @@
        title: Llava
      - local: model_doc/llava_next
        title: LLaVA-NeXT
-      - local: model_doc/llava-next-video
+      - local: model_doc/llava_next_video
        title: LLaVa-NeXT-Video
      - local: model_doc/lxmert
        title: LXMERT
--- a/docs/source/en/chat_templating.md
+++ b/docs/source/en/chat_templating.md
@ -14,7 +14,7 @@ rendered properly in your Markdown viewer.

 -->

-# Templates for Chat Models
+# Chat Templates

 ## Introduction

@ -235,13 +235,14 @@ The sun.</s>
 From here, just continue training like you would with a standard language modelling task, using the `formatted_chat` column.

 <Tip>
-If you format text with `apply_chat_template(tokenize=False)` and then tokenize it in a separate step, you should set the argument
-`add_special_tokens=False`. If you use `apply_chat_template(tokenize=True)`, you don't need to worry about this!

 By default, some tokenizers add special tokens like `<bos>` and `<eos>` to text they tokenize. Chat templates should 
-always include all of the special tokens they need, and so adding extra special tokens with
-the default `add_special_tokens=True` can result in incorrect or duplicated special tokens, which will hurt model
-performance.
+already include all the special tokens they need, and so additional special tokens will often be incorrect or 
+duplicated, which will hurt model performance.
+
+Therefore, if you format text with `apply_chat_template(tokenize=False)`, you should set the argument
+`add_special_tokens=False` when you tokenize that text later. If you use `apply_chat_template(tokenize=True)`, you don't need to worry about this!
+
 </Tip>

 ## Advanced: Extra inputs to chat templates
@ -325,7 +326,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer

 checkpoint = "NousResearch/Hermes-2-Pro-Llama-3-8B"

-tokenizer = AutoTokenizer.from_pretrained(checkpoint, revision="pr/13")
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
 model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.bfloat16, device_map="auto")
 ```

@ -370,7 +371,7 @@ messages = [
 Now, let's apply the chat template and generate a response:

 ```python
-inputs = tokenizer.apply_chat_template(messages, chat_template="tool_use", tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
+inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
 inputs = {k: v.to(model.device) for k, v in inputs.items()}
 out = model.generate(**inputs, max_new_tokens=128)
 print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):]))
@ -388,29 +389,56 @@ The model has called the function with valid arguments, in the format requested
 inferred that we're most likely referring to the Paris in France, and it remembered that, as the home of SI units,
 the temperature in France should certainly be displayed in Celsius.

-Let's append the model's tool call to the conversation. Note that we generate a random `tool_call_id` here. These IDs
-are not used by all models, but they allow models to issue multiple tool calls at once and keep track of which response
-corresponds to which call. You can generate them any way you like, but they should be unique within each chat.
+<Tip>
+
+The output format above is specific to the `Hermes-2-Pro` model we're using in this example. Other models may emit different
+tool call formats, and you may need to do some manual parsing at this step. For example, `Llama-3.1` models will emit
+slightly different JSON, with `parameters` instead of `arguments`. Regardless of the format the model outputs, you 
+should add the tool call to the conversation in the format below, with `tool_calls`, `function` and `arguments` keys. 
+
+</Tip>
+
+Next, let's append the model's tool call to the conversation.

 ```python
-tool_call_id = "vAHdf3"  # Random ID, should be unique for each tool call
 tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}}
-messages.append({"role": "assistant", "tool_calls": [{"id": tool_call_id, "type": "function", "function": tool_call}]})
+messages.append({"role": "assistant", "tool_calls": [{"type": "function", "function": tool_call}]})
 ```


 Now that we've added the tool call to the conversation, we can call the function and append the result to the
 conversation. Since we're just using a dummy function for this example that always returns 22.0, we can just append 
-that result directly. Again, note the `tool_call_id` - this should match the ID used in the tool call above.
+that result directly.
+
+```python
+messages.append({"role": "tool", "name": "get_current_temperature", "content": "22.0"})
+```
+
+<Tip>
+
+Some model architectures, notably Mistral/Mixtral, also require a `tool_call_id` here, which should be
+9 randomly-generated alphanumeric characters, and assigned to the `id` key of the tool call
+dictionary. The same ID should also be assigned to the `tool_call_id` key of the tool response dictionary below, so 
+that tool calls can be matched to tool responses. So, for Mistral/Mixtral models, the code above would be:
+
+```python
+tool_call_id = "9Ae3bDc2F"  # Random ID, 9 alphanumeric characters
+tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}}
+messages.append({"role": "assistant", "tool_calls": [{"type": "function", "id": tool_call_id, "function": tool_call}]})
+```
+
+and

 ```python
 messages.append({"role": "tool", "tool_call_id": tool_call_id, "name": "get_current_temperature", "content": "22.0"})
 ```

+</Tip>
+
 Finally, let's let the assistant read the function outputs and continue chatting with the user:

 ```python
-inputs = tokenizer.apply_chat_template(messages, chat_template="tool_use", tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
+inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
 inputs = {k: v.to(model.device) for k, v in inputs.items()}
 out = model.generate(**inputs, max_new_tokens=128)
 print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):]))
@ -426,14 +454,6 @@ Although this was a simple demo with dummy tools and a single call, the same tec
 multiple real tools and longer conversations. This can be a powerful way to extend the capabilities of conversational
 agents with real-time information, computational tools like calculators, or access to large databases.

-<Tip>
-Not all of the tool-calling features shown above are used by all models. Some use tool call IDs, others simply use the function name and
-match tool calls to results using the ordering, and there are several models that use neither and only issue one tool 
-call at a time to avoid confusion. If you want your code to be compatible across as many models as possible, we 
-recommend structuring your tools calls like we've shown here, and returning tool results in the order that
-they were issued by the model. The chat templates on each model should handle the rest.
-</Tip>
-
 ### Understanding tool schemas

 Each function you pass to the `tools` argument of `apply_chat_template` is converted into a 
@ -765,14 +785,23 @@ it's time to put an end to them!

 ## Advanced: Template writing tips

-If you're unfamiliar with Jinja, we generally find that the easiest way to write a chat template is to first
-write a short Python script that formats messages the way you want, and then convert that script into a template.
+<Tip>

-Remember that the template handler will receive the conversation history as a variable called `messages`.  
+The easiest way to get started with writing Jinja templates is to take a look at some existing ones. You can use
+`print(tokenizer.chat_template)` for any chat model to see what template it's using. In general, models that support tool use have 
+much more complex templates than other models - so when you're just getting started, they're probably a bad example
+to learn from! You can also take a look at the 
+[Jinja documentation](https://jinja.palletsprojects.com/en/3.1.x/templates/#synopsis) for details
+of general Jinja formatting and syntax.
+
+</Tip>
+
+Jinja templates in `transformers` are identical to Jinja templates elsewhere. The main thing to know is that 
+the conversation history will be accessible inside your template as a variable called `messages`.  
 You will be able to access `messages` in your template just like you can in Python, which means you can loop over 
 it with `{% for message in messages %}` or access individual messages with `{{ messages[0] }}`, for example.

-You can also use the following tips to convert your code to Jinja:
+You can also use the following tips to write clean, efficient Jinja templates:

 ### Trimming whitespace

@ -797,46 +826,35 @@ rather than like this:
 Adding `-` will strip any whitespace that comes before the block. The second example looks innocent, but the newline
 and indentation may end up being included in the output, which is probably not what you want!

-### For loops
-
-For loops in Jinja look like this:
-
-```
-{%- for message in messages %}
-    {{- message['content'] }}
-{%- endfor %}
-```
-
-Note that whatever's inside the {{ expression block }} will be printed to the output. You can use operators like
-`+` to combine strings inside expression blocks.
-
-### If statements
-
-If statements in Jinja look like this:
-
-```
-{%- if message['role'] == 'user' %}
-    {{- message['content'] }}
-{%- endif %}
-```
-
-Note how where Python uses whitespace to mark the beginnings and ends of `for` and `if` blocks, Jinja requires you
-to explicitly end them with `{% endfor %}` and `{% endif %}`.
-
 ### Special variables

-Inside your template, you will have access to the list of `messages`, but you can also access several other special
-variables. These include special tokens like `bos_token` and `eos_token`, as well as the `add_generation_prompt`
-variable that we discussed above. You can also use the `loop` variable to access information about the current loop
-iteration, for example  using `{% if loop.last %}` to check if the current message is the last message in the 
-conversation. Here's an example that puts these ideas together to add a generation prompt at the end of the
-conversation if add_generation_prompt is `True`:
+Inside your template, you will have access to several special variables. The most important of these is `messages`, 
+which contains the chat history as a list of message dicts. However, there are several others. Not every
+variable will be used in every template. The most common other variables are:

-```
-{%- if loop.last and add_generation_prompt %}
-    {{- bos_token + 'Assistant:\n' }}
-{%- endif %}
-```
+- `tools` contains a list of tools in JSON schema format. Will be `None` or undefined if no tools are passed.
+- `documents` contains a list of documents in the format `{"title": "Title", "contents": "Contents"}`, used for retrieval-augmented generation. Will be `None` or undefined if no documents are passed.
+- `add_generation_prompt` is a bool that is `True` if the user has requested a generation prompt, and `False` otherwise. If this is set, your template should add the header for an assistant message to the end of the conversation. If your model doesn't have a specific header for assistant messages, you can ignore this flag.
+- **Special tokens** like `bos_token` and `eos_token`. These are extracted from `tokenizer.special_tokens_map`. The exact tokens available inside each template will differ depending on the parent tokenizer.
+
+<Tip>
+
+You can actually pass any `kwarg` to `apply_chat_template`, and it will be accessible inside the template as a variable. In general,
+we recommend trying to stick to the core variables above, as it will make your model harder to use if users have
+to write custom code to pass model-specific `kwargs`. However, we're aware that this field moves quickly, so if you
+have a new use-case that doesn't fit in the core API, feel free to use a new `kwarg` for it! If a new `kwarg`
+becomes common we may promote it into the core API and create a standard, documented format for it.
+
+</Tip>
+
+### Callable functions
+
+There is also a short list of callable functions available to you inside your templates. These are:
+
+- `raise_exception(msg)`: Raises a `TemplateException`. This is useful for debugging, and for telling users when they're
+doing something that your template doesn't support.
+- `strftime_now(format_str)`: Equivalent to `datetime.now().strftime(format_str)` in Python. This is used for getting
+the current date/time in a specific format, which is sometimes included in system messages.

 ### Compatibility with non-Python Jinja

@ -855,4 +873,25 @@ all implementations of Jinja:
  in the Jinja documentation for more.
 - Replace `True`, `False` and `None`, which are Python-specific, with `true`, `false` and `none`.
 - Directly rendering a dict or list may give different results in other implementations (for example, string entries
-  might change from single-quoted to double-quoted). Adding the `tojson` filter can help to ensure consistency here.
+  might change from single-quoted to double-quoted). Adding the `tojson` filter can help to ensure consistency here.
+
+### Writing and debugging larger templates
+
+When this feature was introduced, most templates were quite small, the Jinja equivalent of a "one-liner" script. 
+However, with new models and features like tool-use and RAG, some templates can be 100 lines long or more. When
+writing templates like these, it's a good idea to write them in a separate file, using a text editor. You can easily 
+extract a chat template to a file:
+
+```python
+open("template.jinja", "w").write(tokenizer.chat_template)
+```
+
+Or load the edited template back into the tokenizer:
+
+```python
+tokenizer.chat_template = open("template.jinja").read()
+```
+
+As an added bonus, when you write a long, multi-line template in a separate file, line numbers in that file will
+exactly correspond to line numbers in template parsing or execution errors. This will make it much easier to
+identify the source of issues.
--- a/docs/source/en/custom_models.md
+++ b/docs/source/en/custom_models.md
@ -185,7 +185,7 @@ class ResnetModelForImageClassification(PreTrainedModel):
    def forward(self, tensor, labels=None):
        logits = self.model(tensor)
        if labels is not None:
-            loss = torch.nn.cross_entropy(logits, labels)
+            loss = torch.nn.functional.cross_entropy(logits, labels)
            return {"loss": loss, "logits": logits}
        return {"logits": logits}
 ```
--- a/docs/source/en/index.md
+++ b/docs/source/en/index.md
@ -105,6 +105,7 @@ Flax), PyTorch, and/or TensorFlow.
 |                       [CPM-Ant](model_doc/cpmant)                        |       ✅        |         ❌         |      ❌      |
 |                          [CTRL](model_doc/ctrl)                          |       ✅        |         ✅         |      ❌      |
 |                           [CvT](model_doc/cvt)                           |       ✅        |         ✅         |      ❌      |
+|                           [DAC](model_doc/dac)                           |       ✅        |         ❌         |      ❌      |
 |                   [Data2VecAudio](model_doc/data2vec)                    |       ✅        |         ❌         |      ❌      |
 |                    [Data2VecText](model_doc/data2vec)                    |       ✅        |         ❌         |      ❌      |
 |                   [Data2VecVision](model_doc/data2vec)                   |       ✅        |         ✅         |      ❌      |
@ -120,7 +121,7 @@ Flax), PyTorch, and/or TensorFlow.
 |                          [DETR](model_doc/detr)                          |       ✅        |         ❌         |      ❌      |
 |                      [DialoGPT](model_doc/dialogpt)                      |       ✅        |         ✅         |      ✅      |
 |                         [DiNAT](model_doc/dinat)                         |       ✅        |         ❌         |      ❌      |
-|                        [DINOv2](model_doc/dinov2)                        |       ✅        |         ❌         |      ❌      |
+|                        [DINOv2](model_doc/dinov2)                        |       ✅        |         ❌         |      ✅      |
 |                    [DistilBERT](model_doc/distilbert)                    |       ✅        |         ✅         |      ✅      |
 |                           [DiT](model_doc/dit)                           |       ✅        |         ❌         |      ✅      |
 |                       [DonutSwin](model_doc/donut)                       |       ✅        |         ❌         |      ❌      |
@ -136,6 +137,7 @@ Flax), PyTorch, and/or TensorFlow.
 |                           [ESM](model_doc/esm)                           |       ✅        |         ✅         |      ❌      |
 |              [FairSeq Machine-Translation](model_doc/fsmt)               |       ✅        |         ❌         |      ❌      |
 |                        [Falcon](model_doc/falcon)                        |       ✅        |         ❌         |      ❌      |
+|                  [FalconMamba](model_doc/falcon_mamba)                   |       ✅        |         ❌         |      ❌      |
 |         [FastSpeech2Conformer](model_doc/fastspeech2_conformer)          |       ✅        |         ❌         |      ❌      |
 |                       [FLAN-T5](model_doc/flan-t5)                       |       ✅        |         ✅         |      ✅      |
 |                      [FLAN-UL2](model_doc/flan-ul2)                      |       ✅        |         ✅         |      ✅      |
@ -156,6 +158,7 @@ Flax), PyTorch, and/or TensorFlow.
 |                       [GPT-Sw3](model_doc/gpt-sw3)                       |       ✅        |         ✅         |      ✅      |
 |                   [GPTBigCode](model_doc/gpt_bigcode)                    |       ✅        |         ❌         |      ❌      |
 |               [GPTSAN-japanese](model_doc/gptsan-japanese)               |       ✅        |         ❌         |      ❌      |
+|                       [Granite](model_doc/granite)                       |       ✅        |         ❌         |      ❌      |
 |                    [Graphormer](model_doc/graphormer)                    |       ✅        |         ❌         |      ❌      |
 |                [Grounding DINO](model_doc/grounding-dino)                |       ✅        |         ❌         |      ❌      |
 |                      [GroupViT](model_doc/groupvit)                      |       ✅        |         ✅         |      ❌      |
@ -185,7 +188,7 @@ Flax), PyTorch, and/or TensorFlow.
 |                        [Llama3](model_doc/llama3)                        |       ✅        |         ❌         |      ✅      |
 |                         [LLaVa](model_doc/llava)                         |       ✅        |         ❌         |      ❌      |
 |                    [LLaVA-NeXT](model_doc/llava_next)                    |       ✅        |         ❌         |      ❌      |
-|              [LLaVa-NeXT-Video](model_doc/llava-next-video)              |       ✅        |         ❌         |      ❌      |
+|              [LLaVa-NeXT-Video](model_doc/llava_next_video)              |       ✅        |         ❌         |      ❌      |
 |                    [Longformer](model_doc/longformer)                    |       ✅        |         ✅         |      ❌      |
 |                        [LongT5](model_doc/longt5)                        |       ✅        |         ❌         |      ✅      |
 |                          [LUKE](model_doc/luke)                          |       ✅        |         ❌         |      ❌      |
@ -258,6 +261,7 @@ Flax), PyTorch, and/or TensorFlow.
 |                         [Qwen2](model_doc/qwen2)                         |       ✅        |         ❌         |      ❌      |
 |                   [Qwen2Audio](model_doc/qwen2_audio)                    |       ✅        |         ❌         |      ❌      |
 |                     [Qwen2MoE](model_doc/qwen2_moe)                      |       ✅        |         ❌         |      ❌      |
+|                      [Qwen2VL](model_doc/qwen2_vl)                       |       ✅        |         ❌         |      ❌      |
 |                           [RAG](model_doc/rag)                           |       ✅        |         ✅         |      ❌      |
 |                         [REALM](model_doc/realm)                         |       ✅        |         ❌         |      ❌      |
 |               [RecurrentGemma](model_doc/recurrent_gemma)                |       ✅        |         ❌         |      ❌      |
--- a/docs/source/en/internal/generation_utils.md
+++ b/docs/source/en/internal/generation_utils.md
@ -140,9 +140,6 @@ generation.
 [[autodoc]] ForcedEOSTokenLogitsProcessor
    - __call__

-[[autodoc]] ForceTokensLogitsProcessor
-    - __call__
-
 [[autodoc]] HammingDiversityLogitsProcessor
    - __call__

@ -158,9 +155,6 @@ generation.
 [[autodoc]] LogitsProcessorList
    - __call__

-[[autodoc]] LogitsWarper
-    - __call__
-
 [[autodoc]] MinLengthLogitsProcessor
    - __call__

@ -421,4 +415,3 @@ A [`Constraint`] can be used to force the generation to include specific tokens

 [[autodoc]] WatermarkDetector
    - __call__
-
--- a/docs/source/en/llm_optims.md
+++ b/docs/source/en/llm_optims.md
@ -99,7 +99,7 @@ model.generation_config.max_new_tokens = 16

 past_key_values = StaticCache(
    config=model.config,
-    max_batch_size=1,
+    batch_size=1,
    # If you plan to reuse the cache, make sure the cache length is large enough for all cases
    max_cache_len=prompt_length+(model.generation_config.max_new_tokens*2),
    device=model.device,
@ -161,7 +161,7 @@ There are a few important things you must do to enable static kv-cache and `torc
 batch_size, seq_length = inputs["input_ids"].shape
 with torch.no_grad():
    past_key_values = StaticCache(
-        config=model.config, max_batch_size=2, max_cache_len=4096, device=torch_device, dtype=model.dtype
+        config=model.config, batch_size=2, max_cache_len=4096, device=torch_device, dtype=model.dtype
    )
    cache_position = torch.arange(seq_length, device=torch_device)
    generated_ids = torch.zeros(
--- a/docs/source/en/llm_tutorial.md
+++ b/docs/source/en/llm_tutorial.md
@ -267,5 +267,6 @@ While the autoregressive generation process is relatively straightforward, makin

 1. [`optimum`](https://github.com/huggingface/optimum), an extension of 🤗 Transformers that optimizes for specific hardware devices.
 2. [`outlines`](https://github.com/outlines-dev/outlines), a library where you can constrain text generation (e.g. to generate JSON files);
-3. [`text-generation-inference`](https://github.com/huggingface/text-generation-inference), a production-ready server for LLMs;
-4. [`text-generation-webui`](https://github.com/oobabooga/text-generation-webui), a UI for text generation;
+3. [`SynCode`](https://github.com/uiuc-focal-lab/syncode), a library for context-free grammar guided generation (e.g. JSON, SQL, Python);
+4. [`text-generation-inference`](https://github.com/huggingface/text-generation-inference), a production-ready server for LLMs;
+5. [`text-generation-webui`](https://github.com/oobabooga/text-generation-webui), a UI for text generation;
--- a/docs/source/en/main_classes/quantization.md
+++ b/docs/source/en/main_classes/quantization.md
@ -61,3 +61,7 @@ Learn how to quantize models in the [Quantization](../quantization) guide.

 [[autodoc]] FbgemmFp8Config

+## TorchAoConfig
+
+[[autodoc]] TorchAoConfig
+
--- a/docs/source/en/model_doc/blip-2.md
+++ b/docs/source/en/model_doc/blip-2.md
@ -87,4 +87,17 @@ If you're interested in submitting a resource to be included here, please feel f

 [[autodoc]] Blip2ForConditionalGeneration
    - forward
-    - generate
+    - generate
+
+## Blip2ForImageTextRetrieval
+
+[[autodoc]] Blip2ForImageTextRetrieval
+    - forward
+
+## Blip2TextModelWithProjection
+
+[[autodoc]] Blip2TextModelWithProjection
+
+## Blip2VisionModelWithProjection
+
+[[autodoc]] Blip2VisionModelWithProjection
--- a/docs/source/en/model_doc/dac.md
+++ b/docs/source/en/model_doc/dac.md
@ -0,0 +1,80 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# DAC
+
+## Overview
+
+
+The DAC model was proposed in [Descript Audio Codec: High-Fidelity Audio Compression with Improved RVQGAN](https://arxiv.org/abs/2306.06546) by Rithesh Kumar, Prem Seetharaman, Alejandro Luebs, Ishaan Kumar, Kundan Kumar.
+
+The Descript Audio Codec (DAC) model is a powerful tool for compressing audio data, making it highly efficient for storage and transmission. By compressing 44.1 KHz audio into tokens at just 8kbps bandwidth, the DAC model enables high-quality audio processing while significantly reducing the data footprint. This is particularly useful in scenarios where bandwidth is limited or storage space is at a premium, such as in streaming applications, remote conferencing, and archiving large audio datasets.
+
+The abstract from the paper is the following:
+
+*Language models have been successfully used to model natural signals, such as images, speech, and music. A key component of these models is a high quality neural compression model that can compress high-dimensional natural signals into lower dimensional discrete tokens. To that end, we introduce a high-fidelity universal neural audio compression algorithm that achieves ~90x compression of 44.1 KHz audio into tokens at just 8kbps bandwidth. We achieve this by combining advances in high-fidelity audio generation with better vector quantization techniques from the image domain, along with improved adversarial and reconstruction losses. We compress all domains (speech, environment, music, etc.) with a single universal model, making it widely applicable to generative modeling of all audio. We compare with competing audio compression algorithms, and find our method outperforms them significantly. We provide thorough ablations for every design choice, as well as open-source code and trained model weights. We hope our work can lay the foundation for the next generation of high-fidelity audio modeling.*
+
+This model was contributed by [Kamil Akesbi](https://huggingface.co/kamilakesbi).
+The original code can be found [here](https://github.com/descriptinc/descript-audio-codec/tree/main?tab=readme-ov-file).
+
+
+## Model structure
+
+The Descript Audio Codec (DAC) model is structured into three distinct stages:
+
+1. Encoder Model: This stage compresses the input audio, reducing its size while retaining essential information.
+2. Residual Vector Quantizer (RVQ) Model: Working in tandem with the encoder, this model quantizes the latent codes of the audio, refining the compression and ensuring high-quality reconstruction.
+3. Decoder Model: This final stage reconstructs the audio from its compressed form, restoring it to a state that closely resembles the original input.
+
+## Usage example 
+
+Here is a quick example of how to encode and decode audio using this model:
+
+```python 
+>>> from datasets import load_dataset, Audio
+>>> from transformers import DacModel, AutoProcessor
+>>> librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+
+>>> model = DacModel.from_pretrained("descript/dac_16khz")
+>>> processor = AutoProcessor.from_pretrained("descript/dac_16khz")
+>>> librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=processor.sampling_rate))
+>>> audio_sample = librispeech_dummy[-1]["audio"]["array"]
+>>> inputs = processor(raw_audio=audio_sample, sampling_rate=processor.sampling_rate, return_tensors="pt")
+
+>>> encoder_outputs = model.encode(inputs["input_values"])
+>>> # Get the intermediate audio codes
+>>> audio_codes = encoder_outputs.audio_codes
+>>> # Reconstruct the audio from its quantized representation
+>>> audio_values = model.decode(encoder_outputs.quantized_representation)
+>>> # or the equivalent with a forward pass
+>>> audio_values = model(inputs["input_values"]).audio_values
+```
+
+## DacConfig
+
+[[autodoc]] DacConfig
+
+## DacFeatureExtractor
+
+[[autodoc]] DacFeatureExtractor
+    - __call__
+
+## DacModel
+
+[[autodoc]] DacModel
+    - decode
+    - encode
+    - forward
--- a/docs/source/en/model_doc/dinov2.md
+++ b/docs/source/en/model_doc/dinov2.md
@ -72,6 +72,9 @@ If you're interested in submitting a resource to be included here, please feel f

 [[autodoc]] Dinov2Config

+<frameworkcontent>
+<pt>
+
 ## Dinov2Model

 [[autodoc]] Dinov2Model
@ -81,3 +84,20 @@ If you're interested in submitting a resource to be included here, please feel f

 [[autodoc]] Dinov2ForImageClassification
    - forward
+
+</pt>
+<jax>
+
+## FlaxDinov2Model
+
+[[autodoc]] FlaxDinov2Model
+    - __call__
+
+
+## FlaxDinov2ForImageClassification
+
+[[autodoc]] FlaxDinov2ForImageClassification
+    - __call__
+
+</jax>
+</frameworkcontent>
--- a/docs/source/en/model_doc/falcon_mamba.md
+++ b/docs/source/en/model_doc/falcon_mamba.md
@ -0,0 +1,116 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# FalconMamba
+
+## Overview
+
+The FalconMamba model was proposed by TII UAE (Technology Innovation Institute) in their release.
+
+The abstract from the paper is the following:
+
+*We present FalconMamba, a new base large language model based on the novel Mamba architecture. FalconMamba is trained on 5.8 trillion tokens with carefully selected data mixtures. As a pure Mamba-based model, FalconMamba surpasses leading open-weight models based on Transformers, such as Mistral 7B, Llama3 8B, and Falcon2 11B. It is on par with Gemma 7B and outperforms models with different architecture designs, such as RecurrentGemma 9B. Currently, FalconMamba is the best-performing Mamba model in the literature at this scale, surpassing both existing Mamba and hybrid Mamba-Transformer models.
+Due to its architecture, FalconMamba is significantly faster at inference and requires substantially less memory for long sequence generation. Despite recent studies suggesting that hybrid Mamba-Transformer models outperform pure architecture designs, we argue and demonstrate that the pure Mamba design can achieve similar, even superior results compared to the hybrid design. We make the weights of our implementation of FalconMamba publicly available under a permissive license.*
+
+Tips:
+
+- FalconMamba is mostly based on the Mamba architecture, so the same [tips and best practices](./mamba) are relevant here.
+
+The model has been trained on approximately 6T tokens consisting of a mixture of many data sources such as RefineWeb, Cosmopedia and Math data.
+
+For more details about the training procedure and the architecture, have a look at [the technical paper of FalconMamba]() (coming soon).
+
+## Usage
+
+Below we demonstrate how to use the model:
+
+```python 
+from transformers import FalconMambaForCausalLM, AutoTokenizer
+import torch
+
+tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-mamba-7b")
+model = FalconMambaForCausalLM.from_pretrained("tiiuae/falcon-mamba-7b")
+
+input_ids = tokenizer("Hey how are you doing?", return_tensors= "pt")["input_ids"]
+
+out = model.generate(input_ids, max_new_tokens=10)
+print(tokenizer.batch_decode(out))
+```
+
+The architecture is also compatible with `torch.compile` for faster generation:
+
+```python 
+from transformers import FalconMambaForCausalLM, AutoTokenizer
+import torch
+
+tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-mamba-7b")
+model = FalconMambaForCausalLM.from_pretrained("tiiuae/falcon-mamba-7b", torch_dtype=torch.bfloat16).to(0)
+model = torch.compile(model)
+
+input_ids = tokenizer("Hey how are you doing?", return_tensors= "pt")["input_ids"]
+
+out = model.generate(input_ids, max_new_tokens=10)
+print(tokenizer.batch_decode(out))
+```
+
+If you have access to a GPU that is compatible with `bitsandbytes`, you can also quantize the model in 4-bit precision:
+
+```python 
+from transformers import FalconMambaForCausalLM, AutoTokenizer, BitsAndBytesConfig
+import torch
+
+tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-mamba-7b")
+quantization_config = BitsAndBytesConfig(load_in_4bit=True)
+model = FalconMambaForCausalLM.from_pretrained("tiiuae/falcon-mamba-7b", quantization_config=quantization_config)
+
+input_ids = tokenizer("Hey how are you doing?", return_tensors= "pt")["input_ids"]
+
+out = model.generate(input_ids, max_new_tokens=10)
+print(tokenizer.batch_decode(out))
+```
+
+You can also play with the instruction fine-tuned model:
+
+```python 
+from transformers import FalconMambaForCausalLM, AutoTokenizer
+import torch
+
+tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-mamba-7b-instruct")
+model = FalconMambaForCausalLM.from_pretrained("tiiuae/falcon-mamba-7b-instruct")
+
+# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
+messages = [
+    {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
+]
+input_ids = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True).input_ids
+
+outputs = model.generate(input_ids)
+print(tokenizer.decode(outputs[0]))
+```
+
+## FalconMambaConfig
+
+[[autodoc]] FalconMambaConfig
+
+## FalconMambaModel
+
+[[autodoc]] FalconMambaModel
+    - forward
+
+## FalconMambaForCausalLM
+
+[[autodoc]] FalconMambaForCausalLM
+    - forward
--- a/docs/source/en/model_doc/granite.md
+++ b/docs/source/en/model_doc/granite.md
@ -0,0 +1,74 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Granite
+
+## Overview
+
+The Granite model was proposed in [Power Scheduler: A Batch Size and Token Number Agnostic Learning Rate Scheduler](https://arxiv.org/abs/2408.13359) by Yikang Shen, Matthew Stallone, Mayank Mishra, Gaoyuan Zhang, Shawn Tan, Aditya Prasad, Adriana Meza Soria, David D. Cox and Rameswar Panda.
+
+PowerLM-3B is a 3B state-of-the-art small language model trained with the Power learning rate scheduler. It is trained on a wide range of open-source and synthetic datasets with permissive licenses. PowerLM-3B has shown promising results compared to other models in the size categories across various benchmarks, including natural language multi-choices, code generation, and math reasoning.
+
+The abstract from the paper is the following:
+
+*Finding the optimal learning rate for language model pretraining is a challenging task.
+This is not only because there is a complicated correlation between learning rate, batch size, number of training tokens, model size, and other hyperparameters but also because it is prohibitively expensive to perform a hyperparameter search for large language models with Billions or Trillions of parameters. Recent studies propose using small proxy models and small corpus to perform hyperparameter searches and transposing the optimal parameters to large models and large corpus. While the zero-shot transferability is theoretically and empirically proven for model size related hyperparameters, like depth and width, the zero-shot transfer from small corpus to large corpus is underexplored.
+In this paper, we study the correlation between optimal learning rate, batch size, and number of training tokens for the recently proposed WSD scheduler. After thousands of small experiments, we found a power-law relationship between variables and demonstrated its transferability across model sizes. Based on the observation, we propose a new learning rate scheduler, Power scheduler, that is agnostic about the number of training tokens and batch size. The experiment shows that combining the Power scheduler with Maximum Update Parameterization (\mup) can consistently achieve impressive performance with one set of hyperparameters regardless of the number of training tokens, batch size, model size, and even model architecture. Our 3B dense and MoE models trained with the Power scheduler achieve comparable performance as state-of-the-art small language models.
+We [open source](https://huggingface.co/collections/ibm/power-lm-66be64ae647ddf11b9808000) these pretrained models.*
+
+Tips:
+
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_path = "ibm/PowerLM-3b"
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+# drop device_map if running on CPU
+model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
+model.eval()
+
+# change input text as desired
+prompt = "Write a code to find the maximum value in a list of numbers."
+
+# tokenize the text
+input_tokens = tokenizer(prompt, return_tensors="pt")
+# generate output tokens
+output = model.generate(**input_tokens, max_new_tokens=100)
+# decode output tokens into text
+output = tokenizer.batch_decode(output)
+# loop over the batch to print, in this example the batch size is 1
+for i in output:
+    print(i)
+```
+
+This model was contributed by [mayank-mishra](https://huggingface.co/mayank-mishra).
+
+
+## GraniteConfig
+
+[[autodoc]] GraniteConfig
+
+## GraniteModel
+
+[[autodoc]] GraniteModel
+    - forward
+
+## GraniteForCausalLM
+
+[[autodoc]] GraniteForCausalLM
+    - forward
--- a/docs/source/en/model_doc/llava_next_video.md
+++ b/docs/source/en/model_doc/llava_next_video.md
--- a/docs/source/en/model_doc/mamba2.md
+++ b/docs/source/en/model_doc/mamba2.md
@ -39,11 +39,11 @@ The original code can be found [here](https://github.com/state-spaces/mamba).

 ### A simple generation example: 
 ```python 
-from transformers import MambaConfig, MambaForCausalLM, AutoTokenizer
+from transformers import Mamba2Config, Mamba2ForCausalLM, AutoTokenizer
 import torch
 model_id = 'mistralai/Mamba-Codestral-7B-v0.1'
 tokenizer = AutoTokenizer.from_pretrained(model_id, revision='refs/pr/9', from_slow=True, legacy=False)
-model = MambaForCausalLM.from_pretrained(model_id, revision='refs/pr/9')
+model = Mamba2ForCausalLM.from_pretrained(model_id, revision='refs/pr/9')
 input_ids = tokenizer("Hey how are you doing?", return_tensors= "pt")["input_ids"]

 out = model.generate(input_ids, max_new_tokens=10)
--- a/docs/source/en/model_doc/qwen2_vl.md
+++ b/docs/source/en/model_doc/qwen2_vl.md
@ -0,0 +1,329 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Qwen2_VL
+
+
+## Overview
+
+[Qwen2_VL](https://qwenlm.github.io/blog/qwen2-vl/) is a major update to the [Qwen-VL](https://arxiv.org/pdf/2308.12966) model from the Qwen team.
+
+The abstract from the blog is the following:
+
+*This blog introduces Qwen2-VL, an advanced version of the Qwen-VL model that has undergone significant enhancements over the past year. Key improvements include enhanced image comprehension, advanced video understanding, integrated visual agent functionality, and expanded multilingual support. The model architecture has been optimized for handling arbitrary image resolutions through Naive Dynamic Resolution support and utilizes Multimodal Rotary Position Embedding (M-ROPE) to effectively process both 1D textual and multi-dimensional visual data. This updated model demonstrates competitive performance against leading AI systems like GPT-4o and Claude 3.5 Sonnet in vision-related tasks and ranks highly among open-source models in text capabilities. These advancements make Qwen2-VL a versatile tool for various applications requiring robust multimodal processing and reasoning abilities.*
+
+
+## Usage example
+
+### Single Media inference
+
+The model can accept both images and videos as input. Here's an example code for inference.
+
+```python
+
+from PIL import Image
+import requests
+import torch
+from torchvision import io
+from typing import Dict
+from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
+
+# Load the model on the available device(s); pass `torch_dtype` explicitly to load it in half-precision
+model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", device_map="auto")
+processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
+
+# Image
+url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+conversation = [
+    {
+        "role":"user",
+        "content":[
+            {
+                "type":"image",
+            },
+            {
+                "type":"text",
+                "text":"Describe this image."
+            }
+        ]
+    }
+]
+
+
+# Preprocess the inputs
+text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+# Expected output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n'
+
+inputs = processor(text=[text_prompt], images=[image], padding=True, return_tensors="pt")
+inputs = inputs.to('cuda')
+
+# Inference: Generation of the output
+output_ids = model.generate(**inputs, max_new_tokens=128)
+generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
+output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+print(output_text)
+
+
+
+# Video
+def fetch_video(ele: Dict, nframe_factor=2):
+    if isinstance(ele['video'], str):
+        def round_by_factor(number: int, factor: int) -> int:
+            return round(number / factor) * factor
+
+        video = ele["video"]
+        if video.startswith("file://"):
+            video = video[7:]
+
+        video, _, info = io.read_video(
+            video,
+            start_pts=ele.get("video_start", 0.0),
+            end_pts=ele.get("video_end", None),
+            pts_unit="sec",
+            output_format="TCHW",
+        )
+        assert not ("fps" in ele and "nframes" in ele), "Only accept either `fps` or `nframes`"
+        if "nframes" in ele:
+            nframes = round_by_factor(ele["nframes"], nframe_factor)
+        else:
+            fps = ele.get("fps", 1.0)
+            nframes = round_by_factor(video.size(0) / info["video_fps"] * fps, nframe_factor)
+        idx = torch.linspace(0, video.size(0) - 1, nframes, dtype=torch.int64)
+        return video[idx]
+
+video_info = {"type": "video", "video": "/path/to/video.mp4", "fps": 1.0}
+video = fetch_video(video_info)
+conversation = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "video"},
+            {"type": "text", "text": "What happened in the video?"},
+        ],
+    }
+]
+
+# Preprocess the inputs
+text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+# Expected output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>What happened in the video?<|im_end|>\n<|im_start|>assistant\n'
+
+inputs = processor(text=[text_prompt], videos=[video], padding=True, return_tensors="pt")
+inputs = inputs.to('cuda')
+
+# Inference: Generation of the output
+output_ids = model.generate(**inputs, max_new_tokens=128)
+generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
+output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+print(output_text)
+
+```
+
+
+### Batch Mixed Media Inference
+
+The model can batch inputs composed of mixed samples of various types such as images, videos, and text. Here is an example.
+
+```python
+
+image1 = Image.open("/path/to/image1.jpg")
+image2 = Image.open("/path/to/image2.jpg")
+image3 = Image.open("/path/to/image3.jpg")
+image4 = Image.open("/path/to/image4.jpg")
+image5 = Image.open("/path/to/image5.jpg")
+video = fetch_video({
+    "type": "video",
+    "video": "/path/to/video.mp4",
+    "fps": 1.0
+})
+
+# Conversation for the first image
+conversation1 = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image"},
+            {"type": "text", "text": "Describe this image."}
+        ]
+    }
+]
+
+# Conversation with two images
+conversation2 = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image"},
+            {"type": "image"},
+            {"type": "text", "text": "What is written in the pictures?"}
+        ]
+    }
+]
+
+# Conversation with pure text
+conversation3 = [
+    {
+        "role": "user",
+        "content": "who are you?"
+    }
+]
+
+
+# Conversation with mixed media
+conversation4 = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image"},
+            {"type": "image"},
+            {"type": "video"},
+            {"type": "text", "text": "What are the common elements in these medias?"},
+        ],
+    }
+]
+
+conversations = [conversation1, conversation2, conversation3, conversation4]
+# Preparation for batch inference
+texts = [processor.apply_chat_template(msg, add_generation_prompt=True) for msg in conversations]
+inputs = processor(
+    text=texts,
+    images=[image1, image2, image3, image4, image5],
+    videos=[video],
+    padding=True,
+    return_tensors="pt",
+)
+inputs = inputs.to('cuda')
+
+# Batch Inference
+output_ids = model.generate(**inputs, max_new_tokens=128)
+generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
+output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+print(output_text)
+```
+
+### Usage Tips
+
+#### Image Resolution for performance boost
+
+The model supports a wide range of resolution inputs. By default, it uses the native resolution for input, but higher resolutions can enhance performance at the cost of more computation. Users can set the minimum and maximum number of pixels to achieve an optimal configuration for their needs.
+
+```python
+
+min_pixels = 224*224
+max_pixels = 2048*2048
+processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
+
+```
+
+
+
+#### Multiple Image Inputs
+
+By default, images and video content are directly included in the conversation. When handling multiple images, it's helpful to add labels to the images and videos for better reference. Users can control this behavior with the following settings:
+
+
+
+```python
+
+conversation = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image"}, 
+            {"type": "text", "text": "Hello, how are you?"}
+        ]
+    },
+    {
+        "role": "assistant",
+        "content": "I'm doing well, thank you for asking. How can I assist you today?"
+    },
+    {
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "Can you describe these images and video?"}, 
+            {"type": "image"}, 
+            {"type": "image"}, 
+            {"type": "video"}, 
+            {"type": "text", "text": "These are from my vacation."}
+        ]
+    },
+    {
+        "role": "assistant",
+        "content": "I'd be happy to describe the images and video for you. Could you please provide more context about your vacation?"
+    },
+    {
+        "role": "user",
+        "content": "It was a trip to the mountains. Can you see the details in the images and video?"
+    }
+]
+
+# default:
+prompt_without_id = processor.apply_chat_template(conversation, add_generation_prompt=True)
+# Expected output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Hello, how are you?<|im_end|>\n<|im_start|>assistant\nI'm doing well, thank you for asking. How can I assist you today?<|im_end|>\n<|im_start|>user\nCan you describe these images and video?<|vision_start|><|image_pad|><|vision_end|><|vision_start|><|image_pad|><|vision_end|><|vision_start|><|video_pad|><|vision_end|>These are from my vacation.<|im_end|>\n<|im_start|>assistant\nI'd be happy to describe the images and video for you. Could you please provide more context about your vacation?<|im_end|>\n<|im_start|>user\nIt was a trip to the mountains. Can you see the details in the images and video?<|im_end|>\n<|im_start|>assistant\n'
+
+
+# add ids
+prompt_with_id = processor.apply_chat_template(conversation, add_generation_prompt=True, add_vision_id=True)
+# Expected output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nPicture 1: <|vision_start|><|image_pad|><|vision_end|>Hello, how are you?<|im_end|>\n<|im_start|>assistant\nI'm doing well, thank you for asking. How can I assist you today?<|im_end|>\n<|im_start|>user\nCan you describe these images and video?Picture 2: <|vision_start|><|image_pad|><|vision_end|>Picture 3: <|vision_start|><|image_pad|><|vision_end|>Video 1: <|vision_start|><|video_pad|><|vision_end|>These are from my vacation.<|im_end|>\n<|im_start|>assistant\nI'd be happy to describe the images and video for you. Could you please provide more context about your vacation?<|im_end|>\n<|im_start|>user\nIt was a trip to the mountains. Can you see the details in the images and video?<|im_end|>\n<|im_start|>assistant\n'
+
+```
+
+#### Flash-Attention 2 to speed up generation
+
+First, make sure to install the latest version of Flash Attention 2:
+
+```bash
+pip install -U flash-attn --no-build-isolation
+```
+
+Also, you should have hardware that is compatible with Flash-Attention 2. Read more about it in the official documentation of the [flash attention repository](https://github.com/Dao-AILab/flash-attention). FlashAttention-2 can only be used when a model is loaded in `torch.float16` or `torch.bfloat16`.
+
+To load and run a model using Flash Attention-2, simply add `attn_implementation="flash_attention_2"` when loading the model as follows:
+
+```python
+from transformers import Qwen2VLForConditionalGeneration
+
+model = Qwen2VLForConditionalGeneration.from_pretrained(
+    "Qwen/Qwen2-VL-7B-Instruct", 
+    torch_dtype=torch.bfloat16, 
+    attn_implementation="flash_attention_2",
+)
+```
+
+
+## Qwen2VLConfig
+
+[[autodoc]] Qwen2VLConfig
+
+## Qwen2VLImageProcessor
+
+[[autodoc]] Qwen2VLImageProcessor
+    - preprocess
+
+## Qwen2VLProcessor
+
+[[autodoc]] Qwen2VLProcessor
+
+## Qwen2VLModel
+
+[[autodoc]] Qwen2VLModel
+    - forward
+
+## Qwen2VLForConditionalGeneration
+
+[[autodoc]] Qwen2VLForConditionalGeneration
+    - forward
--- a/docs/source/en/model_doc/sam.md
+++ b/docs/source/en/model_doc/sam.md
@ -34,7 +34,7 @@ Tips:
 - The model predicts much better results if input 2D points and/or input bounding boxes are provided
 - You can prompt multiple points for the same image, and predict a single mask. 
 - Fine-tuning the model is not supported yet
- According to the paper, textual input should be also supported. However, at this time of writing this seems to be not supported according to [the official repository](https://github.com/facebookresearch/segment-anything/issues/4#issuecomment-1497626844). 
+- According to the paper, textual input should also be supported. However, at the time of writing, this does not seem to be supported according to [the official repository](https://github.com/facebookresearch/segment-anything/issues/4#issuecomment-1497626844). 


 This model was contributed by [ybelkada](https://huggingface.co/ybelkada) and [ArthurZ](https://huggingface.co/ArthurZ).
--- a/docs/source/en/model_doc/vits.md
+++ b/docs/source/en/model_doc/vits.md
@ -93,12 +93,33 @@ from transformers import VitsTokenizer
 tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
 print(tokenizer.is_uroman)
 ```
+If the `is_uroman` attribute is `True`, the tokenizer will automatically apply the `uroman` package to your text inputs. You need to install `uroman` first if it is not already installed:  
+```
+pip install --upgrade uroman
+```
+Note: Using `uroman` as a Python package requires Python >= `3.10`. 
+You can use the tokenizer as usual without any additional preprocessing steps:
+```python
+import torch
+from transformers import VitsTokenizer, VitsModel, set_seed
+import os
+import subprocess

-If required, you should apply the uroman package to your text inputs **prior** to passing them to the `VitsTokenizer`, 
-since currently the tokenizer does not support performing the pre-processing itself.  
+tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-kor")
+model = VitsModel.from_pretrained("facebook/mms-tts-kor")
+text = "이봐 무슨 일이야"
+inputs = tokenizer(text=text, return_tensors="pt")

+set_seed(555)  # make deterministic
+with torch.no_grad():
+   outputs = model(inputs["input_ids"])
+
+waveform = outputs.waveform[0]
+```
+If you don't want to upgrade to Python >= `3.10`, you can use the `uroman` Perl package to pre-process the text inputs to the Roman alphabet.
 To do this, first clone the uroman repository to your local machine and set the bash variable `UROMAN` to the local path:

+
 ```bash
 git clone https://github.com/isi-nlp/uroman.git
 cd uroman
--- a/docs/source/en/model_doc/whisper.md
+++ b/docs/source/en/model_doc/whisper.md
@ -27,6 +27,27 @@ The abstract from the paper is the following:
 This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ). The Tensorflow version of this model was contributed by [amyeroberts](https://huggingface.co/amyeroberts).
 The original code can be found [here](https://github.com/openai/whisper).

+## Quick usage
+
+You can run Whisper in less than 4 lines of code and transcribe in less than a minute!
+
+```python
+# pip install transformers torch
+
+import torch
+from transformers import pipeline
+
+whisper = pipeline("automatic-speech-recognition", "openai/whisper-large-v3", torch_dtype=torch.float16, device="cuda:0")
+
+transcription = whisper("<audio_file.mp3>")
+
+print(transcription["text"])
+```
+
+Voila! You can swap the model with any [Whisper checkpoints](https://huggingface.co/models?other=whisper&sort=downloads) on the Hugging Face Hub with the same pipeline based on your needs.
+
+Bonus: You can replace `"cuda:0"` with `"mps"` to make it seamlessly work on Macs.
+
 ## Usage tips

 - The model usually performs well without requiring any finetuning.
--- a/docs/source/en/model_memory_anatomy.md
+++ b/docs/source/en/model_memory_anatomy.md
@ -42,7 +42,7 @@ In total, we get 512 sequences each with length 512 and store them in a [`~datas
 >>> seq_len, dataset_size = 512, 512
 >>> dummy_data = {
 ...     "input_ids": np.random.randint(100, 30000, (dataset_size, seq_len)),
-...     "labels": np.random.randint(0, 1, (dataset_size)),
+...     "labels": np.random.randint(0, 2, (dataset_size)),
 ... }
 >>> ds = Dataset.from_dict(dummy_data)
 >>> ds.set_format("pt")
--- a/docs/source/en/perf_infer_gpu_one.md
+++ b/docs/source/en/perf_infer_gpu_one.md
@ -51,6 +51,7 @@ FlashAttention-2 is currently supported for the following architectures:
 * [GPTNeo](https://huggingface.co/docs/transformers/model_doc/gpt_neo#transformers.GPTNeoModel)
 * [GPTNeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox#transformers.GPTNeoXModel)
 * [GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj#transformers.GPTJModel)
+* [Granite](https://huggingface.co/docs/transformers/model_doc/granite#transformers.GraniteModel)
 * [Idefics2](https://huggingface.co/docs/transformers/model_doc/idefics2#transformers.Idefics2Model)
 * [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel)
 * [JetMoe](https://huggingface.co/docs/transformers/model_doc/jetmoe#transformers.JetMoeModel)
@ -73,17 +74,18 @@ FlashAttention-2 is currently supported for the following architectures:
 * [OPT](https://huggingface.co/docs/transformers/model_doc/opt#transformers.OPTModel)
 * [Phi](https://huggingface.co/docs/transformers/model_doc/phi#transformers.PhiModel)
 * [Phi3](https://huggingface.co/docs/transformers/model_doc/phi3#transformers.Phi3Model)
-* [SigLIP](https://huggingface.co/docs/transformers/model_doc/siglip)
 * [StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm#transformers.StableLmModel)
 * [Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2#transformers.Starcoder2Model)
 * [Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2#transformers.Qwen2Model)
 * [Qwen2Audio](https://huggingface.co/docs/transformers/model_doc/qwen2_audio#transformers.Qwen2AudioEncoder)
 * [Qwen2MoE](https://huggingface.co/docs/transformers/model_doc/qwen2_moe#transformers.Qwen2MoeModel)
+* [Qwen2VL](https://huggingface.co/docs/transformers/model_doc/qwen2_vl#transformers.Qwen2VLModel)
 * [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper#transformers.WhisperModel)
 * [Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2#transformers.Wav2Vec2Model)
 * [Hubert](https://huggingface.co/docs/transformers/model_doc/hubert#transformers.HubertModel)
 * [data2vec_audio](https://huggingface.co/docs/transformers/main/en/model_doc/data2vec#transformers.Data2VecAudioModel)
 * [Sew](https://huggingface.co/docs/transformers/main/en/model_doc/sew#transformers.SEWModel)
+* [SigLIP](https://huggingface.co/docs/transformers/model_doc/siglip)
 * [UniSpeech](https://huggingface.co/docs/transformers/v4.39.3/en/model_doc/unispeech#transformers.UniSpeechModel)
 * [unispeech_sat](https://huggingface.co/docs/transformers/v4.39.3/en/model_doc/unispeech-sat#transformers.UniSpeechSatModel)

@ -202,9 +204,11 @@ For now, Transformers supports SDPA inference and training for the following arc
 * [Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer#transformers.ASTModel)
 * [Bart](https://huggingface.co/docs/transformers/model_doc/bart#transformers.BartModel)
 * [Bert](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel)
+* [CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert#transformers.CamembertModel)
 * [Chameleon](https://huggingface.co/docs/transformers/model_doc/chameleon#transformers.Chameleon)
 * [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPModel)
 * [Cohere](https://huggingface.co/docs/transformers/model_doc/cohere#transformers.CohereModel)
+* [data2vec_audio](https://huggingface.co/docs/transformers/main/en/model_doc/data2vec#transformers.Data2VecAudioModel)
 * [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel)
 * [DeiT](https://huggingface.co/docs/transformers/model_doc/deit#transformers.DeiTModel)
 * [Dpr](https://huggingface.co/docs/transformers/model_doc/dpr#transformers.DprReader)
@ -214,9 +218,16 @@ For now, Transformers supports SDPA inference and training for the following arc
 * [GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2)
 * [GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode#transformers.GPTBigCodeModel)
 * [GPTNeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox#transformers.GPTNeoXModel)
+* [Hubert](https://huggingface.co/docs/transformers/model_doc/hubert#transformers.HubertModel)
+* [Idefics](https://huggingface.co/docs/transformers/model_doc/idefics#transformers.IdeficsModel)
+* [Granite](https://huggingface.co/docs/transformers/model_doc/granite#transformers.GraniteModel)
 * [JetMoe](https://huggingface.co/docs/transformers/model_doc/jetmoe#transformers.JetMoeModel)
 * [Jamba](https://huggingface.co/docs/transformers/model_doc/jamba#transformers.JambaModel)
 * [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel)
+* [Mistral](https://huggingface.co/docs/transformers/model_doc/mistral#transformers.MistralModel)
+* [Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral#transformers.MixtralModel)
+* [Musicgen](https://huggingface.co/docs/transformers/model_doc/musicgen#transformers.MusicgenModel)
+* [MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody#transformers.MusicgenMelodyModel)
 * [OLMo](https://huggingface.co/docs/transformers/model_doc/olmo#transformers.OlmoModel)
 * [PaliGemma](https://huggingface.co/docs/transformers/model_doc/paligemma#transformers.PaliGemmaForConditionalGeneration)
 * [Phi](https://huggingface.co/docs/transformers/model_doc/phi#transformers.PhiModel)
@ -230,6 +241,15 @@ For now, Transformers supports SDPA inference and training for the following arc
 * [Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2#transformers.Qwen2Model)
 * [Qwen2Audio](https://huggingface.co/docs/transformers/model_doc/qwen2_audio#transformers.Qwen2AudioEncoder)
 * [Qwen2MoE](https://huggingface.co/docs/transformers/model_doc/qwen2_moe#transformers.Qwen2MoeModel)
+* [RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaModel)
+* [Sew](https://huggingface.co/docs/transformers/main/en/model_doc/sew#transformers.SEWModel)
+* [SigLIP](https://huggingface.co/docs/transformers/model_doc/siglip)
+* [StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm#transformers.StableLmModel)
+* [Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2#transformers.Starcoder2Model)
+* [UniSpeech](https://huggingface.co/docs/transformers/v4.39.3/en/model_doc/unispeech#transformers.UniSpeechModel)
+* [unispeech_sat](https://huggingface.co/docs/transformers/v4.39.3/en/model_doc/unispeech-sat#transformers.UniSpeechSatModel)
+* [RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaModel)
+* [Qwen2VL](https://huggingface.co/docs/transformers/model_doc/qwen2_vl#transformers.Qwen2VLModel)
 * [Musicgen](https://huggingface.co/docs/transformers/model_doc/musicgen#transformers.MusicgenModel)
 * [MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody#transformers.MusicgenMelodyModel)
 * [Nemotron](https://huggingface.co/docs/transformers/model_doc/nemotron)
@ -239,12 +259,9 @@ For now, Transformers supports SDPA inference and training for the following arc
 * [ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn#transformers.ViTMSNModel)
 * [VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae#transformers.VideoMAEModell)
 * [wav2vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2#transformers.Wav2Vec2Model)
-* [Hubert](https://huggingface.co/docs/transformers/model_doc/hubert#transformers.HubertModel)
-* [data2vec_audio](https://huggingface.co/docs/transformers/main/en/model_doc/data2vec#transformers.Data2VecAudioModel)
-* [SigLIP](https://huggingface.co/docs/transformers/model_doc/siglip)
-* [Sew](https://huggingface.co/docs/transformers/main/en/model_doc/sew#transformers.SEWModel)
-* [UniSpeech](https://huggingface.co/docs/transformers/v4.39.3/en/model_doc/unispeech#transformers.UniSpeechModel)
-* [unispeech_sat](https://huggingface.co/docs/transformers/v4.39.3/en/model_doc/unispeech-sat#transformers.UniSpeechSatModel)
+* [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper#transformers.WhisperModel)
+* [XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaModel)
+* [XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl#transformers.XLMRobertaXLModel)
 * [YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos#transformers.YolosModel)


--- a/docs/source/en/perf_train_cpu_many.md
+++ b/docs/source/en/perf_train_cpu_many.md
@ -155,13 +155,20 @@ This example assumes that you have:
 The snippet below is an example of a Dockerfile that uses a base image that supports distributed CPU training and then
 extracts a Transformers release to the `/workspace` directory, so that the example scripts are included in the image:
 ```dockerfile
-FROM intel/ai-workflows:torch-2.0.1-huggingface-multinode-py3.9
+FROM intel/intel-optimized-pytorch:2.3.0-pip-multinode
+
+RUN apt-get update -y && \
+    apt-get install -y --no-install-recommends --fix-missing \
+    google-perftools \
+    libomp-dev

 WORKDIR /workspace

 # Download and extract the transformers code
-ARG HF_TRANSFORMERS_VER="4.35.2"
-RUN mkdir transformers && \
+ARG HF_TRANSFORMERS_VER="4.44.0"
+RUN pip install --no-cache-dir \
+    transformers==${HF_TRANSFORMERS_VER} && \
+    mkdir transformers && \
    curl -sSL --retry 5 https://github.com/huggingface/transformers/archive/refs/tags/v${HF_TRANSFORMERS_VER}.tar.gz | tar -C transformers --strip-components=1 -xzf -
 ```
 The image needs to be built and copied to the cluster's nodes or pushed to a container registry prior to deploying the
@ -189,7 +196,6 @@ apiVersion: "kubeflow.org/v1"
 kind: PyTorchJob
 metadata:
  name: transformers-pytorchjob
-  namespace: kubeflow
 spec:
  elasticPolicy:
    rdzvBackend: c10d
@ -206,32 +212,27 @@ spec:
            - name: pytorch
              image: <image name>:<tag>  # Specify the docker image to use for the worker pods
              imagePullPolicy: IfNotPresent
-              command:
-                - torchrun
-                - /workspace/transformers/examples/pytorch/question-answering/run_qa.py
-                - --model_name_or_path
-                - "google-bert/bert-large-uncased"
-                - --dataset_name
-                - "squad"
-                - --do_train
-                - --do_eval
-                - --per_device_train_batch_size
-                - "12"
-                - --learning_rate
-                - "3e-5"
-                - --num_train_epochs
-                - "2"
-                - --max_seq_length
-                - "384"
-                - --doc_stride
-                - "128"
-                - --output_dir
-                - "/tmp/pvc-mount/output"
-                - --no_cuda
-                - --ddp_backend
-                - "ccl"
-                - --use_ipex
-                - --bf16  # Specify --bf16 if your hardware supports bfloat16
+              command: ["/bin/bash", "-c"]
+              args:
+                - >-
+                  cd /workspace/transformers;
+                  pip install -r /workspace/transformers/examples/pytorch/question-answering/requirements.txt;
+                  source /usr/local/lib/python3.10/dist-packages/oneccl_bindings_for_pytorch/env/setvars.sh;
+                  torchrun /workspace/transformers/examples/pytorch/question-answering/run_qa.py \
+                    --model_name_or_path distilbert/distilbert-base-uncased \
+                    --dataset_name squad \
+                    --do_train \
+                    --do_eval \
+                    --per_device_train_batch_size 12 \
+                    --learning_rate 3e-5 \
+                    --num_train_epochs 2 \
+                    --max_seq_length 384 \
+                    --doc_stride 128 \
+                    --output_dir /tmp/pvc-mount/output_$(date +%Y%m%d_%H%M%S) \
+                    --no_cuda \
+                    --ddp_backend ccl \
+                    --bf16 \
+                    --use_ipex;
              env:
              - name: LD_PRELOAD
                value: "/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4.5.9:/usr/local/lib/libiomp5.so"
@ -244,13 +245,13 @@ spec:
              - name: CCL_WORKER_COUNT
                value: "1"
              - name: OMP_NUM_THREADS  # Can be tuned for optimal performance
-                value: "56"
+                value: "240"
              resources:
                limits:
-                  cpu: 200  # Update the CPU and memory limit values based on your nodes
+                  cpu: 240  # Update the CPU and memory limit values based on your nodes
                  memory: 128Gi
                requests:
-                  cpu: 200  # Update the CPU and memory request values based on your nodes
+                  cpu: 240  # Update the CPU and memory request values based on your nodes
                  memory: 128Gi
              volumeMounts:
              - name: pvc-volume
@ -258,8 +259,8 @@ spec:
              - mountPath: /dev/shm
                name: dshm
          restartPolicy: Never
-          nodeSelector:  #  Optionally use the node selector to specify what types of nodes to use for the workers
-            node-type: spr
+          nodeSelector:  # Optionally use nodeSelector to match a certain node label for the worker pods
+            node-type: gnr
          volumes:
          - name: pvc-volume
            persistentVolumeClaim:
@ -287,10 +288,12 @@ set the same CPU and memory amounts for both the resource limits and requests.
 After the PyTorchJob spec has been updated with values appropriate for your cluster and training job, it can be deployed
 to the cluster using:
 ```bash
-kubectl create -f pytorchjob.yaml
+export NAMESPACE=<specify your namespace>
+
+kubectl create -f pytorchjob.yaml -n ${NAMESPACE}
 ```

-The `kubectl get pods -n kubeflow` command can then be used to list the pods in the `kubeflow` namespace. You should see
+The `kubectl get pods -n ${NAMESPACE}` command can then be used to list the pods in your namespace. You should see
 the worker pods for the PyTorchJob that was just deployed. At first, they will probably have a status of "Pending" as
 the containers get pulled and created, then the status should change to "Running".
 ```
@ -303,13 +306,13 @@ transformers-pytorchjob-worker-3                         1/1     Running
 ...
 ```

-The logs for worker can be viewed using `kubectl logs -n kubeflow <pod name>`. Add `-f` to stream the logs, for example:
+The logs for a worker can be viewed using `kubectl logs <pod name> -n ${NAMESPACE}`. Add `-f` to stream the logs, for example:
 ```bash
-kubectl logs -n kubeflow transformers-pytorchjob-worker-0 -f
+kubectl logs transformers-pytorchjob-worker-0 -n ${NAMESPACE} -f
 ```

 After the training job completes, the trained model can be copied from the PVC or storage location. When you are done
-with the job, the PyTorchJob resource can be deleted from the cluster using `kubectl delete -f pytorchjob.yaml`.
+with the job, the PyTorchJob resource can be deleted from the cluster using `kubectl delete -f pytorchjob.yaml -n ${NAMESPACE}`.

 ## Summary

--- a/docs/source/en/pipeline_tutorial.md
+++ b/docs/source/en/pipeline_tutorial.md
@ -54,7 +54,7 @@ speech-to-text.
 Not the result you had in mind? Check out some of the [most downloaded automatic speech recognition models](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=trending) 
 on the Hub to see if you can get a better transcription.

-Let's try the [Whisper large-v2](https://huggingface.co/openai/whisper-large) model from OpenAI. Whisper was released 
+Let's try the [Whisper large-v2](https://huggingface.co/openai/whisper-large-v2) model from OpenAI. Whisper was released 
 2 years later than Wav2Vec2, and was trained on close to 10x more data. As such, it beats Wav2Vec2 on most downstream 
 benchmarks. It also has the added benefit of predicting punctuation and casing, neither of which are possible with  
 Wav2Vec2.
--- a/docs/source/en/quantization/overview.md
+++ b/docs/source/en/quantization/overview.md
@ -56,4 +56,4 @@ Use the table below to help you decide which quantization method to use.
 | [HQQ](./hqq)                                 | 🟢                       | 🟢    | 🟢        | 🔴              | 🔴                     | 🟢                       | 1 - 8          | 🟢                                   | 🔴            | 🟢                      | https://github.com/mobiusml/hqq/            |
 | [Quanto](./quanto)                              | 🟢                       | 🟢   | 🟢        | 🔴              | 🟢                     | 🟢                       | 2 / 4 / 8      | 🔴                                   | 🔴            | 🟢                      | https://github.com/huggingface/quanto       |
 | [FBGEMM_FP8](./fbgemm_fp8.md)                              | 🟢                       | 🔴    | 🟢        | 🔴              | 🔴                      | 🔴                        | 8      | 🔴                                   | 🟢            | 🟢                      | https://github.com/pytorch/FBGEMM       |
-
+| [torchao](./torchao.md)                              | 🟢                       |     | 🟢        | 🔴              | partial support (int4 weight only)       |                       | 4 / 8      |                                   | 🟢🔴           | 🟢                      | https://github.com/pytorch/ao       |
--- a/docs/source/en/quantization/torchao.md
+++ b/docs/source/en/quantization/torchao.md
@ -0,0 +1,45 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+-->
+
+# TorchAO
+
+[TorchAO](https://github.com/pytorch/ao) is an architecture optimization library for PyTorch. It provides high-performance dtypes, optimization techniques, and kernels for inference and training, featuring composability with native PyTorch features like `torch.compile`, FSDP, etc. Some benchmark numbers can be found [here](https://github.com/pytorch/ao/tree/main?tab=readme-ov-file#without-intrusive-code-changes).
+
+Before you begin, make sure the following libraries are installed with their latest version:
+
+```bash
+pip install --upgrade torch torchao
+```
+
+
+```py
+import torch
+from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
+model_name = "meta-llama/Meta-Llama-3-8B"
+# We support int4_weight_only, int8_weight_only and int8_dynamic_activation_int8_weight
+# More examples and documentations for arguments can be found in https://github.com/pytorch/ao/tree/main/torchao/quantization#other-available-quantization-techniques
+quantization_config = TorchAoConfig("int4_weight_only", group_size=128)
+quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", quantization_config=quantization_config)
+
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+input_text = "What are we having for dinner?"
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+
+# compile the quantized model to get speedup
+import torchao
+torchao.quantization.utils.recommended_inductor_config_setter()
+quantized_model = torch.compile(quantized_model, mode="max-autotune")
+
+output = quantized_model.generate(**input_ids, max_new_tokens=10)
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+
+torchao quantization is implemented with tensor subclasses. It currently does not work with Hugging Face serialization, for either the safetensors option or the [non-safetensors option](https://github.com/huggingface/transformers/issues/32364). We'll update this page with instructions once it is working.
--- a/docs/source/en/trainer.md
+++ b/docs/source/en/trainer.md
@ -382,6 +382,41 @@ trainer.train()

 Note layerwise optimization is a bit experimental and does not support DDP (Distributed Data Parallel), thus you can run the training script only on a single GPU. Please see [this appropriate section](https://github.com/jiaweizzhao/GaLore?tab=readme-ov-file#train-7b-model-with-a-single-gpu-with-24gb-memory) for more details. Other features such as gradient clipping, DeepSpeed, etc might not be supported out of the box. Please [raise an issue on GitHub](https://github.com/huggingface/transformers/issues) if you encounter such issue.

+## Liger Kernel
+
+[Liger-Kernel](https://github.com/linkedin/Liger-Kernel) is a collection of Triton kernels developed by LinkedIn and designed specifically for LLM training. We have implemented Hugging Face-compatible RMSNorm, RoPE, SwiGLU, CrossEntropy, FusedLinearCrossEntropy, and more to come. It can effectively increase multi-GPU training throughput by 20% and reduce memory usage by 60%. The kernel works out of the box with flash attention, PyTorch FSDP, and Microsoft DeepSpeed.
+
+<Tip>
+Gain +20% throughput and reduce memory usage by 60% on LLaMA 3-8B model training. Achieve longer context lengths and larger batch sizes. It’s also useful if you want to scale up your model to multi-head training or large vocabulary sizes. Unleash multi-head training (medusa) and more. See details and examples in [Liger](https://github.com/linkedin/Liger-Kernel/tree/main/examples)
+</Tip>
+
+First, make sure to install the official Liger-Kernel package:
+```bash
+pip install liger-kernel
+```
+
+You should pass `use_liger_kernel=True` to apply liger kernel on your model, for example:
+
+```py
+from transformers import TrainingArguments
+
+training_args = TrainingArguments(
+    output_dir="your-model",
+    learning_rate=2e-5,
+    per_device_train_batch_size=16,
+    per_device_eval_batch_size=16,
+    num_train_epochs=2,
+    weight_decay=0.01,
+    eval_strategy="epoch",
+    save_strategy="epoch",
+    load_best_model_at_end=True,
+    push_to_hub=True,
+    use_liger_kernel=True
+)
+```
+
+The kernel supports the Llama, Gemma, Mistral, and Mixtral model architectures. The most up-to-date list of supported models can be found [here](https://github.com/linkedin/Liger-Kernel). When `use_liger_kernel` is set to `True`, the corresponding layers in the original model will be patched with Liger's efficient implementation, so you don't need to do anything extra other than setting the argument value.
+
 ## LOMO optimizer

 The LOMO optimizers have been introduced in [Full Parameter Fine-Tuning for Large Language Models with Limited Resources](https://hf.co/papers/2306.09782) and [AdaLomo: Low-memory Optimization with Adaptive Learning Rate](https://hf.co/papers/2310.10195). 
@ -432,6 +467,57 @@ trainer = trl.SFTTrainer(
 trainer.train()
 ```

+## GrokAdamW optimizer
+
+The GrokAdamW optimizer is designed to enhance training performance and stability, particularly for models that benefit from grokking signal functions. To use GrokAdamW, first install the optimizer package with `pip install grokadamw`.
+
+<Tip>
+
+GrokAdamW is particularly useful for models that require advanced optimization techniques to achieve better performance and stability.
+
+</Tip>
+
+Below is a simple script to demonstrate how to fine-tune [google/gemma-2b](https://huggingface.co/google/gemma-2b) on the IMDB dataset using the GrokAdamW optimizer:
+
+```python
+import torch
+import datasets
+from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM, Trainer
+
+# Load the IMDB dataset
+train_dataset = datasets.load_dataset('imdb', split='train')
+
+# Define the training arguments
+args = TrainingArguments(
+    output_dir="./test-grokadamw",
+    max_steps=1000,
+    per_device_train_batch_size=4,
+    optim="grokadamw",
+    logging_strategy="steps",
+    logging_steps=1,
+    learning_rate=2e-5,
+    save_strategy="no",
+    run_name="grokadamw-imdb",
+)
+
+# Load the model and tokenizer
+model_id = "google/gemma-2b"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(0)
+
+# Initialize the Trainer
+trainer = Trainer(
+    model=model,
+    args=args,
+    train_dataset=train_dataset,
+)
+
+# Train the model
+trainer.train()
+```
+
+This script demonstrates how to fine-tune the `google/gemma-2b` model on the IMDB dataset using the GrokAdamW optimizer. The `TrainingArguments` are configured to use GrokAdamW, and the dataset is passed to the `Trainer` for training.
+
 ## Accelerate and Trainer

 The [`Trainer`] class is powered by [Accelerate](https://hf.co/docs/accelerate), a library for easily training PyTorch models in distributed environments with support for integrations such as [FullyShardedDataParallel (FSDP)](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/) and [DeepSpeed](https://www.deepspeed.ai/).
--- a/docs/source/es/custom_models.md
+++ b/docs/source/es/custom_models.md
@ -173,7 +173,7 @@ class ResnetModelForImageClassification(PreTrainedModel):
    def forward(self, tensor, labels=None):
        logits = self.model(tensor)
        if labels is not None:
-            loss = torch.nn.cross_entropy(logits, labels)
+            loss = torch.nn.functional.cross_entropy(logits, labels)
            return {"loss": loss, "logits": logits}
        return {"logits": logits}
 ```
--- a/docs/source/it/custom_models.md
+++ b/docs/source/it/custom_models.md
@ -174,7 +174,7 @@ class ResnetModelForImageClassification(PreTrainedModel):
    def forward(self, tensor, labels=None):
        logits = self.model(tensor)
        if labels is not None:
-            loss = torch.nn.cross_entropy(logits, labels)
+            loss = torch.nn.functional.cross_entropy(logits, labels)
            return {"loss": loss, "logits": logits}
        return {"logits": logits}
 ```
--- a/docs/source/ja/chat_templating.md
+++ b/docs/source/ja/chat_templating.md
@ -14,7 +14,7 @@ rendered properly in your Markdown viewer.

 -->

-# Templates for Chat Models
+# Chat Templates

 ## Introduction

--- a/docs/source/ja/custom_models.md
+++ b/docs/source/ja/custom_models.md
@ -161,7 +161,7 @@ class ResnetModelForImageClassification(PreTrainedModel):
    def forward(self, tensor, labels=None):
        logits = self.model(tensor)
        if labels is not None:
-            loss = torch.nn.cross_entropy(logits, labels)
+            loss = torch.nn.functional.cross_entropy(logits, labels)
            return {"loss": loss, "logits": logits}
        return {"logits": logits}
 ```
--- a/docs/source/ja/internal/generation_utils.md
+++ b/docs/source/ja/internal/generation_utils.md
@ -139,9 +139,6 @@ generation_output[:2]
 [[autodoc]] ForcedEOSTokenLogitsProcessor
    - __call__

-[[autodoc]] ForceTokensLogitsProcessor
-    - __call__
-
 [[autodoc]] HammingDiversityLogitsProcessor
    - __call__

@ -157,9 +154,6 @@ generation_output[:2]
 [[autodoc]] LogitsProcessorList
    - __call__

-[[autodoc]] LogitsWarper
-    - __call__
-
 [[autodoc]] MinLengthLogitsProcessor
    - __call__

--- a/docs/source/ko/_toctree.yml
+++ b/docs/source/ko/_toctree.yml
@ -27,8 +27,8 @@
    title: 에이전트
  - local: llm_tutorial
    title: 대규모 언어 모델로 생성하기
-  - local: in_translation
-    title: (번역중)Chatting with Transformers
+  - local: conversations
+    title: Transformers로 채팅하기
  title: 튜토리얼
 - sections:
  - isExpanded: false
@ -79,8 +79,8 @@
        title: 이미지 특징 추출
      - local: tasks/mask_generation
        title: 마스크 생성
-      - local: in_translation
-        title: (번역중) Knowledge Distillation for Computer Vision
+      - local: tasks/knowledge_distillation_for_image_classification
+        title: 컴퓨터 비전(이미지 분류)를 위한 지식 증류(knowledge distillation)
    title: 컴퓨터 비전
  - isExpanded: false
    sections:
@ -145,8 +145,8 @@
    title: bitsandbytes
  - local: in_translation
    title: (번역중) GPTQ
-  - local: in_translation
-    title: (번역중) AWQ
+  - local: quantization/awq
+    title: AWQ
  - local: in_translation
    title: (번역중) AQLM
  - local: in_translation
@ -192,10 +192,10 @@
      title: (번역중) Methods and tools for efficient training on a single GPU
    - local: perf_train_gpu_many
      title: 다중 GPU에서 훈련 진행하기
+    - local: deepspeed
+      title: DeepSpeed
    - local: fsdp
      title: 완전 분할 데이터 병렬 처리
-    - local: in_translation
-      title: (번역중) DeepSpeed
    - local: perf_train_cpu
      title: CPU에서 훈련
    - local: perf_train_cpu_many
@ -266,8 +266,8 @@
  title: (번역중) 개념 가이드
 - sections:
  - sections:
-    - local: in_translation
-      title: (번역중) Agents and Tools
+    - local: main_classes/agent
+      title: 에이전트와 도구
    - local: in_translation
      title: (번역중) Auto Classes
    - local: in_translation
@ -302,8 +302,8 @@
      title: (번역중) Tokenizer
    - local: in_translation
      title: (번역중) Trainer
-    - local: in_translation
-      title: (번역중) DeepSpeed
+    - local: deepspeed
+      title: DeepSpeed
    - local: in_translation
      title: (번역중) Feature Extractor
    - local: in_translation
--- a/docs/source/ko/conversations.md
+++ b/docs/source/ko/conversations.md
@ -0,0 +1,306 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Transformers로 채팅하기[[chatting-with-transformers]]
+
+이 글을 보고 있다면 **채팅 모델**에 대해 어느 정도 알고 계실 것입니다.
+채팅 모델이란 메세지를 주고받을 수 있는 대화형 인공지능입니다. 
+대표적으로 ChatGPT가 있고, 이와 비슷하거나 더 뛰어난 오픈소스 채팅 모델이 많이 존재합니다.  
+이러한 모델들은 무료로 다운로드할 수 있으며, 로컬에서 실행할 수 있습니다. 
+크고 무거운 모델은 고성능 하드웨어와 메모리가 필요하지만, 
+저사양 GPU 혹은 일반 데스크탑이나 노트북 CPU에서도 잘 작동하는 소형 모델들도 있습니다.
+
+이 가이드는 채팅 모델을 처음 사용하는 분들에게 유용할 것입니다.
+우리는 간편한 고수준(High-Level) "pipeline"을 통해 빠른 시작 가이드를 진행할 것입니다.
+가이드에는 채팅 모델을 바로 시작할 때 필요한 모든 정보가 담겨 있습니다.
+빠른 시작 가이드 이후에는 채팅 모델이 정확히 무엇인지, 적절한 모델을 선택하는 방법과, 
+채팅 모델을 사용하는 각 단계의 저수준(Low-Level) 분석 등 더 자세한 정보를 다룰 것입니다. 
+또한 채팅 모델의 성능과 메모리 사용을 최적화하는 방법에 대한 팁도 제공할 것입니다.
+
+
+## 빠른 시작[[quickstart]]
+
+자세히 볼 여유가 없는 분들을 위해 간단히 요약해 보겠습니다: 
+채팅 모델은 대화 메세지를 계속해서 생성해 나갑니다.
+즉, 짤막한 채팅 메세지를 모델에게 전달하면, 모델은 이를 바탕으로 응답을 추가하며 대화를 이어 나갑니다.
+이제 실제로 어떻게 작동하는지 살펴보겠습니다. 
+먼저, 채팅을 만들어 보겠습니다:
+
+
+```python
+chat = [
+    {"role": "system", "content": "You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."},
+    {"role": "user", "content": "Hey, can you tell me any fun things to do in New York?"}
+]
+```
+
+주목하세요, 대화를 처음 시작할 때 유저 메세지 이외에도, 별도의 **시스템** 메세지가 필요할 수 있습니다.
+모든 채팅 모델이 시스템 메세지를 지원하는 것은 아니지만,
+지원하는 경우에는 시스템 메세지는 대화에서 모델이 어떻게 행동해야 하는지를 지시할 수 있습니다.
+예를 들어, 유쾌하거나 진지하고자 할 때, 짧은 답변이나 긴 답변을 원할 때 등을 설정할 수 있습니다.
+잘 모르겠다면 시스템 메세지를 생략하거나
+"You are a helpful and intelligent AI assistant who responds to user queries."
+와 같은 간단한 프롬프트를 사용하는 것도 가능합니다.
+
+채팅을 시작했다면 대화를 이어 나가는 가장 빠른 방법은 [`TextGenerationPipeline`]를 사용하는 것입니다. 
+한번 `LLaMA-3`를 사용하여 이를 시연해 보겠습니다. 
+우선 `LLaMA-3`를 사용하기 위해서는 승인이 필요합니다. [권한 신청](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)을 하고 Hugging Face 계정으로 로그인한 후에 사용할 수 있습니다. 
+또한 우리는 `device_map="auto"`를 사용합니다. GPU 메모리가 충분하다면 모델이 GPU에 로드될 것입니다. 
+그리고 메모리 절약을 위해 dtype을 `torch.bfloat16`으로 설정할 것입니다.
+
+```python
+import torch
+from transformers import pipeline
+
+pipe = pipeline("text-generation", "meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype=torch.bfloat16, device_map="auto")
+response = pipe(chat, max_new_tokens=512)
+print(response[0]['generated_text'][-1]['content'])
+```
+
+이후 실행을 하면 아래와 같이 출력됩니다:
+
+```text
+(sigh) Oh boy, you're asking me for advice? You're gonna need a map, pal! Alright, 
+alright, I'll give you the lowdown. But don't say I didn't warn you, I'm a robot, not a tour guide!
+
+So, you wanna know what's fun to do in the Big Apple? Well, let me tell you, there's a million 
+things to do, but I'll give you the highlights. First off, you gotta see the sights: the Statue of 
+Liberty, Central Park, Times Square... you know, the usual tourist traps. But if you're lookin' for 
+something a little more... unusual, I'd recommend checkin' out the Museum of Modern Art. It's got 
+some wild stuff, like that Warhol guy's soup cans and all that jazz.
+
+And if you're feelin' adventurous, take a walk across the Brooklyn Bridge. Just watch out for 
+those pesky pigeons, they're like little feathered thieves! (laughs) Get it? Thieves? Ah, never mind.
+
+Now, if you're lookin' for some serious fun, hit up the comedy clubs in Greenwich Village. You might 
+even catch a glimpse of some up-and-coming comedians... or a bunch of wannabes tryin' to make it big. (winks)
+
+And finally, if you're feelin' like a real New Yorker, grab a slice of pizza from one of the many amazing
+pizzerias around the city. Just don't try to order a "robot-sized" slice, trust me, it won't end well. (laughs)
+
+So, there you have it, pal! That's my expert advice on what to do in New York. Now, if you'll
+excuse me, I've got some oil changes to attend to. (winks)
+```
+
+채팅을 계속하려면, 자신의 답장을 추가하면 됩니다. 
+파이프라인에서 반환된 `response` 객체에는 현재까지의 모든 채팅이 포함되어 있으므로 
+메세지를 추가하고 다시 전달하기만 하면 됩니다.
+
+```python
+chat = response[0]['generated_text']
+chat.append(
+    {"role": "user", "content": "Wait, what's so wild about soup cans?"}
+)
+response = pipe(chat, max_new_tokens=512)
+print(response[0]['generated_text'][-1]['content'])
+```
+
+이후 실행을 하면 아래와 같이 출력됩니다:
+
+```text
+(laughs) Oh, you're killin' me, pal! You don't get it, do you? Warhol's soup cans are like, art, man! 
+It's like, he took something totally mundane, like a can of soup, and turned it into a masterpiece. It's 
+like, "Hey, look at me, I'm a can of soup, but I'm also a work of art!" 
+(sarcastically) Oh, yeah, real original, Andy.
+
+But, you know, back in the '60s, it was like, a big deal. People were all about challenging the
+status quo, and Warhol was like, the king of that. He took the ordinary and made it extraordinary.
+And, let me tell you, it was like, a real game-changer. I mean, who would've thought that a can of soup could be art? (laughs)
+
+But, hey, you're not alone, pal. I mean, I'm a robot, and even I don't get it. (winks)
+But, hey, that's what makes art, art, right? (laughs)
+```
+
+이 튜토리얼의 후반부에서는 성능과 메모리 관리, 
+그리고 사용자의 필요에 맞는 채팅 모델 선택과 같은 구체적인 주제들을 다룰 것입니다.
+
+## 채팅 모델 고르기[[choosing-a-chat-model]]
+
+[Hugging Face Hub](https://huggingface.co/models?pipeline_tag=text-generation&sort=trending)는 채팅 모델을 다양하게 제공하고 있습니다.
+처음 사용하는 사람에게는 모델을 선택하기가 어려울지 모릅니다.
+하지만 걱정하지 마세요! 두 가지만 명심하면 됩니다:
+
+- 모델의 크기는 실행 속도와 메모리에 올라올 수 있는지 여부를 결정.
+- 모델이 생성한 출력의 품질.
+
+일반적으로 이러한 요소들은 상관관계가 있습니다. 더 큰 모델일수록 더 뛰어난 성능을 보이는 경향이 있지만, 동일한 크기의 모델이라도 유의미한 차이가 날 수 있습니다!
+
+### 모델의 명칭과 크기[[size-and-model-naming]]
+
+모델의 크기는 모델 이름에 있는 숫자로 쉽게 알 수 있습니다. 
+예를 들어, "8B" 또는 "70B"와 같은 숫자는 모델의 **파라미터** 수를 나타냅니다. 
+양자화된 경우가 아니라면, 파라미터 하나당 약 2바이트의 메모리가 필요하다고 예상 가능합니다. 
+따라서 80억 개의 파라미터를 가진 "8B" 모델은 16GB의 메모리를 차지하며, 추가적인 오버헤드를 위한 약간의 여유가 필요합니다. 
+이는 3090이나 4090과 같은 24GB의 메모리를 갖춘 하이엔드 GPU에 적합합니다.
+
+일부 채팅 모델은 "Mixture of Experts" 모델입니다. 
+이러한 모델은 크기를 "8x7B" 또는 "141B-A35B"와 같이 다르게 표시하곤 합니다. 
+숫자가 다소 모호하다 느껴질 수 있지만, 첫 번째 경우에는 약 560억(8x7B) 개의 파라미터가 있고, 
+두 번째 경우에는 약 1,410억 개의 파라미터가 있다고 해석할 수 있습니다.
+
+양자화는 파라미터당 메모리 사용량을 8비트, 4비트, 또는 그 이하로 줄이는 데 사용됩니다. 
+이 주제에 대해서는 아래의 [메모리 고려사항](#memory-considerations) 챕터에서 더 자세히 다룰 예정입니다.
+
+### 그렇다면 어떤 채팅 모델이 가장 좋을까요?[[but-which-chat-model-is-best]]
+모델의 크기 외에도 고려할 점이 많습니다. 
+이를 한눈에 살펴보려면 **리더보드**를 참고하는 것이 좋습니다. 
+가장 인기 있는 리더보드 두 가지는 [OpenLLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)와 [LMSys Chatbot Arena Leaderboard](https://chat.lmsys.org/?leaderboard)입니다. 
+LMSys 리더보드에는 독점 모델도 포함되어 있으니,
+`license` 열에서 접근 가능한 모델을 선택한 후
+[Hugging Face Hub](https://huggingface.co/models?pipeline_tag=text-generation&sort=trending)에서 검색해 보세요.
+
+### 전문 분야[[specialist-domains]]
+일부 모델은 의료 또는 법률 텍스트와 같은 특정 도메인이나 비영어권 언어에 특화되어 있기도 합니다. 
+이러한 도메인에서 작업할 경우 특화된 모델이 좋은 성능을 보일 수 있습니다. 
+하지만 항상 그럴 것이라 단정하기는 힘듭니다. 
+특히 모델의 크기가 작거나 오래된 모델인 경우, 
+최신 범용 모델이 더 뛰어날 수 있습니다. 
+다행히도 [domain-specific leaderboards](https://huggingface.co/blog/leaderboard-medicalllm)가 점차 등장하고 있어, 특정 도메인에 최고의 모델을 쉽게 찾을 수 있을 것입니다. 
+
+
+## 파이프라인 내부는 어떻게 되어있는가?[[what-happens-inside-the-pipeline]]
+위의 빠른 시작에서는 고수준(High-Level) 파이프라인을 사용하였습니다.
+이는 간편한 방법이지만, 유연성은 떨어집니다.
+이제 더 저수준(Low-Level) 접근 방식을 통해 대화에 포함된 각 단계를 살펴보겠습니다. 
+코드 샘플로 시작한 후 이를 분석해 보겠습니다:
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+
+# 입력값을 사전에 준비해 놓습니다
+chat = [
+    {"role": "system", "content": "You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."},
+    {"role": "user", "content": "Hey, can you tell me any fun things to do in New York?"}
+]
+
+# 1: 모델과 토크나이저를 불러옵니다
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", torch_dtype=torch.bfloat16)
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
+
+# 2: 채팅 템플릿에 적용합니다
+formatted_chat = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+print("Formatted chat:\n", formatted_chat)
+
+# 3: 채팅을 토큰화합니다 (바로 이전 과정에서 tokenize=True로 설정하면 한꺼번에 처리할 수 있습니다)
+inputs = tokenizer(formatted_chat, return_tensors="pt", add_special_tokens=False)
+# 토큰화된 입력값을 모델이 올라와 있는 기기(CPU/GPU)로 옮깁니다.
+inputs = {key: tensor.to(model.device) for key, tensor in inputs.items()}
+print("Tokenized inputs:\n", inputs)
+
+# 4: 모델로부터 응답을 생성합니다
+outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.1)
+print("Generated tokens:\n", outputs)
+
+# 5: 모델이 출력한 토큰을 다시 문자열로 디코딩합니다
+decoded_output = tokenizer.decode(outputs[0][inputs['input_ids'].size(1):], skip_special_tokens=True)
+print("Decoded output:\n", decoded_output)
+```
+여기에는 각 부분이 자체 문서가 될 수 있을 만큼 많은 내용이 담겨 있습니다! 
+너무 자세히 설명하기보다는 넓은 개념을 다루고, 세부 사항은 링크된 문서에서 다루겠습니다. 
+주요 단계는 다음과 같습니다:
+
+1. [모델](https://huggingface.co/learn/nlp-course/en/chapter2/3)과 [토크나이저](https://huggingface.co/learn/nlp-course/en/chapter2/4?fw=pt)를 Hugging Face Hub에서 로드합니다.
+2. 대화는 토크나이저의 [채팅 템플릿](https://huggingface.co/docs/transformers/main/en/chat_templating)을 사용하여 양식을 구성합니다.
+3. 구성된 채팅은 토크나이저를 사용하여 [토큰화](https://huggingface.co/learn/nlp-course/en/chapter2/4)됩니다.
+4. 모델에서 응답을 [생성](https://huggingface.co/docs/transformers/en/llm_tutorial)합니다.
+5. 모델이 출력한 토큰을 다시 문자열로 디코딩합니다.
+
+## 성능, 메모리와 하드웨어[[performance-memory-and-hardware]]
+이제 대부분의 머신 러닝 작업이 GPU에서 실행된다는 것을 아실 겁니다. 
+다소 느리기는 해도 CPU에서 채팅 모델이나 언어 모델로부터 텍스트를 생성하는 것도 가능합니다. 
+하지만 모델을 GPU 메모리에 올려놓을 수만 있다면, GPU를 사용하는 것이 일반적으로 더 선호되는 방식입니다.
+
+### 메모리 고려사항[[memory-considerations]]
+
+기본적으로, [`TextGenerationPipeline`]이나 [`AutoModelForCausalLM`]과 같은 
+Hugging Face 클래스는 모델을 `float32` 정밀도(Precision)로 로드합니다. 
+이는 파라미터당 4바이트(32비트)를 필요로 하므로, 
+80억 개의 파라미터를 가진 "8B" 모델은 약 32GB의 메모리를 필요로 한다는 것을 의미합니다. 
+하지만 이는 낭비일 수 있습니다! 
+대부분의 최신 언어 모델은 파라미터당 2바이트를 사용하는 "bfloat16" 정밀도(Precision)로 학습됩니다. 
+하드웨어가 이를 지원하는 경우(Nvidia 30xx/Axxx 이상), 
+`torch_dtype` 파라미터로 위와 같이 `bfloat16` 정밀도(Precision)로 모델을 로드할 수 있습니다.
+
+또한, 16비트보다 더 낮은 정밀도(Precision)로 모델을 압축하는 
+"양자화(quantization)" 방법을 사용할 수도 있습니다. 
+이 방법은 모델의 가중치를 손실 압축하여 각 파라미터를 8비트, 
+4비트 또는 그 이하로 줄일 수 있습니다. 
+특히 4비트에서 모델의 출력이 부정적인 영향을 받을 수 있지만, 
+더 크고 강력한 채팅 모델을 메모리에 올리기 위해 이 같은 트레이드오프를 감수할 가치가 있습니다. 
+이제 `bitsandbytes`를 사용하여 이를 실제로 확인해 보겠습니다:
+
+```python
+from transformers import AutoModelForCausalLM, BitsAndBytesConfig
+
+quantization_config = BitsAndBytesConfig(load_in_8bit=True)  # You can also try load_in_4bit
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", quantization_config=quantization_config)
+```
+
+위의 작업은 `pipeline` API에도 적용 가능합니다:
+
+```python
+from transformers import pipeline, BitsAndBytesConfig
+
+quantization_config = BitsAndBytesConfig(load_in_8bit=True)  # You can also try load_in_4bit
+pipe = pipeline("text-generation", "meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", model_kwargs={"quantization_config": quantization_config})
+```
+
+`bitsandbytes` 외에도 모델을 양자화하는 다양한 방법이 있습니다. 
+자세한 내용은 [Quantization guide](./quantization)를 참조해 주세요.
+
+
+### 성능 고려사항[[performance-considerations]]
+
+<Tip>
+
+언어 모델 성능과 최적화에 대한 보다 자세한 가이드는 [LLM Inference Optimization](./llm_optims)을 참고하세요.
+
+</Tip>
+
+
+일반적으로 더 큰 채팅 모델은 메모리를 더 많이 요구하고, 
+속도도 느려지는 경향이 있습니다. 구체적으로 말하자면, 
+채팅 모델에서 텍스트를 생성할 때는 컴퓨팅 파워보다 **메모리 대역폭**이 병목 현상을 일으키는 경우가 많습니다. 
+이는 모델이 토큰을 하나씩 생성할 때마다 파라미터를 메모리에서 읽어야 하기 때문입니다. 
+따라서 채팅 모델에서 초당 생성할 수 있는 토큰 수는 모델이 위치한 메모리의 대역폭을 모델의 크기로 나눈 값에 비례합니다.
+
+위의 예제에서는 모델이 bfloat16 정밀도(Precision)로 로드될 때 용량이 약 16GB였습니다. 
+이 경우, 모델이 생성하는 각 토큰마다 16GB를 메모리에서 읽어야 한다는 의미입니다. 
+총 메모리 대역폭은 소비자용 CPU에서는 20-100GB/sec, 
+소비자용 GPU나 Intel Xeon, AMD Threadripper/Epyc, 
+애플 실리콘과 같은 특수 CPU에서는 200-900GB/sec, 
+데이터 센터 GPU인 Nvidia A100이나 H100에서는 최대 2-3TB/sec에 이를 수 있습니다. 
+이러한 정보는 각자 하드웨어에서 생성 속도를 예상하는 데 도움이 될 것입니다.
+
+따라서 텍스트 생성 속도를 개선하려면 가장 간단한 방법은 모델의 크기를 줄이거나(주로 양자화를 사용), 
+메모리 대역폭이 더 높은 하드웨어를 사용하는 것입니다. 
+이 대역폭 병목 현상을 피할 수 있는 고급 기술도 여러 가지 있습니다. 
+가장 일반적인 방법은 [보조 생성](https://huggingface.co/blog/assisted-generation), 다른 말로 "추측 샘플링"이라고도 불리는 기술입니다. 
+이 기술은 종종 더 작은 "초안 모델"을 사용하여 여러 개의 미래 토큰을 한 번에 추측한 후, 
+채팅 모델로 생성 결과를 확인합니다.
+만약 채팅 모델이 추측을 확인하면, 한 번의 순전파에서 여러 개의 토큰을 생성할 수 있어 
+병목 현상이 크게 줄어들고 생성 속도가 빨라집니다.
+
+마지막으로, "Mixture of Experts" (MoE) 모델에 대해서도 짚고 넘어가 보도록 합니다. 
+Mixtral, Qwen-MoE, DBRX와 같은 인기 있는 채팅 모델이 바로 MoE 모델입니다. 
+이 모델들은 각 토큰을 생성할 때 모든 파라미터를 사용하지는 않습니다. 
+이로 인해 MoE 모델은 전체 크기가 상당히 클 수 있지만, 
+차지하는 메모리 대역폭은 낮은 편입니다. 
+따라서 동일한 크기의 일반 "조밀한(Dense)" 모델보다 몇 배 빠를 수 있습니다. 
+하지만 보조 생성과 같은 기술은 MoE 모델에서 비효율적일 수 있습니다. 
+새로운 추측된 토큰이 추가되면서 더 많은 파라미터가 활성화되기 때문에, 
+MoE 아키텍처가 제공하는 속도 이점이 상쇄될 수 있습니다.
--- a/docs/source/ko/custom_models.md
+++ b/docs/source/ko/custom_models.md
@ -169,7 +169,7 @@ class ResnetModelForImageClassification(PreTrainedModel):
    def forward(self, tensor, labels=None):
        logits = self.model(tensor)
        if labels is not None:
-            loss = torch.nn.cross_entropy(logits, labels)
+            loss = torch.nn.functional.cross_entropy(logits, labels)
            return {"loss": loss, "logits": logits}
        return {"logits": logits}
 ```
--- a/docs/source/ko/deepspeed.md
+++ b/docs/source/ko/deepspeed.md
--- a/docs/source/ko/main_classes/agent.md
+++ b/docs/source/ko/main_classes/agent.md
@ -0,0 +1,134 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# 에이전트 & 도구 [[agents-tools]]
+
+<Tip warning={true}>
+
+Transformers Agent는 실험 중인 API이므로 언제든지 변경될 수 있습니다. 
+API나 기반 모델이 자주 업데이트되므로, 에이전트가 제공하는 결과물은 달라질 수 있습니다.
+
+</Tip>
+
+에이전트와 도구에 대해 더 알아보려면 [소개 가이드](../transformers_agents)를 꼭 읽어보세요. 
+이 페이지에는 기본 클래스에 대한 API 문서가 포함되어 있습니다.
+
+## 에이전트 [[agents]]
+
+우리는 기본 [`Agent`] 클래스를 기반으로 두 가지 유형의 에이전트를 제공합니다:
+- [`CodeAgent`]는 한 번에 동작합니다. 작업을 해결하기 위해 코드를 생성한 다음, 바로 실행합니다.
+- [`ReactAgent`]는 단계별로 동작하며, 각 단계는 하나의 생각, 하나의 도구 호출 및 실행으로 구성됩니다. 이 에이전트에는 두 가지 클래스가 있습니다:
+  - [`ReactJsonAgent`]는 도구 호출을 JSON으로 작성합니다.
+  - [`ReactCodeAgent`]는 도구 호출을 Python 코드로 작성합니다.
+
+### Agent [[agent]]
+
+[[autodoc]] Agent
+
+### CodeAgent [[codeagent]]
+
+[[autodoc]] CodeAgent
+
+### React agents [[react-agents]]
+
+[[autodoc]] ReactAgent
+
+[[autodoc]] ReactJsonAgent
+
+[[autodoc]] ReactCodeAgent
+
+## Tools [[tools]]
+
+### load_tool [[loadtool]]
+
+[[autodoc]] load_tool
+
+### Tool [[tool]]
+
+[[autodoc]] Tool
+
+### Toolbox [[toolbox]]
+
+[[autodoc]] Toolbox
+
+### PipelineTool [[pipelinetool]]
+
+[[autodoc]] PipelineTool
+
+### launch_gradio_demo [[launchgradiodemo]]
+
+[[autodoc]] launch_gradio_demo
+
+### ToolCollection [[toolcollection]]
+
+[[autodoc]] ToolCollection
+
+## 엔진 [[engines]]
+
+에이전트 프레임워크에서 사용할 수 있는 엔진을 자유롭게 만들고 사용할 수 있습니다.
+이 엔진들은 다음과 같은 사양을 가지고 있습니다:
+1. 입력(`List[Dict[str, str]]`)에 대한 [메시지 형식](../chat_templating.md)을 따르고 문자열을 반환해야 합니다.
+2. 인수 `stop_sequences`로 전달된 시퀀스가 나오기 *전에* 출력 생성을 중지해야 합니다.
+
+### HfEngine [[hfengine]]
+
+편의를 위해, 위의 사항을 구현하고 대규모 언어 모델 실행을 위해 추론 엔드포인트를 사용하는 `HfEngine`을 추가했습니다.
+
+```python
+>>> from transformers import HfEngine
+
+>>> messages = [
+...   {"role": "user", "content": "Hello, how are you?"},
+...   {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
+...   {"role": "user", "content": "No need to help, take it easy."},
+... ]
+
+>>> HfEngine()(messages, stop_sequences=["conversation"])
+
+"That's very kind of you to say! It's always nice to have a relaxed "
+```
+
+[[autodoc]] HfEngine
+
+
+## 에이전트 유형 [[agent-types]]
+
+에이전트는 도구 간의 모든 유형의 객체를 처리할 수 있습니다; 도구는 완전히 멀티모달이므로 텍스트, 이미지, 오디오, 비디오 등 다양한 유형을 수락하고 반환할 수 있습니다. 
+도구 간의 호환성을 높이고 ipython (jupyter, colab, ipython 노트북, ...)에서 이러한 
+반환 값을 올바르게 렌더링하기 위해 이러한 유형을 중심으로 래퍼 클래스를 
+구현합니다.
+
+래핑된 객체는 처음과 동일하게 작동해야 합니다; 텍스트 객체는 여전히 문자열로 작동해야 하며, 
+이미지 객체는 여전히 `PIL.Image`로 작동해야 합니다.
+
+이러한 유형에는 세 가지 특정 목적이 있습니다:
+
+- `to_raw`를 호출하면 기본 객체가 반환되어야 합니다.
+- `to_string`을 호출하면 객체가 문자열로 반환되어야 합니다: 
+`AgentText`의 경우 문자열이 될 수 있지만, 다른 경우에는 객체의 직렬화된 버전의 경로일 수 있습니다.
+- ipython 커널에서 표시할 때 객체가 올바르게 표시되어야 합니다.
+
+### AgentText [[agenttext]]
+
+[[autodoc]] transformers.agents.agent_types.AgentText
+
+### AgentImage [[agentimage]]
+
+[[autodoc]] transformers.agents.agent_types.AgentImage
+
+### AgentAudio [[agentaudio]]
+
+[[autodoc]] transformers.agents.agent_types.AgentAudio
--- a/docs/source/ko/quantization/awq.md
+++ b/docs/source/ko/quantization/awq.md
@ -0,0 +1,233 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# AWQ [[awq]]
+
+<Tip>
+
+이 [노트북](https://colab.research.google.com/drive/1HzZH89yAXJaZgwJDhQj9LqSBux932BvY)으로 AWQ 양자화를 실습해보세요!
+
+</Tip>
+
+[Activation-aware Weight Quantization (AWQ)](https://hf.co/papers/2306.00978)은 모델의 모든 가중치를 양자화하지 않고, LLM 성능에 중요한 가중치를 유지합니다. 이로써 4비트 정밀도로 모델을 실행해도 성능 저하 없이 양자화 손실을 크게 줄일 수 있습니다.
+
+AWQ 알고리즘을 사용하여 모델을 양자화할 수 있는 여러 라이브러리가 있습니다. 예를 들어 [llm-awq](https://github.com/mit-han-lab/llm-awq), [autoawq](https://github.com/casper-hansen/AutoAWQ) , [optimum-intel](https://huggingface.co/docs/optimum/main/en/intel/optimization_inc) 등이 있습니다. Transformers는 llm-awq, autoawq 라이브러리를 이용해 양자화된 모델을 가져올 수 있도록 지원합니다. 이 가이드에서는 autoawq로 양자화된 모델을 가져오는 방법을 보여드리나, llm-awq로 양자화된 모델의 경우도 유사한 절차를 따릅니다.
+
+autoawq가 설치되어 있는지 확인하세요:
+
+```bash
+pip install autoawq
+```
+
+AWQ 양자화된 모델은 해당 모델의 [config.json](https://huggingface.co/TheBloke/zephyr-7B-alpha-AWQ/blob/main/config.json) 파일의 `quantization_config` 속성을 통해 식별할 수 있습니다:
+
+```json
+{
+  "_name_or_path": "/workspace/process/huggingfaceh4_zephyr-7b-alpha/source",
+  "architectures": [
+    "MistralForCausalLM"
+  ],
+  ...
+  ...
+  ...
+  "quantization_config": {
+    "quant_method": "awq",
+    "zero_point": true,
+    "group_size": 128,
+    "bits": 4,
+    "version": "gemm"
+  }
+}
+```
+
+양자화된 모델은 [`~PreTrainedModel.from_pretrained`] 메서드를 사용하여 가져옵니다. 모델을 CPU에 가져왔다면, 먼저 모델을 GPU 장치로 옮겨야 합니다.  `device_map` 파라미터를 사용하여 모델을 배치할 위치를 지정하세요:
+
+```py
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_id = "TheBloke/zephyr-7B-alpha-AWQ"
+model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cuda:0")
+```
+
+AWQ 양자화 모델을 가져오면 자동으로 성능상의 이유로 인해 가중치들의 기본값이 fp16으로 설정됩니다. 가중치를 다른 형식으로 가져오려면, `torch_dtype` 파라미터를 사용하세요:
+
+```py
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_id = "TheBloke/zephyr-7B-alpha-AWQ"
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32)
+```
+
+추론을 더욱 가속화하기 위해 AWQ 양자화와 [FlashAttention-2](../perf_infer_gpu_one#flashattention-2) 를 결합 할 수 있습니다:
+
+```py
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model = AutoModelForCausalLM.from_pretrained("TheBloke/zephyr-7B-alpha-AWQ", attn_implementation="flash_attention_2", device_map="cuda:0")
+```
+
+## 퓨즈된 모듈 [[fused-modules]]
+
+퓨즈된 모듈은 정확도와 성능을 개선합니다. 퓨즈된 모듈은 [Llama](https://huggingface.co/meta-llama) 아키텍처와 [Mistral](https://huggingface.co/mistralai/Mistral-7B-v0.1) 아키텍처의 AWQ모듈에 기본적으로 지원됩니다. 그러나 지원되지 않는 아키텍처에 대해서도 AWQ 모듈을 퓨즈할 수 있습니다.
+
+<Tip warning={true}>
+
+퓨즈된 모듈은 FlashAttention-2와 같은 다른 최적화 기술과 결합할 수 없습니다.
+
+</Tip>
+
+
+<hfoptions id="fuse">
+<hfoption id="supported architectures">
+
+지원되는 아키텍처에서 퓨즈된 모듈을 활성화하려면, [`AwqConfig`] 를 생성하고 매개변수 `fuse_max_seq_len` 과 `do_fuse=True`를 설정해야 합니다.  `fuse_max_seq_len` 매개변수는 전체 시퀀스 길이로, 컨텍스트 길이와 예상 생성 길이를 포함해야 합니다. 안전하게 사용하기 위해 더 큰 값으로 설정할 수 있습니다.
+
+예를 들어, [TheBloke/Mistral-7B-OpenOrca-AWQ](https://huggingface.co/TheBloke/Mistral-7B-OpenOrca-AWQ) 모델의 AWQ 모듈을 퓨즈해보겠습니다.
+
+```python
+import torch
+from transformers import AwqConfig, AutoModelForCausalLM
+
+model_id = "TheBloke/Mistral-7B-OpenOrca-AWQ"
+
+quantization_config = AwqConfig(
+    bits=4,
+    fuse_max_seq_len=512,
+    do_fuse=True,
+)
+
+model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config).to(0)
+```
+
+[TheBloke/Mistral-7B-OpenOrca-AWQ](https://huggingface.co/TheBloke/Mistral-7B-OpenOrca-AWQ) 모델은 퓨즈된 모듈이 있는 경우와 없는 경우 모두 `batch_size=1` 로 성능 평가되었습니다.
+
+<figcaption class="text-center text-gray-500 text-lg">퓨즈되지 않은 모듈</figcaption>
+
+|   배치 크기  |   프리필 길이 |   디코드 길이 |   프리필 토큰/초 |   디코드 토큰/초  |  메모리 (VRAM)   |
+|-------------:|-----------------:|----------------:|-------------------:|------------------:|:----------------|
+|            1 |               32 |              32 |            60.0984 |           38.4537 | 4.50 GB (5.68%) |
+|            1 |               64 |              64 |          1333.67   |           31.6604 | 4.50 GB (5.68%) |
+|            1 |              128 |             128 |          2434.06   |           31.6272 | 4.50 GB (5.68%) |
+|            1 |              256 |             256 |          3072.26   |           38.1731 | 4.50 GB (5.68%) |
+|            1 |              512 |             512 |          3184.74   |           31.6819 | 4.59 GB (5.80%) |
+|            1 |             1024 |            1024 |          3148.18   |           36.8031 | 4.81 GB (6.07%) |
+|            1 |             2048 |            2048 |          2927.33   |           35.2676 | 5.73 GB (7.23%) |
+
+<figcaption class="text-center text-gray-500 text-lg">퓨즈된 모듈</figcaption>
+
+|   배치 크기  |   프리필 길이 |   디코드 길이 |   프리필 토큰/초 |   디코드 토큰/초  |  메모리 (VRAM)   |
+|-------------:|-----------------:|----------------:|-------------------:|------------------:|:----------------|
+|            1 |               32 |              32 |            81.4899 |           80.2569 | 4.00 GB (5.05%) |
+|            1 |               64 |              64 |          1756.1    |          106.26   | 4.00 GB (5.05%) |
+|            1 |              128 |             128 |          2479.32   |          105.631  | 4.00 GB (5.06%) |
+|            1 |              256 |             256 |          1813.6    |           85.7485 | 4.01 GB (5.06%) |
+|            1 |              512 |             512 |          2848.9    |           97.701  | 4.11 GB (5.19%) |
+|            1 |             1024 |            1024 |          3044.35   |           87.7323 | 4.41 GB (5.57%) |
+|            1 |             2048 |            2048 |          2715.11   |           89.4709 | 5.57 GB (7.04%) |
+
+퓨즈된 모듈 및 퓨즈되지 않은 모듈의 속도와 처리량은 [optimum-benchmark](https://github.com/huggingface/optimum-benchmark) 라이브러리를 사용하여 테스트되었습니다.
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/fused_forward_memory_plot.png" alt="forward peak memory per batch size" />
+    <figcaption class="mt-2 text-center text-sm text-gray-500">포워드 피크 메모리 (forward peak memory)/배치 크기</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/fused_generate_throughput_plot.png" alt="generate throughput per batch size" />
+    <figcaption class="mt-2 text-center text-sm text-gray-500">생성 처리량/배치 크기</figcaption>
+  </div>
+</div>
+
+</hfoption>
+<hfoption id="unsupported architectures">
+
+퓨즈된 모듈을 지원하지 않는 아키텍처의 경우, `modules_to_fuse` 매개변수를 사용해 직접 퓨즈 매핑을 만들어 어떤 모듈을 퓨즈할지 정의해야 합니다. 다음은 [TheBloke/Yi-34B-AWQ](https://huggingface.co/TheBloke/Yi-34B-AWQ) 모델의 AWQ 모듈을 퓨즈하는 예입니다.
+
+```python
+import torch
+from transformers import AwqConfig, AutoModelForCausalLM
+
+model_id = "TheBloke/Yi-34B-AWQ"
+
+quantization_config = AwqConfig(
+    bits=4,
+    fuse_max_seq_len=512,
+    modules_to_fuse={
+        "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
+        "layernorm": ["ln1", "ln2", "norm"],
+        "mlp": ["gate_proj", "up_proj", "down_proj"],
+        "use_alibi": False,
+        "num_attention_heads": 56,
+        "num_key_value_heads": 8,
+        "hidden_size": 7168
+    }
+)
+
+model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config).to(0)
+```
+
+ `modules_to_fuse` 매개변수는 다음을 포함해야 합니다:
+
+- `"attention"`: 어텐션 레이어는 다음 순서로 퓨즈하세요: 쿼리 (query), 키 (key), 값 (value), 출력 프로젝션 계층 (output projection layer). 해당 레이어를 퓨즈하지 않으려면 빈 리스트를 전달하세요.
+- `"layernorm"`: 사용자 정의 퓨즈 레이어 정규화로 교체할 레이어 정규화 레이어명. 해당 레이어를 퓨즈하지 않으려면 빈 리스트를 전달하세요.
+- `"mlp"`: 단일 MLP 레이어로 퓨즈할 MLP 레이어 순서 : (게이트 (gate) (덴스(dense), 레이어(layer), 포스트 어텐션(post-attention)) / 위 / 아래 레이어).
+- `"use_alibi"`: 모델이 ALiBi positional embedding을 사용할 경우 설정합니다.
+- `"num_attention_heads"`: 어텐션 헤드 (attention heads)의 수를 설정합니다.
+- `"num_key_value_heads"`: 그룹화 쿼리 어텐션 (GQA)을 구현하는 데 사용되는 키 값 헤드의 수를 설정합니다. `num_key_value_heads=num_attention_heads`로 설정할 경우 모델은 다중 헤드 어텐션 (MHA)을 사용하고, `num_key_value_heads=1`로 설정할 경우 다중 쿼리 어텐션 (MQA)을 사용하며, 그 외의 경우에는 GQA를 사용합니다.
+- `"hidden_size"`: 숨겨진 표현(hidden representations)의 차원을 설정합니다.
+
+</hfoption>
+</hfoptions>
+
+
+
+## ExLlama-v2 서포트 [[exllama-v2-support]]
+
+최신 버전 `autoawq`는 빠른 프리필과 디코딩을 위해 ExLlama-v2 커널을 지원합니다. 시작하기 위해 먼저 최신 버전 `autoawq` 를 설치하세요 : 
+
+```bash
+pip install git+https://github.com/casper-hansen/AutoAWQ.git
+```
+
+매개변수를 `version="exllama"`로 설정해 `AwqConfig()`를 생성하고 모델에 넘겨주세요.
+
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, AwqConfig
+
+quantization_config = AwqConfig(version="exllama")
+
+model = AutoModelForCausalLM.from_pretrained(
+    "TheBloke/Mistral-7B-Instruct-v0.1-AWQ",
+    quantization_config=quantization_config,
+    device_map="auto",
+)
+
+input_ids = torch.randint(0, 100, (1, 128), dtype=torch.long, device="cuda")
+output = model(input_ids)
+print(output.logits)
+
+tokenizer = AutoTokenizer.from_pretrained("TheBloke/Mistral-7B-Instruct-v0.1-AWQ")
+input_ids = tokenizer.encode("How to make a cake", return_tensors="pt").to(model.device)
+output = model.generate(input_ids, do_sample=True, max_length=50, pad_token_id=50256)
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+
+<Tip warning={true}>
+
+이 기능은 AMD GPUs에서 지원됩니다.
+
+</Tip>
--- a/docs/source/ko/tasks/knowledge_distillation_for_image_classification.md
+++ b/docs/source/ko/tasks/knowledge_distillation_for_image_classification.md
@ -0,0 +1,193 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+# 컴퓨터 비전을 위한 지식 증류[[Knowledge-Distillation-for-Computer-Vision]]
+
+[[open-in-colab]]
+
+지식 증류(Knowledge distillation)는 더 크고 복잡한 모델(교사)에서 더 작고 간단한 모델(학생)로 지식을 전달하는 기술입니다. 한 모델에서 다른 모델로 지식을 증류하기 위해, 특정 작업(이 경우 이미지 분류)에 대해 학습된 사전 훈련된 교사 모델을 사용하고, 랜덤으로 초기화된 학생 모델을 이미지 분류 작업에 대해 학습합니다. 그다음, 학생 모델이 교사 모델의 출력을 모방하여 두 모델의 출력 차이를 최소화하도록 훈련합니다. 이 기법은 Hinton 등 연구진의 [Distilling the Knowledge in a Neural Network](https://arxiv.org/abs/1503.02531)에서 처음 소개되었습니다. 이 가이드에서는 특정 작업에 맞춘 지식 증류를 수행할 것입니다. 이번에는 [beans dataset](https://huggingface.co/datasets/beans)을 사용할 것입니다.
+
+이 가이드는 [미세 조정된 ViT 모델](https://huggingface.co/merve/vit-mobilenet-beans-224) (교사 모델)을 [MobileNet](https://huggingface.co/google/mobilenet_v2_1.4_224) (학생 모델)으로 증류하는 방법을 🤗 Transformers의 [Trainer API](https://huggingface.co/docs/transformers/en/main_classes/trainer#trainer) 를 사용하여 보여줍니다.
+
+증류와 과정 평가를 위해 필요한 라이브러리를 설치해 봅시다.
+
+
+```bash
+pip install transformers datasets accelerate tensorboard evaluate --upgrade
+```
+
+이 예제에서는 `merve/beans-vit-224` 모델을 교사 모델로 사용하고 있습니다. 이 모델은 beans 데이터셋에서 파인 튜닝된 `google/vit-base-patch16-224-in21k` 기반의 이미지 분류 모델입니다. 이 모델을 무작위로 초기화된 MobileNetV2로 증류해볼 것입니다.
+
+이제 데이터셋을 로드하겠습니다.
+
+```python
+from datasets import load_dataset
+
+dataset = load_dataset("beans")
+```
+
+이 경우 두 모델의 이미지 프로세서가 동일한 해상도로 동일한 출력을 반환하기 때문에, 둘 중 어느 모델의 이미지 프로세서를 사용해도 됩니다. 데이터셋의 모든 분할에 전처리를 적용하기 위해 `dataset`의 `map()` 메소드를 사용할 것입니다.
+
+
+```python
+from transformers import AutoImageProcessor
+teacher_processor = AutoImageProcessor.from_pretrained("merve/beans-vit-224")
+
+def process(examples):
+    processed_inputs = teacher_processor(examples["image"])
+    return processed_inputs
+
+processed_datasets = dataset.map(process, batched=True)
+```
+
+학생 모델(무작위로 초기화된 MobileNet)이 교사 모델(파인 튜닝된 비전 트랜스포머)을 모방하도록 할 것입니다. 이를 위해 먼저 교사와 학생 모델의 로짓 출력값을 구합니다. 그런 다음 각 출력값을 매개변수 `temperature` 값으로 나누는데, 이 매개변수는 각 소프트 타겟의 중요도를 조절하는 역할을 합니다. 매개변수 `lambda`는 증류 손실의 중요도에 가중치를 줍니다. 이 예제에서는 `temperature=5`와 `lambda=0.5`를 사용할 것입니다. 학생과 교사 간의 발산을 계산하기 위해 Kullback-Leibler Divergence 손실을 사용합니다. 두 분포 P와 Q가 주어졌을 때, KL Divergence는 Q를 사용하여 P를 표현하는 데 얼마만큼의 추가 정보가 필요한지를 말해줍니다. 두 분포가 동일하다면, KL Divergence는 0이며, Q로 P를 설명하는 데 추가 정보가 필요하지 않음을 의미합니다. 따라서 지식 증류의 맥락에서 KL Divergence는 유용합니다.
+
+
+```python
+from transformers import TrainingArguments, Trainer
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class ImageDistilTrainer(Trainer):
+    def __init__(self, teacher_model=None, student_model=None, temperature=None, lambda_param=None,  *args, **kwargs):
+        super().__init__(model=student_model, *args, **kwargs)
+        self.teacher = teacher_model
+        self.student = student_model
+        self.loss_function = nn.KLDivLoss(reduction="batchmean")
+        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.teacher.to(device)
+        self.teacher.eval()
+        self.temperature = temperature
+        self.lambda_param = lambda_param
+
+    def compute_loss(self, student, inputs, return_outputs=False):
+        student_output = self.student(**inputs)
+
+        with torch.no_grad():
+          teacher_output = self.teacher(**inputs)
+
+        #  교사와 학생의 소프트 타겟(soft targets) 계산
+
+        soft_teacher = F.softmax(teacher_output.logits / self.temperature, dim=-1)
+        soft_student = F.log_softmax(student_output.logits / self.temperature, dim=-1)
+
+        # 손실(loss) 계산
+        distillation_loss = self.loss_function(soft_student, soft_teacher) * (self.temperature ** 2)
+
+        # 실제 레이블 손실 계산
+        student_target_loss = student_output.loss
+
+        # 최종 손실 계산
+        loss = (1. - self.lambda_param) * student_target_loss + self.lambda_param * distillation_loss
+        return (loss, student_output) if return_outputs else loss
+```
+
+이제 Hugging Face Hub에 로그인하여 `Trainer`를 통해 Hugging Face Hub에 모델을 푸시할 수 있도록 하겠습니다.
+
+
+```python
+from huggingface_hub import notebook_login
+
+notebook_login()
+```
+
+이제 `TrainingArguments`, 교사 모델과 학생 모델을 설정하겠습니다.
+
+
+```python
+from transformers import AutoModelForImageClassification, MobileNetV2Config, MobileNetV2ForImageClassification
+
+training_args = TrainingArguments(
+    output_dir="my-awesome-model",
+    num_train_epochs=30,
+    fp16=True,
+    logging_dir=f"{repo_name}/logs",
+    logging_strategy="epoch",
+    eval_strategy="epoch",
+    save_strategy="epoch",
+    load_best_model_at_end=True,
+    metric_for_best_model="accuracy",
+    report_to="tensorboard",
+    push_to_hub=True,
+    hub_strategy="every_save",
+    hub_model_id=repo_name,
+    )
+
+num_labels = len(processed_datasets["train"].features["labels"].names)
+
+# 모델 초기화
+teacher_model = AutoModelForImageClassification.from_pretrained(
+    "merve/beans-vit-224",
+    num_labels=num_labels,
+    ignore_mismatched_sizes=True
+)
+
+# MobileNetV2 밑바닥부터 학습
+student_config = MobileNetV2Config()
+student_config.num_labels = num_labels
+student_model = MobileNetV2ForImageClassification(student_config)
+```
+
+`compute_metrics` 함수를 사용하여 테스트 세트에서 모델을 평가할 수 있습니다. 이 함수는 훈련 과정에서 모델의 `accuracy`를 계산하는 데 사용됩니다.
+
+
+```python
+import evaluate
+import numpy as np
+
+accuracy = evaluate.load("accuracy")
+
+def compute_metrics(eval_pred):
+    predictions, labels = eval_pred
+    acc = accuracy.compute(references=labels, predictions=np.argmax(predictions, axis=1))
+    return {"accuracy": acc["accuracy"]}
+```
+
+정의한 훈련 인수로 `Trainer`를 초기화해봅시다. 또한 데이터 콜레이터(data collator)를 초기화하겠습니다.
+
+```python
+from transformers import DefaultDataCollator
+
+data_collator = DefaultDataCollator()
+trainer = ImageDistilTrainer(
+    student_model=student_model,
+    teacher_model=teacher_model,
+    args=training_args,
+    train_dataset=processed_datasets["train"],
+    eval_dataset=processed_datasets["validation"],
+    data_collator=data_collator,
+    tokenizer=teacher_processor,
+    compute_metrics=compute_metrics,
+    temperature=5,
+    lambda_param=0.5
+)
+```
+
+이제 모델을 훈련할 수 있습니다.
+
+```python
+trainer.train()
+```
+
+모델을 테스트 세트에서 평가할 수 있습니다.
+
+```python
+trainer.evaluate(processed_datasets["test"])
+```
+
+
+테스트 세트에서 모델의 정확도는 72%에 도달했습니다. 증류의 효율성을 검증하기 위해 동일한 하이퍼파라미터로 beans 데이터셋에서 MobileNet을 처음부터 훈련하였고, 테스트 세트에서의 정확도는 63% 였습니다. 다양한 사전 훈련된 교사 모델, 학생 구조, 증류 매개변수를 시도해보시고 결과를 보고하기를 권장합니다. 증류된 모델의 훈련 로그와 체크포인트는 [이 저장소](https://huggingface.co/merve/vit-mobilenet-beans-224)에서 찾을 수 있으며, 처음부터 훈련된 MobileNetV2는 이 [저장소](https://huggingface.co/merve/resnet-mobilenet-beans-5)에서 찾을 수 있습니다.
--- a/docs/source/pt/custom_models.md
+++ b/docs/source/pt/custom_models.md
@ -173,7 +173,7 @@ class ResnetModelForImageClassification(PreTrainedModel):
    def forward(self, tensor, labels=None):
        logits = self.model(tensor)
        if labels is not None:
-            loss = torch.nn.cross_entropy(logits, labels)
+            loss = torch.nn.functional.cross_entropy(logits, labels)
            return {"loss": loss, "logits": logits}
        return {"logits": logits}
 ```
--- a/docs/source/zh/custom_models.md
+++ b/docs/source/zh/custom_models.md
@ -154,7 +154,7 @@ class ResnetModelForImageClassification(PreTrainedModel):
    def forward(self, tensor, labels=None):
        logits = self.model(tensor)
        if labels is not None:
-            loss = torch.nn.cross_entropy(logits, labels)
+            loss = torch.nn.functional.cross_entropy(logits, labels)
            return {"loss": loss, "logits": logits}
        return {"logits": logits}
 ```
--- a/docs/source/zh/internal/generation_utils.md
+++ b/docs/source/zh/internal/generation_utils.md
@ -133,9 +133,6 @@ generation_output[:2]
 [[autodoc]] ForcedEOSTokenLogitsProcessor
    - __call__

-[[autodoc]] ForceTokensLogitsProcessor
-    - __call__
-
 [[autodoc]] HammingDiversityLogitsProcessor
    - __call__

@ -151,9 +148,6 @@ generation_output[:2]
 [[autodoc]] LogitsProcessorList
    - __call__

-[[autodoc]] LogitsWarper
-    - __call__
-
 [[autodoc]] MinLengthLogitsProcessor
    - __call__

--- a/examples/README.md
+++ b/examples/README.md
@ -104,7 +104,7 @@ for running remotely as well. You can easily customize the example used, command
 and type of compute hardware, and then run the script to automatically launch the example.

 You can refer to
-[hardware setup](https://runhouse-docs.readthedocs-hosted.com/en/latest/api/python/cluster.html#hardware-setup)
+[hardware setup](https://www.run.house/docs/tutorials/quick-start-cloud)
 for more information about hardware and dependency setup with Runhouse, or this
 [Colab tutorial](https://colab.research.google.com/drive/1sh_aNQzJX5BKAdNeXthTNGxKz7sM9VPc) for a more in-depth
 walkthrough.
--- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py
+++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
@ -544,7 +544,7 @@ def main():
                completed_steps += 1

            if isinstance(checkpointing_steps, int):
-                if completed_steps % checkpointing_steps == 0:
+                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                    output_dir = f"step_{completed_steps}"
                    if args.output_dir is not None:
                        output_dir = os.path.join(args.output_dir, output_dir)
--- a/examples/pytorch/image-pretraining/run_mim_no_trainer.py
+++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py
@ -723,7 +723,7 @@ def main():
                completed_steps += 1

            if isinstance(checkpointing_steps, int):
-                if completed_steps % checkpointing_steps == 0:
+                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                    output_dir = f"step_{completed_steps}"
                    if args.output_dir is not None:
                        output_dir = os.path.join(args.output_dir, output_dir)
--- a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py
+++ b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py
@ -639,7 +639,7 @@ def main():
                completed_steps += 1

            if isinstance(checkpointing_steps, int):
-                if completed_steps % checkpointing_steps == 0:
+                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                    output_dir = f"step_{completed_steps}"
                    if args.output_dir is not None:
                        output_dir = os.path.join(args.output_dir, output_dir)
--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@ -638,7 +638,7 @@ def main():
                completed_steps += 1

            if isinstance(checkpointing_steps, int):
-                if completed_steps % checkpointing_steps == 0:
+                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                    output_dir = f"step_{completed_steps}"
                    if args.output_dir is not None:
                        output_dir = os.path.join(args.output_dir, output_dir)
--- a/examples/pytorch/language-modeling/run_fim_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_fim_no_trainer.py
@ -838,7 +838,7 @@ def main():
                completed_steps += 1

            if isinstance(checkpointing_steps, int):
-                if completed_steps % checkpointing_steps == 0:
+                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                    output_dir = f"step_{completed_steps}"
                    if args.output_dir is not None:
                        output_dir = os.path.join(args.output_dir, output_dir)
--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@ -675,7 +675,7 @@ def main():
                completed_steps += 1

            if isinstance(checkpointing_steps, int):
-                if completed_steps % checkpointing_steps == 0:
+                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                    output_dir = f"step_{completed_steps}"
                    if args.output_dir is not None:
                        output_dir = os.path.join(args.output_dir, output_dir)
--- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py
+++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
@ -619,7 +619,7 @@ def main():
                completed_steps += 1

            if isinstance(checkpointing_steps, int):
-                if completed_steps % checkpointing_steps == 0:
+                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                    output_dir = f"step_{completed_steps}"
                    if args.output_dir is not None:
                        output_dir = os.path.join(args.output_dir, output_dir)
--- a/examples/pytorch/object-detection/run_object_detection_no_trainer.py
+++ b/examples/pytorch/object-detection/run_object_detection_no_trainer.py
@ -677,7 +677,7 @@ def main():
                completed_steps += 1

            if isinstance(checkpointing_steps, int):
-                if completed_steps % checkpointing_steps == 0:
+                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                    output_dir = f"step_{completed_steps}"
                    if args.output_dir is not None:
                        output_dir = os.path.join(args.output_dir, output_dir)
--- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
@ -879,7 +879,7 @@ def main():
                completed_steps += 1

            if isinstance(checkpointing_steps, int):
-                if completed_steps % checkpointing_steps == 0:
+                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                    accelerator.save_state(f"step_{completed_steps}")

            if completed_steps >= args.max_train_steps:
--- a/examples/pytorch/question-answering/run_qa_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_no_trainer.py
@ -894,7 +894,7 @@ def main():
                completed_steps += 1

            if isinstance(checkpointing_steps, int):
-                if completed_steps % checkpointing_steps == 0:
+                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                    output_dir = f"step_{completed_steps}"
                    if args.output_dir is not None:
                        output_dir = os.path.join(args.output_dir, output_dir)
--- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
+++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
@ -516,7 +516,7 @@ def main():
                completed_steps += 1

            if isinstance(checkpointing_steps, int):
-                if completed_steps % checkpointing_steps == 0:
+                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                    output_dir = f"step_{completed_steps}"
                    if args.output_dir is not None:
                        output_dir = os.path.join(args.output_dir, output_dir)
--- a/examples/pytorch/summarization/run_summarization_no_trainer.py
+++ b/examples/pytorch/summarization/run_summarization_no_trainer.py
@ -688,7 +688,7 @@ def main():
                completed_steps += 1

            if isinstance(checkpointing_steps, int):
-                if completed_steps % checkpointing_steps == 0:
+                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                    output_dir = f"step_{completed_steps}"
                    if args.output_dir is not None:
                        output_dir = os.path.join(args.output_dir, output_dir)
--- a/examples/pytorch/text-classification/run_glue_no_trainer.py
+++ b/examples/pytorch/text-classification/run_glue_no_trainer.py
@ -564,7 +564,7 @@ def main():
                completed_steps += 1

            if isinstance(checkpointing_steps, int):
-                if completed_steps % checkpointing_steps == 0:
+                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                    output_dir = f"step_{completed_steps}"
                    if args.output_dir is not None:
                        output_dir = os.path.join(args.output_dir, output_dir)
--- a/examples/pytorch/token-classification/run_ner_no_trainer.py
+++ b/examples/pytorch/token-classification/run_ner_no_trainer.py
@ -722,7 +722,7 @@ def main():
                completed_steps += 1

            if isinstance(checkpointing_steps, int):
-                if completed_steps % checkpointing_steps == 0:
+                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                    output_dir = f"step_{completed_steps}"
                    if args.output_dir is not None:
                        output_dir = os.path.join(args.output_dir, output_dir)
--- a/examples/pytorch/translation/run_translation_no_trainer.py
+++ b/examples/pytorch/translation/run_translation_no_trainer.py
@ -664,7 +664,7 @@ def main():
                completed_steps += 1

            if isinstance(checkpointing_steps, int):
-                if completed_steps % checkpointing_steps == 0:
+                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                    output_dir = f"step_{completed_steps}"
                    if args.output_dir is not None:
                        output_dir = os.path.join(args.output_dir, output_dir)
--- a/examples/research_projects/decision_transformer/requirements.txt
+++ b/examples/research_projects/decision_transformer/requirements.txt
@ -1,5 +1,5 @@
 absl-py==1.0.0
-aiohttp==3.9.4
+aiohttp==3.10.2
 aiosignal==1.2.0
 alembic==1.7.7
 appdirs==1.4.4
@ -115,7 +115,7 @@ mujoco-py==2.1.2.14
 multidict==6.0.2
 multiprocess==0.70.12.2
 mypy-extensions==0.4.3
-nltk==3.7
+nltk==3.9
 numba==0.55.1
 numpy==1.22.3
 oauthlib==3.2.2
@ -205,7 +205,7 @@ tensorboard==2.8.0
 tensorboard-data-server==0.6.1
 tensorboard-plugin-wit==1.8.1
 tensorboardX==2.5
-tensorflow==2.11.1
+tensorflow==2.12.1
 tensorflow-io-gcs-filesystem==0.24.0
 termcolor==1.1.0
 text-unidecode==1.3
--- a/examples/research_projects/jax-projects/big_bird/evaluate.py
+++ b/examples/research_projects/jax-projects/big_bird/evaluate.py
@ -94,7 +94,6 @@ def main():

    short_validation_dataset = dataset.filter(lambda x: (len(x["question"]) + len(x["context"])) < 4 * 4096)
    short_validation_dataset = short_validation_dataset.filter(lambda x: x["category"] != "null")
-    short_validation_dataset

    model_id = "vasudevgupta/flax-bigbird-natural-questions"
    model = FlaxBigBirdForNaturalQuestions.from_pretrained(model_id)
--- a/examples/research_projects/jax-projects/hybrid_clip/requirements.txt
+++ b/examples/research_projects/jax-projects/hybrid_clip/requirements.txt
@ -3,6 +3,6 @@ jaxlib>=0.1.59
 flax>=0.3.5
 optax>=0.0.8
 -f https://download.pytorch.org/whl/torch_stable.html
-torch==1.13.1 
+torch==2.2.0 
 -f https://download.pytorch.org/whl/torch_stable.html
 torchvision==0.10.0+cpu
--- a/examples/research_projects/visual_bert/requirements.txt
+++ b/examples/research_projects/visual_bert/requirements.txt
@ -84,7 +84,7 @@ six==1.14.0
 terminado==0.8.3
 testpath==0.4.4
 tokenizers==0.8.1rc2
-torch==1.13.1
+torch==2.2.0
 torchvision==0.7.0
 tornado==6.4.1
 tqdm==4.66.3
--- a/i18n/README_ar.md
+++ b/i18n/README_ar.md
@ -0,0 +1,317 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<p align="center">
+  <picture>
+    <source media="(prefers-color-scheme: dark)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-dark.svg">
+    <source media="(prefers-color-scheme: light)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg">
+    <img alt="Hugging Face Transformers Library" src="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg" width="352" height="59" style="max-width: 100%;">
+  </picture>
+  <br/>
+  <br/>
+</p>
+
+<p align="center">
+    <a href="https://circleci.com/gh/huggingface/transformers"><img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main"></a>
+    <a href="https://github.com/huggingface/transformers/blob/main/LICENSE"><img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue"></a>
+    <a href="https://huggingface.co/docs/transformers/index"><img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/docs/transformers/index.svg?down_color=red&down_message=offline&up_message=online"></a>
+    <a href="https://github.com/huggingface/transformers/releases"><img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/transformers.svg"></a>
+    <a href="https://github.com/huggingface/transformers/blob/main/CODE_OF_CONDUCT.md"><img alt="Contributor Covenant" src="https://img.shields.io/badge/Contributor%20Covenant-v2.0%20adopted-ff69b4.svg"></a>
+    <a href="https://zenodo.org/badge/latestdoi/155220641"><img src="https://zenodo.org/badge/155220641.svg" alt="DOI"></a>
+</p>
+
+<h4 align="center">
+    <p>
+        <a href="https://github.com/huggingface/transformers/blob/main/README.md">English</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_zh-hans.md">简体中文</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_zh-hant.md">繁體中文</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ko.md">한국어</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_es.md">Español</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ja.md">日本語</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_hd.md">हिन्दी</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ru.md">Русский</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_pt-br.md">Português</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_te.md">తెలుగు</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_fr.md">Français</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md">Tiếng Việt</a> |
+        <b>العربية</b> |
+    </p>
+</h4>
+
+<h3 align="center">
+    <p>أحدث تقنيات التعلم الآلي لـ JAX وPyTorch وTensorFlow</p>
+</h3>
+
+<h3 align="center">
+    <a href="https://hf.co/course"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/course_banner.png"></a>
+</h3>
+
+يوفر 🤗 Transformers آلاف النماذج المُدربة مسبقًا لأداء المهام على طرائق مختلفة مثل النص والصورة والصوت.
+
+يمكن تطبيق هذه النماذج على:
+
+* 📝 النص، لمهام مثل تصنيف النص واستخراج المعلومات والرد على الأسئلة والتلخيص والترجمة وتوليد النص، في أكثر من 100 لغة.
+* 🖼️ الصور، لمهام مثل تصنيف الصور وكشف الأشياء والتجزئة.
+* 🗣️ الصوت، لمهام مثل التعرف على الكلام وتصنيف الصوت.
+
+يمكن لنماذج المحول أيضًا أداء مهام على **طرائق متعددة مجتمعة**، مثل الرد على الأسئلة الجدولية والتعرف البصري على الحروف واستخراج المعلومات من المستندات الممسوحة ضوئيًا وتصنيف الفيديو والرد على الأسئلة المرئية.
+
+يوفر 🤗 Transformers واجهات برمجة التطبيقات (APIs) لتحميل تلك النماذج المُدربة مسبقًا واستخدامها على نص معين، وضبطها بدقة على مجموعات البيانات الخاصة بك، ثم مشاركتها مع المجتمع على [مركز النماذج](https://huggingface.co/models) الخاص بنا. وفي الوقت نفسه، فإن كل وحدة نمطية Python التي تحدد بنية هي وحدة مستقلة تمامًا ويمكن تعديلها لتمكين تجارب البحث السريعة.
+
+يتم دعم 🤗 Transformers بواسطة مكتبات التعلم العميق الثلاث الأكثر شيوعًا - [Jax](https://jax.readthedocs.io/en/latest/) و [PyTorch](https://pytorch.org/) و [TensorFlow](https://www.tensorflow.org/) - مع تكامل سلس بينها. من السهل تدريب نماذجك باستخدام واحدة قبل تحميلها للاستنتاج باستخدام الأخرى.
+
+## العروض التوضيحية عبر الإنترنت
+
+يمكنك اختبار معظم نماذجنا مباشرة على صفحاتها من [مركز النماذج](https://huggingface.co/models). كما نقدم [استضافة النماذج الخاصة وإصداراتها وواجهة برمجة تطبيقات الاستدلال](https://huggingface.co/pricing) للنماذج العامة والخاصة.
+
+فيما يلي بعض الأمثلة:
+
+في معالجة اللغات الطبيعية:
+- [استكمال الكلمات المقنعة باستخدام BERT](https://huggingface.co/google-bert/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
+- [التعرف على الكيانات المسماة باستخدام إليكترا](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
+- [توليد النص باستخدام ميسترال](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)
+- [الاستدلال اللغوي الطبيعي باستخدام RoBERTa](https://huggingface.co/FacebookAI/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
+- [التلخيص باستخدام BART](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
+- [الرد على الأسئلة باستخدام DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
+- [الترجمة باستخدام T5](https://huggingface.co/google-t5/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)
+
+في رؤية الكمبيوتر:
+- [تصنيف الصور باستخدام ViT](https://huggingface.co/google/vit-base-patch16-224)
+- [كشف الأشياء باستخدام DETR](https://huggingface.co/facebook/detr-resnet-50)
+- [التجزئة الدلالية باستخدام SegFormer](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512)
+- [التجزئة الشاملة باستخدام Mask2Former](https://huggingface.co/facebook/mask2former-swin-large-coco-panoptic)
+- [تقدير العمق باستخدام Depth Anything](https://huggingface.co/docs/transformers/main/model_doc/depth_anything)
+- [تصنيف الفيديو باستخدام VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)
+- [التجزئة الشاملة باستخدام OneFormer](https://huggingface.co/shi-labs/oneformer_ade20k_dinat_large)
+
+في الصوت:
+- [التعرف التلقائي على الكلام باستخدام Whisper](https://huggingface.co/openai/whisper-large-v3)
+- [اكتشاف الكلمات الرئيسية باستخدام Wav2Vec2](https://huggingface.co/superb/wav2vec2-base-superb-ks)
+- [تصنيف الصوت باستخدام محول طيف الصوت](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593)
+
+في المهام متعددة الطرائق:
+- [الرد على الأسئلة الجدولية باستخدام TAPAS](https://huggingface.co/google/tapas-base-finetuned-wtq)
+- [الرد على الأسئلة المرئية باستخدام ViLT](https://huggingface.co/dandelin/vilt-b32-finetuned-vqa)
+- [وصف الصورة باستخدام LLaVa](https://huggingface.co/llava-hf/llava-1.5-7b-hf)
+- [تصنيف الصور بدون تدريب باستخدام SigLIP](https://huggingface.co/google/siglip-so400m-patch14-384)
+- [الرد على أسئلة المستندات باستخدام LayoutLM](https://huggingface.co/impira/layoutlm-document-qa)
+- [تصنيف الفيديو بدون تدريب باستخدام X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)
+- [كشف الأشياء بدون تدريب باستخدام OWLv2](https://huggingface.co/docs/transformers/en/model_doc/owlv2)
+- [تجزئة الصور بدون تدريب باستخدام CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)
+- [توليد الأقنعة التلقائي باستخدام SAM](https://huggingface.co/docs/transformers/model_doc/sam)
+
+
+## 100 مشروع يستخدم المحولات
+
+🤗 Transformers هو أكثر من مجرد مجموعة أدوات لاستخدام النماذج المُدربة مسبقًا: إنه مجتمع من المشاريع المبنية حوله ومركز Hugging Face. نريد أن يمكّن 🤗 Transformers المطورين والباحثين والطلاب والأساتذة والمهندسين وأي شخص آخر من بناء مشاريعهم التي يحلمون بها.
+
+للاحتفال بالـ 100,000 نجمة من النماذج المحولة، قررنا تسليط الضوء على المجتمع، وقد أنشأنا صفحة [awesome-transformers](./awesome-transformers.md) التي تُدرج 100 مشروعًا رائعًا تم بناؤها بالقرب من النماذج المحولة.
+
+إذا كنت تمتلك أو تستخدم مشروعًا تعتقد أنه يجب أن يكون جزءًا من القائمة، فالرجاء فتح PR لإضافته!
+
+## إذا كنت تبحث عن دعم مخصص من فريق Hugging Face
+
+<a target="_blank" href="https://huggingface.co/support">
+    <img alt="HuggingFace Expert Acceleration Program" src="https://cdn-media.huggingface.co/marketing/transformers/new-support-improved.png" style="max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);">
+</a><br>
+
+## جولة سريعة
+
+لاستخدام نموذج على الفور على إدخال معين (نص أو صورة أو صوت، ...)، نوفر واجهة برمجة التطبيقات (API) الخاصة بـ `pipeline`. تجمع خطوط الأنابيب بين نموذج مُدرب مسبقًا والمعالجة المسبقة التي تم استخدامها أثناء تدريب هذا النموذج. فيما يلي كيفية استخدام خط أنابيب بسرعة لتصنيف النصوص الإيجابية مقابل السلبية:
+
+```python
+>>> from transformers import pipeline
+
+# خصص خط أنابيب للتحليل الشعوري
+>>> classifier = pipeline('sentiment-analysis')
+>>> classifier('We are very happy to introduce pipeline to the transformers repository.')
+[{'label': 'POSITIVE', 'score': 0.9996980428695679}]
+```
+
+يسمح السطر الثاني من التعليمات البرمجية بتحميل النموذج المُدرب مسبقًا الذي يستخدمه خط الأنابيب وتخزينه مؤقتًا، بينما يقوم السطر الثالث بتقييمه على النص المحدد. هنا، تكون الإجابة "إيجابية" بثقة تبلغ 99.97%.
+
+تتوفر العديد من المهام على خط أنابيب مُدرب مسبقًا جاهز للاستخدام، في NLP ولكن أيضًا في رؤية الكمبيوتر والكلام. على سبيل المثال، يمكننا بسهولة استخراج الأشياء المكتشفة في صورة:
+
+``` python
+>>> import requests
+>>> from PIL import Image
+>>> from transformers import pipeline
+
+# قم بتنزيل صورة بها قطط لطيفة
+>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png"
+>>> image_data = requests.get(url, stream=True).raw
+>>> image = Image.open(image_data)
+
+# خصص خط أنابيب لكشف الأشياء
+>>> object_detector = pipeline('object-detection')
+>>> object_detector(image)
+[{'score': 0.9982201457023621,
+  'label': 'remote',
+  'box': {'xmin': 40, 'ymin': 70, 'xmax': 175, 'ymax': 117}},
+ {'score': 0.9960021376609802,
+  'label': 'remote',
+  'box': {'xmin': 333, 'ymin': 72, 'xmax': 368, 'ymax': 187}},
+ {'score': 0.9954745173454285,
+  'label': 'couch',
+  'box': {'xmin': 0, 'ymin': 1, 'xmax': 639, 'ymax': 473}},
+ {'score': 0.9988006353378296,
+  'label': 'cat',
+  'box': {'xmin': 13, 'ymin': 52, 'xmax': 314, 'ymax': 470}},
+ {'score': 0.9986783862113953,
+  'label': 'cat',
+  'box': {'xmin': 345, 'ymin': 23, 'xmax': 640, 'ymax': 368}}]
+```
+
+هنا، نحصل على قائمة بالأشياء المكتشفة في الصورة، مع مربع يحيط بالشيء وتقييم الثقة. فيما يلي الصورة الأصلية على اليسار، مع عرض التوقعات على اليمين:
+
+<h3 align="center">
+    <a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png" width="400"></a>
+    <a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample_post_processed.png" width="400"></a>
+</h3>
+
+يمكنك معرفة المزيد حول المهام التي تدعمها واجهة برمجة التطبيقات (API) الخاصة بـ `pipeline` في [هذا البرنامج التعليمي](https://huggingface.co/docs/transformers/task_summary).
+
+بالإضافة إلى `pipeline`، لاستخدام أي من النماذج المُدربة مسبقًا على مهمتك، كل ما عليك هو ثلاثة أسطر من التعليمات البرمجية. فيما يلي إصدار PyTorch:
+```python
+>>> from transformers import AutoTokenizer, AutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+>>> model = AutoModel.from_pretrained("google-bert/bert-base-uncased")
+
+>>> inputs = tokenizer("Hello world!", return_tensors="pt")
+>>> outputs = model(**inputs)
+```
+
+وهنا رمز مماثل لـ TensorFlow:
+```python
+>>> from transformers import AutoTokenizer, TFAutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+>>> model = TFAutoModel.from_pretrained("google-bert/bert-base-uncased")
+
+>>> inputs = tokenizer("Hello world!", return_tensors="tf")
+>>> outputs = model(**inputs)
+```
+
+المُعلم مسؤول عن جميع المعالجة المسبقة التي يتوقعها النموذج المُدرب مسبقًا ويمكن استدعاؤه مباشرة على سلسلة واحدة (كما هو موضح في الأمثلة أعلاه) أو قائمة. سيقوم بإخراج قاموس يمكنك استخدامه في التعليمات البرمجية لأسفل أو تمريره مباشرة إلى نموذجك باستخدام عامل فك التعبئة **.
+
+النموذج نفسه هو وحدة نمطية عادية [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) أو [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) (حسب backend) والتي يمكنك استخدامها كالمعتاد. [يوضح هذا البرنامج التعليمي](https://huggingface.co/docs/transformers/training) كيفية دمج مثل هذا النموذج في حلقة تدريب PyTorch أو TensorFlow التقليدية، أو كيفية استخدام واجهة برمجة تطبيقات `Trainer` لدينا لضبطها بدقة بسرعة على مجموعة بيانات جديدة.
+
+## لماذا يجب أن أستخدم المحولات؟
+
+1. نماذج سهلة الاستخدام وحديثة:
+    - أداء عالي في فهم اللغة الطبيعية وتوليدها ورؤية الكمبيوتر والمهام الصوتية.
+    - حاجز دخول منخفض للمربين والممارسين.
+    - عدد قليل من التجريدات التي يواجهها المستخدم مع ثلاث فئات فقط للتعلم.
+    - واجهة برمجة تطبيقات (API) موحدة لاستخدام جميع نماذجنا المُدربة مسبقًا.
+
+1. تكاليف الكمبيوتر أقل، وبصمة كربونية أصغر:
+    - يمكن للباحثين مشاركة النماذج المدربة بدلاً من إعادة التدريب دائمًا.
+    - يمكن للممارسين تقليل وقت الكمبيوتر وتكاليف الإنتاج.
+    - عشرات البنيات مع أكثر من 400,000 نموذج مُدرب مسبقًا عبر جميع الطرائق.
+
+1. اختر الإطار المناسب لكل جزء من عمر النموذج:
+    - تدريب النماذج الحديثة في 3 أسطر من التعليمات البرمجية.
+    - قم بنقل نموذج واحد بين إطارات TF2.0/PyTorch/JAX حسب الرغبة.
+    - اختر الإطار المناسب بسلاسة للتدريب والتقييم والإنتاج.
+
+1. قم بسهولة بتخصيص نموذج أو مثال وفقًا لاحتياجاتك:
+    - نوفر أمثلة لكل بنية لإعادة إنتاج النتائج التي نشرها مؤلفوها الأصليون.
+    - يتم عرض داخليات النموذج بشكل متسق قدر الإمكان.
+    - يمكن استخدام ملفات النموذج بشكل مستقل عن المكتبة للتجارب السريعة.
+
+## لماذا لا يجب أن أستخدم المحولات؟
+
+- ليست هذه المكتبة عبارة عن مجموعة أدوات من الصناديق المكونة للشبكات العصبية. لم يتم إعادة صياغة التعليمات البرمجية في ملفات النموذج باستخدام تجريدات إضافية عن قصد، بحيث يمكن للباحثين إجراء حلقات تكرار سريعة على كل من النماذج دون الغوص في تجريدات/ملفات إضافية.
+- لا يُقصد بواجهة برمجة التطبيقات (API) للتدريب العمل على أي نموذج ولكنه مُستَهدف للعمل مع النماذج التي توفرها المكتبة. للحلقات العامة للتعلم الآلي، يجب استخدام مكتبة أخرى (ربما، [تسريع](https://huggingface.co/docs/accelerate)).
+- في حين أننا نسعى جاهدين لتقديم أكبر عدد ممكن من حالات الاستخدام، فإن البرامج النصية الموجودة في مجلد [الأمثلة](https://github.com/huggingface/transformers/tree/main/examples) الخاص بنا هي مجرد أمثلة. من المتوقع ألا تعمل هذه البرامج النصية خارج الصندوق على مشكلتك المحددة وأنه سيُطلب منك تغيير بضع أسطر من التعليمات البرمجية لتكييفها مع احتياجاتك.
+
+## التثبيت
+
+### باستخدام pip
+
+تم اختبار هذا المستودع على Python 3.8+، Flax 0.4.1+، PyTorch 1.11+، و TensorFlow 2.6+.
+
+يجب تثبيت 🤗 Transformers في [بيئة افتراضية](https://docs.python.org/3/library/venv.html). إذا كنت غير معتاد على البيئات الافتراضية Python، فراجع [دليل المستخدم](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).
+
+أولاً، قم بإنشاء بيئة افتراضية بالإصدار Python الذي تنوي استخدامه وقم بتنشيطه.
+
+بعد ذلك، ستحتاج إلى تثبيت واحدة على الأقل من Flax أو PyTorch أو TensorFlow.
+يرجى الرجوع إلى [صفحة تثبيت TensorFlow](https://www.tensorflow.org/install/)، و [صفحة تثبيت PyTorch](https://pytorch.org/get-started/locally/#start-locally) و/أو [صفحة تثبيت Flax](https://github.com/google/flax#quick-install) و [صفحة تثبيت Jax](https://github.com/google/jax#installation) بشأن أمر التثبيت المحدد لمنصتك.
+
+عندما يتم تثبيت إحدى هذه المكتبات الخلفية، يمكن تثبيت 🤗 Transformers باستخدام pip كما يلي:
+
+```bash
+pip install transformers
+```
+
+إذا كنت ترغب في اللعب مع الأمثلة أو تحتاج إلى أحدث إصدار من التعليمات البرمجية ولا يمكنك الانتظار حتى يتم إصدار إصدار جديد، فيجب [تثبيت المكتبة من المصدر](https://huggingface.co/docs/transformers/installation#installing-from-source).
+
+### باستخدام conda
+
+يمكن تثبيت 🤗 Transformers باستخدام conda كما يلي:
+
+```shell script
+conda install conda-forge::transformers
+```
+
+> **_ملاحظة:_** تم إيقاف تثبيت `transformers` من قناة `huggingface`.
+
+اتبع صفحات التثبيت الخاصة بـ Flax أو PyTorch أو TensorFlow لمعرفة كيفية تثبيتها باستخدام conda.
+
+> **_ملاحظة:_**  على Windows، قد تتم مطالبتك بتنشيط وضع المطور للاستفادة من التخزين المؤقت. إذا لم يكن هذا خيارًا بالنسبة لك، فيرجى إعلامنا بذلك في [هذه المشكلة](https://github.com/huggingface/huggingface_hub/issues/1062).
+
+## بنيات النماذج
+
+**[جميع نقاط تفتيش النموذج](https://huggingface.co/models)** التي يوفرها 🤗 Transformers مدمجة بسلاسة من مركز [huggingface.co](https://huggingface.co/models) [model hub](https://huggingface.co/models)، حيث يتم تحميلها مباشرة من قبل [المستخدمين](https://huggingface.co/users) و [المنظمات](https://huggingface.co/organizations).
+
+عدد نقاط التفتيش الحالية: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
+
+يوفر 🤗 Transformers حاليًا البنيات التالية: راجع [هنا](https://huggingface.co/docs/transformers/model_summary) للحصول على ملخص لكل منها.
+
+للتحقق مما إذا كان لكل نموذج تنفيذ في Flax أو PyTorch أو TensorFlow، أو كان لديه مُعلم مرفق مدعوم من مكتبة 🤗 Tokenizers، يرجى الرجوع إلى [هذا الجدول](https://huggingface.co/docs/transformers/index#supported-frameworks).
+
+تم اختبار هذه التطبيقات على العديد من مجموعات البيانات (راجع البرامج النصية المثالية) ويجب أن تتطابق مع أداء التنفيذ الأصلي. يمكنك العثور على مزيد من التفاصيل حول الأداء في قسم الأمثلة من [الوثائق](https://github.com/huggingface/transformers/tree/main/examples).
+
+
+## تعلم المزيد
+
+| القسم | الوصف |
+|-|-|
+| [وثائق](https://huggingface.co/docs/transformers/) | وثائق واجهة برمجة التطبيقات (API) الكاملة والبرامج التعليمية |
+| [ملخص المهام](https://huggingface.co/docs/transformers/task_summary) | المهام التي يدعمها 🤗 Transformers |
+| [برنامج تعليمي لمعالجة مسبقة](https://huggingface.co/docs/transformers/preprocessing) | استخدام فئة `Tokenizer` لإعداد البيانات للنماذج |
+| [التدريب والضبط الدقيق](https://huggingface.co/docs/transformers/training) | استخدام النماذج التي يوفرها 🤗 Transformers في حلقة تدريب PyTorch/TensorFlow وواجهة برمجة تطبيقات `Trainer` |
+| [جولة سريعة: البرامج النصية للضبط الدقيق/الاستخدام](https://github.com/huggingface/transformers/tree/main/examples) | البرامج النصية المثالية للضبط الدقيق للنماذج على مجموعة واسعة من المهام |
+| [مشاركة النماذج وتحميلها](https://huggingface.co/docs/transformers/model_sharing) | تحميل ومشاركة نماذجك المضبوطة بدقة مع المجتمع |
+
+## الاستشهاد
+
+لدينا الآن [ورقة](https://www.aclweb.org/anthology/2020.emnlp-demos.6/) يمكنك الاستشهاد بها لمكتبة 🤗 Transformers:
+```bibtex
+@inproceedings{wolf-etal-2020-transformers,
+    title = "Transformers: State-of-the-Art Natural Language Processing",
+    author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and R{\'e}mi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush",
+    booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
+    month = oct,
+    year = "2020",
+    address = "Online",
+    publisher = "Association for Computational Linguistics",
+    url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
+    pages = "38--45"
+}
+```
--- a/i18n/README_de.md
+++ b/i18n/README_de.md
@ -48,6 +48,7 @@ limitations under the License.
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_fr.md">Français</a> |
        <b>Deutsch</b> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md">Tiếng Việt</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
    </p>
 </h4>

--- a/i18n/README_es.md
+++ b/i18n/README_es.md
@ -43,6 +43,7 @@ limitations under the License.
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_fr.md">Français</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md">Tiếng Việt</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
    </p>
 </h4>

--- a/i18n/README_fr.md
+++ b/i18n/README_fr.md
@ -48,6 +48,7 @@ limitations under the License.
        <b>Français</b> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md">Tiếng Việt</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
    </p>
 </h4>

--- a/i18n/README_hd.md
+++ b/i18n/README_hd.md
@ -68,6 +68,7 @@ checkpoint: जाँच बिंदु
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_fr.md">Français</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md">Tiếng Việt</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
    </p>
 </h4>

--- a/i18n/README_ja.md
+++ b/i18n/README_ja.md
@ -78,6 +78,7 @@ user: ユーザ
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_fr.md">Français</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md">Tiếng Việt</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
    </p>
 </h4>

--- a/i18n/README_ko.md
+++ b/i18n/README_ko.md
@ -43,6 +43,8 @@ limitations under the License.
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_fr.md">Français</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md">Tiếng Việt</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
+
    </p>
 </h4>

--- a/i18n/README_pt-br.md
+++ b/i18n/README_pt-br.md
@ -48,6 +48,7 @@ limitations under the License.
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_fr.md">Français</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md">Tiếng Việt</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
    </p>
 </h4>

--- a/i18n/README_ru.md
+++ b/i18n/README_ru.md
@ -48,6 +48,7 @@ limitations under the License.
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_fr.md">Français</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md">Tiếng Việt</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
    <p>
 </h4>

--- a/i18n/README_te.md
+++ b/i18n/README_te.md
@ -50,6 +50,7 @@ limitations under the License.
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_fr.md">Français</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md">Tiếng Việt</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
    </p>
 </h4>

--- a/i18n/README_vi.md
+++ b/i18n/README_vi.md
@ -48,6 +48,7 @@ limitations under the License.
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_fr.md">Français</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
        <b>Tiếng việt</b> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
    </p>
 </h4>

--- a/i18n/README_zh-hans.md
+++ b/i18n/README_zh-hans.md
@ -68,6 +68,7 @@ checkpoint: 检查点
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_fr.md">Français</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md">Tiếng Việt</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
    </p>
 </h4>

--- a/i18n/README_zh-hant.md
+++ b/i18n/README_zh-hant.md
@ -80,6 +80,7 @@ user: 使用者
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_fr.md">Français</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md">Tiếng Việt</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
    </p>
 </h4>

--- a/pyproject.toml
+++ b/pyproject.toml
@ -30,8 +30,8 @@ skip-magic-trailing-comma = false
 line-ending = "auto"

 [tool.pytest.ini_options]
+addopts = "--doctest-glob='**/*.md'"
 doctest_optionflags="NUMBER NORMALIZE_WHITESPACE ELLIPSIS"
-doctest_glob="**/*.md"
 markers = [
    "flash_attn_test: marks tests related to flash attention (deselect with '-m \"not flash_attn_test\"')",
    "bitsandbytes: select (or deselect with `not`) bitsandbytes integration tests",
--- a/setup.py
+++ b/setup.py
@ -96,7 +96,7 @@ if stale_egg_info.exists():
 # 2. once modified, run: `make deps_table_update` to update src/transformers/dependency_versions_table.py
 _deps = [
    "Pillow>=10.0.1,<=15.0",
-    "accelerate>=0.21.0",
+    "accelerate>=0.26.0",
    "av==9.2.0",  # Latest version of PyAV (10.0.0) has issues with audio stream.
    "beautifulsoup4",
    "codecarbon==1.2.0",
@ -137,7 +137,7 @@ _deps = [
    "onnxruntime-tools>=1.4.2",
    "onnxruntime>=1.4.0",
    "opencv-python",
-    "optimum-benchmark>=0.2.0",
+    "optimum-benchmark>=0.3.0",
    "optuna",
    "optax>=0.0.8,<=0.1.4",
    "packaging>=20.0",
--- a/src/transformers/init.py
+++ b/src/transformers/init.py
@ -312,6 +312,7 @@ _import_structure = {
        "CTRLTokenizer",
    ],
    "models.cvt": ["CvtConfig"],
+    "models.dac": ["DacConfig", "DacFeatureExtractor"],
    "models.data2vec": [
        "Data2VecAudioConfig",
        "Data2VecTextConfig",
@ -416,6 +417,7 @@ _import_structure = {
    "models.ernie": ["ErnieConfig"],
    "models.esm": ["EsmConfig", "EsmTokenizer"],
    "models.falcon": ["FalconConfig"],
+    "models.falcon_mamba": ["FalconMambaConfig"],
    "models.fastspeech2_conformer": [
        "FastSpeech2ConformerConfig",
        "FastSpeech2ConformerHifiGanConfig",
@ -459,6 +461,7 @@ _import_structure = {
    "models.gpt_neox_japanese": ["GPTNeoXJapaneseConfig"],
    "models.gpt_sw3": [],
    "models.gptj": ["GPTJConfig"],
+    "models.granite": ["GraniteConfig"],
    "models.grounding_dino": [
        "GroundingDinoConfig",
        "GroundingDinoProcessor",
@ -661,6 +664,10 @@ _import_structure = {
        "Qwen2AudioProcessor",
    ],
    "models.qwen2_moe": ["Qwen2MoeConfig"],
+    "models.qwen2_vl": [
+        "Qwen2VLConfig",
+        "Qwen2VLProcessor",
+    ],
    "models.rag": ["RagConfig", "RagRetriever", "RagTokenizer"],
    "models.recurrent_gemma": ["RecurrentGemmaConfig"],
    "models.reformer": ["ReformerConfig"],
@ -929,6 +936,7 @@ _import_structure = {
        "is_tokenizers_available",
        "is_torch_available",
        "is_torch_mlu_available",
+        "is_torch_musa_available",
        "is_torch_neuroncore_available",
        "is_torch_npu_available",
        "is_torch_tpu_available",
@ -947,6 +955,7 @@ _import_structure = {
        "GPTQConfig",
        "HqqConfig",
        "QuantoConfig",
+        "TorchAoConfig",
    ],
 }

@ -1185,6 +1194,7 @@ else:
    _import_structure["models.pix2struct"].extend(["Pix2StructImageProcessor"])
    _import_structure["models.poolformer"].extend(["PoolFormerFeatureExtractor", "PoolFormerImageProcessor"])
    _import_structure["models.pvt"].extend(["PvtImageProcessor"])
+    _import_structure["models.qwen2_vl"].extend(["Qwen2VLImageProcessor"])
    _import_structure["models.rt_detr"].extend(["RTDetrImageProcessor"])
    _import_structure["models.sam"].extend(["SamImageProcessor"])
    _import_structure["models.segformer"].extend(["SegformerFeatureExtractor", "SegformerImageProcessor"])
@ -1272,7 +1282,6 @@ else:
            "ExponentialDecayLengthPenalty",
            "ForcedBOSTokenLogitsProcessor",
            "ForcedEOSTokenLogitsProcessor",
-            "ForceTokensLogitsProcessor",
            "GenerationMixin",
            "HammingDiversityLogitsProcessor",
            "InfNanRemoveLogitsProcessor",
@ -1570,10 +1579,13 @@ else:
    _import_structure["models.blip_2"].extend(
        [
            "Blip2ForConditionalGeneration",
+            "Blip2ForImageTextRetrieval",
            "Blip2Model",
            "Blip2PreTrainedModel",
            "Blip2QFormerModel",
+            "Blip2TextModelWithProjection",
            "Blip2VisionModel",
+            "Blip2VisionModelWithProjection",
        ]
    )
    _import_structure["models.bloom"].extend(
@ -1754,6 +1766,12 @@ else:
            "CvtPreTrainedModel",
        ]
    )
+    _import_structure["models.dac"].extend(
+        [
+            "DacModel",
+            "DacPreTrainedModel",
+        ]
+    )
    _import_structure["models.data2vec"].extend(
        [
            "Data2VecAudioForAudioFrameClassification",
@ -2138,6 +2156,13 @@ else:
            "FalconPreTrainedModel",
        ]
    )
+    _import_structure["models.falcon_mamba"].extend(
+        [
+            "FalconMambaForCausalLM",
+            "FalconMambaModel",
+            "FalconMambaPreTrainedModel",
+        ]
+    )
    _import_structure["models.fastspeech2_conformer"].extend(
        [
            "FastSpeech2ConformerHifiGan",
@ -2301,6 +2326,13 @@ else:
            "GPTJPreTrainedModel",
        ]
    )
+    _import_structure["models.granite"].extend(
+        [
+            "GraniteForCausalLM",
+            "GraniteModel",
+            "GranitePreTrainedModel",
+        ]
+    )
    _import_structure["models.grounding_dino"].extend(
        [
            "GroundingDinoForObjectDetection",
@ -3001,6 +3033,13 @@ else:
            "Qwen2MoePreTrainedModel",
        ]
    )
+    _import_structure["models.qwen2_vl"].extend(
+        [
+            "Qwen2VLForConditionalGeneration",
+            "Qwen2VLModel",
+            "Qwen2VLPreTrainedModel",
+        ]
+    )
    _import_structure["models.rag"].extend(
        [
            "RagModel",
@ -4577,6 +4616,13 @@ else:
            "FlaxCLIPVisionPreTrainedModel",
        ]
    )
+    _import_structure["models.dinov2"].extend(
+        [
+            "FlaxDinov2Model",
+            "FlaxDinov2ForImageClassification",
+            "FlaxDinov2PreTrainedModel",
+        ]
+    )
    _import_structure["models.distilbert"].extend(
        [
            "FlaxDistilBertForMaskedLM",
@ -5009,6 +5055,10 @@ if TYPE_CHECKING:
        CTRLTokenizer,
    )
    from .models.cvt import CvtConfig
+    from .models.dac import (
+        DacConfig,
+        DacFeatureExtractor,
+    )
    from .models.data2vec import (
        Data2VecAudioConfig,
        Data2VecTextConfig,
@ -5127,6 +5177,7 @@ if TYPE_CHECKING:
    from .models.ernie import ErnieConfig
    from .models.esm import EsmConfig, EsmTokenizer
    from .models.falcon import FalconConfig
+    from .models.falcon_mamba import FalconMambaConfig
    from .models.fastspeech2_conformer import (
        FastSpeech2ConformerConfig,
        FastSpeech2ConformerHifiGanConfig,
@ -5173,6 +5224,7 @@ if TYPE_CHECKING:
        GPTNeoXJapaneseConfig,
    )
    from .models.gptj import GPTJConfig
+    from .models.granite import GraniteConfig
    from .models.grounding_dino import (
        GroundingDinoConfig,
        GroundingDinoProcessor,
@ -5396,6 +5448,10 @@ if TYPE_CHECKING:
        Qwen2AudioProcessor,
    )
    from .models.qwen2_moe import Qwen2MoeConfig
+    from .models.qwen2_vl import (
+        Qwen2VLConfig,
+        Qwen2VLProcessor,
+    )
    from .models.rag import RagConfig, RagRetriever, RagTokenizer
    from .models.recurrent_gemma import RecurrentGemmaConfig
    from .models.reformer import ReformerConfig
@ -5697,6 +5753,7 @@ if TYPE_CHECKING:
        is_tokenizers_available,
        is_torch_available,
        is_torch_mlu_available,
+        is_torch_musa_available,
        is_torch_neuroncore_available,
        is_torch_npu_available,
        is_torch_tpu_available,
@ -5717,6 +5774,7 @@ if TYPE_CHECKING:
        GPTQConfig,
        HqqConfig,
        QuantoConfig,
+        TorchAoConfig,
    )

    try:
@ -5948,6 +6006,7 @@ if TYPE_CHECKING:
            PoolFormerImageProcessor,
        )
        from .models.pvt import PvtImageProcessor
+        from .models.qwen2_vl import Qwen2VLImageProcessor
        from .models.rt_detr import RTDetrImageProcessor
        from .models.sam import SamImageProcessor
        from .models.segformer import SegformerFeatureExtractor, SegformerImageProcessor
@ -6028,7 +6087,6 @@ if TYPE_CHECKING:
            ExponentialDecayLengthPenalty,
            ForcedBOSTokenLogitsProcessor,
            ForcedEOSTokenLogitsProcessor,
-            ForceTokensLogitsProcessor,
            GenerationMixin,
            HammingDiversityLogitsProcessor,
            InfNanRemoveLogitsProcessor,
@ -6281,10 +6339,13 @@ if TYPE_CHECKING:
        )
        from .models.blip_2 import (
            Blip2ForConditionalGeneration,
+            Blip2ForImageTextRetrieval,
            Blip2Model,
            Blip2PreTrainedModel,
            Blip2QFormerModel,
+            Blip2TextModelWithProjection,
            Blip2VisionModel,
+            Blip2VisionModelWithProjection,
        )
        from .models.bloom import (
            BloomForCausalLM,
@ -6430,6 +6491,10 @@ if TYPE_CHECKING:
            CvtModel,
            CvtPreTrainedModel,
        )
+        from .models.dac import (
+            DacModel,
+            DacPreTrainedModel,
+        )
        from .models.data2vec import (
            Data2VecAudioForAudioFrameClassification,
            Data2VecAudioForCTC,
@ -6739,6 +6804,11 @@ if TYPE_CHECKING:
            FalconModel,
            FalconPreTrainedModel,
        )
+        from .models.falcon_mamba import (
+            FalconMambaForCausalLM,
+            FalconMambaModel,
+            FalconMambaPreTrainedModel,
+        )
        from .models.fastspeech2_conformer import (
            FastSpeech2ConformerHifiGan,
            FastSpeech2ConformerModel,
@ -6877,6 +6947,11 @@ if TYPE_CHECKING:
            GPTJModel,
            GPTJPreTrainedModel,
        )
+        from .models.granite import (
+            GraniteForCausalLM,
+            GraniteModel,
+            GranitePreTrainedModel,
+        )
        from .models.grounding_dino import (
            GroundingDinoForObjectDetection,
            GroundingDinoModel,
@ -7419,6 +7494,11 @@ if TYPE_CHECKING:
            Qwen2MoeModel,
            Qwen2MoePreTrainedModel,
        )
+        from .models.qwen2_vl import (
+            Qwen2VLForConditionalGeneration,
+            Qwen2VLModel,
+            Qwen2VLPreTrainedModel,
+        )
        from .models.rag import (
            RagModel,
            RagPreTrainedModel,
@ -8688,6 +8768,11 @@ if TYPE_CHECKING:
            FlaxCLIPVisionModel,
            FlaxCLIPVisionPreTrainedModel,
        )
+        from .models.dinov2 import (
+            FlaxDinov2ForImageClassification,
+            FlaxDinov2Model,
+            FlaxDinov2PreTrainedModel,
+        )
        from .models.distilbert import (
            FlaxDistilBertForMaskedLM,
            FlaxDistilBertForMultipleChoice,
--- a/src/transformers/cache_utils.py
+++ b/src/transformers/cache_utils.py
@ -977,13 +977,14 @@ class StaticCache(Cache):
    Parameters:
        config (`PretrainedConfig`):
            The configuration file defining the shape-related attributes required to initialize the static cache.
-        max_batch_size (`int`):
-            The maximum batch size with which the model will be used.
+        batch_size (`int`):
+            The batch size with which the model will be used. Note that a new instance must be instantiated if a
+            smaller batch size is used. If you are manually setting the batch size, make sure to take into account the number of beams if you are running beam search.
        max_cache_len (`int`):
            The maximum sequence length with which the model will be used.
-        device (`torch.device`):
+        device (`torch.device` or `str`):
            The device on which the cache should be initialized. Should be the same as the layer.
-        dtype (*optional*, defaults to `torch.float32`):
+        dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
            The default `dtype` to use when initializing the layer.

    Example:
@ -999,22 +1000,37 @@ class StaticCache(Cache):
        >>> # Prepare a cache class and pass it to model's forward
        >>> # Leave empty space for 10 new tokens, which can be used when calling forward iteratively 10 times to generate
        >>> max_generated_length = inputs.input_ids.shape[1] + 10
-        >>> past_key_values = StaticCache(config=model.config, max_batch_size=1, max_cache_len=max_generated_length, device=model.device, dtype=model.dtype)
+        >>> past_key_values = StaticCache(config=model.config, batch_size=1, max_cache_len=max_generated_length, device=model.device, dtype=model.dtype)
        >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
        >>> past_kv_length = outputs.past_key_values # access cache filled with key/values from generation
        ```
    """

-    def __init__(self, config: PretrainedConfig, max_batch_size: int, max_cache_len: int, device, dtype=None) -> None:
+    # TODO (joao): remove `=None` in non-optional arguments in v4.46. Remove from `OBJECTS_TO_IGNORE` as well.
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        batch_size: int = None,
+        max_cache_len: int = None,
+        device: torch.device = None,
+        dtype: torch.dtype = torch.float32,
+        max_batch_size: Optional[int] = None,
+    ) -> None:
        super().__init__()
-        self.max_batch_size = max_batch_size
+        if max_batch_size is not None:
+            logger.warning_once(
+                f"The 'max_batch_size' argument of {self.__class__.__name__} is deprecated and will be removed in "
+                "v4.46. Use the more precisely named 'batch_size' argument instead."
+            )
+
+        self.batch_size = batch_size or max_batch_size
        self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len
        # Some model define a custom `head_dim` != config.hidden_size // config.num_attention_heads
        self.head_dim = (
            config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads
        )

-        self.dtype = dtype if dtype is not None else torch.float32
+        self.dtype = dtype
        self.num_key_value_heads = (
            config.num_attention_heads
            if getattr(config, "num_key_value_heads", None) is None
@ -1024,7 +1040,7 @@ class StaticCache(Cache):
        self.key_cache: List[torch.Tensor] = []
        self.value_cache: List[torch.Tensor] = []
        # Note: There will be significant perf decrease if switching to use 5D tensors instead.
-        cache_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim)
+        cache_shape = (self.batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim)
        for idx in range(config.num_hidden_layers):
            new_layer_key_cache = torch.zeros(cache_shape, dtype=self.dtype, device=device)
            new_layer_value_cache = torch.zeros(cache_shape, dtype=self.dtype, device=device)
@ -1069,6 +1085,8 @@ class StaticCache(Cache):
            A tuple containing the updated key and value states.
        """
        cache_position = cache_kwargs.get("cache_position")
+        self.key_cache[layer_idx] = self.key_cache[layer_idx].to(device=key_states.device)
+        self.value_cache[layer_idx] = self.value_cache[layer_idx].to(device=value_states.device)
        k_out = self.key_cache[layer_idx]
        v_out = self.value_cache[layer_idx]

@ -1080,8 +1098,6 @@ class StaticCache(Cache):
            # `tensor[:, :, index] = tensor`, but the first one is compile-friendly and it does explicitly an in-place
            # operation, that avoids copies and uses less memory.
            try:
-                # If using several devices (e.g.: multiple GPUs), we need to ensure everything is on the same one
-                cache_position.to(device=k_out.device)
                k_out.index_copy_(2, cache_position, key_states)
                v_out.index_copy_(2, cache_position, value_states)
            except NotImplementedError:
@ -1130,13 +1146,14 @@ class SlidingWindowCache(StaticCache):
    Parameters:
        config (`PretrainedConfig`):
            The configuration file defining the shape-related attributes required to initialize the static cache.
-        max_batch_size (`int`):
-            The maximum batch size with which the model will be used.
+        batch_size (`int`):
+            The batch size with which the model will be used. Note that a new instance must be instantiated if a
+            smaller batch size is used.
        max_cache_len (`int`):
            The maximum sequence length with which the model will be used.
-        device (`torch.device`):
+        device (`torch.device` or `str`):
            The device on which the cache should be initialized. Should be the same as the layer.
-        dtype (*optional*, defaults to `torch.float32`):
+        dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
            The default `dtype` to use when initializing the layer.

    Example:
@ -1152,13 +1169,22 @@ class SlidingWindowCache(StaticCache):
        >>> # Prepare a cache class and pass it to model's forward
        >>> # Leave empty space for 10 new tokens, which can be used when calling forward iteratively 10 times to generate
        >>> max_generated_length = inputs.input_ids.shape[1] + 10
-        >>> past_key_values = SlidingWindowCache(config=model.config, max_batch_size=1, max_cache_len=max_generated_length, device=model.device, dtype=model.dtype)
+        >>> past_key_values = SlidingWindowCache(config=model.config, batch_size=1, max_cache_len=max_generated_length, device=model.device, dtype=model.dtype)
        >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
        >>> past_kv_length = outputs.past_key_values # access cache filled with key/values from generation
        ```
    """

-    def __init__(self, config: PretrainedConfig, max_batch_size: int, max_cache_len: int, device, dtype=None) -> None:
+    # TODO (joao): remove `=None` in non-optional arguments in v4.46. Remove from `OBJECTS_TO_IGNORE` as well.
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        batch_size: int = None,
+        max_cache_len: int = None,
+        device: torch.device = None,
+        dtype: torch.dtype = torch.float32,
+        max_batch_size: Optional[int] = None,
+    ) -> None:
        super().__init__()
        if not hasattr(config, "sliding_window") or config.sliding_window is None:
            raise ValueError(
@ -1168,7 +1194,12 @@ class SlidingWindowCache(StaticCache):
            )
        max_cache_len = min(config.sliding_window, max_cache_len)
        super().__init__(
-            config=config, max_batch_size=max_batch_size, max_cache_len=max_cache_len, device=device, dtype=dtype
+            config=config,
+            batch_size=batch_size,
+            max_cache_len=max_cache_len,
+            device=device,
+            dtype=dtype,
+            max_batch_size=max_batch_size,
        )

    def update(
@ -1407,13 +1438,14 @@ class HybridCache(Cache):
    Parameters:
        config (`PretrainedConfig):
            The configuration file defining the shape-related attributes required to initialize the static cache.
-        max_batch_size (`int`):
-            The maximum batch size with which the model will be used.
+        batch_size (`int`):
+            The batch size with which the model will be used. Note that a new instance must be instantiated if a
+            smaller batch size is used.
        max_cache_len (`int`):
            The maximum sequence length with which the model will be used.
-        device (`torch.device`, *optional*, defaults to `"cpu"`):
+        device (`torch.device` or `str`, *optional*, defaults to `"cpu"`):
            The device on which the cache should be initialized. Should be the same as the layer.
-        dtype (*optional*, defaults to `torch.float32`):
+        dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
            The default `dtype` to use when initializing the layer.

    Example:
@ -1429,14 +1461,28 @@ class HybridCache(Cache):
        >>> # Prepare a cache class and pass it to model's forward
        >>> # Leave empty space for 10 new tokens, which can be used when calling forward iteratively 10 times to generate
        >>> max_generated_length = inputs.input_ids.shape[1] + 10
-        >>> past_key_values = HybridCache(config=model.config, max_batch_size=1, max_cache_len=max_generated_length, device=model.device, dtype=model.dtype)
+        >>> past_key_values = HybridCache(config=model.config, batch_size=1, max_cache_len=max_generated_length, device=model.device, dtype=model.dtype)
        >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
        >>> past_kv_length = outputs.past_key_values # access cache filled with key/values from generation
        ```
    """

-    def __init__(self, config: PretrainedConfig, max_batch_size, max_cache_len, device="cpu", dtype=None) -> None:
+    # TODO (joao): remove `=None` in non-optional arguments in v4.46. Remove from `OBJECTS_TO_IGNORE` as well.
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        batch_size: int = None,
+        max_cache_len: int = None,
+        device: Union[torch.device, str] = "cpu",
+        dtype: torch.dtype = torch.float32,
+        max_batch_size: Optional[int] = None,
+    ) -> None:
        super().__init__()
+        if max_batch_size is not None:
+            logger.warning_once(
+                f"The 'max_batch_size' argument of {self.__class__.__name__} is deprecated and will be removed in "
+                "v4.46. Use the more precisely named 'batch_size' argument instead."
+            )
        if not hasattr(config, "sliding_window") or config.sliding_window is None:
            raise ValueError(
                "Setting `cache_implementation` to 'sliding_window' requires the model config supporting "
@ -1444,13 +1490,13 @@ class HybridCache(Cache):
                "config and it's not set to None."
            )
        self.max_cache_len = max_cache_len
-        self.max_batch_size = max_batch_size
+        self.batch_size = batch_size or max_batch_size
        # Some model define a custom `head_dim` != config.hidden_size // config.num_attention_heads
        self.head_dim = (
            config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads
        )

-        self.dtype = dtype if dtype is not None else torch.float32
+        self.dtype = dtype
        self.num_key_value_heads = (
            config.num_attention_heads if config.num_key_value_heads is None else config.num_key_value_heads
        )
@ -1459,9 +1505,9 @@ class HybridCache(Cache):
        )
        self.key_cache: List[torch.Tensor] = []
        self.value_cache: List[torch.Tensor] = []
-        global_cache_shape = (max_batch_size, self.num_key_value_heads, max_cache_len, self.head_dim)
+        global_cache_shape = (self.batch_size, self.num_key_value_heads, max_cache_len, self.head_dim)
        sliding_cache_shape = (
-            max_batch_size,
+            self.batch_size,
            self.num_key_value_heads,
            min(config.sliding_window, max_cache_len),
            self.head_dim,
@ -1564,11 +1610,12 @@ class MambaCache:
    Arguments:
        config (`PretrainedConfig):
            The configuration file defining the shape-related attributes required to initialize the static cache.
-        max_batch_size (`int`):
-            The maximum batch size with which the model will be used.
-        dtype (*optional*, defaults to `torch.float16`):
+        batch_size (`int`):
+            The batch size with which the model will be used. Note that a new instance must be instantiated if a
+            smaller batch size is used.
+        dtype (`torch.dtype`, *optional*, defaults to `torch.float16`):
            The default `dtype` to use when initializing the layer.
-        device (`torch.device`, *optional*):
+        device (`torch.device` or `str`, *optional*):
            The device on which the cache should be initialized. Should be the same as the layer.

    Attributes:
@ -1596,29 +1643,35 @@ class MambaCache:
        >>> inputs = tokenizer(text="My name is Mamba", return_tensors="pt")

        >>> # Prepare a cache class and pass it to model's forward
-        >>> past_key_values = MambaCache(config=model.config, max_batch_size=1, device=model.device, dtype=model.dtype)
+        >>> past_key_values = MambaCache(config=model.config, batch_size=1, device=model.device, dtype=model.dtype)
        >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
        >>> past_kv = outputs.past_key_values
        ```
    """

+    # TODO (joao): remove `=None` in non-optional arguments in v4.46. Remove from `OBJECTS_TO_IGNORE` as well.
    def __init__(
        self,
        config: PretrainedConfig,
-        max_batch_size: int,
+        batch_size: int = None,
        dtype: torch.dtype = torch.float16,
-        device: Optional[str] = None,
-        **kwargs,
+        device: Optional[Union[torch.device, str]] = None,
+        max_batch_size: Optional[int] = None,
    ):
+        if max_batch_size is not None:
+            logger.warning_once(
+                f"The 'max_batch_size' argument of {self.__class__.__name__} is deprecated and will be removed in "
+                "v4.46. Use the more precisely named 'batch_size' argument instead."
+            )
        self.dtype = dtype
-        self.max_batch_size = max_batch_size
+        self.batch_size = batch_size or max_batch_size
        self.intermediate_size = config.intermediate_size
        self.ssm_state_size = config.state_size
        self.conv_kernel_size = config.conv_kernel

        self.conv_states: torch.Tensor = torch.zeros(
            config.num_hidden_layers,
-            self.max_batch_size,
+            self.batch_size,
            self.intermediate_size,
            self.conv_kernel_size,
            device=device,
@ -1626,7 +1679,7 @@ class MambaCache:
        )
        self.ssm_states: torch.Tensor = torch.zeros(
            config.num_hidden_layers,
-            self.max_batch_size,
+            self.batch_size,
            self.intermediate_size,
            self.ssm_state_size,
            device=device,
--- a/src/transformers/commands/pt_to_tf.py
+++ b/src/transformers/commands/pt_to_tf.py
@ -12,45 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import inspect
 import os
 from argparse import ArgumentParser, Namespace
-from importlib import import_module

-import huggingface_hub
-import numpy as np
-from packaging import version
-
-from .. import (
-    FEATURE_EXTRACTOR_MAPPING,
-    IMAGE_PROCESSOR_MAPPING,
-    PROCESSOR_MAPPING,
-    TOKENIZER_MAPPING,
-    AutoConfig,
-    AutoFeatureExtractor,
-    AutoImageProcessor,
-    AutoProcessor,
-    AutoTokenizer,
-    is_datasets_available,
-    is_tf_available,
-    is_torch_available,
-)
-from ..utils import TF2_WEIGHTS_INDEX_NAME, TF2_WEIGHTS_NAME, logging
+from ..utils import logging
 from . import BaseTransformersCLICommand


-if is_tf_available():
-    import tensorflow as tf
-
-    tf.config.experimental.enable_tensor_float_32_execution(False)
-
-if is_torch_available():
-    import torch
-
-if is_datasets_available():
-    from datasets import load_dataset
-
-
 MAX_ERROR = 5e-5  # larger error tolerance than in our internal tests, to avoid flaky user-facing errors


@ -136,44 +104,6 @@ class PTtoTFCommand(BaseTransformersCLICommand):
        )
        train_parser.set_defaults(func=convert_command_factory)

-    @staticmethod
-    def find_pt_tf_differences(pt_outputs, tf_outputs):
-        """
-        Compares the TensorFlow and PyTorch outputs, returning a dictionary with all tensor differences.
-        """
-        # 1. All output attributes must be the same
-        pt_out_attrs = set(pt_outputs.keys())
-        tf_out_attrs = set(tf_outputs.keys())
-        if pt_out_attrs != tf_out_attrs:
-            raise ValueError(
-                f"The model outputs have different attributes, aborting. (Pytorch: {pt_out_attrs}, TensorFlow:"
-                f" {tf_out_attrs})"
-            )
-
-        # 2. For each output attribute, computes the difference
-        def _find_pt_tf_differences(pt_out, tf_out, differences, attr_name=""):
-            # If the current attribute is a tensor, it is a leaf and we make the comparison. Otherwise, we will dig in
-            # recursivelly, keeping the name of the attribute.
-            if isinstance(pt_out, torch.Tensor):
-                tensor_difference = np.max(np.abs(pt_out.numpy() - tf_out.numpy()))
-                differences[attr_name] = tensor_difference
-            else:
-                root_name = attr_name
-                for i, pt_item in enumerate(pt_out):
-                    # If it is a named attribute, we keep the name. Otherwise, just its index.
-                    if isinstance(pt_item, str):
-                        branch_name = root_name + pt_item
-                        tf_item = tf_out[pt_item]
-                        pt_item = pt_out[pt_item]
-                    else:
-                        branch_name = root_name + f"[{i}]"
-                        tf_item = tf_out[i]
-                    differences = _find_pt_tf_differences(pt_item, tf_item, differences, branch_name)
-
-            return differences
-
-        return _find_pt_tf_differences(pt_outputs, tf_outputs, {})
-
    def __init__(
        self,
        model_name: str,
@ -196,237 +126,12 @@ class PTtoTFCommand(BaseTransformersCLICommand):
        self._extra_commit_description = extra_commit_description
        self._override_model_class = override_model_class

-    def get_inputs(self, pt_model, tf_dummy_inputs, config):
-        """
-        Returns the right inputs for the model, based on its signature.
-        """
-
-        def _get_audio_input():
-            ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-            speech_samples = ds.sort("id").select(range(2))[:2]["audio"]
-            raw_samples = [x["array"] for x in speech_samples]
-            return raw_samples
-
-        model_config_class = type(pt_model.config)
-        if model_config_class in PROCESSOR_MAPPING:
-            processor = AutoProcessor.from_pretrained(self._local_dir)
-            if model_config_class in TOKENIZER_MAPPING and processor.tokenizer.pad_token is None:
-                processor.tokenizer.pad_token = processor.tokenizer.eos_token
-        elif model_config_class in IMAGE_PROCESSOR_MAPPING:
-            processor = AutoImageProcessor.from_pretrained(self._local_dir)
-        elif model_config_class in FEATURE_EXTRACTOR_MAPPING:
-            processor = AutoFeatureExtractor.from_pretrained(self._local_dir)
-        elif model_config_class in TOKENIZER_MAPPING:
-            processor = AutoTokenizer.from_pretrained(self._local_dir)
-            if processor.pad_token is None:
-                processor.pad_token = processor.eos_token
-        else:
-            raise ValueError(f"Unknown data processing type (model config type: {model_config_class})")
-
-        model_forward_signature = set(inspect.signature(pt_model.forward).parameters.keys())
-        processor_inputs = {}
-        if "input_ids" in model_forward_signature:
-            processor_inputs.update(
-                {
-                    "text": ["Hi there!", "I am a batch with more than one row and different input lengths."],
-                    "padding": True,
-                    "truncation": True,
-                }
-            )
-        if "pixel_values" in model_forward_signature:
-            sample_images = load_dataset("uoft-cs/cifar10", "plain_text", split="test")[:2]["img"]  # no-script
-            processor_inputs.update({"images": sample_images})
-        if "input_features" in model_forward_signature:
-            feature_extractor_signature = inspect.signature(processor.feature_extractor).parameters
-            # Pad to the largest input length by default but take feature extractor default
-            # padding value if it exists e.g. "max_length" and is not False or None
-            if "padding" in feature_extractor_signature:
-                default_strategy = feature_extractor_signature["padding"].default
-                if default_strategy is not False and default_strategy is not None:
-                    padding_strategy = default_strategy
-                else:
-                    padding_strategy = True
-            else:
-                padding_strategy = True
-            processor_inputs.update({"audio": _get_audio_input(), "padding": padding_strategy})
-        if "input_values" in model_forward_signature:  # Wav2Vec2 audio input
-            processor_inputs.update({"audio": _get_audio_input(), "padding": True})
-        pt_input = processor(**processor_inputs, return_tensors="pt")
-        tf_input = processor(**processor_inputs, return_tensors="tf")
-
-        # Extra input requirements, in addition to the input modality
-        if (
-            config.is_encoder_decoder
-            or (hasattr(pt_model, "encoder") and hasattr(pt_model, "decoder"))
-            or "decoder_input_ids" in tf_dummy_inputs
-        ):
-            decoder_input_ids = np.asarray([[1], [1]], dtype=int) * (pt_model.config.decoder_start_token_id or 0)
-            pt_input.update({"decoder_input_ids": torch.tensor(decoder_input_ids)})
-            tf_input.update({"decoder_input_ids": tf.convert_to_tensor(decoder_input_ids)})
-
-        return pt_input, tf_input
-
    def run(self):
-        self._logger.warning(
-            "\n\nConverting PyTorch weights to TensorFlow is deprecated and will be removed in v4.43. "
+        # TODO (joao): delete file in v4.47
+        raise NotImplementedError(
+            "\n\nConverting PyTorch weights to TensorFlow weights was removed in v4.43. "
            "Instead, we recommend that you convert PyTorch weights to Safetensors, an improved "
            "format that can be loaded by any framework, including TensorFlow. For more information, "
            "please see the Safetensors conversion guide: "
            "https://huggingface.co/docs/safetensors/en/convert-weights\n\n"
        )
-        # hub version 0.9.0 introduced the possibility of programmatically opening PRs with normal write tokens.
-        if version.parse(huggingface_hub.__version__) < version.parse("0.9.0"):
-            raise ImportError(
-                "The huggingface_hub version must be >= 0.9.0 to use this command. Please update your huggingface_hub"
-                " installation."
-            )
-        else:
-            from huggingface_hub import Repository, create_commit
-            from huggingface_hub._commit_api import CommitOperationAdd
-
-        # Fetch remote data
-        repo = Repository(local_dir=self._local_dir, clone_from=self._model_name)
-
-        # Load config and get the appropriate architecture -- the latter is needed to convert the head's weights
-        config = AutoConfig.from_pretrained(self._local_dir)
-        architectures = config.architectures
-        if self._override_model_class is not None:
-            if self._override_model_class.startswith("TF"):
-                architectures = [self._override_model_class[2:]]
-            else:
-                architectures = [self._override_model_class]
-            try:
-                pt_class = getattr(import_module("transformers"), architectures[0])
-            except AttributeError:
-                raise ValueError(f"Model class {self._override_model_class} not found in transformers.")
-            try:
-                tf_class = getattr(import_module("transformers"), "TF" + architectures[0])
-            except AttributeError:
-                raise ValueError(f"TF model class TF{self._override_model_class} not found in transformers.")
-        elif architectures is None:  # No architecture defined -- use auto classes
-            pt_class = getattr(import_module("transformers"), "AutoModel")
-            tf_class = getattr(import_module("transformers"), "TFAutoModel")
-            self._logger.warning("No detected architecture, using AutoModel/TFAutoModel")
-        else:  # Architecture defined -- use it
-            if len(architectures) > 1:
-                raise ValueError(f"More than one architecture was found, aborting. (architectures = {architectures})")
-            self._logger.warning(f"Detected architecture: {architectures[0]}")
-            pt_class = getattr(import_module("transformers"), architectures[0])
-            try:
-                tf_class = getattr(import_module("transformers"), "TF" + architectures[0])
-            except AttributeError:
-                raise AttributeError(f"The TensorFlow equivalent of {architectures[0]} doesn't exist in transformers.")
-
-        # Check the TF dummy inputs to see what keys we need in the forward pass
-        tf_from_pt_model = tf_class.from_config(config)
-        tf_dummy_inputs = tf_from_pt_model.dummy_inputs
-
-        del tf_from_pt_model  # Try to keep only one model in memory at a time
-
-        # Load the model and get some basic inputs
-        pt_model = pt_class.from_pretrained(self._local_dir)
-        pt_model.eval()
-
-        pt_input, tf_input = self.get_inputs(pt_model, tf_dummy_inputs, config)
-
-        with torch.no_grad():
-            pt_outputs = pt_model(**pt_input, output_hidden_states=True)
-        del pt_model  # will no longer be used, and may have a large memory footprint
-
-        tf_from_pt_model = tf_class.from_pretrained(self._local_dir, from_pt=True)
-        tf_from_pt_outputs = tf_from_pt_model(**tf_input, output_hidden_states=True, training=False)
-
-        # Confirms that cross loading PT weights into TF worked.
-        crossload_differences = self.find_pt_tf_differences(pt_outputs, tf_from_pt_outputs)
-        output_differences = {k: v for k, v in crossload_differences.items() if "hidden" not in k}
-        hidden_differences = {k: v for k, v in crossload_differences.items() if "hidden" in k}
-        if len(output_differences) == 0 and architectures is not None:
-            raise ValueError(
-                f"Something went wrong -- the config file has architectures ({architectures}), but no model head"
-                " output was found. All outputs start with 'hidden'"
-            )
-        max_crossload_output_diff = max(output_differences.values()) if output_differences else 0.0
-        max_crossload_hidden_diff = max(hidden_differences.values())
-        if max_crossload_output_diff > self._max_error or max_crossload_hidden_diff > self._max_error:
-            raise ValueError(
-                "The cross-loaded TensorFlow model has different outputs, something went wrong!\n"
-                + f"\nList of maximum output differences above the threshold ({self._max_error}):\n"
-                + "\n".join([f"{k}: {v:.3e}" for k, v in output_differences.items() if v > self._max_error])
-                + f"\n\nList of maximum hidden layer differences above the threshold ({self._max_error}):\n"
-                + "\n".join([f"{k}: {v:.3e}" for k, v in hidden_differences.items() if v > self._max_error])
-            )
-
-        # Save the weights in a TF format (if needed) and confirms that the results are still good
-        tf_weights_path = os.path.join(self._local_dir, TF2_WEIGHTS_NAME)
-        tf_weights_index_path = os.path.join(self._local_dir, TF2_WEIGHTS_INDEX_NAME)
-        if (not os.path.exists(tf_weights_path) and not os.path.exists(tf_weights_index_path)) or self._new_weights:
-            tf_from_pt_model.save_pretrained(self._local_dir)
-        del tf_from_pt_model  # will no longer be used, and may have a large memory footprint
-
-        tf_model = tf_class.from_pretrained(self._local_dir)
-        tf_outputs = tf_model(**tf_input, output_hidden_states=True)
-
-        conversion_differences = self.find_pt_tf_differences(pt_outputs, tf_outputs)
-        output_differences = {k: v for k, v in conversion_differences.items() if "hidden" not in k}
-        hidden_differences = {k: v for k, v in conversion_differences.items() if "hidden" in k}
-        if len(output_differences) == 0 and architectures is not None:
-            raise ValueError(
-                f"Something went wrong -- the config file has architectures ({architectures}), but no model head"
-                " output was found. All outputs start with 'hidden'"
-            )
-        max_conversion_output_diff = max(output_differences.values()) if output_differences else 0.0
-        max_conversion_hidden_diff = max(hidden_differences.values())
-        if max_conversion_output_diff > self._max_error or max_conversion_hidden_diff > self._max_error:
-            raise ValueError(
-                "The converted TensorFlow model has different outputs, something went wrong!\n"
-                + f"\nList of maximum output differences above the threshold ({self._max_error}):\n"
-                + "\n".join([f"{k}: {v:.3e}" for k, v in output_differences.items() if v > self._max_error])
-                + f"\n\nList of maximum hidden layer differences above the threshold ({self._max_error}):\n"
-                + "\n".join([f"{k}: {v:.3e}" for k, v in hidden_differences.items() if v > self._max_error])
-            )
-
-        commit_message = "Update TF weights" if self._new_weights else "Add TF weights"
-        if self._push:
-            repo.git_add(auto_lfs_track=True)
-            repo.git_commit(commit_message)
-            repo.git_push(blocking=True)  # this prints a progress bar with the upload
-            self._logger.warning(f"TF weights pushed into {self._model_name}")
-        elif not self._no_pr:
-            self._logger.warning("Uploading the weights into a new PR...")
-            commit_descrition = (
-                "Model converted by the [`transformers`' `pt_to_tf`"
-                " CLI](https://github.com/huggingface/transformers/blob/main/src/transformers/commands/pt_to_tf.py). "
-                "All converted model outputs and hidden layers were validated against its PyTorch counterpart.\n\n"
-                f"Maximum crossload output difference={max_crossload_output_diff:.3e}; "
-                f"Maximum crossload hidden layer difference={max_crossload_hidden_diff:.3e};\n"
-                f"Maximum conversion output difference={max_conversion_output_diff:.3e}; "
-                f"Maximum conversion hidden layer difference={max_conversion_hidden_diff:.3e};\n"
-            )
-            if self._max_error > MAX_ERROR:
-                commit_descrition += (
-                    f"\n\nCAUTION: The maximum admissible error was manually increased to {self._max_error}!"
-                )
-            if self._extra_commit_description:
-                commit_descrition += "\n\n" + self._extra_commit_description
-
-            # sharded model -> adds all related files (index and .h5 shards)
-            if os.path.exists(tf_weights_index_path):
-                operations = [
-                    CommitOperationAdd(path_in_repo=TF2_WEIGHTS_INDEX_NAME, path_or_fileobj=tf_weights_index_path)
-                ]
-                for shard_path in tf.io.gfile.glob(self._local_dir + "/tf_model-*.h5"):
-                    operations += [
-                        CommitOperationAdd(path_in_repo=os.path.basename(shard_path), path_or_fileobj=shard_path)
-                    ]
-            else:
-                operations = [CommitOperationAdd(path_in_repo=TF2_WEIGHTS_NAME, path_or_fileobj=tf_weights_path)]
-
-            hub_pr_url = create_commit(
-                repo_id=self._model_name,
-                operations=operations,
-                commit_message=commit_message,
-                commit_description=commit_descrition,
-                repo_type="model",
-                create_pr=True,
-            ).pr_url
-            self._logger.warning(f"PR open in {hub_pr_url}")
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@ -81,6 +81,15 @@ class PretrainedConfig(PushToHubMixin):
      model.
    - **num_hidden_layers** (`int`) -- The number of blocks in the model.

+    <Tip warning={true}>
+
+    Setting parameters for sequence generation in the model config is deprecated. For backward compatibility, loading
+    some of them will still be possible, but attempting to overwrite them will throw an exception -- you should set
+    them in a [`~transformers.GenerationConfig`]. Check the documentation of [`~transformers.GenerationConfig`] for
+    more information about the individual parameters.
+
+    </Tip>
+
    Arg:
        name_or_path (`str`, *optional*, defaults to `""`):
            Store the string that was passed to [`PreTrainedModel.from_pretrained`] or
@ -117,77 +126,6 @@ class PretrainedConfig(PushToHubMixin):
            sequence_length embeddings at a time. For more information on feed forward chunking, see [How does Feed
            Forward Chunking work?](../glossary.html#feed-forward-chunking).

-        > Parameters for sequence generation
-
-        max_length (`int`, *optional*, defaults to 20):
-            Maximum length that will be used by default in the `generate` method of the model.
-        min_length (`int`, *optional*, defaults to 0):
-            Minimum length that will be used by default in the `generate` method of the model.
-        do_sample (`bool`, *optional*, defaults to `False`):
-            Flag that will be used by default in the `generate` method of the model. Whether or not to use sampling ;
-            use greedy decoding otherwise.
-        early_stopping (`bool`, *optional*, defaults to `False`):
-            Flag that will be used by default in the `generate` method of the model. Whether to stop the beam search
-            when at least `num_beams` sentences are finished per batch or not.
-        num_beams (`int`, *optional*, defaults to 1):
-            Number of beams for beam search that will be used by default in the `generate` method of the model. 1 means
-            no beam search.
-        num_beam_groups (`int`, *optional*, defaults to 1):
-            Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams
-            that will be used by default in the `generate` method of the model. 1 means no group beam search.
-        diversity_penalty (`float`, *optional*, defaults to 0.0):
-            Value to control diversity for group beam search. that will be used by default in the `generate` method of
-            the model. 0 means no diversity penalty. The higher the penalty, the more diverse are the outputs.
-        temperature (`float`, *optional*, defaults to 1.0):
-            The value used to module the next token probabilities that will be used by default in the `generate` method
-            of the model. Must be strictly positive.
-        top_k (`int`, *optional*, defaults to 50):
-            Number of highest probability vocabulary tokens to keep for top-k-filtering that will be used by default in
-            the `generate` method of the model.
-        top_p (`float`, *optional*, defaults to 1):
-            Value that will be used by default in the `generate` method of the model for `top_p`. If set to float < 1,
-            only the most probable tokens with probabilities that add up to `top_p` or higher are kept for generation.
-        typical_p (`float`, *optional*, defaults to 1):
-            Local typicality measures how similar the conditional probability of predicting a target token next is to
-            the expected conditional probability of predicting a random token next, given the partial text already
-            generated. If set to float < 1, the smallest set of the most locally typical tokens with probabilities that
-            add up to `typical_p` or higher are kept for generation. See [this
-            paper](https://arxiv.org/pdf/2202.00666.pdf) for more details.
-        repetition_penalty (`float`, *optional*, defaults to 1):
-            Parameter for repetition penalty that will be used by default in the `generate` method of the model. 1.0
-            means no penalty.
-        length_penalty (`float`, *optional*, defaults to 1):
-            Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
-            the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
-            likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
-            `length_penalty` < 0.0 encourages shorter sequences.
-        no_repeat_ngram_size (`int`, *optional*, defaults to 0) -- Value that will be used by default in the
-            `generate` method of the model for `no_repeat_ngram_size`. If set to int > 0, all ngrams of that size can
-            only occur once.
-        encoder_no_repeat_ngram_size (`int`, *optional*, defaults to 0) -- Value that will be used by
-            default in the `generate` method of the model for `encoder_no_repeat_ngram_size`. If set to int > 0, all
-            ngrams of that size that occur in the `encoder_input_ids` cannot occur in the `decoder_input_ids`.
-        bad_words_ids (`List[int]`, *optional*):
-            List of token ids that are not allowed to be generated that will be used by default in the `generate`
-            method of the model. In order to get the tokens of the words that should not appear in the generated text,
-            use `tokenizer.encode(bad_word, add_prefix_space=True)`.
-        num_return_sequences (`int`, *optional*, defaults to 1):
-            Number of independently computed returned sequences for each element in the batch that will be used by
-            default in the `generate` method of the model.
-        output_scores (`bool`, *optional*, defaults to `False`):
-            Whether the model should return the logits when used for generation.
-        return_dict_in_generate (`bool`, *optional*, defaults to `False`):
-            Whether the model should return a [`~transformers.utils.ModelOutput`] instead of a `torch.LongTensor`.
-        forced_bos_token_id (`int`, *optional*):
-            The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful for
-            multilingual models like [mBART](../model_doc/mbart) where the first generated token needs to be the target
-            language token.
-        forced_eos_token_id (`int`, *optional*):
-            The id of the token to force as the last generated token when `max_length` is reached.
-        remove_invalid_values (`bool`, *optional*):
-            Whether to remove possible _nan_ and _inf_ outputs of the model to prevent the generation method to crash.
-            Note that using `remove_invalid_values` can slow down generation.
-
        > Parameters for fine-tuning tasks

        architectures (`List[str]`, *optional*):
@ -287,7 +225,7 @@ class PretrainedConfig(PushToHubMixin):

        # Retrocompatibility: Parameters for sequence generation. While we will keep the ability to load these
        # parameters, saving them will be deprecated. In a distant future, we won't need to load them.
-        for parameter_name, default_value in self._get_generation_defaults().items():
+        for parameter_name, default_value in self._get_global_generation_defaults().items():
            setattr(self, parameter_name, kwargs.pop(parameter_name, default_value))

        # Fine-tuning task arguments
@ -440,16 +378,13 @@ class PretrainedConfig(PushToHubMixin):
        if os.path.isfile(save_directory):
            raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")

-        non_default_generation_parameters = {}
-        for parameter_name, default_value in self._get_generation_defaults().items():
-            if hasattr(self, parameter_name) and getattr(self, parameter_name) != default_value:
-                non_default_generation_parameters[parameter_name] = getattr(self, parameter_name)
+        non_default_generation_parameters = self._get_non_default_generation_parameters()
        if len(non_default_generation_parameters) > 0:
-            logger.warning(
-                "Some non-default generation parameters are set in the model config. These should go into a "
-                "GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) "
-                "instead. This warning will be raised to an exception in v4.41.\n"
-                f"Non-default generation parameters: {str(non_default_generation_parameters)}"
+            raise ValueError(
+                "Some non-default generation parameters are set in the model config. These should go into either a) "
+                "`model.generation_config` (as opposed to `model.config`); OR b) a GenerationConfig file "
+                "(https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) "
+                f"\nNon-default generation parameters: {str(non_default_generation_parameters)}"
            )

        os.makedirs(save_directory, exist_ok=True)
@ -1049,7 +984,7 @@ class PretrainedConfig(PushToHubMixin):
        cls._auto_class = auto_class

    @staticmethod
-    def _get_generation_defaults() -> Dict[str, Any]:
+    def _get_global_generation_defaults() -> Dict[str, Any]:
        return {
            "max_length": 20,
            "min_length": 0,
@ -1078,14 +1013,49 @@ class PretrainedConfig(PushToHubMixin):
            "begin_suppress_tokens": None,
        }

-    def _has_non_default_generation_parameters(self) -> bool:
+    def _get_non_default_generation_parameters(self) -> Dict[str, Any]:
        """
-        Whether or not this instance holds non-default generation parameters.
+        Gets the non-default generation parameters on the PretrainedConfig instance
        """
-        for parameter_name, default_value in self._get_generation_defaults().items():
-            if hasattr(self, parameter_name) and getattr(self, parameter_name) != default_value:
-                return True
-        return False
+        non_default_generation_parameters = {}
+        decoder_attribute_name = None
+        default_config = None
+
+        # Composite models don't have a default config, use their decoder config as a fallback for default values
+        # If no known pattern is matched, then `default_config = None` -> check against the global generation defaults
+        try:
+            default_config = self.__class__()
+        except ValueError:
+            for decoder_attribute_name in ("decoder", "generator", "text_config"):
+                if hasattr(self, decoder_attribute_name):
+                    default_config = getattr(self, decoder_attribute_name).__class__()
+                    break
+
+        # If it is a composite model, check the generation subconfig; fall back to `self` if no pattern matched
+        self_decoder_config = self if decoder_attribute_name is None else getattr(self, decoder_attribute_name, self)
+
+        for parameter_name, default_global_value in self._get_global_generation_defaults().items():
+            if hasattr(self_decoder_config, parameter_name):
+                is_default_in_config = is_default_generation_value = None
+                parameter_value = getattr(self_decoder_config, parameter_name)
+                # Three cases in which it is okay for the model config to hold generation config parameters:
+                # 1. The parameter is set to `None`, effectively delegating its value to the generation config
+                if parameter_value is None:
+                    continue
+                # 2. If we have a default config, then the instance should hold the same generation defaults
+                if default_config is not None:
+                    is_default_in_config = parameter_value == getattr(default_config, parameter_name)
+                # 3. if we don't have a default config, then the instance should hold the global generation defaults
+                else:
+                    is_default_generation_value = parameter_value == default_global_value
+
+                is_non_default = (is_default_in_config is False) or (
+                    is_default_in_config is None and is_default_generation_value is False
+                )
+                if is_non_default:
+                    non_default_generation_parameters[parameter_name] = parameter_value
+
+        return non_default_generation_parameters


 def get_configuration_file(configuration_files: List[str]) -> str:
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@ -3,7 +3,7 @@
 # 2. run `make deps_table_update``
 deps = {
    "Pillow": "Pillow>=10.0.1,<=15.0",
-    "accelerate": "accelerate>=0.21.0",
+    "accelerate": "accelerate>=0.26.0",
    "av": "av==9.2.0",
    "beautifulsoup4": "beautifulsoup4",
    "codecarbon": "codecarbon==1.2.0",
@ -43,7 +43,7 @@ deps = {
    "onnxruntime-tools": "onnxruntime-tools>=1.4.2",
    "onnxruntime": "onnxruntime>=1.4.0",
    "opencv-python": "opencv-python",
-    "optimum-benchmark": "optimum-benchmark>=0.2.0",
+    "optimum-benchmark": "optimum-benchmark>=0.3.0",
    "optuna": "optuna",
    "optax": "optax>=0.0.8,<=0.1.4",
    "packaging": "packaging>=20.0",
--- a/src/transformers/feature_extraction_utils.py
+++ b/src/transformers/feature_extraction_utils.py
@ -137,8 +137,15 @@ class BatchFeature(UserDict):
            import torch  # noqa

            def as_tensor(value):
-                if isinstance(value, (list, tuple)) and len(value) > 0 and isinstance(value[0], np.ndarray):
-                    value = np.array(value)
+                if isinstance(value, (list, tuple)) and len(value) > 0:
+                    if isinstance(value[0], np.ndarray):
+                        value = np.array(value)
+                    elif (
+                        isinstance(value[0], (list, tuple))
+                        and len(value[0]) > 0
+                        and isinstance(value[0][0], np.ndarray)
+                    ):
+                        value = np.array(value)
                return torch.tensor(value)

            is_tensor = torch.is_tensor
--- a/Show More
+++ b/Show More