Mirror of https://github.com/huggingface/transformers.git (synced 2025-11-03 11:24:34 +08:00)

Compare commits: 79 commits, `remove-use` ... `more-bm`
| SHA1 |
|---|
| 9aaab2611f |
| ab48ca9a62 |
| 23362f985a |
| 13f0c3ec61 |
| f780932e05 |
| e7c5a60368 |
| 91b5a680c0 |
| d4562bb8ae |
| e46c2ff32e |
| 3b6ddbcb88 |
| ff04520266 |
| 01f5ac70a3 |
| 2c5b888c95 |
| 0eb372ba19 |
| 87be559508 |
| 2ca506ca1d |
| 5426947e3a |
| 93671b4444 |
| 18a3349a9f |
| e9f241bf89 |
| 7cd1d2b66c |
| 48a36c96da |
| 9a27302803 |
| 4f8781f84f |
| a8cece13e2 |
| 2e67a9b602 |
| 264cce9e0a |
| 3f2db2c205 |
| 1d651c749e |
| f39355ec23 |
| 5995435d96 |
| 2383f3fcbb |
| c4e88f78ca |
| 2fe4a30340 |
| ede7976cd2 |
| ee3a1002e2 |
| 4e50b8459d |
| 9aab965b1e |
| 1a034ce1d2 |
| 6850ba853f |
| bf0bce8d5f |
| 2cf8f833b0 |
| 517197f795 |
| 9d4ee18e25 |
| 818f7f10e4 |
| ce4ffeeb6c |
| cb6f03fce4 |
| 8fc5420913 |
| 71db0d49e9 |
| 307c523854 |
| 448c553ccb |
| cb4d4f5b75 |
| ac81541778 |
| e7592f2508 |
| 347a0f9e83 |
| 7e204ad121 |
| a15d77cd0c |
| 12a50f294d |
| 39b6d3bf7e |
| 75da795d8f |
| 080d704af1 |
| c01ceffeb4 |
| 10de06dace |
| def9a7ef05 |
| 0beda2aa3a |
| 0b3aef1da9 |
| 7370a1babd |
| 151d6adc86 |
| 252d7cd952 |
| 415cb37708 |
| 1eb45cd61d |
| 354567d955 |
| 4dd4133d32 |
| eefbf4ac8b |
| 50ca781d78 |
| 8739fc05c4 |
| 77b5ad65ee |
| a9731a725e |
| bdbc2d037b |
**.github/scripts/codeowners_for_review_action** (vendored, 1 change)

```
@@ -22,7 +22,6 @@ tests/generation/ @gante
/src/transformers/models/auto/ @ArthurZucker
/src/transformers/utils/ @ArthurZucker @Rocketknight1
/src/transformers/loss/ @ArthurZucker
/src/transformers/onnx/ @michaelbenayoun

# Specific files come after the sections/globs, so they take priority
/.circleci/config.yml @ArthurZucker @ydshieh
```
**.github/workflows/benchmark.yml** (vendored, 31 changes)

```
@@ -1,7 +1,10 @@
name: Self-hosted runner (benchmark)

on:
  workflow_dispatch:
  push:
    branches: [main]
  pull_request:
    types: [ opened, labeled, reopened, synchronize ]

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -9,6 +12,8 @@ concurrency:

env:
  HF_HOME: /mnt/cache
  DATASET_ID: hf-benchmarks/transformers
  MODEL_ID: meta-llama/Llama-3.1-8B-Instruct

jobs:
  benchmark:
@@ -31,26 +36,12 @@ jobs:
        with:
          ref: ${{ github.event.pull_request.head.sha || github.sha }}

      - name: Install libpq-dev & psql
        run: |
          apt update
          apt install -y libpq-dev postgresql-client

      - name: Install benchmark script dependencies
        run: python3 -m pip install -r benchmark/requirements.txt
        run: python3 -m pip install -r benchmark_v2/requirements.txt kernels

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e ".[torch]"

      - name: Run database init script
        run: |
          psql -f benchmark/utils/init_db.sql
        env:
          PGDATABASE: metrics
          PGHOST: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGHOST }}
          PGUSER: transformers_benchmarks
          PGPASSWORD: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGPASSWORD }}
        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e ".[torch]" && python3 -m pip uninstall -y torchvision # temp fix

      - name: Run benchmark
        run: |
@@ -61,13 +52,11 @@ jobs:
            commit_id=$GITHUB_SHA
          fi
          commit_msg=$(git show -s --format=%s | cut -c1-70)
          python3 benchmark/benchmarks_entrypoint.py "huggingface/transformers" "$BRANCH_NAME" "$commit_id" "$commit_msg"
          python3 benchmark_v2/run_benchmarks.py -b 32 -s 128 -n 256 --cross-generate --branch-name "$BRANCH_NAME" --commit-id "$commit_id" --commit-message "$commit_msg" --model-id "$MODEL_ID" --log-level INFO --push-result-to-dataset "$DATASET_ID"
        env:
          HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
          PUSH_TO_HUB_TOKEN: ${{ secrets.PUSH_TO_HUB_TOKEN }}
          # Enable this to see debug logs
          # HF_HUB_VERBOSITY: debug
          # TRANSFORMERS_VERBOSITY: debug
          PGHOST: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGHOST }}
          PGUSER: transformers_benchmarks
          PGPASSWORD: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGPASSWORD }}
          BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
```
**.github/workflows/check_failed_tests.yml** (vendored, 70 changes)

```
@@ -41,9 +41,14 @@ env:

jobs:
  check_new_failures:
    name: " "
    name: "Find commits for new failing tests"
    strategy:
      matrix:
        run_idx: [1]
    runs-on:
      group: aws-g5-4xlarge-cache
    outputs:
      process: ${{ steps.check_file.outputs.process }}
    container:
      image: ${{ inputs.docker }}
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -54,14 +59,17 @@ jobs:
          path: /transformers/ci_results_${{ inputs.job }}

      - name: Check file
        id: check_file
        working-directory: /transformers
        run: |
          if [ -f ci_results_${{ inputs.job }}/new_failures.json ]; then
            echo "`ci_results_${{ inputs.job }}/new_failures.json` exists, continue ..."
            echo "process=true" >> $GITHUB_ENV
            echo "process=true" >> $GITHUB_OUTPUT
          else
            echo "`ci_results_${{ inputs.job }}/new_failures.json` doesn't exist, abort."
            echo "process=false" >> $GITHUB_ENV
            echo "process=false" >> $GITHUB_OUTPUT
          fi

      - uses: actions/download-artifact@v4
@@ -118,6 +126,10 @@ jobs:
        run: |
          python3 utils/print_env.py

      - name: Install pytest-flakefinder
        if: ${{ env.process == 'true' }}
        run: python3 -m pip install pytest-flakefinder

      - name: Show installed libraries and their versions
        working-directory: /transformers
        if: ${{ env.process == 'true' }}
@@ -126,25 +138,63 @@
      - name: Check failed tests
        working-directory: /transformers
        if: ${{ env.process == 'true' }}
        run: python3 utils/check_bad_commit.py --start_commit ${{ inputs.start_sha }} --end_commit ${{ env.END_SHA }} --file ci_results_${{ inputs.job }}/new_failures.json --output_file new_failures_with_bad_commit.json
        run: python3 utils/check_bad_commit.py --start_commit ${{ inputs.start_sha }} --end_commit ${{ env.END_SHA }} --file ci_results_${{ inputs.job }}/new_failures.json --output_file new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json

      - name: Show results
        working-directory: /transformers
        if: ${{ env.process == 'true' }}
        run: |
          ls -l new_failures_with_bad_commit.json
          cat new_failures_with_bad_commit.json
          ls -l new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json
          cat new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json

      - name: Checkout back
      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          name: new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}
          path: /transformers/new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json

  process_new_failures_with_commit_info:
    name: "process bad commit reports"
    needs: check_new_failures
    if: needs.check_new_failures.outputs.process == 'true'
    runs-on:
      group: aws-g5-4xlarge-cache
    container:
      image: ${{ inputs.docker }}
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - uses: actions/download-artifact@v4
        with:
          name: ci_results_${{ inputs.job }}
          path: /transformers/ci_results_${{ inputs.job }}

      - uses: actions/download-artifact@v4
        with:
          pattern: new_failures_with_bad_commit_${{ inputs.job }}*
          path: /transformers/new_failures_with_bad_commit_${{ inputs.job }}
          merge-multiple: true

      - name: Check files
        working-directory: /transformers
        if: ${{ env.process == 'true' }}
        run: |
          git checkout ${{ inputs.start_sha }}
          ls -la /transformers
          ls -la /transformers/new_failures_with_bad_commit_${{ inputs.job }}

      # Currently, we only run with a single runner by using `run_idx: [1]`. We might try to run with multiple runners
      # to further reduce the false positive caused by flaky tests, which requires further processing to merge reports.
      - name: Merge files
        shell: bash
        working-directory: /transformers
        run: |
          cp /transformers/new_failures_with_bad_commit_${{ inputs.job }}/new_failures_with_bad_commit_${{ inputs.job }}_1.json new_failures_with_bad_commit.json

      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}

      - name: Process report
        shell: bash
        working-directory: /transformers
        if: ${{ env.process == 'true' }}
        env:
          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
          TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
@@ -156,7 +206,6 @@ jobs:
      - name: Process report
        shell: bash
        working-directory: /transformers
        if: ${{ env.process == 'true' }}
        env:
          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
          TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
@@ -171,13 +220,12 @@

      - name: Prepare Slack report title
        working-directory: /transformers
        if: ${{ env.process == 'true' }}
        run: |
          pip install slack_sdk
          echo "title=$(python3 -c 'import sys; sys.path.append("utils"); from utils.notification_service import job_to_test_map; ci_event = "${{ inputs.ci_event }}"; job = "${{ inputs.job }}"; test_name = job_to_test_map[job]; title = f"New failed tests of {ci_event}" + ":" + f" {test_name}"; print(title)')" >> $GITHUB_ENV

      - name: Send processed report
        if: ${{ env.process == 'true' && !endsWith(env.REPORT_TEXT, '{}') }}
        if: ${{ !endsWith(env.REPORT_TEXT, '{}') }}
        uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
        with:
          # Slack channel id, channel name, or user id to post message.
```
**CONTRIBUTING.md** (120 changes)

`@@ -112,7 +112,125 @@ New models are constantly released and if you want to implement a new model, ple`

If you are willing to contribute the model yourself, let us know so we can help you add it to 🤗 Transformers!

We have a technical guide for [how to add a model to 🤗 Transformers](https://huggingface.co/docs/transformers/add_new_model).
We have a technical guide for [how to add a model to 🤗 Transformers](https://huggingface.co/docs/transformers/modular_transformers).

### Vision-Language Model Contribution Checklist

If you're contributing a **vision-language model** (or any multimodal model that processes images/videos), please follow this checklist. Maintainers will use this to review your PR, and completing these steps will significantly increase the likelihood of your PR being merged quickly.

**Required checklist for all vision-language model contributions:**

☐ **1. Implement a modular file**

All new models should use the modular architecture pattern. Create a `modular_<model_name>.py` file using the modular model converter:

- Use the CLI command [`transformers add-new-model-like`](https://github.com/huggingface/transformers/blob/main/src/transformers/cli/add_new_model_like.py) to generate a modular skeleton and get started
- Put as much code as possible in the modular file: the modeling code must live there, and ideally the configuration does as well
- Reuse existing patterns from similar models as much as possible

To verify your modular file is correct, run:

```bash
python utils/modular_model_converter.py <model_name>
```

This will generate the separate files (`modeling_*.py`, `configuration_*.py`, etc.) from your modular file. The CI will enforce that these generated files match your modular file.
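A rough, hypothetical sketch of what such a modular file can look like, reusing another model's components: the parent model, class names, and `model_type` below are placeholders, not a prescription.

```python
# modular_mymodel.py -- hypothetical sketch; "MyModel" and the Llama parent classes are placeholders.
from ..llama.configuration_llama import LlamaConfig
from ..llama.modeling_llama import LlamaForCausalLM, LlamaModel


class MyModelConfig(LlamaConfig):
    model_type = "mymodel"


class MyModelModel(LlamaModel):
    # Inherit the parent architecture unchanged; override layers here only where MyModel actually differs.
    pass


class MyModelForCausalLM(LlamaForCausalLM):
    pass
```

The converter then expands these inherited classes into standalone `modeling_mymodel.py` and `configuration_mymodel.py` files.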

☐ **2. Add a fast image processor (for image models)**

If your model processes images, implement a fast image processor that uses `torch` and `torchvision` instead of PIL/numpy for better inference performance (a rough sketch follows the list below):

- See the detailed guide in [#36978](https://github.com/huggingface/transformers/issues/36978)
- Fast processors inherit from `BaseImageProcessorFast`
- Examples: `LlavaOnevisionImageProcessorFast`, `Idefics2ImageProcessorFast`
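As a starting point, a minimal fast image processor can look roughly like this; the class name and the default values (size, mean/std, resampling) are placeholders and must be replaced with whatever matches the original model's preprocessing:

```python
# Hypothetical sketch; the defaults below are placeholders, not a real model's values.
from transformers.image_processing_utils_fast import BaseImageProcessorFast
from transformers.image_utils import PILImageResampling


class MyModelImageProcessorFast(BaseImageProcessorFast):
    # Class attributes declare the default preprocessing pipeline; the base class builds the
    # torch/torchvision transforms (resize, rescale, normalize) from them.
    resample = PILImageResampling.BICUBIC
    size = {"height": 384, "width": 384}
    image_mean = [0.5, 0.5, 0.5]
    image_std = [0.5, 0.5, 0.5]
    do_resize = True
    do_rescale = True
    do_normalize = True
```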

☐ **3. Create a weight conversion script**

Add a `convert_<model_name>_to_hf.py` script that converts the original model weights to the HuggingFace format (a skeleton sketch follows the list below):

- Script should handle checkpoint loading, key mapping, and saving in HF format
- Include usage examples and documentation in the script
- Examples: [`convert_llava_onevision_weights_to_hf.py`](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava_onevision/convert_llava_onevision_weights_to_hf.py), [`convert_idefics2_weights_to_hf.py`](https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics2/convert_idefics2_weights_to_hf.py)
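The overall shape of such a script is usually: load the original checkpoint, rename its keys to the names used by the Transformers implementation, load them into a freshly configured model, and save with `save_pretrained`. A hypothetical skeleton (the model classes and the key mapping are illustrative, not a real model's):

```python
# Hypothetical skeleton of convert_<model_name>_to_hf.py; classes and key prefixes are illustrative.
import argparse

import torch

from transformers import MyModelConfig, MyModelForConditionalGeneration  # placeholders

# Maps prefixes used in the original checkpoint to the prefixes expected by the HF implementation.
KEY_MAPPING = {
    "visual_encoder.": "model.vision_tower.",
    "language_model.": "model.language_model.",
}


def convert(checkpoint_path: str, output_dir: str) -> None:
    original_state_dict = torch.load(checkpoint_path, map_location="cpu")

    new_state_dict = {}
    for old_key, tensor in original_state_dict.items():
        new_key = old_key
        for old_prefix, new_prefix in KEY_MAPPING.items():
            if new_key.startswith(old_prefix):
                new_key = new_prefix + new_key[len(old_prefix):]
        new_state_dict[new_key] = tensor

    model = MyModelForConditionalGeneration(MyModelConfig())
    model.load_state_dict(new_state_dict, strict=True)  # strict=True surfaces any key mismatch early
    model.save_pretrained(output_dir)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--checkpoint_path", required=True)
    parser.add_argument("--output_dir", required=True)
    args = parser.parse_args()
    convert(args.checkpoint_path, args.output_dir)
```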

☐ **4. Add integration tests with exact output matching**

At minimum, add an `IntegrationTest` class that tests end-to-end generation (processing and modeling) with **exact** output matching:

- For generative models: test that generated text matches expected output exactly
- For non-generative models: test that output logits match expected values
- Tests should use real checkpoints (load in 4-bit or half precision if the checkpoint is too big to fit in our CI runners) and real inputs
- Example pattern:

```python
class MyModelIntegrationTest(unittest.TestCase):
    @slow
    def test_model_integration(self):
        model = MyModelForConditionalGeneration.from_pretrained("org/model-name")
        processor = AutoProcessor.from_pretrained("org/model-name")

        inputs = processor(images=image, text=prompt, return_tensors="pt")
        output = model.generate(**inputs, max_new_tokens=20)

        EXPECTED_TEXT = "exact expected output"
        self.assertEqual(processor.decode(output[0]), EXPECTED_TEXT)
```

See `tests/models/llava_onevision/test_modeling_llava_onevision.py` for complete examples.

☐ **5. Update documentation**

Add or update model documentation:

- If the CLI hasn't already created it, add `docs/source/en/model_doc/<model_name>.md` with usage examples
- Include model description, paper link, and basic usage with `Pipeline` and `AutoModel`
- Add the model to the appropriate TOC files

☐ **6. Look for reusable patterns**

The library has 400+ models with many established patterns:

- Search for similar models (e.g., other vision-language models)
- Reuse attention mechanisms, layer implementations, and processing patterns
- Check models like LLaVA, Idefics2, Fuyu for vision-language patterns
- Use the provided decorators (`auto_docstring`, `can_return_tuple`, `check_model_inputs`, and `_can_record_outputs`) where relevant
- Don't reinvent the wheel

☐ **7. Run quality checks and read the output**

Before submitting your PR, install quality dependencies and run the full check suite:

```bash
pip install -e ".[quality]"
make fixup
```

**Important**: Take time to read the output of `make fixup`. It will:
- Lint and format your code automatically
- Run consistency checks (imports, docstrings, etc.)
- Show any remaining issues that need manual fixes

All checks must pass before your PR can be merged.

**If this checklist is complete, your PR has a very high likelihood of being merged!** Following these steps makes the maintainers' work much easier and will reduce the number of review iterations, getting your important work out there faster.

#### Copy-pastable checklist for maintainers

Here's a condensed version maintainers can copy into PRs:

```markdown
## Multimodal Model Addition Checklist

Please ensure your PR completes all following items. See the [full checklist](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#vision-language-model-contribution-checklist) for details.

- [ ] **Modular file**: `modular_<model_name>.py` implemented and verified with `python utils/modular_model_converter.py <model_name>`
- [ ] **Fast image processor**: Implemented using `BaseImageProcessorFast` (see [#36978](https://github.com/huggingface/transformers/issues/36978))
- [ ] **Conversion script**: `convert_<model_name>_to_hf.py` added with usage examples
- [ ] **Integration tests**: End-to-end tests with exact output matching (text or logits)
- [ ] **Documentation**: Model docs added/updated in `docs/source/en/model_doc/`
- [ ] **Pattern reuse**: Verified against similar models (LLaVA, Idefics2, etc.)
- [ ] **Quality checks**: `make fixup` passes with no errors

```

## Do you want to add documentation?
```
@@ -64,8 +64,8 @@ limitations under the License.
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_as_a_model_definition.png"/>
</h3>

Transformers acts as the model-definition framework for state-of-the-art machine learning models in text, computer
vision, audio, video, and multimodal model, for both inference and training.
Transformers acts as the model-definition framework for state-of-the-art machine learning with text, computer
vision, audio, video, and multimodal models, for both inference and training.

It centralizes the model definition so that this definition is agreed upon across the ecosystem. `transformers` is the
pivot across frameworks: if a model definition is supported, it will be compatible with the majority of training
```
```
@@ -3,6 +3,8 @@ import json
import logging
from typing import Any

from transformers.utils.import_utils import is_flash_attn_2_available


KERNELIZATION_AVAILABLE = False
try:
@@ -18,11 +20,21 @@ logger = logging.getLogger(__name__)
class BenchmarkConfig:
    """Configuration for a single benchmark scenario."""

    all_attn_implementations = [
        ("flash_attention_2", None),
        ("eager", None),
        ("sdpa", "math"),
        ("sdpa", "flash_attention"),
        ("flex_attention", None),
    ]

    all_compiled_modes = [None, "default", "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs"]

    def __init__(
        self,
        warmup_iterations: int = 5,
        measurement_iterations: int = 20,
        gpu_monitoring: bool = False,  # False by default because it slows down the benchmark by a lot
        gpu_monitoring: bool = True,  # NOTE: you may want to disable this at times as we have obsvered it could heavily slow down benchmarks on AMD
        batch_size: int = 1,
        sequence_length: int = 128,
        num_tokens_to_generate: int = 128,
@@ -59,6 +71,13 @@ class BenchmarkConfig:
    def check_validity(self, skip_validity_check: bool = False) -> None:
        if skip_validity_check:
            return
        # Check FA is installed
        if self.attn_implementation == "flash_attention_2" and not is_flash_attn_2_available():
            logger.warning(
                "Flash attention does not support compile mode. Defaulting to SDPA w/ flash attention backend."
            )
            self.attn_implementation = "sdpa"
            self.sdpa_backend = "flash_attention"
        # Flash attention does not support compile mode, so we turn it off  # FIXME: it would be better to support it
        is_fa = self.attn_implementation == "flash_attention_2"
        is_fa |= self.attn_implementation == "sdpa" and self.sdpa_backend == "flash_attention"
@@ -136,7 +155,7 @@ def cross_generate_configs(
    batch_size: int = 1,
    sequence_length: int = 128,
    num_tokens_to_generate: int = 128,
    gpu_monitoring: bool = False,  # this slows down the benchmark by a lot so we disable it by default
    gpu_monitoring: bool = True,
) -> list[BenchmarkConfig]:
    # Create kwargs common to all configs
    kwargs = {
@@ -163,41 +182,12 @@
    return configs


def generate_all_configs(
    warmup_iterations: int = 5,
    measurement_iterations: int = 20,
    batch_size: int = 1,
    sequence_length: int = 128,
    num_tokens_to_generate: int = 128,
    gpu_monitoring: bool = False,
) -> list[BenchmarkConfig]:
    all_attn_implementations = [
        ("flash_attention_2", None),
        ("eager", None),
        ("sdpa", "math"),
        ("sdpa", "flash_attention"),
        ("flex_attention", None),
    ]
    return cross_generate_configs(
        attn_impl_and_sdpa_backend=all_attn_implementations,
        compiled_mode=[None, "default", "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs"],
        kernelized=[False, KERNELIZATION_AVAILABLE],
        warmup_iterations=warmup_iterations,
        measurement_iterations=measurement_iterations,
        batch_size=batch_size,
        sequence_length=sequence_length,
        num_tokens_to_generate=num_tokens_to_generate,
        gpu_monitoring=gpu_monitoring,
    )


def generate_main_configs(
    warmup_iterations: int = 5,
    measurement_iterations: int = 20,
    batch_size: int = 1,
    sequence_length: int = 128,
    num_tokens_to_generate: int = 128,
    gpu_monitoring: bool = False,
) -> list[BenchmarkConfig]:
    # Create kwargs common to all configs
    kwargs = {
@@ -206,10 +196,10 @@ def generate_main_configs(
        "batch_size": batch_size,
        "sequence_length": sequence_length,
        "num_tokens_to_generate": num_tokens_to_generate,
        "gpu_monitoring": gpu_monitoring,
    }
    return [  # TODO: test max-autotune instead of default
        BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", **kwargs),
        BenchmarkConfig(attn_implementation="eager", compile_mode="default", **kwargs),
        BenchmarkConfig(attn_implementation="flash_attention_2", **kwargs),
        BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", gpu_monitoring=False, **kwargs),
        BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", gpu_monitoring=True, **kwargs),
        BenchmarkConfig(attn_implementation="eager", compile_mode="default", gpu_monitoring=True, **kwargs),
        BenchmarkConfig(attn_implementation="flash_attention_2", gpu_monitoring=True, **kwargs),
    ]
```
```
@@ -4,6 +4,7 @@ import logging
import os
import pathlib
import re
import tempfile
import time
from contextlib import nullcontext
from datetime import datetime
@@ -11,6 +12,8 @@ from queue import Queue
from typing import Any

import torch
from datasets import Dataset
from huggingface_hub import HfApi
from tqdm import trange

from transformers import (
@@ -50,6 +53,8 @@ DEFAULT_PROMPT = "\n".join([
    "Its instability ended in the coup of 18 Brumaire and the establishment of the Consulate, with Napoleon Bonaparte as First Consul.",
])  # fmt: skip

PUSH_TO_HUB_TOKEN = os.getenv("PUSH_TO_HUB_TOKEN", None)


def compact_json_numeric_arrays(data: dict):
    # Match arrays that contain only numbers (ints/floats), whitespace, commas, and newlines
@@ -120,15 +125,19 @@ def flush_memory():

class BenchmarkStreamer(BaseStreamer):
    def __init__(self, **kwargs) -> None:
        self.timeout = kwargs.pop("timeout", 10)
        self.timestamps = []
        self.text_queue = Queue()
        self.stop_signal = None

    def put(self, value):
        """Receives tokens and logs the timestamp of the generation."""
        self.timestamps.append(time.perf_counter())
        self.text_queue.put(value)

    def end(self):
        self.timestamps.append(time.perf_counter())
        self.text_queue.put(self.stop_signal)

    def __iter__(self):
        return self
```
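For readers unfamiliar with this pattern: the streamer only records `time.perf_counter()` timestamps as tokens arrive, and latency statistics are derived afterwards from the gaps between consecutive timestamps. A self-contained sketch of that idea (not taken from the benchmark code itself):

```python
# Standalone sketch: deriving per-token latencies from timestamps collected the same way
# BenchmarkStreamer.put() does. The sleep stands in for the time spent generating one token.
import time

timestamps = []
for _ in range(5):
    time.sleep(0.01)
    timestamps.append(time.perf_counter())

inter_token_latencies = [later - earlier for earlier, later in zip(timestamps, timestamps[1:])]
print(f"mean inter-token latency: {sum(inter_token_latencies) / len(inter_token_latencies):.4f} s")
```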
```
@@ -144,13 +153,22 @@
class BenchmarkRunner:
    """Main benchmark runner that coordinates benchmark execution."""

    def __init__(self, logger: logging.Logger, output_dir: str | None = None, commit_id: str | None = None) -> None:
    def __init__(
        self,
        logger: logging.Logger,
        output_dir: str | None = None,
        branch_name: str | None = None,
        commit_id: str | None = None,
        commit_message: str | None = None,
    ) -> None:
        # Those stay constant for the whole run
        self.logger = logger
        if output_dir is None:
            output_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "benchmark_results")
        self.output_dir = output_dir
        self.branch_name = branch_name
        self.commit_id = get_git_revision() if commit_id is None else commit_id
        self.commit_message = commit_message
        os.makedirs(self.output_dir, exist_ok=True)
        self.profile_dir = None
        # Attributes that are reset for each model
@@ -163,7 +181,7 @@
        self.model = None
        flush_memory()

    def setup_one_run(self, model_id: str, config: BenchmarkConfig) -> None:
    def setup_benchmark(self, model_id: str, config: BenchmarkConfig) -> None:
        # Some attributes only need to be set once per model
        if self._setup_for != model_id:
            self.tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -200,10 +218,13 @@
        self.model = self.model.eval().to(config.device)

        # Kernelize the model if needed
        if config.kernelize:
        if config.kernelize and kernelize is not None and Mode is not None:
            self.model = kernelize(self.model, mode=Mode.INFERENCE)

    def run_one_benchmark(self, model_id: str, config: BenchmarkConfig, num_tokens_to_profile: int = 0) -> None:
    def run_benchmark(
        self, model_id: str, config: BenchmarkConfig, num_tokens_to_profile: int = 0
    ) -> dict[str, Any] | None:
        """Run a single benchmark with the given model ID and config."""
        sdpa_ctx = nullcontext()
        if config.attn_implementation == "sdpa":
            sdpa_backend = get_sdpa_backend(config.sdpa_backend)
@@ -243,7 +264,12 @@
            self.profile_generate(num_tokens_to_profile, config.name)

        return {
            "metadata": BenchmarkMetadata(model_id=model_id, commit_id=self.commit_id),
            "metadata": BenchmarkMetadata(
                model_id=model_id,
                branch_name=self.branch_name,
                commit_id=self.commit_id,
                commit_message=self.commit_message,
            ),
            "measurements": result,
            "config": config,
        }
@@ -305,7 +331,8 @@
        benchmark_configs: list[BenchmarkConfig],
        num_tokens_to_profile: int = 0,
        pretty_print_summary: bool = True,
    ) -> dict[str, Any]:
    ) -> tuple[str, dict[str, Any]]:
        """Run multiple benchmarks for the given model ID and list of benchmark configs."""
        all_results = {}
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        start_time = time.perf_counter()
@@ -324,14 +351,14 @@
                continue

            # Otherwise, run the benchmark
            self.setup_one_run(model_id, config)
            self.setup_benchmark(model_id, config)
            self.logger.info(
                f"Running benchmark of model {model_id} with scenario: {config.name} ({i + 1}/{n_configs})"
            )

            # Launch benchmark in a try/except block to avoid stopping the whole run if one benchmark fails
            try:
                results = self.run_one_benchmark(model_id, config, num_tokens_to_profile)
                results = self.run_benchmark(model_id, config, num_tokens_to_profile)
                if results is not None:
                    all_results[config.hash] = results

@@ -358,7 +385,7 @@
                result["measurements"].pprint(batch_size=result["config"].batch_size, tabs=1)
            print("=" * 100)

        return all_results
        return (timestamp, all_results)

    def save_results(self, model_name: str, results: dict, timestamp: str = "") -> str:
        """Save benchmark results to JSON file."""
@@ -387,3 +414,43 @@

        self.logger.info(f"Results saved to {filepath}")
        return filepath

    def push_results_to_hub(self, dataset_id: str, results: dict[Any, Any], timestamp: str) -> None:
        if PUSH_TO_HUB_TOKEN is None:
            raise ValueError(
                "PUSH_TO_HUB_TOKEN is not set, cannot push results to the Hub. When setting dataset_id, please also set the PUSH_TO_HUB_TOKEN environment variable."
            )

        n_results = len(results)
        self.logger.info(f"Pushing {n_results} results to: {dataset_id}")
        rows = []
        for cfg_hash, entry in results.items():
            row = {
                "benchmark_config_hash": cfg_hash,
                "config": entry["config"].to_dict(),
                "measurements": entry["measurements"].to_dict(),
                "metadata": entry["metadata"].to_dict(),
            }
            rows.append(row)

        ds = Dataset.from_list(rows)
        with tempfile.TemporaryDirectory() as tmp:
            jsonl_path = os.path.join(tmp, "data.jsonl")
            with open(jsonl_path, "w") as f:
                json_lines = []
                for ex in ds:
                    json_lines.append(json.dumps(ex, ensure_ascii=False))
                f.write("\n".join(json_lines))

            api = HfApi()
            # NOTE: we expect the repository to already exist
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") if not timestamp else timestamp
            file_name = f"benchmark_run_{timestamp}.jsonl"
            api.upload_file(
                path_or_fileobj=jsonl_path,
                path_in_repo=file_name,
                repo_id=dataset_id,
                repo_type="dataset",
                token=PUSH_TO_HUB_TOKEN,
            )
        self.logger.info(f"Succesfully uploaded results to: {dataset_id}")
```
```
@@ -1,5 +1,5 @@
from dataclasses import dataclass
from datetime import datetime
from datetime import datetime, timezone
from typing import Any

import numpy as np
@@ -59,19 +59,26 @@

    model_id: str
    timestamp: str
    branch_name: str
    commit_id: str
    commit_message: str
    hardware_info: HardwareInfo

    def __init__(self, model_id: str, commit_id: str):
    def __init__(self, model_id: str, commit_id: str, branch_name: str = "main", commit_message: str = "") -> None:
        self.model_id = model_id
        self.timestamp = datetime.utcnow().isoformat()
        self.timestamp = datetime.now(timezone.utc).isoformat()
        self.branch_name = branch_name
        self.commit_id = commit_id
        self.commit_message = commit_message
        self.hardware_info = HardwareInfo()

    def to_dict(self) -> dict[str, Any]:
        return {
            "model_id": self.model_id,
            "timestamp": self.timestamp,
            "branch_name": self.branch_name,
            "commit_id": self.commit_id,
            "commit_message": self.commit_message,
            "hardware_info": self.hardware_info.to_dict(),
        }
```
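One detail worth calling out in this hunk: the timestamp switches from `datetime.utcnow()` to `datetime.now(timezone.utc)`. A small standalone sketch of the difference, using only the standard library:

```python
# Sketch: naive vs. timezone-aware UTC timestamps (standard library only).
from datetime import datetime, timezone

naive = datetime.utcnow().isoformat()            # no offset in the string, and deprecated since Python 3.12
aware = datetime.now(timezone.utc).isoformat()   # ends with "+00:00", so the timezone survives serialization
print(naive)
print(aware)
```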
```
@@ -4,4 +4,4 @@ gpustat>=1.0.0
torch>=2.0.0
transformers>=4.30.0
datasets>=2.10.0
huggingface_hub>=0.16.0
huggingface_hub>=0.16.0
```
```
@@ -23,7 +23,12 @@ import logging
import sys
import uuid

from framework.benchmark_config import BenchmarkConfig, generate_all_configs, generate_main_configs
from framework.benchmark_config import (
    KERNELIZATION_AVAILABLE,
    BenchmarkConfig,
    cross_generate_configs,
    generate_main_configs,
)
from framework.benchmark_runner import BenchmarkRunner


@@ -33,9 +38,8 @@ if __name__ == "__main__":
    parser.add_argument("--output-dir", type=str, default=None, help="Output dir for benchmark results")
    parser.add_argument("--log-level", type=str, choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO")
    parser.add_argument("--model-id", type=str, help="Specific model ID to benchmark (if supported by benchmarks)")

    parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations")
    parser.add_argument("--iterations", type=int, default=10, help="Number of measurement iterations")
    parser.add_argument("--warmup", "-w", type=int, default=3, help="Number of warmup iterations")
    parser.add_argument("--iterations", "-i", type=int, default=10, help="Number of measurement iterations")

    parser.add_argument("--batch-size", "-b", type=int, nargs="+", help="Batch size")
    parser.add_argument("--sequence-length", "-s", type=int, nargs="+", help="Sequence length")
@@ -44,7 +48,20 @@
    parser.add_argument("--cross-generate", action="store_true", help="Cross-generate all combinations of configs")
    parser.add_argument("--num-tokens-to-profile", "-p", type=int, default=0, help="Number of tokens to profile")

    parser.add_argument("--branch-name", type=str, help="Git branch name")
    parser.add_argument("--commit-id", type=str, help="Git commit ID (if not provided, will auto-detect from git)")
    parser.add_argument("--commit-message", type=str, help="Git commit message")

    parser.add_argument(
        "--no-gpu-monitoring", action="store_true", help="Disables GPU monitoring during benchmark runs"
    )

    parser.add_argument(
        "--push-result-to-dataset",
        type=str,
        default=None,
        help="Name of the dataset to push results to. If not provided, results are not pushed to the Hub.",
    )
    args = parser.parse_args()

    # Setup logging
@@ -70,12 +87,16 @@
    # If there is only one (batch_size, sequence_length, num_tokens_to_generate), we benchmark across configs
    elif len(args.batch_size) * len(args.sequence_length) * len(args.num_tokens_to_generate) == 1:
        if args.cross_generate:
            benchmark_configs = generate_all_configs(
            benchmark_configs = cross_generate_configs(
                attn_impl_and_sdpa_backend=BenchmarkConfig.all_attn_implementations,
                compiled_mode=[None, "default"],  # usually there is not much to gain by compiling with other modes
                kernelized=[False, KERNELIZATION_AVAILABLE],
                warmup_iterations=args.warmup,
                measurement_iterations=args.iterations,
                batch_size=args.batch_size[0],
                sequence_length=args.sequence_length[0],
                num_tokens_to_generate=args.num_tokens_to_generate[0],
                gpu_monitoring=not args.no_gpu_monitoring,
            )
        else:
            benchmark_configs = generate_main_configs(
@@ -106,11 +127,24 @@
        cfg_dict.pop("name")
        benchmark_configs.append(BenchmarkConfig.from_dict(cfg_dict))

    runner = BenchmarkRunner(logger, args.output_dir, args.commit_id)
    results = runner.run_benchmarks(
    runner = BenchmarkRunner(
        logger,
        args.output_dir,
        args.branch_name,
        args.commit_id,
        args.commit_message,
    )
    timestamp, results = runner.run_benchmarks(
        args.model_id,
        benchmark_configs,
        args.num_tokens_to_profile,
        pretty_print_summary=True,
    )
    # runner.save_results(args.model_id, results)

    dataset_id = args.push_result_to_dataset
    if dataset_id is not None and len(results) > 0:
        runner.push_results_to_hub(
            dataset_id,
            results,
            timestamp,
        )
```
```
@@ -58,7 +58,6 @@ NOT_DEVICE_TESTS = {
    "test_model_get_set_embeddings",
    "test_model_main_input_name",
    "test_correct_missing_keys",
    "test_tie_model_weights",
    "test_can_use_safetensors",
    "test_load_save_without_tied_weights",
    "test_tied_weights_keys",
@@ -88,6 +87,8 @@ def pytest_configure(config):
    config.addinivalue_line("markers", "not_device_test: mark the tests always running on cpu")
    config.addinivalue_line("markers", "torch_compile_test: mark test which tests torch compile functionality")
    config.addinivalue_line("markers", "torch_export_test: mark test which tests torch export functionality")
    config.addinivalue_line("markers", "flash_attn_test: mark test which tests flash attention functionality")
    config.addinivalue_line("markers", "flash_attn_3_test: mark test which tests flash attention 3 functionality")

    os.environ["DISABLE_SAFETENSORS_CONVERSION"] = "true"
```
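For context, marker registrations like the two `flash_attn` lines above are what allow tests to be tagged and selected by marker. A hedged, illustrative example of how a test could use one of them (the test itself is hypothetical):

```python
# Illustrative only: opting a test into one of the markers registered in pytest_configure above.
import pytest


@pytest.mark.flash_attn_test
def test_flash_attention_marker_example():
    # A real test would exercise a flash-attention code path; this placeholder only shows the tagging.
    assert True
```

Such tests can then be selected with `pytest -m flash_attn_test`, or excluded with `pytest -m "not flash_attn_test"`.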
```
@@ -5,7 +5,7 @@ USER root
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git-lfs ffmpeg curl
ENV UV_PYTHON=/usr/local/bin/python
RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-cache-dir 'torch<2.9' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir 'torch<2.9' 'torchaudio' 'torchvision' 'torchcodec<0.8' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer

@@ -5,7 +5,7 @@ USER root
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git ffmpeg curl
ENV UV_PYTHON=/usr/local/bin/python
RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-cache-dir 'torch<2.9' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir 'torch<2.9' 'torchaudio' 'torchvision' 'torchcodec<0.8' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]"

@@ -5,7 +5,7 @@ USER root
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git-lfs ffmpeg curl
ENV UV_PYTHON=/usr/local/bin/python
RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-cache-dir 'torch<2.9' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir 'torch<2.9' 'torchaudio' 'torchvision' 'torchcodec<0.8' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken,num2words,video]"
```
```
@@ -24,7 +24,8 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers &&
# 1. Put several commands in a single `RUN` to avoid image/layer exporting issue. Could be revised in the future.
# 2. Regarding `torch` part, We might need to specify proper versions for `torchvision` and `torchaudio`.
# Currently, let's not bother to specify their versions explicitly (so installed with their latest release versions).
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
# 3. For `torchcodec<0.8`: this is quickly added as torch 2.9.0 + torchcodec 0.8.0 fails on our CI env. Need to remove later once they work.
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio "torchcodec<0.8" --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA

RUN python3 -m pip install --no-cache-dir -U timm
```
```
@@ -1,4 +1,4 @@
FROM rocm/pytorch:rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.7.1
FROM rocm/pytorch:rocm7.0.2_ubuntu24.04_py3.12_pytorch_release_2.7.1
LABEL maintainer="Hugging Face"

ARG DEBIAN_FRONTEND=noninteractive
@@ -10,8 +10,8 @@ RUN apt update && \

RUN git lfs install

RUN python3 -m pip install --no-cache-dir --upgrade pip numpy
RUN python3 -m pip install --no-cache-dir --upgrade importlib-metadata setuptools ninja git+https://github.com/facebookresearch/detectron2.git pytesseract "itsdangerous<2.1.0"
RUN python3 -m pip install --no-cache-dir --upgrade pip numpy importlib-metadata setuptools wheel ninja pytesseract "itsdangerous<2.1.0"
RUN python3 -m pip install --no-cache-dir --no-build-isolation git+https://github.com/facebookresearch/detectron2.git

ARG REF=main
WORKDIR /
@@ -39,6 +39,7 @@ RUN python3 -m pip install --no-cache-dir "torchcodec==0.5"
# Install flash attention from source. Tested with commit 6387433156558135a998d5568a9d74c1778666d8
RUN git clone https://github.com/ROCm/flash-attention/ -b tridao && \
    cd flash-attention && \
    GPU_ARCHS="gfx942" python setup.py install
    GPU_ARCHS="gfx942;gfx950" python setup.py install
# GPU_ARCHS builds for MI300, MI325 and MI355

RUN python3 -m pip install --no-cache-dir einops
```
```
@@ -3,11 +3,10 @@ LABEL maintainer="Hugging Face"

SHELL ["/bin/bash", "-c"]

ARG PYTHON_VER=3.11
ARG PYTHON_VER=3.12
ENV TORCH_DEVICE_BACKEND_AUTOLOAD=0
ENV DEBIAN_FRONTEND=noninteractive

RUN apt-get remove -y python3.10 && apt-get autoremove -y
RUN apt-get update && \
    apt-get install -y software-properties-common && \
    add-apt-repository -y ppa:deadsnakes/ppa && \
@@ -23,7 +22,6 @@ RUN apt-get update && \
    apt-utils \
    build-essential \
    ca-certificates \
    clinfo \
    curl \
    git \
    git-lfs \
@@ -35,7 +33,6 @@ RUN apt-get update && \
    rsync \
    sudo \
    libnl-genl-3-200 \
    xpu-smi \
    unzip \
    ffmpeg \
    tesseract-ocr \
@@ -45,34 +42,47 @@ RUN apt-get update && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*


RUN apt-get update && \
    apt-get install -y \
    linux-headers-$(uname -r) \
    linux-modules-extra-$(uname -r) \
    linux-headers-$(uname -r) linux-modules-extra-$(uname -r) \
    flex bison \
    intel-fw-gpu intel-i915-dkms xpu-smi \
    intel-fw-gpu intel-i915-dkms xpu-smi intel-ocloc clinfo\
    intel-opencl-icd libze-intel-gpu1 libze1 \
    intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
    libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
    libegl-mesa0 libegl1 libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
    libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
    mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc \
    mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo \
    libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

RUN pip install --upgrade pip
RUN pip install triton==3.3.0
# Use virtual env because Ubuntu-24 does not allowed pip on original python
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
ENV PATH="/root/.local/bin:$PATH"
ENV VIRTUAL_ENV="/opt/venv"
ENV UV_PYTHON_INSTALL_DIR=/opt/uv/python
RUN uv venv --python ${PYTHON_VER} --seed ${VIRTUAL_ENV}
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

RUN pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/xpu --no-cache-dir
RUN pip install --upgrade pip wheel
RUN pip install triton==3.4.0

RUN pip install evaluate torchdata pyctcdecode pytesseract decord galore-torch fire scipy scikit-learn sentencepiece sacremoses nltk rouge_score librosa soundfile g2p_en mpi4py requests_mock
RUN pip install pretty_midi essentia resampy Levenshtein av sacrebleu phonemizer invisible_watermark schedulefree
RUN pip install gguf hqq compressed_tensors gptqmodel mergekit autoawq deepspeed torchao onnx
RUN pip install hf_transfer huggingface-hub hf-doc-builder datasets optimum-quanto timm transformers accelerate optimum peft
RUN pip install torch==2.8.0+xpu torchvision==0.23.0+xpu torchaudio==2.8.0+xpu --index-url https://download.pytorch.org/whl/xpu --no-cache-dir

RUN pip install torchcodec torchdata --no-cache-dir

RUN pip install evaluate pyctcdecode pytesseract decord galore-torch fire scipy scikit-learn sentencepiece sacremoses nltk rouge_score librosa soundfile g2p_en mpi4py requests_mock
RUN pip install pretty_midi essentia resampy Levenshtein av sacrebleu phonemizer invisible_watermark schedulefree setuptools
RUN pip install gptqmodel --no-build-isolation
RUN pip install gguf hqq compressed_tensors autoawq deepspeed torchao onnx auto_round
RUN pip install hf_transfer huggingface-hub hf-doc-builder datasets optimum-quanto timm transformers accelerate optimum peft diffusers trl kernels

# install liger-kernel
RUN pip install git+https://github.com/linkedin/Liger-Kernel.git --extra-index-url https://download.pytorch.org/whl/test/xpu

# install mergekit
RUN pip install --break-system-packages git+https://github.com/arcee-ai/mergekit.git@v0.1.3

# install bitsandbytes
RUN pip install git+https://github.com/bitsandbytes-foundation/bitsandbytes.git
```
````
@@ -24,7 +24,7 @@ pip install -e ".[dev]"
```

> [!NOTE]
> This command might fail for some OS that are missing dependencies. Check step 4 in [Create a Pull Request](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#create-a-pull-request) to workaround it.
> This command might fail for some OS that are missing dependencies. Check step 4 in [Create a Pull Request](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#create-a-pull-request) to work around it.

Then you need to install our special tool that builds the documentation:

@@ -38,7 +38,7 @@ pip install git+https://github.com/huggingface/doc-builder

## Building the documentation

Once you have setup the `doc-builder` and additional packages, you can generate the documentation by
Once you have set up the `doc-builder` and additional packages, you can generate the documentation by
typing the following command:

```bash
@@ -295,12 +295,11 @@ Here's an example of a tuple return, comprising several objects:
Due to the rapidly growing repository, it is important to make sure that no files that would significantly weigh down the repository are added. This includes images, videos, and other non-text files. We prefer to leverage a hf.co hosted `dataset` like
the ones hosted on [`hf-internal-testing`](https://huggingface.co/hf-internal-testing) in which to place these files and reference
them by URL. We recommend putting them in the following dataset: [huggingface/documentation-images](https://huggingface.co/datasets/huggingface/documentation-images).
If an external contribution, feel free to add the images to your PR and ask a Hugging Face member to migrate your images
to this dataset.
If an external contribution, feel free to add the images to your PR and ask a Hugging Face member to migrate them to this dataset.

## Styling the docstring

We have an automatic script running with the `make style` comment that will make sure that:
We have an automatic script running with the `make style` command that will make sure that:
- the docstrings fully take advantage of the line width
- all code examples are formatted using black, like the code of the Transformers library
````
```
@@ -123,8 +123,6 @@
  title: Run training on Amazon SageMaker
- local: serialization
  title: Export to ONNX
- local: torchscript
  title: Export to TorchScript
- local: notebooks
  title: Notebooks with examples
- local: community
@@ -260,8 +258,6 @@
# title: Models
# - local: main_classes/text_generation
#   title: Text generation
# - local: main_classes/onnx
#   title: ONNX
# - local: main_classes/optimizer_schedules
#   title: Optimization
# - local: main_classes/output
```
`@@ -32,7 +32,7 @@`

To export a 🤗 Transformers model to ONNX, first install an extra dependency:

```bash
pip install optimum[exporters]
pip install optimum-onnx
```

To see all available arguments, refer to the [🤗 Optimum documentation](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli), or view the help on the command line:

`@@ -111,60 +111,3 @@ optimum-cli export onnx --model keras-io/transformers-qa distilbert_base_cased_s`

### Exporting a model for an unsupported architecture

If you wish to contribute by adding support for a model that cannot currently be exported, first check whether it is supported in [`optimum.exporters.onnx`](https://huggingface.co/docs/optimum/exporters/onnx/overview); if it is not, you can [contribute to 🤗 Optimum](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/contribute) directly.

### Exporting a model with `transformers.onnx`

<Tip warning={true}>

`transformers.onnx` is no longer maintained; please export models with 🤗 Optimum as described above. This section will be removed in future versions.

</Tip>

To export a 🤗 Transformers model to ONNX with `transformers.onnx`, install the extra dependencies:

```bash
pip install transformers[onnx]
```

Use the `transformers.onnx` package as a Python module to export a checkpoint using a ready-made configuration:

```bash
python -m transformers.onnx --model=distilbert/distilbert-base-uncased onnx/
```

This exports an ONNX graph of the checkpoint specified by the `--model` argument. Pass any checkpoint on the 🤗 Hub or one that is stored locally.
The resulting `model.onnx` file can then be run on one of the many accelerators that support the ONNX standard. For example, load and run the model with ONNX Runtime as follows:

```python
>>> from transformers import AutoTokenizer
>>> from onnxruntime import InferenceSession

>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
>>> session = InferenceSession("onnx/model.onnx")
>>> # ONNX Runtime expects NumPy arrays as input
>>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")
>>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs))
```

The required output names (such as `["last_hidden_state"]`) can be obtained by looking at the ONNX configuration of each model. For example, for DistilBERT we have:

```python
>>> from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig

>>> config = DistilBertConfig()
>>> onnx_config = DistilBertOnnxConfig(config)
>>> print(list(onnx_config.outputs.keys()))
["last_hidden_state"]
```

The process is identical for TensorFlow checkpoints on the Hub. For example, export a pure TensorFlow checkpoint like this:

```bash
python -m transformers.onnx --model=keras-io/transformers-qa onnx/
```

To export a locally stored model, save the model's weights and tokenizer in the same directory (for example `local-pt-checkpoint`), then export it to ONNX by pointing the `--model` argument of the `transformers.onnx` package at that directory:

```bash
python -m transformers.onnx --model=local-pt-checkpoint onnx/
```
@ -1,154 +0,0 @@
# Export to TorchScript

<Tip>

This is the very beginning of our experiments with TorchScript and we are still exploring its capabilities with variable-input-size models. It is a focus of interest for us, and we will deepen our analysis in upcoming releases, with more code examples, a more flexible implementation, and benchmarks comparing Python-based code with compiled TorchScript.

</Tip>

According to the [TorchScript documentation](https://pytorch.org/docs/stable/jit.html):

> TorchScript is a way to create serializable and optimizable models from PyTorch code.

There are two PyTorch modules, [JIT and TRACE](https://pytorch.org/docs/stable/jit.html), that allow developers to export their models to be reused in other programs, such as efficiency-oriented C++ programs.

We provide an interface that allows you to export 🤗 Transformers models to TorchScript so they can be reused in a different environment than PyTorch-based Python programs. Here we explain how to export and use our models with TorchScript.

Exporting a model requires two things:

- model instantiation with the `torchscript` flag
- a forward pass with dummy inputs

These necessities imply several things developers should be careful about, as detailed below.

## TorchScript flag and tied weights

The `torchscript` flag is necessary because most of the 🤗 Transformers language models have tied weights between their `Embedding` layer and their `Decoding` layer. TorchScript does not allow you to export models that have tied weights, so it is necessary to untie and clone the weights beforehand.

Models instantiated with the `torchscript` flag have their `Embedding` and `Decoding` layers separated, which means that they should not be trained down the line. Training would desynchronize the two layers, leading to unexpected results.

This is not the case for models that don't have a language model head, as those don't have tied weights. These models can be safely exported without the `torchscript` flag.

## Dummy inputs and standard lengths

The dummy inputs are used for a forward pass through the model. While the input values are propagated through the layers, PyTorch keeps track of the different operations executed on each tensor. These recorded operations are then used to create the *trace* of the model.

The trace is created relative to the inputs' dimensions. It is therefore constrained by the dimensions of the dummy input, and will not work for any other sequence length or batch size. When trying with a different size, the following error is raised:

```
`The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2`
```

We recommend tracing the model with a dummy input size at least as large as the largest input that will be fed to the model during inference. Padding can help fill the missing values. However, since the model is traced with a larger input size, the dimensions of the matrix will also be large, resulting in more computations.

Be careful of the total number of operations done on each input and follow the performance closely when exporting variable-sequence-length models.
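
As a rough illustration of the padding recommendation above, here is a sketch (the 128-token length is an arbitrary choice; use whatever your largest real input is):

```python
# Hypothetical helper: build dummy inputs padded to a fixed maximum length, then
# trace with those tensors and always pad real inputs to the same length.
import torch
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = BertModel.from_pretrained("google-bert/bert-base-uncased", torchscript=True).eval()

dummy = tokenizer("Who was Jim Henson?", padding="max_length", max_length=128, return_tensors="pt")
traced = torch.jit.trace(model, [dummy["input_ids"], dummy["attention_mask"]])
```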

## Using TorchScript in Python

This section demonstrates how to save and load models as well as how to use the trace for inference.

### Saving a model

To export a `BertModel` with TorchScript, instantiate `BertModel` from the `BertConfig` class and then save it to disk under the filename `traced_bert.pt`:

```python
from transformers import BertModel, BertTokenizer, BertConfig
import torch

enc = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")

# Tokenizing input text
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = enc.tokenize(text)

# Masking one of the input tokens
masked_index = 8
tokenized_text[masked_index] = "[MASK]"
indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]

# Creating a dummy input
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])
dummy_input = [tokens_tensor, segments_tensors]

# Initializing the model with the torchscript flag
# Flag set to True even though it is not necessary as this model does not have an LM Head.
config = BertConfig(
    vocab_size_or_config_json_file=32000,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    torchscript=True,
)

# Instantiating the model
model = BertModel(config)

# The model needs to be in evaluation mode
model.eval()

# If you are instantiating the model with *from_pretrained* you can also easily set the TorchScript flag
model = BertModel.from_pretrained("google-bert/bert-base-uncased", torchscript=True)

# Creating the trace
traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
torch.jit.save(traced_model, "traced_bert.pt")
```

### Loading a model

You can now load the previously saved `BertModel`, `traced_bert.pt`, from disk and use it on the previously initialized `dummy_input`:

```python
loaded_model = torch.jit.load("traced_bert.pt")
loaded_model.eval()

all_encoder_layers, pooled_output = loaded_model(*dummy_input)
```

### Using a traced model for inference

Use the traced model for inference through its `__call__` dunder method:

```python
traced_model(tokens_tensor, segments_tensors)
```

## Deploying Hugging Face TorchScript models to AWS with the Neuron SDK

AWS introduced the [Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) instance family for low-cost, high-performance machine learning inference in the cloud. Inf1 instances are powered by the AWS Inferentia chip, a custom-built hardware accelerator specializing in deep learning inference workloads. [AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#) is the SDK for Inferentia that supports tracing and optimizing transformers models for deployment on Inf1. The Neuron SDK provides:

1. An easy-to-use API with a one-line code change to trace and optimize a TorchScript model for inference in the cloud.
2. Out-of-the-box performance optimizations for [improved cost-performance](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/benchmark/).
3. Support for Hugging Face Transformers models built with either [PyTorch](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/bert_tutorial/tutorial_pretrained_bert.html) or [TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/tensorflow/huggingface_bert/huggingface_bert.html).

### Implications

Transformers models based on the [BERT (Bidirectional Encoder Representations from Transformers)](https://huggingface.co/docs/transformers/main/model_doc/bert) architecture, or its variants such as [distilBERT](https://huggingface.co/docs/transformers/main/model_doc/distilbert) and [roBERTa](https://huggingface.co/docs/transformers/main/model_doc/roberta), run best on Inf1 for non-generative tasks such as extractive question answering, sequence classification, and token classification. However, text generation tasks can still be adapted to run on Inf1 according to this [AWS Neuron MarianMT tutorial](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html). More information about models that can be converted out of the box on Inferentia can be found in the [Model Architecture Fit](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/models/models-inferentia.html#models-inferentia) section of the Neuron documentation.

### Dependencies

Using AWS Neuron to convert models requires a [Neuron SDK environment](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/index.html#installation-guide), which comes preconfigured on the [AWS Deep Learning AMI](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launching.html).

### Converting a model for AWS Neuron

Convert a model for AWS Neuron with the same code from [Using TorchScript in Python](torchscript#using-torchscript-in-python) to trace a `BertModel`. Import the `torch.neuron` framework extension to access the components of the Neuron SDK through a Python API:

```python
from transformers import BertModel, BertTokenizer, BertConfig
import torch
import torch.neuron
```

You only need to modify the following line:

```diff
- torch.jit.trace(model, [tokens_tensor, segments_tensors])
+ torch.neuron.trace(model, [tokens_tensor, segments_tensors])
```

This enables the Neuron SDK to trace the model and optimize it for Inf1 instances.

To learn more about AWS Neuron SDK features, tools, example tutorials, and the latest updates, please see the [AWS Neuron SDK documentation](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html).
||||
@ -88,6 +88,8 @@
|
||||
title: Tool use
|
||||
- local: chat_templating_writing
|
||||
title: Writing a chat template
|
||||
- local: chat_response_parsing
|
||||
title: Response parsing
|
||||
title: Chat with models
|
||||
- sections:
|
||||
- local: serving
|
||||
@ -227,8 +229,6 @@
|
||||
title: ONNX
|
||||
- local: executorch
|
||||
title: ExecuTorch
|
||||
- local: torchscript
|
||||
title: TorchScript
|
||||
title: Export to production
|
||||
- isExpanded: false
|
||||
sections:
|
||||
@ -1255,6 +1255,8 @@
|
||||
title: Importing Utilities
|
||||
- local: internal/time_series_utils
|
||||
title: Utilities for Time Series
|
||||
- local: internal/rope_utils
|
||||
title: Rotary Embeddings Utilities
|
||||
title: Internal helpers
|
||||
- sections:
|
||||
- local: reference/environment_variables
|
||||
|
||||
@ -95,9 +95,12 @@ print(tokenizer.decode(outputs[0][len(inputs["input_ids"][0]):]))
|
||||
|
||||
The chat model called the `get_current_temperature` tool with the correct parameters from the docstring. It inferred France as the location based on Paris, and that it should use Celsius for the units of temperature.
|
||||
|
||||
A model **cannot actually call the tool itself**. It requests a tool call, and it's your job to handle the call and append it and the result to the chat history.
|
||||
A model **cannot actually call the tool itself**. It requests a tool call, and it's your job to handle the call and append it and the result to the chat history. For
|
||||
models that support [response parsing](./chat_response_parsing), the response parsing will be handled automatically, and you can just use
|
||||
[`~PreTrainedTokenizer.parse_response`] to extract the tool call. For other models, you'll need to manually translate the output
|
||||
string into a tool call dict.
|
||||
|
||||
Hold the call in the `tool_calls` key of an `assistant` message. This is the recommended API, and should be supported by the chat template of most tool-using models.
|
||||
Regardless of the approach you use, the tool call should go in the `tool_calls` key of an `assistant` message. This is the recommended API, and should be supported by the chat template of most tool-using models.
|
||||
|
||||
> [!WARNING]
|
||||
> Although `tool_calls` is similar to the OpenAI API, the OpenAI API uses a JSON string as its `tool_calls` format. This may cause errors or strange model behavior if used in Transformers, which expects a dict.
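
For example, appending the tool call from above to the chat history could look like the following sketch (the tool name and arguments are illustrative and depend on your own tool definitions):

```python
# The arguments stay a Python dict, not a JSON string.
tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}}
messages.append({"role": "assistant", "tool_calls": [{"type": "function", "function": tool_call}]})

# After you actually run the tool, append its result so the model can use it in its next turn.
messages.append({"role": "tool", "name": "get_current_temperature", "content": "22.0"})
```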
|
||||
|
||||
233
docs/source/en/chat_response_parsing.md
Normal file
@ -0,0 +1,233 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Response Parsing
|
||||
|
||||
It is increasingly common for chat models to generate structured outputs, rather than just a single reply string.
|
||||
The most common uses for structured outputs are [tool calling](./chat_extras) and [reasoning models](https://huggingface.co/reasoning-course).
|
||||
Tool calling models can output tool calls, containing the name of the tool to call and any arguments to be passed to it,
|
||||
while reasoning models often output reasoning steps as a "chain of thought". Some recent models even use both of these,
|
||||
and may output reasoning and/or one or more tool calls before their final answer.
|
||||
|
||||
Models with structured outputs pose a challenge for chat templating, because the output needs to be parsed before it
|
||||
can be appended to the chat. For a concrete example, let's say we ask [GPT-OSS](https://huggingface.co/openai/gpt-oss-120b)
|
||||
what the weather is like, and it thinks and decides to call a tool. Here's what the raw model output might look like:
|
||||
|
||||
```txt
|
||||
<|start|>analysis<|message|>The user asks: "What is the weather like in SF?" We need to get the location of the user? The user explicitly asks about SF (San Francisco).
|
||||
So we need to get the current weather in San Francisco, CA. We need to call get_current_weather function. But we need to call function to get weather data.
|
||||
So we should call get_current_weather with location "San Francisco, CA". Let's do that.
|
||||
We will call function get_current_weather.<|end|><|start|>commentary to=functions.get_current_weather<|channel|>commentary <|constrain|>json<|message|>{"location":"San Francisco, CA"}<|call|>
|
||||
|
||||
```
|
||||
|
||||
But if you want to append this to a chat, you'll need to format it as a chat message dict, like this:
|
||||
|
||||
```json
|
||||
{
|
||||
"role": "assistant",
|
||||
"thinking": "The user asks: \"What is the weather like in SF?\" We need to get the location of the user? The user explicitly asks about SF (San Francisco). So we need to get the current weather in San Francisco, CA. We need to call get_current_weather function. But we need to call function to get weather data. So we should call get_current_weather with location \"San Francisco, CA\". Let's do that.",
|
||||
"tool_calls": [
|
||||
{
|
||||
"name": "get_current_weather",
|
||||
"arguments": {
|
||||
"location": "San Francisco, CA"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
Chat **templates** give us a way to turn messages into formatted input for a model, but we need something else to
|
||||
parse model output back into a standard message dict. This is what chat **parsing** is for.
|
||||
|
||||
## The [`~PreTrainedTokenizerBase.parse_response`] method
|
||||
|
||||
Parsing a chat response on a model that supports it is straightforward. Simply take the raw, decoded output from
|
||||
[`~generation.GenerationMixin.generate`], and pass it to the tokenizer's [`~PreTrainedTokenizerBase.parse_response`] method:
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
checkpoint = "HuggingFaceTB/SmolLM3-3B"
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
|
||||
model = AutoModelForCausalLM.from_pretrained(checkpoint, dtype="auto", device_map="auto")
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hey! Can you summarize the end of the Cold War as briefly as possible? Like, comically briefly. It should really leave out almost most of the relevant information."
|
||||
}
|
||||
]
|
||||
|
||||
input_ids = tokenizer.apply_chat_template(
|
||||
messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_tensors="pt"
|
||||
).to(model.device)
|
||||
|
||||
outputs = model.generate(input_ids, max_new_tokens=1024)[0, input_ids.shape[1]:]
|
||||
out_text = tokenizer.decode(outputs)
|
||||
parsed = tokenizer.parse_response(out_text)
|
||||
print(parsed.keys())
|
||||
```
|
||||
|
||||
And you should get:
|
||||
|
||||
```text
|
||||
dict_keys(['thinking', 'content'])
|
||||
```
|
||||
|
||||
And that's all you need to start using response parsing! `parse_response` should return a complete message dict that is ready to be appended to the chat history.
|
||||
When the tokenizer does not support response parsing, `parse_response` will throw an error. We hope to add support
|
||||
to more tokenizers over time.
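
Continuing the conversation is then just a matter of appending the parsed message. A minimal sketch, reusing the `tokenizer`, `model`, and `messages` objects from the example above:

```python
# Make sure the appended message has a role, then carry on as usual.
messages.append({"role": "assistant", **parsed})
messages.append({"role": "user", "content": "Great, now make it even shorter."})

input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="pt",
).to(model.device)
outputs = model.generate(input_ids, max_new_tokens=512)
```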
|
||||
|
||||
## Developers: Understanding a simple response schema
|
||||
|
||||
Under the hood, `parse_response` uses a **JSON schema** to parse the model output. A JSON schema represents
|
||||
the structure of the output message dict. The schema is augmented with additional fields that indicate how the
|
||||
output message string should be parsed into the expected format. Let's take a look at the schema for a SmolLM response,
|
||||
excluding tool calls for now:
|
||||
|
||||
```python
|
||||
{
|
||||
"x-regex": "(?:<think>\n?(?P<thinking>.+?)\n?</think>)?\s*(?P<content>.+?)?\s*(?:<\|im_end\|>|$)",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"role": {"const": "assistant"},
|
||||
"content": {"type": "string"},
|
||||
"thinking": {"type": "string"}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
We can see that the schema describes a JSON "object" (a `dict`, in other words) with three keys: `role`, `content`, and `thinking`.
|
||||
Because all assistant responses have the role "assistant", the `role` key is a `const`(ant). The other two keys are strings, extracted
|
||||
from the named groups in the regex in the `x-regex` field.
|
||||
|
||||
Like chat templates, response schemas are set as a property of the tokenizer. To enable response parsing, all you need
|
||||
to do is set `tokenizer.response_schema` to a valid schema dict, and `tokenizer.parse_response()` will work! Again, like
|
||||
chat templates, this schema will be saved with the processor, so once you set it, you can use `save_pretrained()` or `push_to_hub()` to
|
||||
save and share the schema.
|
||||
|
||||
## Developers: Complex schemas
|
||||
|
||||
Now, let's look at a more complex schema, which includes tool calls, to gain more of an understanding of the parser
|
||||
internals. For this, we'll use the `GPT-OSS` schema. GPT-OSS emits both tool calls and thinking blocks, and it uses
|
||||
an unusual format where model responses are tagged with one of three "channels": `commentary` for things like
|
||||
tool calls, `analysis` for chain of thought blocks, and `final` for messages intended to be sent to the user.
|
||||
A full message where the model calls a tool named `get_current_weather` might look like this, with some extra linebreaks added for clarity:
|
||||
|
||||
```text
|
||||
<|channel|>analysis<|message|>
|
||||
The user asks: "What is the weather like in SF?" So we need to get the current weather in San Francisco, CA.
|
||||
We need to call get_current_weather function. So we should call get_current_weather with location "San Francisco, CA".
|
||||
<|end|>
|
||||
<|start|>assistant<|channel|>commentary
|
||||
to=functions.get_current_weather <|constrain|>json<|message|>
|
||||
{
|
||||
"location": "San Francisco, CA"
|
||||
}
|
||||
<|call|>
|
||||
```
|
||||
|
||||
Parsing proceeds recursively; the output of a regex (or other parser) at one level becomes the input to the nodes below it.
|
||||
In other words, don't feel like you have to parse the entire output in one enormous regex! Instead, start with the schema,
|
||||
and then add regexes to extract the relevant chunks as you go. Here's a schema that will parse it, with some
|
||||
explanatory comments:
|
||||
|
||||
```python
|
||||
{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"role": {"const": "assistant"},
|
||||
# "content" and "thinking" are both similar to the previous example, and just extract a single string
|
||||
# However, rather than using a single regex with named groups to extract both, we use a regex in each subkey.
|
||||
# When an object node has no parser/regex, the entire input string is passed to all of its children, so
|
||||
# parsing can either be done with named groups at the object level, or with separate regexes at the property level.
|
||||
"content": {"type": "string", "x-regex": r"<\|channel\|>final<\|message\|>(.*?)(?:<\|end\|>|$)"},
|
||||
"thinking": {"type": "string", "x-regex": r"<\|channel\|>analysis<\|message\|>(.*?)<\|end\|>"},
|
||||
"tool_calls": {
|
||||
# "x-regex-iterator" uses re.findall to find multiple possible manages, and returns them as an
|
||||
# array/list. You don't need to worry about array handling, though - each item in the array will be
|
||||
# parsed by the `items` schema, so just write the schema for a single item.
|
||||
"x-regex-iterator": r"<\|channel\|>commentary (to=functions\..*?<\|message\|>.*?)(?:<\|call\|>|$)",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
# A const property is a fixed value, and the input has no effect on it.
|
||||
"type": {"const": "function"},
|
||||
# Here, we wrap the entire tool call dict in a `{"function": ...}` block. The input string is passed through to it unchanged.
|
||||
"function": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {"type": "string", "x-regex": r"^to=functions\.(\w+)"},
|
||||
"arguments": {
|
||||
"type": "object",
|
||||
"x-regex": "<\|message\|>(.*)",
|
||||
# The "x-parser" field indicates that the extracted string should be parsed as JSON.
|
||||
# The output is then passed to the schema nodes below and recursive parsing continues.
|
||||
"x-parser": "json",
|
||||
"additionalProperties": {"type": "any"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
```
|
||||
|
||||
## Developers: Understanding the parser logic
|
||||
|
||||
The parser follows a few simple rules:
|
||||
|
||||
1. Each level of the schema receives input from the level above, applies any regex or parser it has, and then passes the output to its children.
|
||||
2. The root level receives the entire decoded model output string as input.
|
||||
3. If a node has structured content after parsing (for example, if the regex has named groups and returns a dict, or if the parser returns a dict or list),
|
||||
then that structured content is mapped to the node's children, and each child node receives its corresponding value as input.
|
||||
4. If an `object` (dict) node has unstructured (string) output, then the entire string is passed to all of its children. This allows child nodes
|
||||
to handle parsing individually rather than requiring a single parent regex to extract all keys at once.
|
||||
5. If an `array` (list) node has unstructured (string) output, then this throws an error.
|
||||
|
||||
There is a small set of allowable `x-` keys that indicate how parsing should be done at each node:
|
||||
- `x-regex`: A regex string to apply to the input. If the regex has named groups, the output is a dict of group names to values. Named groups should only be used in `object` nodes.
|
||||
Otherwise, the regex must have exactly one unnamed capturing group, and the output is the value of that group as a string.
|
||||
- `x-regex-iterator`: A regex string to apply to the input using `re.findall()`. The output is a list of all matches.
|
||||
This should only be used in `array` nodes, and the regex must have exactly one unnamed capturing group. The output is distributed to
|
||||
the node's `items` schema.
|
||||
- `x-parser`: Calls a built-in parser to apply to the input. Currently, the only supported parser is `json`, which parses the input string as JSON.
|
||||
The output is passed to the child nodes for further parsing. Note that the `json` parser can return deeply nested output - in this case, the output
|
||||
will be progressively unwrapped as it is passed through child nodes. The child nodes do not need additional `x-parser` or `x-regex` fields in this case,
|
||||
but their structure must match the structure of the parsed JSON.
|
||||
- `x-parser-args`: Only allowed in conjunction with `x-parser`. This is a dict of additional arguments that control parsing. Right now, the only supported
|
||||
argument is `transform`, which specifies a `jmespath` transformation to apply to the output. This is useful when the JSON parser returns a structure
|
||||
that needs to be modified to match the schema.
|
||||
- `x-regex-key-value`: This is rarely necessary, but it can be useful when parsing key-value pairs in non-JSON format where the names of the keys are not known
|
||||
in advance, such as when a model emits XML tool calls with arbitrary argument names. The regex must have exactly two named capturing groups,
|
||||
`key` and `value`, and the output is a dict mapping keys to values. This should only be used in `object` nodes.
|
||||
|
||||
In general, multiple regexes/parsers cannot be combined at the same level. The exception is that `x-regex`, returning a single string, can be combined with the other parsers. In this case,
|
||||
`x-regex` is applied first, and then the output is passed to the other parser, either `x-regex-iterator`, `x-parser`, or `x-regex-key-value`.
|
||||
|
||||
Putting these ideas together, you can see that the input flows through the schema, being parsed at each level and then distributed to child nodes. Each level
|
||||
only needs to extract the input content that is relevant for that part of the schema, and can then let its child nodes handle the rest. Internally, this is handled
|
||||
with a parser function that receives input, applies any regexes/parsers at the current level, then maps the result to its child nodes before recursively calling itself on each of them.
|
||||
Recursion terminates when it reaches leaf nodes, usually primitive types like `string` or `number`, which simply return the input they receive.
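
To make that recursion concrete, here is a deliberately simplified sketch of such a parser. It is not the actual Transformers implementation (it ignores `x-parser-args`, `x-regex-key-value`, and all error handling), but it follows the same rules described above:

```python
import json
import re

def parse_node(schema: dict, value):
    # 1. Apply any regex/parser at this level.
    if isinstance(value, str) and "x-regex" in schema:
        match = re.search(schema["x-regex"], value, flags=re.DOTALL)
        if match is None:
            return None
        named = match.groupdict()
        value = named if named else match.group(1)
    if isinstance(value, str) and "x-regex-iterator" in schema:
        value = re.findall(schema["x-regex-iterator"], value, flags=re.DOTALL)
    if isinstance(value, str) and schema.get("x-parser") == "json":
        value = json.loads(value)

    # 2. Distribute the result to child nodes and recurse.
    if schema.get("type") == "object" and "properties" in schema:
        out = {}
        for key, child in schema["properties"].items():
            if "const" in child:
                out[key] = child["const"]
            elif isinstance(value, dict):  # structured output: each child gets its own key
                if value.get(key) is not None:
                    out[key] = parse_node(child, value[key])
            else:  # unstructured (string) output: every child sees the whole string
                parsed = parse_node(child, value)
                if parsed is not None:
                    out[key] = parsed
        return out
    if schema.get("type") == "array":
        if isinstance(value, str):
            raise ValueError("array nodes need a regex/parser that produces a list")
        return [parse_node(schema["items"], item) for item in value]

    # 3. Leaf nodes return their input unchanged.
    return value
```

Running this sketch on the GPT-OSS schema and raw output shown earlier should yield a structured assistant message with `thinking` and `tool_calls` entries.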
|
||||
@ -18,7 +18,7 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
[ExecuTorch](https://pytorch.org/executorch/stable/index.html) runs PyTorch models on mobile and edge devices. Export your Transformers models to the ExecuTorch format with [Optimum ExecuTorch](https://github.com/huggingface/optimum-executorch) with the command below.
|
||||
|
||||
```
|
||||
```bash
|
||||
optimum-cli export executorch \
|
||||
--model "HuggingFaceTB/SmolLM2-135M-Instruct" \
|
||||
--task "text-generation" \
|
||||
@ -29,4 +29,5 @@ optimum-cli export executorch \
|
||||
--qembedding 8w \
|
||||
--output_dir="hf_smollm2"
|
||||
```
|
||||
|
||||
Run `optimum-cli export executorch --help` to see all export options. For detailed export instructions, check the [README](optimum/exporters/executorch/README.md).
|
||||
|
||||
@ -320,7 +320,7 @@ df.sort_values(by=['skipped_proportion'], ascending=False)
|
||||
You can focus on a specific test method using `--test_method_name`:
|
||||
|
||||
```bash
|
||||
$ python utils/scan_skipped_tests.py --test_method_name test_inputs_embeds --output_dir path/to/output
|
||||
python utils/scan_skipped_tests.py --test_method_name test_inputs_embeds --output_dir path/to/output
|
||||
```
|
||||
|
||||
- `--test_method_name`: Name of the test method to scan (e.g., `test_inputs_embeds`).
|
||||
|
||||
83
docs/source/en/internal/rope_utils.md
Normal file
@ -0,0 +1,83 @@
|
||||
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Utilities for Rotary Embedding
|
||||
|
||||
This page explains how the Rotary Embedding is computed and applied in Transformers and what types of RoPE are supported.
|
||||
|
||||
## Overview
|
||||
|
||||
Rotary Position Embeddings are a technique used to inject positional information into attention mechanisms without relying on explicit position encodings.
|
||||
Instead of adding position vectors to token embeddings, RoPE rotates query and key vectors in the complex plane according to their positions, enabling relative positional awareness and better extrapolation to unseen sequence lengths.

The Transformers library provides a flexible and extensible implementation of the various RoPE types defined in [`~modeling_rope_utils.ROPE_VALIDATION_FUNCTIONS`], including both the default and scaled variants:
|
||||
|
||||
| Rope Type | Description |
|
||||
|------------|-------------|
|
||||
| `"default"` | Standard rotary embedding as in LLaMA. |
|
||||
| `"linear"` | Linear-scaled RoPE which allows longer context windows. |
|
||||
| `"dynamic"` | NTK-aware scaling computed by rescaling frequency base (`θ`) for longer context. |
|
||||
| `"yarn"` | YaRN scaling variant providing smoother extrapolation and stability. |
|
||||
| `"longrope"` | [LongRoPE](https://github.com/microsoft/LongRoPE) scaling as in Phi-2 model series. |
|
||||
| `"llama3"` | RoPE scaling as in Llama3.1. |
|
||||
|
||||
## Configuration in Model Configs
|
||||
|
||||
To enable and customize rotary embeddings, add a `rope_parameters` field to your model’s configuration file (`config.json`). This field controls the RoPE behavior across model layers. Note that each RoPE variant defines its own set of expected keys, and missing keys will raise an error. See the example below, which creates a Llama config with default RoPE parameters:
|
||||
|
||||
```python
|
||||
from transformers import LlamaConfig
|
||||
|
||||
config = LlamaConfig()
|
||||
config.rope_parameters = {
|
||||
"rope_type": "default", # type of RoPE to use
|
||||
"rope_theta": 10000.0 # base frequency parameter
|
||||
}
|
||||
|
||||
# If we want to apply a scaled RoPE type, we need to pass extra parameters
|
||||
config.rope_parameters = {
|
||||
"rope_type": "linear",
|
||||
"rope_theta": 10000.0,
|
||||
"factor": 8.0 # scale factor for context extension
|
||||
}
|
||||
```
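
As a quick sanity check, you can build a small randomly initialized model from such a config and confirm the parameters are picked up. A minimal sketch (the tiny layer sizes are arbitrary and only for illustration):

```python
from transformers import LlamaConfig, LlamaModel

config = LlamaConfig(hidden_size=256, num_hidden_layers=2, num_attention_heads=4, intermediate_size=512)
config.rope_parameters = {"rope_type": "linear", "rope_theta": 10000.0, "factor": 8.0}

model = LlamaModel(config)  # the rotary embedding is built from config.rope_parameters
print(model.config.rope_parameters)
```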
|
||||
|
||||
## Per-Layer-Type RoPE Configuration
|
||||
|
||||
Some models such as Gemma-3 use different layer types with different attention mechanisms, i.e. "full attention" in some blocks and "sliding-window attention" in others. Transformers supports specifying distinct RoPE parameters per layer type for these models. In this case, `rope_parameters` should be a nested dictionary, where top-level keys correspond to `config.layer_types` and values are per-type RoPE parameters. During model initialization, each decoder layer will automatically look up the matching RoPE configuration based on its declared layer type.
|
||||
|
||||
```python
|
||||
from transformers import Gemma3Config
|
||||
|
||||
config = Gemma3Config()
|
||||
config.rope_parameters = {
|
||||
"full_attention": {
|
||||
"rope_type": "dynamic",
|
||||
"rope_theta": 1000000.0,
|
||||
"factor": 8.0,
|
||||
"original_max_position_embeddings": 8096,
|
||||
},
|
||||
"sliding_attention": {
|
||||
"rope_type": "default",
|
||||
"rope_theta": 10000.0,
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Utilities
|
||||
|
||||
[[autodoc]] RopeParameters
|
||||
- __call__
|
||||
@ -1,3 +1,3 @@
|
||||
# Overview
|
||||
|
||||
Kernels in transformers are used to optimize the performance of models with custom layers from the hub and very low effort.
|
||||
Kernels in transformers are used to optimize the performance of models with custom layers from the hub and very low effort.
|
||||
|
||||
@ -67,6 +67,6 @@ Examples of use can be found in the [example scripts](../examples) or [example n
|
||||
|
||||
[[autodoc]] data.data_collator.DataCollatorWithFlattening
|
||||
|
||||
# DataCollatorForMultipleChoice
|
||||
## DataCollatorForMultipleChoice
|
||||
|
||||
[[autodoc]] data.data_collator.DataCollatorForMultipleChoice
|
||||
|
||||
@ -267,6 +267,7 @@ about how many forward passes you inputs are actually going to trigger, you can
|
||||
independently of the inputs. The caveats from the previous section still apply.
|
||||
|
||||
## Pipeline FP16 inference
|
||||
|
||||
Models can be run in FP16 which can be significantly faster on GPU while saving memory. Most models will not suffer noticeable performance loss from this. The larger the model, the less likely that it will.
|
||||
|
||||
To enable FP16 inference, you can simply pass `dtype=torch.float16` or `dtype='float16'` to the pipeline constructor. Note that this only works for models with a PyTorch backend. Your inputs will be converted to FP16 internally.
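
For instance, a minimal sketch (the task and checkpoint are illustrative; any PyTorch-backed pipeline works the same way):

```python
import torch
from transformers import pipeline

pipe = pipeline(
    "text-classification",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    dtype=torch.float16,
    device=0,  # FP16 is mainly useful on GPU
)
print(pipe("FP16 inference is noticeably faster on GPU."))
```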
|
||||
@ -334,6 +335,7 @@ Pipelines available for audio tasks include the following.
|
||||
Pipelines available for computer vision tasks include the following.
|
||||
|
||||
### DepthEstimationPipeline
|
||||
|
||||
[[autodoc]] DepthEstimationPipeline
|
||||
- __call__
|
||||
- all
|
||||
|
||||
@ -43,6 +43,7 @@ Learn how to quantize models in the [Quantization](../quantization) guide.
|
||||
[[autodoc]] AwqConfig
|
||||
|
||||
## EetqConfig
|
||||
|
||||
[[autodoc]] EetqConfig
|
||||
|
||||
## GPTQConfig
|
||||
|
||||
@ -50,14 +50,14 @@ several advanced alignment methods which can be used to map between the original
|
||||
token space (e.g., getting the index of the token comprising a given character or the span of characters corresponding
|
||||
to a given token).
|
||||
|
||||
# Multimodal Tokenizer
|
||||
## Multimodal Tokenizer
|
||||
|
||||
Apart from that each tokenizer can be a "multimodal" tokenizer which means that the tokenizer will hold all relevant special tokens
|
||||
as part of tokenizer attributes for easier access. For example, if the tokenizer is loaded from a vision-language model like LLaVA, you will
|
||||
be able to access `tokenizer.image_token_id` to obtain the special image token used as a placeholder.
|
||||
|
||||
To enable extra special tokens for any type of tokenizer, you have to add the following lines and save the tokenizer. Extra special tokens do not
|
||||
have to be modality related and can ne anything that the model often needs access to. In the below code, tokenizer at `output_dir` will have direct access
|
||||
have to be modality related and can be anything that the model often needs access to. In the below code, tokenizer at `output_dir` will have direct access
|
||||
to three more special tokens.
|
||||
|
||||
```python
|
||||
|
||||
@ -23,6 +23,7 @@ The video processor extends the functionality of image processors by allowing Vi
|
||||
When adding a new VLM or updating an existing one to enable distinct video preprocessing, saving and reloading the processor configuration will store the video related arguments in a dedicated file named `video_preprocessing_config.json`. Don't worry if you haven't updated your VLM, the processor will try to load video related configurations from a file named `preprocessing_config.json`.
|
||||
|
||||
### Usage Example
|
||||
|
||||
Here's an example of how to load a video processor with [`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf) model:
|
||||
|
||||
```python
|
||||
|
||||
@ -31,7 +31,7 @@ This model was contributed by [Connor Henderson](https://huggingface.co/connor-h
|
||||
|
||||
FastSpeech2's general structure with a Mel-spectrogram decoder was implemented, and the traditional transformer blocks were replaced with conformer blocks as done in the ESPnet library.
|
||||
|
||||
#### FastSpeech2 Model Architecture
|
||||
### FastSpeech2 Model Architecture
|
||||
|
||||

|
||||
|
||||
|
||||
@ -70,8 +70,8 @@ from transformers import AutoProcessor, Florence2ForConditionalGeneration
|
||||
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
|
||||
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
|
||||
|
||||
model = Florence2ForConditionalGeneration.from_pretrained("microsoft/Florence-2-base", dtype=torch.bfloat16, device_map="auto")
|
||||
processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base")
|
||||
model = Florence2ForConditionalGeneration.from_pretrained("florence-community/Florence-2-base", dtype=torch.bfloat16, device_map="auto")
|
||||
processor = AutoProcessor.from_pretrained("florence-community/Florence-2-base")
|
||||
|
||||
task_prompt = "<OD>"
|
||||
inputs = processor(text=task_prompt, images=image, return_tensors="pt").to(model.device)
|
||||
@ -105,12 +105,12 @@ from transformers import AutoProcessor, Florence2ForConditionalGeneration, BitsA
|
||||
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
|
||||
|
||||
model = Florence2ForConditionalGeneration.from_pretrained(
|
||||
"microsoft/Florence-2-large",
|
||||
"florence-community/Florence-2-base",
|
||||
dtype=torch.bfloat16,
|
||||
device_map="auto",
|
||||
quantization_config=quantization_config
|
||||
)
|
||||
processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large")
|
||||
processor = AutoProcessor.from_pretrained("florence-community/Florence-2-base")
|
||||
|
||||
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
|
||||
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
|
||||
|
||||
@ -33,7 +33,7 @@ this model, including [Alternating Updates][altup] (AltUp), [Learned Augmented R
|
||||
[MatFormer][matformer], Per-Layer Embeddings (PLE), [Activation Sparsity with Statistical Top-k][spark-transformer], and KV cache sharing. The language model uses
|
||||
a similar attention pattern to [Gemma 3](./gemma3) with alternating 4 local sliding window self-attention layers for
|
||||
every global self-attention layer with a maximum context length of 32k tokens. Gemma 3n introduces
|
||||
[MobileNet v5][mobilenetv5] as the vision encoder, using a default resolution of 768x768 pixels, and adds a newly
|
||||
MobileNet v5 as the vision encoder, using a default resolution of 768x768 pixels, and adds a newly
|
||||
trained audio encoder based on the [Universal Speech Model][usm] (USM) architecture.
|
||||
|
||||
The instruction-tuned variant was post-trained with knowledge distillation and reinforcement learning.
|
||||
|
||||
@ -63,11 +63,6 @@ The attributes can be obtained from model config, as `model.config.num_query_tok
|
||||
[[autodoc]] InstructBlipVideoVideoProcessor
|
||||
- preprocess
|
||||
|
||||
## InstructBlipVideoImageProcessor
|
||||
|
||||
[[autodoc]] InstructBlipVideoImageProcessor
|
||||
- preprocess
|
||||
|
||||
## InstructBlipVideoVisionModel
|
||||
|
||||
[[autodoc]] InstructBlipVideoVisionModel
|
||||
|
||||
@ -88,16 +88,16 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size
|
||||
import torch
|
||||
from PIL import Image
|
||||
import requests
|
||||
|
||||
|
||||
processor = AutoImageProcessor.from_pretrained("ETH-CVG/lightglue_superpoint")
|
||||
model = AutoModel.from_pretrained("ETH-CVG/lightglue_superpoint")
|
||||
|
||||
|
||||
# LightGlue requires pairs of images
|
||||
images = [image1, image2]
|
||||
inputs = processor(images, return_tensors="pt")
|
||||
with torch.inference_mode():
|
||||
outputs = model(**inputs)
|
||||
|
||||
|
||||
# Extract matching information
|
||||
keypoints0 = outputs.keypoints0 # Keypoints in first image
|
||||
keypoints1 = outputs.keypoints1 # Keypoints in second image
|
||||
@ -112,7 +112,7 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size
|
||||
# Process outputs for visualization
|
||||
image_sizes = [[(image.height, image.width) for image in images]]
|
||||
processed_outputs = processor.post_process_keypoint_matching(outputs, image_sizes, threshold=0.2)
|
||||
|
||||
|
||||
for i, output in enumerate(processed_outputs):
|
||||
print(f"For the image pair {i}")
|
||||
for keypoint0, keypoint1, matching_score in zip(
|
||||
@ -147,6 +147,13 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size
|
||||
- post_process_keypoint_matching
|
||||
- visualize_keypoint_matching
|
||||
|
||||
## LightGlueImageProcessorFast
|
||||
|
||||
[[autodoc]] LightGlueImageProcessorFast
|
||||
- preprocess
|
||||
- post_process_keypoint_matching
|
||||
- visualize_keypoint_matching
|
||||
|
||||
## LightGlueForKeypointMatching
|
||||
|
||||
[[autodoc]] LightGlueForKeypointMatching
|
||||
|
||||
@ -247,10 +247,6 @@ model = LlavaNextVideoForConditionalGeneration.from_pretrained(
|
||||
|
||||
[[autodoc]] LlavaNextVideoProcessor
|
||||
|
||||
## LlavaNextVideoImageProcessor
|
||||
|
||||
[[autodoc]] LlavaNextVideoImageProcessor
|
||||
|
||||
## LlavaNextVideoVideoProcessor
|
||||
|
||||
[[autodoc]] LlavaNextVideoVideoProcessor
|
||||
|
||||
@ -54,7 +54,7 @@ model.set_output_embeddings(resized_embeddings)
|
||||
|
||||
## Usage Example
|
||||
|
||||
#### Instruct model
|
||||
### Instruct model
|
||||
|
||||
```python
|
||||
import torch
|
||||
@ -80,7 +80,7 @@ output = model.generate(**inputs, max_new_tokens=25)
|
||||
print(processor.decode(output[0]))
|
||||
```
|
||||
|
||||
#### Base model
|
||||
### Base model
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
@ -88,16 +88,16 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size
|
||||
import torch
|
||||
from PIL import Image
|
||||
import requests
|
||||
|
||||
|
||||
processor = AutoImageProcessor.from_pretrained("magic-leap-community/superglue_outdoor")
|
||||
model = AutoModel.from_pretrained("magic-leap-community/superglue_outdoor")
|
||||
|
||||
|
||||
# SuperGlue requires pairs of images
|
||||
images = [image1, image2]
|
||||
inputs = processor(images, return_tensors="pt")
|
||||
with torch.inference_mode():
|
||||
outputs = model(**inputs)
|
||||
|
||||
|
||||
# Extract matching information
|
||||
keypoints0 = outputs.keypoints0 # Keypoints in first image
|
||||
keypoints1 = outputs.keypoints1 # Keypoints in second image
|
||||
@ -112,7 +112,7 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size
|
||||
# Process outputs for visualization
|
||||
image_sizes = [[(image.height, image.width) for image in images]]
|
||||
processed_outputs = processor.post_process_keypoint_matching(outputs, image_sizes, threshold=0.2)
|
||||
|
||||
|
||||
for i, output in enumerate(processed_outputs):
|
||||
print(f"For the image pair {i}")
|
||||
for keypoint0, keypoint1, matching_score in zip(
|
||||
@ -147,6 +147,13 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size
|
||||
- post_process_keypoint_matching
|
||||
- visualize_keypoint_matching
|
||||
|
||||
## SuperGlueImageProcessorFast
|
||||
|
||||
[[autodoc]] SuperGlueImageProcessorFast
|
||||
- preprocess
|
||||
- post_process_keypoint_matching
|
||||
- visualize_keypoint_matching
|
||||
|
||||
## SuperGlueForKeypointMatching
|
||||
|
||||
[[autodoc]] SuperGlueForKeypointMatching
|
||||
|
||||
@ -288,7 +288,7 @@ class Olmo2DecoderLayer(OlmoDecoderLayer):
|
||||
output_attentions: Optional[bool] = False,
|
||||
use_cache: Optional[bool] = False,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
|
||||
position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
|
||||
**kwargs,
|
||||
) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
|
||||
residual = hidden_states
|
||||
|
||||
@ -45,7 +45,13 @@ This guide shows how to enable tensor parallelism with Transformers and differen
|
||||
|
||||
## Partitioning a model
|
||||
|
||||
Transformers supports tensor parallelism if a model has a `tp_plan`. Set `tp_plan="auto"` to automatically use a tensor parallelism plan based on a model's predefined configuration.
|
||||
Transformers supports tensor parallelism if a model has a `tp_plan`. There are two ways to partition a model.
|
||||
|
||||
- Set `tp_plan="auto"` to automatically use a tensor parallelism plan based on a model's predefined configuration.
|
||||
- Define and pass a manual `tp_plan`.
|
||||
|
||||
<hfoptions id="tp_plan">
|
||||
<hfoption id="auto plan">
|
||||
|
||||
```py
|
||||
import os
|
||||
@ -53,9 +59,7 @@ import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
# model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct" # better to visualize all the possible strategies
|
||||
model_id = "meta-llama/Meta-Llama-3-8B-Instruct" # better for smaller number of GPUs
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, tp_plan="auto")
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct" , dtype=torch.bfloat16, tp_plan="auto")
|
||||
print(model._tp_plan)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
|
||||
@ -72,6 +76,31 @@ Launch the inference script above on [torchrun](https://pytorch.org/docs/stable/
|
||||
torchrun --nproc-per-node 4 demo.py
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="manual plan">
|
||||
|
||||
Define a tensor parallel plan for each layer in `tp_plan` and pass it to [`~PreTrainedModel.from_pretrained`]. The example below uses column and row partitioning. See the [Partitioning strategies](#partitioning-strategies) section for other supported strategies.
|
||||
|
||||
Manual partitioning requires deep understanding of model architecture and strategy interactions. Poor partitioning choices create slow models that fail or produce incorrect results. The [Ultra-Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=tensor_parallelism) explains partitioning strategies in detail.
|
||||
|
||||
```py
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
tp_plan = {
|
||||
"model.layers.*.self_attn.q_proj": "colwise",
|
||||
"model.layers.*.self_attn.k_proj": "colwise",
|
||||
"model.layers.*.self_attn.v_proj": "colwise",
|
||||
"model.layers.*.self_attn.o_proj": "rowwise",
|
||||
...
|
||||
}
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", dtype="auto", tp_plan=tp_plan)
|
||||
print(model.tp_plan)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Partitioning strategies
|
||||
|
||||
All partitioning strategies are defined in the [`ParallelInterface`] class which maps a string to the strategy implementation. You don't need to interact with this class directly since all the strategies are set with `tp_plan` in [`~PreTrainedModel.from_pretrained`], but it is useful for checking what strategies are available.
|
||||
|
||||
@ -33,7 +33,7 @@ Export a Transformers model to ONNX with the Optimum CLI or the `optimum.onnxrun
|
||||
Run the command below to install Optimum and the [exporters](https://huggingface.co/docs/optimum/exporters/overview) module.
|
||||
|
||||
```bash
|
||||
pip install optimum[exporters]
|
||||
pip install optimum-onnx
|
||||
```
|
||||
|
||||
> [!TIP]
|
||||
|
||||
@ -169,7 +169,7 @@ def compute_metrics(eval_pred):
|
||||
return {"wer_score": wer_score}
|
||||
```
|
||||
|
||||
## Train!
|
||||
## Train
|
||||
|
||||
Now, you are ready to start fine-tuning the model. You will use the 🤗 [`Trainer`] for this.
|
||||
|
||||
|
||||
@ -126,7 +126,6 @@ def rebuild_objects(bboxes, labels):
|
||||
train_dataset = train_dataset.with_transform(train_transform)
|
||||
```
|
||||
|
||||
|
||||
Build COCO-style annotations for the image processor.
|
||||
|
||||
```py
|
||||
@ -247,4 +246,4 @@ image = Image.open(requests.get("https://huggingface.co/datasets/merve/vlm_test_
|
||||
plot_results(image, results, threshold=0.05)
|
||||
```
|
||||
|
||||

|
||||

|
||||
|
||||
@ -1,138 +0,0 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# TorchScript
|
||||
|
||||
[TorchScript](https://pytorch.org/docs/stable/jit.html) serializes PyTorch models into programs that can be executed in non-Python processes. This is especially advantageous in production environments where Python may not be the most performant choice.
|
||||
|
||||
Transformers can export a model to TorchScript by:
|
||||
|
||||
1. creating dummy inputs to create a *trace* of the model to serialize to TorchScript
|
||||
2. enabling the `torchscript` parameter in either [`~PreTrainedConfig.torchscript`] for a randomly initialized model or [`~PreTrainedModel.from_pretrained`] for a pretrained model
|
||||
|
||||
## Dummy inputs
|
||||
|
||||
The dummy inputs are used in the forward pass, and as the input values are propagated through each layer, PyTorch tracks the different operations executed on each tensor. The recorded operations are used to create the model trace. Once it is recorded, it is serialized into a TorchScript program.
|
||||
|
||||
```py
|
||||
from transformers import BertModel, BertTokenizer, BertConfig
|
||||
import torch
|
||||
|
||||
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
|
||||
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
|
||||
tokenized_text = tokenizer.tokenize(text)
|
||||
|
||||
masked_index = 8
|
||||
tokenized_text[masked_index] = "[MASK]"
|
||||
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
|
||||
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
|
||||
|
||||
# creating a dummy input
|
||||
tokens_tensor = torch.tensor([indexed_tokens])
|
||||
segments_tensors = torch.tensor([segments_ids])
|
||||
dummy_input = [tokens_tensor, segments_tensors]
|
||||
```
|
||||
|
||||
The trace is created based on the provided inputs dimensions and it can only handle inputs with the same shape as the provided input during tracing. An input with a different size raises the error message shown below.
|
||||
|
||||
```bash
|
||||
`The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2`.
|
||||
```
|
||||
|
||||
Try to create a trace with a dummy input size at least as large as the largest expected input during inference. Padding can help fill missing values for larger inputs. It may be slower though since a larger input size requires more calculations. Be mindful of the total number of operations performed on each input and track the model performance when exporting models with variable sequence lengths.
|
||||
|
||||
## Tied weights
|
||||
|
||||
Weights between the `Embedding` and `Decoding` layers are tied in Transformers and TorchScript can't export models with tied weights. Instantiating a model with `torchscript=True`, separates the `Embedding` and `Decoding` layers and they aren't trained any further because it would throw the two layers out of sync which can lead to unexpected results.
|
||||
|
||||
Models *without* a language model head don't have tied weights and can be safely exported without the `torchscript` parameter.
|
||||
|
||||
<hfoptions id="torchscript">
|
||||
<hfoption id="randomly initialized model">
|
||||
|
||||
```py
|
||||
config = BertConfig(
|
||||
vocab_size_or_config_json_file=32000,
|
||||
hidden_size=768,
|
||||
num_hidden_layers=12,
|
||||
num_attention_heads=12,
|
||||
intermediate_size=3072,
|
||||
torchscript=True,
|
||||
)
|
||||
|
||||
model = BertModel(config)
|
||||
model.eval()
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="pretrained model">
|
||||
|
||||
```py
|
||||
model = BertModel.from_pretrained("google-bert/bert-base-uncased", torchscript=True)
|
||||
model.eval()
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Export to TorchScript
|
||||
|
||||
Create the Torchscript program with [torch.jit.trace](https://pytorch.org/docs/stable/generated/torch.jit.trace.html), and save with [torch.jit.save](https://pytorch.org/docs/stable/generated/torch.jit.save.html).
|
||||
|
||||
```py
|
||||
traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
|
||||
torch.jit.save(traced_model, "traced_bert.pt")
|
||||
```
|
||||
|
||||
Use [torch.jit.load](https://pytorch.org/docs/stable/generated/torch.jit.load.html) to load the traced model.
|
||||
|
||||
```py
|
||||
loaded_model = torch.jit.load("traced_bert.pt")
|
||||
loaded_model.eval()
|
||||
|
||||
all_encoder_layers, pooled_output = loaded_model(*dummy_input)
|
||||
```
|
||||
|
||||
To use the traced model for inference, use the `__call__` dunder method.
|
||||
|
||||
```py
|
||||
traced_model(tokens_tensor, segments_tensors)
|
||||
```
|
||||
|
||||
## Deploy to AWS
|
||||
|
||||
TorchScript programs serialized from Transformers can be deployed on [Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) instances. The instance is powered by AWS Inferentia chips, a custom hardware accelerator designed for deep learning inference workloads. [AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#) supports tracing Transformers models for deployment on Inf1 instances.
|
||||
|
||||
> [!TIP]
|
||||
> AWS Neuron requires a [Neuron SDK environment](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/inference-torch-neuron.html#inference-torch-neuron) which is preconfigured on [AWS DLAMI](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launching.html).
|
||||
|
||||
Instead of [torch.jit.trace](https://pytorch.org/docs/stable/generated/torch.jit.trace.html), use [torch.neuron.trace](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuron/api-compilation-python-api.html) to trace a model and optimize it for Inf1 instances.
|
||||
|
||||
```py
|
||||
import torch.neuron
|
||||
|
||||
torch.neuron.trace(model, [tokens_tensor, segments_tensors])
|
||||
```
|
||||
|
||||
Refer to the [AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html) documentation for more information.
|
||||
|
||||
### Model architectures
|
||||
|
||||
BERT-based models - like [DistilBERT](./model_doc/distilbert) or [RoBERTa](./model_doc/roberta) - run best on Inf1 instances for non-generative tasks such as extractive question answering, and sequence or token classification.
|
||||
|
||||
Text generation can be adapted to run on an Inf1 instance as shown in the [Transformers MarianMT](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html) tutorial.
|
||||
|
||||
Refer to the [Inference Samples/Tutorials (Inf1)](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/models/inference-inf1-samples.html#model-samples-inference-inf1) guide for more information about which models can be converted out of the box to run on Inf1 instances.
|
||||
@ -64,8 +64,6 @@
|
||||
title: Entrenador
|
||||
- local: sagemaker
|
||||
title: Ejecutar el entrenamiento en Amazon SageMaker
|
||||
- local: torchscript
|
||||
title: Exportar a TorchScript
|
||||
- local: community
|
||||
title: Los recursos de la comunidad
|
||||
title: Guías para desarrolladores
|
||||
|
||||
@ -1,167 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Export to TorchScript

<Tip>
This is the very beginning of our experiments with TorchScript and we are still exploring its capabilities with variable-input-size models. It is a focus of interest for us, and we will deepen our analysis in upcoming releases, with more code examples, a more flexible implementation, and benchmarks comparing Python-based code with compiled TorchScript.

</Tip>

According to the TorchScript documentation:

> "TorchScript is a way to create serializable and optimizable models from PyTorch code."

There are two PyTorch modules, [JIT and TRACE](https://pytorch.org/docs/stable/jit.html), that allow developers to export their models to be reused in other programs, such as efficiency-oriented C++ programs.

We provide an interface that allows you to export 🤗 Transformers models to TorchScript so they can be reused in a different environment than PyTorch-based Python programs. Here we explain how to export and use our models with TorchScript.

Exporting a model requires two things:

- model instantiation with the `torchscript` flag
- a forward pass with dummy inputs

These necessities imply several things developers should be careful about, as detailed below.

## TorchScript flag and tied weights

The `torchscript` flag is necessary because most of the 🤗 Transformers language models have tied weights between their `Embedding` layer and their `Decoding` layer. TorchScript does not allow you to export models that have tied weights, so it is necessary to untie and clone the weights beforehand.

Models instantiated with the `torchscript` flag have their `Embedding` and `Decoding` layers separated, which means that they should not be trained down the line. Training would desynchronize the two layers, leading to unexpected results.

This is not the case for models that don't have a language model head, as those don't have tied weights. These models can be safely exported without the `torchscript` flag.

## Dummy inputs and standard lengths

The dummy inputs are used for a forward pass through the model. While the input values are propagated through the layers, PyTorch keeps track of the different operations executed on each tensor. These recorded operations are then used to create *the trace* of the model.
The trace is created relative to the inputs' dimensions. It is therefore constrained by the dimensions of the dummy input and will not work for any other sequence length or batch size. When trying with a different size, the following error is raised:

```
`The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2`
```

We recommend tracing the model with a dummy input size at least as large as the largest input that will be fed to the model during inference. Padding can help fill the missing values. However, since the model is traced with a larger input size, the dimensions of the matrix will also be large, resulting in more computations.

Be careful about the total number of operations done on each input and follow the performance closely when exporting variable-sequence-length models.

## Using TorchScript in Python

This section demonstrates how to save and load models as well as how to use the trace for inference.
|
||||
|
||||
### Guardando un modelo
|
||||
|
||||
Para exportar un `BertModel` con TorchScript, instancia `BertModel` a partir de la clase `BertConfig` y luego guárdalo en disco bajo el nombre de archivo `traced_bert.pt`:
|
||||
|
||||
```python
|
||||
from transformers import BertModel, BertTokenizer, BertConfig
|
||||
import torch
|
||||
|
||||
enc = BertTokenizer.from_pretrained("bert-base-uncased")
|
||||
|
||||
# Tokenizing input text
|
||||
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
|
||||
tokenized_text = enc.tokenize(text)
|
||||
|
||||
# Masking one of the input tokens
|
||||
masked_index = 8
|
||||
tokenized_text[masked_index] = "[MASK]"
|
||||
indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)
|
||||
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
|
||||
|
||||
# Creating a dummy input
|
||||
tokens_tensor = torch.tensor([indexed_tokens])
|
||||
segments_tensors = torch.tensor([segments_ids])
|
||||
dummy_input = [tokens_tensor, segments_tensors]
|
||||
|
||||
# Initializing the model with the torchscript flag
|
||||
# Flag set to True even though it is not necessary as this model does not have an LM Head.
|
||||
config = BertConfig(
|
||||
vocab_size_or_config_json_file=32000,
|
||||
hidden_size=768,
|
||||
num_hidden_layers=12,
|
||||
num_attention_heads=12,
|
||||
intermediate_size=3072,
|
||||
torchscript=True,
|
||||
)
|
||||
|
||||
# Instantiating the model
|
||||
model = BertModel(config)
|
||||
|
||||
# The model needs to be in evaluation mode
|
||||
model.eval()
|
||||
|
||||
# If you are instantiating the model with *from_pretrained* you can also easily set the TorchScript flag
|
||||
model = BertModel.from_pretrained("bert-base-uncased", torchscript=True)
|
||||
|
||||
# Creating the trace
|
||||
traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
|
||||
torch.jit.save(traced_model, "traced_bert.pt")
|
||||
```
|
||||
### Cargando un modelo
|
||||
|
||||
Ahora puedes cargar el `BertModel` guardado anteriormente, `traced_bert.pt`, desde el disco y usarlo en la entrada ficticia (`dummy_input`) previamente inicializada:
|
||||
|
||||
```python
|
||||
loaded_model = torch.jit.load("traced_bert.pt")
|
||||
loaded_model.eval()
|
||||
|
||||
all_encoder_layers, pooled_output = loaded_model(*dummy_input)
|
||||
```
|
||||
|
||||
## Usando un modelo trazado para inferencia
|
||||
|
||||
Utiliza el modelo trazado para inferencia a través de su método dunder `__call__`:
|
||||
|
||||
```python
|
||||
traced_model(tokens_tensor, segments_tensors)
|
||||
```
|
||||
## Despliega modelos TorchScript de Hugging Face en AWS con el Neuron SDK
|
||||
|
||||
AWS introdujo la familia de instancias [Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) para inferencia de aprendizaje automático de alto rendimiento y bajo costo en la nube. Las instancias Inf1 están alimentadas por el chip AWS Inferentia, un acelerador de hardware personalizado que se especializa en cargas de trabajo de inferencia de aprendizaje profundo. [AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#) es el SDK para Inferentia que admite el trazado y la optimización de modelos de transformers para implementación en Inf1. El SDK Neuron proporciona:
|
||||
|
||||
1. Una API fácil de usar con un solo cambio de línea de código para trazar y optimizar un modelo TorchScript para inferencia en la nube.
|
||||
|
||||
2. Optimizaciones de rendimiento listas para usar [para mejorar el rendimiento y el costo](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/benchmark/).
|
||||
|
||||
3. Soporte para modelos de transformers de Hugging Face construidos tanto con [PyTorch](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/bert_tutorial/tutorial_pretrained_bert.html) como con [TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/tensorflow/huggingface_bert/huggingface_bert.html).
|
||||
|
||||
### Implicaciones
|
||||
|
||||
Los modelos transformers basados en la arquitectura [BERT (Bidirectional Encoder Representations from Transformers)](https://huggingface.co/docs/transformers/main/model_doc/bert), o sus variantes como [distilBERT](https://huggingface.co/docs/transformers/main/model_doc/distilbert) y [roBERTa](https://huggingface.co/docs/transformers/main/model_doc/roberta), funcionan mejor en Inf1 para tareas no generativas como la respuesta a preguntas extractivas, la clasificación de secuencias y la clasificación de tokens. Sin embargo, las tareas de generación de texto aún pueden adaptarse para ejecutarse en Inf1 según este [tutorial de AWS Neuron MarianMT](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html). Se puede encontrar más información sobre los modelos que se pueden convertir fácilmente para usar en Inferentia en la sección de [Model Architecture Fit](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/models/models-inferentia.html#models-inferentia) de la documentación de Neuron.
|
||||
|
||||
### Dependencias
|
||||
|
||||
El uso de AWS Neuron para convertir modelos requiere un [entorno de Neuron SDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/index.html#installation-guide) que viene preconfigurado en [la AMI de AWS Deep Learning](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launching.html).
|
||||
|
||||
### Convertir un modelo para AWS Neuron
|
||||
|
||||
Convierte un modelo para AWS NEURON utilizando el mismo código de [Uso de TorchScript en Python](torchscript#using-torchscript-in-python) para trazar un `BertModel`. Importa la extensión del framework `torch.neuron` para acceder a los componentes del Neuron SDK a través de una API de Python:
|
||||
|
||||
```python
|
||||
from transformers import BertModel, BertTokenizer, BertConfig
|
||||
import torch
|
||||
import torch.neuron
|
||||
```
|
||||
Solo necesitas cambiar la línea siguiente:
|
||||
|
||||
```diff
|
||||
- torch.jit.trace(model, [tokens_tensor, segments_tensors])
|
||||
+ torch.neuron.trace(model, [tokens_tensor, segments_tensors])
|
||||
```
|
||||
|
||||
Esto permite que el Neuron SDK trace el modelo y lo optimice para las instancias Inf1.
|
||||
|
||||
Para obtener más información sobre las características, herramientas, tutoriales de ejemplo y últimas actualizaciones del AWS Neuron SDK, consulta [la documentación de AWS NeuronSDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html).
|
||||
@ -109,8 +109,6 @@
|
||||
title: チャットモデルのテンプレート
|
||||
- local: serialization
|
||||
title: ONNX へのエクスポート
|
||||
- local: torchscript
|
||||
title: トーチスクリプトへのエクスポート
|
||||
- local: community
|
||||
title: コミュニティリソース
|
||||
- local: troubleshooting
|
||||
@ -202,8 +200,6 @@
|
||||
title: モデル
|
||||
- local: main_classes/text_generation
|
||||
title: テキストの生成
|
||||
- local: main_classes/onnx
|
||||
title: ONNX
|
||||
- local: main_classes/optimizer_schedules
|
||||
title: 最適化
|
||||
- local: main_classes/output
|
||||
|
||||
@ -1,50 +0,0 @@
|
||||
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Exporting 🤗 Transformers models to ONNX
|
||||
|
||||
🤗 Transformers は `transformers.onnx` パッケージを提供します。
|
||||
設定オブジェクトを利用することで、モデルのチェックポイントをONNXグラフに変換することができます。
|
||||
|
||||
詳細は[ガイド](../serialization)を参照してください。
|
||||
|
||||
## ONNX Configurations
|
||||
|
||||
エクスポートしたいモデルアーキテクチャのタイプに応じて、継承すべき3つの抽象クラスを提供します:
|
||||
|
||||
* エンコーダーベースのモデルは [`~onnx.config.OnnxConfig`] を継承します。
|
||||
* デコーダーベースのモデルは [`~onnx.config.OnnxConfigWithPast`] を継承します。
|
||||
* エンコーダー・デコーダーモデルは [`~onnx.config.OnnxSeq2SeqConfigWithPast`] を継承しています。
|
||||
|
||||
|
||||
### OnnxConfig
|
||||
|
||||
[[autodoc]] onnx.config.OnnxConfig
|
||||
|
||||
### OnnxConfigWithPast
|
||||
|
||||
[[autodoc]] onnx.config.OnnxConfigWithPast
|
||||
|
||||
### OnnxSeq2SeqConfigWithPast
|
||||
|
||||
[[autodoc]] onnx.config.OnnxSeq2SeqConfigWithPast
|
||||
|
||||
## ONNX Features
|
||||
|
||||
各 ONNX 構成は、次のことを可能にする一連の _機能_ に関連付けられています。
|
||||
さまざまなタイプのトポロジまたはタスクのモデルをエクスポートします。
|
||||
@ -472,8 +472,6 @@ FlexFlowは、サンプル-オペレータ-属性-パラメータの4D並列化
|
||||
|
||||
したがって、このフレームワークの約束は非常に魅力的です。選択したクラスタで30分間のシミュレーションを実行し、この特定の環境を最適に利用するための最良の戦略を提供します。部分を追加/削除/置換すると、それに対して実行して再最適化プランを作成します。その後、トレーニングできます。異なるセットアップには独自の最適化があります。
|
||||
|
||||
🤗 Transformersの現在の状況: まだ統合されていません。すでに[transformers.utils.fx](https://github.com/huggingface/transformers/blob/master/src/transformers/utils/fx.py)を使用してモデルがFXトレース可能であるため、FlexFlowを動作させるために必要な手順を誰かが見つける必要があります。
|
||||
|
||||
## Which Strategy To Use When
|
||||
|
||||
ここでは、どの並列化戦略をいつ使用するかの非常におおまかなアウトラインを示します。各リストの最初が通常よりも速いことが一般的です。
|
||||
|
||||
@ -47,7 +47,7 @@ ONNX形式にエクスポートされたモデルは、以下のように使用
|
||||
🤗 TransformersモデルをONNXにエクスポートするには、まず追加の依存関係をインストールしてください:
|
||||
|
||||
```bash
|
||||
pip install optimum[exporters]
|
||||
pip install optimum-onnx
|
||||
```
|
||||
|
||||
すべての利用可能な引数を確認するには、[🤗 Optimumドキュメント](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli)を参照してください。または、コマンドラインでヘルプを表示することもできます:
|
||||
@ -128,64 +128,3 @@ CLIの代わりに、🤗 TransformersモデルをONNXにプログラム的に
|
||||
### Exporting a model for an unsupported architecture
|
||||
|
||||
現在エクスポートできないモデルをサポートするために貢献したい場合、まず[`optimum.exporters.onnx`](https://huggingface.co/docs/optimum/exporters/onnx/overview)でサポートされているかどうかを確認し、サポートされていない場合は[🤗 Optimumに貢献](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/contribute)してください。
|
||||
|
||||
### Exporting a model with `transformers.onnx`
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
`transformers.onnx`はもはやメンテナンスされていないため、モデルを上記で説明したように🤗 Optimumでエクスポートしてください。このセクションは将来のバージョンで削除されます。
|
||||
|
||||
</Tip>
|
||||
|
||||
🤗 TransformersモデルをONNXにエクスポートするには、追加の依存関係をインストールしてください:
|
||||
|
||||
|
||||
```bash
|
||||
pip install transformers[onnx]
|
||||
```
|
||||
|
||||
`transformers.onnx`パッケージをPythonモジュールとして使用して、事前に用意された設定を使用してチェックポイントをエクスポートする方法は以下の通りです:
|
||||
|
||||
```bash
|
||||
python -m transformers.onnx --model=distilbert/distilbert-base-uncased onnx/
|
||||
```
|
||||
|
||||
この方法は、`--model`引数で定義されたチェックポイントのONNXグラフをエクスポートします。🤗 Hubのいずれかのチェックポイントまたはローカルに保存されたチェックポイントを渡すことができます。エクスポートされた`model.onnx`ファイルは、ONNX標準をサポートする多くのアクセラレータで実行できます。例えば、ONNX Runtimeを使用してモデルを読み込んで実行する方法は以下の通りです:
|
||||
|
||||
|
||||
```python
|
||||
>>> from transformers import AutoTokenizer
|
||||
>>> from onnxruntime import InferenceSession
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
|
||||
>>> session = InferenceSession("onnx/model.onnx")
|
||||
>>> # ONNX Runtime expects NumPy arrays as input
|
||||
>>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")
|
||||
>>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs))
|
||||
```
|
||||
|
||||
必要な出力名(例: `["last_hidden_state"]`)は、各モデルのONNX構成を確認することで取得できます。例えば、DistilBERTの場合、次のようになります:
|
||||
|
||||
|
||||
```python
|
||||
>>> from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig
|
||||
|
||||
>>> config = DistilBertConfig()
|
||||
>>> onnx_config = DistilBertOnnxConfig(config)
|
||||
>>> print(list(onnx_config.outputs.keys()))
|
||||
["last_hidden_state"]
|
||||
```
|
||||
|
||||
ハブから純粋なTensorFlowのチェックポイントをプログラム的にエクスポートするプロセスは、以下のように同様です:
|
||||
|
||||
```bash
|
||||
python -m transformers.onnx --model=keras-io/transformers-qa onnx/
|
||||
```
|
||||
|
||||
ローカルに保存されたモデルをエクスポートする場合、モデルの重みとトークナイザのファイルを同じディレクトリに保存してください(例: `local-pt-checkpoint`)。その後、`transformers.onnx`パッケージの `--model`引数を希望するディレクトリに向けて設定して、ONNXにエクスポートします:
|
||||
|
||||
|
||||
```bash
|
||||
python -m transformers.onnx --model=local-pt-checkpoint onnx/
|
||||
```
|
||||
|
||||
|
||||
@ -1,177 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Export to TorchScript
|
||||
|
||||
<Tip>
|
||||
|
||||
これはTorchScriptを使用した実験の最初であり、可変入力サイズのモデルに対するその能力をまだ探求中です。これは私たちの関心の焦点であり、今後のリリースでは、より柔軟な実装や、PythonベースのコードとコンパイルされたTorchScriptを比較するベンチマークを含む、より多くのコード例で詳細な分析を行います。
|
||||
|
||||
</Tip>
|
||||
|
||||
[TorchScriptのドキュメント](https://pytorch.org/docs/stable/jit.html)によれば:
|
||||
|
||||
> TorchScriptは、PyTorchコードから直列化および最適化可能なモデルを作成する方法です。
|
||||
|
||||
TorchScriptを使用すると、効率志向のC++プログラムなど、他のプログラムでモデルを再利用できるようになります。PyTorchベースのPythonプログラム以外の環境で🤗 Transformersモデルをエクスポートして使用するためのインターフェースを提供しています。ここでは、TorchScriptを使用してモデルをエクスポートし、使用する方法を説明します。
|
||||
|
||||
モデルをエクスポートするには、次の2つの要件があります:
|
||||
|
||||
- `torchscript`フラグを使用したモデルのインスタンス化
|
||||
- ダミーの入力を使用したフォワードパス
|
||||
|
||||
これらの必要条件は、以下で詳細に説明されているように、開発者が注意する必要があるいくつかのことを意味します。
|
||||
|
||||
## TorchScript flag and tied weights
|
||||
|
||||
`torchscript`フラグは、ほとんどの🤗 Transformers言語モデルにおいて、`Embedding`レイヤーと`Decoding`レイヤー間で重みが連結されているため必要です。
|
||||
TorchScriptでは、重みが連結されているモデルをエクスポートすることはできませんので、事前に重みを切り離して複製する必要があります。
|
||||
|
||||
`torchscript`フラグを使用してインスタンス化されたモデルは、`Embedding`レイヤーと`Decoding`レイヤーが分離されており、そのため後でトレーニングしてはいけません。
|
||||
トレーニングは、これらの2つのレイヤーを非同期にする可能性があり、予期しない結果をもたらす可能性があります。
|
||||
|
||||
言語モデルヘッドを持たないモデルには言及しませんが、これらのモデルには連結された重みが存在しないため、`torchscript`フラグなしで安全にエクスポートできます。
|
||||
|
||||
## Dummy inputs and standard lengths
|
||||
|
||||
ダミー入力はモデルのフォワードパスに使用されます。入力の値はレイヤーを通じて伝播される間、PyTorchは各テンソルに実行された異なる操作を追跡します。これらの記録された操作は、モデルの*トレース*を作成するために使用されます。
|
||||
|
||||
トレースは入力の寸法に対して作成されます。そのため、ダミー入力の寸法に制約され、他のシーケンス長やバッチサイズでは動作しません。異なるサイズで試すと、以下のエラーが発生します:
|
||||
|
||||
```
|
||||
`The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2`
|
||||
```
|
||||
|
||||
お勧めしますのは、モデルの推論中に供給される最大の入力と同じ大きさのダミー入力サイズでモデルをトレースすることです。パディングを使用して不足値を補完することもできます。ただし、モデルがより大きな入力サイズでトレースされるため、行列の寸法も大きくなり、より多くの計算が発生します。
|
||||
|
||||
異なるシーケンス長のモデルをエクスポートする際に、各入力に対して実行される演算の総数に注意して、パフォーマンスを密接にフォローすることをお勧めします。
|
||||
|
||||
## Using TorchScript in Python
|
||||
|
||||
このセクションでは、モデルの保存と読み込み、および推論にトレースを使用する方法を示します。
|
||||
|
||||
### Saving a model
|
||||
|
||||
TorchScriptで`BertModel`をエクスポートするには、`BertConfig`クラスから`BertModel`をインスタンス化し、それをファイル名`traced_bert.pt`でディスクに保存します:
|
||||
|
||||
```python
|
||||
from transformers import BertModel, BertTokenizer, BertConfig
|
||||
import torch
|
||||
|
||||
enc = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
|
||||
|
||||
# Tokenizing input text
|
||||
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
|
||||
tokenized_text = enc.tokenize(text)
|
||||
|
||||
# Masking one of the input tokens
|
||||
masked_index = 8
|
||||
tokenized_text[masked_index] = "[MASK]"
|
||||
indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)
|
||||
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
|
||||
|
||||
# Creating a dummy input
|
||||
tokens_tensor = torch.tensor([indexed_tokens])
|
||||
segments_tensors = torch.tensor([segments_ids])
|
||||
dummy_input = [tokens_tensor, segments_tensors]
|
||||
|
||||
# Initializing the model with the torchscript flag
|
||||
# Flag set to True even though it is not necessary as this model does not have an LM Head.
|
||||
config = BertConfig(
|
||||
vocab_size_or_config_json_file=32000,
|
||||
hidden_size=768,
|
||||
num_hidden_layers=12,
|
||||
num_attention_heads=12,
|
||||
intermediate_size=3072,
|
||||
torchscript=True,
|
||||
)
|
||||
|
||||
# Instantiating the model
|
||||
model = BertModel(config)
|
||||
|
||||
# The model needs to be in evaluation mode
|
||||
model.eval()
|
||||
|
||||
# If you are instantiating the model with *from_pretrained* you can also easily set the TorchScript flag
|
||||
model = BertModel.from_pretrained("google-bert/bert-base-uncased", torchscript=True)
|
||||
|
||||
# Creating the trace
|
||||
traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
|
||||
torch.jit.save(traced_model, "traced_bert.pt")
|
||||
```
|
||||
|
||||
### Loading a model
|
||||
|
||||
以前に保存した `BertModel`、`traced_bert.pt` をディスクから読み込んで、以前に初期化した `dummy_input` で使用できます。
|
||||
|
||||
```python
|
||||
loaded_model = torch.jit.load("traced_bert.pt")
|
||||
loaded_model.eval()
|
||||
|
||||
all_encoder_layers, pooled_output = loaded_model(*dummy_input)
|
||||
```
|
||||
|
||||
|
||||
### Using a traced model for inference
|
||||
|
||||
トレースモデルを使用して推論を行うには、その `__call__` ダンダーメソッドを使用します。
|
||||
|
||||
```python
|
||||
traced_model(tokens_tensor, segments_tensors)
|
||||
```
|
||||
|
||||
|
||||
## Deploy Hugging Face TorchScript models to AWS with the Neuron SDK
|
||||
|
||||
AWSはクラウドでの低コストで高性能な機械学習推論向けに [Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) インスタンスファミリーを導入しました。Inf1インスタンスはAWS Inferentiaチップによって駆動され、ディープラーニング推論ワークロードに特化したカスタムビルドのハードウェアアクセラレータです。[AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#) はInferentia用のSDKで、トランスフォーマーモデルをトレースして最適化し、Inf1に展開するためのサポートを提供します。
|
||||
|
||||
Neuron SDK が提供するもの:
|
||||
|
||||
1. クラウドでの推論のためにTorchScriptモデルをトレースして最適化するための、1行のコード変更で使用できる簡単なAPI。
|
||||
2. [改善されたコストパフォーマンス](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/benchmark/) のためのボックス外のパフォーマンス最適化。
|
||||
3. [PyTorch](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/bert_tutorial/tutorial_pretrained_bert.html) または [TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/tensorflow/huggingface_bert/huggingface_bert.html) で構築されたHugging Faceトランスフォーマーモデルへのサポート。
|
||||
|
||||
### Implications
|
||||
|
||||
BERT(Bidirectional Encoder Representations from Transformers)アーキテクチャやその変種([distilBERT](https://huggingface.co/docs/transformers/main/model_doc/distilbert) や [roBERTa](https://huggingface.co/docs/transformers/main/model_doc/roberta) など)に基づくトランスフォーマーモデルは、非生成タスク(抽出型質問応答、シーケンス分類、トークン分類など)において、Inf1上で最適に動作します。ただし、テキスト生成タスクも [AWS Neuron MarianMT チュートリアル](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html) に従ってInf1上で実行できます。Inferentiaでボックス外で変換できるモデルに関する詳細情報は、Neuronドキュメンテーションの [Model Architecture Fit](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/models/models-inferentia.html#models-inferentia) セクションにあります。
|
||||
|
||||
### Dependencies
|
||||
|
||||
モデルをAWS Neuronに変換するには、[Neuron SDK 環境](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/index.html#installation-guide) が必要で、[AWS Deep Learning AMI](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launching.html) に事前に構成されています。
|
||||
|
||||
### Converting a model for AWS Neuron
|
||||
|
||||
モデルをAWS NEURON用に変換するには、[PythonでTorchScriptを使用する](torchscript#using-torchscript-in-python) と同じコードを使用して `BertModel` をトレースします。Python APIを介してNeuron SDKのコンポーネントにアクセスするために、`torch.neuron` フレームワーク拡張をインポートします。
|
||||
|
||||
```python
|
||||
from transformers import BertModel, BertTokenizer, BertConfig
|
||||
import torch
|
||||
import torch.neuron
|
||||
```
|
||||
|
||||
次の行を変更するだけで済みます。
|
||||
|
||||
```diff
|
||||
- torch.jit.trace(model, [tokens_tensor, segments_tensors])
|
||||
+ torch.neuron.trace(model, [tokens_tensor, segments_tensors])
|
||||
```
|
||||
|
||||
これにより、Neuron SDKはモデルをトレースし、Inf1インスタンス向けに最適化します。
|
||||
|
||||
AWS Neuron SDKの機能、ツール、サンプルチュートリアル、最新のアップデートについて詳しく知りたい場合は、[AWS NeuronSDK ドキュメンテーション](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html) をご覧ください。
|
||||
|
||||
|
||||
|
||||
@ -84,6 +84,8 @@
|
||||
title: Transformers로 채팅하기
|
||||
- local: chat_templating
|
||||
title: 챗봇 템플릿 익히기
|
||||
- local: chat_extras
|
||||
title: Tools 와 RAG
|
||||
- local: in_translation
|
||||
title: (번역중) Multimodal templates
|
||||
- local: in_translation
|
||||
@ -210,8 +212,6 @@
|
||||
title: ONNX로 내보내기
|
||||
- local: executorch
|
||||
title: ExecuTorch
|
||||
- local: torchscript
|
||||
title: TorchScript로 내보내기
|
||||
title: 배포환경에 내보내기
|
||||
- isExpanded: false
|
||||
sections:
|
||||
@ -406,8 +406,6 @@
|
||||
title: Models
|
||||
- local: main_classes/text_generation
|
||||
title: 텍스트 생성
|
||||
- local: main_classes/onnx
|
||||
title: ONNX
|
||||
- local: main_classes/optimizer_schedules
|
||||
title: 최적화
|
||||
- local: main_classes/output
|
||||
@ -459,7 +457,7 @@
|
||||
title: BertJapanese
|
||||
- local: model_doc/bertweet
|
||||
title: BERTweet
|
||||
- local: in_translation
|
||||
- local: model_doc/big_bird
|
||||
title: BigBird
|
||||
- local: in_translation
|
||||
title: BigBirdPegasus
|
||||
@ -483,7 +481,7 @@
|
||||
title: CANINE
|
||||
- local: model_doc/codegen
|
||||
title: CodeGen
|
||||
- local: in_translation
|
||||
- local: model_doc/code_llama
|
||||
title: CodeLlama
|
||||
- local: model_doc/cohere
|
||||
title: Cohere
|
||||
@ -601,7 +599,7 @@
|
||||
title: Jukebox
|
||||
- local: in_translation
|
||||
title: LED
|
||||
- local: in_translation
|
||||
- local: model_doc/lfm2
|
||||
title: LFM2
|
||||
- local: in_translation
|
||||
title: LFM2-VL
|
||||
@ -881,6 +879,8 @@
|
||||
title: SegFormer
|
||||
- local: in_translation
|
||||
title: SegGpt
|
||||
- local: model_doc/sam_hq
|
||||
title: Segment Anything High Quality (SAM-HQ)
|
||||
- local: in_translation
|
||||
title: SuperGlue
|
||||
- local: in_translation
|
||||
@ -1055,7 +1055,7 @@
|
||||
title: FLAVA
|
||||
- local: model_doc/gemma3
|
||||
title: Gemma3
|
||||
- local: in_translation
|
||||
- local: model_doc/gemma3n
|
||||
title: Gemma3n
|
||||
- local: in_translation
|
||||
title: GIT
|
||||
@ -1097,7 +1097,7 @@
|
||||
title: LayoutXLM
|
||||
- local: in_translation
|
||||
title: LiLT
|
||||
- local: in_translation
|
||||
- local: model_doc/llama4
|
||||
title: Llama4
|
||||
- local: in_translation
|
||||
title: Llava
|
||||
|
||||
299
docs/source/ko/chat_extras.md
Normal file
@ -0,0 +1,299 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# 도구와 RAG[[Tools-and-RAG]]
|
||||
|
||||
[`~PreTrainedTokenizerBase.apply_chat_template`] 메소드는 채팅 메시지 외에도 문자열, 리스트, 딕셔너리 등 거의 모든 종류의 추가 인수 타입을 지원합니다. 이를 통해 다양한 사용 상황에서 채팅 템플릿을 활용할 수 있습니다.
|
||||
|
||||
이 가이드에서는 도구 및 검색 증강 생성(RAG)과 함께 채팅 템플릿을 사용하는 방법을 보여드립니다.
|
||||
|
||||
## 도구[[Tools]]
|
||||
|
||||
도구는 대규모 언어 모델(LLM)이 특정 작업을 수행하기 위해 호출할 수 있는 함수입니다. 이는 실시간 정보, 계산 도구 또는 대규모 데이터베이스 접근 등을 통해 대화형 에이전트의 기능을 확장하는 강력한 방법입니다.
|
||||
|
||||
도구를 만들 때는 아래 규칙을 따르세요.
|
||||
|
||||
1. 함수는 기능을 잘 설명하는 이름을 가져야 합니다.
|
||||
2. 함수의 인수는 함수 헤더에 타입 힌트를 포함해야 합니다(`Args` 블록에는 포함하지 마세요).
|
||||
3. 함수에는 [Google 스타일](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings) 의 독스트링(docstring)이 포함되어야 합니다.
|
||||
4. 함수에 반환 타입과 `Returns` 블록을 포함할 수 있지만, 도구를 활용하는 대부분의 모델에서 이를 사용하지 않기 때문에 무시할 수 있습니다.
|
||||
|
||||
주어진 위치의 현재 온도와 풍속을 가져오는 도구의 예시는 아래와 같습니다.
|
||||
|
||||
```py
|
||||
def get_current_temperature(location: str, unit: str) -> float:
|
||||
"""
|
||||
주어진 위치의 현재 온도를 가져옵니다.
|
||||
|
||||
Args:
|
||||
location: 온도를 가져올 위치, "도시, 국가" 형식
|
||||
unit: 온도를 반환할 단위. (선택지: ["celsius(섭씨)", "fahrenheit(화씨)"])
|
||||
Returns:
|
||||
주어진 위치의 지정된 단위로 표시된 현재 온도(float 자료형).
|
||||
"""
|
||||
return 22. # 실제 함수라면 아마 진짜로 기온을 가져와야겠죠!
|
||||
|
||||
def get_current_wind_speed(location: str) -> float:
|
||||
"""
|
||||
주어진 위치의 현재 풍속을 km/h 단위로 가져옵니다.
|
||||
|
||||
Args:
|
||||
location: 온도를 가져올 위치, "도시, 국가" 형식
|
||||
Returns:
|
||||
주어진 위치의 현재 풍속(km/h, float 자료형).
|
||||
"""
|
||||
return 6. # 실제 함수라면 아마 진짜로 풍속을 가져와야겠죠!
|
||||
|
||||
tools = [get_current_temperature, get_current_wind_speed]
|
||||
```
|
||||
|
||||
[NousResearch/Hermes-2-Pro-Llama-3-8B](https://hf.co/NousResearch/Hermes-2-Pro-Llama-3-8B)와 같이 도구 사용을 지원하는 모델과 토크나이저를 가져오세요. 하드웨어가 지원된다면 [Command-R](./model_doc/cohere)이나 [Mixtral-8x22B](./model_doc/mixtral)와 같은 더 큰 모델도 고려할 수 있습니다.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Hermes-2-Pro-Llama-3-8B")
model = AutoModelForCausalLM.from_pretrained("NousResearch/Hermes-2-Pro-Llama-3-8B", torch_dtype=torch.bfloat16, device_map="auto")
|
||||
```
|
||||
|
||||
채팅 메시지를 생성합니다.
|
||||
|
||||
```py
|
||||
messages = [
|
||||
{"role": "system", "content": "You are a bot that responds to weather queries. You should reply with the unit used in the queried location."},
|
||||
{"role": "user", "content": "Hey, what's the temperature in Paris right now?"}
|
||||
]
|
||||
```
|
||||
|
||||
`messages`와 도구 목록 `tools`를 [`~PreTrainedTokenizerBase.apply_chat_template`]에 전달한 뒤, 이를 모델의 입력으로 사용하여 텍스트를 생성할 수 있습니다.
|
||||
|
||||
```py
|
||||
inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
|
||||
inputs = {k: v for k, v in inputs.items()}
|
||||
outputs = model.generate(**inputs, max_new_tokens=128)
|
||||
print(tokenizer.decode(outputs[0][len(inputs["input_ids"][0]):]))
|
||||
```
|
||||
|
||||
```txt
|
||||
<tool_call>
|
||||
{"arguments": {"location": "Paris, France", "unit": "celsius"}, "name": "get_current_temperature"}
|
||||
</tool_call><|im_end|>
|
||||
```
|
||||
|
||||
채팅 모델은 독스트링(docstring)에 정의된 형식에 따라 `get_current_temperature` 함수에 올바른 매개변수를 전달해 호출했습니다. 파리를 기준으로 위치를 프랑스로 추론했으며, 온도 단위는 섭씨를 사용해야 한다고 판단했습니다.
|
||||
|
||||
이제 `get_current_temperature` 함수와 해당 인수들을 `tool_call` 딕셔너리에 담아 채팅 메시지에 추가합니다. `tool_call` 딕셔너리는 `system`이나 `user`가 아닌 `assistant` 역할로 제공되어야 합니다.
|
||||
|
||||
> [!WARNING]
|
||||
> OpenAI API는 `tool_call` 형식으로 JSON 문자열을 사용합니다. Transformers에서 사용할 경우 딕셔너리를 요구하기 때문에, 오류가 발생하거나 모델이 이상하게 동작할 수 있습니다.
|
||||
|
||||
<hfoptions id="tool-call">
|
||||
<hfoption id="Llama">
|
||||
|
||||
```py
|
||||
tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}}
|
||||
messages.append({"role": "assistant", "tool_calls": [{"type": "function", "function": tool_call}]})
|
||||
```
|
||||
|
||||
어시스턴트가 함수 출력을 읽고 사용자와 채팅할 수 있도록 합니다.
|
||||
|
||||
```py
|
||||
inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
|
||||
inputs = {k: v for k, v in inputs.items()}
|
||||
out = model.generate(**inputs, max_new_tokens=128)
|
||||
print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):]))
|
||||
```
|
||||
|
||||
```txt
|
||||
The temperature in Paris, France right now is approximately 12°C (53.6°F).<|im_end|>
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Mistral/Mixtral">
|
||||
|
||||
[Mistral](./model_doc/mistral) 및 [Mixtral](./model_doc/mixtral) 모델의 경우 추가적으로 `tool_call_id`가 필요합니다. `tool_call_id`는 9자리 영숫자 문자열로 생성되어 `tool_call` 딕셔너리의 `id` 키에 할당됩니다.
|
||||
|
||||
```py
|
||||
tool_call_id = "9Ae3bDc2F"
|
||||
tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}}
|
||||
messages.append({"role": "assistant", "tool_calls": [{"type": "function", "id": tool_call_id, "function": tool_call}]})
|
||||
```
|
||||
|
||||
```py
|
||||
inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
|
||||
inputs = {k: v for k, v in inputs.items()}
|
||||
out = model.generate(**inputs, max_new_tokens=128)
|
||||
print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):]))
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## 스키마[[Schema]]
|
||||
|
||||
[`~PreTrainedTokenizerBase.apply_chat_template`]은 함수를 [JSON 스키마](https://json-schema.org/learn/getting-started-step-by-step)로 변환하여 채팅 템플릿에 전달합니다. LLM은 함수 내부의 코드를 보지 못합니다. 다시 말해, LLM은 함수가 기술적으로 어떻게 작동하는지는 신경 쓰지 않고, 함수의 **정의**와 **인수**만 참조합니다.
|
||||
|
||||
함수가 앞서 나열된 규칙을 따르면, 내부에서 JSON 스키마가 자동으로 생성됩니다. 하지만 더 나은 가독성이나 디버깅을 위해 [get_json_schema](https://github.com/huggingface/transformers/blob/14561209291255e51c55260306c7d00c159381a5/src/transformers/utils/chat_template_utils.py#L205)를 사용하여 스키마를 수동으로 변환할 수 있습니다.
|
||||
|
||||
```py
|
||||
from transformers.utils import get_json_schema
|
||||
|
||||
def multiply(a: float, b: float):
|
||||
"""
|
||||
두 숫자를 곱하는 함수
|
||||
|
||||
Args:
|
||||
a: 곱할 첫 번째 숫자
|
||||
b: 곱할 두 번째 숫자
|
||||
"""
|
||||
return a * b
|
||||
|
||||
schema = get_json_schema(multiply)
|
||||
print(schema)
|
||||
```
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "multiply",
|
||||
"description": "A function that multiplies two numbers",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"a": {
|
||||
"type": "number",
|
||||
"description": "The first number to multiply"
|
||||
},
|
||||
"b": {
|
||||
"type": "number",
|
||||
"description": "The second number to multiply"
|
||||
}
|
||||
},
|
||||
"required": ["a", "b"]
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
스키마를 편집하거나 처음부터 직접 작성할 수 있습니다. 이를 통해 더 복잡한 함수에 대한 정확한 스키마를 유연하게 정의할 수 있습니다.
|
||||
|
||||
> [!WARNING]
|
||||
> 함수 시그니처를 단순하게 유지하고 인수를 최소한으로 유지하세요. 이러한 함수는 중첩된 인수를 가진 복잡한 함수에 비해 모델이 더 쉽게 이해하고 사용할 수 있습니다.
|
||||
|
||||
아래 예시는 스키마를 수동으로 작성한 다음 [`~PreTrainedTokenizerBase.apply_chat_template`]에 전달하는 방법을 보여줍니다.
|
||||
|
||||
```py
|
||||
# 인수를 받지 않는 간단한 함수
|
||||
current_time = {
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "current_time",
|
||||
"description": "Get the current local time as a string.",
|
||||
"parameters": {
|
||||
'type': 'object',
|
||||
'properties': {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# 두 개의 숫자 인수를 받는 더 완전한 함수
|
||||
multiply = {
|
||||
'type': 'function',
|
||||
'function': {
|
||||
'name': 'multiply',
|
||||
'description': 'A function that multiplies two numbers',
|
||||
'parameters': {
|
||||
'type': 'object',
|
||||
'properties': {
|
||||
'a': {
|
||||
'type': 'number',
|
||||
'description': 'The first number to multiply'
|
||||
},
|
||||
'b': {
|
||||
'type': 'number', 'description': 'The second number to multiply'
|
||||
}
|
||||
},
|
||||
'required': ['a', 'b']
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
model_input = tokenizer.apply_chat_template(
|
||||
messages,
|
||||
tools = [current_time, multiply]
|
||||
)
|
||||
```
|
||||
|
||||
## RAG[[RAG]]
|
||||
|
||||
검색 증강 생성(Retrieval-augmented generation, RAG) 모델은 쿼리를 반환하기 전에 문서를 검색해 추가 정보를 얻어 모델이 기존에 가지고 있던 지식을 확장시킵니다. RAG 모델의 경우, [`~PreTrainedTokenizerBase.apply_chat_template`]에 `documents` 매개변수를 추가하세요. 이 `documents` 매개변수는 문서 목록이어야 하며, 각 문서는 `title`과 `content` 키를 가진 단일 딕셔너리여야 합니다.
|
||||
|
||||
> [!TIP]
|
||||
> RAG를 위한 `documents` 매개변수는 폭넓게 지원되지 않으며 많은 모델들이 `documents`를 무시하는 채팅 템플릿을 가지고 있습니다. 모델이 `documents`를 지원하는지 확인하려면 모델 카드를 읽거나 `print(tokenizer.chat_template)`를 실행하여 `documents` 키가 있는지 확인하세요. [Command-R](https://hf.co/CohereForAI/c4ai-command-r-08-2024)과 [Command-R+](https://hf.co/CohereForAI/c4ai-command-r-plus-08-2024)는 모두 RAG 채팅 템플릿에서 `documents`를 지원합니다.
|
||||
|
||||
모델에 전달할 문서 목록을 생성하세요.
|
||||
|
||||
```py
|
||||
documents = [
|
||||
{
|
||||
"title": "The Moon: Our Age-Old Foe",
|
||||
"text": "Man has always dreamed of destroying the moon. In this essay, I shall..."
|
||||
},
|
||||
{
|
||||
"title": "The Sun: Our Age-Old Friend",
|
||||
"text": "Although often underappreciated, the sun provides several notable benefits..."
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
[`~PreTrainedTokenizerBase.apply_chat_template`]에서 `chat_template="rag"`를 설정하고 응답을 생성하세요.
|
||||
|
||||
```py
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
|
||||
# 모델과 토크나이저 로드
|
||||
tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01-4bit")
|
||||
model = AutoModelForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01-4bit", device_map="auto")
|
||||
device = model.device # 모델을 가져온 장치 확인
|
||||
|
||||
# 대화 입력 정의
|
||||
conversation = [
|
||||
{"role": "user", "content": "What has Man always dreamed of?"}
|
||||
]
|
||||
|
||||
input_ids = tokenizer.apply_chat_template(
|
||||
conversation=conversation,
|
||||
documents=documents,
|
||||
chat_template="rag",
|
||||
tokenize=True,
|
||||
add_generation_prompt=True,
|
||||
return_tensors="pt").to(device)
|
||||
|
||||
# 응답 생성
|
||||
generated_tokens = model.generate(
|
||||
input_ids,
|
||||
max_new_tokens=100,
|
||||
do_sample=True,
|
||||
temperature=0.3,
|
||||
)
|
||||
|
||||
# 생성된 텍스트를 디코딩하고 생성 프롬프트와 함께 출력
|
||||
generated_text = tokenizer.decode(generated_tokens[0])
|
||||
print(generated_text)
|
||||
```
|
||||
@ -1,45 +0,0 @@
|
||||
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# 🤗 Transformers 모델을 ONNX로 내보내기[[exporting--transformers-models-to-onnx]]
|
||||
|
||||
🤗 트랜스포머는 `transformers.onnx` 패키지를 제공하며, 이 패키지는 설정 객체를 활용하여 모델 체크포인트를 ONNX 그래프로 변환할 수 있게 합니다.
|
||||
|
||||
🤗 Transformers에 대한 자세한 내용은 [이 가이드](../serialization)를 참조하세요.
|
||||
|
||||
## ONNX 설정[[onnx-configurations]]
|
||||
|
||||
내보내려는(export) 모델 아키텍처의 유형에 따라 상속받아야 할 세 가지 추상 클래스를 제공합니다:
|
||||
|
||||
* 인코더 기반 모델은 [`~onnx.config.OnnxConfig`]을 상속받습니다.
|
||||
* 디코더 기반 모델은 [`~onnx.config.OnnxConfigWithPast`]을 상속받습니다.
|
||||
* 인코더-디코더 기반 모델은 [`~onnx.config.OnnxSeq2SeqConfigWithPast`]을 상속받습니다.
|
||||
|
||||
### OnnxConfig[[transformers.onnx.OnnxConfig]]
|
||||
|
||||
[[autodoc]] onnx.config.OnnxConfig
|
||||
|
||||
### OnnxConfigWithPast[[transformers.onnx.OnnxConfigWithPast]]
|
||||
|
||||
[[autodoc]] onnx.config.OnnxConfigWithPast
|
||||
|
||||
### OnnxSeq2SeqConfigWithPast[[OnnxSeq2SeqConfigWithPast]]
|
||||
|
||||
[[autodoc]] onnx.config.OnnxSeq2SeqConfigWithPast
|
||||
|
||||
## ONNX 특징[[onnx-features]]
|
||||
|
||||
각 ONNX 설정은 다양한 유형의 토폴로지나 작업에 대해 모델을 내보낼 수 있게(exporting) 해주는 _features_ 세트와 연관되어 있습니다.
|
||||
158
docs/source/ko/model_doc/big_bird.md
Normal file
@ -0,0 +1,158 @@
|
||||
<!--Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
*이 모델은 2020-07-28에 출시되었으며 2021-03-30에 Hugging Face Transformers에 추가되었습니다.*
|
||||
|
||||
<div style="float: right;">
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white" >
|
||||
</div>
|
||||
</div>
|
||||
|
||||
# BigBird[[bigbird]]
|
||||
|
||||
[BigBird](https://huggingface.co/papers/2007.14062)는 [BERT](./bert)의 512토큰과 달리 최대 4096토큰까지의 시퀀스 길이를 처리하도록 설계된 트랜스포머 모델입니다. 기존 트랜스포머들은 시퀀스 길이가 늘어날수록 어텐션 계산 비용이 급격히 증가하여 긴 입력 처리에 어려움을 겪습니다. BigBird는 희소 어텐션 메커니즘으로 이 문제를 해결하는데, 모든 토큰을 동시에 살펴보는 대신 로컬 어텐션, 랜덤 어텐션, 그리고 몇 개의 전역 토큰을 조합하여 전체 입력을 효율적으로 처리합니다. 이런 방식을 통해 계산 효율성을 유지하면서도 시퀀스 전체를 충분히 이해할 수 있게 됩니다. 따라서 BigBird는 질의응답, 요약, 유전체학 응용처럼 긴 문서를 다루는 작업에 특히 우수한 성능을 보입니다.
|
||||
|
||||
모든 원본 BigBird 체크포인트는 [Google](https://huggingface.co/google?search_models=bigbird) 조직에서 찾아볼 수 있습니다.
|
||||
|
||||
> [!TIP]
|
||||
> 오른쪽 사이드바의 BigBird 모델들을 클릭하여 다양한 언어 작업에 BigBird를 적용하는 더 많은 예시를 확인해보세요.
|
||||
|
||||
아래 예시는 [`Pipeline`], [`AutoModel`], 그리고 명령줄에서 `[MASK]` 토큰을 예측하는 방법을 보여줍니다.
|
||||
|
||||
<hfoptions id="usage">
|
||||
<hfoption id="Pipeline">
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import pipeline
|
||||
|
||||
pipeline = pipeline(
|
||||
task="fill-mask",
|
||||
model="google/bigbird-roberta-base",
|
||||
dtype=torch.float16,
|
||||
device=0
|
||||
)
|
||||
pipeline("Plants create [MASK] through a process known as photosynthesis.")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="AutoModel">
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
"google/bigbird-roberta-base",
|
||||
)
|
||||
model = AutoModelForMaskedLM.from_pretrained(
|
||||
"google/bigbird-roberta-base",
|
||||
dtype=torch.float16,
|
||||
device_map="auto",
|
||||
)
|
||||
inputs = tokenizer("Plants create [MASK] through a process known as photosynthesis.", return_tensors="pt").to(model.device)
|
||||
|
||||
with torch.no_grad():
|
||||
outputs = model(**inputs)
|
||||
predictions = outputs.logits
|
||||
|
||||
masked_index = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]
|
||||
predicted_token_id = predictions[0, masked_index].argmax(dim=-1)
|
||||
predicted_token = tokenizer.decode(predicted_token_id)
|
||||
|
||||
print(f"The predicted token is: {predicted_token}")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="transformers CLI">
|
||||
|
||||
```bash
|
||||
echo -e "Plants create [MASK] through a process known as photosynthesis." | transformers-cli run --task fill-mask --model google/bigbird-roberta-base --device 0
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## 참고사항[[notes]]
|
||||
|
||||
- BigBird는 절대 위치 임베딩을 사용하므로 입력을 오른쪽에 패딩해야 합니다.
|
||||
- BigBird는 `original_full`과 `block_sparse` 어텐션을 지원합니다. 입력 시퀀스 길이가 1024 미만인 경우에는 희소 패턴의 이점이 크지 않으므로 `original_full` 사용을 권장합니다.
|
||||
- 현재 구현은 3블록 윈도우 크기와 2개의 전역 블록을 사용하며, ITC 구현만 지원하고 `num_random_blocks=0`은 지원하지 않습니다.
|
||||
- 시퀀스 길이는 블록 크기로 나누어떨어져야 합니다.
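
아래는 `attention_type`을 명시적으로 지정하고 입력을 오른쪽으로 패딩하는 최소 예시입니다. 체크포인트 이름, 입력 문장, 길이 값은 설명을 위한 가정입니다.

```py
from transformers import AutoModelForMaskedLM, AutoTokenizer

# 가정: 입력이 1024 토큰 미만이므로 original_full 어텐션을 사용합니다.
tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base")
model = AutoModelForMaskedLM.from_pretrained(
    "google/bigbird-roberta-base",
    attention_type="original_full",
)

# BigBird는 절대 위치 임베딩을 사용하므로 기본값인 오른쪽 패딩을 그대로 사용합니다.
inputs = tokenizer(
    "Plants create [MASK] through a process known as photosynthesis.",
    padding="max_length",
    max_length=64,
    return_tensors="pt",
)
outputs = model(**inputs)
```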
|
||||
|
||||
## 리소스[[resources]]
|
||||
|
||||
- BigBird 어텐션 메커니즘의 자세한 작동 원리는 [BigBird](https://huggingface.co/blog/big-bird) 블로그 포스트를 참고하세요.
|
||||
|
||||
## BigBirdConfig[[bigbirdconfig]]
|
||||
|
||||
[[autodoc]] BigBirdConfig
|
||||
|
||||
## BigBirdTokenizer[[bigbirdtokenizer]]
|
||||
|
||||
[[autodoc]] BigBirdTokenizer
|
||||
- build_inputs_with_special_tokens
|
||||
- get_special_tokens_mask
|
||||
- create_token_type_ids_from_sequences
|
||||
- save_vocabulary
|
||||
|
||||
## BigBirdTokenizerFast[[bigbirdtokenizerfast]]
|
||||
|
||||
[[autodoc]] BigBirdTokenizerFast
|
||||
|
||||
## BigBird 특정 출력[[bigbird-specific-outputs]]
|
||||
|
||||
[[autodoc]] models.big_bird.modeling_big_bird.BigBirdForPreTrainingOutput
|
||||
|
||||
## BigBirdModel[[bigbirdmodel]]
|
||||
|
||||
[[autodoc]] BigBirdModel
|
||||
- forward
|
||||
|
||||
## BigBirdForPreTraining[[bigbirdforpretraining]]
|
||||
|
||||
[[autodoc]] BigBirdForPreTraining
|
||||
- forward
|
||||
|
||||
## BigBirdForCausalLM[[bigbirdforcausallm]]
|
||||
|
||||
[[autodoc]] BigBirdForCausalLM
|
||||
- forward
|
||||
|
||||
## BigBirdForMaskedLM[[bigbirdformaskedlm]]
|
||||
|
||||
[[autodoc]] BigBirdForMaskedLM
|
||||
- forward
|
||||
|
||||
## BigBirdForSequenceClassification[[bigbirdforsequenceclassification]]
|
||||
|
||||
[[autodoc]] BigBirdForSequenceClassification
|
||||
- forward
|
||||
|
||||
## BigBirdForMultipleChoice[[bigbirdformultiplechoice]]
|
||||
|
||||
[[autodoc]] BigBirdForMultipleChoice
|
||||
- forward
|
||||
|
||||
## BigBirdForTokenClassification[[bigbirdfortokenclassification]]
|
||||
|
||||
[[autodoc]] BigBirdForTokenClassification
|
||||
- forward
|
||||
|
||||
## BigBirdForQuestionAnswering[[bigbirdforquestionanswering]]
|
||||
|
||||
[[autodoc]] BigBirdForQuestionAnswering
|
||||
- forward
|
||||
180
docs/source/ko/model_doc/code_llama.md
Normal file
@ -0,0 +1,180 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
*이 모델은 2023년 8월 24일에 공개되었으며, 2023년 8월 25일에 Hugging Face Transformers에 추가되었습니다.*
|
||||
<div style="float: right;">
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
# CodeLlama[[codellama]]
|
||||
|
||||
[Code Llama](https://huggingface.co/papers/2308.12950)는 코딩 작업에 특화된 대규모 언어 모델 계열로, [Llama 2](./llama2)를 기반으로 개발되었습니다. 일반적인 코드, Python 특화, 명령어(지시) 기반 변형 등 다양한 버전으로 제공되며, 모두 7B, 13B, 34B, 70B 매개변수 크기로 사용할 수 있습니다. Code Llama 모델은 코드를 생성하고 설명하며, 코드의 누락된 부분을 채울 수도 있습니다. 이를 인필링(infilling)이라고 합니다. 16K 토큰 길이로 훈련되었지만, 최대 100K 토큰까지 안정적으로 생성하며 긴 컨텍스트도 처리할 수 있습니다.
|
||||
|
||||
[Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933) 컬렉션에서 모든 원본 Code Llama 체크포인트를 찾을 수 있습니다.
|
||||
|
||||
> [!TIP]
|
||||
> 다양한 코딩 작업에 Code Llama를 적용하는 더 많은 예시를 보려면 오른쪽 사이드바의 Code Llama 모델을 클릭하세요.
|
||||
|
||||
아래 예시는 [`Pipeline`], [`AutoModel`], 그리고 명령줄에서 코드를 생성하는 방법을 보여줍니다.
|
||||
|
||||
<hfoptions id="usage">
|
||||
<hfoption id="Pipeline">
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import pipeline
|
||||
|
||||
pipe = pipeline(
|
||||
"text-generation",
|
||||
model="meta-llama/CodeLlama-7b-hf",
|
||||
torch_dtype=torch.float16,
|
||||
device_map=0
|
||||
)
|
||||
|
||||
# 기본 코드 생성
|
||||
result = pipe("# Function to calculate the factorial of a number\ndef factorial(n):", max_new_tokens=256)
|
||||
print(result[0]['generated_text'])
|
||||
|
||||
# 인필링
|
||||
infill_result = pipe("def remove_non_ascii(s: str) -> str:\n \"\"\" <FILL_ME>\n return result", max_new_tokens=200)
|
||||
print(infill_result[0]['generated_text'])
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="AutoModel">
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/CodeLlama-7b-hf")
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/CodeLlama-7b-hf",
|
||||
torch_dtype=torch.float16,
|
||||
device_map="auto",
|
||||
attn_implementation="sdpa"
|
||||
)
|
||||
|
||||
# 기본 코드 생성
|
||||
prompt = "# Function to calculate the factorial of a number\ndef factorial(n):"
|
||||
input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
|
||||
|
||||
output = model.generate(
|
||||
**input_ids,
|
||||
max_new_tokens=256,
|
||||
cache_implementation="static"
|
||||
)
|
||||
print(tokenizer.decode(output[0], skip_special_tokens=True))
|
||||
|
||||
# 인필링
|
||||
infill_prompt = "def remove_non_ascii(s: str) -> str:\n \"\"\" <FILL_ME>\n return result"
|
||||
input_ids = tokenizer(infill_prompt, return_tensors="pt").to(model.device)
|
||||
|
||||
filled_output = model.generate(**input_ids, max_new_tokens=200)
|
||||
filled_text = tokenizer.decode(filled_output[0], skip_special_tokens=True)
|
||||
print(filled_text)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="transformers CLI">
|
||||
|
||||
```bash
|
||||
echo -e "# Function to calculate the factorial of a number\ndef factorial(n):" | transformers run --task text-generation --model meta-llama/CodeLlama-7b-hf --device 0
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
양자화는 가중치를 더 낮은 정밀도로 표현하여 대규모 모델의 메모리 부담을 줄입니다. 더 많은 사용 가능한 양자화 백엔드는 [양자화](../quantization/overview) 개요를 참조하세요.
|
||||
|
||||
아래 예시는 [bitsandbytes](../quantization/bitsandbytes)를 사용하여 가중치를 4비트로만 양자화합니다.
|
||||
|
||||
```py
|
||||
# bitsandbytes를 설치합니다.
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, CodeLlamaTokenizer, BitsAndBytesConfig
|
||||
|
||||
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type="nf4", bnb_4bit_use_double_quant=True)
|
||||
tokenizer = CodeLlamaTokenizer.from_pretrained("meta-llama/CodeLlama-34b-hf")
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/CodeLlama-34b-hf",
|
||||
torch_dtype=torch.bfloat16,
|
||||
device_map="auto",
|
||||
quantization_config=bnb_config
|
||||
)
|
||||
|
||||
prompt = "# Write a Python function to check if a string is a palindrome\ndef is_palindrome(s):"
|
||||
input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
|
||||
|
||||
output = model.generate(**input_ids, max_new_tokens=200, cache_implementation="static")
|
||||
print(tokenizer.decode(output[0], skip_special_tokens=True))
|
||||
```
|
||||
|
||||
[AttentionMaskVisualizer](https://github.com/huggingface/transformers/blob/beb9b5b02246b9b7ee81ddf938f93f44cfeaad19/src/transformers/utils/attention_visualizer.py#L139)를 사용하면 모델이 어떤 토큰에 주의를 기울일 수 있고 기울일 수 없는지를 더 잘 이해할 수 있습니다.
|
||||
|
||||
```py
|
||||
from transformers.utils.attention_visualizer import AttentionMaskVisualizer
|
||||
|
||||
visualizer = AttentionMaskVisualizer("meta-llama/CodeLlama-7b-hf")
|
||||
visualizer("""def func(a, b):
|
||||
return a + b""")
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/codellama-attn-mask.png"/>
|
||||
</div>
|
||||
|
||||
## 참고사항[[notes]]
|
||||
|
||||
- 인필링 기능은 7B 및 13B 기반 모델에서만 사용할 수 있으며, Python, Instruct, 34B 또는 70B 모델에서는 사용할 수 없습니다.
|
||||
- 코드를 채워 넣고 싶은 부분에 `<FILL_ME>` 토큰을 사용하세요. 토크나이저는 이 토큰을 분할하여 [원본 훈련 패턴](https://github.com/facebookresearch/codellama/blob/cb51c14ec761370ba2e2bc351374a79265d0465e/llama/generation.py#L402) 을 따르는 입력 문자열로 변환합니다. 이는 직접 패턴을 준비하는 것보다 더 안정적입니다.
|
||||
```py
|
||||
from transformers import LlamaForCausalLM, CodeLlamaTokenizer
|
||||
|
||||
tokenizer = CodeLlamaTokenizer.from_pretrained("meta-llama/CodeLlama-7b-hf")
|
||||
model = LlamaForCausalLM.from_pretrained("meta-llama/CodeLlama-7b-hf")
|
||||
PROMPT = '''def remove_non_ascii(s: str) -> str:
|
||||
""" <FILL_ME>
|
||||
return result
|
||||
'''
|
||||
input_ids = tokenizer(PROMPT, return_tensors="pt")["input_ids"]
|
||||
generated_ids = model.generate(input_ids, max_new_tokens=128)
|
||||
|
||||
filling = tokenizer.batch_decode(generated_ids[:, input_ids.shape[1]:], skip_special_tokens = True)[0]
|
||||
print(PROMPT.replace("<FILL_ME>", filling))
|
||||
```
|
||||
- 추가 훈련이나 미세 조정에는 `bfloat16`을 사용하고 추론에는 `float16`을 사용하세요.
|
||||
- `BOS` 문자는 접두사나 접미사를 인코딩할 때 인필링 작업에 사용되지 않으며, 각 프롬프트의 맨 앞에서만 사용됩니다.
|
||||
- 토크나이저는 [SentencePiece](https://github.com/google/sentencepiece)를 기반으로 하는 byte-pair 인코딩 모델입니다. 디코딩 과정에서 첫 번째 토큰이 단어의 시작인 경우(예를 들어 "Banana"), 토크나이저는 문자열에 접두사 공백을 추가하지 않습니다.
|
||||
|
||||
## CodeLlamaTokenizer
|
||||
|
||||
[[autodoc]] CodeLlamaTokenizer
|
||||
- build_inputs_with_special_tokens
|
||||
- get_special_tokens_mask
|
||||
- create_token_type_ids_from_sequences
|
||||
- save_vocabulary
|
||||
|
||||
## CodeLlamaTokenizerFast
|
||||
|
||||
[[autodoc]] CodeLlamaTokenizerFast
|
||||
- build_inputs_with_special_tokens
|
||||
- get_special_tokens_mask
|
||||
- create_token_type_ids_from_sequences
|
||||
- update_post_processor
|
||||
- save_vocabulary
|
||||
189
docs/source/ko/model_doc/gemma3n.md
Normal file
@ -0,0 +1,189 @@
|
||||
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
*이 모델은 2025년 5월 20일에 출시되었으며, 2025년 6월 26일에 Hugging Face Transformers에 추가되었습니다.*
|
||||
|
||||
<div style="float: right;">
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
# Gemma3n[[gemma3n]]
|
||||
|
||||
## 개요[[overview]]
|
||||
|
||||
[Gemma3n](https://developers.googleblog.com/en/introducing-gemma-3n/)은 사전 훈련된 버전과 명령어 기반 미세조정 버전이 제공되는 멀티모달 모델이며, 모델 크기는 E4B와 E2B 두 가지로 출시되었습니다. 언어 모델 아키텍처는 이전 Gemma 버전과 많은 부분을 공유하지만 이번 버전에는 여러 가지 새로운 기법이 추가되었습니다. 대표적으로 [교차 업데이트(AltUp)](https://proceedings.neurips.cc/paper_files/paper/2023/hash/f2059277ac6ce66e7e5543001afa8bb5-Abstract-Conference.html), [학습된 증강 잔여 레이어(LAuReL)](https://huggingface.co/papers/2411.07501), [MatFormer](https://huggingface.co/papers/2310.07707), 레이어별 임베딩, [통계적 Top-k를 이용한 활성화 희소성(SPARk-Transformer)](https://huggingface.co/papers/2506.06644), KV 캐시 공유 등이 있습니다. Gemma 3n은 [Gemma 3](./gemma3)와 유사한 어텐션 패턴을 사용합니다. 글로벌 셀프 어텐션 레이어 1개마다 로컬 슬라이딩 윈도우 셀프 어텐션 레이어 4개를 교차로 배치하며, 최대 컨텍스트 길이는 32k 토큰까지 지원합니다. 비전 모달리티에서는 MobileNet v5를 비전 인코더로 도입하여 기본 해상도를 768x768 픽셀로 처리합니다. 또한 오디오 모달리티에서는 [Universal Speech Model(USM)](https://huggingface.co/papers/2303.01037) 아키텍처를 기반으로 새롭게 학습된 오디오 인코더가 추가되었습니다.
|
||||
|
||||
명령어 기반 미세조정 버전은 지식 증류와 강화 학습을 통해 후처리 학습 되었습니다.
|
||||
|
||||
Gemma 3n의 원본 체크포인트는 [Gemma 3n][gemma3n-collection] 출시 페이지에서 확인할 수 있습니다.
|
||||
|
||||
> [!TIP]
|
||||
> 오른쪽 사이드바에 있는 Gemma 3n 모델을 클릭하면, Gemma를 다양한 비전, 오디오,
|
||||
> 언어 작업에 적용하는 더 많은 예시를 확인할 수 있습니다.
|
||||
|
||||
아래 예시는 [`Pipeline`] 또는 [`AutoModel`] 클래스를 사용하여 이미지를 입력으로 받아 텍스트를 생성하는 방법을 보여줍니다.
|
||||
|
||||
<hfoptions id="usage">
|
||||
<hfoption id="Pipeline">
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import pipeline
|
||||
|
||||
pipeline = pipeline(
|
||||
task="image-text-to-text",
|
||||
model="google/gemma-3n-e4b",
|
||||
device=0,
|
||||
dtype=torch.bfloat16
|
||||
)
|
||||
pipeline(
|
||||
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
|
||||
text="이 이미지에 무엇이 보이나요?"
|
||||
)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="AutoModel">
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoProcessor, Gemma3nForConditionalGeneration
|
||||
|
||||
model = Gemma3nForConditionalGeneration.from_pretrained(
|
||||
"google/gemma-3n-e4b-it",
|
||||
dtype=torch.bfloat16,
|
||||
device_map="auto",
|
||||
attn_implementation="sdpa"
|
||||
)
|
||||
processor = AutoProcessor.from_pretrained(
|
||||
"google/gemma-3n-e4b-it",
|
||||
padding_side="left"
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": [
|
||||
{"type": "text", "text": "당신은 도움이 되는 어시스턴트입니다."}
|
||||
]
|
||||
},
|
||||
{
|
||||
"role": "user", "content": [
|
||||
{"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
|
||||
{"type": "text", "text": "이 이미지에 무엇이 보이나요?"},
|
||||
]
|
||||
},
|
||||
]
|
||||
inputs = processor.apply_chat_template(
|
||||
messages,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
add_generation_prompt=True,
|
||||
).to(model.device)
|
||||
|
||||
output = model.generate(**inputs, max_new_tokens=50, cache_implementation="static")
|
||||
print(processor.decode(output[0], skip_special_tokens=True))
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="transformers CLI">
|
||||
|
||||
```bash
|
||||
echo -e "식물은 특정 과정을 통해 에너지를 생성합니다." | transformers run --task text-generation --model google/gemma-3n-e2b --device 0
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## 참고사항[[notes]]
|
||||
|
||||
- [`Gemma3nForConditionalGeneration`] 클래스를 사용하면 이미지-오디오-텍스트, 이미지-텍스트, 이미지-오디오, 오디오-텍스트, 이미지 단독, 오디오 단독 입력을 모두 처리할 수 있습니다.
|
||||
- Gemma 3n은 한 번의 입력에 여러 이미지를 지원합니다. 다만 프로세서에 전달하기 전에 이미지들이 배치 단위로 올바르게 묶여있는지 확인해야 합니다. 각 배치는 하나 이상의 이미지를 담은 리스트 형식입니다.
|
||||
|
||||
```py
|
||||
url_cow = "https://media.istockphoto.com/id/1192867753/photo/cow-in-berchida-beach-siniscola.jpg?s=612x612&w=0&k=20&c=v0hjjniwsMNfJSuKWZuIn8pssmD5h5bSN1peBd1CmH4="
|
||||
url_cat = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
|
||||
|
||||
messages =[
|
||||
{
|
||||
"role": "system",
|
||||
"content": [
|
||||
{"type": "text", "text": "당신은 도움이 되는 어시스턴트입니다."}
|
||||
]
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image", "url": url_cow},
|
||||
{"type": "image", "url": url_cat},
|
||||
{"type": "text", "text": "어떤 이미지가 더 귀엽습니까?"},
|
||||
]
|
||||
},
|
||||
]
|
||||
```
|
||||
- 프로세서에 전달되는 텍스트에는 이미지를 삽입해야 하는 위치에 `<image_soft_token>` 토큰을 포함해야 합니다(아래 예시 참조).
|
||||
- Gemma 3n은 입력당 최대 하나의 타깃 오디오 클립만 허용합니다. 다만 퓨샷 프롬프트에서는 여러 개의 오디오 클립을 함께 제공할 수 있습니다.
|
||||
- 프로세서에 전달되는 텍스트에는 오디오 클립을 삽입해야 하는 위치에 `<audio_soft_token>` 토큰을 포함해야 합니다.
|
||||
- 프로세서에는 채팅 메시지를 모델 입력 형식으로 변환하기 위한 자체 메서드인 [`~ProcessorMixin.apply_chat_template`]가 포함되어 있습니다.
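
다음은 위 참고사항을 반영해 채팅 템플릿 대신 프로세서를 직접 호출하는 간단한 스케치입니다. `text`/`images` 인자 이름과 배치 구성 방식은 일반적인 프로세서 사용법을 가정한 것입니다.

```py
import requests
from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("google/gemma-3n-e4b-it")

url_cow = "https://media.istockphoto.com/id/1192867753/photo/cow-in-berchida-beach-siniscola.jpg?s=612x612&w=0&k=20&c=v0hjjniwsMNfJSuKWZuIn8pssmD5h5bSN1peBd1CmH4="
url_cat = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
cow = Image.open(requests.get(url_cow, stream=True).raw)
cat = Image.open(requests.get(url_cat, stream=True).raw)

# 이미지가 들어갈 위치마다 <image_soft_token>을 넣고,
# 이미지는 배치 단위(하나 이상의 이미지를 담은 리스트)로 전달합니다.
text = "<image_soft_token><image_soft_token> 어떤 이미지가 더 귀엽습니까?"
inputs = processor(text=text, images=[[cow, cat]], return_tensors="pt")
```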
|
||||
|
||||
## Gemma3nAudioFeatureExtractor[[transformers.Gemma3nAudioFeatureExtractor]]
|
||||
|
||||
[[autodoc]] Gemma3nAudioFeatureExtractor
|
||||
|
||||
## Gemma3nProcessor[[transformers.Gemma3nProcessor]]
|
||||
|
||||
[[autodoc]] Gemma3nProcessor
|
||||
|
||||
## Gemma3nTextConfig[[transformers.Gemma3nTextConfig]]
|
||||
|
||||
[[autodoc]] Gemma3nTextConfig
|
||||
|
||||
## Gemma3nVisionConfig[[transformers.Gemma3nVisionConfig]]
|
||||
|
||||
[[autodoc]] Gemma3nVisionConfig
|
||||
|
||||
## Gemma3nAudioConfig[[transformers.Gemma3nAudioConfig]]
|
||||
|
||||
[[autodoc]] Gemma3nAudioConfig
|
||||
|
||||
## Gemma3nConfig[[transformers.Gemma3nConfig]]
|
||||
|
||||
[[autodoc]] Gemma3nConfig
|
||||
|
||||
## Gemma3nTextModel[[transformers.Gemma3nTextModel]]
|
||||
|
||||
[[autodoc]] Gemma3nTextModel
|
||||
- forward
|
||||
|
||||
## Gemma3nModel[[transformers.Gemma3nModel]]
|
||||
|
||||
[[autodoc]] Gemma3nModel
|
||||
- forward
|
||||
|
||||
## Gemma3nForCausalLM[[transformers.Gemma3nForCausalLM]]
|
||||
|
||||
[[autodoc]] Gemma3nForCausalLM
|
||||
- forward
|
||||
|
||||
## Gemma3nForConditionalGeneration[[transformers.Gemma3nForConditionalGeneration]]
|
||||
|
||||
[[autodoc]] Gemma3nForConditionalGeneration
|
||||
- forward
|
||||
|
||||
85
docs/source/ko/model_doc/lfm2.md
Normal file
@ -0,0 +1,85 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
*이 모델은 2025년 7월 10일에 출시되었으며, 2025년 7월 10일에 Hugging Face Transformers에 추가되었습니다.*
|
||||
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
</div>
|
||||
|
||||
# LFM2
|
||||
|
||||
## 개요[[overview]]
|
||||
|
||||
[LFM2](https://www.liquid.ai/blog/liquid-foundation-models-v2-our-second-series-of-generative-ai-models)는 Liquid AI가 개발한 차세대 Liquid Foundation Model로 edge AI와 온디바이스 배포에 특화되어 설계되었습니다.
|
||||
|
||||
이 모델들은 350M, 700M, 1.2B, 2.6B의 네 가지 크기의 매개변수로 제공되며, CPU, GPU, NPU 하드웨어에서 효율적으로 실행되도록 설계되었습니다. 이로 인해 특히 낮은 지연 시간, 오프라인 작동 및 개인 정보 보호가 필요한 애플리케이션에 적합합니다.
|
||||
|
||||
## 아키텍처[[architecture]]
|
||||
|
||||
아키텍처는 게이트가 있는 짧은 합성곱 블록과 QK 레이어 정규화가 적용된 그룹 쿼리 어텐션 블록으로 구성됩니다. 이 설계는 선형 연산이 입력 의존적인 게이트에 의해 조절되는 동적 시스템 개념에서 비롯되었습니다. 짧은 합성곱은 특히 임베디드 SoC CPU에 최적화되어 있어, 클라우드 연결에 의존하지 않고 빠르고 로컬화된 추론이 필요한 장치에 이상적입니다.
|
||||
|
||||
LFM2는 제한된 속도와 메모리 환경에서 품질을 최대화하도록 설계되었습니다. 이는 퀄컴 스냅드래곤 프로세서에서 실제 최대 메모리 사용량과 추론 속도를 측정하여, 임베디드 하드웨어에서의 실제 성능에 맞게 모델을 최적화하기 위한 체계적인 아키텍처 탐색을 통해 달성되었습니다. 그 결과, 비슷한 크기의 모델에 비해 2배 빠른 디코딩 및 프리필 성능을 달성하면서도, 지식, 수학, 지시 사항 따르기, 다국어 작업 전반에서 우수한 벤치마크 성능을 유지하는 모델이 탄생했습니다.
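
참고로, 아래와 같이 설정 객체를 불러오면 위에서 설명한 블록 구성을 직접 확인해 볼 수 있습니다. 출력되는 필드 이름은 transformers 버전에 따라 다를 수 있습니다.

```python
from transformers import AutoConfig

# 합성곱 블록과 어텐션 블록이 어떻게 배치되어 있는지 설정에서 확인합니다.
config = AutoConfig.from_pretrained("LiquidAI/LFM2-1.2B")
print(config)
```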
|
||||
|
||||
## 예시[[example]]
|
||||
|
||||
다음 예시는 `AutoModelForCausalLM` 클래스를 사용하여 답변을 생성하는 방법을 보여줍니다.
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
# 모델과 토크나이저를 가져옵니다
|
||||
model_id = "LiquidAI/LFM2-1.2B"
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_id,
|
||||
device_map="auto",
|
||||
dtype="bfloat16",
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||
|
||||
# 답변 생성
|
||||
prompt = "What is C. elegans?"
|
||||
input_ids = tokenizer.apply_chat_template(
|
||||
[{"role": "user", "content": prompt}],
|
||||
add_generation_prompt=True,
|
||||
return_tensors="pt",
|
||||
tokenize=True,
|
||||
)
|
||||
|
||||
output = model.generate(
|
||||
input_ids,
|
||||
do_sample=True,
|
||||
temperature=0.3,
|
||||
min_p=0.15,
|
||||
repetition_penalty=1.05,
|
||||
max_new_tokens=512,
|
||||
)
|
||||
|
||||
print(tokenizer.decode(output[0], skip_special_tokens=False))
|
||||
```
|
||||
|
||||
## Lfm2Config [[transformers.Lfm2Config]]
|
||||
|
||||
[[autodoc]] Lfm2Config
|
||||
|
||||
## Lfm2Model [[transformers.Lfm2Model]]
|
||||
|
||||
[[autodoc]] Lfm2Model
|
||||
- forward
|
||||
|
||||
## Lfm2ForCausalLM [[transformers.Lfm2ForCausalLM]]
|
||||
|
||||
[[autodoc]] Lfm2ForCausalLM
|
||||
- forward
|
||||
443
docs/source/ko/model_doc/llama4.md
Normal file
@ -0,0 +1,443 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
*Meta는 이 모델을 2025-04-05에 출시하고 같은 날 Hugging Face Transformers에 추가했습니다.*
|
||||
|
||||
# Llama4[[llama4]]
|
||||
|
||||
|
||||
<div style="float: right;">
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
|
||||
<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
Meta에서 개발한 [Llama 4](https://ai.meta.com/blog/llama-4-multimodal-intelligence/)는 새로운 자기회귀 Mixture-of-Experts (MoE) 아키텍처를 도입합니다.
|
||||
이 세대는 두 가지 모델로 나뉩니다:
|
||||
- 128개의 전문가(expert)를 사용하여 총 약 400B 매개변수 중 17B 활성 매개변수를 갖는 고성능 Llama 4 Maverick
|
||||
- 16개의 전문가만 사용하여 총 약 109B 매개변수 중 17B 활성 매개변수를 갖는 경량화된 Llama 4 Scout
|
||||
|
||||
두 모델 모두 네이티브 멀티모달을 위한 초기 융합(early fusion)을 활용하여 텍스트와 이미지 입력을 처리할 수 있습니다.
|
||||
Maverick과 Scout 모두 200개 언어를 포함하는 데이터에서 최대 40조개의 토큰으로 훈련되었습니다.
|
||||
(아랍어, 스페인어, 독일어, 힌디어를 포함한 12개 언어에 대한 특정 미세 조정 지원 포함)
|
||||
|
||||
Meta는 Llama 4 Scout을 누구나 쉽게 사용할 수 있도록 설계했습니다. Scout은 4비트 또는 8비트 양자화를 적용하면 단일 서버급 GPU에서도 실시간으로 실행할 수 있습니다. 반면, 더 대규모인 Llama 4 Maverick은 고성능 연산을 위해 BF16과 FP8 형식으로 제공합니다.
|
||||
이 모델들은 모델 저장소에서 제공되는 사용자 지정 Llama 4 커뮤니티 라이선스 계약에 따라 출시됩니다.
|
||||
|
||||
모든 원본 Llama 체크포인트는 Hugging Face [meta-llama](https://huggingface.co/meta-llama) 페이지에서 확인하실 수 있습니다.
|
||||
|
||||
> [!TIP]
|
||||
> Llama 4 모델 패밀리는 두 가지 형태로 제공됩니다: 109B와 402B 매개변수입니다. 이 두 형태 모두 매우 큰 모델이며
|
||||
> 일반적인 기기에서는 실행할 수 없습니다. 아래에 메모리 사용량을 줄이는 방법 몇 가지를 정리했습니다.
|
||||
>
|
||||
> 더욱 빠르고 안정적인 다운로드를 위해 `hf_xet` 종속성 설치를 권장합니다:
|
||||
> `pip install transformers[hf_xet]`
|
||||
|
||||
아래 예시들은 [`Pipeline`] 또는 [`AutoModel`]로 생성하는 방법을 보여줍니다. 또한 일부 Llama 4 변형이
|
||||
최대 1천만 토큰의 컨텍스트 길이를 갖기 때문에, 매우 긴 컨텍스트 생성을 활성화하기 위해 올바른 속성을 토글하는 방법을 보여주는 예시도 추가했습니다.
|
||||
|
||||
|
||||
<hfoptions id="usage">
|
||||
<hfoption id="Pipeline">
|
||||
|
||||
```py
|
||||
from transformers import pipeline
|
||||
import torch
|
||||
|
||||
model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
|
||||
|
||||
messages = [
|
||||
{"role": "user", "content": "마요네즈 레시피가 무엇인가요?"},
|
||||
]
|
||||
|
||||
pipe = pipeline(
|
||||
"text-generation",
|
||||
model=model_id,
|
||||
device_map="auto",
|
||||
dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
output = pipe(messages, do_sample=False, max_new_tokens=200)
|
||||
print(output[0]["generated_text"][-1]["content"])
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="AutoModel - Text only">
|
||||
|
||||
```py
|
||||
from transformers import AutoTokenizer, Llama4ForConditionalGeneration
|
||||
import torch
|
||||
|
||||
model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||
|
||||
messages = [
|
||||
{"role": "user", "content": "당신은 누구신가요?"},
|
||||
]
|
||||
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True)
|
||||
|
||||
model = Llama4ForConditionalGeneration.from_pretrained(
|
||||
model_id,
|
||||
device_map="auto",
|
||||
dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
outputs = model.generate(**inputs.to(model.device), max_new_tokens=100)
|
||||
outputs = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:])
|
||||
print(outputs[0])
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="AutoModel - Multimodal">
|
||||
|
||||
```py
|
||||
from transformers import AutoProcessor, Llama4ForConditionalGeneration
|
||||
import torch
|
||||
|
||||
model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
|
||||
|
||||
processor = AutoProcessor.from_pretrained(model_id)
|
||||
model = Llama4ForConditionalGeneration.from_pretrained(
|
||||
model_id,
|
||||
device_map="auto",
|
||||
dtype=torch.bfloat16,
|
||||
)
|
||||
|
||||
img_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image", "url": img_url},
|
||||
{"type": "text", "text": "이 이미지를 두 문장으로 설명해주세요."},
|
||||
]
|
||||
},
|
||||
]
|
||||
|
||||
inputs = processor.apply_chat_template(
|
||||
messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
).to(model.device)
|
||||
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=256,
|
||||
)
|
||||
|
||||
response = processor.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:])[0]
|
||||
print(response)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="AutoModel - Multimodal with multiple images">
|
||||
|
||||
```py
|
||||
from transformers import AutoProcessor, Llama4ForConditionalGeneration
|
||||
import torch
|
||||
|
||||
model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
|
||||
|
||||
processor = AutoProcessor.from_pretrained(model_id)
|
||||
model = Llama4ForConditionalGeneration.from_pretrained(
|
||||
model_id,
|
||||
device_map="auto",
|
||||
dtype=torch.bfloat16,
|
||||
)
|
||||
|
||||
url1 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
|
||||
url2 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png"
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image", "url": url1},
|
||||
{"type": "image", "url": url2},
|
||||
{"type": "text", "text": "이 두 이미지가 어떻게 비슷하고, 어떻게 다른지 설명해주실 수 있나요?"},
|
||||
]
|
||||
},
|
||||
]
|
||||
|
||||
inputs = processor.apply_chat_template(
|
||||
messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
).to(model.device)
|
||||
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=256,
|
||||
)
|
||||
|
||||
response = processor.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:])[0]
|
||||
print(response)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="AutoModel - Long context">
|
||||
|
||||
주의: 아래 예시는 `device_map="auto"`와 flex-attention을 모두 사용합니다.
|
||||
이 예시를 텐서 병렬 모드로 실행하려면 `torchrun`을 사용하세요.
|
||||
|
||||
향후 텐서 병렬 없이 `device_map="auto"`와 flex-attention을 함께 실행할 수 있도록
|
||||
작업할 예정입니다.
|
||||
|
||||
```py
|
||||
from transformers import Llama4ForConditionalGeneration, AutoTokenizer, infer_device
|
||||
import torch
|
||||
import time
|
||||
|
||||
file = "very_long_context_prompt.txt"
|
||||
model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
|
||||
|
||||
with open(file, "r") as f:
|
||||
very_long_text = "\n".join(f.readlines())
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||
model = Llama4ForConditionalGeneration.from_pretrained(
|
||||
model_id,
|
||||
device_map="auto",
|
||||
attn_implementation="flex_attention",
|
||||
dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
messages = [
|
||||
{"role": "user", "content": f"다음 텍스트들을 보세요: [{very_long_text}]\n\n\n\n책들은 무엇이며, 누가 썼나요? 좋은 목록을 만들어주세요."},
|
||||
]
|
||||
input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
|
||||
|
||||
device = infer_device()
|
||||
torch_device_module = getattr(torch, device, torch.cuda)
|
||||
torch_device_module.synchronize()
|
||||
start = time.time()
|
||||
out = model.generate(
|
||||
input_ids.to(model.device),
|
||||
prefill_chunk_size=2048*8,
|
||||
max_new_tokens=300,
|
||||
cache_implementation="hybrid",
|
||||
)
|
||||
print(time.time()-start)
|
||||
print(tokenizer.batch_decode(out[:, input_ids.shape[-1]:]))
|
||||
print(f"{torch_device_module.max_memory_allocated(model.device) / 1024**3:.2f} GiB")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## 효율성; Llama 4의 최대 성능 활용하기[[efficiency-how-to-get-the-best-out-of-llama-4]]
|
||||
|
||||
### 어텐션 방법[[the-attention-methods]]
|
||||
|
||||
기본 설정으로 주어지는 어텐션 함수를 변경하면 계산 성능과 메모리 사용량을 크게 개선할 수 있습니다. 인터페이스에 대한 자세한 설명은 [어텐션 인터페이스](../attention_interface) 개요를 참조하세요.
|
||||
|
||||
Llama 4 모델은 처음 공개될 때부터 다음 어텐션 방식을 지원합니다: `eager`, `flex_attention`, `sdpa`. 최상의 결과를 위해 `flex_attention` 사용을 권장합니다.
|
||||
어텐션 메커니즘 전환은 모델을 초기화할 때 이루어집니다:
|
||||
|
||||
|
||||
<hfoptions id="Attention">
|
||||
<hfoption id="Flex Attention">
|
||||
|
||||
Flex Attention은 모델이 긴 컨텍스트를 처리할 때 최적의 성능을 발휘합니다.
|
||||
|
||||
> [!TIP] 주의: 아래 예시는 `device_map="auto"`와 flex-attention을 모두 사용합니다.
|
||||
> 이 예시를 텐서 병렬 모드로 실행하려면 `torchrun`을 사용하세요.
|
||||
>
|
||||
> 향후 텐서 병렬 없이 `device_map="auto"`와 flex-attention을 함께 실행할 수 있도록
|
||||
> 작업할 예정입니다.
|
||||
|
||||
```py
|
||||
from transformers import Llama4ForConditionalGeneration
|
||||
import torch
|
||||
|
||||
model = Llama4ForConditionalGeneration.from_pretrained(
|
||||
model_id,
|
||||
attn_implementation="flex_attention",
|
||||
device_map="auto",
|
||||
dtype=torch.bfloat16,
|
||||
)
|
||||
```
|
||||
</hfoption>
|
||||
<hfoption id="SDPA">
|
||||
`sdpa` 어텐션 방법은 일반적으로 `eager` 방법보다 계산 효율적입니다.
|
||||
|
||||
```py
|
||||
from transformers import Llama4ForConditionalGeneration
|
||||
import torch
|
||||
|
||||
model = Llama4ForConditionalGeneration.from_pretrained(
|
||||
model_id,
|
||||
attn_implementation="sdpa",
|
||||
device_map="auto",
|
||||
dtype=torch.bfloat16,
|
||||
)
|
||||
```
|
||||
</hfoption>
|
||||
<hfoption id="Eager">
|
||||
`eager` 어텐션 방법이 기본으로 설정되어 있으므로 모델 로드 시 다른 설정이 필요하지 않습니다:
|
||||
|
||||
```py
|
||||
from transformers import Llama4ForConditionalGeneration
|
||||
import torch
|
||||
|
||||
model = Llama4ForConditionalGeneration.from_pretrained(
|
||||
model_id,
|
||||
device_map="auto",
|
||||
dtype=torch.bfloat16,
|
||||
)
|
||||
```
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
|
||||
### 양자화[[quantization]]
|
||||
|
||||
양자화는 가중치를 더 낮은 정밀도로 바꿔 대형 모델의 메모리 부담을 줄입니다. 사용 가능한 양자화 백엔드에 대해서는 [양자화](../quantization/overview) 개요를 참조하세요.
|
||||
현재는 FBGEMM과 LLM-Compressor를 지원하며, 곧 더 많은 방식이 추가될 예정입니다.
|
||||
|
||||
두 가지 방법을 사용하는 예시를 아래에서 확인하세요:
|
||||
|
||||
|
||||
|
||||
다음은 FBGEMM 접근법을 사용하여 BF16 모델을 FP8로 로드하는 예시입니다:
|
||||
|
||||
<hfoptions id="Quantization">
|
||||
<hfoption id="FBGEMM">
|
||||
|
||||
```python
|
||||
from transformers import AutoTokenizer, Llama4ForConditionalGeneration, FbgemmFp8Config
|
||||
import torch
|
||||
|
||||
model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||
|
||||
messages = [
|
||||
{"role": "user", "content": "당신은 누구신가요?"},
|
||||
]
|
||||
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True)
|
||||
|
||||
model = Llama4ForConditionalGeneration.from_pretrained(
|
||||
model_id,
|
||||
device_map="auto",
|
||||
dtype=torch.bfloat16,
|
||||
quantization_config=FbgemmFp8Config()
|
||||
)
|
||||
|
||||
outputs = model.generate(**inputs.to(model.device), max_new_tokens=100)
|
||||
outputs = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:])
|
||||
print(outputs[0])
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="LLM-Compressor">
|
||||
|
||||
LLM-Compressor를 사용할 때는 함께 제공되는 사전 양자화된 FP8 체크포인트를 쓰는 것이 좋습니다:
|
||||
|
||||
```python
|
||||
from transformers import AutoTokenizer, Llama4ForConditionalGeneration
|
||||
import torch
|
||||
|
||||
model_id = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||
|
||||
messages = [
|
||||
{"role": "user", "content": "당신은 누구신가요?"},
|
||||
]
|
||||
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True)
|
||||
|
||||
model = Llama4ForConditionalGeneration.from_pretrained(
|
||||
model_id,
|
||||
tp_plan="auto",
|
||||
dtype=torch.bfloat16,
|
||||
)
|
||||
|
||||
outputs = model.generate(**inputs.to(model.device), max_new_tokens=100)
|
||||
outputs = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:])
|
||||
print(outputs[0])
|
||||
```
|
||||
</hfoption>
|
||||
</hfoptions>
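
개요에서 언급했듯이 Scout은 4비트 또는 8비트 양자화로도 실행할 수 있습니다. 아래는 bitsandbytes를 사용하는 4비트 로딩 스케치이며, 해당 백엔드의 Llama 4 지원 여부와 세부 인자는 환경에 따라 달라질 수 있는 가정입니다.

```python
from transformers import Llama4ForConditionalGeneration, BitsAndBytesConfig
import torch

model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

# 4비트(NF4) 양자화 설정
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = Llama4ForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config,
)
```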
|
||||
|
||||
### 오프로딩[[offloading]]
|
||||
|
||||
CPU 오프로딩을 활성화하면, GPU 메모리가 부족할 때 모델이 구성 요소를 CPU로 이동시킵니다.
|
||||
추론 시 다양한 구성 요소들이 GPU와 CPU 간에 동적으로 로드되고 언로드됩니다. 이를 통해 CPU 메모리가 충분한 한 더 작은 머신에서도 모델을 로드할 수 있습니다.
|
||||
다만 통신 오버헤드로 인해 추론 속도가 느려질 수 있습니다.
|
||||
|
||||
CPU 오프로딩을 활성화하려면 모델 로드 시 `device_map`을 `auto`로 지정하면 됩니다.
|
||||
|
||||
```py
|
||||
from transformers import Llama4ForConditionalGeneration
|
||||
import torch
|
||||
|
||||
model = Llama4ForConditionalGeneration.from_pretrained(
|
||||
model_id,
|
||||
device_map="auto",
|
||||
dtype=torch.bfloat16,
|
||||
)
|
||||
```
|
||||
|
||||
## Llama4Config
|
||||
|
||||
[[autodoc]] Llama4Config
|
||||
|
||||
## Llama4TextConfig
|
||||
|
||||
[[autodoc]] Llama4TextConfig
|
||||
|
||||
## Llama4VisionConfig
|
||||
|
||||
[[autodoc]] Llama4VisionConfig
|
||||
|
||||
## Llama4Processor
|
||||
|
||||
[[autodoc]] Llama4Processor
|
||||
|
||||
## Llama4ImageProcessorFast
|
||||
|
||||
[[autodoc]] Llama4ImageProcessorFast
|
||||
|
||||
## Llama4ForConditionalGeneration
|
||||
|
||||
[[autodoc]] Llama4ForConditionalGeneration
|
||||
- forward
|
||||
|
||||
## Llama4ForCausalLM
|
||||
|
||||
[[autodoc]] Llama4ForCausalLM
|
||||
- forward
|
||||
|
||||
## Llama4TextModel
|
||||
|
||||
[[autodoc]] Llama4TextModel
|
||||
- forward
|
||||
|
||||
|
||||
## Llama4VisionModel
|
||||
|
||||
[[autodoc]] Llama4VisionModel
|
||||
- forward
|
||||
141
docs/source/ko/model_doc/sam_hq.md
Normal file
@ -0,0 +1,141 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
*이 모델은 2023-06-02에 발표되었으며 2025-04-28에 Hugging Face Transformers에 추가되었습니다.*
|
||||
|
||||
# SAM-HQ[[sam_hq]]
|
||||
|
||||
## 개요[[overview]]
|
||||
|
||||
SAM-HQ (High-Quality Segment Anything Model)는 Lei Ke, Mingqiao Ye, Martin Danelljan, Yifan Liu, Yu-Wing Tai, Chi-Keung Tang, Fisher Yu가 제안한 [Segment Anything in High Quality](https://huggingface.co/papers/2306.01567) 논문에서 소개되었습니다.
|
||||
|
||||
이 모델은 기존 SAM(Segment Anything Model)의 향상된 버전입니다. SAM-HQ는 SAM의 핵심 장점인 프롬프트 기반 설계, 효율성, 제로샷 일반화 능력을 그대로 유지하면서도 훨씬 더 높은 품질의 분할 마스크를 생성하는 것이 특징입니다.
|
||||
|
||||

|
||||
|
||||
SAM-HQ는 기존 SAM 모델 대비 다음과 같은 5가지 핵심 개선 사항을 도입했습니다.
|
||||
|
||||
1. 고품질 출력 토큰: SAM-HQ는 SAM의 마스크 디코더에 학습 가능한 토큰을 주입합니다. 이 토큰은 모델이 더 높은 품질의 분할 마스크를 예측하도록 돕는 핵심적인 요소입니다.
|
||||
2. 전역-지역 특징 융합: 모델의 서로 다른 단계에서 추출된 특징들을 결합하여 분할 마스크의 세부적인 정확도를 향상시킵니다. 이미지의 전체적인 맥락 정보와 객체의 미세한 경계 정보를 함께 활용하여 마스크 품질을 개선합니다.
|
||||
3. 훈련 데이터 개선: SAM 모델이 SA-1B와 같은 대규모 데이터를 사용한 것과 달리, SAM-HQ는 신중하게 선별된 44,000개의 고품질 마스크로 구성된 데이터셋을 사용하여 훈련됩니다.
|
||||
4. 높은 효율성: 마스크 품질을 상당히 개선했음에도 불구하고, 추가된 매개변수는 단 0.5%에 불과합니다.
|
||||
5. 제로샷 성능: SAM-HQ는 성능이 개선되었음에도 불구하고, SAM 모델의 강력한 제로샷 일반화 능력을 그대로 유지합니다.
|
||||
|
||||
논문 초록 내용:
|
||||
|
||||
* 최근 발표된 SAM(Segment Anything Model)은 분할 모델의 규모를 확장하는 데 있어 획기적인 발전이며, 강력한 제로샷 기능과 유연한 프롬프트 기능을 제공합니다. 하지만 SAM은 11억 개의 마스크로 훈련되었음에도 불구하고, 특히 복잡하고 정교한 구조를 가진 객체를 분할할 때 마스크 예측 품질이 미흡한 경우가 많습니다. 저희는 HQ-SAM을 제안하며, SAM의 기존 장점인 프롬프트 기반 설계, 효율성, 제로샷 일반화 능력을 모두 유지하면서도 어떤 객체든 정확하게 분할할 수 있는 능력을 부여합니다. 저희는 신중한 설계를 통해 SAM의 사전 훈련된 모델 가중치를 재사용하고 보존하며 최소한의 추가적인 매개변수와 연산만을 도입했습니다. 핵심적으로 저희는 학습 가능한 고품질 출력 토큰을 설계했습니다. 이 토큰은 SAM의 마스크 디코더에 주입되어 고품질 마스크를 예측하는 역할을 담당합니다. 마스크의 세부 사항을 개선하기 위해 이 토큰을 마스크 디코더 특징에만 적용하는 것이 아니라 초기 및 최종 ViT 특징과 먼저 융합하여 사용합니다. 도입된 학습 가능한 매개변수를 훈련하기 위해 저희는 여러 출처에서 가져온 44,000개의 미세 조정된 마스크 데이터셋을 구성했습니다. HQ-SAM은 오직 이 44,000개 마스크 데이터셋만으로 훈련되며 GPU 8대를 사용했을 때 단 4시간이 소요됩니다.
|
||||
|
||||
SAM-HQ 사용 팁:
|
||||
|
||||
- SAM-HQ는 기존 SAM 모델보다 더 높은 품질의 마스크를 생성하며, 특히 복잡한 구조와 미세한 세부 사항을 가진 객체에 대해 성능이 우수합니다.
|
||||
- 이 모델은 더욱 정확한 경계와 얇은 구조에 대한 더 나은 처리 능력을 갖춘 이진 마스크를 예측합니다.
|
||||
- SAM과 마찬가지로 모델은 입력으로 2차원 포인트 및 바운딩 박스를 사용할 때 더 좋은 성능을 보입니다.
|
||||
- 하나의 이미지에 대해 다수의 포인트를 프롬프트로 입력하여 단일의 고품질 마스크를 예측할 수 있습니다(아래 다중 포인트 예시 참조).
|
||||
- 이 모델은 SAM의 제로샷 일반화 능력을 그대로 유지합니다.
|
||||
- SAM-HQ는 SAM 대비 약 0.5%의 추가 매개변수만을 가집니다.
|
||||
- 현재 모델의 미세 조정은 지원되지 않습니다.
|
||||
|
||||
이 모델은 [sushmanth](https://huggingface.co/sushmanth)님께서 기여해주셨습니다.
|
||||
원본 코드는 [여기](https://github.com/SysCV/SAM-HQ)에서 확인하실 수 있습니다.
|
||||
|
||||
아래는 이미지와 2차원 포인트가 주어졌을 때, 마스크를 생성하는 방법에 대한 예시입니다.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from PIL import Image
|
||||
import requests
|
||||
from transformers import infer_device, SamHQModel, SamHQProcessor
|
||||
|
||||
device = infer_device()
|
||||
model = SamHQModel.from_pretrained("syscv-community/sam-hq-vit-base").to(device)
|
||||
processor = SamHQProcessor.from_pretrained("syscv-community/sam-hq-vit-base")
|
||||
|
||||
img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
|
||||
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
|
||||
input_points = [[[450, 600]]] # 이미지 내 창문의 2차원 위치
|
||||
|
||||
inputs = processor(raw_image, input_points=input_points, return_tensors="pt").to(model.device)
|
||||
with torch.no_grad():
|
||||
outputs = model(**inputs)
|
||||
|
||||
masks = processor.image_processor.post_process_masks(
|
||||
outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()
|
||||
)
|
||||
scores = outputs.iou_scores
|
||||
```
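
위 팁에서 언급한 것처럼 하나의 객체에 여러 포인트를 함께 프롬프트로 줄 수도 있습니다. 아래는 앞의 코드를 이어서 실행한다고 가정한 스케치이며, 추가 포인트 좌표는 설명을 위한 가정입니다.

```python
# 같은 객체(창문)에 대해 두 개의 2차원 포인트를 함께 전달합니다.
input_points = [[[450, 600], [500, 650]]]

inputs = processor(raw_image, input_points=input_points, return_tensors="pt").to(model.device)
with torch.no_grad():
    outputs = model(**inputs)

masks = processor.image_processor.post_process_masks(
    outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()
)
scores = outputs.iou_scores
```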
|
||||
|
||||
또한, 프로세서에서 입력 이미지와 함께 사용자의 마스크를 직접 처리하여 모델에 전달할 수도 있습니다.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from PIL import Image
|
||||
import requests
|
||||
from transformers import infer_device, SamHQModel, SamHQProcessor
|
||||
|
||||
device = infer_device()
|
||||
model = SamHQModel.from_pretrained("syscv-community/sam-hq-vit-base").to(device)
|
||||
processor = SamHQProcessor.from_pretrained("syscv-community/sam-hq-vit-base")
|
||||
|
||||
img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
|
||||
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
|
||||
mask_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
|
||||
segmentation_map = Image.open(requests.get(mask_url, stream=True).raw).convert("1")
|
||||
input_points = [[[450, 600]]] # 이미지 내 창문의 2차원 위치
|
||||
|
||||
inputs = processor(raw_image, input_points=input_points, segmentation_maps=segmentation_map, return_tensors="pt").to(model.device)
|
||||
with torch.no_grad():
|
||||
outputs = model(**inputs)
|
||||
|
||||
masks = processor.image_processor.post_process_masks(
|
||||
outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()
|
||||
)
|
||||
scores = outputs.iou_scores
|
||||
```
|
||||
|
||||
## 자료[[resources]]
|
||||
|
||||
다음은 SAM-HQ 사용을 시작하는 데 도움이 되는 공식 Hugging Face 및 커뮤니티 (🌎로 표시) 자료 목록입니다.
|
||||
|
||||
- 모델 사용을 위한 데모 노트북 (출시 예정)
|
||||
- 논문 구현 및 코드: [SAM-HQ 깃허브 저장소](https://github.com/SysCV/SAM-HQ)
|
||||
|
||||
## SamHQConfig[[transformers.SamHQConfig]]
|
||||
|
||||
[[autodoc]] SamHQConfig
|
||||
|
||||
## SamHQVisionConfig[[transformers.SamHQVisionConfig]]
|
||||
|
||||
[[autodoc]] SamHQVisionConfig
|
||||
|
||||
## SamHQMaskDecoderConfig[[transformers.SamHQMaskDecoderConfig]]
|
||||
|
||||
[[autodoc]] SamHQMaskDecoderConfig
|
||||
|
||||
## SamHQPromptEncoderConfig[[transformers.SamHQPromptEncoderConfig]]
|
||||
|
||||
[[autodoc]] SamHQPromptEncoderConfig
|
||||
|
||||
## SamHQProcessor[[transformers.SamHQProcessor]]
|
||||
|
||||
[[autodoc]] SamHQProcessor
|
||||
|
||||
## SamHQVisionModel[[transformers.SamHQVisionModel]]
|
||||
|
||||
[[autodoc]] SamHQVisionModel
|
||||
|
||||
## SamHQModel[[transformers.SamHQModel]]
|
||||
|
||||
[[autodoc]] SamHQModel
|
||||
- forward
|
||||
@ -17,11 +17,6 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
이 가이드는 CPU에서 대규모 모델을 효율적으로 추론하는 방법에 중점을 두고 있습니다.
|
||||
|
||||
## PyTorch JIT 모드 (TorchScript) [[pytorch-jitmode-torchscript]]
|
||||
TorchScript는 PyTorch 코드에서 직렬화와 최적화가 가능한 모델을 생성할때 쓰입니다. TorchScript로 만들어진 프로그램은 기존 Python 프로세스에서 저장한 뒤, 종속성이 없는 새로운 프로세스로 가져올 수 있습니다. PyTorch의 기본 설정인 `eager` 모드와 비교했을때, `jit` 모드는 연산자 결합과 같은 최적화 방법론을 통해 모델 추론에서 대부분 더 나은 성능을 제공합니다.
|
||||
|
||||
TorchScript에 대한 친절한 소개는 [PyTorch TorchScript 튜토리얼](https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html#tracing-modules)을 참조하세요.
|
||||
|
||||
### JIT 모드와 함께하는 IPEX 그래프 최적화 [[ipex-graph-optimization-with-jitmode]]
|
||||
Intel® Extension for PyTorch(IPEX)는 Transformers 계열 모델의 jit 모드에서 추가적인 최적화를 제공합니다. jit 모드와 더불어 Intel® Extension for PyTorch(IPEX)를 활용하시길 강력히 권장드립니다. Transformers 모델에서 자주 사용되는 일부 연산자 패턴은 이미 jit 모드 연산자 결합(operator fusion)의 형태로 Intel® Extension for PyTorch(IPEX)에서 지원되고 있습니다. Multi-head-attention, Concat Linear, Linear+Add, Linear+Gelu, Add+LayerNorm 결합 패턴 등이 이용 가능하며 활용했을 때 성능이 우수합니다. 연산자 결합의 이점은 사용자에게 고스란히 전달됩니다. 분석에 따르면, 질의 응답, 텍스트 분류 및 토큰 분류와 같은 가장 인기 있는 NLP 태스크 중 약 70%가 이러한 결합 패턴을 사용하여 Float32 정밀도와 BFloat16 혼합 정밀도 모두에서 성능상의 이점을 얻을 수 있습니다.
|
||||
|
||||
|
||||
@ -476,8 +476,6 @@ https://huggingface.co/papers/2201.11990)
|
||||
|
||||
따라서 이 프레임워크의 장점은 선택한 클러스터에서 30분 동안 시뮬레이션을 실행하고 이 특정 환경을 최적으로 활용하기 위한 최상의 전략을 제안한다는 것입니다. 부품을 추가/제거/교체하면 실행하고 그에 대한 계획을 다시 최적화한 후 훈련할 수 있습니다. 다른 설정은 자체적인 사용자 정의 최적화를 가질 수 있습니다.
|
||||
|
||||
🤗 Transformers 현황: 아직 통합되지 않음. 이미 [transformers.utils.fx](https://github.com/huggingface/transformers/blob/master/src/transformers/utils/fx.py)를 통해 모델을 FX-추적할 수 있으며, 이는 FlexFlow의 선행 조건입니다. 따라서 어떤 작업을 수행해야 FlexFlow가 우리의 모델과 함께 작동할 수 있는지 파악해야 합니다.
|
||||
|
||||
|
||||
## 어떤 전략을 사용해야 할까요? [[which-strategy-to-use-when]]
|
||||
|
||||
|
||||
@ -47,7 +47,7 @@ ONNX 형식으로 내보낸 모델은 다음과 같이 사용할 수 있습니
|
||||
🤗 Transformers 모델을 ONNX로 내보내려면 먼저 추가 종속성을 설치하세요:
|
||||
|
||||
```bash
|
||||
pip install optimum[exporters]
|
||||
pip install optimum-onnx
|
||||
```
|
||||
|
||||
사용 가능한 모든 인수를 확인하려면 [🤗 Optimum 문서](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli)를 참조하거나 명령줄에서 도움말을 보세요.
|
||||
@ -123,59 +123,3 @@ CLI 대신에 `optimum.onnxruntime`을 사용하여 프로그래밍 방식으로
|
||||
### 지원되지 않는 아키텍처의 모델 내보내기 [[exporting-a-model-for-an-unsupported-architecture]]
|
||||
|
||||
현재 내보낼 수 없는 모델을 지원하기 위해 기여하려면, 먼저 [`optimum.exporters.onnx`](https://huggingface.co/docs/optimum/exporters/onnx/overview)에서 지원되는지 확인한 후 지원되지 않는 경우에는 [🤗 Optimum에 기여](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/contribute)하세요.
|
||||
|
||||
### `transformers.onnx`를 사용하여 모델 내보내기 [[exporting-a-model-with-transformersonnx]]
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
`tranformers.onnx`는 더 이상 유지되지 않습니다. 위에서 설명한 대로 🤗 Optimum을 사용하여 모델을 내보내세요. 이 섹션은 향후 버전에서 제거될 예정입니다.
|
||||
|
||||
</Tip>
|
||||
|
||||
🤗 Transformers 모델을 ONNX로 내보내려면 추가 종속성을 설치하세요:
|
||||
|
||||
```bash
|
||||
pip install transformers[onnx]
|
||||
```
|
||||
|
||||
`transformers.onnx` 패키지를 Python 모듈로 사용하여 준비된 구성을 사용하여 체크포인트를 내보냅니다:
|
||||
|
||||
```bash
|
||||
python -m transformers.onnx --model=distilbert/distilbert-base-uncased onnx/
|
||||
```
|
||||
|
||||
이렇게 하면 `--model` 인수에 정의된 체크포인트의 ONNX 그래프가 내보내집니다. 🤗 Hub에서 제공하는 체크포인트나 로컬에 저장된 체크포인트를 전달할 수 있습니다. 결과로 생성된 `model.onnx` 파일은 ONNX 표준을 지원하는 많은 가속기 중 하나에서 실행할 수 있습니다. 예를 들어, 다음과 같이 ONNX Runtime을 사용하여 모델을 로드하고 실행할 수 있습니다:
|
||||
|
||||
```python
|
||||
>>> from transformers import AutoTokenizer
|
||||
>>> from onnxruntime import InferenceSession
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
|
||||
>>> session = InferenceSession("onnx/model.onnx")
|
||||
>>> # ONNX Runtime expects NumPy arrays as input
|
||||
>>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")
|
||||
>>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs))
|
||||
```
|
||||
|
||||
필요한 출력 이름(예: `["last_hidden_state"]`)은 각 모델의 ONNX 구성을 확인하여 얻을 수 있습니다. 예를 들어, DistilBERT의 경우 다음과 같습니다:
|
||||
|
||||
```python
|
||||
>>> from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig
|
||||
|
||||
>>> config = DistilBertConfig()
|
||||
>>> onnx_config = DistilBertOnnxConfig(config)
|
||||
>>> print(list(onnx_config.outputs.keys()))
|
||||
["last_hidden_state"]
|
||||
```
|
||||
|
||||
Hub의 TensorFlow 체크포인트에 대해서도 동일한 프로세스가 적용됩니다. 예를 들어, 다음과 같이 순수한 TensorFlow 체크포인트를 내보냅니다:
|
||||
|
||||
```bash
|
||||
python -m transformers.onnx --model=keras-io/transformers-qa onnx/
|
||||
```
|
||||
|
||||
로컬에 저장된 모델을 내보내려면 모델의 가중치 파일과 토크나이저 파일을 동일한 디렉토리에 저장한 다음, transformers.onnx 패키지의 --model 인수를 원하는 디렉토리로 지정하여 ONNX로 내보냅니다:
|
||||
|
||||
```bash
|
||||
python -m transformers.onnx --model=local-pt-checkpoint onnx/
|
||||
```
|
||||
@ -1,189 +0,0 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# TorchScript로 내보내기[[export-to-torchscript]]
|
||||
|
||||
<Tip>
|
||||
|
||||
TorchScript를 활용한 실험은 아직 초기 단계로, 가변적인 입력 크기 모델들을 통해 그 기능성을 계속 탐구하고 있습니다.
|
||||
이 기능은 저희가 관심을 두고 있는 분야 중 하나이며,
|
||||
앞으로 출시될 버전에서 더 많은 코드 예제, 더 유연한 구현, 그리고 Python 기반 코드와 컴파일된 TorchScript를 비교하는 벤치마크를 등을 통해 분석을 심화할 예정입니다.
|
||||
|
||||
</Tip>
|
||||
|
||||
[TorchScript 문서](https://pytorch.org/docs/stable/jit.html)에서는 이렇게 말합니다.
|
||||
|
||||
> TorchScript는 PyTorch 코드에서 직렬화 및 최적화 가능한 모델을 생성하는 방법입니다.
|
||||
|
||||
[JIT과 TRACE](https://pytorch.org/docs/stable/jit.html)는 개발자가 모델을 내보내서 효율 지향적인 C++ 프로그램과 같은 다른 프로그램에서 재사용할 수 있도록 하는 PyTorch 모듈입니다.
|
||||
|
||||
PyTorch 기반 Python 프로그램과 다른 환경에서 모델을 재사용할 수 있도록, 🤗 Transformers 모델을 TorchScript로 내보낼 수 있는 인터페이스를 제공합니다.
|
||||
이 문서에서는 TorchScript를 사용하여 모델을 내보내고 사용하는 방법을 설명합니다.
|
||||
|
||||
모델을 내보내려면 두 가지가 필요합니다:
|
||||
|
||||
- `torchscript` 플래그로 모델 인스턴스화
|
||||
- 더미 입력을 사용한 순전파(forward pass)
|
||||
|
||||
이 필수 조건들은 아래에 자세히 설명된 것처럼 개발자들이 주의해야 할 여러 사항들을 의미합니다.
|
||||
|
||||
## TorchScript 플래그와 묶인 가중치(tied weights)[[torchscript-flag-and-tied-weights]]
|
||||
|
||||
`torchscript` 플래그가 필요한 이유는 대부분의 🤗 Transformers 언어 모델에서 `Embedding` 레이어와 `Decoding` 레이어 간의 묶인 가중치(tied weights)가 존재하기 때문입니다.
|
||||
TorchScript는 묶인 가중치를 가진 모델을 내보낼 수 없으므로, 미리 가중치를 풀고 복제해야 합니다.
|
||||
|
||||
`torchscript` 플래그로 인스턴스화된 모델은 `Embedding` 레이어와 `Decoding` 레이어가 분리되어 있으므로 이후에 훈련해서는 안 됩니다.
|
||||
훈련을 하게 되면 두 레이어 간 동기화가 해제되어 예상치 못한 결과가 발생할 수 있습니다.
|
||||
|
||||
언어 모델 헤드를 갖지 않은 모델은 가중치가 묶여 있지 않아서 이 문제가 발생하지 않습니다.
|
||||
이러한 모델들은 `torchscript` 플래그 없이 안전하게 내보낼 수 있습니다.
|
||||
|
||||
## 더미 입력과 표준 길이[[dummy-inputs-and-standard-lengths]]
|
||||
|
||||
더미 입력(dummy inputs)은 모델의 순전파(forward pass)에 사용됩니다.
|
||||
입력 값이 레이어를 통해 전파되는 동안, PyTorch는 각 텐서에서 실행된 다른 연산을 추적합니다.
|
||||
이러한 기록된 연산은 모델의 *추적(trace)*을 생성하는 데 사용됩니다.
|
||||
|
||||
추적은 입력의 차원을 기준으로 생성됩니다.
|
||||
따라서 더미 입력의 차원에 제한되어, 다른 시퀀스 길이나 배치 크기에서는 작동하지 않습니다.
|
||||
다른 크기로 시도할 경우 다음과 같은 오류가 발생합니다:
|
||||
|
||||
```
|
||||
`The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2`
|
||||
```
|
||||
추론 중 모델에 공급될 가장 큰 입력만큼 큰 더미 입력 크기로 모델을 추적하는 것이 좋습니다.
|
||||
패딩은 누락된 값을 채우는 데 도움이 될 수 있습니다.
|
||||
그러나 모델이 더 큰 입력 크기로 추적되기 때문에, 행렬의 차원이 커지고 계산량이 많아집니다.
|
||||
|
||||
다양한 시퀀스 길이 모델을 내보낼 때는 각 입력에 대해 수행되는 총 연산 횟수에 주의하고 성능을 주의 깊게 확인하세요.
|
||||
|
||||
## Python에서 TorchScript 사용하기[[using-torchscript-in-python]]
|
||||
|
||||
이 섹션에서는 모델을 저장하고 가져오는 방법, 추적을 사용하여 추론하는 방법을 보여줍니다.
|
||||
|
||||
### 모델 저장하기[[saving-a-model]]
|
||||
|
||||
`BertModel`을 TorchScript로 내보내려면 `BertConfig` 클래스에서 `BertModel`을 인스턴스화한 다음, `traced_bert.pt`라는 파일명으로 디스크에 저장하면 됩니다.
|
||||
|
||||
```python
|
||||
from transformers import BertModel, BertTokenizer, BertConfig
|
||||
import torch
|
||||
|
||||
enc = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
|
||||
|
||||
# 입력 텍스트 토큰화하기
|
||||
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
|
||||
tokenized_text = enc.tokenize(text)
|
||||
|
||||
# 입력 토큰 중 하나를 마스킹하기
|
||||
masked_index = 8
|
||||
tokenized_text[masked_index] = "[MASK]"
|
||||
indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)
|
||||
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
|
||||
|
||||
# 더미 입력 만들기
|
||||
tokens_tensor = torch.tensor([indexed_tokens])
|
||||
segments_tensors = torch.tensor([segments_ids])
|
||||
dummy_input = [tokens_tensor, segments_tensors]
|
||||
|
||||
# torchscript 플래그로 모델 초기화하기
|
||||
# 이 모델은 LM 헤드가 없으므로 필요하지 않지만, 플래그를 True로 설정합니다.
|
||||
config = BertConfig(
|
||||
vocab_size_or_config_json_file=32000,
|
||||
hidden_size=768,
|
||||
num_hidden_layers=12,
|
||||
num_attention_heads=12,
|
||||
intermediate_size=3072,
|
||||
torchscript=True,
|
||||
)
|
||||
|
||||
# 모델을 인스턴트화하기
|
||||
model = BertModel(config)
|
||||
|
||||
# 모델을 평가 모드로 두어야 합니다.
|
||||
model.eval()
|
||||
|
||||
# 만약 *from_pretrained*를 사용하여 모델을 인스턴스화하는 경우, TorchScript 플래그를 쉽게 설정할 수 있습니다
|
||||
model = BertModel.from_pretrained("google-bert/bert-base-uncased", torchscript=True)
|
||||
|
||||
# 추적 생성하기
|
||||
traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
|
||||
torch.jit.save(traced_model, "traced_bert.pt")
|
||||
```
|
||||
|
||||
### 모델 가져오기[[loading-a-model]]
|
||||
|
||||
이제 이전에 저장한 `BertModel`, 즉 `traced_bert.pt`를 디스크에서 가져오고, 이전에 초기화한 `dummy_input`에서 사용할 수 있습니다.
|
||||
|
||||
```python
|
||||
loaded_model = torch.jit.load("traced_bert.pt")
|
||||
loaded_model.eval()
|
||||
|
||||
all_encoder_layers, pooled_output = loaded_model(*dummy_input)
|
||||
```
|
||||
|
||||
### 추적된 모델을 사용하여 추론하기[[using-a-traced-model-for-inference]]
|
||||
|
||||
`__call__` 이중 언더스코어(dunder) 메소드를 사용하여 추론에 추적된 모델을 사용하세요:
|
||||
|
||||
```python
|
||||
traced_model(tokens_tensor, segments_tensors)
|
||||
```
|
||||
|
||||
## Neuron SDK로 Hugging Face TorchScript 모델을 AWS에 배포하기[[deploy-hugging-face-torchscript-models-to-aws-with-the-neuron-sdk]]
|
||||
|
||||
AWS가 클라우드에서 저비용, 고성능 머신 러닝 추론을 위한 [Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) 인스턴스 제품군을 출시했습니다.
|
||||
Inf1 인스턴스는 딥러닝 추론 워크로드에 특화된 맞춤 하드웨어 가속기인 AWS Inferentia 칩으로 구동됩니다.
|
||||
[AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#)은 Inferentia를 위한 SDK로, Inf1에 배포하기 위한 transformers 모델 추적 및 최적화를 지원합니다.
|
||||
Neuron SDK는 다음과 같은 기능을 제공합니다:
|
||||
|
||||
1. 코드 한 줄만 변경하면 클라우드 추론를 위해 TorchScript 모델을 추적하고 최적화할 수 있는 쉬운 API
|
||||
2. 즉시 사용 가능한 성능 최적화로 [비용 효율 향상](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/benchmark/>)
|
||||
3. [PyTorch](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/bert_tutorial/tutorial_pretrained_bert.html) 또는 [TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/tensorflow/huggingface_bert/huggingface_bert.html)로 구축된 Hugging Face transformers 모델 지원
|
||||
|
||||
### 시사점[[implications]]
|
||||
|
||||
[BERT (Bidirectional Encoder Representations from Transformers)](https://huggingface.co/docs/transformers/main/model_doc/bert) 아키텍처 또는 그 변형인 [distilBERT](https://huggingface.co/docs/transformers/main/model_doc/distilbert) 및 [roBERTa](https://huggingface.co/docs/transformers/main/model_doc/roberta)를 기반으로 한 Transformers 모델은 추출 기반 질의응답, 시퀀스 분류 및 토큰 분류와 같은 비생성 작업 시 Inf1에서 최상의 성능을 보입니다.
|
||||
그러나 텍스트 생성 작업도 [AWS Neuron MarianMT 튜토리얼](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html)을 따라 Inf1에서 실행되도록 조정할 수 있습니다.
|
||||
|
||||
Inferentia에서 바로 변환할 수 있는 모델에 대한 자세한 정보는 Neuron 문서의 [Model Architecture Fit](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/models/models-inferentia.html#models-inferentia) 섹션에서 확인할 수 있습니다.
|
||||
|
||||
### 종속성[[dependencies]]
|
||||
|
||||
AWS Neuron을 사용하여 모델을 변환하려면 [Neuron SDK 환경](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/index.html#installation-guide)이 필요합니다.
|
||||
이는 [AWS Deep Learning AMI](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launching.html)에 미리 구성되어 있습니다.
|
||||
|
||||
### AWS Neuron으로 모델 변환하기[[converting-a-model-for-aws-neuron]]
|
||||
|
||||
`BertModel`을 추적하려면, [Python에서 TorchScript 사용하기](torchscript#using-torchscript-in-python)에서와 동일한 코드를 사용해서 AWS NEURON용 모델을 변환합니다.
|
||||
`torch.neuron` 프레임워크 익스텐션을 가져와 Python API를 통해 Neuron SDK의 구성 요소에 접근합니다:
|
||||
|
||||
```python
|
||||
from transformers import BertModel, BertTokenizer, BertConfig
|
||||
import torch
|
||||
import torch.neuron
|
||||
```
|
||||
|
||||
다음 줄만 수정하면 됩니다:
|
||||
|
||||
```diff
|
||||
- torch.jit.trace(model, [tokens_tensor, segments_tensors])
|
||||
+ torch.neuron.trace(model, [token_tensor, segments_tensors])
|
||||
```
|
||||
|
||||
이로써 Neuron SDK가 모델을 추적하고 Inf1 인스턴스에 최적화할 수 있게 됩니다.
|
||||
|
||||
AWS Neuron SDK의 기능, 도구, 예제 튜토리얼 및 최신 업데이트에 대해 자세히 알아보려면 [AWS NeuronSDK 문서](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html)를 참조하세요.
|
||||
@ -44,8 +44,6 @@
|
||||
title: 聊天模型的模板
|
||||
- local: serialization
|
||||
title: 导出为 ONNX
|
||||
- local: torchscript
|
||||
title: 导出为 TorchScript
|
||||
- local: gguf
|
||||
title: 与 GGUF 格式的互操作性
|
||||
- local: tiktoken
|
||||
@ -109,8 +107,6 @@
|
||||
title: 模型
|
||||
- local: main_classes/text_generation
|
||||
title: 文本生成
|
||||
- local: main_classes/onnx
|
||||
title: ONNX
|
||||
- local: main_classes/optimizer_schedules
|
||||
title: Optimization
|
||||
- local: main_classes/output
|
||||
|
||||
@ -1,45 +0,0 @@
|
||||
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# 导出 🤗 Transformers 模型到 ONNX
|
||||
|
||||
🤗 Transformers提供了一个`transformers.onnx`包,通过利用配置对象,您可以将模型checkpoints转换为ONNX图。
|
||||
|
||||
有关更多详细信息,请参阅导出 🤗 Transformers 模型的[指南](../serialization)。
|
||||
|
||||
## ONNX Configurations
|
||||
|
||||
我们提供了三个抽象类,取决于您希望导出的模型架构类型:
|
||||
|
||||
* 基于编码器的模型继承 [`~onnx.config.OnnxConfig`]
|
||||
* 基于解码器的模型继承 [`~onnx.config.OnnxConfigWithPast`]
|
||||
* 编码器-解码器模型继承 [`~onnx.config.OnnxSeq2SeqConfigWithPast`]
|
||||
|
||||
### OnnxConfig
|
||||
|
||||
[[autodoc]] onnx.config.OnnxConfig
|
||||
|
||||
### OnnxConfigWithPast
|
||||
|
||||
[[autodoc]] onnx.config.OnnxConfigWithPast
|
||||
|
||||
### OnnxSeq2SeqConfigWithPast
|
||||
|
||||
[[autodoc]] onnx.config.OnnxSeq2SeqConfigWithPast
|
||||
|
||||
## ONNX Features
|
||||
|
||||
每个ONNX配置与一组 _特性_ 相关联,使您能够为不同类型的拓扑结构或任务导出模型。
|
||||
@ -47,7 +47,7 @@ rendered properly in your Markdown viewer.
|
||||
要将 🤗 Transformers 模型导出为 ONNX,首先需要安装额外的依赖项:
|
||||
|
||||
```bash
|
||||
pip install optimum[exporters]
|
||||
pip install optimum-onnx
|
||||
```
|
||||
|
||||
请参阅 [🤗 Optimum 文档](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli) 以查看所有可用参数,或者在命令行中查看帮助:
|
||||
@ -117,53 +117,3 @@ optimum-cli export onnx --model local_path --task question-answering distilbert_
|
||||
### 导出尚未支持的架构的模型
|
||||
|
||||
如果你想要为当前无法导出的模型添加支持,请先检查 [`optimum.exporters.onnx`](https://huggingface.co/docs/optimum/exporters/onnx/overview) 是否支持该模型,如果不支持,你可以 [直接为 🤗 Optimum 贡献代码](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/contribute)。
|
||||
|
||||
### 使用 `transformers.onnx` 导出模型
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
`transformers.onnx` 不再进行维护,请如上所述,使用 🤗 Optimum 导出模型。这部分内容将在未来版本中删除。
|
||||
|
||||
</Tip>
|
||||
|
||||
要使用 `transformers.onnx` 将 🤗 Transformers 模型导出为 ONNX,请安装额外的依赖项:
|
||||
|
||||
```bash
|
||||
pip install transformers[onnx]
|
||||
```
|
||||
|
||||
将 `transformers.onnx` 包作为 Python 模块使用,以使用现成的配置导出检查点:
|
||||
|
||||
```bash
|
||||
python -m transformers.onnx --model=distilbert/distilbert-base-uncased onnx/
|
||||
```
|
||||
|
||||
以上代码将导出由 `--model` 参数定义的检查点的 ONNX 图。传入任何 🤗 Hub 上或者存储与本地的检查点。生成的 `model.onnx` 文件可以在支持 ONNX 标准的众多加速引擎上运行。例如,使用 ONNX Runtime 加载并运行模型,如下所示:
|
||||
|
||||
```python
|
||||
>>> from transformers import AutoTokenizer
|
||||
>>> from onnxruntime import InferenceSession
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
|
||||
>>> session = InferenceSession("onnx/model.onnx")
|
||||
>>> # ONNX Runtime expects NumPy arrays as input
|
||||
>>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")
|
||||
>>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs))
|
||||
```
|
||||
|
||||
可以通过查看每个模型的 ONNX 配置来获取所需的输出名(例如 `["last_hidden_state"]`)。例如,对于 DistilBERT,可以用以下代码获取输出名称:
|
||||
|
||||
```python
|
||||
>>> from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig
|
||||
|
||||
>>> config = DistilBertConfig()
|
||||
>>> onnx_config = DistilBertOnnxConfig(config)
|
||||
>>> print(list(onnx_config.outputs.keys()))
|
||||
["last_hidden_state"]
|
||||
```
|
||||
|
||||
要导出本地存储的模型,请将模型的权重和分词器文件保存在同一目录中(例如 `local-pt-checkpoint`),然后通过将 `transformers.onnx` 包的 `--model` 参数指向该目录,将其导出为 ONNX:
|
||||
|
||||
```bash
|
||||
python -m transformers.onnx --model=local-pt-checkpoint onnx/
|
||||
```
|
||||
|
||||
@ -1,197 +0,0 @@
|
||||
<!--
|
||||
Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
-->
|
||||
|
||||
# 导出为 TorchScript
|
||||
|
||||
<Tip>
|
||||
|
||||
这是开始使用 TorchScript 进行实验的起点,我们仍在探索其在变量输入大小模型中的能力。
|
||||
这是我们关注的焦点,我们将在即将发布的版本中深入分析,提供更多的代码示例、更灵活的实现以及比较
|
||||
Python 代码与编译 TorchScript 的性能基准。
|
||||
|
||||
</Tip>
|
||||
|
||||
根据 [TorchScript 文档](https://pytorch.org/docs/stable/jit.html):
|
||||
|
||||
> TorchScript 是从 PyTorch 代码创建可序列化和可优化的模型的一种方式。
|
||||
|
||||
有两个 PyTorch 模块:[JIT 和 TRACE](https://pytorch.org/docs/stable/jit.html)。
|
||||
这两个模块允许开发人员将其模型导出到其他程序中重用,比如面向效率的 C++ 程序。
|
||||
|
||||
我们提供了一个接口,允许您将 🤗 Transformers 模型导出为 TorchScript,
|
||||
以便在与基于 PyTorch 的 Python 程序不同的环境中重用。
|
||||
本文解释如何使用 TorchScript 导出并使用我们的模型。
|
||||
|
||||
导出模型需要两个步骤:
|
||||
|
||||
- 使用 `torchscript` 参数实例化模型
|
||||
- 使用虚拟输入进行前向传递
|
||||
|
||||
这些必要条件意味着开发人员应该注意以下详细信息。
|
||||
|
||||
## TorchScript 参数和绑定权重
|
||||
|
||||
`torchscript` 参数是必需的,因为大多数 🤗 Transformers 语言模型的 `Embedding` 层和
|
||||
`Decoding` 层之间有绑定权重。TorchScript 不允许导出具有绑定权重的模型,因此必须事先解绑和克隆权重。
|
||||
|
||||
使用 `torchscript` 参数实例化的模型将其 `Embedding` 层和 `Decoding` 层分开,
|
||||
这意味着它们不应该在后续进行训练。训练将导致这两层不同步,产生意外结果。
|
||||
|
||||
对于没有语言模型头部的模型,情况不同,因为这些模型没有绑定权重。
|
||||
这些模型可以安全地导出而无需 `torchscript` 参数。
|
||||
|
||||
## 虚拟输入和标准长度
|
||||
|
||||
虚拟输入用于模型的前向传递。当输入的值传播到各层时,PyTorch 会跟踪在每个张量上执行的不同操作。
|
||||
然后使用记录的操作来创建模型的 *trace* 。
|
||||
|
||||
跟踪是相对于输入的维度创建的。因此,它受到虚拟输入的维度限制,对于任何其他序列长度或批量大小都不起作用。
|
||||
当尝试使用不同大小时,会引发以下错误:
|
||||
|
||||
```text
|
||||
`The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2`
|
||||
```
|
||||
|
||||
我们建议使用至少与推断期间将馈送到模型的最大输入一样大的虚拟输入大小进行跟踪。
|
||||
填充可以帮助填补缺失的值。然而,由于模型是使用更大的输入大小进行跟踪的,矩阵的维度也会很大,导致更多的计算。
|
||||
|
||||
在每个输入上执行的操作总数要仔细考虑,并在导出不同序列长度模型时密切关注性能。
|
||||
|
||||
## 在 Python 中使用 TorchScript
|
||||
|
||||
本节演示了如何保存和加载模型以及如何使用 trace 进行推断。
|
||||
|
||||
### 保存模型
|
||||
|
||||
要使用 TorchScript 导出 `BertModel`,请从 `BertConfig` 类实例化 `BertModel`,
|
||||
然后将其保存到名为 `traced_bert.pt` 的磁盘文件中:
|
||||
|
||||
```python
|
||||
from transformers import BertModel, BertTokenizer, BertConfig
|
||||
import torch
|
||||
|
||||
enc = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
|
||||
|
||||
# 对输入文本分词
|
||||
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
|
||||
tokenized_text = enc.tokenize(text)
|
||||
|
||||
# 屏蔽一个输入 token
|
||||
masked_index = 8
|
||||
tokenized_text[masked_index] = "[MASK]"
|
||||
indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)
|
||||
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
|
||||
|
||||
# 创建虚拟输入
|
||||
tokens_tensor = torch.tensor([indexed_tokens])
|
||||
segments_tensors = torch.tensor([segments_ids])
|
||||
dummy_input = [tokens_tensor, segments_tensors]
|
||||
|
||||
# 使用 torchscript 参数初始化模型
|
||||
# 即使此模型没有 LM Head,也将参数设置为 True。
|
||||
config = BertConfig(
|
||||
vocab_size_or_config_json_file=32000,
|
||||
hidden_size=768,
|
||||
num_hidden_layers=12,
|
||||
num_attention_heads=12,
|
||||
intermediate_size=3072,
|
||||
torchscript=True,
|
||||
)
|
||||
|
||||
# 实例化模型
|
||||
model = BertModel(config)
|
||||
|
||||
# 模型需要处于评估模式
|
||||
model.eval()
|
||||
|
||||
# 如果您使用 *from_pretrained* 实例化模型,还可以轻松设置 TorchScript 参数
|
||||
model = BertModel.from_pretrained("google-bert/bert-base-uncased", torchscript=True)
|
||||
|
||||
# 创建 trace
|
||||
traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
|
||||
torch.jit.save(traced_model, "traced_bert.pt")
|
||||
```
|
||||
|
||||
### 加载模型
|
||||
|
||||
现在,您可以从磁盘加载先前保存的 `BertModel`、`traced_bert.pt`,并在先前初始化的 `dummy_input` 上使用:
|
||||
|
||||
```python
|
||||
loaded_model = torch.jit.load("traced_bert.pt")
|
||||
loaded_model.eval()
|
||||
|
||||
all_encoder_layers, pooled_output = loaded_model(*dummy_input)
|
||||
```
|
||||
|
||||
### 使用 trace 模型进行推断
|
||||
|
||||
通过使用其 `__call__` dunder 方法使用 trace 模型进行推断:
|
||||
|
||||
```python
|
||||
traced_model(tokens_tensor, segments_tensors)
|
||||
```
|
||||
|
||||
## 使用 Neuron SDK 将 Hugging Face TorchScript 模型部署到 AWS
|
||||
|
||||
AWS 引入了用于云端低成本、高性能机器学习推理的
|
||||
[Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) 实例系列。
|
||||
Inf1 实例由 AWS Inferentia 芯片提供支持,这是一款专为深度学习推理工作负载而构建的定制硬件加速器。
|
||||
[AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#) 是
|
||||
Inferentia 的 SDK,支持对 transformers 模型进行跟踪和优化,以便在 Inf1 上部署。Neuron SDK 提供:
|
||||
|
||||
1. 简单易用的 API,只需更改一行代码即可为云端推理跟踪和优化 TorchScript 模型。
|
||||
2. 针对[改进的性能成本](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/benchmark/)的即插即用性能优化。
|
||||
3. 支持使用 [PyTorch](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/bert_tutorial/tutorial_pretrained_bert.html)
|
||||
或 [TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/tensorflow/huggingface_bert/huggingface_bert.html)
|
||||
构建的 Hugging Face transformers 模型。
|
||||
|
||||
### 影响
|
||||
|
||||
基于 [BERT(来自 Transformers 的双向编码器表示)](https://huggingface.co/docs/transformers/main/model_doc/bert)架构的
|
||||
transformers 模型,或其变体,如 [distilBERT](https://huggingface.co/docs/transformers/main/model_doc/distilbert)
|
||||
和 [roBERTa](https://huggingface.co/docs/transformers/main/model_doc/roberta) 在 Inf1 上运行最佳,
|
||||
可用于生成抽取式问答、序列分类和标记分类等任务。然而,文本生成任务仍可以适应在 Inf1 上运行,
|
||||
如这篇 [AWS Neuron MarianMT 教程](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html)所述。
|
||||
有关可以直接在 Inferentia 上转换的模型的更多信息,请参阅 Neuron 文档的[模型架构适配](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/models/models-inferentia.html#models-inferentia)章节。
|
||||
|
||||
### 依赖关系
|
||||
|
||||
使用 AWS Neuron 将模型转换为模型需要一个
|
||||
[Neuron SDK 环境](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/index.html#installation-guide),
|
||||
它已经预先配置在 [AWS 深度学习 AMI](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launching.html)上。
|
||||
|
||||
### 将模型转换为 AWS Neuron
|
||||
|
||||
使用与 [Python 中使用 TorchScript](torchscript#using-torchscript-in-python) 相同的代码来跟踪
|
||||
`BertModel` 以将模型转换为 AWS NEURON。导入 `torch.neuron` 框架扩展以通过 Python API 访问 Neuron SDK 的组件:
|
||||
|
||||
```python
|
||||
from transformers import BertModel, BertTokenizer, BertConfig
|
||||
import torch
|
||||
import torch.neuron
|
||||
```
|
||||
|
||||
您只需要修改下面这一行:
|
||||
|
||||
```diff
|
||||
- torch.jit.trace(model, [tokens_tensor, segments_tensors])
|
||||
+ torch.neuron.trace(model, [token_tensor, segments_tensors])
|
||||
```
|
||||
|
||||
这样就能使 Neuron SDK 跟踪模型并对其进行优化,以在 Inf1 实例上运行。
|
||||
|
||||
要了解有关 AWS Neuron SDK 功能、工具、示例教程和最新更新的更多信息,
|
||||
请参阅 [AWS NeuronSDK 文档](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html)。
|
||||
@ -151,7 +151,6 @@ def main():
|
||||
if dist.is_initialized() and dp_mesh.size() > 1:
|
||||
model = FSDP(model, device_mesh=dp_mesh, sharding_strategy=ShardingStrategy.NO_SHARD)
|
||||
use_ddp = True
|
||||
pass
|
||||
|
||||
model.train()
|
||||
|
||||
|
||||
@ -122,7 +122,7 @@ class GLUETransformer(BaseTransformer):
|
||||
out_label_list = [[] for _ in range(out_label_ids.shape[0])]
|
||||
preds_list = [[] for _ in range(out_label_ids.shape[0])]
|
||||
|
||||
results = {**{"val_loss": val_loss_mean}, **compute_metrics(self.hparams.task, preds, out_label_ids)}
|
||||
results = {"val_loss": val_loss_mean, **compute_metrics(self.hparams.task, preds, out_label_ids)}
|
||||
|
||||
ret = dict(results.items())
|
||||
ret["log"] = results
|
||||
|
||||
@ -60,8 +60,8 @@ def pack_data_dir(tok, data_dir: Path, max_tokens, save_path):
|
||||
save_path.mkdir(exist_ok=True)
|
||||
for split in ["train"]:
|
||||
src_path, tgt_path = data_dir / f"{split}.source", data_dir / f"{split}.target"
|
||||
src_docs = [x.rstrip() for x in Path(src_path).open().readlines()]
|
||||
tgt_docs = [x.rstrip() for x in Path(tgt_path).open().readlines()]
|
||||
src_docs = [x.rstrip() for x in Path(src_path).open()]
|
||||
tgt_docs = [x.rstrip() for x in Path(tgt_path).open()]
|
||||
packed_src, packed_tgt = pack_examples(tok, src_docs, tgt_docs, max_tokens)
|
||||
print(f"packed {split} split from {len(src_docs)} examples -> {len(packed_src)}.")
|
||||
Path(save_path / f"{split}.source").open("w").write("\n".join(packed_src))
|
||||
|
||||
@ -19,8 +19,8 @@ from utils import calculate_rouge, save_json
|
||||
|
||||
def calculate_rouge_path(pred_path, tgt_path, save_path=None, **kwargs):
|
||||
"""Kwargs will be passed to calculate_rouge"""
|
||||
pred_lns = [x.strip() for x in open(pred_path).readlines()]
|
||||
tgt_lns = [x.strip() for x in open(tgt_path).readlines()][: len(pred_lns)]
|
||||
pred_lns = [x.strip() for x in open(pred_path)]
|
||||
tgt_lns = [x.strip() for x in open(tgt_path)][: len(pred_lns)]
|
||||
metrics = calculate_rouge(pred_lns, tgt_lns, **kwargs)
|
||||
if save_path is not None:
|
||||
save_json(metrics, save_path, indent=None)
|
||||
|
||||
@ -205,7 +205,7 @@ def run_generate():
|
||||
return
|
||||
tgt_file = Path(args.data_dir).joinpath(args.type_path + ".target")
|
||||
with open(tgt_file) as f:
|
||||
labels = [x.rstrip() for x in f.readlines()][: len(preds)]
|
||||
labels = [x.rstrip() for x in f][: len(preds)]
|
||||
|
||||
# Calculate metrics, save metrics, and save _generations.txt
|
||||
calc_bleu = "translation" in args.task
|
||||
@ -252,8 +252,7 @@ def gather_results_from_each_node(num_replicas, save_dir, timeout) -> list[dict[
|
||||
return json_data
|
||||
except JSONDecodeError:
|
||||
continue
|
||||
else:
|
||||
raise TimeoutError("Rank 0 gave up on waiting for other processes")
|
||||
raise TimeoutError("Rank 0 gave up on waiting for other processes")
|
||||
# Unreachable
|
||||
|
||||
|
||||
|
||||
@ -130,7 +130,7 @@ def run_generate(verbose=True):
|
||||
parsed_args = parse_numeric_n_bool_cl_kwargs(rest)
|
||||
if parsed_args and verbose:
|
||||
print(f"parsed the following generate kwargs: {parsed_args}")
|
||||
examples = [" " + x.rstrip() if "t5" in args.model_name else x.rstrip() for x in open(args.input_path).readlines()]
|
||||
examples = [" " + x.rstrip() if "t5" in args.model_name else x.rstrip() for x in open(args.input_path)]
|
||||
if args.n_obs > 0:
|
||||
examples = examples[: args.n_obs]
|
||||
Path(args.save_path).parent.mkdir(exist_ok=True)
|
||||
@ -159,8 +159,8 @@ def run_generate(verbose=True):
|
||||
|
||||
# Compute scores
|
||||
score_fn = calculate_bleu if "translation" in args.task else calculate_rouge
|
||||
output_lns = [x.rstrip() for x in open(args.save_path).readlines()]
|
||||
reference_lns = [x.rstrip() for x in open(args.reference_path).readlines()][: len(output_lns)]
|
||||
output_lns = [x.rstrip() for x in open(args.save_path)]
|
||||
reference_lns = [x.rstrip() for x in open(args.reference_path)][: len(output_lns)]
|
||||
scores: dict = score_fn(output_lns, reference_lns)
|
||||
scores.update(runtime_metrics)
|
||||
|
||||
|
||||
@ -162,7 +162,7 @@ class AbstractSeq2SeqDataset(Dataset):
|
||||
|
||||
@staticmethod
|
||||
def get_char_lens(data_file):
|
||||
return [len(x) for x in Path(data_file).open().readlines()]
|
||||
return [len(x) for x in Path(data_file).open()]
|
||||
|
||||
@cached_property
|
||||
def tgt_lens(self):
|
||||
|
||||
@ -31,7 +31,7 @@ with open(dataset) as f_p:
|
||||
continue
|
||||
|
||||
if (subword_len_counter + current_subwords_len) > max_len:
|
||||
print("")
|
||||
print()
|
||||
print(line)
|
||||
subword_len_counter = current_subwords_len
|
||||
continue
|
||||
|
||||
@ -5,8 +5,10 @@
|
||||
# modular_duplicated_method.py file directly. One of our CI enforces this.
|
||||
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from ...configuration_utils import PreTrainedConfig
|
||||
from ...modeling_rope_utils import rope_config_validation
|
||||
from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params
|
||||
|
||||
|
||||
class DuplicatedMethodConfig(PreTrainedConfig):
|
||||
@ -65,45 +67,10 @@ class DuplicatedMethodConfig(PreTrainedConfig):
|
||||
results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232).
|
||||
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
|
||||
Whether to tie weight embeddings
|
||||
rope_theta (`float`, *optional*, defaults to 10000.0):
|
||||
The base period of the RoPE embeddings.
|
||||
rope_scaling (`Dict`, *optional*):
|
||||
Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
|
||||
and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
|
||||
accordingly.
|
||||
Expected contents:
|
||||
`rope_type` (`str`):
|
||||
The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
|
||||
'duplicated_method3'], with 'default' being the original RoPE implementation.
|
||||
`factor` (`float`, *optional*):
|
||||
Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
|
||||
most scaling types, a `factor` of x will enable the model to handle sequences of length x *
|
||||
original maximum pre-trained length.
|
||||
`original_max_position_embeddings` (`int`, *optional*):
|
||||
Used with 'dynamic', 'longrope' and 'duplicated_method3'. The original max position embeddings used during
|
||||
pretraining.
|
||||
`attention_factor` (`float`, *optional*):
|
||||
Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
|
||||
computation. If unspecified, it defaults to value recommended by the implementation, using the
|
||||
`factor` field to infer the suggested value.
|
||||
`beta_fast` (`float`, *optional*):
|
||||
Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
|
||||
ramp function. If unspecified, it defaults to 32.
|
||||
`beta_slow` (`float`, *optional*):
|
||||
Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
|
||||
ramp function. If unspecified, it defaults to 1.
|
||||
`short_factor` (`list[float]`, *optional*):
|
||||
Only used with 'longrope'. The scaling factor to be applied to short contexts (<
|
||||
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
|
||||
size divided by the number of attention heads divided by 2
|
||||
`long_factor` (`list[float]`, *optional*):
|
||||
Only used with 'longrope'. The scaling factor to be applied to long contexts (<
|
||||
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
|
||||
size divided by the number of attention heads divided by 2
|
||||
`low_freq_factor` (`float`, *optional*):
|
||||
Only used with 'duplicated_method3'. Scaling factor applied to low frequency components of the RoPE
|
||||
`high_freq_factor` (`float`, *optional*):
|
||||
Only used with 'duplicated_method3'. Scaling factor applied to high frequency components of the RoPE
|
||||
rope_parameters (`RopeParameters`, *optional*):
|
||||
Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain
|
||||
a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
|
||||
with longer `max_position_embeddings`.
|
||||
attention_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use a bias in the query, key, value and output projection layers during self-attention.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
@ -146,28 +113,27 @@ class DuplicatedMethodConfig(PreTrainedConfig):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=32000,
|
||||
hidden_size=4096,
|
||||
intermediate_size=11008,
|
||||
num_hidden_layers=32,
|
||||
num_attention_heads=32,
|
||||
num_key_value_heads=None,
|
||||
hidden_act="silu",
|
||||
max_position_embeddings=2048,
|
||||
initializer_range=0.02,
|
||||
rms_norm_eps=1e-6,
|
||||
use_cache=True,
|
||||
pad_token_id=None,
|
||||
bos_token_id=1,
|
||||
eos_token_id=2,
|
||||
pretraining_tp=1,
|
||||
tie_word_embeddings=False,
|
||||
rope_theta=10000.0,
|
||||
rope_scaling=None,
|
||||
attention_bias=False,
|
||||
attention_dropout=0.0,
|
||||
mlp_bias=False,
|
||||
head_dim=None,
|
||||
vocab_size: Optional[int] = 32000,
|
||||
hidden_size: Optional[int] = 4096,
|
||||
intermediate_size: Optional[int] = 11008,
|
||||
num_hidden_layers: Optional[int] = 32,
|
||||
num_attention_heads: Optional[int] = 32,
|
||||
num_key_value_heads: Optional[int] = None,
|
||||
hidden_act: Optional[str] = "silu",
|
||||
max_position_embeddings: Optional[int] = 2048,
|
||||
initializer_range: Optional[float] = 0.02,
|
||||
rms_norm_eps: Optional[int] = 1e-6,
|
||||
use_cache: Optional[bool] = True,
|
||||
pad_token_id: Optional[int] = None,
|
||||
bos_token_id: Optional[int] = 1,
|
||||
eos_token_id: Optional[int] = 2,
|
||||
pretraining_tp: Optional[int] = 1,
|
||||
tie_word_embeddings: Optional[bool] = False,
|
||||
rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
|
||||
attention_bias: Optional[bool] = False,
|
||||
attention_dropout: Optional[float] = 0.0,
|
||||
mlp_bias: Optional[bool] = False,
|
||||
head_dim: Optional[int] = None,
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
@ -187,16 +153,17 @@ class DuplicatedMethodConfig(PreTrainedConfig):
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
self.pretraining_tp = pretraining_tp
|
||||
self.use_cache = use_cache
|
||||
self.rope_theta = rope_theta
|
||||
self.rope_scaling = rope_scaling
|
||||
self.attention_bias = attention_bias
|
||||
self.attention_dropout = attention_dropout
|
||||
self.mlp_bias = mlp_bias
|
||||
self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
|
||||
# Try to set `rope_scaling` if available, otherwise use `rope_parameters`
|
||||
rope_scaling = kwargs.pop("rope_scaling", None)
|
||||
self.rope_parameters = rope_scaling or rope_parameters
|
||||
|
||||
# Validate the correctness of rotary position embeddings parameters
|
||||
# BC: if there is a 'type' field, copy it it to 'rope_type'.
|
||||
if self.rope_scaling is not None and "type" in self.rope_scaling:
|
||||
self.rope_scaling["rope_type"] = self.rope_scaling["type"]
|
||||
rope_theta = kwargs.get("rope_theta", 10000.0)
|
||||
standardize_rope_params(self, rope_theta=rope_theta)
|
||||
rope_config_validation(self)
|
||||
|
||||
super().__init__(
|
||||
|
||||
@ -5,8 +5,10 @@
|
||||
# modular_my_new_model.py file directly. One of our CI enforces this.
|
||||
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from ...configuration_utils import PreTrainedConfig
|
||||
from ...modeling_rope_utils import rope_config_validation
|
||||
from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params
|
||||
|
||||
|
||||
class MyNewModelConfig(PreTrainedConfig):
|
||||
@ -147,38 +149,30 @@ class MyNewModelConfig(PreTrainedConfig):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=32000,
|
||||
hidden_size=4096,
|
||||
intermediate_size=11008,
|
||||
num_hidden_layers=32,
|
||||
num_attention_heads=32,
|
||||
num_key_value_heads=None,
|
||||
hidden_act="silu",
|
||||
max_position_embeddings=2048,
|
||||
initializer_range=0.02,
|
||||
rms_norm_eps=1e-6,
|
||||
use_cache=True,
|
||||
pad_token_id=None,
|
||||
bos_token_id=1,
|
||||
eos_token_id=2,
|
||||
pretraining_tp=1,
|
||||
tie_word_embeddings=False,
|
||||
rope_theta=10000.0,
|
||||
rope_scaling=None,
|
||||
attention_bias=False,
|
||||
attention_dropout=0.0,
|
||||
vocab_size: Optional[int] = 32000,
|
||||
hidden_size: Optional[int] = 4096,
|
||||
intermediate_size: Optional[int] = 11008,
|
||||
num_hidden_layers: Optional[int] = 32,
|
||||
num_attention_heads: Optional[int] = 32,
|
||||
num_key_value_heads: Optional[int] = None,
|
||||
hidden_act: Optional[str] = "silu",
|
||||
max_position_embeddings: Optional[int] = 2048,
|
||||
initializer_range: Optional[float] = 0.02,
|
||||
rms_norm_eps: Optional[int] = 1e-6,
|
||||
use_cache: Optional[bool] = True,
|
||||
pad_token_id: Optional[int] = None,
|
||||
bos_token_id: Optional[int] = 1,
|
||||
eos_token_id: Optional[int] = 2,
|
||||
pretraining_tp: Optional[int] = 1,
|
||||
tie_word_embeddings: Optional[bool] = False,
|
||||
rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
|
||||
attention_bias: Optional[bool] = False,
|
||||
attention_dropout: Optional[float] = 0.0,
|
||||
mlp_bias=True,
|
||||
head_dim=None,
|
||||
head_dim: Optional[int] = None,
|
||||
new_param=0,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
@ -196,15 +190,24 @@ class MyNewModelConfig(PreTrainedConfig):
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
self.pretraining_tp = pretraining_tp
|
||||
self.use_cache = use_cache
|
||||
self.rope_theta = rope_theta
|
||||
self.rope_scaling = rope_scaling
|
||||
self.attention_bias = attention_bias
|
||||
self.attention_dropout = attention_dropout
|
||||
self.mlp_bias = mlp_bias
|
||||
self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
|
||||
# Try to set `rope_scaling` if available, otherwise use `rope_parameters`
|
||||
rope_scaling = kwargs.pop("rope_scaling", None)
|
||||
self.rope_parameters = rope_scaling or rope_parameters
|
||||
|
||||
# Validate the correctness of rotary position embeddings parameters
|
||||
# BC: if there is a 'type' field, copy it it to 'rope_type'.
|
||||
if self.rope_scaling is not None and "type" in self.rope_scaling:
|
||||
self.rope_scaling["rope_type"] = self.rope_scaling["type"]
|
||||
rope_theta = kwargs.get("rope_theta", 10000.0)
|
||||
standardize_rope_params(self, rope_theta=rope_theta)
|
||||
rope_config_validation(self)
|
||||
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
self.new_param = new_param
|
||||
|
||||
@ -4,9 +4,10 @@
|
||||
# the file from the modular. If any change should be done, please apply the change to the
|
||||
# modular_my_new_model2.py file directly. One of our CI enforces this.
|
||||
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||
from typing import Optional
|
||||
|
||||
from ...configuration_utils import PreTrainedConfig
|
||||
from ...modeling_rope_utils import rope_config_validation
|
||||
from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params
|
||||
|
||||
|
||||
class MyNewModel2Config(PreTrainedConfig):
|
||||
@ -51,28 +52,27 @@ class MyNewModel2Config(PreTrainedConfig):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=32000,
|
||||
hidden_size=4096,
|
||||
intermediate_size=11008,
|
||||
num_hidden_layers=32,
|
||||
num_attention_heads=32,
|
||||
num_key_value_heads=None,
|
||||
hidden_act="silu",
|
||||
max_position_embeddings=2048,
|
||||
initializer_range=0.02,
|
||||
rms_norm_eps=1e-6,
|
||||
use_cache=True,
|
||||
pad_token_id=None,
|
||||
bos_token_id=1,
|
||||
eos_token_id=2,
|
||||
pretraining_tp=1,
|
||||
tie_word_embeddings=False,
|
||||
rope_theta=10000.0,
|
||||
rope_scaling=None,
|
||||
attention_bias=False,
|
||||
attention_dropout=0.0,
|
||||
mlp_bias=False,
|
||||
head_dim=None,
|
||||
vocab_size: Optional[int] = 32000,
|
||||
hidden_size: Optional[int] = 4096,
|
||||
intermediate_size: Optional[int] = 11008,
|
||||
num_hidden_layers: Optional[int] = 32,
|
||||
num_attention_heads: Optional[int] = 32,
|
||||
num_key_value_heads: Optional[int] = None,
|
||||
hidden_act: Optional[str] = "silu",
|
||||
max_position_embeddings: Optional[int] = 2048,
|
||||
initializer_range: Optional[float] = 0.02,
|
||||
rms_norm_eps: Optional[int] = 1e-6,
|
||||
use_cache: Optional[bool] = True,
|
||||
pad_token_id: Optional[int] = None,
|
||||
bos_token_id: Optional[int] = 1,
|
||||
eos_token_id: Optional[int] = 2,
|
||||
pretraining_tp: Optional[int] = 1,
|
||||
tie_word_embeddings: Optional[bool] = False,
|
||||
rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
|
||||
attention_bias: Optional[bool] = False,
|
||||
attention_dropout: Optional[float] = 0.0,
|
||||
mlp_bias: Optional[bool] = False,
|
||||
head_dim: Optional[int] = None,
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
@ -92,16 +92,17 @@ class MyNewModel2Config(PreTrainedConfig):
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
self.pretraining_tp = pretraining_tp
|
||||
self.use_cache = use_cache
|
||||
self.rope_theta = rope_theta
|
||||
self.rope_scaling = rope_scaling
|
||||
self.attention_bias = attention_bias
|
||||
self.attention_dropout = attention_dropout
|
||||
self.mlp_bias = mlp_bias
|
||||
self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
|
||||
# Try to set `rope_scaling` if available, otherwise use `rope_parameters`
|
||||
rope_scaling = kwargs.pop("rope_scaling", None)
|
||||
self.rope_parameters = rope_scaling or rope_parameters
|
||||
|
||||
# Validate the correctness of rotary position embeddings parameters
|
||||
# BC: if there is a 'type' field, copy it it to 'rope_type'.
|
||||
if self.rope_scaling is not None and "type" in self.rope_scaling:
|
||||
self.rope_scaling["rope_type"] = self.rope_scaling["type"]
|
||||
rope_theta = kwargs.get("rope_theta", 10000.0)
|
||||
standardize_rope_params(self, rope_theta=rope_theta)
|
||||
rope_config_validation(self)
|
||||
|
||||
super().__init__(
|
||||
|
||||
@ -156,8 +156,8 @@ class MyNewModel2Attention(nn.Module):
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
position_embeddings: tuple[torch.Tensor, torch.Tensor],
|
||||
attention_mask: Optional[torch.Tensor],
|
||||
position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
past_key_values: Optional[Cache] = None,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
**kwargs: Unpack[TransformersKwargs],
|
||||
@ -207,7 +207,6 @@ class MyNewModel2DecoderLayer(GradientCheckpointingLayer):
|
||||
self.mlp = MyNewModel2MLP(config)
|
||||
self.input_layernorm = MyNewModel2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||
self.post_attention_layernorm = MyNewModel2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||
self.attention_type = config.layer_types[layer_idx]
|
||||
|
||||
def forward(
|
||||
self,
|
||||
@ -217,7 +216,7 @@ class MyNewModel2DecoderLayer(GradientCheckpointingLayer):
|
||||
past_key_values: Optional[Cache] = None,
|
||||
use_cache: Optional[bool] = False,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
|
||||
position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
|
||||
**kwargs: Unpack[TransformersKwargs],
|
||||
) -> torch.Tensor:
|
||||
residual = hidden_states
|
||||
|
||||
@ -125,15 +125,23 @@ def token_type_ids_mask_function(
|
||||
# If it's 1 for both query and key/value, we are in an image block
|
||||
# NOTE: static cache shape goes beyond input seq length, while token_type_ids.shape[1] == input seq length
|
||||
# Since vmap doesn't support `if statement` we workaround it with `torch.where`
|
||||
safe_idx = torch.where(kv_idx < token_type_ids.shape[1], kv_idx, 0)
|
||||
token_type_ids_at_kv_idx = token_type_ids[batch_idx, safe_idx]
|
||||
safe_q_idx = torch.where(q_idx < token_type_ids.shape[1], q_idx, 0)
|
||||
safe_kv_idx = torch.where(kv_idx < token_type_ids.shape[1], kv_idx, 0)
|
||||
|
||||
token_type_ids_at_q_idx = token_type_ids[batch_idx, safe_q_idx]
|
||||
token_type_ids_at_q_idx = torch.where(q_idx < token_type_ids.shape[1], token_type_ids_at_q_idx, 0)
|
||||
|
||||
token_type_ids_at_kv_idx = token_type_ids[batch_idx, safe_kv_idx]
|
||||
token_type_ids_at_kv_idx = torch.where(kv_idx < token_type_ids.shape[1], token_type_ids_at_kv_idx, 0)
|
||||
|
||||
image_group_ids_at_kv_idx = image_group_ids[batch_idx, safe_idx]
|
||||
image_group_ids_at_q_idx = image_group_ids[batch_idx, safe_q_idx]
|
||||
image_group_ids_at_q_idx = torch.where(q_idx < image_group_ids.shape[1], image_group_ids_at_q_idx, -1)
|
||||
|
||||
image_group_ids_at_kv_idx = image_group_ids[batch_idx, safe_kv_idx]
|
||||
image_group_ids_at_kv_idx = torch.where(kv_idx < image_group_ids.shape[1], image_group_ids_at_kv_idx, -1)
|
||||
|
||||
is_image_block = (token_type_ids[batch_idx, q_idx] == 1) & (token_type_ids_at_kv_idx == 1)
|
||||
same_image_block = image_group_ids[batch_idx, q_idx] == image_group_ids_at_kv_idx
|
||||
is_image_block = (token_type_ids_at_q_idx == 1) & (token_type_ids_at_kv_idx == 1)
|
||||
same_image_block = image_group_ids_at_q_idx == image_group_ids_at_kv_idx
|
||||
|
||||
# This is bidirectional attention whenever we are dealing with image tokens
|
||||
return is_image_block & same_image_block
|
||||
|
||||
@ -51,8 +51,8 @@ class SuperRotaryEmbedding(nn.Module):
|
||||
def __init__(self, config: SuperConfig, device=None):
|
||||
super().__init__()
|
||||
# BC: "rope_type" was originally "type"
|
||||
if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
|
||||
self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
|
||||
if hasattr(config, "rope_parameters") and isinstance(config.rope_parameters, dict):
|
||||
self.rope_type = config.rope_parameters.get("rope_type", config.rope_parameters.get("type"))
|
||||
else:
|
||||
self.rope_type = "default"
|
||||
self.max_seq_len_cached = config.max_position_embeddings
|
||||
@ -258,7 +258,7 @@ class SuperDecoderLayer(GradientCheckpointingLayer):
|
||||
past_key_values: Optional[Cache] = None,
|
||||
use_cache: Optional[bool] = False,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
|
||||
position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
|
||||
**kwargs: Unpack[TransformersKwargs],
|
||||
) -> torch.Tensor:
|
||||
residual = hidden_states
|
||||
|
||||
@ -389,7 +389,7 @@ python run_speech_recognition_seq2seq.py \
|
||||
--fp16 \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--predict_with_generate \
|
||||
--predict_with_generate
|
||||
```
|
||||
On a single V100, training should take approximately 8 hours, with a final cross-entropy loss of **1e-4** and word error rate of **32.6%**.
|
||||
|
||||
@ -428,7 +428,7 @@ torchrun \
|
||||
--fp16 \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--predict_with_generate \
|
||||
--predict_with_generate
|
||||
```
|
||||
On two V100s, training should take approximately 4 hours, with a final cross-entropy loss of **1e-4** and word error rate of **32.6%**.
|
||||
|
||||
|
||||
@ -616,8 +616,7 @@ def main():
|
||||
output_predictions_file = os.path.join(training_args.output_dir, "predictions.txt")
|
||||
if trainer.is_world_process_zero():
|
||||
with open(output_predictions_file, "w") as writer:
|
||||
for prediction in true_predictions:
|
||||
writer.write(" ".join(prediction) + "\n")
|
||||
writer.writelines(" ".join(prediction) + "\n" for prediction in true_predictions)
|
||||
|
||||
kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "token-classification"}
|
||||
if data_args.dataset_name is not None:
|
||||
|
||||
@ -21,12 +21,12 @@ A useful guide for English-Chinese translation of Hugging Face documentation
|
||||
|
||||
Dictionary
|
||||
|
||||
Hugging Face: 抱抱脸
|
||||
Hugging Face: Hugging Face(不翻译)
|
||||
token: 词符(并用括号标注原英文)
|
||||
tokenize: 词符化(并用括号标注原英文)
|
||||
tokenizer: 词符化器(并用括号标注原英文)
|
||||
transformer: transformer(不翻译)
|
||||
pipeline: 流水线
|
||||
pipeline: pipeline(不翻译)
|
||||
API: API (不翻译)
|
||||
inference: 推理
|
||||
Trainer: 训练器。当作为类名出现时不翻译。
|
||||
@ -107,9 +107,9 @@ checkpoint: 检查点
|
||||
- [用 DistilBERT 做问答](https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
|
||||
- [用 T5 做翻译](https://huggingface.co/google-t5/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)
|
||||
|
||||
**[Write With Transformer](https://transformer.huggingface.co)**,由抱抱脸团队打造,是一个文本生成的官方 demo。
|
||||
**[Write With Transformer](https://transformer.huggingface.co)**,由 Hugging Face 团队打造,是一个文本生成的官方 demo。
|
||||
|
||||
## 如果你在寻找由抱抱脸团队提供的定制化支持服务
|
||||
## 如果你在寻找由 Hugging Face 团队提供的定制化支持服务
|
||||
|
||||
<a target="_blank" href="https://huggingface.co/support">
|
||||
<img alt="HuggingFace Expert Acceleration Program" src="https://huggingface.co/front/thumbnails/support.png" style="max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);">
|
||||
@ -117,25 +117,25 @@ checkpoint: 检查点
|
||||
|
||||
## 快速上手
|
||||
|
||||
我们为快速使用模型提供了 `pipeline` (流水线)API。流水线聚合了预训练模型和对应的文本预处理。下面是一个快速使用流水线去判断正负面情绪的例子:
|
||||
我们为快速使用模型提供了 `pipeline` API。Pipeline 聚合了预训练模型和对应的文本预处理。下面是一个快速使用 pipeline 去判断正负面情绪的例子:
|
||||
|
||||
```python
|
||||
>>> from transformers import pipeline
|
||||
|
||||
# 使用情绪分析流水线
|
||||
# 使用情绪分析 pipeline
|
||||
>>> classifier = pipeline('sentiment-analysis')
|
||||
>>> classifier('We are very happy to introduce pipeline to the transformers repository.')
|
||||
[{'label': 'POSITIVE', 'score': 0.9996980428695679}]
|
||||
```
|
||||
|
||||
第二行代码下载并缓存了流水线使用的预训练模型,而第三行代码则在给定的文本上进行了评估。这里的答案“正面” (positive) 具有 99 的置信度。
|
||||
第二行代码下载并缓存了 pipeline 使用的预训练模型,而第三行代码则在给定的文本上进行了评估。这里的答案"正面" (positive) 具有 99 的置信度。
|
||||
|
||||
许多的 NLP 任务都有开箱即用的预训练流水线。比如说,我们可以轻松的从给定文本中抽取问题答案:
|
||||
许多的 NLP 任务都有开箱即用的预训练 `pipeline`。比如说,我们可以轻松的从给定文本中抽取问题答案:
|
||||
|
||||
``` python
|
||||
>>> from transformers import pipeline
|
||||
|
||||
# 使用问答流水线
|
||||
# 使用问答 pipeline
|
||||
>>> question_answerer = pipeline('question-answering')
|
||||
>>> question_answerer({
|
||||
... 'question': 'What is the name of the repository ?',
|
||||
@ -145,7 +145,7 @@ checkpoint: 检查点
|
||||
|
||||
```
|
||||
|
||||
除了给出答案,预训练模型还给出了对应的置信度分数、答案在词符化 (tokenized) 后的文本中开始和结束的位置。你可以从[这个教程](https://huggingface.co/docs/transformers/task_summary)了解更多流水线API支持的任务。
|
||||
除了给出答案,预训练模型还给出了对应的置信度分数、答案在词符化 (tokenized) 后的文本中开始和结束的位置。你可以从[这个教程](https://huggingface.co/docs/transformers/task_summary)了解更多 `pipeline` API 支持的任务。
|
||||
|
||||
要在你的任务上下载和使用任意预训练模型也很简单,只需三行代码。这里是 PyTorch 版的示例:
|
||||
```python
|
||||
@ -193,11 +193,11 @@ checkpoint: 检查点
|
||||
1. 为你的需求轻松定制专属模型和用例:
|
||||
- 我们为每种模型架构提供了多个用例来复现原论文结果
|
||||
- 模型内部结构保持透明一致
|
||||
- 模型文件可单独使用,方便魔改和快速实验
|
||||
- 模型文件可单独使用,方便修改和快速实验
|
||||
|
||||
## 什么情况下我不该用 transformers?
|
||||
|
||||
- 本库并不是模块化的神经网络工具箱。模型文件中的代码特意呈若璞玉,未经额外抽象封装,以便研究人员快速迭代魔改而不致溺于抽象和文件跳转之中。
|
||||
- 本库并不是模块化的神经网络工具箱。模型文件中的代码特意呈若璞玉,未经额外抽象封装,以便研究人员快速迭代修改而不致溺于抽象和文件跳转之中。
|
||||
- `Trainer` API 并非兼容任何模型,只为本库之模型优化。若是在寻找适用于通用机器学习的训练循环实现,请另觅他库。
|
||||
- 尽管我们已尽力而为,[examples 目录](https://github.com/huggingface/transformers/tree/main/examples)中的脚本也仅为用例而已。对于你的特定问题,它们并不一定开箱即用,可能需要改几行代码以适之。
|
||||
|
||||
|
||||
@ -30,10 +30,10 @@ You can open any page of the documentation as a notebook in Colab (there is a bu
|
||||
|
||||
| Notebook | Description | | | |
|
||||
|:----------|:-------------|:-------------|:------------|------:|
|
||||
| [Quicktour of the library](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/quicktour.ipynb) | A presentation of the various APIs in Transformers |[](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/quicktour.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/en/transformers_doc/quicktour.ipynb)|
|
||||
| [Summary of the tasks](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/task_summary.ipynb) | How to run the models of the Transformers library task by task |[](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/task_summary.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/task_summary.ipynb)|
|
||||
| [Preprocessing data](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/preprocessing.ipynb) | How to use a tokenizer to preprocess your data |[](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/preprocessing.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/preprocessing.ipynb)|
|
||||
| [Fine-tuning a pretrained model](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb) | How to use the Trainer to fine-tune a pretrained model |[](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb)|
|
||||
| [Quicktour of the library](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/quicktour.ipynb) | A presentation of the various APIs in Transformers |[](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/quicktour.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/en/transformers_doc/quicktour.ipynb)| |
|
||||
| [Summary of the tasks](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/task_summary.ipynb) | How to run the models of the Transformers library task by task |[](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/task_summary.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/task_summary.ipynb)| |
|
||||
| [Preprocessing data](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/preprocessing.ipynb) | How to use a tokenizer to preprocess your data |[](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/preprocessing.ipynb) | [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/preprocessing.ipynb)||
|
||||
| [Fine-tuning a pretrained model](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb) | How to use the Trainer to fine-tune a pretrained model |[](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb)| |
|
||||
| [Summary of the tokenizers](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb) | The differences between the tokenizers algorithm |[](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb)|[](http://oneclickamd.ai/github/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb )|
|
||||
| [Multilingual models](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb) | How to use the multilingual models of the library |[](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb)|[](http://oneclickamd.ai/github/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb)|
|
||||
|
||||
@ -45,16 +45,16 @@ You can open any page of the documentation as a notebook in Colab (there is a bu
|
||||
|:----------|:-------------|:-------------|:-------------|------:|
|
||||
| [Train your tokenizer](https://github.com/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb) | How to train and use your very own tokenizer |[](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)|[](http://oneclickamd.ai/github/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)|
|
||||
| [Train your language model](https://github.com/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb) | How to easily start using transformers |[](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb)|[](http://oneclickamd.ai/github/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb)|
|
||||
| [How to fine-tune a model on text classification](https://github.com/huggingface/notebooks/blob/main/examples/text_classification.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on any GLUE task. | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb)|
|
||||
| [How to fine-tune a model on language modeling](https://github.com/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on a causal or masked LM task. | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)|[](http://oneclickamd.ai/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)|
|
||||
| [How to fine-tune a model on token classification](https://github.com/huggingface/notebooks/blob/main/examples/token_classification.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on a token classification task (NER, PoS). | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb)|
|
||||
| [How to fine-tune a model on question answering](https://github.com/huggingface/notebooks/blob/main/examples/question_answering.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on SQUAD. | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb)|
|
||||
| [How to fine-tune a model on multiple choice](https://github.com/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on SWAG. | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)|
|
||||
| [How to fine-tune a model on translation](https://github.com/huggingface/notebooks/blob/main/examples/translation.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on WMT. | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/translation.ipynb)|
|
||||
| [How to fine-tune a model on summarization](https://github.com/huggingface/notebooks/blob/main/examples/summarization.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on XSUM. | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/summarization.ipynb)|
|
||||
| [How to train a language model from scratch](https://github.com/huggingface/blog/blob/main/notebooks/01_how_to_train.ipynb)| Highlight all the steps to effectively train Transformer model on custom data | [](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/01_how_to_train.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/blog/blob/main/notebooks/01_how_to_train.ipynb)|
|
||||
| [How to generate text](https://github.com/huggingface/blog/blob/main/notebooks/02_how_to_generate.ipynb)| How to use different decoding methods for language generation with transformers | [](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/02_how_to_generate.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/blog/blob/main/notebooks/02_how_to_generate.ipynb)|
|
||||
| [Reformer](https://github.com/huggingface/blog/blob/main/notebooks/03_reformer.ipynb)| How Reformer pushes the limits of language modeling | [](https://colab.research.google.com/github/patrickvonplaten/blog/blob/main/notebooks/03_reformer.ipynb)| [](https://studiolab.sagemaker.aws/import/github/patrickvonplaten/blog/blob/main/notebooks/03_reformer.ipynb)|
|
||||
| [How to fine-tune a model on text classification](https://github.com/huggingface/notebooks/blob/main/examples/text_classification.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on any GLUE task. | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb)| |
|
||||
| [How to fine-tune a model on language modeling](https://github.com/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on a causal or masked LM task. | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)| [](http://oneclickamd.ai/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)|
|
||||
| [How to fine-tune a model on token classification](https://github.com/huggingface/notebooks/blob/main/examples/token_classification.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on a token classification task (NER, PoS). | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb)| |
|
||||
| [How to fine-tune a model on question answering](https://github.com/huggingface/notebooks/blob/main/examples/question_answering.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on SQUAD. | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb)| |
|
||||
| [How to fine-tune a model on multiple choice](https://github.com/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on SWAG. | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)| |
|
||||
| [How to fine-tune a model on translation](https://github.com/huggingface/notebooks/blob/main/examples/translation.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on WMT. | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/translation.ipynb)| |
|
||||
| [How to fine-tune a model on summarization](https://github.com/huggingface/notebooks/blob/main/examples/summarization.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on XSUM. | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/summarization.ipynb)| |
|
||||
| [How to train a language model from scratch](https://github.com/huggingface/blog/blob/main/notebooks/01_how_to_train.ipynb)| Highlight all the steps to effectively train Transformer model on custom data | [](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/01_how_to_train.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/blog/blob/main/notebooks/01_how_to_train.ipynb)| |
|
||||
| [How to generate text](https://github.com/huggingface/blog/blob/main/notebooks/02_how_to_generate.ipynb)| How to use different decoding methods for language generation with transformers | [](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/02_how_to_generate.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/blog/blob/main/notebooks/02_how_to_generate.ipynb)| |
|
||||
| [Reformer](https://github.com/huggingface/blog/blob/main/notebooks/03_reformer.ipynb)| How Reformer pushes the limits of language modeling | [](https://colab.research.google.com/github/patrickvonplaten/blog/blob/main/notebooks/03_reformer.ipynb)| [](https://studiolab.sagemaker.aws/import/github/patrickvonplaten/blog/blob/main/notebooks/03_reformer.ipynb)| |
|
||||
|
||||
#### Computer Vision[[pytorch-cv]]
|
||||
|
||||
@ -108,6 +108,6 @@ You can open any page of the documentation as a notebook in Colab (there is a bu
|
||||
| [How to fine-tune a model on text classification with ONNX Runtime](https://github.com/huggingface/notebooks/blob/main/examples/text_classification_ort.ipynb)| Show how to preprocess the data and fine-tune a model on any GLUE task using [ONNX Runtime](https://github.com/microsoft/onnxruntime). | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_ort.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification_ort.ipynb)|
|
||||
| [How to fine-tune a model on summarization with ONNX Runtime](https://github.com/huggingface/notebooks/blob/main/examples/summarization_ort.ipynb)| Show how to preprocess the data and fine-tune a model on XSUM using [ONNX Runtime](https://github.com/microsoft/onnxruntime). | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization_ort.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/summarization_ort.ipynb)|
|
||||
|
||||
## Community notebooks:
|
||||
## Community notebooks
|
||||
|
||||
More notebooks developed by the community are available [here](https://hf.co/docs/transformers/community#community-notebooks).
|
||||
|
||||
@ -32,7 +32,7 @@ line-length = 119
|
||||
ignore = ["C901", "E501", "E741", "F402", "F823", "SIM1", "SIM300", "SIM212", "SIM905", "UP009", "UP015", "UP031", "UP028", "UP004", "UP045", "UP007"]
|
||||
# RUF013: Checks for the use of implicit Optional
|
||||
# in type annotations when the default parameter value is None.
|
||||
select = ["C", "E", "F", "I", "W", "RUF013", "PERF102", "PLC1802", "PLC0208", "SIM", "UP", "PIE794"]
|
||||
select = ["C", "E", "F", "I", "W", "RUF013", "PERF102", "PLC1802", "PLC0208", "SIM", "UP", "PIE794", "FURB"]
|
||||
extend-safe-fixes = ["UP006"]
|
||||
|
||||
# Ignore import violations in all `__init__.py` files.
|
||||
|
||||
setup.py
@ -67,7 +67,6 @@ To create the package for pypi.
|
||||
9. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
@ -118,6 +117,7 @@ _deps = [
|
||||
"importlib_metadata",
|
||||
"ipadic>=1.0.0,<2.0",
|
||||
"jinja2>=3.1.0",
|
||||
"jmespath>=1.0.1",
|
||||
"kenlm",
|
||||
"kernels>=0.10.2,<0.11",
|
||||
"librosa",
|
||||
@ -261,10 +261,7 @@ extras["torch"] = deps_list("torch", "accelerate")
|
||||
extras["accelerate"] = deps_list("accelerate")
|
||||
extras["hf_xet"] = deps_list("hf_xet")
|
||||
|
||||
if os.name == "nt": # windows
|
||||
extras["retrieval"] = deps_list("datasets") # faiss is not supported on windows
|
||||
else:
|
||||
extras["retrieval"] = deps_list("faiss-cpu", "datasets")
|
||||
extras["retrieval"] = deps_list("faiss-cpu", "datasets")
|
||||
|
||||
extras["tokenizers"] = deps_list("tokenizers")
|
||||
extras["ftfy"] = deps_list("ftfy")
|
||||
@ -298,7 +295,7 @@ extras["num2words"] = deps_list("num2words")
|
||||
extras["sentencepiece"] = deps_list("sentencepiece", "protobuf")
|
||||
extras["tiktoken"] = deps_list("tiktoken", "blobfile")
|
||||
extras["mistral-common"] = deps_list("mistral-common[opencv]")
|
||||
extras["chat_template"] = deps_list("jinja2")
|
||||
extras["chat_template"] = deps_list("jinja2", "jmespath")
|
||||
extras["testing"] = (
|
||||
deps_list(
|
||||
"pytest",
|
||||
|
||||
@ -129,8 +129,6 @@ _import_structure = {
|
||||
],
|
||||
"loss": [],
|
||||
"modelcard": ["ModelCard"],
|
||||
# Models
|
||||
"onnx": [],
|
||||
"pipelines": [
|
||||
"AudioClassificationPipeline",
|
||||
"AutomaticSpeechRecognitionPipeline",
|
||||
@ -440,7 +438,7 @@ else:
|
||||
_import_structure["modeling_flash_attention_utils"] = []
|
||||
_import_structure["modeling_layers"] = ["GradientCheckpointingLayer"]
|
||||
_import_structure["modeling_outputs"] = []
|
||||
_import_structure["modeling_rope_utils"] = ["ROPE_INIT_FUNCTIONS", "dynamic_rope_update"]
|
||||
_import_structure["modeling_rope_utils"] = ["ROPE_INIT_FUNCTIONS", "dynamic_rope_update", "RopeParameters"]
|
||||
_import_structure["modeling_utils"] = ["PreTrainedModel", "AttentionInterface"]
|
||||
_import_structure["masking_utils"] = ["AttentionMaskInterface"]
|
||||
_import_structure["optimization"] = [
|
||||
@ -619,6 +617,7 @@ if TYPE_CHECKING:
|
||||
from .modelcard import ModelCard as ModelCard
|
||||
from .modeling_layers import GradientCheckpointingLayer as GradientCheckpointingLayer
|
||||
from .modeling_rope_utils import ROPE_INIT_FUNCTIONS as ROPE_INIT_FUNCTIONS
|
||||
from .modeling_rope_utils import RopeParameters as RopeParameters
|
||||
from .modeling_rope_utils import dynamic_rope_update as dynamic_rope_update
|
||||
from .modeling_utils import AttentionInterface as AttentionInterface
|
||||
from .modeling_utils import PreTrainedModel as PreTrainedModel
|
||||
|
||||
@ -249,7 +249,7 @@ class Chat:
|
||||
|
||||
# Generation settings
|
||||
config = load_generation_config(generation_config)
|
||||
config.update(**{"do_sample": True, "max_new_tokens": 256}) # some default values
|
||||
config.update(do_sample=True, max_new_tokens=256) # some default values
|
||||
config.update(**parse_generate_flags(generate_flags))
|
||||
self.config = config
|
||||
|
||||
|
||||
@ -167,8 +167,6 @@ class PreTrainedConfig(PushToHubMixin):
|
||||
|
||||
> PyTorch specific parameters
|
||||
|
||||
torchscript (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not the model should be used with Torchscript.
|
||||
tie_word_embeddings (`bool`, *optional*, defaults to `True`):
|
||||
Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
|
||||
model has a output word embedding layer.
|
||||
@ -206,7 +204,6 @@ class PreTrainedConfig(PushToHubMixin):
|
||||
output_hidden_states: bool = False,
|
||||
output_attentions: bool = False,
|
||||
return_dict: bool = True,
|
||||
torchscript: bool = False,
|
||||
dtype: Optional[Union[str, "torch.dtype"]] = None,
|
||||
# Common arguments
|
||||
tie_word_embeddings: bool = True,
|
||||
@ -268,7 +265,6 @@ class PreTrainedConfig(PushToHubMixin):
|
||||
# Attributes common for all models
|
||||
self.return_dict = return_dict
|
||||
self.output_hidden_states = output_hidden_states
|
||||
self.torchscript = torchscript
|
||||
self.dtype = dtype
|
||||
self._output_attentions = output_attentions # has public property
|
||||
|
||||
@ -373,8 +369,7 @@ class PreTrainedConfig(PushToHubMixin):
|
||||
"""
|
||||
`bool`: Whether or not return [`~utils.ModelOutput`] instead of tuples.
|
||||
"""
|
||||
# If torchscript is set, force `return_dict=False` to avoid jit errors
|
||||
return self.return_dict and not self.torchscript
|
||||
return self.return_dict
|
||||
|
||||
@property
|
||||
def num_labels(self) -> int:
|
||||
@ -395,7 +390,7 @@ class PreTrainedConfig(PushToHubMixin):
|
||||
return self._attn_implementation_internal
|
||||
|
||||
@_attn_implementation.setter
|
||||
def _attn_implementation(self, value: Optional[Union[str, dict]]):
|
||||
def _attn_implementation(self, value: str | dict | None):
|
||||
"""We set it recursively on the sub-configs as well"""
|
||||
# Set if for current config
|
||||
current_attn = getattr(self, "_attn_implementation", None)
|
||||
@ -422,7 +417,15 @@ class PreTrainedConfig(PushToHubMixin):
|
||||
logger.warning_once("`torch_dtype` is deprecated! Use `dtype` instead!")
|
||||
self.dtype = value
|
||||
|
||||
def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
|
||||
@property
|
||||
def rope_scaling(self):
|
||||
return self.rope_parameters
|
||||
|
||||
@rope_scaling.setter
|
||||
def rope_scaling(self, value):
|
||||
self.rope_parameters = value
|
||||
|
||||
def save_pretrained(self, save_directory: str | os.PathLike, push_to_hub: bool = False, **kwargs):
|
||||
"""
|
||||
Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the
|
||||
[`~PreTrainedConfig.from_pretrained`] class method.
|
||||
@ -487,11 +490,11 @@ class PreTrainedConfig(PushToHubMixin):
|
||||
@classmethod
|
||||
def from_pretrained(
|
||||
cls: type[SpecificPreTrainedConfigType],
|
||||
pretrained_model_name_or_path: Union[str, os.PathLike],
|
||||
cache_dir: Optional[Union[str, os.PathLike]] = None,
|
||||
pretrained_model_name_or_path: str | os.PathLike,
|
||||
cache_dir: str | os.PathLike | None = None,
|
||||
force_download: bool = False,
|
||||
local_files_only: bool = False,
|
||||
token: Optional[Union[str, bool]] = None,
|
||||
token: str | bool | None = None,
|
||||
revision: str = "main",
|
||||
**kwargs,
|
||||
) -> SpecificPreTrainedConfigType:
|
||||
@ -571,8 +574,6 @@ class PreTrainedConfig(PushToHubMixin):
|
||||
kwargs["force_download"] = force_download
|
||||
kwargs["local_files_only"] = local_files_only
|
||||
kwargs["revision"] = revision
|
||||
if token is not None:
|
||||
kwargs["token"] = token
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
if cls.base_config_key and cls.base_config_key in config_dict:
|
||||
@ -596,7 +597,7 @@ class PreTrainedConfig(PushToHubMixin):
|
||||
|
||||
@classmethod
|
||||
def get_config_dict(
|
||||
cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
|
||||
cls, pretrained_model_name_or_path: str | os.PathLike, **kwargs
|
||||
) -> tuple[dict[str, Any], dict[str, Any]]:
|
||||
"""
|
||||
From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a
|
||||
@ -629,7 +630,7 @@ class PreTrainedConfig(PushToHubMixin):
|
||||
|
||||
@classmethod
|
||||
def _get_config_dict(
|
||||
cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
|
||||
cls, pretrained_model_name_or_path: str | os.PathLike, **kwargs
|
||||
) -> tuple[dict[str, Any], dict[str, Any]]:
|
||||
cache_dir = kwargs.pop("cache_dir", None)
|
||||
force_download = kwargs.pop("force_download", False)
|
||||
@ -792,7 +793,7 @@ class PreTrainedConfig(PushToHubMixin):
|
||||
|
||||
@classmethod
|
||||
def from_json_file(
|
||||
cls: type[SpecificPreTrainedConfigType], json_file: Union[str, os.PathLike]
|
||||
cls: type[SpecificPreTrainedConfigType], json_file: str | os.PathLike
|
||||
) -> SpecificPreTrainedConfigType:
|
||||
"""
|
||||
Instantiates a [`PreTrainedConfig`] from the path to a JSON file of parameters.
|
||||
@ -809,7 +810,7 @@ class PreTrainedConfig(PushToHubMixin):
|
||||
return cls(**config_dict)
|
||||
|
||||
@classmethod
|
||||
def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]):
|
||||
def _dict_from_json_file(cls, json_file: str | os.PathLike):
|
||||
with open(json_file, encoding="utf-8") as reader:
|
||||
text = reader.read()
|
||||
return json.loads(text)
|
||||
@ -934,7 +935,7 @@ class PreTrainedConfig(PushToHubMixin):
|
||||
config_dict = self.to_dict()
|
||||
return json.dumps(config_dict, indent=2, sort_keys=True) + "\n"
|
||||
|
||||
def to_json_file(self, json_file_path: Union[str, os.PathLike], use_diff: bool = True):
|
||||
def to_json_file(self, json_file_path: str | os.PathLike, use_diff: bool = True):
|
||||
"""
|
||||
Save this instance to a JSON file.
|
||||
|
||||
|
||||
@ -15,6 +15,7 @@
|
||||
|
||||
import argparse
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import transformers
|
||||
|
||||
@ -69,6 +70,15 @@ def convert_slow_checkpoint_to_fast(tokenizer_name, checkpoint_name, dump_path,
|
||||
if "/" in checkpoint:
|
||||
checkpoint_directory, checkpoint_prefix_name = checkpoint.split("/")
|
||||
dump_path_full = os.path.join(dump_path, checkpoint_directory)
|
||||
|
||||
# Security check
|
||||
try:
|
||||
Path(dump_path_full).resolve().relative_to(Path(dump_path).resolve())
|
||||
except ValueError:
|
||||
raise ValueError(
|
||||
f"Invalid checkpoint path: '{checkpoint}' attempts to escape `dump_path`: {dump_path}"
|
||||
)
|
||||
|
||||
elif add_prefix:
|
||||
checkpoint_prefix_name = checkpoint
|
||||
dump_path_full = dump_path
|
||||
|
||||
@ -27,6 +27,7 @@ deps = {
|
||||
"importlib_metadata": "importlib_metadata",
|
||||
"ipadic": "ipadic>=1.0.0,<2.0",
|
||||
"jinja2": "jinja2>=3.1.0",
|
||||
"jmespath": "jmespath>=1.0.1",
|
||||
"kenlm": "kenlm",
|
||||
"kernels": "kernels>=0.10.2,<0.11",
|
||||
"librosa": "librosa",
|
||||
|
||||
@ -16,7 +16,7 @@ import copy
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Union
|
||||
from typing import Any
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -49,7 +49,7 @@ class DistributedConfig:
|
||||
return config
|
||||
|
||||
# Copied from transformers.utils.quantization_config.QuantizationConfigMixin.to_json_file
|
||||
def to_json_file(self, json_file_path: Union[str, os.PathLike]):
|
||||
def to_json_file(self, json_file_path: str | os.PathLike):
|
||||
"""
|
||||
Save this instance to a JSON file.
|
||||
Args:
|
||||
|
||||
@ -323,7 +323,7 @@ def get_cached_module_file(
|
||||
local_files_only: bool = False,
|
||||
repo_type: Optional[str] = None,
|
||||
_commit_hash: Optional[str] = None,
|
||||
**kwargs,
|
||||
**deprecated_kwargs,
|
||||
) -> str:
|
||||
"""
|
||||
Prepares Downloads a module from a local folder or a distant repo and returns its path inside the cached
|
||||
|
||||
@ -331,6 +331,7 @@ class FeatureExtractionMixin(PushToHubMixin):
|
||||
kwargs["force_download"] = force_download
|
||||
kwargs["local_files_only"] = local_files_only
|
||||
kwargs["revision"] = revision
|
||||
|
||||
if token is not None:
|
||||
kwargs["token"] = token
|
||||
|
||||
@ -408,6 +409,7 @@ class FeatureExtractionMixin(PushToHubMixin):
|
||||
token = kwargs.pop("token", None)
|
||||
local_files_only = kwargs.pop("local_files_only", False)
|
||||
revision = kwargs.pop("revision", None)
|
||||
|
||||
from_pipeline = kwargs.pop("_from_pipeline", None)
|
||||
from_auto_class = kwargs.pop("_from_auto", False)
|
||||
|
||||
|
||||
@ -31,7 +31,6 @@ class CacheAllocator(ABC):
|
||||
def allocate_blocks(self, n_blocks: int, request_id: str, free_blocks: deque[int]) -> Optional[int]:
|
||||
"""Allocates n_blocks for a given request_id. Returns the num of blocks allocated if successful and None
|
||||
otherwise."""
|
||||
pass
|
||||
|
||||
def free_blocks(self, request_id: str, free_blocks: deque[int]) -> None:
|
||||
"""Frees all blocks associated with a request_id."""
|
||||
@ -46,17 +45,14 @@ class CacheAllocator(ABC):
|
||||
@abstractmethod
|
||||
def get_read_indices(self, request_id: str, past_length: int, query_length: int) -> list[int]:
|
||||
"""Returns the physical indices of where to read request_id's cache in the cache tensor."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_write_indices(self, request_id: str, past_length: int, query_length: int) -> list[int]:
|
||||
"""Returns the physical indices of where to write request_id's cache in the cache tensor."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_seqlens_k(self, request_id: str, past_length: int, query_length: int) -> tuple[str, int]:
|
||||
"""Returns the attention type of the cache allocator and the key sequence length for the given request_id."""
|
||||
pass
|
||||
|
||||
|
||||
class FullAttentionCacheAllocator(CacheAllocator):
Some files were not shown because too many files have changed in this diff.