check 1

2025-10-20 17:13:56 +08:00 · 2025-10-17 06:57:10 +02:00 · 2025-10-17 06:28:55 +02:00 · 2025-10-17 03:03:25 +02:00 · 2025-10-16 21:32:04 +02:00 · 2025-10-16 21:16:53 +02:00
22 changed files with 1538 additions and 272 deletions
--- a/.github/workflows/check_failed_tests.yml
+++ b/.github/workflows/check_failed_tests.yml
@ -41,7 +41,10 @@ env:

 jobs:
  check_new_failures:
-    name: " "
+    name: "Find commits for new failing tests"
+    strategy:
+      matrix:
+        run_idx: [1, 2, 3]
    runs-on:
      group: aws-g5-4xlarge-cache
    container:
@ -118,6 +121,10 @@ jobs:
        run: |
          python3 utils/print_env.py

+      - name: Install pytest-flakefinder
+        if: ${{ env.process == 'true' }}
+        run: python3 -m pip install pytest-flakefinder
+
      - name: Show installed libraries and their versions
        working-directory: /transformers
        if: ${{ env.process == 'true' }}
@ -126,82 +133,104 @@ jobs:
      - name: Check failed tests
        working-directory: /transformers
        if: ${{ env.process == 'true' }}
-        run: python3 utils/check_bad_commit.py --start_commit ${{ inputs.start_sha }} --end_commit ${{ env.END_SHA }} --file ci_results_${{ inputs.job }}/new_failures.json --output_file new_failures_with_bad_commit.json
+        run: python3 utils/check_bad_commit.py --start_commit ${{ inputs.start_sha }} --end_commit ${{ env.END_SHA }} --file ci_results_${{ inputs.job }}/new_failures.json --output_file new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json

      - name: Show results
        working-directory: /transformers
        if: ${{ env.process == 'true' }}
        run: |
-          ls -l new_failures_with_bad_commit.json
-          cat new_failures_with_bad_commit.json
+          ls -l new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json
+          cat new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json

-      - name: Checkout back
-        working-directory: /transformers
-        if: ${{ env.process == 'true' }}
-        run: |
-          git checkout ${{ inputs.start_sha }}
-
-      - name: Process report
-        shell: bash
-        working-directory: /transformers
-        if: ${{ env.process == 'true' }}
-        env:
-          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
-          TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
-          JOB_NAME: ${{ inputs.job }}
-          REPORT_REPO_ID: ${{ inputs.report_repo_id }}
-        run: |
-          python3 utils/process_bad_commit_report.py
-
-      - name: Process report
-        shell: bash
-        working-directory: /transformers
-        if: ${{ env.process == 'true' }}
-        env:
-          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
-          TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
-          JOB_NAME: ${{ inputs.job }}
-          REPORT_REPO_ID: ${{ inputs.report_repo_id }}
-        run: |
-          {
-            echo 'REPORT_TEXT<<EOF'
-            python3 utils/process_bad_commit_report.py
-            echo EOF
-          } >> "$GITHUB_ENV"
-
-      - name: Prepare Slack report title
-        working-directory: /transformers
-        if: ${{ env.process == 'true' }}
-        run: |
-          pip install slack_sdk
-          echo "title=$(python3 -c 'import sys; sys.path.append("utils"); from utils.notification_service import job_to_test_map; ci_event = "${{ inputs.ci_event }}"; job = "${{ inputs.job }}"; test_name = job_to_test_map[job]; title = f"New failed tests of {ci_event}" + ":" + f" {test_name}"; print(title)')" >> $GITHUB_ENV
-
-      - name: Send processed report
-        if: ${{ env.process == 'true' && !endsWith(env.REPORT_TEXT, '{}') }}
-        uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
        with:
-          # Slack channel id, channel name, or user id to post message.
-          # See also: https://api.slack.com/methods/chat.postMessage#channels
-          channel-id: '#${{ inputs.slack_report_channel }}'
-          # For posting a rich message using Block Kit
-          payload: |
-            {
-              "blocks": [
-                {
-                  "type": "header",
-                  "text": {
-                    "type": "plain_text",
-                    "text": "${{ env.title }}"
-                  }
-                },
-                {
-                  "type": "section",
-                  "text": {
-                    "type": "mrkdwn",
-                    "text": "${{ env.REPORT_TEXT }}"
-                  }
-                }
-              ]
-            }
-        env:
-          SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+          name: new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}
+          path: /transformers/new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json
+
+  # Collects the per-run bad-commit reports uploaded by every `check_new_failures` matrix entry.
+  process_new_failures_with_commit_info:
+    name: "process bad commit reports"
+    needs: [check_new_failures]
+    runs-on:
+      group: aws-g5-4xlarge-cache
+    container:
+      image: ${{ inputs.docker }}
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Download bad commit reports
+        uses: actions/download-artifact@v4
+        with:
+          # Artifacts are uploaded as `new_failures_with_bad_commit_<job>_<run_idx>`. Keep the `_` before the glob
+          # so that a job whose name merely starts with `<job>` is not matched as well.
+          pattern: new_failures_with_bad_commit_${{ inputs.job }}_*
+          path: /transformers/new_failures_with_bad_commit_${{ inputs.job }}
+          # Put the files of all matched artifacts directly into `path` instead of one sub-directory per artifact.
+          merge-multiple: true
+
+      - name: Check files
+        working-directory: /transformers
+        run: |
+          ls -la /transformers
+          ls -la /transformers/new_failures_with_bad_commit_${{ inputs.job }}
+
+
+#      - name: Process report
+#        shell: bash
+#        working-directory: /transformers
+#        if: ${{ env.process == 'true' }}
+#        env:
+#          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+#          TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
+#          JOB_NAME: ${{ inputs.job }}
+#          REPORT_REPO_ID: ${{ inputs.report_repo_id }}
+#        run: |
+#          python3 utils/process_bad_commit_report.py
+
+#      - name: Process report
+#        shell: bash
+#        working-directory: /transformers
+#        if: ${{ env.process == 'true' }}
+#        env:
+#          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+#          TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
+#          JOB_NAME: ${{ inputs.job }}
+#          REPORT_REPO_ID: ${{ inputs.report_repo_id }}
+#        run: |
+#          {
+#            echo 'REPORT_TEXT<<EOF'
+#            python3 utils/process_bad_commit_report.py
+#            echo EOF
+#          } >> "$GITHUB_ENV"
+#
+#      - name: Prepare Slack report title
+#        working-directory: /transformers
+#        if: ${{ env.process == 'true' }}
+#        run: |
+#          pip install slack_sdk
+#          echo "title=$(python3 -c 'import sys; sys.path.append("utils"); from utils.notification_service import job_to_test_map; ci_event = "${{ inputs.ci_event }}"; job = "${{ inputs.job }}"; test_name = job_to_test_map[job]; title = f"New failed tests of {ci_event}" + ":" + f" {test_name}"; print(title)')" >> $GITHUB_ENV
+#
+#      - name: Send processed report
+#        if: ${{ env.process == 'true' && !endsWith(env.REPORT_TEXT, '{}') }}
+#        uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
+#        with:
+#          # Slack channel id, channel name, or user id to post message.
+#          # See also: https://api.slack.com/methods/chat.postMessage#channels
+#          channel-id: '#${{ inputs.slack_report_channel }}'
+#          # For posting a rich message using Block Kit
+#          payload: |
+#            {
+#              "blocks": [
+#                {
+#                  "type": "header",
+#                  "text": {
+#                    "type": "plain_text",
+#                    "text": "${{ env.title }}"
+#                  }
+#                },
+#                {
+#                  "type": "section",
+#                  "text": {
+#                    "type": "mrkdwn",
+#                    "text": "${{ env.REPORT_TEXT }}"
+#                  }
+#                }
+#              ]
+#            }
+#        env:
+#          SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
--- a/.github/workflows/self-scheduled-caller.yml
+++ b/.github/workflows/self-scheduled-caller.yml
@ -6,7 +6,7 @@ on:
    - cron: "17 2 * * *"
  push:
    branches:
-      - run_nvidia_ci*
+      - multi_jobs_to_check_bad_commit
  workflow_dispatch:
    inputs:
      prev_workflow_run_id:
@ -23,7 +23,7 @@ on:

 # Used for `push` to easily modify the target workflow runs to compare against
 env:
-    prev_workflow_run_id: ""
+    prev_workflow_run_id: "18548615847"
    other_workflow_run_id: ""


@ -49,72 +49,10 @@ jobs:
    uses: ./.github/workflows/self-scheduled.yml
    with:
      job: run_models_gpu
-      slack_report_channel: "#transformers-ci-daily-models"
+      slack_report_channel: "#transformers-ci-dummy"
      docker: huggingface/transformers-all-latest-gpu
      ci_event: Daily CI
      runner_type: "a10"
      report_repo_id: hf-internal-testing/transformers_daily_ci
      commit_sha: ${{ github.sha }}
    secrets: inherit
-
-  torch-pipeline:
-    name: Torch pipeline CI
-    uses: ./.github/workflows/self-scheduled.yml
-    with:
-      job: run_pipelines_torch_gpu
-      slack_report_channel: "#transformers-ci-daily-pipeline-torch"
-      docker: huggingface/transformers-pytorch-gpu
-      ci_event: Daily CI
-      report_repo_id: hf-internal-testing/transformers_daily_ci
-      commit_sha: ${{ github.sha }}
-    secrets: inherit
-
-  example-ci:
-    name: Example CI
-    uses: ./.github/workflows/self-scheduled.yml
-    with:
-      job: run_examples_gpu
-      slack_report_channel: "#transformers-ci-daily-examples"
-      docker: huggingface/transformers-all-latest-gpu
-      ci_event: Daily CI
-      report_repo_id: hf-internal-testing/transformers_daily_ci
-      commit_sha: ${{ github.sha }}
-    secrets: inherit
-
-  trainer-fsdp-ci:
-    name: Trainer/FSDP CI
-    uses: ./.github/workflows/self-scheduled.yml
-    with:
-      job: run_trainer_and_fsdp_gpu
-      slack_report_channel: "#transformers-ci-daily-training"
-      docker: huggingface/transformers-all-latest-gpu
-      runner_type: "a10"
-      ci_event: Daily CI
-      report_repo_id: hf-internal-testing/transformers_daily_ci
-      commit_sha: ${{ github.sha }}
-    secrets: inherit
-
-  deepspeed-ci:
-    name: DeepSpeed CI
-    uses: ./.github/workflows/self-scheduled.yml
-    with:
-      job: run_torch_cuda_extensions_gpu
-      slack_report_channel: "#transformers-ci-daily-training"
-      docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
-      ci_event: Daily CI
-      working-directory-prefix: /workspace
-      report_repo_id: hf-internal-testing/transformers_daily_ci
-      commit_sha: ${{ github.sha }}
-    secrets: inherit
-
-  quantization-ci:
-    name: Quantization CI
-    uses: ./.github/workflows/self-scheduled.yml
-    with:
-      job: run_quantization_torch_gpu
-      slack_report_channel: "#transformers-ci-daily-quantization"
-      docker: huggingface/transformers-quantization-latest-gpu
-      ci_event: Daily CI
-      report_repo_id: hf-internal-testing/transformers_daily_ci
-      commit_sha: ${{ github.sha }}
-    secrets: inherit
--- a/benchmark_v2/framework/benchmark_config.py
+++ b/benchmark_v2/framework/benchmark_config.py
@ -104,7 +104,7 @@ class BenchmarkConfig:
            "attn_implementation": self.attn_implementation,
            "sdpa_backend": self.sdpa_backend,
            "compile_mode": self.compile_mode,
-            "compile_options": self.compile_options,
+            "compile_options": self.compile_options | {},  # to avoid inplace modification of the original dict
            "kernelize": self.kernelize,
        }

@ -191,7 +191,7 @@ def generate_all_configs(
    )


-def generate_default_configs(
+def generate_main_configs(
    warmup_iterations: int = 5,
    measurement_iterations: int = 20,
    batch_size: int = 1,
@ -199,20 +199,17 @@ def generate_default_configs(
    num_tokens_to_generate: int = 128,
    gpu_monitoring: bool = False,
 ) -> list[BenchmarkConfig]:
-    all_attn_implementations = [
-        ("flash_attention_2", None),
-        ("eager", None),
-        ("sdpa", "math"),
-        ("sdpa", "flash_attention"),  # note: this one can fail with compile because of attn mask
+    # Create kwargs common to all configs
+    kwargs = {
+        "warmup_iterations": warmup_iterations,
+        "measurement_iterations": measurement_iterations,
+        "batch_size": batch_size,
+        "sequence_length": sequence_length,
+        "num_tokens_to_generate": num_tokens_to_generate,
+        "gpu_monitoring": gpu_monitoring,
+    }
+    return [  # TODO: test max-autotune instead of default
+        BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", **kwargs),
+        BenchmarkConfig(attn_implementation="eager", compile_mode="default", **kwargs),
+        BenchmarkConfig(attn_implementation="flash_attention_2", **kwargs),
    ]
-    return cross_generate_configs(
-        attn_impl_and_sdpa_backend=all_attn_implementations,
-        compiled_mode=[None, "max-autotune"],
-        kernelized=[False, KERNELIZATION_AVAILABLE],
-        warmup_iterations=warmup_iterations,
-        measurement_iterations=measurement_iterations,
-        batch_size=batch_size,
-        sequence_length=sequence_length,
-        num_tokens_to_generate=num_tokens_to_generate,
-        gpu_monitoring=gpu_monitoring,
-    )
--- a/benchmark_v2/framework/benchmark_runner.py
+++ b/benchmark_v2/framework/benchmark_runner.py
@ -144,11 +144,11 @@ class BenchmarkStreamer(BaseStreamer):
 class BenchmarkRunner:
    """Main benchmark runner that coordinates benchmark execution."""

-    def __init__(
-        self, logger: logging.Logger, output_dir: str = "benchmark_results", commit_id: str | None = None
-    ) -> None:
+    def __init__(self, logger: logging.Logger, output_dir: str | None = None, commit_id: str | None = None) -> None:
        # Those stay constant for the whole run
        self.logger = logger
+        if output_dir is None:
+            output_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "benchmark_results")
        self.output_dir = output_dir
        self.commit_id = get_git_revision() if commit_id is None else commit_id
        os.makedirs(self.output_dir, exist_ok=True)
@ -214,7 +214,7 @@ class BenchmarkRunner:

            # Quick validation: try one measurement first to see if this scenario works
            flush_memory()
-            e2e_latency, token_generation_times, decoded_output, gpu_metrics = self.time_generate(
+            e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = self.time_generate(
                max_new_tokens=1, gpu_monitor=None
            )
            if e2e_latency < 0:
@ -231,11 +231,11 @@ class BenchmarkRunner:
            result = BenchmarkResult()
            self.logger.info(f"Benchmarking with {config.measurement_iterations} iterations.")
            for _ in trange(config.measurement_iterations):
-                e2e_latency, token_generation_times, decoded_output, gpu_metrics = self.time_generate(
+                e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = self.time_generate(
                    max_new_tokens=config.num_tokens_to_generate,
                    gpu_monitor=(GPUMonitor(logger=self.logger) if config.gpu_monitoring else None),
                )
-                result.accumulate(e2e_latency, token_generation_times, decoded_output, gpu_metrics)
+                result.accumulate(e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics)
            self.logger.info("Benchmarking done. Cleaning up.")

            # Profile if needed
@ -277,10 +277,11 @@ class BenchmarkRunner:
            raise RuntimeError(f"Generated {new_tokens} tokens, expected {max_new_tokens}")
        # Decode outputs
        decoded_output = self.tokenizer.decode(outputs[0, input_tokens:], skip_special_tokens=True)
+        shape_and_decoded_output = f"{tuple(outputs.shape)} | {decoded_output}"
        # Compute intermediate quantities
        e2e_latency = wall_time_1 - wall_time_0
        token_generation_times = [t - wall_time_0 for t in streamer.timestamps[1:]]
-        return e2e_latency, token_generation_times, decoded_output, gpu_metrics
+        return e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics

    def profile_generate(self, num_tokens_to_profile: int, config_name: str) -> None:
        """Profile the latency of a call to model.generate() with the given (inputs) and (max_new_tokens)."""
@ -351,10 +352,10 @@ class BenchmarkRunner:
                first_metadata = all_results[first_key]["metadata"].to_dict()
                hardware_info = first_metadata.pop("hardware_info")
                pretty_print_dict(first_metadata | hardware_info, tabs=1)
-            for value in all_results.values():
+            for result in all_results.values():
                print("=" * 100)
-                print(f"Config: {value['config'].infer_name(compact=False)}\n")
-                value["measurements"].pprint(tabs=1)
+                print(f"Config: {result['config'].infer_name(compact=False)}\n")
+                result["measurements"].pprint(batch_size=result["config"].batch_size, tabs=1)
            print("=" * 100)

        return all_results
--- a/benchmark_v2/framework/data_classes.py
+++ b/benchmark_v2/framework/data_classes.py
@ -82,19 +82,19 @@ class BenchmarkResult:
    def __init__(self) -> None:
        self.e2e_latency = []
        self.token_generation_times = []  # time at which each token was generated (relative to start of the generation)
-        self.decoded_outputs = []
+        self.shape_and_decoded_outputs = []
        self.gpu_metrics = []

    def accumulate(
        self,
        e2e_latency: float,
        token_generation_times: list[float],
-        decoded_output: str,
+        shape_and_decoded_output: str,
        gpu_metrics: GPURawMetrics | None,
    ) -> None:
        self.e2e_latency.append(e2e_latency)
        self.token_generation_times.append(token_generation_times)
-        self.decoded_outputs.append(decoded_output)
+        self.shape_and_decoded_outputs.append(shape_and_decoded_output)
        self.gpu_metrics.append(gpu_metrics)

    def to_dict(self) -> dict[str, None | int | float]:
@ -106,7 +106,7 @@ class BenchmarkResult:
        return {
            "e2e_latency": self.e2e_latency,
            "token_generation_times": self.token_generation_times,
-            "decoded_outputs": self.decoded_outputs,
+            "shape_and_decoded_outputs": self.shape_and_decoded_outputs,
            "gpu_metrics": gpu_metrics,
        }

@ -123,7 +123,7 @@ class BenchmarkResult:
            new_instance.accumulate(
                e2e_latency=data["e2e_latency"][i],
                token_generation_times=data["token_generation_times"][i],
-                decoded_output=data["decoded_output"][i],
+                shape_and_decoded_output=data["shape_and_decoded_outputs"][i],
                gpu_metrics=gpu_metrics[i],
            )
        return new_instance
@ -134,19 +134,27 @@ class BenchmarkResult:
    def get_measured_itl(self) -> list[float]:
        return [(dt[-1] - dt[0]) / (len(dt) - 1) for dt in self.token_generation_times if len(dt) > 1]

-    def pprint(self, tabs: int = 0) -> None:
-        collated_stats = equalize_lengths_and_collate(
-            [
-                add_unit_to_duration(compute_basic_statistics(self.e2e_latency)),
-                add_unit_to_duration(compute_basic_statistics(self.get_measured_ttft())),
-                add_unit_to_duration(compute_basic_statistics(self.get_measured_itl())),
-            ]
-        )
-        pretty_print_dict(
-            {
-                "E2E Latency": collated_stats[0],
-                "Time to First Token": collated_stats[1],
-                "Inter-Token Latency": collated_stats[2],
-            },
-            tabs=tabs,
-        )
+    def get_throughput(self, batch_size: int) -> list[float]:
+        """Return one throughput value (in tokens per second) for each recorded run.
+
+        Each value is `batch_size * len(token_generation_times) / e2e_latency` of the matching run.
+        """
+        return [
+            batch_size * len(dt) / e2e_latency
+            for e2e_latency, dt in zip(self.e2e_latency, self.token_generation_times)
+        ]
+
+    def pprint(self, batch_size: int = 0, tabs: int = 0) -> None:
+        """Pretty-print basic statistics of the accumulated measurements.
+
+        Args:
+            batch_size: when greater than 0, a "Throughput" row (in tok/s) is printed as well.
+            tabs: forwarded to `pretty_print_dict`.
+        """
+        stats_to_collate = [
+            add_unit_to_duration(compute_basic_statistics(self.e2e_latency)),
+            add_unit_to_duration(compute_basic_statistics(self.get_measured_ttft())),
+            add_unit_to_duration(compute_basic_statistics(self.get_measured_itl())),
+        ]
+        if batch_size > 0:
+            # Throughput is not a duration, so it is formatted here rather than through add_unit_to_duration
+            throughput_stats = compute_basic_statistics(self.get_throughput(batch_size))
+            stats_to_collate.append({key: f"{value:.2f}tok/s" for key, value in throughput_stats.items()})
+        collated_stats = equalize_lengths_and_collate(stats_to_collate)
+        # Indices below follow the order in which the stats were added to stats_to_collate
+        dict_to_pprint = {
+            "E2E Latency": collated_stats[0],
+            "Time to First Token": collated_stats[1],
+            "Inter-Token Latency": collated_stats[2],
+        }
+        if batch_size > 0:
+            dict_to_pprint["Throughput"] = collated_stats[3]
+        pretty_print_dict(dict_to_pprint, tabs=tabs)
--- a/benchmark_v2/run_benchmarks.py
+++ b/benchmark_v2/run_benchmarks.py
@ -20,28 +20,28 @@ in the ./benches directory, organizing outputs into model-specific subfolders.

 import argparse
 import logging
-import random
 import sys
 import uuid

-from framework.benchmark_config import BenchmarkConfig, generate_all_configs
+from framework.benchmark_config import BenchmarkConfig, generate_all_configs, generate_main_configs
 from framework.benchmark_runner import BenchmarkRunner


 if __name__ == "__main__":
    # Parse arguments
    parser = argparse.ArgumentParser()
-    parser.add_argument("--output-dir", type=str, default="benchmark_results", help="Output dir for benchmark results")
+    parser.add_argument("--output-dir", type=str, default=None, help="Output dir for benchmark results")
    parser.add_argument("--log-level", type=str, choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO")
    parser.add_argument("--model-id", type=str, help="Specific model ID to benchmark (if supported by benchmarks)")

-    parser.add_argument("--warmup", type=int, default=5, help="Number of warmup iterations")
-    parser.add_argument("--iterations", type=int, default=20, help="Number of measurement iterations")
+    parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations")
+    parser.add_argument("--iterations", type=int, default=10, help="Number of measurement iterations")

    parser.add_argument("--batch-size", "-b", type=int, nargs="+", help="Batch size")
    parser.add_argument("--sequence-length", "-s", type=int, nargs="+", help="Sequence length")
    parser.add_argument("--num-tokens-to-generate", "-n", type=int, nargs="+", help="Number of tokens to generate")

+    parser.add_argument("--cross-generate", action="store_true", help="Cross-generate all combinations of configs")
    parser.add_argument("--num-tokens-to-profile", "-p", type=int, default=0, help="Number of tokens to profile")

    parser.add_argument("--commit-id", type=str, help="Git commit ID (if not provided, will auto-detect from git)")
@ -69,42 +69,47 @@ if __name__ == "__main__":

    # If there is only one (batch_size, sequence_length, num_tokens_to_generate), we benchmark across configs
    elif len(args.batch_size) * len(args.sequence_length) * len(args.num_tokens_to_generate) == 1:
-        benchmark_configs = generate_all_configs(
+        if args.cross_generate:
+            benchmark_configs = generate_all_configs(
+                warmup_iterations=args.warmup,
+                measurement_iterations=args.iterations,
+                batch_size=args.batch_size[0],
+                sequence_length=args.sequence_length[0],
+                num_tokens_to_generate=args.num_tokens_to_generate[0],
+            )
+        else:
+            benchmark_configs = generate_main_configs(
+                warmup_iterations=args.warmup,
+                measurement_iterations=args.iterations,
+                batch_size=args.batch_size[0],
+                sequence_length=args.sequence_length[0],
+                num_tokens_to_generate=args.num_tokens_to_generate[0],
+            )
+
+    # Otherwise, we benchmark across all combinations of dimensions
+    else:
+        main_config = generate_main_configs(
            warmup_iterations=args.warmup,
            measurement_iterations=args.iterations,
            batch_size=args.batch_size[0],
            sequence_length=args.sequence_length[0],
            num_tokens_to_generate=args.num_tokens_to_generate[0],
-        )
-        random.shuffle(benchmark_configs)
-
-    # Otherwise, we benchmark across all combinations of dimensions
-    else:
-        kwargs = {
-            "warmup_iterations": args.warmup,
-            "measurement_iterations": args.iterations,
-            "gpu_monitoring": False,
-            "batch_size": args.batch_size[0],
-            "sequence_length": args.sequence_length[0],
-            "num_tokens_to_generate": args.num_tokens_to_generate[0],
-            "attn_implementation": "flex_attention",
-            "sdpa_backend": None,
-            "compile_mode": "default",
-            "kernelize": False,
-        }
+        )[0]
        benchmark_configs = []
        for num_tokens_to_generate in args.num_tokens_to_generate:
            for sequence_length in args.sequence_length:
                for batch_size in args.batch_size:
-                    kwargs["batch_size"] = batch_size
-                    kwargs["sequence_length"] = sequence_length
-                    kwargs["num_tokens_to_generate"] = num_tokens_to_generate
-                    benchmark_configs.append(BenchmarkConfig(**kwargs))
+                    cfg_dict = main_config.to_dict()
+                    cfg_dict["batch_size"] = batch_size
+                    cfg_dict["sequence_length"] = sequence_length
+                    cfg_dict["num_tokens_to_generate"] = num_tokens_to_generate
+                    cfg_dict.pop("name")
+                    benchmark_configs.append(BenchmarkConfig.from_dict(cfg_dict))

    runner = BenchmarkRunner(logger, args.output_dir, args.commit_id)
    results = runner.run_benchmarks(
        args.model_id,
-        benchmark_configs[:3],
+        benchmark_configs,
        args.num_tokens_to_profile,
        pretty_print_summary=True,
    )
--- a/docs/source/en/serving.md
+++ b/docs/source/en/serving.md
@ -380,7 +380,7 @@ CB is opt-in and currently applies to chat completions.
 ```sh
 transformers serve \
 --continuous-batching \
-  --attn_implementation sdpa_paged
+  --attn_implementation "sdpa"
 ```

 ### Performance tips
@ -390,11 +390,10 @@ transformers serve \
 ```sh
 transformers serve \
  --continuous_batching \
-  --attn_implementation paged_attention
+  --attn_implementation "flash_attention_2"
 ```

 > [!TIP]
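-> If you choose `paged_attention`, you must install `flash-attn` separately: `pip install flash-attn --no-build-isolation`
+> If you choose `flash_attention_2`, you must install `flash-attn` separately: `pip install flash-attn --no-build-isolation`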

 - `--dtype {bfloat16|float16}` typically improves throughput and memory use vs. `float32`

--- a/docs/source/ko/_toctree.yml
+++ b/docs/source/ko/_toctree.yml
@ -84,6 +84,8 @@
      title: Transformers로 채팅하기
    - local: chat_templating
      title: 챗봇 템플릿 익히기
+    - local: chat_extras
+      title: Tools 와 RAG
    - local: in_translation
      title: (번역중) Multimodal templates
    - local: in_translation
@ -459,7 +461,7 @@
        title: BertJapanese
      - local: model_doc/bertweet
        title: BERTweet
-      - local: in_translation
+      - local: model_doc/big_bird
        title: BigBird
      - local: in_translation
        title: BigBirdPegasus
@ -483,7 +485,7 @@
        title: CANINE
      - local: model_doc/codegen
        title: CodeGen
-      - local: in_translation
+      - local: model_doc/code_llama
        title: CodeLlama
      - local: model_doc/cohere
        title: Cohere
@ -881,6 +883,8 @@
        title: SegFormer
      - local: in_translation
        title: SegGpt
+      - local: model_doc/sam_hq
+        title: Segment Anything High Quality (SAM-HQ)
      - local: in_translation
        title: SuperGlue
      - local: in_translation
@ -1097,7 +1101,7 @@
        title: LayoutXLM
      - local: in_translation
        title: LiLT
-      - local: in_translation
+      - local: model_doc/llama4
        title: Llama4
      - local: in_translation
        title: Llava
--- a/docs/source/ko/chat_extras.md
+++ b/docs/source/ko/chat_extras.md
@ -0,0 +1,299 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# 도구와 RAG[[Tools-and-RAG]]
+
+[`~PreTrainedTokenizerBase.apply_chat_template`] 메소드는 채팅 메시지 외에도 문자열, 리스트, 딕셔너리 등 거의 모든 종류의 추가 인수 타입을 지원합니다. 이를 통해 다양한 사용 상황에서 채팅 템플릿을 활용할 수 있습니다.
+
+이 가이드에서는 도구 및 검색 증강 생성(RAG)과 함께 채팅 템플릿을 사용하는 방법을 보여드립니다.
+
+## 도구[[Tools]]
+
+도구는 대규모 언어 모델(LLM)이 특정 작업을 수행하기 위해 호출할 수 있는 함수입니다. 이는 실시간 정보, 계산 도구 또는 대규모 데이터베이스 접근 등을 통해 대화형 에이전트의 기능을 확장하는 강력한 방법입니다.
+
+도구를 만들 때는 아래 규칙을 따르세요.
+
+1. 함수는 기능을 잘 설명하는 이름을 가져야 합니다.
+2. 함수의 인수는 함수 헤더에 타입 힌트를 포함해야 합니다(`Args` 블록에는 포함하지 마세요).
+3. 함수에는 [Google 스타일](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings) 의 독스트링(docstring)이 포함되어야 합니다.
+4. 함수에 반환 타입과 `Returns` 블록을 포함할 수 있지만, 도구를 활용하는 대부분의 모델에서 이를 사용하지 않기 때문에 무시할 수 있습니다.
+
+주어진 위치의 현재 온도와 풍속을 가져오는 도구의 예시는 아래와 같습니다.
+
+```py
+def get_current_temperature(location: str, unit: str) -> float:
+    """
+    주어진 위치의 현재 온도를 가져옵니다.
+    
+    Args:
+        location: 온도를 가져올 위치, "도시, 국가" 형식
+        unit: 온도를 반환할 단위. (선택지: ["celsius(섭씨)", "fahrenheit(화씨)"])
+    Returns:
+        주어진 위치의 지정된 단위로 표시된 현재 온도(float 자료형).
+    """
+    return 22.  # 실제 함수라면 아마 진짜로 기온을 가져와야겠죠!
+
+def get_current_wind_speed(location: str) -> float:
+    """
+    주어진 위치의 현재 풍속을 km/h 단위로 가져옵니다.
+    
+    Args:
+        location: 풍속을 가져올 위치, "도시, 국가" 형식
+    Returns:
+        주어진 위치의 현재 풍속(km/h, float 자료형).
+    """
+    return 6.  # 실제 함수라면 아마 진짜로 풍속을 가져와야겠죠!
+
+tools = [get_current_temperature, get_current_wind_speed]
+```
+
+[NousResearch/Hermes-2-Pro-Llama-3-8B](https://hf.co/NousResearch/Hermes-2-Pro-Llama-3-8B)와 같이 도구 사용을 지원하는 모델과 토크나이저를 가져오세요. 하드웨어가 지원된다면 [Command-R](./model_doc/cohere)이나 [Mixtral-8x22B](./model_doc/mixtral)와 같은 더 큰 모델도 고려할 수 있습니다.
+
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("NousResearch/Hermes-2-Pro-Llama-3-8B")
+model = AutoModelForCausalLM.from_pretrained( "NousResearch/Hermes-2-Pro-Llama-3-8B", torch_dtype=torch.bfloat16, device_map="auto")
+```
+
+채팅 메시지를 생성합니다.
+
+```py
+messages = [
+  {"role": "system", "content": "You are a bot that responds to weather queries. You should reply with the unit used in the queried location."},
+  {"role": "user", "content": "Hey, what's the temperature in Paris right now?"}
+]
+```
+
+`messages`와 도구 목록 `tools`를 [`~PreTrainedTokenizerBase.apply_chat_template`]에 전달한 뒤, 이를 모델의 입력으로 사용하여 텍스트를 생성할 수 있습니다.
+
+```py
+inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
+inputs = {k: v for k, v in inputs.items()}
+outputs = model.generate(**inputs, max_new_tokens=128)
+print(tokenizer.decode(outputs[0][len(inputs["input_ids"][0]):]))
+```
+
+```txt
+<tool_call>
+{"arguments": {"location": "Paris, France", "unit": "celsius"}, "name": "get_current_temperature"}
+</tool_call><|im_end|>
+```
+
+채팅 모델은 독스트링(docstring)에 정의된 형식에 따라 `get_current_temperature` 함수에 올바른 매개변수를 전달해 호출했습니다. 파리를 기준으로 위치를 프랑스로 추론했으며, 온도 단위는 섭씨를 사용해야 한다고 판단했습니다.
+
+이제 `get_current_temperature` 함수와 해당 인수들을 `tool_call` 딕셔너리에 담아 채팅 메시지에 추가합니다. `tool_call` 딕셔너리는 `system`이나 `user`가 아닌 `assistant` 역할로 제공되어야 합니다.
+
+> [!WARNING]
+> OpenAI API는 `tool_call` 형식으로 JSON 문자열을 사용합니다. Transformers에서 사용할 경우 딕셔너리를 요구하기 때문에, 오류가 발생하거나 모델이 이상하게 동작할 수 있습니다.
+
+<hfoptions id="tool-call">
+<hfoption id="Llama">
+
+```py
+tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}}
+messages.append({"role": "assistant", "tool_calls": [{"type": "function", "function": tool_call}]})
+```
+
+어시스턴트가 함수 출력을 읽고 사용자와 채팅할 수 있도록 합니다.
+
+```py
+inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
+inputs = {k: v for k, v in inputs.items()}
+out = model.generate(**inputs, max_new_tokens=128)
+print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):]))
+```
+
+```txt
+The temperature in Paris, France right now is approximately 12°C (53.6°F).<|im_end|>
+```
+
+</hfoption>
+<hfoption id="Mistral/Mixtral">
+
+[Mistral](./model_doc/mistral) 및 [Mixtral](./model_doc/mixtral) 모델의 경우 추가적으로 `tool_call_id`가 필요합니다. `tool_call_id`는 9자리 영숫자 문자열로 생성되어 `tool_call` 딕셔너리의 `id` 키에 할당됩니다.
+
+```py
+tool_call_id = "9Ae3bDc2F"
+tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}}
+messages.append({"role": "assistant", "tool_calls": [{"type": "function", "id": tool_call_id, "function": tool_call}]})
+```
+
+```py
+inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
+inputs = {k: v for k, v in inputs.items()}
+out = model.generate(**inputs, max_new_tokens=128)
+print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):]))
+```
+
+</hfoption>
+</hfoptions>
+
+## 스키마[[Schema]]
+
+[`~PreTrainedTokenizerBase.apply_chat_template`]은 함수를 [JSON 스키마](https://json-schema.org/learn/getting-started-step-by-step)로 변환하여 채팅 템플릿에 전달합니다. LLM은 함수 내부의 코드를 보지 못합니다. 다시 말해, LLM은 함수가 기술적으로 어떻게 작동하는지는 신경 쓰지 않고, 함수의 **정의**와 **인수**만 참조합니다.
+
+함수가 앞서 나열된 규칙을 따르면, 내부에서 JSON 스키마가 자동으로 생성됩니다. 하지만 더 나은 가독성이나 디버깅을 위해 [get_json_schema](https://github.com/huggingface/transformers/blob/14561209291255e51c55260306c7d00c159381a5/src/transformers/utils/chat_template_utils.py#L205)를 사용하여 스키마를 수동으로 변환할 수 있습니다.
+
+```py
+from transformers.utils import get_json_schema
+
+def multiply(a: float, b: float):
+    """
+    두 숫자를 곱하는 함수
+    
+    Args:
+        a: 곱할 첫 번째 숫자
+        b: 곱할 두 번째 숫자
+    """
+    return a * b
+
+schema = get_json_schema(multiply)
+print(schema)
+```
+
+```json
+{
+  "type": "function",
+  "function": {
+    "name": "multiply",
+    "description": "두 숫자를 곱하는 함수",
+    "parameters": {
+      "type": "object",
+      "properties": {
+        "a": {
+          "type": "number",
+          "description": "곱할 첫 번째 숫자"
+        },
+        "b": {
+          "type": "number",
+          "description": "곱할 두 번째 숫자"
+        }
+      },
+      "required": ["a", "b"]
+    }
+  }
+}
+```
+
+스키마를 편집하거나 처음부터 직접 작성할 수 있습니다. 이를 통해 더 복잡한 함수에 대한 정확한 스키마를 유연하게 정의할 수 있습니다.
+
+> [!WARNING]
+> 함수 시그니처를 단순하게 유지하고 인수를 최소한으로 유지하세요. 이러한 함수는 중첩된 인수를 가진 복잡한 함수에 비해 모델이 더 쉽게 이해하고 사용할 수 있습니다.
+
+아래 예시는 스키마를 수동으로 작성한 다음 [`~PreTrainedTokenizerBase.apply_chat_template`]에 전달하는 방법을 보여줍니다.
+
+```py
+# 인수를 받지 않는 간단한 함수
+current_time = {
+  "type": "function", 
+  "function": {
+    "name": "current_time",
+    "description": "Get the current local time as a string.",
+    "parameters": {
+      'type': 'object',
+      'properties': {}
+    }
+  }
+}
+
+# 두 개의 숫자 인수를 받는 더 완전한 함수
+multiply = {
+  'type': 'function',
+  'function': {
+    'name': 'multiply',
+    'description': 'A function that multiplies two numbers', 
+    'parameters': {
+      'type': 'object', 
+      'properties': {
+        'a': {
+          'type': 'number',
+          'description': 'The first number to multiply'
+        }, 
+        'b': {
+          'type': 'number', 'description': 'The second number to multiply'
+        }
+      }, 
+      'required': ['a', 'b']
+    }
+  }
+}
+
+model_input = tokenizer.apply_chat_template(
+    messages,
+    tools = [current_time, multiply]
+)
+```
+
+## RAG[[RAG]]
+
+검색 증강 생성(Retrieval-augmented generation, RAG) 모델은 응답을 반환하기 전에 문서를 검색해 추가 정보를 얻음으로써 모델이 기존에 가지고 있던 지식을 확장합니다. RAG 모델의 경우, [`~PreTrainedTokenizerBase.apply_chat_template`]에 `documents` 매개변수를 추가하세요. 이 `documents` 매개변수는 문서 목록이어야 하며, 각 문서는 아래 예시처럼 `title`과 `text` 키를 가진 단일 딕셔너리여야 합니다.
+
+> [!TIP]
+> RAG를 위한 `documents` 매개변수는 폭넓게 지원되지 않으며 많은 모델들이 `documents`를 무시하는 채팅 템플릿을 가지고 있습니다. 모델이 `documents`를 지원하는지 확인하려면 모델 카드를 읽거나 `print(tokenizer.chat_template)`를 실행하여 `documents` 키가 있는지 확인하세요. [Command-R](https://hf.co/CohereForAI/c4ai-command-r-08-2024)과 [Command-R+](https://hf.co/CohereForAI/c4ai-command-r-plus-08-2024)는 모두 RAG 채팅 템플릿에서 `documents`를 지원합니다.
+
+모델에 전달할 문서 목록을 생성하세요.
+
+```py
+documents = [
+    {
+        "title": "The Moon: Our Age-Old Foe", 
+        "text": "Man has always dreamed of destroying the moon. In this essay, I shall..."
+    },
+    {
+        "title": "The Sun: Our Age-Old Friend",
+        "text": "Although often underappreciated, the sun provides several notable benefits..."
+    }
+]
+```
+
+[`~PreTrainedTokenizerBase.apply_chat_template`]에서 `chat_template="rag"`를 설정하고 응답을 생성하세요.
+
+```py
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+# 모델과 토크나이저 로드
+tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01-4bit")
+model = AutoModelForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01-4bit", device_map="auto")
+device = model.device # 모델을 가져온 장치 확인
+
+# 대화 입력 정의
+conversation = [
+    {"role": "user", "content": "What has Man always dreamed of?"}
+]
+
+input_ids = tokenizer.apply_chat_template(
+    conversation=conversation,
+    documents=documents,
+    chat_template="rag",
+    tokenize=True,
+    add_generation_prompt=True,
+    return_tensors="pt").to(device)
+
+# 응답 생성
+generated_tokens = model.generate(
+    input_ids,
+    max_new_tokens=100,
+    do_sample=True,
+    temperature=0.3,
+    )
+
+# 생성된 텍스트를 디코딩하고 생성 프롬프트와 함께 출력
+generated_text = tokenizer.decode(generated_tokens[0])
+print(generated_text)
+```
--- a/docs/source/ko/model_doc/big_bird.md
+++ b/docs/source/ko/model_doc/big_bird.md
@ -0,0 +1,158 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+*이 모델은 2020-07-28에 출시되었으며 2021-03-30에 Hugging Face Transformers에 추가되었습니다.*
+
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white" >
+    </div>
+</div>
+
+# BigBird[[bigbird]]
+
+[BigBird](https://huggingface.co/papers/2007.14062)는 [BERT](./bert)의 512토큰과 달리 최대 4096토큰까지의 시퀀스 길이를 처리하도록 설계된 트랜스포머 모델입니다. 기존 트랜스포머들은 시퀀스 길이가 늘어날수록 어텐션 계산 비용이 급격히 증가하여 긴 입력 처리에 어려움을 겪습니다. BigBird는 희소 어텐션 메커니즘으로 이 문제를 해결하는데, 모든 토큰을 동시에 살펴보는 대신 로컬 어텐션, 랜덤 어텐션, 그리고 몇 개의 전역 토큰을 조합하여 전체 입력을 효율적으로 처리합니다. 이런 방식을 통해 계산 효율성을 유지하면서도 시퀀스 전체를 충분히 이해할 수 있게 됩니다. 따라서 BigBird는 질의응답, 요약, 유전체학 응용처럼 긴 문서를 다루는 작업에 특히 우수한 성능을 보입니다.
+
+모든 원본 BigBird 체크포인트는 [Google](https://huggingface.co/google?search_models=bigbird) 조직에서 찾아볼 수 있습니다.
+
+> [!TIP]
+> 오른쪽 사이드바의 BigBird 모델들을 클릭하여 다양한 언어 작업에 BigBird를 적용하는 더 많은 예시를 확인해보세요.
+
+아래 예시는 [`Pipeline`], [`AutoModel`], 그리고 명령줄에서 `[MASK]` 토큰을 예측하는 방법을 보여줍니다.
+
+<hfoptions id="usage">
+<hfoption id="Pipeline">
+
+```py
+import torch
+from transformers import pipeline
+
+pipeline = pipeline(
+    task="fill-mask",
+    model="google/bigbird-roberta-base",
+    dtype=torch.float16,
+    device=0
+)
+pipeline("Plants create [MASK] through a process known as photosynthesis.")
+```
+
+</hfoption>
+<hfoption id="AutoModel">
+
+```py
+import torch
+from transformers import AutoModelForMaskedLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained(
+    "google/bigbird-roberta-base",
+)
+model = AutoModelForMaskedLM.from_pretrained(
+    "google/bigbird-roberta-base",
+    dtype=torch.float16,
+    device_map="auto",
+)
+inputs = tokenizer("Plants create [MASK] through a process known as photosynthesis.", return_tensors="pt").to(model.device)
+
+with torch.no_grad():
+    outputs = model(**inputs)
+    predictions = outputs.logits
+
+masked_index = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]
+predicted_token_id = predictions[0, masked_index].argmax(dim=-1)
+predicted_token = tokenizer.decode(predicted_token_id)
+
+print(f"The predicted token is: {predicted_token}")
+```
+
+</hfoption>
+<hfoption id="transformers CLI">
+
+```bash
+echo -e "Plants create [MASK] through a process known as photosynthesis." | transformers run --task fill-mask --model google/bigbird-roberta-base --device 0
+```
+
+</hfoption>
+</hfoptions>
+
+## 참고사항[[notes]]
+
+- BigBird는 절대 위치 임베딩을 사용하므로 입력을 오른쪽에 패딩해야 합니다.
+- BigBird는 `original_full`과 `block_sparse` 어텐션을 지원합니다. 입력 시퀀스 길이가 1024 미만인 경우에는 희소 패턴의 이점이 크지 않으므로 `original_full` 사용을 권장합니다.
+- 현재 구현은 3블록 윈도우 크기와 2개의 전역 블록을 사용하며, ITC 구현만 지원하고 `num_random_blocks=0`은 지원하지 않습니다.
+- 시퀀스 길이는 블록 크기로 나누어떨어져야 합니다.
+
+## 리소스[[resources]]
+
+- BigBird 어텐션 메커니즘의 자세한 작동 원리는 [BigBird](https://huggingface.co/blog/big-bird) 블로그 포스트를 참고하세요.
+
+## BigBirdConfig[[bigbirdconfig]]
+
+[[autodoc]] BigBirdConfig
+
+## BigBirdTokenizer[[bigbirdtokenizer]]
+
+[[autodoc]] BigBirdTokenizer
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - save_vocabulary
+
+## BigBirdTokenizerFast[[bigbirdtokenizerfast]]
+
+[[autodoc]] BigBirdTokenizerFast
+
+## BigBird 특정 출력[[bigbird-specific-outputs]]
+
+[[autodoc]] models.big_bird.modeling_big_bird.BigBirdForPreTrainingOutput
+
+## BigBirdModel[[bigbirdmodel]]
+
+[[autodoc]] BigBirdModel
+    - forward
+
+## BigBirdForPreTraining[[bigbirdforpretraining]]
+
+[[autodoc]] BigBirdForPreTraining
+    - forward
+
+## BigBirdForCausalLM[[bigbirdforcausallm]]
+
+[[autodoc]] BigBirdForCausalLM
+    - forward
+
+## BigBirdForMaskedLM[[bigbirdformaskedlm]]
+
+[[autodoc]] BigBirdForMaskedLM
+    - forward
+
+## BigBirdForSequenceClassification[[bigbirdforsequenceclassification]]
+
+[[autodoc]] BigBirdForSequenceClassification
+    - forward
+
+## BigBirdForMultipleChoice[[bigbirdformultiplechoice]]
+
+[[autodoc]] BigBirdForMultipleChoice
+    - forward
+
+## BigBirdForTokenClassification[[bigbirdfortokenclassification]]
+
+[[autodoc]] BigBirdForTokenClassification
+    - forward
+
+## BigBirdForQuestionAnswering[[bigbirdforquestionanswering]]
+
+[[autodoc]] BigBirdForQuestionAnswering
+    - forward
--- a/docs/source/ko/model_doc/code_llama.md
+++ b/docs/source/ko/model_doc/code_llama.md
@ -0,0 +1,180 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+*이 모델은 2023년 8월 24일에 공개되었으며, 2023년 8월 25일에 Hugging Face Transformers에 추가되었습니다.*
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
+</div>
+
+# CodeLlama[[codellama]]
+
+[Code Llama](https://huggingface.co/papers/2308.12950)는 코딩 작업에 특화된 대규모 언어 모델 계열로,  [Llama 2](./llama2)를 기반으로 개발되었습니다. 일반적인 코드, Python 특화, 명령어(지시) 기반 변형 등 다양한 버전으로 제공되며, 모두 7B, 13B, 34B, 70B 매개변수 크기로 사용할 수 있습니다. Code Llama 모델은 코드를 생성하고 설명하며, 코드의 누락된 부분을 채울 수도 있습니다. 이를 인필링(infilling)이라고 합니다. 16K 토큰 길이로 훈련되었지만, 최대 100K 토큰까지 안정적으로 생성하며 긴 컨텍스트도 처리할 수 있습니다.
+
+[Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933) 컬렉션에서 모든 원본 Code Llama 체크포인트를 찾을 수 있습니다.
+
+> [!TIP]
+> 다양한 코딩 작업에 Code Llama를 적용하는 더 많은 예시를 보려면 오른쪽 사이드바의 Code Llama 모델을 클릭하세요.
+
+아래 예시는 [`Pipeline`], [`AutoModel`], 그리고 명령줄에서 코드를 생성하는 방법을 보여줍니다.
+
+<hfoptions id="usage">
+<hfoption id="Pipeline">
+
+```py
+import torch
+from transformers import pipeline
+
+pipe = pipeline(
+    "text-generation",
+    model="meta-llama/CodeLlama-7b-hf",
+    torch_dtype=torch.float16,
+    device_map=0
+)
+
+# 기본 코드 생성
+result = pipe("# Function to calculate the factorial of a number\ndef factorial(n):", max_new_tokens=256)
+print(result[0]['generated_text'])
+
+# 인필링
+infill_result = pipe("def remove_non_ascii(s: str) -> str:\n    \"\"\" <FILL_ME>\n    return result", max_new_tokens=200)
+print(infill_result[0]['generated_text'])
+```
+
+</hfoption>
+<hfoption id="AutoModel">
+
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/CodeLlama-7b-hf")
+model = AutoModelForCausalLM.from_pretrained(
+    "meta-llama/CodeLlama-7b-hf",
+    torch_dtype=torch.float16,
+    device_map="auto",
+    attn_implementation="sdpa"
+)
+
+# 기본 코드 생성
+prompt = "# Function to calculate the factorial of a number\ndef factorial(n):"
+input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
+
+output = model.generate(
+    **input_ids,
+    max_new_tokens=256,
+    cache_implementation="static"
+)
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+
+# 인필링
+infill_prompt = "def remove_non_ascii(s: str) -> str:\n    \"\"\" <FILL_ME>\n    return result"
+input_ids = tokenizer(infill_prompt, return_tensors="pt").to(model.device)
+
+filled_output = model.generate(**input_ids, max_new_tokens=200)
+filled_text = tokenizer.decode(filled_output[0], skip_special_tokens=True)
+print(filled_text)
+```
+
+</hfoption>
+<hfoption id="transformers CLI">
+
+```bash
+echo -e "# Function to calculate the factorial of a number\ndef factorial(n):" | transformers run --task text-generation --model meta-llama/CodeLlama-7b-hf --device 0
+```
+
+</hfoption>
+</hfoptions>
+
+양자화는 가중치를 더 낮은 정밀도로 표현하여 대규모 모델의 메모리 부담을 줄입니다. 더 많은 사용 가능한 양자화 백엔드는 [양자화](../quantization/overview) 개요를 참조하세요.
+
+아래 예시는 [bitsandbytes](../quantization/bitsandbytes)를 사용하여 가중치만 4비트로 양자화합니다.
+
+```py
+# pip install bitsandbytes
+import torch
+from transformers import AutoModelForCausalLM, CodeLlamaTokenizer, BitsAndBytesConfig
+
+bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type="nf4", bnb_4bit_use_double_quant=True)
+tokenizer = CodeLlamaTokenizer.from_pretrained("meta-llama/CodeLlama-34b-hf")
+model = AutoModelForCausalLM.from_pretrained(
+   "meta-llama/CodeLlama-34b-hf",
+   torch_dtype=torch.bfloat16,
+   device_map="auto",
+   quantization_config=bnb_config
+)
+
+prompt = "# Write a Python function to check if a string is a palindrome\ndef is_palindrome(s):"
+input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
+
+output = model.generate(**input_ids, max_new_tokens=200, cache_implementation="static")
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+
+[AttentionMaskVisualizer](https://github.com/huggingface/transformers/blob/beb9b5b02246b9b7ee81ddf938f93f44cfeaad19/src/transformers/utils/attention_visualizer.py#L139)를 사용하면 모델이 어떤 토큰에 주의를 기울일 수 있고 기울일 수 없는지를 더 잘 이해할 수 있습니다.
+
+```py
+from transformers.utils.attention_visualizer import AttentionMaskVisualizer
+
+visualizer = AttentionMaskVisualizer("meta-llama/CodeLlama-7b-hf")
+visualizer("""def func(a, b):
+  return a + b""")
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/codellama-attn-mask.png"/>
+</div>
+
+## 참고사항[[notes]]
+
+- 인필링 기능은 7B 및 13B 기반 모델에서만 사용할 수 있으며, Python, Instruct, 34B 또는 70B 모델에서는 사용할 수 없습니다.
+- 코드를 채워 넣고 싶은 부분에 `<FILL_ME>` 토큰을 사용하세요. 토크나이저는 이 토큰을 분할하여 [원본 훈련 패턴](https://github.com/facebookresearch/codellama/blob/cb51c14ec761370ba2e2bc351374a79265d0465e/llama/generation.py#L402) 을 따르는 입력 문자열로 변환합니다. 이는 직접 패턴을 준비하는 것보다 더 안정적입니다.
+    ```py
+    from transformers import LlamaForCausalLM, CodeLlamaTokenizer
+
+    tokenizer = CodeLlamaTokenizer.from_pretrained("meta-llama/CodeLlama-7b-hf")
+    model = LlamaForCausalLM.from_pretrained("meta-llama/CodeLlama-7b-hf")
+    PROMPT = '''def remove_non_ascii(s: str) -> str:
+        """ <FILL_ME>
+        return result
+    '''
+    input_ids = tokenizer(PROMPT, return_tensors="pt")["input_ids"]
+    generated_ids = model.generate(input_ids, max_new_tokens=128)
+
+    filling = tokenizer.batch_decode(generated_ids[:, input_ids.shape[1]:], skip_special_tokens = True)[0]
+    print(PROMPT.replace("<FILL_ME>", filling))
+    ```
+- 추가 훈련이나 미세 조정에는 `bfloat16`을 사용하고 추론에는 `float16`을 사용하세요.
+- `BOS` 문자는 접두사나 접미사를 인코딩할 때 인필링 작업에 사용되지 않으며, 각 프롬프트의 맨 앞에서만 사용됩니다.
+- 토크나이저는 [SentencePiece](https://github.com/google/sentencepiece)를 기반으로 하는 byte-pair 인코딩 모델입니다. 디코딩 과정에서 첫 번째 토큰이 단어의 시작인 경우(예를 들어 "Banana"), 토크나이저는 문자열에 접두사 공백을 추가하지 않습니다.
+
+## CodeLlamaTokenizer
+
+[[autodoc]] CodeLlamaTokenizer
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - save_vocabulary
+
+## CodeLlamaTokenizerFast
+
+[[autodoc]] CodeLlamaTokenizerFast
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - update_post_processor
+    - save_vocabulary
--- a/docs/source/ko/model_doc/llama4.md
+++ b/docs/source/ko/model_doc/llama4.md
@ -0,0 +1,443 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+*Meta는 이 모델을 2025-04-05에 출시하고 같은 날 Hugging Face Transformers에 추가했습니다.*
+
+# Llama4[[llama4]]
+
+
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
+    </div>
+</div>
+
+Meta에서 개발한 [Llama 4](https://ai.meta.com/blog/llama-4-multimodal-intelligence/)는 새로운 자기회귀 Mixture-of-Experts (MoE) 아키텍처를 도입합니다.
+이 세대는 두 가지 모델로 나뉩니다:
+- 128개의 전문가(expert)를 사용하여 총 약 400B 매개변수 중 17B 활성 매개변수를 갖는 고성능 Llama 4 Maverick
+- 16개의 전문가만 사용하여 총 약 109B 매개변수 중 17B 활성 매개변수를 갖는 경량화된 Llama 4 Scout
+
+두 모델 모두 네이티브 멀티모달을 위한 초기 융합(early fusion)을 활용하여 텍스트와 이미지 입력을 처리할 수 있습니다.
+Maverick과 Scout 모두 200개 언어를 포함하는 데이터에서 최대 40조개의 토큰으로 훈련되었습니다.
+(아랍어, 스페인어, 독일어, 힌디어를 포함한 12개 언어에 대한 특정 미세 조정 지원 포함)
+
+Meta는 Llama 4 Scout을 누구나 쉽게 사용할 수 있도록 설계했습니다. Scout은 4비트 또는 8비트 양자화를 적용하면 단일 서버급 GPU에서도 실시간으로 실행할 수 있습니다. 반면, 더 대규모인 Llama 4 Maverick은 고성능 연산을 위해 BF16과 FP8 형식으로 제공합니다.
+이 모델들은 모델 저장소에서 제공되는 사용자 지정 Llama 4 커뮤니티 라이선스 계약에 따라 출시됩니다.
+
+모든 원본 Llama 체크포인트는 Hugging Face의 [meta-llama](https://huggingface.co/meta-llama) 페이지에서 확인하실 수 있습니다.
+
+> [!TIP]
+> Llama 4 모델 패밀리는 두 가지 형태로 제공됩니다: 109B와 402B 매개변수입니다. 이 두 형태 모두 매우 큰 모델이며
+> 일반적인 기기에서는 실행할 수 없습니다. 아래에 메모리 사용량을 줄이는 방법 몇 가지를 정리했습니다.
+>
+> 더욱 빠르고 안정적인 다운로드를 위해 `hf_xet` 종속성 설치를 권장합니다:
+> `pip install transformers[hf_xet]`
+
+아래 예시들은 [`Pipeline`] 또는 [`AutoModel`]로 생성하는 방법을 보여줍니다. 또한 일부 Llama 4 변형이
+최대 1천만 토큰의 컨텍스트 길이를 갖기 때문에, 매우 긴 컨텍스트 생성을 활성화하기 위해 올바른 속성을 토글하는 방법을 보여주는 예시도 추가했습니다.
+
+
+<hfoptions id="usage">
+<hfoption id="Pipeline">
+
+```py
+from transformers import pipeline
+import torch
+
+model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+
+messages = [
+    {"role": "user", "content": "마요네즈 레시피가 무엇인가요?"},
+]
+
+pipe = pipeline(
+    "text-generation",
+    model=model_id,
+    device_map="auto",
+    dtype=torch.bfloat16
+)
+
+output = pipe(messages, do_sample=False, max_new_tokens=200)
+print(output[0]["generated_text"][-1]["content"])
+```
+
+</hfoption>
+<hfoption id="AutoModel - Text only">
+
+```py
+from transformers import AutoTokenizer, Llama4ForConditionalGeneration
+import torch
+
+model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+messages = [
+    {"role": "user", "content": "당신은 누구신가요?"},
+]
+inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True)
+
+model = Llama4ForConditionalGeneration.from_pretrained(
+    model_id,
+    device_map="auto",
+    dtype=torch.bfloat16
+)
+
+outputs = model.generate(**inputs.to(model.device), max_new_tokens=100)
+outputs = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:])
+print(outputs[0])
+```
+
+</hfoption>
+<hfoption id="AutoModel - Multimodal">
+
+```py
+from transformers import AutoProcessor, Llama4ForConditionalGeneration
+import torch
+
+model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+
+processor = AutoProcessor.from_pretrained(model_id)
+model = Llama4ForConditionalGeneration.from_pretrained(
+    model_id,
+    device_map="auto",
+    dtype=torch.bfloat16,
+)
+
+img_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image", "url": img_url},
+            {"type": "text", "text": "이 이미지를 두 문장으로 설명해주세요."},
+        ]
+    },
+]
+
+inputs = processor.apply_chat_template(
+    messages,
+    add_generation_prompt=True,
+    tokenize=True,
+    return_dict=True,
+    return_tensors="pt",
+).to(model.device)
+
+outputs = model.generate(
+    **inputs,
+    max_new_tokens=256,
+)
+
+response = processor.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:])[0]
+print(response)
+```
+
+</hfoption>
+<hfoption id="AutoModel - Multimodal with multiple images">
+
+```py
+from transformers import AutoProcessor, Llama4ForConditionalGeneration
+import torch
+
+model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+
+processor = AutoProcessor.from_pretrained(model_id)
+model = Llama4ForConditionalGeneration.from_pretrained(
+    model_id,
+    device_map="auto",
+    dtype=torch.bfloat16,
+)
+
+url1 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
+url2 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png"
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image", "url": url1},
+            {"type": "image", "url": url2},
+            {"type": "text", "text": "이 두 이미지가 어떻게 비슷하고, 어떻게 다른지 설명해주실 수 있나요?"},
+        ]
+    },
+]
+
+inputs = processor.apply_chat_template(
+    messages,
+    add_generation_prompt=True,
+    tokenize=True,
+    return_dict=True,
+    return_tensors="pt",
+).to(model.device)
+
+outputs = model.generate(
+    **inputs,
+    max_new_tokens=256,
+)
+
+response = processor.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:])[0]
+print(response)
+```
+
+</hfoption>
+<hfoption id="AutoModel - Long context">
+
+주의: 아래 예시는 `device_map="auto"`와 flex-attention을 모두 사용합니다.
+이 예시를 텐서 병렬 모드로 실행하려면 `torchrun`을 사용하세요.
+
+향후 텐서 병렬 없이 `device_map="auto"`와 flex-attention을 함께 실행할 수 있도록
+작업할 예정입니다.
+
+```py
+from transformers import Llama4ForConditionalGeneration, AutoTokenizer, infer_device
+import torch
+import time
+
+file = "very_long_context_prompt.txt"
+model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+
+with open(file, "r") as f:
+    very_long_text = "\n".join(f.readlines())
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = Llama4ForConditionalGeneration.from_pretrained(
+    model_id,
+    device_map="auto",
+    attn_implementation="flex_attention",
+    dtype=torch.bfloat16
+)
+
+messages = [
+    {"role": "user", "content": f"다음 텍스트들을 보세요: [{very_long_text}]\n\n\n\n책들은 무엇이며, 누가 썼나요? 좋은 목록을 만들어주세요."},
+]
+input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
+
+device = infer_device()
+torch_device_module = getattr(torch, device, torch.cuda)
+torch_device_module.synchronize()
+start = time.time()
+out = model.generate(
+    input_ids.to(model.device),
+    prefill_chunk_size=2048*8,
+    max_new_tokens=300,
+    cache_implementation="hybrid",
+)
+print(time.time()-start)
+print(tokenizer.batch_decode(out[:, input_ids.shape[-1]:]))
+print(f"{torch_device_module.max_memory_allocated(model.device) / 1024**3:.2f} GiB")
+```
+
+</hfoption>
+</hfoptions>
+
+## 효율성; Llama 4의 최대 성능 활용하기[[efficiency-how-to-get-the-best-out-of-llama-4]]
+
+### 어텐션 방법[[the-attention-methods]]
+
+기본 설정으로 주어지는 어텐션 함수를 변경하면 계산 성능과 메모리 사용량을 크게 개선할 수 있습니다. 인터페이스에 대한 자세한 설명은 [어텐션 인터페이스](../attention_interface) 개요를 참조하세요.
+
+Llama 4 모델은 처음 공개될 때부터 다음 어텐션 방식을 지원합니다: `eager`, `flex_attention`, `sdpa`. 최상의 결과를 위해 `flex_attention` 사용을 권장합니다.
+어텐션 메커니즘 전환은 모델을 초기화할 때 이루어집니다:
+
+
+<hfoptions id="Attention">
+<hfoption id="Flex Attention">
+
+Flex Attention은 모델이 긴 컨텍스트를 처리할 때 최적의 성능을 발휘합니다.
+
+> [!TIP]
+> 주의: 아래 예시는 `device_map="auto"`와 flex-attention을 모두 사용합니다.
+> 이 예시를 텐서 병렬 모드로 실행하려면 `torchrun`을 사용하세요.
+>
+> 향후 텐서 병렬 없이 `device_map="auto"`와 flex-attention을 함께 실행할 수 있도록
+> 작업할 예정입니다.
+
+```py
+from transformers import Llama4ForConditionalGeneration
+import torch
+
+model = Llama4ForConditionalGeneration.from_pretrained(
+    model_id,
+    attn_implementation="flex_attention",
+    device_map="auto",
+    dtype=torch.bfloat16,
+)
+```
+</hfoption>
+<hfoption id="SDPA">
+`sdpa` 어텐션 방법은 일반적으로 `eager` 방법보다 계산 효율적입니다.
+
+```py
+from transformers import Llama4ForConditionalGeneration
+import torch
+
+model = Llama4ForConditionalGeneration.from_pretrained(
+    model_id,
+    attn_implementation="sdpa",
+    device_map="auto",
+    dtype=torch.bfloat16,
+)
+```
+</hfoption>
+<hfoption id="Eager">
+`eager` 어텐션 방법이 기본으로 설정되어 있으므로 모델 로드 시 다른 설정이 필요하지 않습니다:
+
+```py
+from transformers import Llama4ForConditionalGeneration
+import torch
+
+model = Llama4ForConditionalGeneration.from_pretrained(
+    model_id,
+    device_map="auto",
+    dtype=torch.bfloat16,
+)
+```
+</hfoption>
+</hfoptions>
+
+
+### 양자화[[quantization]]
+
+양자화는 가중치를 더 낮은 정밀도로 바꿔 대형 모델의 메모리 부담을 줄입니다. 사용 가능한 양자화 백엔드에 대해서는 [양자화](../quantization/overview) 개요를 참조하세요.
+현재는 FBGEMM과 LLM-Compressor를 지원하며, 곧 더 많은 방식이 추가될 예정입니다.
+
+두 가지 방법을 사용하는 예시를 아래에서 확인하세요:
+
+
+
+다음은 FBGEMM 접근법을 사용하여 BF16 모델을 FP8로 로드하는 예시입니다:
+
+<hfoptions id="Quantization">
+<hfoption id="FBGEMM">
+
+```python
+from transformers import AutoTokenizer, Llama4ForConditionalGeneration, FbgemmFp8Config
+import torch
+
+model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+messages = [
+    {"role": "user", "content": "당신은 누구신가요?"},
+]
+inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True)
+
+model = Llama4ForConditionalGeneration.from_pretrained(
+    model_id,
+    device_map="auto",
+    dtype=torch.bfloat16,
+    quantization_config=FbgemmFp8Config()
+)
+
+outputs = model.generate(**inputs.to(model.device), max_new_tokens=100)
+outputs = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:])
+print(outputs[0])
+```
+
+</hfoption>
+<hfoption id="LLM-Compressor">
+
+LLM-Compressor를 사용할 때는 함께 제공되는 사전 양자화된 FP8 체크포인트를 쓰는 것이 좋습니다:
+
+```python
+from transformers import AutoTokenizer, Llama4ForConditionalGeneration
+import torch
+
+model_id = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+messages = [
+    {"role": "user", "content": "당신은 누구신가요?"},
+]
+inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True)
+
+model = Llama4ForConditionalGeneration.from_pretrained(
+    model_id,
+    tp_plan="auto",
+    dtype=torch.bfloat16,
+)
+
+outputs = model.generate(**inputs.to(model.device), max_new_tokens=100)
+outputs = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:])
+print(outputs[0])
+```
+</hfoption>
+</hfoptions>
+
+### 오프로딩[[offloading]]
+
+CPU 오프로딩을 활성화하면, GPU 메모리가 부족할 때 모델이 구성 요소를 CPU로 이동시킵니다.
+추론 시 다양한 구성 요소들이 GPU와 CPU 간에 동적으로 로드되고 언로드됩니다. 이를 통해 CPU 메모리가 충분한 한 더 작은 머신에서도 모델을 로드할 수 있습니다.
+다만 통신 오버헤드로 인해 추론 속도가 느려질 수 있습니다.
+
+CPU 오프로딩을 활성화하려면 모델 로드 시 `device_map`을 `auto`로 지정하면 됩니다.
+
+```py
+from transformers import Llama4ForConditionalGeneration
+import torch
+
+model = Llama4ForConditionalGeneration.from_pretrained(
+    model_id,
+    device_map="auto",
+    dtype=torch.bfloat16,
+)
+```
+
+## Llama4Config
+
+[[autodoc]] Llama4Config
+
+## Llama4TextConfig
+
+[[autodoc]] Llama4TextConfig
+
+## Llama4VisionConfig
+
+[[autodoc]] Llama4VisionConfig
+
+## Llama4Processor
+
+[[autodoc]] Llama4Processor
+
+## Llama4ImageProcessorFast
+
+[[autodoc]] Llama4ImageProcessorFast
+
+## Llama4ForConditionalGeneration
+
+[[autodoc]] Llama4ForConditionalGeneration
+- forward
+
+## Llama4ForCausalLM
+
+[[autodoc]] Llama4ForCausalLM
+- forward
+
+## Llama4TextModel
+
+[[autodoc]] Llama4TextModel
+- forward
+
+## Llama4VisionModel
+
+[[autodoc]] Llama4VisionModel
+- forward
--- a/docs/source/ko/model_doc/sam_hq.md
+++ b/docs/source/ko/model_doc/sam_hq.md
@ -0,0 +1,141 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+*이 모델은 2023-06-02에 발표되었으며 2025-04-28에 Hugging Face Transformers에 추가되었습니다.*
+
+# SAM-HQ[[sam_hq]]
+
+## 개요[[overview]]
+
+SAM-HQ (High-Quality Segment Anything Model)는 Lei Ke, Mingqiao Ye, Martin Danelljan, Yifan Liu, Yu-Wing Tai, Chi-Keung Tang, Fisher Yu가 제안한 [Segment Anything in High Quality](https://huggingface.co/papers/2306.01567) 논문에서 소개되었습니다.
+
+이 모델은 기존 SAM(Segment Anything Model)의 향상된 버전입니다. SAM-HQ는 SAM의 핵심 장점인 프롬프트 기반 설계, 효율성, 제로샷 일반화 능력을 그대로 유지하면서도 훨씬 더 높은 품질의 분할 마스크를 생성하는 것이 특징입니다.
+
+![example image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-output.png)
+
+SAM-HQ는 기존 SAM 모델 대비 다음과 같은 5가지 핵심 개선 사항을 도입했습니다.
+
+1. 고품질 출력 토큰: SAM-HQ는 SAM의 마스크 디코더에 학습 가능한 토큰을 주입합니다. 이 토큰은 모델이 더 높은 품질의 분할 마스크를 예측하도록 돕는 핵심적인 요소입니다.
+2. 전역-지역 특징 융합: 모델의 서로 다른 단계에서 추출된 특징들을 결합하여 분할 마스크의 세부적인 정확도를 향상시킵니다. 이미지의 전체적인 맥락 정보와 객체의 미세한 경계 정보를 함께 활용하여 마스크 품질을 개선합니다.
+3. 훈련 데이터 개선: SAM 모델이 SA-1B와 같은 대규모 데이터를 사용한 것과 달리, SAM-HQ는 신중하게 선별된 44,000개의 고품질 마스크로 구성된 데이터셋을 사용하여 훈련됩니다.
+4. 높은 효율성: 마스크 품질을 상당히 개선했음에도 불구하고, 추가된 매개변수는 단 0.5%에 불과합니다.
+5. 제로샷 성능: SAM-HQ는 성능이 개선되었음에도 불구하고, SAM 모델의 강력한 제로샷 일반화 능력을 그대로 유지합니다.
+
+논문 초록 내용:
+
+*최근 발표된 SAM(Segment Anything Model)은 분할 모델의 규모를 확장하는 데 있어 획기적인 발전이며, 강력한 제로샷 기능과 유연한 프롬프트 기능을 제공합니다. 하지만 SAM은 11억 개의 마스크로 훈련되었음에도 불구하고, 특히 복잡하고 정교한 구조를 가진 객체를 분할할 때 마스크 예측 품질이 미흡한 경우가 많습니다. 저희는 HQ-SAM을 제안하며, SAM의 기존 장점인 프롬프트 기반 설계, 효율성, 제로샷 일반화 능력을 모두 유지하면서도 어떤 객체든 정확하게 분할할 수 있는 능력을 부여합니다. 저희는 신중한 설계를 통해 SAM의 사전 훈련된 모델 가중치를 재사용하고 보존하며 최소한의 추가적인 매개변수와 연산만을 도입했습니다. 핵심적으로 저희는 학습 가능한 고품질 출력 토큰을 설계했습니다. 이 토큰은 SAM의 마스크 디코더에 주입되어 고품질 마스크를 예측하는 역할을 담당합니다. 마스크의 세부 사항을 개선하기 위해 이 토큰을 마스크 디코더 특징에만 적용하는 것이 아니라 초기 및 최종 ViT 특징과 먼저 융합하여 사용합니다. 도입된 학습 가능한 매개변수를 훈련하기 위해 저희는 여러 출처에서 가져온 44,000개의 미세 조정된 마스크 데이터셋을 구성했습니다. HQ-SAM은 오직 이 44,000개 마스크 데이터셋만으로 훈련되며 GPU 8대를 사용했을 때 단 4시간이 소요됩니다.*
+
+SAM-HQ 사용 팁:
+
+- SAM-HQ는 기존 SAM 모델보다 더 높은 품질의 마스크를 생성하며, 특히 복잡한 구조와 미세한 세부 사항을 가진 객체에 대해 성능이 우수합니다.
+- 이 모델은 더욱 정확한 경계와 얇은 구조에 대한 더 나은 처리 능력을 갖춘 이진 마스크를 예측합니다.
+- SAM과 마찬가지로 모델은 입력으로 2차원 포인트 및 바운딩 박스를 사용할 때 더 좋은 성능을 보입니다.
+- 하나의 이미지에 대해 다수의 포인트를 프롬프트로 입력하여 단일의 고품질 마스크를 예측할 수 있습니다.
+- 이 모델은 SAM의 제로샷 일반화 능력을 그대로 유지합니다.
+- SAM-HQ는 SAM 대비 약 0.5%의 추가 매개변수만을 가집니다.
+- 현재 모델의 미세 조정은 지원되지 않습니다.
+
+이 모델은 [sushmanth](https://huggingface.co/sushmanth)님께서 기여해주셨습니다.
+원본 코드는 [여기](https://github.com/SysCV/SAM-HQ)에서 확인하실 수 있습니다.
+
+아래는 이미지와 2차원 포인트가 주어졌을 때, 마스크를 생성하는 방법에 대한 예시입니다.
+
+```python
+import torch
+from PIL import Image
+import requests
+from transformers import infer_device, SamHQModel, SamHQProcessor
+
+device = infer_device()
+model = SamHQModel.from_pretrained("syscv-community/sam-hq-vit-base").to(device)
+processor = SamHQProcessor.from_pretrained("syscv-community/sam-hq-vit-base")
+
+img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
+raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
+input_points = [[[450, 600]]]  # 이미지 내 창문의 2차원 위치
+
+inputs = processor(raw_image, input_points=input_points, return_tensors="pt").to(model.device)
+with torch.no_grad():
+    outputs = model(**inputs)
+
+masks = processor.image_processor.post_process_masks(
+    outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()
+)
+scores = outputs.iou_scores
+```
+
+또한, 프로세서에서 입력 이미지와 함께 사용자의 마스크를 직접 처리하여 모델에 전달할 수도 있습니다.
+
+```python
+import torch
+from PIL import Image
+import requests
+from transformers import infer_device, SamHQModel, SamHQProcessor
+
+device = infer_device()
+model = SamHQModel.from_pretrained("syscv-community/sam-hq-vit-base").to(device)
+processor = SamHQProcessor.from_pretrained("syscv-community/sam-hq-vit-base")
+
+img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
+raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
+mask_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
+segmentation_map = Image.open(requests.get(mask_url, stream=True).raw).convert("1")
+input_points = [[[450, 600]]]  # 이미지 내 창문의 2차원 위치
+
+inputs = processor(raw_image, input_points=input_points, segmentation_maps=segmentation_map, return_tensors="pt").to(model.device)
+with torch.no_grad():
+    outputs = model(**inputs)
+
+masks = processor.image_processor.post_process_masks(
+    outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()
+)
+scores = outputs.iou_scores
+```
+
+## 자료[[resources]]
+
+다음은 SAM-HQ 사용을 시작하는 데 도움이 되는 공식 Hugging Face 및 커뮤니티 (🌎로 표시) 자료 목록입니다.
+
+- 모델 사용을 위한 데모 노트북 (출시 예정)
+- 논문 구현 및 코드: [SAM-HQ 깃허브 저장소](https://github.com/SysCV/SAM-HQ)
+
+## SamHQConfig[[transformers.SamHQConfig]]
+
+[[autodoc]] SamHQConfig
+
+## SamHQVisionConfig[[transformers.SamHQVisionConfig]]
+
+[[autodoc]] SamHQVisionConfig
+
+## SamHQMaskDecoderConfig[[transformers.SamHQMaskDecoderConfig]]
+
+[[autodoc]] SamHQMaskDecoderConfig
+
+## SamHQPromptEncoderConfig[[transformers.SamHQPromptEncoderConfig]]
+
+[[autodoc]] SamHQPromptEncoderConfig
+
+## SamHQProcessor[[transformers.SamHQProcessor]]
+
+[[autodoc]] SamHQProcessor
+
+## SamHQVisionModel[[transformers.SamHQVisionModel]]
+
+[[autodoc]] SamHQVisionModel
+
+## SamHQModel[[transformers.SamHQModel]]
+
+[[autodoc]] SamHQModel
+    - forward
--- a/src/transformers/generation/continuous_batching/continuous_api.py
+++ b/src/transformers/generation/continuous_batching/continuous_api.py
@ -929,14 +929,6 @@ class ContinuousBatchingManager:
            if self.batch_processor is not None:
                request_cancelled = self.batch_processor.scheduler.request_is_cancelled(request_id)

-    @staticmethod
-    def supported_attention_implementations() -> set[str]:
-        return {"eager_paged", "sdpa_paged", "flash_attention_2"}
-
-    @staticmethod
-    def default_attention_implementation() -> str:
-        return "sdpa_paged"
-
    @traced
    def warmup(self, batch_processor: ContinuousBatchProcessor) -> None:
        stream = torch.cuda.Stream(device=self.model.device)
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@ -557,7 +557,7 @@ def _infer_parameter_dtype(
    is_param_float8_e4m3fn = is_torch_e4m3fn_available and empty_param.dtype == torch.float8_e4m3fn
    if empty_param.dtype.is_floating_point and not is_param_float8_e4m3fn:
        # dtype that was instantiated in the meta model -- note that this respects subconfigs dtypes
-        if hf_quantizer is not None:
+        if hf_quantizer is not None and hf_quantizer.param_needs_quantization(model, param_name):
            casting_dtype = model.config._pre_quantization_dtype
        else:
            casting_dtype = old_param.dtype
@ -2419,30 +2419,30 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
        if applicable_attention not in ["eager"] + ALL_ATTENTION_FUNCTIONS.valid_keys():
            message = (
                f'Specified `attn_implementation="{applicable_attention}"` is not supported. The only possible arguments are '
-                '`attn_implementation="eager"`'
+                '`attn_implementation="eager"`, `"paged|eager"`'
            )
            # check `supports_flash_attn_2` for BC with custom code. TODO: remove after a few releases
            if self._supports_flash_attn or getattr(self, "_supports_flash_attn_2", False):
-                message += ', `"attn_implementation=flash_attention_3"`, `"attn_implementation=flash_attention_2"`'
+                message += ', `"attn_implementation=flash_attention_3"`, `"attn_implementation=flash_attention_2"`, `"attn_implementation=paged|flash_attention_2"`'
            if self._supports_sdpa:
-                message += ', `"attn_implementation=sdpa"'
+                message += ', `"attn_implementation=sdpa"`, `"attn_implementation=paged|sdpa"`'
            if self._supports_flex_attn:
                message += ', `"attn_implementation=flex_attention"`'
            raise ValueError(message + ".")

        # Perform relevant checks
-        if applicable_attention == "flash_attention_2":
+        if "flash_attention_2" in applicable_attention:
            self._flash_attn_2_can_dispatch(is_init_check)
-        elif applicable_attention == "flash_attention_3":
+        elif "flash_attention_3" in applicable_attention:
            self._flash_attn_3_can_dispatch(is_init_check)
-        elif applicable_attention == "flex_attention":
+        elif "flex_attention" in applicable_attention:
            self._flex_attn_can_dispatch(is_init_check)
-        elif applicable_attention == "sdpa":
+        elif "sdpa" in applicable_attention:
            # Sdpa is the default, so we try it and fallback to eager otherwise when not possible
            try:
                self._sdpa_can_dispatch(is_init_check)
            except (ValueError, ImportError) as e:
-                if requested_attention == "sdpa":
+                if requested_attention is not None and "sdpa" in requested_attention:
                    raise e
                applicable_attention = "eager"

@ -4430,6 +4430,12 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
                    "loaded from GGUF files."
                )

+        if kernel_config is not None and not use_kernels:
+            logger.warning_once(
+                "A kernel_config was provided but use_kernels is False; setting use_kernels=True automatically. To suppress this warning, explicitly set use_kernels to True."
+            )
+            use_kernels = True
+
        checkpoint_files, sharded_metadata = _get_resolved_checkpoint_files(
            pretrained_model_name_or_path=pretrained_model_name_or_path,
            variant=variant,
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@ -738,6 +738,10 @@ class Trainer:
        self._train_batch_size = args.train_batch_size
        self._created_lr_scheduler = False

+        # Set use_cache for the model
+        if getattr(self.model, "config", None) is not None:
+            self.model.config.use_cache = self.args.use_cache
+
        # very last
        self._memory_tracker.stop_and_update_metrics()

--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@ -752,6 +752,10 @@ class TrainingArguments:
            Whether or not to average tokens across devices. If enabled, will use all_reduce to synchronize
            num_tokens_in_batch for precise loss calculation. Reference:
            https://github.com/huggingface/transformers/issues/34242
+
+        use_cache (`bool`, *optional*, defaults to `False`):
+            Whether or not to enable cache for the model. For training, this is usually not needed apart from some PEFT methods that use `past_key_values`.
+
    """

    # Sometimes users will pass in a `str` repr of a dict in the CLI
@ -1382,6 +1386,13 @@ class TrainingArguments:
        },
    )

+    use_cache: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether or not to use cache for the model. For training, this is usually not needed apart from some PEFT methods that use `past_key_values`."
+        },
+    )
+
    def __post_init__(self):
        # Set default output_dir if not provided
        if self.output_dir is None:
--- a/src/transformers/utils/import_utils.py
+++ b/src/transformers/utils/import_utils.py
@ -52,7 +52,10 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[
        try:
            # importlib.metadata works with the distribution package, which may be different from the import
            # name (e.g. `PIL` is the import name, but `pillow` is the distribution name)
-            distribution_name = PACKAGE_DISTRIBUTION_MAPPING[pkg_name][0]
+            distributions = PACKAGE_DISTRIBUTION_MAPPING[pkg_name]
+            # In most cases, the packages are well-behaved and both have the same name. If it's not the case, we
+            # pick the first item of the list as best guess (it's almost always a list of length 1 anyway)
+            distribution_name = pkg_name if pkg_name in distributions else distributions[0]
            package_version = importlib.metadata.version(distribution_name)
        except importlib.metadata.PackageNotFoundError:
            # If we cannot find the metadata (because of editable install for example), try to import directly.
@ -1167,6 +1170,13 @@ def is_mistral_common_available() -> bool:
    return _is_package_available("mistral_common")


+@lru_cache
+def is_opentelemetry_available() -> bool:
+    return _is_package_available("opentelemetry") and version.parse(
+        importlib.metadata.version("opentelemetry-api")
+    ) >= version.parse("1.30.0")
+
+
 def check_torch_load_is_safe() -> None:
    if not is_torch_greater_or_equal("2.6"):
        raise ValueError(
--- a/src/transformers/utils/metrics.py
+++ b/src/transformers/utils/metrics.py
@ -5,6 +5,8 @@ from collections.abc import Callable
 from enum import Enum
 from typing import Any, Optional, Union

+from .import_utils import is_opentelemetry_available
+

 class RequestStatus(Enum):
    """Status of a generation request through its lifecycle."""
@ -18,12 +20,12 @@ class RequestStatus(Enum):
    FAILED = "failed"


-try:
+if is_opentelemetry_available():
    from opentelemetry import metrics
    from opentelemetry.trace import Status, StatusCode, get_tracer

    _has_opentelemetry = True
-except ImportError:
+else:
    _has_opentelemetry = False


@ -183,7 +185,10 @@ class ContinuousBatchProcessorMetrics:
        """Initialize OpenTelemetry metrics and tracing if the library is available."""

        if not _has_opentelemetry:
-            logger.info("OpenTelemetry is not installed. Metrics and tracing will not be recorded.")
+            logger.info(
+                "OpenTelemetry is not installed. Metrics and tracing will not be recorded. "
+                "You can install it with `pip install opentelemetry-api>=1.30.0`"
+            )
            return

        self.meter = metrics.get_meter("transformers.generation.continuous_batch_processor")
--- a/tests/models/vit/test_modeling_vit.py
+++ b/tests/models/vit/test_modeling_vit.py
@ -219,6 +219,7 @@ class ViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
        super().test_multi_gpu_data_parallel_forward()

    def test_config(self):
+        assert 1 == 2
        self.config_tester.run_common_tests()

    @unittest.skip(reason="ViT does not use inputs_embeds")
--- a/utils/check_bad_commit.py
+++ b/utils/check_bad_commit.py
@ -15,6 +15,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
+import git
 import json
 import os
 import re
@ -38,30 +39,34 @@ def create_script(target_test):
 import os
 import subprocess

+_ = subprocess.run(
+    ["python3", "-m", "pip", "install", "-e", "."],
+    capture_output = True,
+    text=True,
+)
+
 result = subprocess.run(
-    ["python3", "-m", "pytest", "-v", "-rfEp", f"{target_test}"],
+    ["python3", "-m", "pytest", "-v", "--flake-finder", "--flake-runs=4", "-rfEp", f"{target_test}"],
    capture_output = True,
    text=True,
 )
 print(result.stdout)

-if f"PASSED {target_test}" in result.stdout:
-    print("test passed")
-    exit(0)
-elif len(result.stderr) > 0:
+if f"FAILED {target_test}" in result.stdout:
+    print("test failed")
+    exit(1)
+elif result.returncode != 0:
    if "ERROR: file or directory not found: " in result.stderr:
        print("test file or directory not found in this commit")
-        exit(0)
+        exit(125)
    elif "ERROR: not found: " in result.stderr:
        print("test not found in this commit")
-        exit(0)
+        exit(125)
    else:
-        print(f"pytest failed to run: {{result.stderr}}")
+        print(f"pytest gets unknown error: {{result.stderr}}")
        exit(-1)
-elif f"FAILED {target_test}" in result.stdout:
-    print("test failed")
-    exit(2)

+print(f"pytest runs successfully.")
 exit(0)
 """

@ -69,20 +74,57 @@ exit(0)
        fp.write(script.strip())


+def is_bad_commit(target_test, commit):
+    repo = git.Repo('.')  # or specify path to your repo
+
+    # Save the current HEAD reference
+    original_head = repo.head.commit
+
+    # Checkout to the commit
+    repo.git.checkout(commit)
+
+    create_script(target_test=target_test)
+
+    result = subprocess.run(
+        ["python3", "target_script.py"],
+        capture_output=True,
+        text=True,
+    )
+
+    # Restore to original commit
+    repo.git.checkout(original_head)
+
+    return result.returncode != 0
+
+
 def find_bad_commit(target_test, start_commit, end_commit):
-    """Find (backward) the earliest commit between `start_commit` and `end_commit` at which `target_test` fails.
+    """Find (backward) the earliest commit between `start_commit` (inclusive) and `end_commit` (exclusive) at which `target_test` fails.

    Args:
        target_test (`str`): The test to check.
-        start_commit (`str`): The latest commit.
-        end_commit (`str`): The earliest commit.
+        start_commit (`str`): The latest commit (inclusive).
+        end_commit (`str`): The earliest commit (exclusive).

    Returns:
        `str`: The earliest commit at which `target_test` fails.
    """

+    # check if `end_commit` fails the test
+    failed_before = is_bad_commit(target_test, end_commit)
+    if failed_before:
+        return None
+
+    # if there is no new commit (e.g. 2 different CI runs on the same commit):
+    #   - failed once on `start_commit` but passed on `end_commit`, which are the same commit --> flaky (or something changed externally) --> don't report
    if start_commit == end_commit:
-        return start_commit
+        return None
+
+    # Now, we are (almost) sure `target_test` is not failing at `end_commit`
+    # check if `start_commit` fails the test
+    failed_now = is_bad_commit(target_test, start_commit)
+    if not failed_now:
+        # failed on CI run, but not reproducible here --> don't report
+        return None

    create_script(target_test=target_test)

@ -103,22 +145,11 @@ git bisect run python3 target_script.py
    )
    print(result.stdout)

+    # This happens if running the script gives exit code < 0  or other issues
    if "error: bisect run failed" in result.stderr:
-        index = result.stderr.find("error: bisect run failed")
-        bash_error = result.stderr[index:]
-
-        error_msg = f"Error when running git bisect:\nbash error: {bash_error}"
-
-        pattern = "pytest failed to run: .+"
-        pytest_errors = re.findall(pattern, result.stdout)
-        if len(pytest_errors) > 0:
-            pytest_error = pytest_errors[0]
-            index = pytest_error.find("pytest failed to run: ")
-            index += len("pytest failed to run: ")
-            pytest_error = pytest_error[index:]
-            error_msg += f"pytest error: {pytest_error}"
-
-        raise ValueError(error_msg)
+        error_msg = f"Error when running git bisect:\nbash error: {result.stderr}\nbash output:\n{result.stdout}\nset `bad_commit` to `None`."
+        print(error_msg)
+        return None

    pattern = r"(.+) is the first bad commit"
    commits = re.findall(pattern, result.stdout)
@ -135,6 +166,9 @@ git bisect run python3 target_script.py

 def get_commit_info(commit):
    """Get information for a commit via `api.github.com`."""
+    if commit is None:
+        return {"commit": None, "pr_number": None, "author": None, "merged_by": None}
+
    pr_number = None
    author = None
    merged_author = None
--- a/utils/split_model_tests.py
+++ b/utils/split_model_tests.py
@ -74,4 +74,5 @@ if __name__ == "__main__":
        end = start + num_jobs_per_splits + (1 if idx < num_jobs % args.num_splits else 0)
        model_splits.append(d[start:end])

+    model_splits = [["models/vit"], ["models/clip"]]
    print(model_splits)
Author	SHA1	Message	Date
ydshieh	740f952218	check 1	2025-10-17 06:57:10 +02:00
ydshieh	950c4e5303	check 1	2025-10-17 06:28:55 +02:00
ydshieh	89970f4797	check 1	2025-10-17 03:03:25 +02:00
ydshieh	a4a46e62a5	check 1	2025-10-16 21:32:04 +02:00
ydshieh	9b36498d5f	1	2025-10-16 21:16:53 +02:00
HyunSang Jang	eefbf4ac8b	🌐 [i18n-KO] Translated llama4.md to Korean (#40396 ) * docs: ko: llama4.md * feat: nmt draft * fix: manual edits * Update docs/source/ko/model_doc/llama4.md Co-authored-by: YONGSANG <71686691+4N3MONE@users.noreply.github.com> * Update docs/source/ko/model_doc/llama4.md Co-authored-by: YONGSANG <71686691+4N3MONE@users.noreply.github.com> * Update docs/source/ko/model_doc/llama4.md Co-authored-by: YONGSANG <71686691+4N3MONE@users.noreply.github.com> * Update docs/source/ko/model_doc/llama4.md Co-authored-by: YONGSANG <71686691+4N3MONE@users.noreply.github.com> --------- Co-authored-by: TaskerJang <bymyself103@naver.com> Co-authored-by: YONGSANG <71686691+4N3MONE@users.noreply.github.com>	2025-10-16 11:28:27 -07:00
Judy	50ca781d78	🌐 [i18n-KO] Translated `code_llama.md` to Korean (#40558 ) * docs: ko: code_llama.md * feat: nmt draft * fix: manual edits * Apply suggestions from code review Co-authored-by: Harheem Kim <49297157+harheem@users.noreply.github.com> Co-authored-by: HyunZ118 <156191095+HyunZ118@users.noreply.github.com> * Apply suggestions from code review Co-authored-by: Harheem Kim <49297157+harheem@users.noreply.github.com> --------- Co-authored-by: Harheem Kim <49297157+harheem@users.noreply.github.com> Co-authored-by: HyunZ118 <156191095+HyunZ118@users.noreply.github.com>	2025-10-16 11:27:46 -07:00
SSUM	8739fc05c4	[i18n-KO] Translated `big_bird.md` to Korean (#40445 ) * docs: ko: BigBird.md * feat: nmt draft * fix: manual edits	2025-10-16 11:23:56 -07:00
HyunZ118	77b5ad65ee	🌐 [i18n-KO] Translated sam_hq.md to Korean (#41340 ) * fix: manual edits * Apply suggestions from code review Apply suggestions from code review Co-authored-by: HyunSang Jang <tasker.dev103@gmail.com> * Apply suggestions from code review Apply suggestions from code review Co-authored-by: Woojun Jung <46880056+jungnerd@users.noreply.github.com> --------- Co-authored-by: HyunSang Jang <tasker.dev103@gmail.com> Co-authored-by: Woojun Jung <46880056+jungnerd@users.noreply.github.com>	2025-10-16 11:10:16 -07:00
Judy	a9731a725e	🌐 [i18n-KO] Translated `chat_extras.md` to Korean (#39863 ) * docs: ko: chat_extras.md * feat: nmt draft * fix: manual edits * Apply suggestions from code review * Apply suggestions from code review * Update docs/source/ko/chat_extras.md	2025-10-16 10:41:03 -07:00
Marc Sun	bdbc2d037b	[Trainer] [Breaking change] `use_cache` default to `False` (#41585 ) * use_cache default to `False` when training * style * Fix comment * add checks * style * set * switch	2025-10-16 18:51:36 +02:00
Mohamed Mekkouri	fe11cbb808	Erroring when KernelConfig is passed without use_kernels = True (#41657 ) * update * update	2025-10-16 18:08:46 +02:00
Yih-Dar	6344371a91	improve `utils/check_bad_commit.py` (#41658 ) * robust * robust * robust --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>	2025-10-16 15:51:19 +00:00
Cyril Vallez	a408384a88	Improve package version check (#41661 ) fix	2025-10-16 17:31:58 +02:00
Rémi Ouazan	f7c33abab3	Small changes to benchmarking script (#41662 )	2025-10-16 17:25:49 +02:00
Marc Sun	9839d57a02	Fix serving continuous batching (#41624 ) * udpate-serving-cb * style * style * check none * Apply suggestions from code review Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>	2025-10-16 17:24:21 +02:00
Cyril Vallez	e85d5ab2bb	Fix dtype casting with quantization (#41665 ) fix dtype casting	2025-10-16 17:19:32 +02:00