fix

2025-10-20 17:13:56 +08:00 · 2025-10-17 22:21:10 +02:00 · 2025-10-17 21:30:29 +02:00 · 2025-10-17 20:59:00 +02:00 · 2025-10-17 20:30:23 +02:00 · 2025-10-17 15:40:54 +02:00
14 changed files with 1404 additions and 111 deletions
--- a/.github/workflows/check_failed_tests.yml
+++ b/.github/workflows/check_failed_tests.yml
@ -41,9 +41,14 @@ env:

 jobs:
  check_new_failures:
-    name: " "
+    name: "Find commits for new failing tests"
+    strategy:
+      matrix:
+        run_idx: [1]
    runs-on:
      group: aws-g5-4xlarge-cache
+    outputs:
+      process: ${{ steps.check_file.outputs.process }}
    container:
      image: ${{ inputs.docker }}
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@ -54,14 +59,17 @@ jobs:
          path: /transformers/ci_results_${{ inputs.job }}

      - name: Check file
+        id: check_file
        working-directory: /transformers
        run: |
          if [ -f ci_results_${{ inputs.job }}/new_failures.json ]; then
            echo "`ci_results_${{ inputs.job }}/new_failures.json` exists, continue ..."
            echo "process=true" >> $GITHUB_ENV
+            echo "process=true" >> $GITHUB_OUTPUT
          else
            echo "`ci_results_${{ inputs.job }}/new_failures.json` doesn't exist, abort."
            echo "process=false" >> $GITHUB_ENV
+            echo "process=false" >> $GITHUB_OUTPUT
          fi

      - uses: actions/download-artifact@v4
@ -118,6 +126,10 @@ jobs:
        run: |
          python3 utils/print_env.py

+      - name: Install pytest-flakefinder
+        if: ${{ env.process == 'true' }}
+        run: python3 -m pip install pytest-flakefinder
+
      - name: Show installed libraries and their versions
        working-directory: /transformers
        if: ${{ env.process == 'true' }}
@ -126,25 +138,63 @@ jobs:
      - name: Check failed tests
        working-directory: /transformers
        if: ${{ env.process == 'true' }}
-        run: python3 utils/check_bad_commit.py --start_commit ${{ inputs.start_sha }} --end_commit ${{ env.END_SHA }} --file ci_results_${{ inputs.job }}/new_failures.json --output_file new_failures_with_bad_commit.json
+        run: python3 utils/check_bad_commit.py --start_commit ${{ inputs.start_sha }} --end_commit ${{ env.END_SHA }} --file ci_results_${{ inputs.job }}/new_failures.json --output_file new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json

      - name: Show results
        working-directory: /transformers
        if: ${{ env.process == 'true' }}
        run: |
-          ls -l new_failures_with_bad_commit.json
-          cat new_failures_with_bad_commit.json
+          ls -l new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json
+          cat new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json

-      - name: Checkout back
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}
+          path: /transformers/new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json
+
+  process_new_failures_with_commit_info:
+    name: "process bad commit reports"
+    needs: check_new_failures
+    if: needs.check_new_failures.outputs.process == 'true'
+    runs-on:
+      group: aws-g5-4xlarge-cache
+    container:
+      image: ${{ inputs.docker }}
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - uses: actions/download-artifact@v4
+        with:
+          name: ci_results_${{ inputs.job }}
+          path: /transformers/ci_results_${{ inputs.job }}
+
+      - uses: actions/download-artifact@v4
+        with:
+          pattern: new_failures_with_bad_commit_${{ inputs.job }}*
+          path: /transformers/new_failures_with_bad_commit_${{ inputs.job }}
+          merge-multiple: true
+
+      - name: Check files
        working-directory: /transformers
-        if: ${{ env.process == 'true' }}
        run: |
-          git checkout ${{ inputs.start_sha }}
+          ls -la /transformers
+          ls -la /transformers/new_failures_with_bad_commit_${{ inputs.job }}
+
+      # Currently, we only run with a single runner by using `run_idx: [1]`. We might try to run with multiple runners
+      # to further reduce the false positive caused by flaky tests, which requires further processing to merge reports.
+      - name: Merge files
+        shell: bash
+        working-directory: /transformers
+        run: |
+          cp /transformers/new_failures_with_bad_commit_${{ inputs.job }}/new_failures_with_bad_commit_${{ inputs.job }}_1.json new_failures_with_bad_commit.json
+
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}

      - name: Process report
        shell: bash
        working-directory: /transformers
-        if: ${{ env.process == 'true' }}
        env:
          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
          TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
@ -156,7 +206,6 @@ jobs:
      - name: Process report
        shell: bash
        working-directory: /transformers
-        if: ${{ env.process == 'true' }}
        env:
          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
          TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
@ -171,13 +220,12 @@ jobs:

      - name: Prepare Slack report title
        working-directory: /transformers
-        if: ${{ env.process == 'true' }}
        run: |
          pip install slack_sdk
          echo "title=$(python3 -c 'import sys; sys.path.append("utils"); from utils.notification_service import job_to_test_map; ci_event = "${{ inputs.ci_event }}"; job = "${{ inputs.job }}"; test_name = job_to_test_map[job]; title = f"New failed tests of {ci_event}" + ":" + f" {test_name}"; print(title)')" >> $GITHUB_ENV

      - name: Send processed report
-        if: ${{ env.process == 'true' && !endsWith(env.REPORT_TEXT, '{}') }}
+        if: ${{ !endsWith(env.REPORT_TEXT, '{}') }}
        uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
        with:
          # Slack channel id, channel name, or user id to post message.
--- a/.github/workflows/self-scheduled-caller.yml
+++ b/.github/workflows/self-scheduled-caller.yml
@ -6,7 +6,7 @@ on:
    - cron: "17 2 * * *"
  push:
    branches:
-      - run_nvidia_ci*
+      - multi_jobs_to_check_bad_commit
  workflow_dispatch:
    inputs:
      prev_workflow_run_id:
@ -23,7 +23,7 @@ on:

 # Used for `push` to easily modify the target workflow runs to compare against
 env:
-    prev_workflow_run_id: ""
+    prev_workflow_run_id: "18548615847"
    other_workflow_run_id: ""


@ -49,72 +49,10 @@ jobs:
    uses: ./.github/workflows/self-scheduled.yml
    with:
      job: run_models_gpu
-      slack_report_channel: "#transformers-ci-daily-models"
+      slack_report_channel: "#transformers-ci-dummy"
      docker: huggingface/transformers-all-latest-gpu
      ci_event: Daily CI
      runner_type: "a10"
      report_repo_id: hf-internal-testing/transformers_daily_ci
      commit_sha: ${{ github.sha }}
    secrets: inherit
-
-  torch-pipeline:
-    name: Torch pipeline CI
-    uses: ./.github/workflows/self-scheduled.yml
-    with:
-      job: run_pipelines_torch_gpu
-      slack_report_channel: "#transformers-ci-daily-pipeline-torch"
-      docker: huggingface/transformers-pytorch-gpu
-      ci_event: Daily CI
-      report_repo_id: hf-internal-testing/transformers_daily_ci
-      commit_sha: ${{ github.sha }}
-    secrets: inherit
-
-  example-ci:
-    name: Example CI
-    uses: ./.github/workflows/self-scheduled.yml
-    with:
-      job: run_examples_gpu
-      slack_report_channel: "#transformers-ci-daily-examples"
-      docker: huggingface/transformers-all-latest-gpu
-      ci_event: Daily CI
-      report_repo_id: hf-internal-testing/transformers_daily_ci
-      commit_sha: ${{ github.sha }}
-    secrets: inherit
-
-  trainer-fsdp-ci:
-    name: Trainer/FSDP CI
-    uses: ./.github/workflows/self-scheduled.yml
-    with:
-      job: run_trainer_and_fsdp_gpu
-      slack_report_channel: "#transformers-ci-daily-training"
-      docker: huggingface/transformers-all-latest-gpu
-      runner_type: "a10"
-      ci_event: Daily CI
-      report_repo_id: hf-internal-testing/transformers_daily_ci
-      commit_sha: ${{ github.sha }}
-    secrets: inherit
-
-  deepspeed-ci:
-    name: DeepSpeed CI
-    uses: ./.github/workflows/self-scheduled.yml
-    with:
-      job: run_torch_cuda_extensions_gpu
-      slack_report_channel: "#transformers-ci-daily-training"
-      docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
-      ci_event: Daily CI
-      working-directory-prefix: /workspace
-      report_repo_id: hf-internal-testing/transformers_daily_ci
-      commit_sha: ${{ github.sha }}
-    secrets: inherit
-
-  quantization-ci:
-    name: Quantization CI
-    uses: ./.github/workflows/self-scheduled.yml
-    with:
-      job: run_quantization_torch_gpu
-      slack_report_channel: "#transformers-ci-daily-quantization"
-      docker: huggingface/transformers-quantization-latest-gpu
-      ci_event: Daily CI
-      report_repo_id: hf-internal-testing/transformers_daily_ci
-      commit_sha: ${{ github.sha }}
-    secrets: inherit
--- a/docs/source/ko/_toctree.yml
+++ b/docs/source/ko/_toctree.yml
@ -84,6 +84,8 @@
      title: Transformers로 채팅하기
    - local: chat_templating
      title: 챗봇 템플릿 익히기
+    - local: chat_extras
+      title: Tools 와 RAG
    - local: in_translation
      title: (번역중) Multimodal templates
    - local: in_translation
@ -459,7 +461,7 @@
        title: BertJapanese
      - local: model_doc/bertweet
        title: BERTweet
-      - local: in_translation
+      - local: model_doc/big_bird
        title: BigBird
      - local: in_translation
        title: BigBirdPegasus
@ -483,7 +485,7 @@
        title: CANINE
      - local: model_doc/codegen
        title: CodeGen
-      - local: in_translation
+      - local: model_doc/code_llama
        title: CodeLlama
      - local: model_doc/cohere
        title: Cohere
@ -881,6 +883,8 @@
        title: SegFormer
      - local: in_translation
        title: SegGpt
+      - local: model_doc/sam_hq
+        title: Segment Anything High Quality (SAM-HQ)
      - local: in_translation
        title: SuperGlue
      - local: in_translation
@ -1097,7 +1101,7 @@
        title: LayoutXLM
      - local: in_translation
        title: LiLT
-      - local: in_translation
+      - local: model_doc/llama4
        title: Llama4
      - local: in_translation
        title: Llava
--- a/docs/source/ko/chat_extras.md
+++ b/docs/source/ko/chat_extras.md
@ -0,0 +1,299 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# 도구와 RAG[[Tools-and-RAG]]
+
+[`~PreTrainedTokenizerBase.apply_chat_template`] 메소드는 채팅 메시지 외에도 문자열, 리스트, 딕셔너리 등 거의 모든 종류의 추가 인수 타입을 지원합니다. 이를 통해 다양한 사용 상황에서 채팅 템플릿을 활용할 수 있습니다.
+
+이 가이드에서는 도구 및 검색 증강 생성(RAG)과 함께 채팅 템플릿을 사용하는 방법을 보여드립니다.
+
+## 도구[[Tools]]
+
+도구는 대규모 언어 모델(LLM)이 특정 작업을 수행하기 위해 호출할 수 있는 함수입니다. 이는 실시간 정보, 계산 도구 또는 대규모 데이터베이스 접근 등을 통해 대화형 에이전트의 기능을 확장하는 강력한 방법입니다.
+
+도구를 만들 때는 아래 규칙을 따르세요.
+
+1. 함수는 기능을 잘 설명하는 이름을 가져야 합니다.
+2. 함수의 인수는 함수 헤더에 타입 힌트를 포함해야 합니다(`Args` 블록에는 포함하지 마세요).
+3. 함수에는 [Google 스타일](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings) 의 독스트링(docstring)이 포함되어야 합니다.
+4. 함수에 반환 타입과 `Returns` 블록을 포함할 수 있지만, 도구를 활용하는 대부분의 모델에서 이를 사용하지 않기 때문에 무시할 수 있습니다.
+
+주어진 위치의 현재 온도와 풍속을 가져오는 도구의 예시는 아래와 같습니다.
+
+```py
+def get_current_temperature(location: str, unit: str) -> float:
+    """
+    주어진 위치의 현재 온도를 가져옵니다.
+    
+    Args:
+        location: 온도를 가져올 위치, "도시, 국가" 형식
+        unit: 온도를 반환할 단위. (선택지: ["celsius(섭씨)", "fahrenheit(화씨)"])
+    Returns:
+        주어진 위치의 지정된 단위로 표시된 현재 온도(float 자료형).
+    """
+    return 22.  # 실제 함수라면 아마 진짜로 기온을 가져와야겠죠!
+
+def get_current_wind_speed(location: str) -> float:
+    """
+    주어진 위치의 현재 풍속을 km/h 단위로 가져옵니다.
+    
+    Args:
+        location: 온도를 가져올 위치, "도시, 국가" 형식
+    Returns:
+        주어진 위치의 현재 풍속(km/h, float 자료형).
+    """
+    return 6.  # 실제 함수라면 아마 진짜로 풍속을 가져와야겠죠!
+
+tools = [get_current_temperature, get_current_wind_speed]
+```
+
+[NousResearch/Hermes-2-Pro-Llama-3-8B](https://hf.co/NousResearch/Hermes-2-Pro-Llama-3-8B)와 같이 도구 사용을 지원하는 모델과 토크나이저를 가져오세요. 하드웨어가 지원된다면 [Command-R](./model_doc/cohere)이나 [Mixtral-8x22B](./model_doc/mixtral)와 같은 더 큰 모델도 고려할 수 있습니다.
+
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained( "NousResearch/Hermes-2-Pro-Llama-3-8B")
+tokenizer = AutoTokenizer.from_pretrained( "NousResearch/Hermes-2-Pro-Llama-3-8B")
+model = AutoModelForCausalLM.from_pretrained( "NousResearch/Hermes-2-Pro-Llama-3-8B", torch_dtype=torch.bfloat16, device_map="auto")
+```
+
+채팅 메시지를 생성합니다.
+
+```py
+messages = [
+  {"role": "system", "content": "You are a bot that responds to weather queries. You should reply with the unit used in the queried location."},
+  {"role": "user", "content": "Hey, what's the temperature in Paris right now?"}
+]
+```
+
+`messages`와 도구 목록 `tools`를 [`~PreTrainedTokenizerBase.apply_chat_template`]에 전달한 뒤, 이를 모델의 입력으로 사용하여 텍스트를 생성할 수 있습니다.
+
+```py
+inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
+inputs = {k: v for k, v in inputs.items()}
+outputs = model.generate(**inputs, max_new_tokens=128)
+print(tokenizer.decode(outputs[0][len(inputs["input_ids"][0]):]))
+```
+
+```txt
+<tool_call>
+{"arguments": {"location": "Paris, France", "unit": "celsius"}, "name": "get_current_temperature"}
+</tool_call><|im_end|>
+```
+
+채팅 모델은 독스트링(docstring)에 정의된 형식에 따라 `get_current_temperature` 함수에 올바른 매개변수를 전달해 호출했습니다. 파리를 기준으로 위치를 프랑스로 추론했으며, 온도 단위는 섭씨를 사용해야 한다고 판단했습니다.
+
+이제 `get_current_temperature` 함수와 해당 인수들을 `tool_call` 딕셔너리에 담아 채팅 메시지에 추가합니다. `tool_call` 딕셔너리는 `system`이나 `user`가 아닌 `assistant` 역할로 제공되어야 합니다.
+
+> [!WARNING]
+> OpenAI API는 `tool_call` 형식으로 JSON 문자열을 사용합니다. Transformers에서 사용할 경우 딕셔너리를 요구하기 때문에, 오류가 발생하거나 모델이 이상하게 동작할 수 있습니다.
+
+<hfoptions id="tool-call">
+<hfoption id="Llama">
+
+```py
+tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}}
+messages.append({"role": "assistant", "tool_calls": [{"type": "function", "function": tool_call}]})
+```
+
+어시스턴트가 함수 출력을 읽고 사용자와 채팅할 수 있도록 합니다.
+
+```py
+inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
+inputs = {k: v for k, v in inputs.items()}
+out = model.generate(**inputs, max_new_tokens=128)
+print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):]))
+```
+
+```txt
+The temperature in Paris, France right now is approximately 12°C (53.6°F).<|im_end|>
+```
+
+</hfoption>
+<hfoption id="Mistral/Mixtral">
+
+[Mistral](./model_doc/mistral) 및 [Mixtral](./model_doc/mixtral) 모델의 경우 추가적으로 `tool_call_id`가 필요합니다. `tool_call_id`는 9자리 영숫자 문자열로 생성되어 `tool_call` 딕셔너리의 `id` 키에 할당됩니다.
+
+```py
+tool_call_id = "9Ae3bDc2F"
+tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}}
+messages.append({"role": "assistant", "tool_calls": [{"type": "function", "id": tool_call_id, "function": tool_call}]})
+```
+
+```py
+inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
+inputs = {k: v for k, v in inputs.items()}
+out = model.generate(**inputs, max_new_tokens=128)
+print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):]))
+```
+
+</hfoption>
+</hfoptions>
+
+## 스키마[[Schema]]
+
+[`~PreTrainedTokenizerBase.apply_chat_template`]은 함수를 [JSON 스키마](https://json-schema.org/learn/getting-started-step-by-step)로 변환하여 채팅 템플릿에 전달합니다. LLM은 함수 내부의 코드를 보지 못합니다. 다시 말해, LLM은 함수가 기술적으로 어떻게 작동하는지는 신경 쓰지 않고, 함수의 **정의**와 **인수**만 참조합니다.
+
+함수가 앞서 나열된 규칙을 따르면, 내부에서 JSON 스키마가 자동으로 생성됩니다. 하지만 더 나은 가독성이나 디버깅을 위해 [get_json_schema](https://github.com/huggingface/transformers/blob/14561209291255e51c55260306c7d00c159381a5/src/transformers/utils/chat_template_utils.py#L205)를 사용하여 스키마를 수동으로 변환할 수 있습니다.
+
+```py
+from transformers.utils import get_json_schema
+
+def multiply(a: float, b: float):
+    """
+    두 숫자를 곱하는 함수
+    
+    Args:
+        a: 곱할 첫 번째 숫자
+        b: 곱할 두 번째 숫자
+    """
+    return a * b
+
+schema = get_json_schema(multiply)
+print(schema)
+```
+
+```json
+{
+  "type": "function", 
+  "function": {
+    "name": "multiply", 
+    "description": "A function that multiplies two numbers", 
+    "parameters": {
+      "type": "object", 
+      "properties": {
+        "a": {
+          "type": "number", 
+          "description": "The first number to multiply"
+        }, 
+        "b": {
+          "type": "number",
+          "description": "The second number to multiply"
+        }
+      }, 
+      "required": ["a", "b"]
+    }
+  }
+}
+```
+
+스키마를 편집하거나 처음부터 직접 작성할 수 있습니다. 이를 통해 더 복잡한 함수에 대한 정확한 스키마를 유연하게 정의할 수 있습니다.
+
+> [!WARNING]
+> 함수 시그니처를 단순하게 유지하고 인수를 최소한으로 유지하세요. 이러한 함수는 중첩된 인수를 가진 복잡한 함수에 비해 모델이 더 쉽게 이해하고 사용할 수 있습니다.
+
+아래 예시는 스키마를 수동으로 작성한 다음 [`~PreTrainedTokenizerBase.apply_chat_template`]에 전달하는 방법을 보여줍니다.
+
+```py
+# 인수를 받지 않는 간단한 함수
+current_time = {
+  "type": "function", 
+  "function": {
+    "name": "current_time",
+    "description": "Get the current local time as a string.",
+    "parameters": {
+      'type': 'object',
+      'properties': {}
+    }
+  }
+}
+
+# 두 개의 숫자 인수를 받는 더 완전한 함수
+multiply = {
+  'type': 'function',
+  'function': {
+    'name': 'multiply',
+    'description': 'A function that multiplies two numbers', 
+    'parameters': {
+      'type': 'object', 
+      'properties': {
+        'a': {
+          'type': 'number',
+          'description': 'The first number to multiply'
+        }, 
+        'b': {
+          'type': 'number', 'description': 'The second number to multiply'
+        }
+      }, 
+      'required': ['a', 'b']
+    }
+  }
+}
+
+model_input = tokenizer.apply_chat_template(
+    messages,
+    tools = [current_time, multiply]
+)
+```
+
+## RAG[[RAG]]
+
+검색 증강 생성(Retrieval-augmented generation, RAG) 모델은 쿼리를 반환하기 전에 문서를 검색해 추가 정보를 얻어 모델이 기존에 가지고 있던 지식을 확장시킵니다. RAG 모델의 경우, [`~PreTrainedTokenizerBase.apply_chat_template`]에 `documents` 매개변수를 추가하세요. 이 `documents` 매개변수는 문서 목록이어야 하며, 각 문서는 `title`과 `content` 키를 가진 단일 딕셔너리여야 합니다.
+
+> [!TIP]
+> RAG를 위한 `documents` 매개변수는 폭넓게 지원되지 않으며 많은 모델들이 `documents`를 무시하는 채팅 템플릿을 가지고 있습니다. 모델이 `documents`를 지원하는지 확인하려면 모델 카드를 읽거나 `print(tokenizer.chat_template)`를 실행하여 `documents` 키가 있는지 확인하세요. [Command-R](https://hf.co/CohereForAI/c4ai-command-r-08-2024)과 [Command-R+](https://hf.co/CohereForAI/c4ai-command-r-plus-08-2024)는 모두 RAG 채팅 템플릿에서 `documents`를 지원합니다.
+
+모델에 전달할 문서 목록을 생성하세요.
+
+```py
+documents = [
+    {
+        "title": "The Moon: Our Age-Old Foe", 
+        "text": "Man has always dreamed of destroying the moon. In this essay, I shall..."
+    },
+    {
+        "title": "The Sun: Our Age-Old Friend",
+        "text": "Although often underappreciated, the sun provides several notable benefits..."
+    }
+]
+```
+
+[`~PreTrainedTokenizerBase.apply_chat_template`]에서 `chat_template="rag"`를 설정하고 응답을 생성하세요.
+
+```py
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+# 모델과 토크나이저 로드
+tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01-4bit")
+model = AutoModelForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01-4bit", device_map="auto")
+device = model.device # 모델을 가져온 장치 확인
+
+# 대화 입력 정의
+conversation = [
+    {"role": "user", "content": "What has Man always dreamed of?"}
+]
+
+input_ids = tokenizer.apply_chat_template(
+    conversation=conversation,
+    documents=documents,
+    chat_template="rag",
+    tokenize=True,
+    add_generation_prompt=True,
+    return_tensors="pt").to(device)
+
+# 응답 생성
+generated_tokens = model.generate(
+    input_ids,
+    max_new_tokens=100,
+    do_sample=True,
+    temperature=0.3,
+    )
+
+# 생성된 텍스트를 디코딩하고 생성 프롬프트와 함께 출력
+generated_text = tokenizer.decode(generated_tokens[0])
+print(generated_text)
+```
--- a/docs/source/ko/model_doc/big_bird.md
+++ b/docs/source/ko/model_doc/big_bird.md
@ -0,0 +1,158 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+*이 모델은 2020-07-28에 출시되었으며 2021-03-30에 Hugging Face Transformers에 추가되었습니다.*
+
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white" >
+    </div>
+</div>
+
+# BigBird[[bigbird]]
+
+[BigBird](https://huggingface.co/papers/2007.14062)는 [BERT](./bert)의 512토큰과 달리 최대 4096토큰까지의 시퀀스 길이를 처리하도록 설계된 트랜스포머 모델입니다. 기존 트랜스포머들은 시퀀스 길이가 늘어날수록 어텐션 계산 비용이 급격히 증가하여 긴 입력 처리에 어려움을 겪습니다. BigBird는 희소 어텐션 메커니즘으로 이 문제를 해결하는데, 모든 토큰을 동시에 살펴보는 대신 로컬 어텐션, 랜덤 어텐션, 그리고 몇 개의 전역 토큰을 조합하여 전체 입력을 효율적으로 처리합니다. 이런 방식을 통해 계산 효율성을 유지하면서도 시퀀스 전체를 충분히 이해할 수 있게 됩니다. 따라서 BigBird는 질의응답, 요약, 유전체학 응용처럼 긴 문서를 다루는 작업에 특히 우수한 성능을 보입니다.
+
+모든 원본 BigBird 체크포인트는 [Google](https://huggingface.co/google?search_models=bigbird) 조직에서 찾아볼 수 있습니다.
+
+> [!TIP]
+> 오른쪽 사이드바의 BigBird 모델들을 클릭하여 다양한 언어 작업에 BigBird를 적용하는 더 많은 예시를 확인해보세요.
+
+아래 예시는 [`Pipeline`], [`AutoModel`], 그리고 명령줄에서 `[MASK]` 토큰을 예측하는 방법을 보여줍니다.
+
+<hfoptions id="usage">
+<hfoption id="Pipeline">
+
+```py
+import torch
+from transformers import pipeline
+
+pipeline = pipeline(
+    task="fill-mask",
+    model="google/bigbird-roberta-base",
+    dtype=torch.float16,
+    device=0
+)
+pipeline("Plants create [MASK] through a process known as photosynthesis.")
+```
+
+</hfoption>
+<hfoption id="AutoModel">
+
+```py
+import torch
+from transformers import AutoModelForMaskedLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained(
+    "google/bigbird-roberta-base",
+)
+model = AutoModelForMaskedLM.from_pretrained(
+    "google/bigbird-roberta-base",
+    dtype=torch.float16,
+    device_map="auto",
+)
+inputs = tokenizer("Plants create [MASK] through a process known as photosynthesis.", return_tensors="pt").to(model.device)
+
+with torch.no_grad():
+    outputs = model(**inputs)
+    predictions = outputs.logits
+
+masked_index = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]
+predicted_token_id = predictions[0, masked_index].argmax(dim=-1)
+predicted_token = tokenizer.decode(predicted_token_id)
+
+print(f"The predicted token is: {predicted_token}")
+```
+
+</hfoption>
+<hfoption id="transformers CLI">
+
+```bash
+!echo -e "Plants create [MASK] through a process known as photosynthesis." | transformers-cli run --task fill-mask --model google/bigbird-roberta-base --device 0
+```
+
+</hfoption>
+</hfoptions>
+
+## 참고사항[[notes]]
+
+- BigBird는 절대 위치 임베딩을 사용하므로 입력을 오른쪽에 패딩해야 합니다.
+- BigBird는 `original_full`과 `block_sparse` 어텐션을 지원합니다. 입력 시퀀스 길이가 1024 미만인 경우에는 희소 패턴의 이점이 크지 않으므로 `original_full` 사용을 권장합니다.
+- 현재 구현은 3블록 윈도우 크기와 2개의 전역 블록을 사용하며, ITC 구현만 지원하고 `num_random_blocks=0`은 지원하지 않습니다.
+- 시퀀스 길이는 블록 크기로 나누어떨어져야 합니다.
+
+## 리소스[[resources]]
+
+- BigBird 어텐션 메커니즘의 자세한 작동 원리는 [BigBird](https://huggingface.co/blog/big-bird) 블로그 포스트를 참고하세요.
+
+## BigBirdConfig[[bigbirdconfig]]
+
+[[autodoc]] BigBirdConfig
+
+## BigBirdTokenizer[[bigbirdtokenizer]]
+
+[[autodoc]] BigBirdTokenizer
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - save_vocabulary
+
+## BigBirdTokenizerFast[[bigbirdtokenizerfast]]
+
+[[autodoc]] BigBirdTokenizerFast
+
+## BigBird 특정 출력[[bigbird-specific-outputs]]
+
+[[autodoc]] models.big_bird.modeling_big_bird.BigBirdForPreTrainingOutput
+
+## BigBirdModel[[bigbirdmodel]]
+
+[[autodoc]] BigBirdModel
+    - forward
+
+## BigBirdForPreTraining[[bigbirdforpretraining]]
+
+[[autodoc]] BigBirdForPreTraining
+    - forward
+
+## BigBirdForCausalLM[[bigbirdforcausallm]]
+
+[[autodoc]] BigBirdForCausalLM
+    - forward
+
+## BigBirdForMaskedLM[[bigbirdformaskedlm]]
+
+[[autodoc]] BigBirdForMaskedLM
+    - forward
+
+## BigBirdForSequenceClassification[[bigbirdforsequenceclassification]]
+
+[[autodoc]] BigBirdForSequenceClassification
+    - forward
+
+## BigBirdForMultipleChoice[[bigbirdformultiplechoice]]
+
+[[autodoc]] BigBirdForMultipleChoice
+    - forward
+
+## BigBirdForTokenClassification[[bigbirdfortokenclassification]]
+
+[[autodoc]] BigBirdForTokenClassification
+    - forward
+
+## BigBirdForQuestionAnswering[[bigbirdforquestionanswering]]
+
+[[autodoc]] BigBirdForQuestionAnswering
+    - forward
--- a/docs/source/ko/model_doc/code_llama.md
+++ b/docs/source/ko/model_doc/code_llama.md
@ -0,0 +1,180 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+*이 모델은 2023년 8월 24일에 공개되었으며, 2023년 8월 25일에 Hugging Face Transformers에 추가되었습니다.*
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        ">
+    </div>
+</div>
+
+# CodeLlama[[codellama]]
+
+[Code Llama](https://huggingface.co/papers/2308.12950)는 코딩 작업에 특화된 대규모 언어 모델 계열로,  [Llama 2](./llama2)를 기반으로 개발되었습니다. 일반적인 코드, Python 특화, 명령어(지시) 기반 변형 등 다양한 버전으로 제공되며, 모두 7B, 13B, 34B, 70B 매개변수 크기로 사용할 수 있습니다. Code Llama 모델은 코드를 생성하고 설명하며, 코드의 누락된 부분을 채울 수도 있습니다. 이를 인필링(infilling)이라고 합니다. 16K 토큰 길이로 훈련되었지만, 최대 100K 토큰까지 안정적으로 생성하며 긴 컨텍스트도 처리할 수 있습니다.
+
+[Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933) 컬렉션에서 모든 원본 Code Llama 체크포인트를 찾을 수 있습니다.
+
+> [!TIP]
+> 다양한 코딩 작업에 Code Llama를 적용하는 더 많은 예시를 보려면 오른쪽 사이드바의 Code Llama 모델을 클릭하세요.
+
+아래 예시는 [`Pipeline`], [`AutoModel`], 그리고 명령줄에서 코드를 생성하는 방법을 보여줍니다.
+
+<hfoptions id="usage">
+<hfoption id="Pipeline">
+
+```py
+import torch
+from transformers import pipeline
+
+pipe = pipeline(
+    "text-generation",
+    model="meta-llama/CodeLlama-7b-hf",
+    torch_dtype=torch.float16,
+    device_map=0
+)
+
+# 기본 코드 생성
+result = pipe("# Function to calculate the factorial of a number\ndef factorial(n):", max_new_tokens=256)
+print(result[0]['generated_text'])
+
+# 인필링
+infill_result = pipe("def remove_non_ascii(s: str) -> str:\n    \"\"\" <FILL_ME>\n    return result", max_new_tokens=200)
+print(infill_result[0]['generated_text'])
+```
+
+</hfoption>
+<hfoption id="AutoModel">
+
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/CodeLlama-7b-hf")
+model = AutoModelForCausalLM.from_pretrained(
+    "meta-llama/CodeLlama-7b-hf",
+    torch_dtype=torch.float16,
+    device_map="auto",
+    attn_implementation="sdpa"
+)
+
+# 기본 코드 생성
+prompt = "# Function to calculate the factorial of a number\ndef factorial(n):"
+input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
+
+output = model.generate(
+    **input_ids,
+    max_new_tokens=256,
+    cache_implementation="static"
+)
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+
+# 인필링
+infill_prompt = "def remove_non_ascii(s: str) -> str:\n    \"\"\" <FILL_ME>\n    return result"
+input_ids = tokenizer(infill_prompt, return_tensors="pt").to(model.device)
+
+filled_output = model.generate(**input_ids, max_new_tokens=200)
+filled_text = tokenizer.decode(filled_output[0], skip_special_tokens=True)
+print(filled_text)
+```
+
+</hfoption>
+<hfoption id="transformers CLI">
+
+```bash
+echo -e "# Function to calculate the factorial of a number\ndef factorial(n):" | transformers run --task text-generation --model meta-llama/CodeLlama-7b-hf --device 0
+```
+
+</hfoption>
+</hfoptions>
+
+양자화는 가중치를 더 낮은 정밀도로 표현하여 대규모 모델의 메모리 부담을 줄입니다. 더 많은 사용 가능한 양자화 백엔드는 [양자화](../quantization/overview) 개요를 참조하세요.
+
+아래 예시는 [bitsandbytes](../quantization/bitsandbytes)를 사용하여 가중치를 4비트로만 양자화합니다.
+
+```py
+# bitsandbytes를 설치합니다.
+import torch
+from transformers import AutoModelForCausalLM, CodeLlamaTokenizer, BitsAndBytesConfig
+
+bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type="nf4", bnb_4bit_use_double_quant=True)
+tokenizer = CodeLlamaTokenizer.from_pretrained("meta-llama/CodeLlama-34b-hf")
+model = AutoModelForCausalLM.from_pretrained(
+   "meta-llama/CodeLlama-34b-hf",
+   torch_dtype=torch.bfloat16,
+   device_map="auto",
+   quantization_config=bnb_config
+)
+
+prompt = "# Write a Python function to check if a string is a palindrome\ndef is_palindrome(s):"
+input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
+
+output = model.generate(**input_ids, max_new_tokens=200, cache_implementation="static")
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+
+[AttentionMaskVisualizer](https://github.com/huggingface/transformers/blob/beb9b5b02246b9b7ee81ddf938f93f44cfeaad19/src/transformers/utils/attention_visualizer.py#L139)를 사용하면 모델이 어떤 토큰에 주의를 기울일 수 있고 기울일 수 없는지를 더 잘 이해할 수 있습니다.
+
+```py
+from transformers.utils.attention_visualizer import AttentionMaskVisualizer
+
+visualizer = AttentionMaskVisualizer("meta-llama/CodeLlama-7b-hf")
+visualizer("""def func(a, b):
+  return a + b""")
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/codellama-attn-mask.png"/>
+</div>
+
+## 참고사항[[notes]]
+
+- 인필링 기능은 7B 및 13B 기반 모델에서만 사용할 수 있으며, Python, Instruct, 34B 또는 70B 모델에서는 사용할 수 없습니다.
+- 코드를 채워 넣고 싶은 부분에 `<FILL_ME>` 토큰을 사용하세요. 토크나이저는 이 토큰을 분할하여 [원본 훈련 패턴](https://github.com/facebookresearch/codellama/blob/cb51c14ec761370ba2e2bc351374a79265d0465e/llama/generation.py#L402) 을 따르는 입력 문자열로 변환합니다. 이는 직접 패턴을 준비하는 것보다 더 안정적입니다.
+    ```py
+    from transformers import LlamaForCausalLM, CodeLlamaTokenizer
+
+    tokenizer = CodeLlamaTokenizer.from_pretrained("meta-llama/CodeLlama-7b-hf")
+    model = LlamaForCausalLM.from_pretrained("meta-llama/CodeLlama-7b-hf")
+    PROMPT = '''def remove_non_ascii(s: str) -> str:
+        """ <FILL_ME>
+        return result
+    '''
+    input_ids = tokenizer(PROMPT, return_tensors="pt")["input_ids"]
+    generated_ids = model.generate(input_ids, max_new_tokens=128)
+
+    filling = tokenizer.batch_decode(generated_ids[:, input_ids.shape[1]:], skip_special_tokens = True)[0]
+    print(PROMPT.replace("<FILL_ME>", filling))
+    ```
+- 추가 훈련이나 미세 조정에는 `bfloat16`을 사용하고 추론에는 `float16`을 사용하세요.
+- `BOS` 문자는 접두사나 접미사를 인코딩할 때 인필링 작업에 사용되지 않으며, 각 프롬프트의 맨 앞에서만 사용됩니다.
+- 토크나이저는 [SentencePiece](https://github.com/google/sentencepiece)를 기반으로 하는 byte-pair 인코딩 모델입니다. 디코딩 과정에서 첫 번째 토큰이 단어의 시작인 경우(예를 들어 "Banana"), 토크나이저는 문자열에 접두사 공백을 추가하지 않습니다.
+
+## CodeLlamaTokenizer
+
+[[autodoc]] CodeLlamaTokenizer
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - save_vocabulary
+
+## CodeLlamaTokenizerFast
+
+[[autodoc]] CodeLlamaTokenizerFast
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - update_post_processor
+    - save_vocabulary
--- a/docs/source/ko/model_doc/llama4.md
+++ b/docs/source/ko/model_doc/llama4.md
@ -0,0 +1,443 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+*Meta는 이 모델을 2025-04-05에 출시하고 같은 날 Hugging Face Transformers에 추가했습니다.*
+
+# Llama4[[llama4]]
+
+
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
+    </div>
+</div>
+
+Meta에서 개발한 [Llama 4](https://ai.meta.com/blog/llama-4-multimodal-intelligence/)는 새로운 자기회귀 Mixture-of-Experts (MoE) 아키텍처를 도입합니다.
+이 세대는 두 가지 모델로 나뉩니다:
+- 128개의 전문가(expert)를 사용하여 총 약 400B 매개변수 중 17B 활성 매개변수를 갖는 고성능 Llama 4 Maverick
+- 16개의 전문가만 사용하여 총 약 109B 매개변수 중 17B 활성 매개변수를 갖는 경량화된 Llama 4 Scout
+-
+두 모델 모두 네이티브 멀티모달을 위한 초기 융합(early fusion)을 활용하여 텍스트와 이미지 입력을 처리할 수 있습니다.
+Maverick과 Scout 모두 200개 언어를 포함하는 데이터에서 최대 40조개의 토큰으로 훈련되었습니다.
+(아랍어, 스페인어, 독일어, 힌디어를 포함한 12개 언어에 대한 특정 미세 조정 지원 포함)
+
+Meta는 Llama 4 Scout을 누구나 쉽게 사용할 수 있도록 설계했습니다. Scout은 4비트 또는 8비트 양자화를 적용하면 단일 서버급 GPU에서도 실시간으로 실행할 수 있습니다. 반면, 더 대규모인 Llama 4 Maverick은 고성능 연산을 위해 BF16과 FP8 형식으로 제공합니다.
+이 모델들은 모델 저장소에서 제공되는 사용자 지정 Llama 4 커뮤니티 라이선스 계약에 따라 출시됩니다.
+
+모든 원본 Llama 체크포인트는 hugging face [meta-llama](https://huggingface.co/meta-llama) 페이지에서 확인하실 수 있습니다.
+
+> [!TIP]
+> Llama 4 모델 패밀리는 두 가지 형태로 제공됩니다: 109B와 402B 매개변수입니다. 이 두 형태 모두 매우 큰 모델이며
+> 일반적인 기기에서는 실행할 수 없습니다. 아래에 메모리 사용량을 줄이는 방법 몇 가지를 정리했습니다.
+>
+> 더욱 빠르고 안정적인 다운로드를 위해 `hf_xet` 종속성 설치를 권장합니다:
+> `pip install transformers[hf_xet]`
+
+아래 예시들은 [`Pipeline`] 또는 [`AutoModel`]로 생성하는 방법을 보여줍니다. 또한 일부 Llama 4 변형이
+최대 1천만 토큰의 컨텍스트 길이를 갖기 때문에, 매우 긴 컨텍스트 생성을 활성화하기 위해 올바른 속성을 토글하는 방법을 보여주는 예시도 추가했습니다.
+
+
+<hfoptions id="usage">
+<hfoption id="Pipeline">
+
+```py
+from transformers import pipeline
+import torch
+
+model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+
+messages = [
+    {"role": "user", "content": "마요네즈 레시피가 무엇인가요?"},
+]
+
+pipe = pipeline(
+    "text-generation",
+    model=model_id,
+    device_map="auto",
+    dtype=torch.bfloat16
+)
+
+output = pipe(messages, do_sample=False, max_new_tokens=200)
+print(output[0]["generated_text"][-1]["content"])
+```
+
+</hfoption>
+<hfoption id="AutoModel - Text only">
+
+```py
+from transformers import AutoTokenizer, Llama4ForConditionalGeneration
+import torch
+
+model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+messages = [
+    {"role": "user", "content": "당신은 누구신가요?"},
+]
+inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True)
+
+model = Llama4ForConditionalGeneration.from_pretrained(
+    model_id,
+    device_map="auto",
+    dtype=torch.bfloat16
+)
+
+outputs = model.generate(**inputs.to(model.device), max_new_tokens=100)
+outputs = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:])
+print(outputs[0])
+```
+
+</hfoption>
+<hfoption id="AutoModel - Multimodal">
+
+```py
+from transformers import AutoProcessor, Llama4ForConditionalGeneration
+import torch
+
+model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+
+processor = AutoProcessor.from_pretrained(model_id)
+model = Llama4ForConditionalGeneration.from_pretrained(
+    model_id,
+    device_map="auto",
+    dtype=torch.bfloat16,
+)
+
+img_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image", "url": img_url},
+            {"type": "text", "text": "이 이미지를 두 문장으로 설명해주세요."},
+        ]
+    },
+]
+
+inputs = processor.apply_chat_template(
+    messages,
+    add_generation_prompt=True,
+    tokenize=True,
+    return_dict=True,
+    return_tensors="pt",
+).to(model.device)
+
+outputs = model.generate(
+    **inputs,
+    max_new_tokens=256,
+)
+
+response = processor.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:])[0]
+print(response)
+```
+
+</hfoption>
+<hfoption id="AutoModel - Multimodal with multiple images">
+
+```py
+from transformers import AutoProcessor, Llama4ForConditionalGeneration
+import torch
+
+model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+
+processor = AutoProcessor.from_pretrained(model_id)
+model = Llama4ForConditionalGeneration.from_pretrained(
+    model_id,
+    device_map="auto",
+    dtype=torch.bfloat16,
+)
+
+url1 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
+url2 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png"
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image", "url": url1},
+            {"type": "image", "url": url2},
+            {"type": "text", "text": "이 두 이미지가 어떻게 비슷하고, 어떻게 다른지 설명해주실 수 있나요?"},
+        ]
+    },
+]
+
+inputs = processor.apply_chat_template(
+    messages,
+    add_generation_prompt=True,
+    tokenize=True,
+    return_dict=True,
+    return_tensors="pt",
+).to(model.device)
+
+outputs = model.generate(
+    **inputs,
+    max_new_tokens=256,
+)
+
+response = processor.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:])[0]
+print(response)
+```
+
+</hfoption>
+<hfoption id="AutoModel - Long context">
+
+주의: 아래 예시는 `device_map="auto"`와 flex-attention을 모두 사용합니다.
+이 예시를 텐서 병렬 모드로 실행하려면 `torchrun`을 사용하세요.
+
+향후 텐서 병렬 없이 `device_map="auto"`와 flex-attention을 함께 실행할 수 있도록
+작업할 예정입니다.
+
+```py
+from transformers import Llama4ForConditionalGeneration, AutoTokenizer, infer_device
+import torch
+import time
+
+file = "very_long_context_prompt.txt"
+model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+
+with open(file, "r") as f:
+    very_long_text = "\n".join(f.readlines())
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = Llama4ForConditionalGeneration.from_pretrained(
+    model_id,
+    device_map="auto",
+    attn_implementation="flex_attention",
+    dtype=torch.bfloat16
+)
+
+messages = [
+    {"role": "user", "content": f"다음 텍스트들을 보세요: [{very_long_text}]\n\n\n\n책들은 무엇이며, 누가 썼나요? 좋은 목록을 만들어주세요."},
+]
+input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
+
+device = infer_device()
+torch_device_module = getattr(torch, device, torch.cuda)
+torch_device_module.synchronize()
+start = time.time()
+out = model.generate(
+    input_ids.to(model.device),
+    prefill_chunk_size=2048*8,
+    max_new_tokens=300,
+    cache_implementation="hybrid",
+)
+print(time.time()-start)
+print(tokenizer.batch_decode(out[:, input_ids.shape[-1]:]))
+print(f"{torch_device_module.max_memory_allocated(model.device) / 1024**3:.2f} GiB")
+```
+
+</hfoption>
+</hfoptions>
+
+## 효율성; Llama 4의 최대 성능 활용하기[[efficiency-how-to-get-the-best-out-of-llama-4]]
+
+### 어텐션 방법[[the-attention-methods]]
+
+기본 설정으로 주어지는 어텐션 함수를 변경하면 계산 성능과 메모리 사용량을 크게 개선할 수 있습니다. 인터페이스에 대한 자세한 설명은 [어텐션 인터페이스](../attention_interface) 개요를 참조하세요.
+
+Llama 4 모델은 처음 공개될 때부터 다음 어텐션 방식을 지원합니다: `eager`, `flex_attention`, `sdpa`. 최상의 결과를 위해 `flex_attention` 사용을 권장합니다.
+어텐션 메커니즘 전환은 모델을 초기화할 때 이루어집니다:
+
+
+<hfoptions id="Attention">
+<hfoption id="Flex Attention">
+
+Flex Attention은 모델이 긴 컨텍스트를 처리할 때 최적의 성능을 발휘합니다.
+
+> [!TIP] 주의: 아래 예시는 `device_map="auto"`와 flex-attention을 모두 사용합니다.
+> 이 예시를 텐서 병렬 모드로 실행하려면 `torchrun`을 사용하세요.
+>
+> 향후 텐서 병렬 없이 `device_map="auto"`와 flex-attention을 함께 실행할 수 있도록
+> 작업할 예정입니다.
+
+```py
+from transformers import Llama4ForConditionalGeneration
+import torch
+
+model = Llama4ForConditionalGeneration.from_pretrained(
+    model_id,
+    attn_implementation="flex_attention",
+    device_map="auto",
+    dtype=torch.bfloat16,
+)
+```
+</hfoption>
+<hfoption id="SDPA">
+`sdpa` 어텐션 방법은 일반적으로 `eager` 방법보다 계산 효율적입니다.
+
+```py
+from transformers import Llama4ForConditionalGeneration
+import torch
+
+model = Llama4ForConditionalGeneration.from_pretrained(
+    model_id,
+    attn_implementation="sdpa",
+    device_map="auto",
+    dtype=torch.bfloat16,
+)
+```
+</hfoption>
+<hfoption id="Eager">
+`eager` 어텐션 방법이 기본으로 설정되어 있으므로 모델 로드 시 다른 설정이 필요하지 않습니다:
+
+```py
+from transformers import Llama4ForConditionalGeneration
+import torch
+
+model = Llama4ForConditionalGeneration.from_pretrained(
+    model_id,
+    device_map="auto",
+    dtype=torch.bfloat16,
+)
+```
+</hfoption>
+</hfoptions>
+
+
+### 양자화[[quantization]]
+
+양자화는 가중치를 더 낮은 정밀도로 바꿔 대형 모델의 메모리 부담을 줄입니다. 사용 가능한 양자화 백엔드에 대해서는 [양자화](../quantization/overview) 개요를 참조하세요.
+현재는 FBGEMM과 LLM-Compressor를 지원하며, 곧 더 많은 방식이 추가될 예정입니다.
+
+두 가지 방법을 사용하는 예시를 아래에서 확인하세요:
+
+
+
+다음은 FBGEMM 접근법을 사용하여 BF16 모델을 FP8로 로드하는 예시입니다:
+
+<hfoptions id="Quantization">
+<hfoption id="FBGEMM">
+
+```python
+from transformers import AutoTokenizer, Llama4ForConditionalGeneration, FbgemmFp8Config
+import torch
+
+model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+messages = [
+    {"role": "user", "content": "당신은 누구신가요?"},
+]
+inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True)
+
+model = Llama4ForConditionalGeneration.from_pretrained(
+    model_id,
+    device_map="auto",
+    dtype=torch.bfloat16,
+    quantization_config=FbgemmFp8Config()
+)
+
+outputs = model.generate(**inputs.to(model.device), max_new_tokens=100)
+outputs = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:])
+print(outputs[0])
+```
+
+</hfoption>
+<hfoption id="LLM-Compressor">
+
+LLLM-Compressor를 사용할 때는 함께 제공되는 사전 양자화된 FP8 체크포인트를 쓰는 것이 좋습니다:
+
+```python
+from transformers import AutoTokenizer, Llama4ForConditionalGeneration
+import torch
+
+model_id = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+messages = [
+    {"role": "user", "content": "당신은 누구신가요?"},
+]
+inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True)
+
+model = Llama4ForConditionalGeneration.from_pretrained(
+    model_id,
+    tp_plan="auto",
+    dtype=torch.bfloat16,
+)
+
+outputs = model.generate(**inputs.to(model.device), max_new_tokens=100)
+outputs = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:])
+print(outputs[0])
+```
+</hfoption>
+</hfoptions>
+
+### 오프로딩[[offloading]]
+
+CPU 오프로딩을 활성화하면, GPU 메모리가 부족할 때 모델이 구성 요소를 CPU로 이동시킵니다.
+추론 시 다양한 구성 요소들이 GPU와 CPU 간에 동적으로 로드되고 언로드됩니다. 이를 통해 CPU 메모리가 충분한 한 더 작은 머신에서도 모델을 로드할 수 있습니다.
+다만 통신 오버헤드로 인해 추론 속도가 느려질 수 있습니다.
+
+CPU 오프로딩을 활성화하려면 모델 로드 시 `device_map`을 `auto`로 지정하면 됩니다
+
+```py
+from transformers import Llama4ForConditionalGeneration
+import torch
+
+model = Llama4ForConditionalGeneration.from_pretrained(
+    model_id,
+    device_map="auto",
+    dtype=torch.bfloat16,
+)
+```
+
+## Llama4Config
+
+[[autodoc]] Llama4Config
+
+## Llama4TextConfig
+
+[[autodoc]] Llama4TextConfig
+
+## Llama4VisionConfig
+
+[[autodoc]] Llama4VisionConfig
+
+## Llama4Processor
+
+[[autodoc]] Llama4Processor
+
+## Llama4ImageProcessorFast
+
+[[autodoc]] Llama4ImageProcessorFast
+
+## Llama4ForConditionalGeneration
+
+[[autodoc]] Llama4ForConditionalGeneration
+- forward
+
+## Llama4ForCausalLM
+
+[[autodoc]] Llama4ForCausalLM
+- forward
+
+## Llama4TextModel
+
+[[autodoc]] Llama4TextModel
+- forward
+
+## Llama4ForCausalLM
+
+[[autodoc]] Llama4ForCausalLM
+- forward
+
+## Llama4VisionModel
+
+[[autodoc]] Llama4VisionModel
+- forward
--- a/docs/source/ko/model_doc/sam_hq.md
+++ b/docs/source/ko/model_doc/sam_hq.md
@ -0,0 +1,141 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+*이 모델은 2023-06-02에 발표되었으며 2025-04-28에 Hugging Face Transformers에 추가되었습니다.*
+
+# SAM-HQ[[sam_hq]]
+
+## 개요[[overview]]
+
+SAM-HQ (High-Quality Segment Anything Model)는 Lei Ke, Mingqiao Ye, Martin Danelljan, Yifan Liu, Yu-Wing Tai, Chi-Keung Tang, Fisher Yu가 제안한 [Segment Anything in High Quality](https://huggingface.co/papers/2306.01567) 논문에서 소개되었습니다.
+
+이 모델은 기존 SAM(Segment Anything Model)의 향상된 버전입니다. SAM-HQ는 SAM의 핵심 장점인 프롬프트 기반 설계, 효율성, 제로샷 일반화 능력을 그대로 유지하면서도 훨씬 더 높은 품질의 분할 마스크를 생성하는 것이 특징입니다.
+
+![example image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-output.png)
+
+SAM-HQ는 기존 SAM 모델 대비 다음과 같은 5가지 핵심 개선 사항을 도입했습니다.
+
+1. 고품질 출력 토큰: SAM-HQ는 SAM의 마스크 디코더에 학습 가능한 토큰을 주입합니다. 이 토큰은 모델이 더 높은 품질의 분할 마스크를 예측하도록 돕는 핵심적인 요소입니다.
+2. 전역-지역 특징 융합: 모델의 서로 다른 단계에서 추출된 특징들을 결합하여 분할 마스크의 세부적인 정확도를 향상시킵니다. 이미지의 전체적인 맥락 정보와 객체의 미세한 경계 정보를 함께 활용하여 마스크 품질을 개선합니다.
+3. 훈련 데이터 개선: SAM 모델이 SA-1B와 같은 대규모 데이터를 사용한 것과 달리, SAM-HQ는 신중하게 선별된 44,000개의 고품질 마스크로 구성된 데이터셋을 사용하여 훈련됩니다.
+4. 높은 효율성: 마스크 품질을 상당히 개선했음에도 불구하고, 추가된 매개변수는 단 0.5%에 불과합니다.
+5. 제로샷 성능: SAM-HQ는 성능이 개선되었음에도 불구하고, SAM 모델의 강력한 제로샷 일반화 능력을 그대로 유지합니다.
+
+논문 초록 내용:
+
+* 최근 발표된 SAM(Segment Anything Model)은 분할 모델의 규모를 확장하는 데 있어 획기적인 발전이며, 강력한 제로샷 기능과 유연한 프롬프트 기능을 제공합니다. 하지만 SAM은 11억 개의 마스크로 훈련되었음에도 불구하고, 특히 복잡하고 정교한 구조를 가진 객체를 분할할 때 마스크 예측 품질이 미흡한 경우가 많습니다. 저희는 HQ-SAM을 제안하며, SAM의 기존 장점인 프롬프트 기반 설계, 효율성, 제로샷 일반화 능력을 모두 유지하면서도 어떤 객체든 정확하게 분할할 수 있는 능력을 부여합니다. 저희는 신중한 설계를 통해 SAM의 사전 훈련된 모델 가중치를 재사용하고 보존하며 최소한의 추가적인 매개변수와 연산만을 도입했습니다. 핵심적으로 저희는 학습 가능한 고품질 출력 토큰을 설계했습니다. 이 토큰은 SAM의 마스크 디코더에 주입되어 고품질 마스크를 예측하는 역할을 담당합니다. 마스크의 세부 사항을 개선하기 위해 이 토큰을 마스크 디코더 특징에만 적용하는 것이 아니라 초기 및 최종 ViT 특징과 먼저 융합하여 사용합니다. 도입된 학습 가능한 매개변수를 훈련하기 위해 저희는 여러 출처에서 가져온 44,000개의 미세 조정된 마스크 데이터셋을 구성했습니다. HQ-SAM은 오직 이 44,000개 마스크 데이터셋만으로 훈련되며 GPU 8대를 사용했을 때 단 4시간이 소요됩니다.
+
+SAM-HQ 사용 팁:
+
+- SAM-HQ는 기존 SAM 모델보다 더 높은 품질의 마스크 생성하며, 특히 복잡한 구조와 미세한 세부 사항을 가진 객체에 대해 성능이 우수합니다.
+- 이 모델은 더욱 정확한 경계와 얇은 구조에 대한 더 나은 처리 능력을 갖춘 이진 마스크를 예측합니다.
+- SAM과 마찬가지로 모델은 입력으로 2차원 포인트 및 바운딩 박스를 사용할 때 더 좋은 성능을 보입니다.
+- 하나의 이미지에 대해 다수의 포인트를 프롬프트로 입력하여 단일의 고품질 마스크를 예측할 수 있습니다.
+- 이 모델은 SAM의 제로샷 일반화 능력을 그대로 유지합니다.
+- SAM-HQ는 SAM 대비 약 0.5%의 추가 매개변수만을 가집니다.
+- 현재 모델의 미세 조정은 지원되지 않습니다.
+
+이 모델은 [sushmanth](https://huggingface.co/sushmanth)님께서 기여해주셨습니다.
+원본 코드는 [여기](https://github.com/SysCV/SAM-HQ)에서 확인하실 수 있습니다.
+
+아래는 이미지와 2차원 포인트가 주어졌을 때, 마스크를 생성하는 방법에 대한 예시입니다.
+
+```python
+import torch
+from PIL import Image
+import requests
+from transformers import infer_device, SamHQModel, SamHQProcessor
+
+device = infer_device()
+model = SamHQModel.from_pretrained("syscv-community/sam-hq-vit-base").to(device)
+processor = SamHQProcessor.from_pretrained("syscv-community/sam-hq-vit-base")
+
+img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
+raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
+input_points = [[[450, 600]]]  # 이미지 내 창문의 2차원 위치
+
+inputs = processor(raw_image, input_points=input_points, return_tensors="pt").to(model.device)
+with torch.no_grad():
+    outputs = model(**inputs)
+
+masks = processor.image_processor.post_process_masks(
+    outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()
+)
+scores = outputs.iou_scores
+```
+
+또한, 프로세서에서 입력 이미지와 함께 사용자의 마스크를 직접 처리하여 모델에 전달할 수도 있습니다.
+
+```python
+import torch
+from PIL import Image
+import requests
+from transformers import infer_device, SamHQModel, SamHQProcessor
+
+device = infer_device()
+model = SamHQModel.from_pretrained("syscv-community/sam-hq-vit-base").to(device)
+processor = SamHQProcessor.from_pretrained("syscv-community/sam-hq-vit-base")
+
+img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
+raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
+mask_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
+segmentation_map = Image.open(requests.get(mask_url, stream=True).raw).convert("1")
+input_points = [[[450, 600]]]  # 이미지 내 창문의 2차원 위치
+
+inputs = processor(raw_image, input_points=input_points, segmentation_maps=segmentation_map, return_tensors="pt").to(model.device)
+with torch.no_grad():
+    outputs = model(**inputs)
+
+masks = processor.image_processor.post_process_masks(
+    outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()
+)
+scores = outputs.iou_scores
+```
+
+## 자료[[resources]]
+
+다음은 SAM-HQ 사용을 시작하는 데 도움이 되는 공식 Hugging Face 및 커뮤니티 (🌎로 표시) 자료 목록입니다.
+
+- 모델 사용을 위한 데모 노트북 (출시 예정)
+- 논문 구현 및 코드: [SAM-HQ 깃허브 저장소](https://github.com/SysCV/SAM-HQ)
+
+## SamHQConfig[[transformers.SamHQConfig]]
+
+[[autodoc]] SamHQConfig
+
+## SamHQVisionConfig[[transformers.SamHQVisionConfig]]
+
+[[autodoc]] SamHQVisionConfig
+
+## SamHQMaskDecoderConfig[[transformers.SamHQMaskDecoderConfig]]
+
+[[autodoc]] SamHQMaskDecoderConfig
+
+## SamHQPromptEncoderConfig[[transformers.SamHQPromptEncoderConfig]]
+
+[[autodoc]] SamHQPromptEncoderConfig
+
+## SamHQProcessor[[transformers.SamHQProcessor]]
+
+[[autodoc]] SamHQProcessor
+
+## SamHQVisionModel[[transformers.SamHQVisionModel]]
+
+[[autodoc]] SamHQVisionModel
+
+## SamHQModel[[transformers.SamHQModel]]
+
+[[autodoc]] SamHQModel
+    - forward
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@ -738,6 +738,10 @@ class Trainer:
        self._train_batch_size = args.train_batch_size
        self._created_lr_scheduler = False

+        # Set use_cache for the model
+        if getattr(self.model, "config", None) is not None:
+            self.model.config.use_cache = self.args.use_cache
+
        # very last
        self._memory_tracker.stop_and_update_metrics()

--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@ -752,6 +752,10 @@ class TrainingArguments:
            Whether or not to average tokens across devices. If enabled, will use all_reduce to synchronize
            num_tokens_in_batch for precise loss calculation. Reference:
            https://github.com/huggingface/transformers/issues/34242
+
+        use_cache (`bool`, *optional*, defaults to `False`):
+            Whether or not to enable cache for the model. For training, this is usually not needed apart from some PEFT methods that uses `past_key_values`.
+
    """

    # Sometimes users will pass in a `str` repr of a dict in the CLI
@ -1382,6 +1386,13 @@ class TrainingArguments:
        },
    )

+    use_cache: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether or not to use cache for the model For training, this is usually not needed apart from some PEFT methods that uses `past_key_values`."
+        },
+    )
+
    def __post_init__(self):
        # Set default output_dir if not provided
        if self.output_dir is None:
--- a/tests/models/vit/test_modeling_vit.py
+++ b/tests/models/vit/test_modeling_vit.py
@ -219,6 +219,7 @@ class ViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
        super().test_multi_gpu_data_parallel_forward()

    def test_config(self):
+        assert 1 == 2
        self.config_tester.run_common_tests()

    @unittest.skip(reason="ViT does not use inputs_embeds")
--- a/utils/check_bad_commit.py
+++ b/utils/check_bad_commit.py
@ -20,6 +20,7 @@ import os
 import re
 import subprocess

+import git
 import requests


@ -38,8 +39,14 @@ def create_script(target_test):
 import os
 import subprocess

+_ = subprocess.run(
+    ["python3", "-m", "pip", "install", "-e", "."],
+    capture_output = True,
+    text=True,
+)
+
 result = subprocess.run(
-    ["python3", "-m", "pytest", "-v", "-rfEp", f"{target_test}"],
+    ["python3", "-m", "pytest", "-v", "--flake-finder", "--flake-runs=4", "-rfEp", f"{target_test}"],
    capture_output = True,
    text=True,
 )
@ -47,17 +54,20 @@ print(result.stdout)

 if f"FAILED {target_test}" in result.stdout:
    print("test failed")
-    exit(2)
+    exit(1)
 elif result.returncode != 0:
    if "ERROR: file or directory not found: " in result.stderr:
        print("test file or directory not found in this commit")
+        # git bisect treats exit code 125 as `test not found`. But this causes it not be able to make the conclusion
+        # if a test is added between the `good commit` (exclusive) and `bad commit` (inclusive) (in git bisect terminology).
+        # So we return 0 here in order to allow the process being able to identify the first commit that fails the test.
        exit(0)
    elif "ERROR: not found: " in result.stderr:
        print("test not found in this commit")
        exit(0)
    else:
        print(f"pytest gets unknown error: {{result.stderr}}")
-        exit(-1)
+        exit(1)

 print(f"pytest runs successfully.")
 exit(0)
@ -67,20 +77,63 @@ exit(0)
        fp.write(script.strip())


+def is_bad_commit(target_test, commit):
+    repo = git.Repo(".")  # or specify path to your repo
+
+    # Save the current HEAD reference
+    original_head = repo.head.commit
+
+    # Checkout to the commit
+    repo.git.checkout(commit)
+
+    create_script(target_test=target_test)
+
+    result = subprocess.run(
+        ["python3", "target_script.py"],
+        capture_output=True,
+        text=True,
+    )
+
+    # Restore to original commit
+    repo.git.checkout(original_head)
+
+    return result.returncode != 0
+
+
 def find_bad_commit(target_test, start_commit, end_commit):
-    """Find (backward) the earliest commit between `start_commit` and `end_commit` at which `target_test` fails.
+    """Find (backward) the earliest commit between `start_commit` (inclusive) and `end_commit` (exclusive) at which `target_test` fails.

    Args:
        target_test (`str`): The test to check.
-        start_commit (`str`): The latest commit.
-        end_commit (`str`): The earliest commit.
+        start_commit (`str`): The latest commit (inclusive).
+        end_commit (`str`): The earliest commit (exclusive).

    Returns:
        `str`: The earliest commit at which `target_test` fails.
    """

+    # check if `end_commit` fails the test
+    failed_before = is_bad_commit(target_test, end_commit)
+    if failed_before:
+        return (
+            None,
+            f"flaky: test passed in the previous run (commit: {end_commit}) but failed (on the same commit) during the check of the current run.",
+        )
+
+    # if there is no new commit (e.g. 2 different CI runs on the same commit):
+    #   - failed once on `start_commit` but passed on `end_commit`, which are the same commit --> flaky (or something change externally) --> don't report
    if start_commit == end_commit:
-        return start_commit
+        return (
+            None,
+            f"flaky: test fails on the current CI run but passed in the previous run which is running on the same commit {end_commit}.",
+        )
+
+    # Now, we are (almost) sure `target_test` is not failing at `end_commit`
+    # check if `start_commit` fail the test
+    failed_now = is_bad_commit(target_test, start_commit)
+    if not failed_now:
+        # failed on CI run, but not reproducible here --> don't report
+        return None, f"flaky: test fails on the current CI run (commit: {start_commit}) but passes during the check."

    create_script(target_test=target_test)

@ -105,7 +158,7 @@ git bisect run python3 target_script.py
    if "error: bisect run failed" in result.stderr:
        error_msg = f"Error when running git bisect:\nbash error: {result.stderr}\nbash output:\n{result.stdout}\nset `bad_commit` to `None`."
        print(error_msg)
-        return None
+        return None, "git bisect failed"

    pattern = r"(.+) is the first bad commit"
    commits = re.findall(pattern, result.stdout)
@ -117,7 +170,7 @@ git bisect run python3 target_script.py
    print(f"Between `start_commit` {start_commit} and `end_commit` {end_commit}")
    print(f"bad_commit: {bad_commit}\n")

-    return bad_commit
+    return bad_commit, "git bisect find the bad commit."


 def get_commit_info(commit):
@ -171,9 +224,11 @@ if __name__ == "__main__":
        raise ValueError("Exactly one argument `test` or `file` must be specified.")

    if args.test is not None:
-        commit = find_bad_commit(target_test=args.test, start_commit=args.start_commit, end_commit=args.end_commit)
+        commit, status = find_bad_commit(
+            target_test=args.test, start_commit=args.start_commit, end_commit=args.end_commit
+        )
        with open(args.output_file, "w", encoding="UTF-8") as fp:
-            fp.write(f"{args.test}\n{commit}")
+            fp.write(f"{args.test}\n{commit}\n{status}")
    elif os.path.isfile(args.file):
        with open(args.file, "r", encoding="UTF-8") as fp:
            reports = json.load(fp)
@ -185,8 +240,10 @@ if __name__ == "__main__":

            failed_tests_with_bad_commits = []
            for test in failed_tests:
-                commit = find_bad_commit(target_test=test, start_commit=args.start_commit, end_commit=args.end_commit)
-                info = {"test": test, "commit": commit}
+                commit, status = find_bad_commit(
+                    target_test=test, start_commit=args.start_commit, end_commit=args.end_commit
+                )
+                info = {"test": test, "commit": commit, "status": status}

                if commit in commit_info_cache:
                    commit_info = commit_info_cache[commit]
--- a/utils/process_bad_commit_report.py
+++ b/utils/process_bad_commit_report.py
@ -26,6 +26,33 @@ if __name__ == "__main__":

    job_name = os.environ.get("JOB_NAME")

+    # Upload to Hub and get the url
+    # if it is not a scheduled run, upload the reports to a subfolder under `report_repo_folder`
+    report_repo_subfolder = ""
+    if os.getenv("GITHUB_EVENT_NAME") != "schedule":
+        report_repo_subfolder = f"{os.getenv('GITHUB_RUN_NUMBER')}-{os.getenv('GITHUB_RUN_ID')}"
+        report_repo_subfolder = f"runs/{report_repo_subfolder}"
+
+    workflow_run = get_last_daily_ci_run(
+        token=os.environ["ACCESS_REPO_INFO_TOKEN"], workflow_run_id=os.getenv("GITHUB_RUN_ID")
+    )
+    workflow_run_created_time = workflow_run["created_at"]
+
+    report_repo_folder = workflow_run_created_time.split("T")[0]
+
+    if report_repo_subfolder:
+        report_repo_folder = f"{report_repo_folder}/{report_repo_subfolder}"
+
+    report_repo_id = os.getenv("REPORT_REPO_ID")
+
+    commit_info = api.upload_file(
+        path_or_fileobj="new_failures_with_bad_commit.json",
+        path_in_repo=f"{report_repo_folder}/ci_results_{job_name}/new_failures_with_bad_commit.json",
+        repo_id=report_repo_id,
+        repo_type="dataset",
+        token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None),
+    )
+
    with open("new_failures_with_bad_commit.json") as fp:
        data = json.load(fp)

@ -88,25 +115,6 @@ if __name__ == "__main__":
            _data[model] = {k: v for k, v in model_result.items() if len(v) > 0}
        new_data_full[author] = {k: v for k, v in _data.items() if len(v) > 0}

-    # Upload to Hub and get the url
-    # if it is not a scheduled run, upload the reports to a subfolder under `report_repo_folder`
-    report_repo_subfolder = ""
-    if os.getenv("GITHUB_EVENT_NAME") != "schedule":
-        report_repo_subfolder = f"{os.getenv('GITHUB_RUN_NUMBER')}-{os.getenv('GITHUB_RUN_ID')}"
-        report_repo_subfolder = f"runs/{report_repo_subfolder}"
-
-    workflow_run = get_last_daily_ci_run(
-        token=os.environ["ACCESS_REPO_INFO_TOKEN"], workflow_run_id=os.getenv("GITHUB_RUN_ID")
-    )
-    workflow_run_created_time = workflow_run["created_at"]
-
-    report_repo_folder = workflow_run_created_time.split("T")[0]
-
-    if report_repo_subfolder:
-        report_repo_folder = f"{report_repo_folder}/{report_repo_subfolder}"
-
-    report_repo_id = os.getenv("REPORT_REPO_ID")
-
    with open("new_failures_with_bad_commit_grouped_by_authors.json", "w") as fp:
        json.dump(new_data_full, fp, ensure_ascii=False, indent=4)
    commit_info = api.upload_file(
--- a/utils/split_model_tests.py
+++ b/utils/split_model_tests.py
@ -74,4 +74,5 @@ if __name__ == "__main__":
        end = start + num_jobs_per_splits + (1 if idx < num_jobs % args.num_splits else 0)
        model_splits.append(d[start:end])

+    model_splits = [["models/vit"], ["models/clip"]]
    print(model_splits)
Author	SHA1	Message	Date
ydshieh	ca38d640b7	fix	2025-10-17 22:21:10 +02:00
ydshieh	b8011a3dc5	fix	2025-10-17 21:30:29 +02:00
ydshieh	a118c8e1c4	fix	2025-10-17 20:59:00 +02:00
ydshieh	0109c42409	fix	2025-10-17 20:30:23 +02:00
ydshieh	3c7552f733	fix	2025-10-17 15:40:54 +02:00
ydshieh	4757bf062b	fix	2025-10-17 15:12:12 +02:00
ydshieh	aceaa7ce97	fix	2025-10-17 15:05:09 +02:00
ydshieh	c9293376a0	fix	2025-10-17 12:04:50 +02:00
ydshieh	e69d3ca150	check 1	2025-10-17 10:44:24 +02:00
ydshieh	bffad7f4fb	check 1	2025-10-17 09:21:09 +02:00
ydshieh	740f952218	check 1	2025-10-17 06:57:10 +02:00
ydshieh	950c4e5303	check 1	2025-10-17 06:28:55 +02:00
ydshieh	89970f4797	check 1	2025-10-17 03:03:25 +02:00
ydshieh	a4a46e62a5	check 1	2025-10-16 21:32:04 +02:00
ydshieh	9b36498d5f	1	2025-10-16 21:16:53 +02:00
HyunSang Jang	eefbf4ac8b	🌐 [i18n-KO] Translated llama4.md to Korean (#40396 ) * docs: ko: llama4.md * feat: nmt draft * fix: manual edits * Update docs/source/ko/model_doc/llama4.md Co-authored-by: YONGSANG <71686691+4N3MONE@users.noreply.github.com> * Update docs/source/ko/model_doc/llama4.md Co-authored-by: YONGSANG <71686691+4N3MONE@users.noreply.github.com> * Update docs/source/ko/model_doc/llama4.md Co-authored-by: YONGSANG <71686691+4N3MONE@users.noreply.github.com> * Update docs/source/ko/model_doc/llama4.md Co-authored-by: YONGSANG <71686691+4N3MONE@users.noreply.github.com> --------- Co-authored-by: TaskerJang <bymyself103@naver.com> Co-authored-by: YONGSANG <71686691+4N3MONE@users.noreply.github.com>	2025-10-16 11:28:27 -07:00
Judy	50ca781d78	🌐 [i18n-KO] Translated `code_llama.md` to Korean (#40558 ) * docs: ko: code_llama.md * feat: nmt draft * fix: manual edits * Apply suggestions from code review Co-authored-by: Harheem Kim <49297157+harheem@users.noreply.github.com> Co-authored-by: HyunZ118 <156191095+HyunZ118@users.noreply.github.com> * Apply suggestions from code review Co-authored-by: Harheem Kim <49297157+harheem@users.noreply.github.com> --------- Co-authored-by: Harheem Kim <49297157+harheem@users.noreply.github.com> Co-authored-by: HyunZ118 <156191095+HyunZ118@users.noreply.github.com>	2025-10-16 11:27:46 -07:00
SSUM	8739fc05c4	[i18n-KO] Translated `big_bird.md` to Korean (#40445 ) * docs: ko: BigBird.md * feat: nmt draft * fix: manual edits	2025-10-16 11:23:56 -07:00
HyunZ118	77b5ad65ee	🌐 [i18n-KO] Translated sam_hq.md to Korean (#41340 ) * fix: manual edits * Apply suggestions from code review Apply suggestions from code review Co-authored-by: HyunSang Jang <tasker.dev103@gmail.com> * Apply suggestions from code review Apply suggestions from code review Co-authored-by: Woojun Jung <46880056+jungnerd@users.noreply.github.com> --------- Co-authored-by: HyunSang Jang <tasker.dev103@gmail.com> Co-authored-by: Woojun Jung <46880056+jungnerd@users.noreply.github.com>	2025-10-16 11:10:16 -07:00
Judy	a9731a725e	🌐 [i18n-KO] Translated `chat_extras.md` to Korean (#39863 ) * docs: ko: chat_extras.md * feat: nmt draft * fix: manual edits * Apply suggestions from code review * Apply suggestions from code review * Update docs/source/ko/chat_extras.md	2025-10-16 10:41:03 -07:00
Marc Sun	bdbc2d037b	[Trainer] [Breaking change] `use_cache` default to `False` (#41585 ) * use_cache default to `False` when training * style * Fix comment * add checks * style * set * switch	2025-10-16 18:51:36 +02:00