Compare commits


29 Commits

Author SHA1 Message Date
02e0fd8111 add mask generation fine-tuning docs 2025-11-05 18:44:16 +01:00
5689dd6b8e update huggingface_hub dependency version (#42033)
* update huggingface_hub version

* nit
2025-11-05 16:22:22 +01:00
571352d378 🔴 Isolate prefill from generation loops (#40652)
* isolate-prefill: squash

* prefill inside decoding methods

* simplify autocompile helpers
2025-11-05 14:40:01 +00:00
2418196ef4 Fix the order of methods in processor loading (#42031)
* fix the order

* add a test
2025-11-05 15:33:07 +01:00
561233cabf Change trigger time for AMD CI (#42034)
fix

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-11-05 14:17:12 +01:00
36b640562b extend fp_quant cases to xpu (#41833)
* extend fp_quant UTs to xpu

Signed-off-by: Yao, Matrix <matrix.yao@intel.com>

* fix style

Signed-off-by: Yao, Matrix <matrix.yao@intel.com>

* Update tests/quantization/fp_quant_integration/test_fp_quant.py

Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>

---------

Signed-off-by: Yao, Matrix <matrix.yao@intel.com>
Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
2025-11-05 10:50:31 +00:00
0c4a202408 [tests] Add Context-parallel CI tests (#41860)
* intial

* simplify tests

* add test_cp_equivalence

* removed fsdp_transformer_layer_cls_to_wrap

* use DataCollatorForLanguageModeling

* remove use_cache=False.

* changes from review

* make script self contained

* moved to fsdp folder

* fix class name
2025-11-05 11:40:51 +01:00
20396951af CodeQL workflow for security analysis (#42015)
* CodeQL workflow for security analysis

Created CodeQL workflow to use reusable workflow from internal and simplified configuration.

* Update CodeQL workflow for main branch only and remving python from analysis

Restrict CodeQL analysis to 'actions' language only.

* Disable pull_request trigger in CodeQL workflow temporarly

Comment out pull_request trigger for CodeQL workflow
2025-11-05 10:59:37 +01:00
3c4cdd549d fix deeepspeed in AMD docker file (#42025)
fix deeepspeed in AMD docker

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-11-05 10:40:29 +01:00
020e713ac8 [FPQuant] MXFP8 and MXFP4 backwards support (#41897)
* FP-Quant backwards

* fp-quant v0.3.0 docker

* availability version bump

* fp_quant==0.3.1

* fp_quant v0.3.2
2025-11-04 16:52:47 +00:00
371ef0f4a2 [v5] Deprecate Text2Text and related pipelines (#41996)
* Deprecate Text2Text and related pipelines

* Try a restructure

* make fixup

* logging -> logger
2025-11-04 16:47:06 +00:00
6efc1799c1 [kernels] Fix XPU layernorm kernel (#41583)
* fix

* add comment

* better fix

* style

* Update src/transformers/modeling_utils.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

---------

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
2025-11-04 16:59:07 +01:00
325810e7fc add fuyu fast image processors (#41817)
* added fast processor for fuyu (#36978)

* updated docs for fuyu model (#36978)

* updated test_image_processing  and image_processing_fuyu_fast

* updated fuyu.md and image_processing_fuyu_fast (#36978)

* updated test_image_processing_fuyu (#36978)

* formatted image_processing_fuyu_fast and test_image_processing_fuyu (#36978)

* updated tests and fuyu fast image processing (#36978)

* Merge branch 'fuyu-fast-image-processors' of https://github.com/DeXtAr47-oss/transformers into fuyu-fast-image-processors

* fixed format (#36978)

* formatted files (#36978)

* formatted files

* revert unnecessary changes

* clean up and process by group

---------

Co-authored-by: yonigozlan <yoni.gozlan@huggingface.co>
2025-11-04 15:45:02 +00:00
9a19171fad Add GLPNImageProcessorFast (#41725)
* Add GLPNImageProcessorFast for torch backend

* Address review feedback

- Simplified to_dict() method
- Keep tensors as torch instead of converting to numpy for heterogeneous shapes
- Removed unnecessary shape guards in post_process_depth_estimation
- Improved variable names (tgt -> target_size, d -> resized)
- Removed unnecessary GLPNImageProcessorKwargs class

* Address review feedback

- Simplified to_dict() method
- Keep tensors as torch instead of converting to numpy for heterogeneous shapes
- Removed unnecessary shape guards in post_process_depth_estimation
- Improved variable names (tgt -> target_size, d -> resized)
- Removed unnecessary GLPNImageProcessorKwargs class

* commits after 2nd review

* Address all review feedback and add explicit batched test

- Simplified to_dict() with descriptive variable names (d->output_dict)
- Fixed resize operation: changed from crop to proper resize with interpolation
- Added padding for heterogeneous batch shapes in both slow and fast processors
- Fused rescale and normalize operations for efficiency
- Improved all variable names (tgt->target_size, d->depth_4d->resized)
- Added GLPNImageProcessorKwargs class in slow processor and imported in fast
- Renamed test_equivalence_slow_fast to test_slow_fast_equivalence
- Added explicit test_slow_fast_equivalence_batched test
- All 20 tests passing

* using padding from utils

* simplify glpn image processor fast

* fix docstring

---------

Co-authored-by: yonigozlan <yoni.gozlan@huggingface.co>
Co-authored-by: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com>
2025-11-04 15:44:52 +00:00
26fca86312 Fix default image_rows and image_cols initialization in Idefics3 and SmolVLM processors (#41871)
* Fix default image_rows and image_cols initialization in Idefics3 and SmolVLM processors

* Fix default initialization of image_rows and image_cols in Idefics3 and SmolVLM processors
2025-11-04 15:42:47 +00:00
900cf9d33b Fix issue with from pretrained and kwargs in image processors (#41997)
* accept kwargs in image proc from_pretrained

* only use kwargs that are in cls.valid_kwargs

* remove specific logic for _from_auto

* add image_seq_length to Images_kwargs for backward compatibility

* fix missing image kwargs in pix2struct
2025-11-04 10:35:39 -05:00
154d5101a4 add back logging_dir (#42013)
* add back

* Apply style fixes

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
2025-11-04 16:22:58 +01:00
e3d4fa692e Fix continuous batching tests (#42012)
* Fix continuous batching tests

* make fixup
2025-11-04 15:10:35 +00:00
dd4e048e75 Reduce the number of benchmark in the CI (#42008)
Changed how benchmark cfgs are chosen
2025-11-04 14:07:17 +01:00
6ff4fabd9d Correct syntax error in trainer.md (#42001)
A comma is missing between two parameters in the signature of compute_loss function.
2025-11-04 12:36:54 +00:00
6d4450e341 Fix torch+deepspeed docker file (#41985)
* fix

* delete

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-11-04 10:41:22 +00:00
aee5c2384a DOC Fix typo in argument name: pseudoquant (#41994)
The correct argument name is pseudoquantization. Since there is no error
on passing wrong arguments name (which is arguably an anti-pattern),
this is difficult for users to debug.
2025-11-04 10:48:39 +01:00
5b6c209bc5 [kernels] change import time in KernelConfig (#42004)
* change import time

* style
2025-11-04 10:26:24 +01:00
258c76e4dc Fix run slow v2: empty report when there is only one model (#42002)
fix

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-11-04 06:46:21 +01:00
64397a8301 Fixed wrong padding value in OWLv2 (#41938)
* Update image_processing_owlv2_fast.py

fixed padding value

* fixed padding value

* Change padding constant value from 0.5 to 0.0

* Fixed missed padding value in modular_owlv2.py

---------

Co-authored-by: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com>
2025-11-03 18:46:28 -05:00
cd309610c0 Integrate colqwen2.5 using colqwen2 modelling code (#40600)
* adding option for 2.5

* minor - arg in conversion script

* getting started on modelling.py

* minor - shouldve been using modular

* adressing comments + fixing datatype/device _get method

* minor

* commiting suggestion

Co-authored-by: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com>

* docs + first test

* ruff fix

* minor fix

* ruff fix

* model fix

* model fix

* fine-grained check, with a hardcoded score from the original Hf implementation.

* minor ruff

* update tests values with CI hardware

* adding 2.5 to conversion script

* Apply style fixes

---------

Co-authored-by: Sahil Kabir <sahilkabir@Sahils-MacBook-Pro.local>
Co-authored-by: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com>
Co-authored-by: yonigozlan <yoni.gozlan@huggingface.co>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
2025-11-03 18:31:07 -05:00
dd8f231495 fix 3 failed test cases for video_llama_3 model on Intel XPU (#41931)
* fix 3 failed test cases for video_llama_3 model on Intel XPU

Signed-off-by: Liu, Kaixuan <kaixuan.liu@intel.com>

* update

Signed-off-by: Liu, Kaixuan <kaixuan.liu@intel.com>

* adjust format

Signed-off-by: Liu, Kaixuan <kaixuan.liu@intel.com>

* update code

Signed-off-by: Liu, Kaixuan <kaixuan.liu@intel.com>

---------

Signed-off-by: Liu, Kaixuan <kaixuan.liu@intel.com>
2025-11-03 18:18:20 +01:00
1619a3475f fix (CI): Refactor SSH runners (#41991)
* Change ssh runner type

* Add wait step to SSH runner workflow

* Rename wait step to wait2 in ssh-runner.yml

* Remove wait step from ssh-runner.yml

Removed the wait step from the SSH runner workflow.

* Update runner type for single GPU A10 instance

* Update SSH runner version to 1.90.3

* Add sha256sum to ssh-runner workflow

* Update runner type and remove unused steps
2025-11-03 18:16:32 +01:00
ff0f7d6498 More data in benchmarking (#41848)
* Reduce scope of cross-generate

* Rm generate_sall configs

* Workflow benchmarks more

* Prevent crash when FA is not installed
2025-11-03 18:05:26 +01:00
69 changed files with 2209 additions and 504 deletions


@ -52,7 +52,7 @@ jobs:
commit_id=$GITHUB_SHA
fi
commit_msg=$(git show -s --format=%s | cut -c1-70)
python3 benchmark_v2/run_benchmarks.py -b 32 -s 128 -n 256 --branch-name "$BRANCH_NAME" --commit-id "$commit_id" --commit-message "$commit_msg" --model-id "$MODEL_ID" --log-level INFO --push-result-to-dataset "$DATASET_ID"
python3 benchmark_v2/run_benchmarks.py -b 32 -s 128 -n 256 --level 2 --branch-name "$BRANCH_NAME" --commit-id "$commit_id" --commit-message "$commit_msg" --model-id "$MODEL_ID" --log-level INFO --push-result-to-dataset "$DATASET_ID"
env:
HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
PUSH_TO_HUB_TOKEN: ${{ secrets.PUSH_TO_HUB_TOKEN }}


@ -97,7 +97,7 @@ jobs:
latest-torch-deepspeed-docker:
name: "Latest PyTorch + DeepSpeed"
runs-on:
group: aws-g4dn-2xlarge-cache
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx

.github/workflows/codeql.yml (new file)

@ -0,0 +1,22 @@
---
name: CodeQL Security Analysis
on:
  push:
    branches: ["main"]
  # pull_request:
  #   branches: ["main"]
  workflow_dispatch:
jobs:
  codeql:
    name: CodeQL Analysis
    uses: huggingface/security-workflows/.github/workflows/codeql-reusable.yml@main
    permissions:
      security-events: write
      packages: read
      actions: read
      contents: read
    with:
      languages: '["actions"]'
      queries: 'security-extended,security-and-quality'


@ -2,7 +2,7 @@ name: Self-hosted runner (AMD scheduled CI caller)
on:
schedule:
- cron: "17 2 * * *"
- cron: "17 5 * * *"
jobs:
run_scheduled_amd_ci:


@ -102,8 +102,10 @@ jobs:
working-directory: /transformers/tests
run: |
if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
echo "folder_slices=$(python3 ../utils/split_model_tests.py --subdirs '${{ inputs.subdirs }}' --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
python3 ../utils/split_model_tests.py --subdirs '${{ inputs.subdirs }}' --num_splits ${{ env.NUM_SLICES }} > folder_slices.txt
echo "folder_slices=$(cat folder_slices.txt)" >> $GITHUB_OUTPUT
python3 -c "import ast; folder_slices = ast.literal_eval(open('folder_slices.txt').read()); open('slice_ids.txt', 'w').write(str(list(range(len(folder_slices)))))"
echo "slice_ids=$(cat slice_ids.txt)" >> $GITHUB_OUTPUT
elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
@ -336,7 +338,7 @@ jobs:
working-directory: ${{ inputs.working-directory-prefix }}/
run: |
python3 -m pip uninstall -y deepspeed
DS_DISABLE_NINJA=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
DS_DISABLE_NINJA=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache -v --disable-pip-version-check
# To avoid unknown test failures
- name: Pre build DeepSpeed *again* (for nightly & Past CI)
@ -346,7 +348,7 @@ jobs:
python3 -m pip uninstall -y deepspeed
rm -rf DeepSpeed
git clone https://github.com/deepspeedai/DeepSpeed && cd DeepSpeed && rm -rf build
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache -v --disable-pip-version-check
- name: NVIDIA-SMI
run: |


@ -4,7 +4,7 @@ on:
workflow_dispatch:
inputs:
runner_type:
description: 'Type of runner to test (a10 or t4)'
description: 'Type of runner to test (a10)'
required: true
docker_image:
description: 'Name of the Docker image'
@ -36,14 +36,10 @@ jobs:
NUM_GPUS: ${{ github.event.inputs.num_gpus }}
RUNNER_TYPE: ${{ github.event.inputs.runner_type }}
run: |
if [[ "$NUM_GPUS" == "single" && "$RUNNER_TYPE" == "t4" ]]; then
echo "RUNNER=aws-g4dn-4xlarge-cache" >> $GITHUB_ENV
elif [[ "$NUM_GPUS" == "multi" && "$RUNNER_TYPE" == "t4" ]]; then
echo "RUNNER=aws-g4dn-12xlarge-cache" >> $GITHUB_ENV
elif [[ "$NUM_GPUS" == "single" && "$RUNNER_TYPE" == "a10" ]]; then
echo "RUNNER=aws-g5-4xlarge-cache" >> $GITHUB_ENV
if [[ "$NUM_GPUS" == "single" && "$RUNNER_TYPE" == "a10" ]]; then
echo "RUNNER=aws-g5-4xlarge-cache-ssh" >> $GITHUB_ENV
elif [[ "$NUM_GPUS" == "multi" && "$RUNNER_TYPE" == "a10" ]]; then
echo "RUNNER=aws-g5-12xlarge-cache" >> $GITHUB_ENV
echo "RUNNER=aws-g5-12xlarge-cache-ssh" >> $GITHUB_ENV
else
echo "RUNNER=" >> $GITHUB_ENV
fi
@ -61,8 +57,6 @@ jobs:
group: ${{ needs.get_runner.outputs.RUNNER }}
container:
image: ${{ github.event.inputs.docker_image }}
options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Update clone
working-directory: /transformers
@ -106,7 +100,7 @@ jobs:
else
echo "SLACKCHANNEL=${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}" >> $GITHUB_ENV
fi
- name: Tailscale # In order to be able to SSH when a test fails
uses: huggingface/tailscale-action@main
with:


@ -1,8 +1,11 @@
import hashlib
import itertools
import json
import logging
from typing import Any
from transformers.utils.import_utils import is_flash_attn_2_available
KERNELIZATION_AVAILABLE = False
try:
@ -18,6 +21,16 @@ logger = logging.getLogger(__name__)
class BenchmarkConfig:
"""Configuration for a single benchmark scenario."""
all_attn_implementations = [
("flash_attention_2", None),
("eager", None),
("sdpa", "math"),
("sdpa", "flash_attention"),
("flex_attention", None),
]
all_compiled_modes = [None, "default", "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs"]
def __init__(
self,
warmup_iterations: int = 5,
@ -59,6 +72,13 @@ class BenchmarkConfig:
def check_validity(self, skip_validity_check: bool = False) -> None:
if skip_validity_check:
return
# Check FA is installed
if self.attn_implementation == "flash_attention_2" and not is_flash_attn_2_available():
logger.warning(
"Flash attention does not support compile mode. Defaulting to SDPA w/ flash attention backend."
)
self.attn_implementation = "sdpa"
self.sdpa_backend = "flash_attention"
# Flash attention does not support compile mode, so we turn it off # FIXME: it would be better to support it
is_fa = self.attn_implementation == "flash_attention_2"
is_fa |= self.attn_implementation == "sdpa" and self.sdpa_backend == "flash_attention"
@ -127,88 +147,68 @@ class BenchmarkConfig:
)
def cross_generate_configs(
attn_impl_and_sdpa_backend: list[tuple[str, str | None]],
compiled_mode: list[str | None],
kernelized: list[bool],
warmup_iterations: int = 5,
measurement_iterations: int = 20,
batch_size: int = 1,
sequence_length: int = 128,
num_tokens_to_generate: int = 128,
gpu_monitoring: bool = True,
def adapt_configs(
configs: list[BenchmarkConfig],
warmup_iterations: int | list[int] = 5,
measurement_iterations: int | list[int] = 20,
batch_size: int | list[int] = 1,
sequence_length: int | list[int] = 128,
num_tokens_to_generate: int | list[int] = 128,
gpu_monitoring: bool | list[bool] = True,
) -> list[BenchmarkConfig]:
# Create kwargs common to all configs
kwargs = {
"warmup_iterations": warmup_iterations,
"measurement_iterations": measurement_iterations,
"batch_size": batch_size,
"sequence_length": sequence_length,
"num_tokens_to_generate": num_tokens_to_generate,
"gpu_monitoring": gpu_monitoring,
}
# Cross-generate all combinations of attn_implementation, compiled_mode, and kernelized
configs = []
for attn_implementation, sdpa_backend in list(dict.fromkeys(attn_impl_and_sdpa_backend)):
for cm in list(dict.fromkeys(compiled_mode)):
for kernelize_on in list(dict.fromkeys(kernelized)):
config = BenchmarkConfig(
attn_implementation=attn_implementation,
sdpa_backend=sdpa_backend,
compile_mode=cm,
kernelize=kernelize_on,
**kwargs,
)
configs.append(config)
return configs
def generate_all_configs(
warmup_iterations: int = 5,
measurement_iterations: int = 20,
batch_size: int = 1,
sequence_length: int = 128,
num_tokens_to_generate: int = 128,
gpu_monitoring: bool = True,
) -> list[BenchmarkConfig]:
all_attn_implementations = [
("flash_attention_2", None),
("eager", None),
("sdpa", "math"),
("sdpa", "flash_attention"),
("flex_attention", None),
]
return cross_generate_configs(
attn_impl_and_sdpa_backend=all_attn_implementations,
compiled_mode=[None, "default", "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs"],
kernelized=[False, KERNELIZATION_AVAILABLE],
warmup_iterations=warmup_iterations,
measurement_iterations=measurement_iterations,
batch_size=batch_size,
sequence_length=sequence_length,
num_tokens_to_generate=num_tokens_to_generate,
gpu_monitoring=gpu_monitoring,
parameters = (
x if isinstance(x, list) else [x]
for x in [
warmup_iterations,
measurement_iterations,
batch_size,
sequence_length,
num_tokens_to_generate,
gpu_monitoring,
]
)
iterator = itertools.product(*parameters)
adapted_configs = []
for warmup_iters, measurement_iters, bs, seqlen, ntok, monitor in iterator:
for config in configs:
config = config.to_dict()
config["warmup_iterations"] = warmup_iters
config["measurement_iterations"] = measurement_iters
config["batch_size"] = bs
config["sequence_length"] = seqlen
config["num_tokens_to_generate"] = ntok
config["gpu_monitoring"] = monitor
adapted_configs.append(BenchmarkConfig.from_dict(config))
return adapted_configs
def generate_main_configs(
warmup_iterations: int = 5,
measurement_iterations: int = 20,
batch_size: int = 1,
sequence_length: int = 128,
num_tokens_to_generate: int = 128,
) -> list[BenchmarkConfig]:
# Create kwargs common to all configs
kwargs = {
"warmup_iterations": warmup_iterations,
"measurement_iterations": measurement_iterations,
"batch_size": batch_size,
"sequence_length": sequence_length,
"num_tokens_to_generate": num_tokens_to_generate,
}
return [ # TODO: test max-autotune instead of default
BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", gpu_monitoring=False, **kwargs),
BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", gpu_monitoring=True, **kwargs),
BenchmarkConfig(attn_implementation="eager", compile_mode="default", gpu_monitoring=True, **kwargs),
BenchmarkConfig(attn_implementation="flash_attention_2", gpu_monitoring=True, **kwargs),
]
def get_config_by_level(level: int) -> list[BenchmarkConfig]:
configs = []
# Early return if level is greater than 3: we generate all combinations of configs, maybe even w/ all compile modes
if level >= 3:
for attn_implementation, sdpa_backend in BenchmarkConfig.all_attn_implementations:
# Usually there is not much to gain by compiling with other modes, but we allow it for level 4
compile_modes = BenchmarkConfig.all_compiled_modes if level >= 4 else [None, "default"]
for cm in compile_modes:
for kernelize_on in [False, KERNELIZATION_AVAILABLE]:
configs.append(
BenchmarkConfig(
attn_implementation=attn_implementation,
sdpa_backend=sdpa_backend,
compile_mode=cm,
kernelize=kernelize_on,
)
)
return configs
# Otherwise, we add the configs for the given level
if level >= 0:
configs.append(BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default"))
if level >= 1:
configs.append(BenchmarkConfig(attn_implementation="flash_attention_2"))
configs.append(BenchmarkConfig(attn_implementation="eager", compile_mode="default"))
if level >= 2:
configs.append(BenchmarkConfig(attn_implementation="sdpa", compile_mode="default"))
configs.append(BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", kernelize=True))
configs.append(BenchmarkConfig(attn_implementation="flash_attention_2", kernelize=True))
return configs
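Together, `get_config_by_level` and `adapt_configs` replace the old `cross_generate_configs`/`generate_all_configs` pair: the coverage level picks which attention/compile/kernelize combinations to benchmark, and `adapt_configs` expands them over the requested shapes. A minimal sketch of how they compose, based on the signatures above (the argument values are illustrative):
```python
from framework.benchmark_config import adapt_configs, get_config_by_level

# Level 2: a config per attention implementation, plus kernelized variants.
configs = get_config_by_level(2)
# Expand over shapes; list-valued arguments are cross-producted with every config.
configs = adapt_configs(
    configs,
    warmup_iterations=5,
    measurement_iterations=20,
    batch_size=[1, 32],
    sequence_length=128,
    num_tokens_to_generate=256,
    gpu_monitoring=False,
)
```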


@ -23,7 +23,7 @@ import logging
import sys
import uuid
from framework.benchmark_config import BenchmarkConfig, generate_all_configs, generate_main_configs
from framework.benchmark_config import adapt_configs, get_config_by_level
from framework.benchmark_runner import BenchmarkRunner
@ -40,7 +40,14 @@ if __name__ == "__main__":
parser.add_argument("--sequence-length", "-s", type=int, nargs="+", help="Sequence length")
parser.add_argument("--num-tokens-to-generate", "-n", type=int, nargs="+", help="Number of tokens to generate")
parser.add_argument("--cross-generate", action="store_true", help="Cross-generate all combinations of configs")
parser.add_argument(
"--level",
type=int,
default=1,
help="Level of coverage for the benchmark. 0: only the main config, 1: a few important configs, 2: a config for"
" each attn implementation an option, 3: cross-generate all combinations of configs, 4: cross-generate all"
" combinations of configs w/ all compile modes",
)
parser.add_argument("--num-tokens-to-profile", "-p", type=int, default=0, help="Number of tokens to profile")
parser.add_argument("--branch-name", type=str, help="Git branch name")
@ -79,64 +86,24 @@ if __name__ == "__main__":
"At least one of the arguments --batch-size, --sequence-length, or --num-tokens-to-generate is required"
)
# If there is only one (batch_size, sequence_length, num_tokens_to_generate), we benchmark across configs
elif len(args.batch_size) * len(args.sequence_length) * len(args.num_tokens_to_generate) == 1:
if args.cross_generate:
benchmark_configs = generate_all_configs(
warmup_iterations=args.warmup,
measurement_iterations=args.iterations,
batch_size=args.batch_size[0],
sequence_length=args.sequence_length[0],
num_tokens_to_generate=args.num_tokens_to_generate[0],
gpu_monitoring=not args.no_gpu_monitoring,
)
else:
benchmark_configs = generate_main_configs(
warmup_iterations=args.warmup,
measurement_iterations=args.iterations,
batch_size=args.batch_size[0],
sequence_length=args.sequence_length[0],
num_tokens_to_generate=args.num_tokens_to_generate[0],
)
# Otherwise, we benchmark across all combinations of dimensions
else:
main_config = generate_main_configs(
warmup_iterations=args.warmup,
measurement_iterations=args.iterations,
batch_size=args.batch_size[0],
sequence_length=args.sequence_length[0],
num_tokens_to_generate=args.num_tokens_to_generate[0],
)[0]
benchmark_configs = []
for num_tokens_to_generate in args.num_tokens_to_generate:
for sequence_length in args.sequence_length:
for batch_size in args.batch_size:
cfg_dict = main_config.to_dict()
cfg_dict["batch_size"] = batch_size
cfg_dict["sequence_length"] = sequence_length
cfg_dict["num_tokens_to_generate"] = num_tokens_to_generate
cfg_dict.pop("name")
benchmark_configs.append(BenchmarkConfig.from_dict(cfg_dict))
runner = BenchmarkRunner(
logger,
args.output_dir,
args.branch_name,
args.commit_id,
args.commit_message,
# Get the configs for the given coverage level
configs = get_config_by_level(args.level)
# Adapt the configs to the given arguments
configs = adapt_configs(
configs,
args.warmup,
args.iterations,
args.batch_size,
args.sequence_length,
args.num_tokens_to_generate,
not args.no_gpu_monitoring,
)
runner = BenchmarkRunner(logger, args.output_dir, args.branch_name, args.commit_id, args.commit_message)
timestamp, results = runner.run_benchmarks(
args.model_id,
benchmark_configs,
args.num_tokens_to_profile,
pretty_print_summary=True,
args.model_id, configs, args.num_tokens_to_profile, pretty_print_summary=True
)
dataset_id = args.push_result_to_dataset
if dataset_id is not None and len(results) > 0:
runner.push_results_to_hub(
dataset_id,
results,
timestamp,
)
runner.push_results_to_hub(dataset_id, results, timestamp)


@ -29,7 +29,7 @@ RUN python3 -m pip uninstall -y apex torch torchvision torchaudio
RUN python3 -m pip install torch==$PYTORCH torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO --index-url https://download.pytorch.org/whl/rocm$ROCM --no-cache-dir
# Pre-build DeepSpeed, so it's be ready for testing (to avoid timeout)
RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache-dir -v --disable-pip-version-check 2>&1
RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache-dir -v --disable-pip-version-check 2>&1
ARG REF=main
WORKDIR /


@ -21,7 +21,7 @@ RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'p
# Install latest release PyTorch
# (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
# (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
RUN python3 -m pip uninstall -y torch torchvision torchaudio && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA
RUN python3 -m pip uninstall -y torch torchvision torchaudio torchcodec && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
@ -43,7 +43,7 @@ RUN python3 -m pip uninstall -y deepspeed
# This has to be run (again) inside the GPU VMs running the tests.
# The installation works here, but some tests fail, if we don't pre-build deepspeed again in the VMs running the tests.
# TODO: Find out why test fail.
RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache -v --disable-pip-version-check 2>&1
# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
RUN python3 -m pip uninstall -y kernels


@ -81,7 +81,7 @@ RUN python3 -m pip uninstall -y flash-attn
RUN cd transformers && python3 setup.py develop
# Add fp-quant for quantization testing
RUN python3 -m pip install --no-cache-dir "fp-quant>=0.2.0"
RUN python3 -m pip install --no-cache-dir "fp-quant>=0.3.2"
# Low usage or incompatible lib, will enable later on


@ -158,6 +158,24 @@ print("Retrieval scores (query x image):")
print(scores)
```
You can also use checkpoints for `ColQwen2.5` that are **compatible with the ColQwen2 architecture**. This version of the model uses [Qwen2_5_VL](./qwen2_5_vl) as the backbone.
```python
import torch
from transformers import ColQwen2ForRetrieval, ColQwen2Processor
from transformers.utils.import_utils import is_flash_attn_2_available
model_name = "Sahil-Kabir/colqwen2.5-v0.2-hf" # An existing compatible checkpoint
model = ColQwen2ForRetrieval.from_pretrained(
model_name,
dtype=torch.bfloat16,
device_map="auto",
attn_implementation="flash_attention_2" if is_flash_attn_2_available() else "sdpa"
)
processor = ColQwen2Processor.from_pretrained(model_name)
```
## Notes
- [`~ColQwen2Processor.score_retrieval`] returns a 2D tensor where the first dimension is the number of queries and the second dimension is the number of images. A higher score indicates more similarity between the query and image.


@ -75,11 +75,11 @@ A processor requires an image_processor and a tokenizer. Hence, inputs can be lo
from PIL import Image
from transformers import AutoTokenizer
from transformers.models.fuyu.processing_fuyu import FuyuProcessor
from transformers.models.fuyu.image_processing_fuyu import FuyuImageProcessor
from transformers.models.fuyu.image_processing_fuyu_fast import FuyuImageProcessorFast
tokenizer = AutoTokenizer.from_pretrained('adept-hf-collab/fuyu-8b')
image_processor = FuyuImageProcessor()
image_processor = FuyuImageProcessorFast()
processor = FuyuProcessor(image_processor=image_processor, tokenizer=tokenizer)
@ -118,6 +118,11 @@ The `LlamaTokenizer` is used as it is a standard wrapper around sentencepiece.
[[autodoc]] FuyuImageProcessor
- __call__
## FuyuImageProcessorFast
[[autodoc]] FuyuImageProcessorFast
- __call__
## FuyuProcessor
[[autodoc]] FuyuProcessor


@ -61,6 +61,11 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
[[autodoc]] GLPNImageProcessor
- preprocess
## GLPNImageProcessorFast
[[autodoc]] GLPNImageProcessorFast
- preprocess
## GLPNModel
[[autodoc]] GLPNModel


@ -40,7 +40,7 @@ You can choose between MXFP4 and NVFP4 with `FPQuantConfig(forward_dtype="mxfp4"
A **Blackwell-generation GPU is required** to run the kernels. Runtime support for FP-Quant is implemented through the [QuTLASS](https://github.com/IST-DASLab/qutlass) library and a lightweight PyTorch interface lib [`fp_quant`](https://github.com/IST-DASLab/FP-Quant/tree/master/inference_lib). We recommend installing the former **from source** and the latter with `pip install fp_quant`.
Users **without a Blackwell-generation GPU** , can use the method with `quantization_config=FPQuantConfig(pseudoquant=True)` without having to install [QuTLASS](https://github.com/IST-DASLab/qutlass). This would provide no speedups but would fully emulate the effect of quantization.
Users **without a Blackwell-generation GPU** can use the method with `quantization_config=FPQuantConfig(pseudoquantization=True)` without having to install [QuTLASS](https://github.com/IST-DASLab/qutlass). This provides no speedups but fully emulates the effect of quantization.
> [!TIP]
> Find models pre-quantized with FP-Quant in the official ISTA-DASLab [collection](https://huggingface.co/collections/ISTA-DASLab/fp-quant-6877c186103a21d3a02568ee).
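For example, a minimal sketch of loading a model with pseudo-quantization enabled (the checkpoint below is a placeholder; any causal LM works):
```python
import torch
from transformers import AutoModelForCausalLM, FPQuantConfig

# Placeholder checkpoint: emulate FP-Quant quantization without QuTLASS or Blackwell hardware.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B",
    quantization_config=FPQuantConfig(pseudoquantization=True),
    dtype=torch.bfloat16,
    device_map="auto",
)
```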


@ -24,8 +24,9 @@ Mask generation models are trained on large amounts of data and operate in two m
- Prompting mode: In this mode, the model takes in an image and a prompt, where a prompt can be a 2D point location (XY coordinates) in the image within an object or a bounding box surrounding an object. In prompting mode, the model only returns the mask over the object
that the prompt is pointing out.
- Segment Everything mode: In segment everything, given an image, the model generates every mask in the image. To do so, a grid of points is generated and overlaid on the image for inference.
- Video inference: The model takes in a video and a point or box prompt on a video frame, and the prompted object is tracked throughout the video. You can find more information on how to do video inference in the [SAM 2 docs](model_doc/sam2).
Mask generation task is supported by [Segment Anything Model (SAM)](model_doc/sam). It's a powerful model that consists of a Vision Transformer-based image encoder, a prompt encoder, and a two-way transformer mask decoder. Images and prompts are encoded, and the decoder takes these embeddings and generates valid masks.
The mask generation task is supported by [Segment Anything Model (SAM)](model_doc/sam) and [Segment Anything Model 2 (SAM2)](model_doc/sam2), while video inference is supported only by [Segment Anything Model 2 (SAM2)](model_doc/sam2). SAM is a powerful model that consists of a Vision Transformer-based image encoder, a prompt encoder, and a two-way transformer mask decoder. Images and prompts are encoded, and the decoder takes these embeddings and generates valid masks. SAM 2 extends SAM by adding a memory module that tracks masks across video frames.
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/sam.png" alt="SAM Architecture"/>
@ -53,7 +54,7 @@ The easiest way to infer mask generation models is to use the `mask-generation`
```python
>>> from transformers import pipeline
>>> checkpoint = "facebook/sam-vit-base"
>>> checkpoint = "facebook/sam2-hiera-base-plus"
>>> mask_generator = pipeline(model=checkpoint, task="mask-generation")
```
@ -80,20 +81,12 @@ masks = mask_generator(image, points_per_batch=128, pred_iou_thresh=0.88)
The `masks` looks like the following:
```bash
{'masks': [array([[False, False, False, ..., True, True, True],
[False, False, False, ..., True, True, True],
[False, False, False, ..., True, True, True],
...,
[False, False, False, ..., False, False, False],
[False, False, False, ..., False, False, False],
[False, False, False, ..., False, False, False]]),
array([[False, False, False, ..., False, False, False],
[False, False, False, ..., False, False, False],
[False, False, False, ..., False, False, False],
...,
'scores': tensor([0.9972, 0.9917,
...,
}
{'masks': [tensor([[False, False, False, ..., True, True, True],
[False, False, False, ..., True, True, True],
[False, False, False, ..., True, True, True],
...,
[False, False, False, ..., False, False, False], ..
'scores': tensor([0.9874, 0.9793, 0.9780, 0.9776, ... 0.9016])}
```
We can visualize them like this:
@ -235,3 +228,270 @@ plt.show()
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/box_inference.png" alt="Visualized Inference"/>
</div>
## Fine-tuning for Mask Generation
We will fine-tune SAM2.1 on a small part of the MicroMat dataset for image matting. We need to install the monai library to use the Dice loss, and trackio to log the masks during training.
```bash
pip install -q datasets monai trackio
``` 
We can now load our dataset and take a look.
```python
from datasets import load_dataset
dataset = load_dataset("merve/MicroMat-mini", split="train")
dataset
# Dataset({
#     features: ['image', 'mask', 'prompt', 'image_id', 'object_id', 'sample_idx', 'granularity',
#                'image_path', 'mask_path', 'prompt_path'],
#     num_rows: 94
# })
```
We need the image, mask, and prompt columns. We split the dataset into train and test sets.
```python
dataset = dataset.train_test_split(test_size=0.1)
train_ds = dataset["train"]
val_ds = dataset["test"]
```
Let's take a look at a sample.
```python
train_ds[0]
```
```
{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=2040x1356>,
'mask': <PIL.PngImagePlugin.PngImageFile image mode=L size=2040x1356>,
'prompt': '{"point": [[137, 1165, 1], [77, 1273, 0], [58, 1351, 0]], "bbox": [0, 701, 251, 1356]}',
'image_id': '0034',
'object_id': '34',
'sample_idx': 1,
'granularity': 'fine',
'image_path': '/content/MicroMat-mini/img/0034.png',
'mask_path': '/content/MicroMat-mini/mask/0034_34.png',
'prompt_path': '/content/MicroMat-mini/prompt/0034_34.json'}
```
Prompts are dictionaries stored as JSON strings, so we can parse out the bounding boxes like below.
```python
import json
json.loads(train_ds["prompt"][0])["bbox"]
# [0, 701, 251, 1356]
```
Let's visualize an example image along with its prompt and mask.
```python
import json

import matplotlib.pyplot as plt
import numpy as np

def show_mask(mask, ax):
    color = np.array([0.12, 0.56, 1.0, 0.6])
    mask = np.array(mask)
    h, w = mask.shape
    mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, 4)
    ax.imshow(mask_image)
    # Draw the bounding box prompt on top of the mask overlay
    x0, y0, x1, y1 = json.loads(train_ds["prompt"][0])["bbox"]
    ax.add_patch(
        plt.Rectangle((x0, y0), x1 - x0, y1 - y0,
                      fill=False, edgecolor="lime", linewidth=2))

example = train_ds[0]
image = np.array(example["image"])
ground_truth_mask = np.array(example["mask"])

fig, ax = plt.subplots()
ax.imshow(image)
show_mask(ground_truth_mask, ax)
ax.set_title("Ground truth mask")
ax.set_axis_off()
plt.show()
```
Now we can define our dataset class for loading the data. `SAMDataset` wraps our dataset and formats each sample the way the SAM processor expects, so instead of raw images and masks you get processed images, bounding boxes, and ground-truth masks ready for training.
By default, the processor resizes images, so on top of the images and masks it also returns the original sizes. We also need to binarize the mask, since it contains the values 0 and 255.
```python
from torch.utils.data import Dataset
import torch

class SAMDataset(Dataset):
    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        image = item["image"]
        prompt = json.loads(item["prompt"])["bbox"]
        inputs = self.processor(image, input_boxes=[[prompt]], return_tensors="pt")
        inputs["ground_truth_mask"] = (np.array(item["mask"]) > 0).astype(np.float32)
        inputs["original_image_size"] = torch.tensor(image.size[::-1])
        return inputs
```
We can initialize the processor and the dataset with it.
```python
from transformers import Sam2Processor
processor = Sam2Processor.from_pretrained("facebook/sam2.1-hiera-small")
train_dataset = SAMDataset(dataset=train_ds, processor=processor)
``` 
We need to define a data collator that turns the ground-truth masks, which come in varying sizes, into a batch of masks with the same shape. We resize them using nearest-neighbor interpolation and batch the remaining elements as tensors. If your masks all have the same size, feel free to skip this step.
```python
import torch.nn.functional as F
from torch.utils.data import DataLoader

def collate_fn(batch, target_hw=(256, 256)):
    pixel_values = torch.cat([item["pixel_values"] for item in batch], dim=0)
    original_sizes = torch.stack([item["original_sizes"] for item in batch])
    reshaped_input_sizes = torch.stack([item["reshaped_input_sizes"] for item in batch])
    input_boxes = torch.cat([item["input_boxes"] for item in batch], dim=0)
    ground_truth_masks = torch.cat([
        F.interpolate(
            torch.as_tensor(x["ground_truth_mask"]).unsqueeze(0).unsqueeze(0).float(),
            size=target_hw,
            mode="nearest",
        )
        for x in batch
    ], dim=0).long()
    return {
        "pixel_values": pixel_values,
        "original_sizes": original_sizes,
        "reshaped_input_sizes": reshaped_input_sizes,
        "input_boxes": input_boxes,
        "ground_truth_mask": ground_truth_masks,
        "original_image_size": torch.stack([item["original_image_size"] for item in batch]),
    }

train_dataloader = DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=collate_fn,
)
```
Let's take a look at what the data loader yields.
```python
batch = next(iter(train_dataloader))
for k, v in batch.items():
    print(k, v.shape)

# pixel_values torch.Size([4, 3, 1024, 1024])
# original_sizes torch.Size([4, 1, 2])
# reshaped_input_sizes torch.Size([4, 1, 2])
# input_boxes torch.Size([4, 1, 4])
# ground_truth_mask torch.Size([4, 1, 256, 256])
# original_image_size torch.Size([4, 2])
```
We will now load the model, freeze the vision and prompt encoders, and train only the mask decoder.
```python
from transformers import Sam2Model

model = Sam2Model.from_pretrained("facebook/sam2.1-hiera-small")

# Freeze the vision and prompt encoders; only the mask decoder stays trainable.
for name, param in model.named_parameters():
    if name.startswith("vision_encoder") or name.startswith("prompt_encoder"):
        param.requires_grad_(False)
``` 
We can now define the optimizer and the loss function.
```python
from torch.optim import Adam
import monai
optimizer = Adam(model.mask_decoder.parameters(), lr=1e-5, weight_decay=0)
seg_loss = monai.losses.DiceCELoss(sigmoid=True, squared_pred=True, reduction='mean')
```
We log our predictions to trackio so we can monitor how the model improves during training.
```python
from PIL import Image
import trackio
import json

@torch.no_grad()
def predict_fn(img, bbox):
    inputs = processor(images=img, input_boxes=[[bbox]], return_tensors="pt").to(model.device)
    outputs = model(**inputs)
    masks = processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"])[0]
    return masks

def log_eval_masks_trackio(dataset, indices, step, predict_fn, project=None, sample_cap=8):
    logs = {"eval/step": int(step)}
    for idx in indices[:sample_cap]:
        item = dataset[idx]
        img = item["image"]
        bbox = json.loads(item["prompt"])["bbox"]
        preds = predict_fn(img, bbox)
        preds = preds.squeeze(0)
        mask = (preds[0] > 0).cpu().numpy()
        overlay = np.asarray(img, dtype=np.uint8).copy()
        overlay[mask] = 0.55 * overlay[mask] + 0.45 * np.array([0, 255, 0], dtype=np.float32)
        logs[f"{idx}/overlay"] = trackio.Image(overlay, caption="overlay")
    trackio.log(logs)
```
We can now write our training loop and train!
Notice how we log our loss and evaluation masks with trackio.
```python
from tqdm import tqdm
from statistics import mean
import trackio
import torch

num_epochs = 30
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.train()

trackio.init(project="mask-eval")

for epoch in range(num_epochs):
    epoch_losses = []
    for batch in tqdm(train_dataloader):
        outputs = model(pixel_values=batch["pixel_values"].to(device),
                        input_boxes=batch["input_boxes"].to(device),
                        multimask_output=False)
        predicted_masks = outputs.pred_masks.squeeze(1)
        ground_truth_masks = batch["ground_truth_mask"].float().to(device)
        loss = seg_loss(predicted_masks, ground_truth_masks)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())

    log_eval_masks_trackio(dataset=val_ds, indices=[0, 3, 6, 9], step=epoch, predict_fn=predict_fn, project="mask-eval")
    print(f"Epoch: {epoch}")
    print(f"Mean loss: {mean(epoch_losses)}")
    trackio.log({"loss": mean(epoch_losses)})

trackio.finish()
```
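Once training is done, you will likely want to persist the fine-tuned weights. A minimal sketch (the output directory name is arbitrary):
```python
# Save the fine-tuned model and processor, then reload them for inference.
save_dir = "sam2.1-hiera-small-micromat"
model.save_pretrained(save_dir)
processor.save_pretrained(save_dir)

from transformers import Sam2Model, Sam2Processor

model = Sam2Model.from_pretrained(save_dir)
processor = Sam2Processor.from_pretrained(save_dir)
```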


@ -187,7 +187,7 @@ from torch import nn
from transformers import Trainer
class CustomTrainer(Trainer):
def compute_loss(self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], return_outputs: bool = False num_items_in_batch: Optional[torch.Tensor] = None):
def compute_loss(self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], return_outputs: bool = False, num_items_in_batch: Optional[torch.Tensor] = None):
labels = inputs.pop("labels")
# forward pass
outputs = model(**inputs)


@ -113,7 +113,7 @@ _deps = [
"GitPython<3.1.19",
"hf-doc-builder>=0.3.0",
"hf_xet",
"huggingface-hub==1.0.0.rc6",
"huggingface-hub>=1.0.0,<2.0",
"importlib_metadata",
"ipadic>=1.0.0,<2.0",
"jinja2>=3.1.0",


@ -23,7 +23,7 @@ deps = {
"GitPython": "GitPython<3.1.19",
"hf-doc-builder": "hf-doc-builder>=0.3.0",
"hf_xet": "hf_xet",
"huggingface-hub": "huggingface-hub==1.0.0.rc6",
"huggingface-hub": "huggingface-hub>=1.0.0,<2.0",
"importlib_metadata": "importlib_metadata",
"ipadic": "ipadic>=1.0.0,<2.0",
"jinja2": "jinja2>=3.1.0",


@ -2192,7 +2192,7 @@ class GenerationMixin(ContinuousMixin):
has_disk_offload = "disk" in all_model_devices
can_compile &= not has_disk_offload
# Finally: if the user has manually specified compilation options, but compilation is not possible, let's warn
# If the user has manually specified compilation options, but compilation is not possible, let's warn
# them
if generation_config.compile_config is not None and not can_compile:
logger.warning_once(
@ -2200,6 +2200,18 @@ class GenerationMixin(ContinuousMixin):
"will be skipped."
)
# Finally: if we can compile, disable tokenizers parallelism and check for FA2 + static cache
os.environ["TOKENIZERS_PARALLELISM"] = "0"
# If we use FA2 and a static cache, we cannot compile with fullgraph
if self.config._attn_implementation == "flash_attention_2":
# only raise warning if the user passed an explicit compile-config
if generation_config.compile_config is not None and generation_config.compile_config.fullgraph:
logger.warning_once(
"When using Flash Attention 2 and a static cache, you cannot use the option `CompileConfig(fullgraph=True)` as "
"FA2 introduces graph breaks. We overrode the option with `fullgraph=False`."
)
generation_config.compile_config.fullgraph = False
return can_compile
def _get_deprecated_gen_repo(
@ -2636,7 +2648,7 @@ class GenerationMixin(ContinuousMixin):
UserWarning,
)
# 8. prepare logits processors and stopping criteria
# 8. Prepare logits processors and stopping criteria
prepared_logits_processor = self._get_logits_processor(
generation_config=generation_config,
input_ids_seq_length=input_ids_length,
@ -2843,40 +2855,21 @@ class GenerationMixin(ContinuousMixin):
batch_size, cur_len = input_ids.shape[:2]
this_peer_finished = False
unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
model_kwargs = self._get_initial_cache_position(cur_len, input_ids.device, model_kwargs)
model_forward = self.__call__
compile_forward = self._valid_auto_compile_criteria(model_kwargs, generation_config)
if compile_forward:
os.environ["TOKENIZERS_PARALLELISM"] = "0"
# If we use FA2 and a static cache, we cannot compile with fullgraph
if self.config._attn_implementation == "flash_attention_2":
# only raise warning if the user passed an explicit compile-config
if generation_config.compile_config is not None and generation_config.compile_config.fullgraph:
logger.warning_once(
"When using Flash Attention 2 and a static cache, you cannot use the option `CompileConfig(fullgraph=True)` as "
"FA2 introduces graph breaks. We overrode the option with `fullgraph=False`."
)
generation_config.compile_config.fullgraph = False
model_forward = self.get_compiled_call(generation_config.compile_config)
model_forward = (
self.get_compiled_call(generation_config.compile_config)
if self._valid_auto_compile_criteria(model_kwargs, generation_config)
else self.__call__
)
if generation_config.prefill_chunk_size is not None:
model_kwargs = self._prefill_chunking(input_ids, generation_config, **model_kwargs)
is_prefill = False
else:
is_prefill = True
prefill_consumed = False
outputs = self._prefill(input_ids, generation_config, model_kwargs)
while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
# prepare model inputs
model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
if is_prefill:
outputs = self(**model_inputs, return_dict=True)
is_prefill = False
else:
if prefill_consumed:
model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
outputs = model_forward(**model_inputs, return_dict=True)
# synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping
prefill_consumed = True
model_kwargs = self._update_model_kwargs_for_generation(
outputs,
model_kwargs,
@ -3246,7 +3239,6 @@ class GenerationMixin(ContinuousMixin):
`return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if
`model.config.is_encoder_decoder=True`.
"""
# 1. init beam_search values
pad_token_id = generation_config._pad_token_tensor
eos_token_id = generation_config._eos_token_tensor
@ -3287,8 +3279,6 @@ class GenerationMixin(ContinuousMixin):
dim=0,
).to(input_ids.device)
model_kwargs = self._get_initial_cache_position(cur_len, input_ids.device, model_kwargs)
# (joao) feature lost in the refactor. Probably won't implement, hurts readability with minimal gains (there
# are newer low-memory alternatives like the offloaded cache)
sequential = generation_config.low_memory
@ -3350,13 +3340,18 @@ class GenerationMixin(ContinuousMixin):
)
beam_indices = running_beam_indices.detach().clone()
prefill_consumed = False
flat_running_sequences = input_ids
model_outputs = self._prefill(input_ids, generation_config, model_kwargs)
# 4. run the generation loop
while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
# a. Forward current tokens, obtain the logits
flat_running_sequences = self._flatten_beam_dim(running_sequences[:, :, :cur_len])
model_inputs = self.prepare_inputs_for_generation(flat_running_sequences, **model_kwargs)
model_outputs = self(**model_inputs, return_dict=True)
if prefill_consumed:
# a. Forward current tokens, obtain the logits
flat_running_sequences = self._flatten_beam_dim(running_sequences[:, :, :cur_len])
model_inputs = self.prepare_inputs_for_generation(flat_running_sequences, **model_kwargs)
model_outputs = self(**model_inputs, return_dict=True)
prefill_consumed = True
# synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping
model_kwargs = self._update_model_kwargs_for_generation(
@ -3839,49 +3834,51 @@ class GenerationMixin(ContinuousMixin):
else:
return input_ids
def _prefill_chunking(self, input_ids: torch.LongTensor, generation_config: GenerationConfig, **model_kwargs):
# Even if we are not compiling the forward, flex is always compiled when used. With chunk prefill, we may
# end up needing just a bit more graphs than the default (which is 8). Doing this avoids very cryptic warnings
torch._dynamo.config.cache_size_limit = 64
# TODO: v5.1: make public once API stabilized
def _prefill(self, input_ids: torch.LongTensor, generation_config: GenerationConfig, model_kwargs):
if generation_config.prefill_chunk_size is None:
model_kwargs = self._get_initial_cache_position(input_ids.shape[1], input_ids.device, model_kwargs)
model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
return self(**model_inputs, return_dict=True)
else: # Chunked prefill
# Even if we are not compiling the forward, flex is always compiled when used. With chunked prefill, we may
# end up needing just a bit more graphs than the default (which is 8). Doing this avoids very cryptic warnings
torch._dynamo.config.cache_size_limit = 64
chunk_size = generation_config.prefill_chunk_size
# Only chunk up the token just before last, so that decoding is completely performed outside this function
# (here we simply prefill the cache)
input_chunks = torch.split(input_ids[:, :-1], chunk_size, dim=-1)
chunk_size = generation_config.prefill_chunk_size
input_chunks = torch.split(input_ids, chunk_size, dim=-1)
if "past_key_values" not in model_kwargs:
raise ValueError("Cannot use prefill chunking without a cache")
if "past_key_values" not in model_kwargs:
raise ValueError("Cannot use prefill chunking without a cache")
model_forward = self.forward
compile_forward = self._valid_auto_compile_criteria(model_kwargs, generation_config)
if compile_forward:
model_forward = self.get_compiled_call(generation_config.compile_config)
attention_mask = model_kwargs.pop("attention_mask", None)
past_length = 0
for input_chunk in input_chunks:
current_length = past_length + input_chunk.shape[-1]
# Prepare inputs
if attention_mask is not None:
model_kwargs["attention_mask"] = attention_mask[:, :current_length]
model_kwargs["cache_position"] = torch.arange(
past_length, current_length, dtype=torch.long, device=input_chunk.device
model_forward = (
self.get_compiled_call(generation_config.compile_config)
if self._valid_auto_compile_criteria(model_kwargs, generation_config)
else self.__call__
)
model_kwargs["position_ids"] = model_kwargs["cache_position"].unsqueeze(0)
model_inputs = self.prepare_inputs_for_generation(input_chunk, **model_kwargs)
outputs = model_forward(**model_inputs, return_dict=True)
attention_mask = model_kwargs.pop("attention_mask", None)
past_length = 0
for input_chunk in input_chunks:
current_length = past_length + input_chunk.shape[-1]
if attention_mask is not None:
model_kwargs["attention_mask"] = attention_mask[:, :current_length]
model_kwargs["cache_position"] = torch.arange(
past_length, current_length, dtype=torch.long, device=input_chunk.device
)
model_kwargs["position_ids"] = model_kwargs["cache_position"].unsqueeze(0)
model_inputs = self.prepare_inputs_for_generation(input_chunk, **model_kwargs)
model_kwargs["past_key_values"] = outputs.past_key_values
past_length = current_length
outputs = model_forward(**model_inputs, return_dict=True)
model_kwargs["attention_mask"] = attention_mask
model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + 1
_ = model_kwargs.pop("position_ids", None)
model_kwargs["past_key_values"] = outputs.past_key_values
past_length = current_length
return model_kwargs
model_kwargs["attention_mask"] = attention_mask
model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + 1
_ = model_kwargs.pop("position_ids", None)
# Latest outputs contain next token logits
return outputs
def _speculative_sampling(
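As context for the refactor above: chunked prefill is still driven by `generation_config.prefill_chunk_size`, now handled inside the unified `_prefill` helper. A hedged sketch of how a caller would exercise it (the model id and values are placeholders):
```python
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

# Placeholder checkpoint; any decoder-only model works.
model_id = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

inputs = tokenizer("A long prompt " * 200, return_tensors="pt")
# The prompt is prefilled in chunks of 256 tokens before the decoding loop starts.
generation_config = GenerationConfig(max_new_tokens=32, prefill_chunk_size=256)
outputs = model.generate(**inputs, generation_config=generation_config)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```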


@ -362,25 +362,13 @@ class ImageProcessingMixin(PushToHubMixin):
"""
image_processor_dict = image_processor_dict.copy()
return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
# The `size` parameter is a dict and was previously an int or tuple in feature extractors.
# We set `size` here directly to the `image_processor_dict` so that it is converted to the appropriate
# dict within the image processor and isn't overwritten if `size` is passed in as a kwarg.
if "size" in kwargs and "size" in image_processor_dict:
image_processor_dict["size"] = kwargs.pop("size")
if "crop_size" in kwargs and "crop_size" in image_processor_dict:
image_processor_dict["crop_size"] = kwargs.pop("crop_size")
image_processor_dict.update({k: v for k, v in kwargs.items() if k in cls.valid_kwargs.__annotations__})
image_processor = cls(**image_processor_dict)
# Update image_processor with kwargs if needed
to_remove = []
for key, value in kwargs.items():
# Remove kwargs that are used to initialize the image processor attributes
for key in list(kwargs):
if hasattr(image_processor, key):
setattr(image_processor, key, value)
to_remove.append(key)
for key in to_remove:
kwargs.pop(key, None)
kwargs.pop(key)
logger.info(f"Image processor {image_processor}")
if return_unused_kwargs:
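The net effect is that keyword arguments the processor accepts are now applied when loading from a checkpoint instead of being silently dropped. A small sketch (the checkpoint is illustrative):
```python
from transformers import AutoImageProcessor

# Kwargs matching the processor's accepted arguments override the values stored in preprocessor_config.json.
image_processor = AutoImageProcessor.from_pretrained(
    "google/vit-base-patch16-224",
    size={"height": 384, "width": 384},
)
print(image_processor.size)  # {'height': 384, 'width': 384}
```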


@ -185,6 +185,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
input_data_format = None
device = None
model_input_names = ["pixel_values"]
image_seq_length = None
valid_kwargs = ImagesKwargs
unused_kwargs = None
@ -227,6 +228,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
padding_mode: Optional[str] = "constant",
return_mask: bool = False,
disable_grouping: Optional[bool] = False,
is_nested: Optional[bool] = False,
**kwargs,
) -> Union[tuple["torch.Tensor", "torch.Tensor"], "torch.Tensor"]:
"""
@ -257,7 +259,9 @@ class BaseImageProcessorFast(BaseImageProcessor):
else:
pad_size = get_max_height_width(images)
grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
grouped_images, grouped_images_index = group_images_by_shape(
images, disable_grouping=disable_grouping, is_nested=is_nested
)
processed_images_grouped = {}
processed_masks_grouped = {}
for shape, stacked_images in grouped_images.items():
@ -280,9 +284,9 @@ class BaseImageProcessorFast(BaseImageProcessor):
stacked_masks[..., : image_size[0], : image_size[1]] = 1
processed_masks_grouped[shape] = stacked_masks
processed_images = reorder_images(processed_images_grouped, grouped_images_index)
processed_images = reorder_images(processed_images_grouped, grouped_images_index, is_nested=is_nested)
if return_mask:
processed_masks = reorder_images(processed_masks_grouped, grouped_images_index)
processed_masks = reorder_images(processed_masks_grouped, grouped_images_index, is_nested=is_nested)
return processed_images, processed_masks
return processed_images
@ -305,6 +309,8 @@ class BaseImageProcessorFast(BaseImageProcessor):
Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
`InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`.
antialias (`bool`, *optional*, defaults to `True`):
Whether to use antialiasing.
Returns:
`torch.Tensor`: The resized image.
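A minimal sketch of what the new `is_nested` flag is expected to do, assuming it simply threads through the grouping helpers so lists of lists of images survive a group/reorder round trip.
import torch
from transformers.image_processing_utils_fast import group_images_by_shape, reorder_images

nested = [[torch.rand(3, 32, 32), torch.rand(3, 64, 64)], [torch.rand(3, 32, 32)]]
grouped, index = group_images_by_shape(nested, disable_grouping=False, is_nested=True)
restored = reorder_images(grouped, index, is_nested=True)
# The nested structure and per-image shapes come back unchanged.
assert all(a.shape == b.shape for orig, back in zip(nested, restored) for a, b in zip(orig, back))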

View File

@ -35,6 +35,10 @@ def adapt_fp_quant_config(config: FPQuantConfig):
if config.backward_dtype == "bf16":
backward_dtype = FPQuantDtype.BF16
elif config.backward_dtype == "mxfp8":
backward_dtype = FPQuantDtype.MXFP8
elif config.backward_dtype == "mxfp4":
backward_dtype = FPQuantDtype.MXFP4
else:
raise ValueError(f"Unsupported backward dtype: {config.backward_dtype}")

View File

@ -11,17 +11,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib.metadata
import re
from collections.abc import Callable
from functools import partial
from types import ModuleType
from typing import Optional, Union
from packaging import version as pkg_version
from ..modeling_flash_attention_utils import lazy_import_flash_attention
from ..utils import logging
from ..utils.import_utils import is_kernels_available
from .flash_attention import flash_attention_forward
@ -67,6 +65,12 @@ try:
layer_name="LigerRMSNorm",
)
},
"xpu": {
Mode.INFERENCE: LayerRepository(
repo_id="kernels-community/rmsnorm",
layer_name="RMSNorm",
)
},
},
"MLP": {
"cuda": LayerRepository(
@ -142,7 +146,18 @@ try:
},
}
register_kernel_mapping(_KERNEL_MAPPING)
def has_key(d, key):
return key in d or any(isinstance(v, dict) and has_key(v, key) for v in d.values())
def register_kernel_mapping_transformers(mapping=None):
if mapping is None:
mapping = _KERNEL_MAPPING
if has_key(mapping, "xpu") and not is_kernels_available(MIN_VERSION="0.10.2"):
raise ImportError(
"kernels uses an incompatible version. Please install the latest version with `pip install -U kernels`."
)
register_kernel_mapping(mapping)
except ImportError:
_kernels_available = False
@ -221,7 +236,7 @@ def load_and_register_attn_kernel(attn_implementation: str, attention_wrapper: O
# Load the kernel from hub
try:
kernel = get_kernel_wrapper(repo_id, revision=rev)
kernel = get_kernel(repo_id, revision=rev)
except Exception as e:
raise ValueError(f"An error occurred while trying to load from '{repo_id}': {e}.")
# correctly wrap the kernel
@ -245,10 +260,12 @@ def lazy_load_kernel(kernel_name: str, mapping: dict[str, Optional[ModuleType]]
mapping[kernel_name] = None
return None
if _kernels_available:
from kernels import get_kernel
try:
repo_id = _HUB_KERNEL_MAPPING[kernel_name]["repo_id"]
version = _HUB_KERNEL_MAPPING[kernel_name].get("version", None)
kernel = get_kernel_wrapper(repo_id, version=version)
kernel = get_kernel(repo_id, version=version)
mapping[kernel_name] = kernel
except FileNotFoundError:
mapping[kernel_name] = None
@ -280,25 +297,11 @@ def lazy_load_kernel(kernel_name: str, mapping: dict[str, Optional[ModuleType]]
return mapping[kernel_name]
def get_kernel_wrapper(kernel_name: str, revision: Optional[str] = None, version: Optional[str] = None) -> ModuleType:
from .. import __version__
user_agent = {"framework": "transformers", "version": __version__, "repo_id": kernel_name}
if _kernels_available:
kernels_version = importlib.metadata.version("kernels")
if pkg_version.parse(kernels_version) >= pkg_version.parse("0.10.4"):
return get_kernel(kernel_name, revision=revision, version=version, user_agent=user_agent)
else:
return get_kernel(kernel_name, revision=revision)
else:
raise ImportError("kernels is not installed, please install it with `pip install kernels`")
__all__ = [
"LayerRepository",
"use_kernel_forward_from_hub",
"register_kernel_mapping",
"register_kernel_mapping_transformers",
"replace_kernel_forward_from_hub",
"lazy_load_kernel",
"get_kernel_wrapper",
]
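The recursive `has_key` gate is easiest to see on a toy mapping; this sketch simply restates the helper from the hunk above.
def has_key(d, key):
    return key in d or any(isinstance(v, dict) and has_key(v, key) for v in d.values())

mapping = {"RMSNorm": {"xpu": {"inference": "kernels-community/rmsnorm"}}}
print(has_key(mapping, "xpu"))   # True  -> registration requires kernels >= 0.10.2
print(has_key(mapping, "rocm"))  # False -> no extra version constraint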

View File

@ -470,10 +470,10 @@ def replace_with_mxfp4_linear(
if quantization_config.dequantize:
return model
else:
from .hub_kernels import get_kernel_wrapper
from kernels import get_kernel
global triton_kernels_hub
triton_kernels_hub = get_kernel_wrapper("kernels-community/triton_kernels")
triton_kernels_hub = get_kernel("kernels-community/triton_kernels")
modules_to_not_convert = ["lm_head"] if modules_to_not_convert is None else modules_to_not_convert

View File

@ -4033,10 +4033,14 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
if use_kernels:
if not is_kernels_available():
raise ValueError(
"Kernels are not available. To use kernels, please install kernels using `pip install kernels`"
"`use_kernels=True` requires kernels>=0.9.0. Please install the latest version with `pip install -U kernels`"
)
from kernels import use_kernel_mapping
from .integrations.hub_kernels import register_kernel_mapping_transformers
register_kernel_mapping_transformers()
if kernel_config is not None and isinstance(kernel_config, KernelConfig):
# This will make sure the mapping is valid, and the layers are registered in the model
kernel_config.sanitize_kernel_mapping(self)
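A hedged usage sketch of the new code path: loading with `use_kernels=True` now registers the transformers kernel mapping (including the xpu entries) up front, and fails early if an old `kernels` package is installed. The checkpoint name is a placeholder.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-0.5B",  # placeholder checkpoint
    use_kernels=True,     # raises if kernels >= 0.9.0 is not installed
)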

View File

@ -98,12 +98,12 @@ else:
("eomt", ("EomtImageProcessor", "EomtImageProcessorFast")),
("flava", ("FlavaImageProcessor", "FlavaImageProcessorFast")),
("focalnet", ("BitImageProcessor", "BitImageProcessorFast")),
("fuyu", ("FuyuImageProcessor", None)),
("fuyu", ("FuyuImageProcessor", "FuyuImageProcessorFast")),
("gemma3", ("Gemma3ImageProcessor", "Gemma3ImageProcessorFast")),
("gemma3n", ("SiglipImageProcessor", "SiglipImageProcessorFast")),
("git", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
("glm4v", ("Glm4vImageProcessor", "Glm4vImageProcessorFast")),
("glpn", ("GLPNImageProcessor", None)),
("glpn", ("GLPNImageProcessor", "GLPNImageProcessorFast")),
("got_ocr2", ("GotOcr2ImageProcessor", "GotOcr2ImageProcessorFast")),
("grounding-dino", ("GroundingDinoImageProcessor", "GroundingDinoImageProcessorFast")),
("groupvit", ("CLIPImageProcessor", "CLIPImageProcessorFast")),

View File

@ -39,9 +39,10 @@ from typing import Any, Optional
import torch
from huggingface_hub import snapshot_download
from peft import PeftModel
from safetensors import safe_open
from transformers import AutoConfig
from transformers import AutoConfig, AutoModel
from transformers.models.colqwen2 import ColQwen2ForRetrieval
from transformers.models.colqwen2.configuration_colqwen2 import ColQwen2Config
from transformers.utils import logging
@ -69,7 +70,7 @@ def load_original_state_dict(model_id: str, revision: Optional[str] = None) -> d
original_state_dict[key] = f.get_tensor(key)
# Some weights are tied, so `lm_head` is not saved. Let's clone it to load the state dict.
if "lm_head.weight" not in original_state_dict:
if "lm_head.weight" not in original_state_dict and "model.embed_tokens.weight" in original_state_dict:
original_state_dict["lm_head.weight"] = original_state_dict["model.embed_tokens.weight"].clone()
return original_state_dict
@ -124,7 +125,21 @@ def convert_colqwen2_weights_to_hf(
config.is_composition = False
# Load the untrained model
model = ColQwen2ForRetrieval(config=config).to("cpu").eval()
vlm_name_or_path = getattr(config.vlm_config, "_name_or_path", None)
if vlm_name_or_path and "2.5" in str(vlm_name_or_path):
print(
"Detected colqwen2.5 adapters in vlm_config; loading base model %s and merging PEFT weights."
% vlm_name_or_path
)
base_model = AutoModel.from_pretrained(
vlm_name_or_path,
device_map="cpu",
trust_remote_code=True,
)
peft_model = PeftModel.from_pretrained(base_model, model_id)
model = peft_model.merge_and_unload()
else:
model = ColQwen2ForRetrieval(config=config).to("cpu").eval()
print("Created model with new config and randomly initialized weights")
# NOTE: The new model was initialized with float32 weights. We need to convert it to the desired precision.
@ -201,6 +216,7 @@ if __name__ == "__main__":
help="Name or path of the original VLM backbone model",
default=None,
)
args = parser.parse_args()
convert_colqwen2_weights_to_hf(

View File

@ -172,7 +172,6 @@ class ColQwen2ForRetrieval(ColQwen2PreTrainedModel):
inputs_embeds = self.vlm.language_model.embed_tokens(input_ids)
if pixel_values is not None:
pixel_values = pixel_values.type(self.vlm.visual.get_dtype())
image_embeds = self.vlm.visual(pixel_values, grid_thw=image_grid_thw)
image_mask = (
(input_ids == self.config.vlm_config.image_token_id).unsqueeze(-1).expand_as(inputs_embeds)

View File

@ -359,7 +359,6 @@ class ColQwen2ForRetrieval(ColPaliForRetrieval):
inputs_embeds = self.vlm.language_model.embed_tokens(input_ids)
if pixel_values is not None:
pixel_values = pixel_values.type(self.vlm.visual.get_dtype())
image_embeds = self.vlm.visual(pixel_values, grid_thw=image_grid_thw)
image_mask = (
(input_ids == self.config.vlm_config.image_token_id).unsqueeze(-1).expand_as(inputs_embeds)

View File

@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Optional, Union
@ -204,11 +203,11 @@ class CsmGenerationMixin(GenerationMixin):
criterion.max_length -= cur_len
# ============================================
model_forward = self.__call__
compile_forward = self._valid_auto_compile_criteria(model_kwargs, generation_config)
if compile_forward:
os.environ["TOKENIZERS_PARALLELISM"] = "0"
model_forward = self.get_compiled_call(generation_config.compile_config)
model_forward = (
self.get_compiled_call(generation_config.compile_config)
if self._valid_auto_compile_criteria(model_kwargs, generation_config)
else self.__call__
)
is_prefill = True
while self._has_unfinished_sequences(

View File

@ -278,6 +278,12 @@ class DiaGenerationMixin(GenerationMixin):
)
generation_mode = generation_config.get_generation_mode(assistant_model)
if generation_mode not in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH):
raise ValueError(
"Got incompatible mode for generation, should be one of greedy or sampling. "
"Ensure that beam search is de-activated by setting `num_beams=1`."
)
self._validate_model_kwargs(model_kwargs.copy())
self._validate_generation_mode(generation_mode, generation_config, generation_mode_kwargs)
@ -382,26 +388,29 @@ class DiaGenerationMixin(GenerationMixin):
# Prepare inner 2D logic in generation loop
input_ids = input_ids.reshape(-1, input_ids.shape[-1])
# 10. go into different generation modes
if generation_mode in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH):
# 11. expand input_ids with `num_return_sequences` additional sequences per batch
if generation_config.num_return_sequences > 1:
raise ValueError("`num_return_sequences>1` is incompatible with Dia.")
model_kwargs = self._get_initial_cache_position(input_ids.shape[1], input_ids.device, model_kwargs)
# prepare model inputs
model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
# 12. run sample (it degenerates to greedy search when `generation_config.do_sample=False`)
return self._sample(
input_ids,
logits_processor=prepared_logits_processor,
stopping_criteria=prepared_stopping_criteria,
generation_config=generation_config,
**generation_mode_kwargs,
**model_kwargs,
)
else:
raise ValueError(
"Got incompatible mode for generation, should be one of greedy or sampling. "
"Ensure that beam search is de-activated by setting `num_beams=1`."
)
# 10. Prefill
model_inputs.update({"output_attentions": generation_config.output_attentions})
model_inputs.update({"output_hidden_states": generation_config.output_hidden_states})
outputs = self(**model_inputs, return_dict=True)
# 11. expand input_ids with `num_return_sequences` additional sequences per batch
if generation_config.num_return_sequences > 1:
raise ValueError("`num_return_sequences>1` is incompatible with Dia.")
# 12. run sample (it degenerates to greedy search when `generation_config.do_sample=False`)
return self._sample(
input_ids,
logits_processor=prepared_logits_processor,
stopping_criteria=prepared_stopping_criteria,
generation_config=generation_config,
prefill_outputs=outputs,
**generation_mode_kwargs,
**model_kwargs,
)
@torch.no_grad()
def generate(

View File

@ -20,6 +20,7 @@ from ...utils.import_utils import define_import_structure
if TYPE_CHECKING:
from .configuration_fuyu import *
from .image_processing_fuyu import *
from .image_processing_fuyu_fast import *
from .modeling_fuyu import *
from .processing_fuyu import *
else:

View File

@ -29,6 +29,7 @@ from ...image_utils import (
ChannelDimension,
ImageInput,
PILImageResampling,
SizeDict,
get_image_size,
infer_channel_dimension_format,
is_scaled_image,
@ -37,6 +38,7 @@ from ...image_utils import (
to_numpy_array,
validate_preprocess_arguments,
)
from ...processing_utils import ImagesKwargs
from ...utils import (
TensorType,
filter_out_non_signature_kwargs,
@ -70,6 +72,21 @@ def make_list_of_list_of_images(
raise ValueError("images must be a list of list of images or a list of images or an image.")
class FuyuImagesKwargs(ImagesKwargs, total=False):
r"""
patch_size (`dict[str, int]`, *optional*, defaults to `{"height": 30, "width": 30}`):
Dictionary in the format `{"height": int, "width": int}` specifying the size of the patches.
padding_value (`float`, *optional*, defaults to 1.0):
The value to pad the image with.
padding_mode (`str`, *optional*, defaults to "constant"):
The padding mode to use when padding the image.
"""
patch_size: Optional[SizeDict]
padding_value: float
padding_mode: str
class FuyuBatchFeature(BatchFeature):
"""
BatchFeature class for Fuyu image processor and processor.
@ -232,6 +249,7 @@ class FuyuImageProcessor(BaseImageProcessor):
"image_patch_indices_per_batch",
"image_patch_indices_per_subsequence",
]
valid_kwargs = FuyuImagesKwargs
def __init__(
self,

View File

@ -0,0 +1,382 @@
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fast Image processor class for Fuyu."""
import math
from typing import Optional, Union
import torch
from ...image_processing_utils import get_size_dict
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
group_images_by_shape,
reorder_images,
)
from ...image_utils import (
ImageInput,
PILImageResampling,
SizeDict,
)
from ...utils import (
TensorType,
auto_docstring,
is_torchvision_available,
logging,
requires_backends,
)
from .image_processing_fuyu import FuyuBatchFeature, FuyuImagesKwargs, make_list_of_list_of_images
if is_torchvision_available():
from torchvision.transforms.v2 import functional as F
logger = logging.get_logger(__name__)
@auto_docstring
class FuyuImageProcessorFast(BaseImageProcessorFast):
do_resize = True
size = {"height": 1080, "width": 1920}
resample = PILImageResampling.BILINEAR
do_pad = True
padding_value = 1.0
padding_mode = "constant"
do_normalize = True
image_mean = 0.5
image_std = 0.5
do_rescale = True
rescale_factor = 1 / 255
model_input_names = [
"images",
"image_input_ids",
"image_patches",
"image_patch_indices_per_batch",
"image_patch_indices_per_subsequence",
]
valid_kwargs = FuyuImagesKwargs
def _prepare_images_structure(
self,
images: ImageInput,
expected_ndims: int = 3,
) -> ImageInput:
images = self.fetch_images(images)
return make_list_of_list_of_images(images)
def resize(
self,
image: torch.Tensor,
size: SizeDict,
interpolation: Optional["F.InterpolationMode"] = None,
antialias: bool = True,
**kwargs,
) -> torch.Tensor:
"""
Resize an image to fit within `(size["height"], size["width"])` while maintaining aspect ratio.
Only resizes if the image is larger than the target size.
Args:
image (`torch.Tensor`):
Image to resize.
size (`SizeDict`):
Dictionary in the format `{"height": int, "width": int}` specifying the max size of the output image.
interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
`InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BILINEAR`.
antialias (`bool`, *optional*, defaults to `True`):
Whether to apply antialiasing when resizing.
"""
interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR
image_height, image_width = image.shape[-2:]
target_height, target_width = size.height, size.width
# Only resize if image is larger than target
if image_width <= target_width and image_height <= target_height:
return image
# Calculate optimal scale factor to fit within target size
height_scale_factor = target_height / image_height
width_scale_factor = target_width / image_width
optimal_scale_factor = min(height_scale_factor, width_scale_factor)
new_height = int(image_height * optimal_scale_factor)
new_width = int(image_width * optimal_scale_factor)
return super().resize(
image, SizeDict(height=new_height, width=new_width), interpolation=interpolation, antialias=antialias
)
def _preprocess(
self,
images: list["torch.Tensor"],
do_resize: bool,
size: SizeDict,
interpolation: Optional["F.InterpolationMode"],
do_rescale: bool,
rescale_factor: float,
do_normalize: bool,
image_mean: Optional[Union[float, list[float]]],
image_std: Optional[Union[float, list[float]]],
do_pad: Optional[bool],
padding_value: Optional[float],
padding_mode: Optional[str],
disable_grouping: Optional[bool],
return_tensors: Optional[Union[str, TensorType]],
**kwargs,
) -> FuyuBatchFeature:
# Group images by size for batched resizing
original_image_sizes = [batch_image[0].shape[-2:] for batch_image in images if batch_image]
grouped_images, grouped_images_index = group_images_by_shape(
images, disable_grouping=disable_grouping, is_nested=True
)
resized_images_grouped = {}
for shape, stacked_images in grouped_images.items():
if do_resize:
stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation)
resized_images_grouped[shape] = stacked_images
resized_images = reorder_images(resized_images_grouped, grouped_images_index, is_nested=True)
image_sizes = [batch_image[0].shape[-2:] for batch_image in resized_images if batch_image]
image_unpadded_heights = [[image_size[0]] for image_size in image_sizes]
image_unpadded_widths = [[image_size[1]] for image_size in image_sizes]
image_scale_factors = [
[resized_size[0] / original_size[0]]
for original_size, resized_size in zip(original_image_sizes, image_sizes)
]
if do_pad:
resized_images = self.pad(
resized_images,
pad_size=size,
fill_value=padding_value,
padding_mode=padding_mode,
disable_grouping=disable_grouping,
is_nested=True,
)
# Group images by size for further processing
# Needed in case do_resize is False, or resize returns images with different sizes
grouped_images, grouped_images_index = group_images_by_shape(
resized_images, disable_grouping=disable_grouping, is_nested=True
)
processed_images_grouped = {}
for shape, stacked_images in grouped_images.items():
# Fused rescale and normalize
stacked_images = self.rescale_and_normalize(
stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
)
processed_images_grouped[shape] = stacked_images
processed_images = reorder_images(processed_images_grouped, grouped_images_index, is_nested=True)
return FuyuBatchFeature(
data={
"images": processed_images,
"image_unpadded_heights": image_unpadded_heights,
"image_unpadded_widths": image_unpadded_widths,
"image_scale_factors": image_scale_factors,
},
tensor_type=return_tensors,
)
def get_num_patches(self, image_height: int, image_width: int, patch_size: Optional[SizeDict] = None) -> int:
"""
Calculate number of patches required to encode an image.
Args:
image_height (`int`):
Height of the image.
image_width (`int`):
Width of the image.
patch_size (`SizeDict`, *optional*):
Dictionary in the format `{"height": int, "width": int}` specifying the size of the patches.
"""
if patch_size is None:
patch_size = SizeDict(**self.patch_size)
patch_height, patch_width = patch_size.height, patch_size.width
if image_height % patch_height != 0:
raise ValueError(f"{image_height=} must be divisible by {patch_height}")
if image_width % patch_width != 0:
raise ValueError(f"{image_width=} must be divisible by {patch_width}")
num_patches_per_dim_h = image_height // patch_height
num_patches_per_dim_w = image_width // patch_width
num_patches = num_patches_per_dim_h * num_patches_per_dim_w
return num_patches
def patchify_image(self, image: torch.Tensor, patch_size: Optional[SizeDict] = None) -> torch.Tensor:
"""
Convert an image into a tensor of patches using PyTorch's unfold operation.
Args:
image (`torch.Tensor`):
Image to convert. Shape: [batch, channels, height, width]
patch_size (`SizeDict`, *optional*):
Dictionary in the format `{"height": int, "width": int}` specifying the size of the patches.
"""
requires_backends(self, ["torch"])
if patch_size is None:
patch_size = SizeDict(**self.patch_size)
patch_height, patch_width = patch_size.height, patch_size.width
batch_size, channels, _, _ = image.shape
# Use unfold to extract patches
unfolded_along_height = image.unfold(2, patch_height, patch_height)
patches = unfolded_along_height.unfold(3, patch_width, patch_width)
patches = patches.contiguous()
# Reshape to [batch, num_patches, channels * patch_h * patch_w]
patches = patches.view(batch_size, channels, -1, patch_height, patch_width)
patches = patches.permute(0, 2, 3, 4, 1)
patches = patches.reshape(batch_size, -1, channels * patch_height * patch_width)
return patches
def preprocess_with_tokenizer_info(
self,
image_input: torch.Tensor,
image_present: torch.Tensor,
image_unpadded_h: torch.Tensor,
image_unpadded_w: torch.Tensor,
image_placeholder_id: int,
image_newline_id: int,
variable_sized: bool,
patch_size: Optional[dict[str, int]] = None,
) -> FuyuBatchFeature:
"""
Process images for model input. In particular, variable-sized images are handled here.
Args:
image_input (`torch.Tensor` of shape [batch_size, subsequence_size, num_channels, height, width]):
Tensor of images padded to model input size.
image_present (`torch.Tensor` of shape [batch_size, subsequence_size, num_images]):
Tensor of 1s and 0s indicating whether an image is present.
image_unpadded_h (`torch.Tensor` of shape [batch_size, subsequence_size]):
Tensor of unpadded image heights.
image_unpadded_w (`torch.Tensor` of shape [batch_size, subsequence_size]):
Tensor of unpadded image widths.
image_placeholder_id (int):
The id of the image placeholder token. Comes from an associated tokenizer.
image_newline_id (int):
The id of the image newline token. Comes from an associated tokenizer.
variable_sized (bool):
Whether to process images as variable-sized.
patch_size (`dict[str, int]`, *optional*):
Size of the patches.
"""
requires_backends(self, ["torch"])
if patch_size is None:
patch_size = SizeDict(**self.patch_size)
else:
patch_size = SizeDict(**patch_size)
patch_height, patch_width = patch_size.height, patch_size.width
# Only images that are present
images: list[list[torch.Tensor]] = []
batch_image_patches: list[list[torch.Tensor]] = []
# Image input ids for every subsequence, including ones with no image present
batch_image_input_ids: list[list[torch.Tensor]] = []
for batch_index in range(image_input.shape[0]):
image_input_ids = []
image_patches = []
for subseq_index in range(image_input.shape[1]):
if image_present[batch_index, subseq_index]:
image = image_input[batch_index, subseq_index]
image_height, image_width = image.shape[1], image.shape[2]
if variable_sized:
# Calculate new dimensions based on unpadded size
# The min() is required here due to floating point issues
new_h = min(
image_height,
math.ceil(image_unpadded_h[batch_index, subseq_index] / patch_height) * patch_height,
)
new_w = min(
image_width,
math.ceil(image_unpadded_w[batch_index, subseq_index] / patch_width) * patch_width,
)
image = image[:, :new_h, :new_w]
image_height, image_width = new_h, new_w
num_patches = self.get_num_patches(
image_height=image_height, image_width=image_width, patch_size=patch_size
)
# Create tensor of placeholder IDs
tensor_of_image_ids = torch.full(
[num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device
)
# Patchify the image
patches = self.patchify_image(image=image.unsqueeze(0), patch_size=patch_size).squeeze(0)
assert num_patches == patches.shape[0]
if variable_sized:
# Terminate each line with newline ID
tensor_of_image_ids = tensor_of_image_ids.reshape(-1, image_width // patch_width)
newline_ids = torch.full(
[tensor_of_image_ids.shape[0], 1],
image_newline_id,
dtype=torch.int32,
device=image_input.device,
)
tensor_of_image_ids = torch.cat([tensor_of_image_ids, newline_ids], dim=1)
tensor_of_image_ids = tensor_of_image_ids.reshape(-1)
images.append([image])
image_input_ids.append(tensor_of_image_ids)
image_patches.append(patches)
else:
image_input_ids.append(torch.tensor([], dtype=torch.int32, device=image_input.device))
batch_image_input_ids.append(image_input_ids)
batch_image_patches.append(image_patches)
# Create image patch indices
image_patch_indices_per_batch: list[list[torch.Tensor]] = []
image_patch_indices_per_subsequence: list[list[torch.Tensor]] = []
for sample_image_input_ids in batch_image_input_ids:
index_offset = 0
per_batch_indices = []
per_subsequence_indices = []
for subseq_image_input_ids in sample_image_input_ids:
# Indices of image patches
patches_mask = subseq_image_input_ids == image_placeholder_id
num_patches = torch.count_nonzero(patches_mask)
indices = torch.arange(num_patches, dtype=torch.int64, device=subseq_image_input_ids.device).type_as(
subseq_image_input_ids
)
# Place those indices in the image input ids token stream, with -1 representing non-index tokens
indices_in_stream_per_batch = torch.full_like(subseq_image_input_ids, -1)
indices_in_stream_per_subsequence = torch.full_like(subseq_image_input_ids, -1)
patches_inds = torch.nonzero(patches_mask, as_tuple=True)[0]
indices_in_stream_per_batch[patches_inds] = indices + index_offset
indices_in_stream_per_subsequence[patches_inds] = indices
per_batch_indices.append(indices_in_stream_per_batch)
per_subsequence_indices.append(indices_in_stream_per_subsequence)
index_offset += num_patches
image_patch_indices_per_batch.append(per_batch_indices)
image_patch_indices_per_subsequence.append(per_subsequence_indices)
return FuyuBatchFeature(
data={
"images": images,
"image_input_ids": batch_image_input_ids,
"image_patches": batch_image_patches,
"image_patch_indices_per_batch": image_patch_indices_per_batch,
"image_patch_indices_per_subsequence": image_patch_indices_per_subsequence,
}
)
def _further_process_kwargs(
self,
patch_size: Optional[dict[str, int]] = None,
**kwargs,
) -> dict:
"""
Process Fuyu-specific kwargs before validation.
"""
kwargs = super()._further_process_kwargs(**kwargs)
if patch_size is not None:
patch_size = SizeDict(**get_size_dict(patch_size, param_name="patch_size"))
kwargs["patch_size"] = patch_size
return kwargs
__all__ = ["FuyuImageProcessorFast"]

View File

@ -21,6 +21,7 @@ if TYPE_CHECKING:
from .configuration_glpn import *
from .feature_extraction_glpn import *
from .image_processing_glpn import *
from .image_processing_glpn_fast import *
from .modeling_glpn import *
else:
import sys

View File

@ -39,6 +39,7 @@ from ...image_utils import (
valid_images,
validate_preprocess_arguments,
)
from ...processing_utils import ImagesKwargs
from ...utils import TensorType, filter_out_non_signature_kwargs, logging, requires_backends
@ -49,6 +50,17 @@ if is_torch_available():
logger = logging.get_logger(__name__)
class GLPNImageProcessorKwargs(ImagesKwargs, total=False):
"""
size_divisor (`int`, *optional*, defaults to 32):
When `do_resize` is `True`, images are resized so their height and width are rounded down to the closest
multiple of `size_divisor`.
"""
size_divisor: int
resample: PILImageResampling
@requires(backends=("vision",))
class GLPNImageProcessor(BaseImageProcessor):
r"""
@ -66,9 +78,12 @@ class GLPNImageProcessor(BaseImageProcessor):
do_rescale (`bool`, *optional*, defaults to `True`):
Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). Can be
overridden by `do_rescale` in `preprocess`.
rescale_factor (`float`, *optional*, defaults to `1 / 255`):
The scaling factor to apply to the pixel values. Can be overridden by `rescale_factor` in `preprocess`.
"""
model_input_names = ["pixel_values"]
valid_kwargs = GLPNImageProcessorKwargs
def __init__(
self,
@ -76,12 +91,14 @@ class GLPNImageProcessor(BaseImageProcessor):
size_divisor: int = 32,
resample=PILImageResampling.BILINEAR,
do_rescale: bool = True,
rescale_factor: Optional[float] = 1 / 255,
**kwargs,
) -> None:
self.do_resize = do_resize
self.do_rescale = do_rescale
self.size_divisor = size_divisor
self.resample = resample
self.rescale_factor = rescale_factor
super().__init__(**kwargs)
def resize(
@ -142,6 +159,7 @@ class GLPNImageProcessor(BaseImageProcessor):
size_divisor: Optional[int] = None,
resample=None,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[float] = None,
return_tensors: Optional[Union[TensorType, str]] = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
@ -181,6 +199,7 @@ class GLPNImageProcessor(BaseImageProcessor):
"""
do_resize = do_resize if do_resize is not None else self.do_resize
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
size_divisor = size_divisor if size_divisor is not None else self.size_divisor
resample = resample if resample is not None else self.resample
@ -217,7 +236,9 @@ class GLPNImageProcessor(BaseImageProcessor):
]
if do_rescale:
images = [self.rescale(image, scale=1 / 255, input_data_format=input_data_format) for image in images]
images = [
self.rescale(image, scale=rescale_factor, input_data_format=input_data_format) for image in images
]
images = [
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images

View File

@ -0,0 +1,136 @@
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fast Image processor class for GLPN."""
from typing import Optional, Union
import torch
from torchvision.transforms.v2 import functional as F
from ...image_processing_utils import BatchFeature
from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
from ...image_utils import (
PILImageResampling,
SizeDict,
)
from ...utils import (
TensorType,
auto_docstring,
requires_backends,
)
from .image_processing_glpn import GLPNImageProcessorKwargs
@auto_docstring
class GLPNImageProcessorFast(BaseImageProcessorFast):
do_resize = True
do_rescale = True
rescale_factor = 1 / 255
resample = PILImageResampling.BILINEAR
size_divisor = 32
valid_kwargs = GLPNImageProcessorKwargs
def _validate_preprocess_kwargs(self, **kwargs):
# pop `do_resize` to not raise an error as `size` is not None
kwargs.pop("do_resize", None)
return super()._validate_preprocess_kwargs(**kwargs)
def resize(
self,
image: "torch.Tensor",
size_divisor: int,
interpolation: Optional["F.InterpolationMode"] = None,
antialias: bool = True,
**kwargs,
) -> "torch.Tensor":
"""
Resize an image by rounding its height and width down to the nearest multiple of `size_divisor`.
Args:
image (`torch.Tensor`):
Image to resize.
size_divisor (`int`):
The output height and width are rounded down to the closest multiple of this value.
interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
`InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`.
antialias (`bool`, *optional*, defaults to `True`):
Whether to use antialiasing.
Returns:
`torch.Tensor`: The resized image.
"""
height, width = image.shape[-2:]
# Rounds the height and width down to the closest multiple of size_divisor
new_h = height // size_divisor * size_divisor
new_w = width // size_divisor * size_divisor
return super().resize(
image, SizeDict(height=new_h, width=new_w), interpolation=interpolation, antialias=antialias
)
def _preprocess(
self,
images: list["torch.Tensor"],
do_resize: bool,
size_divisor: Optional[int] = None,
interpolation: Optional["F.InterpolationMode"] = None,
do_rescale: bool = True,
rescale_factor: Optional[float] = 1 / 255,
do_normalize: bool = False,
image_mean: Optional[Union[float, list[float]]] = None,
image_std: Optional[Union[float, list[float]]] = None,
disable_grouping: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
resample: Optional[PILImageResampling] = None,
**kwargs,
) -> BatchFeature:
grouped_images, grouped_index = group_images_by_shape(images, disable_grouping=disable_grouping)
processed_groups = {}
for shape, stacked_images in grouped_images.items():
if do_resize:
stacked_images = self.resize(stacked_images, size_divisor=size_divisor, interpolation=interpolation)
stacked_images = self.rescale_and_normalize(
stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
)
processed_groups[shape] = stacked_images
processed_images = reorder_images(processed_groups, grouped_index)
processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
def post_process_depth_estimation(self, outputs, target_sizes=None):
"""
Convert raw model outputs to final depth predictions.
Mirrors the slow GLPN processor: PyTorch interpolation with bicubic mode and `align_corners=False`.
"""
requires_backends(self, "torch")
predicted_depth = outputs.predicted_depth
results = []
target_sizes = target_sizes or [None] * predicted_depth.shape[0]
for depth, target_size in zip(predicted_depth, target_sizes):
if target_size is not None:
# Add batch and channel dimensions for interpolation
depth_4d = depth[None, None, ...]
resized = torch.nn.functional.interpolate(
depth_4d, size=target_size, mode="bicubic", align_corners=False
)
depth = resized.squeeze(0).squeeze(0)
results.append({"predicted_depth": depth})
return results
__all__ = ["GLPNImageProcessorFast"]

View File

@ -286,8 +286,8 @@ class Idefics3Processor(ProcessorMixin):
f"The number of images in the text {n_images_in_text} and images {n_images_in_images} should be the same."
)
image_rows = inputs.pop("rows", [[0] * len(text)])
image_cols = inputs.pop("cols", [[0] * len(text)])
image_rows = inputs.pop("rows", [[0] * n_images for n_images in n_images_in_text])
image_cols = inputs.pop("cols", [[0] * n_images for n_images in n_images_in_text])
fake_image_token = self.fake_image_token
image_token = self.image_token
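The old defaults allocated one entry per text sample instead of one per image; a tiny sketch of the corrected shape, using made-up counts.
n_images_in_text = [2, 1]                         # two images in the first sample, one in the second
image_rows = [[0] * n for n in n_images_in_text]  # [[0, 0], [0]] instead of [[0] * len(text)]
image_cols = [[0] * n for n in n_images_in_text]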

View File

@ -53,9 +53,9 @@ def load_cuda_kernels():
global mra_cuda_kernel
if not is_kernels_available():
raise ImportError("kernels is not installed, please install it with `pip install kernels`")
from ...integrations.hub_kernels import get_kernel_wrapper
from kernels import get_kernel
mra_cuda_kernel = get_kernel_wrapper("kernels-community/mra")
mra_cuda_kernel = get_kernel("kernels-community/mra")
def sparse_max(sparse_qk_prod, indices, query_num_block, key_num_block):

View File

@ -2109,6 +2109,7 @@ class MusicgenForConditionalGeneration(MusicgenPreTrainedModel, GenerationMixin)
stopping_criteria: Optional[StoppingCriteriaList] = None,
synced_gpus: Optional[bool] = None,
streamer: Optional["BaseStreamer"] = None,
use_model_defaults: Optional[bool] = None,
**kwargs,
):
"""
@ -2153,6 +2154,11 @@ class MusicgenForConditionalGeneration(MusicgenPreTrainedModel, GenerationMixin)
streamer (`BaseStreamer`, *optional*):
Streamer object that will be used to stream the generated sequences. Generated tokens are passed
through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
use_model_defaults (`bool`, *optional*):
When it is `True`, unset parameters in `generation_config` will be set to the model-specific default
generation configuration (`model.generation_config`), as opposed to the global defaults
(`GenerationConfig()`). If unset, models saved starting from `v4.50` will consider this flag to be
`True`.
kwargs (`dict[str, Any]`, *optional*):
Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
@ -2175,13 +2181,19 @@ class MusicgenForConditionalGeneration(MusicgenPreTrainedModel, GenerationMixin)
- [`~generation.GenerateBeamEncoderDecoderOutput`]
"""
# 1. Handle `generation_config` and kwargs that might update it, and validate the resulting objects
if generation_config is None:
generation_config = self.generation_config
generation_mode_kwargs = self._extract_generation_mode_kwargs(None, kwargs, False, None, None)
generation_config, model_kwargs = self._prepare_generation_config(
generation_config, use_model_defaults, **kwargs
)
generation_mode = generation_config.get_generation_mode()
if generation_mode not in [GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH]:
raise ValueError(
"Got incompatible mode for generation, should be one of greedy or sampling. "
"Ensure that beam search is de-activated by setting `num_beams=1`."
)
generation_config = copy.deepcopy(generation_config)
model_kwargs = generation_config.update(**kwargs) # All unused kwargs must be model kwargs
generation_config.validate()
self._validate_model_kwargs(model_kwargs.copy())
self._validate_generation_mode(generation_mode, generation_config, generation_mode_kwargs)
if model_kwargs.get("encoder_outputs") is not None and type(model_kwargs["encoder_outputs"]) is tuple:
# wrap the unconditional outputs as a BaseModelOutput for compatibility with the rest of generate
@ -2281,31 +2293,26 @@ class MusicgenForConditionalGeneration(MusicgenPreTrainedModel, GenerationMixin)
generation_config=generation_config, stopping_criteria=stopping_criteria
)
if generation_mode in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH):
# expand input_ids with `num_return_sequences` additional sequences per batch
input_ids, model_kwargs = self._expand_inputs_for_generation(
input_ids=input_ids,
expand_size=generation_config.num_return_sequences,
is_encoder_decoder=self.config.is_encoder_decoder,
**model_kwargs,
)
# expand input_ids with `num_return_sequences` additional sequences per batch
input_ids, model_kwargs = self._expand_inputs_for_generation(
input_ids=input_ids,
expand_size=generation_config.num_return_sequences,
is_encoder_decoder=self.config.is_encoder_decoder,
**model_kwargs,
)
# 11. run sample
outputs = self._sample(
input_ids,
logits_processor=logits_processor,
stopping_criteria=stopping_criteria,
generation_config=generation_config,
synced_gpus=synced_gpus,
streamer=streamer,
**model_kwargs,
)
# 10b. prepare prefill outputs
generation_mode_kwargs["prefill_outputs"] = self._prefill(input_ids, generation_config, model_kwargs)
else:
raise ValueError(
"Got incompatible mode for generation, should be one of greedy or sampling. "
"Ensure that beam search is de-activated by setting `num_beams=1`."
)
# 11. run sample
outputs = self._sample(
input_ids,
logits_processor=logits_processor,
stopping_criteria=stopping_criteria,
generation_config=generation_config,
**generation_mode_kwargs,
**model_kwargs,
)
if generation_config.return_dict_in_generate:
output_ids = outputs.sequences
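End-user behaviour is meant to stay the same; a hedged sketch of a call that still goes through the sample/greedy path (beam search is rejected up front), using the public MusicGen checkpoint.
from transformers import AutoProcessor, MusicgenForConditionalGeneration

processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
inputs = processor(text=["lo-fi beat with a soft piano"], return_tensors="pt")
audio_tokens = model.generate(**inputs, do_sample=True, max_new_tokens=64)  # num_beams > 1 would raise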

View File

@ -291,7 +291,7 @@ class Owlv2ImageProcessor(BaseImageProcessor):
image = pad(
image=image,
padding=((0, size - height), (0, size - width)),
constant_values=0.5,
constant_values=0.0,
data_format=data_format,
input_data_format=input_data_format,
)

View File

@ -228,7 +228,7 @@ class Owlv2ImageProcessorFast(BaseImageProcessorFast):
return results
def _pad_images(self, images: "torch.Tensor", constant_value: float = 0.5) -> "torch.Tensor":
def _pad_images(self, images: "torch.Tensor", constant_value: float = 0.0) -> "torch.Tensor":
"""
Pad an image with zeros to the given size.
"""
@ -245,7 +245,7 @@ class Owlv2ImageProcessorFast(BaseImageProcessorFast):
self,
images: list["torch.Tensor"],
disable_grouping: Optional[bool],
constant_value: float = 0.5,
constant_value: float = 0.0,
**kwargs,
) -> list["torch.Tensor"]:
"""
@ -351,7 +351,7 @@ class Owlv2ImageProcessorFast(BaseImageProcessorFast):
processed_images = reorder_images(processed_images_grouped, grouped_images_index)
if do_pad:
processed_images = self.pad(processed_images, constant_value=0.5, disable_grouping=disable_grouping)
processed_images = self.pad(processed_images, constant_value=0.0, disable_grouping=disable_grouping)
grouped_images, grouped_images_index = group_images_by_shape(
processed_images, disable_grouping=disable_grouping

View File

@ -52,7 +52,7 @@ class Owlv2ImageProcessorFast(OwlViTImageProcessorFast):
crop_size = None
do_center_crop = None
def _pad_images(self, images: "torch.Tensor", constant_value: float = 0.5) -> "torch.Tensor":
def _pad_images(self, images: "torch.Tensor", constant_value: float = 0.0) -> "torch.Tensor":
"""
Pad an image with zeros to the given size.
"""
@ -69,7 +69,7 @@ class Owlv2ImageProcessorFast(OwlViTImageProcessorFast):
self,
images: list["torch.Tensor"],
disable_grouping: Optional[bool],
constant_value: float = 0.5,
constant_value: float = 0.0,
**kwargs,
) -> list["torch.Tensor"]:
"""
@ -175,7 +175,7 @@ class Owlv2ImageProcessorFast(OwlViTImageProcessorFast):
processed_images = reorder_images(processed_images_grouped, grouped_images_index)
if do_pad:
processed_images = self.pad(processed_images, constant_value=0.5, disable_grouping=disable_grouping)
processed_images = self.pad(processed_images, constant_value=0.0, disable_grouping=disable_grouping)
grouped_images, grouped_images_index = group_images_by_shape(
processed_images, disable_grouping=disable_grouping

View File

@ -53,11 +53,18 @@ class Pix2StructImageProcessorKwargs(ImagesKwargs, total=False):
"""
max_patches (`int`, *optional*):
Maximum number of patches to extract.
patch_size (`dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
The patch size to use for the image. According to the Pix2Struct paper and code, the patch size is 16x16.
is_vqa (`bool`, *optional*, defaults to `False`):
Whether or not the image processor is for the VQA task. If `True` and `header_text` is passed in, text is
rendered onto the input images.
header_text (`Union[list[str], str]`, *optional*):
Text to render as a header. Only has an effect if `image_processor.is_vqa` is `True`.
"""
max_patches: int
patch_size: dict[str, int]
is_vqa: bool
header_text: Optional[Union[list[str], str]]

View File

@ -14,7 +14,6 @@
# limitations under the License.
"""RAG model implementation."""
import copy
from collections.abc import Callable
from dataclasses import dataclass
from typing import Optional, Union
@ -24,7 +23,8 @@ from torch import nn
from ...cache_utils import Cache, EncoderDecoderCache
from ...configuration_utils import PreTrainedConfig
from ...generation import GenerationConfig, GenerationMixin, LogitsProcessorList, StoppingCriteriaList
from ...generation import GenerationConfig, GenerationMixin, GenerationMode, LogitsProcessorList, StoppingCriteriaList
from ...generation.utils import GENERATION_MODES_MAPPING
from ...modeling_outputs import ModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, logging
@ -1403,6 +1403,7 @@ class RagTokenForGeneration(RagPreTrainedModel, GenerationMixin):
prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], list[int]]] = None,
logits_processor: Optional[LogitsProcessorList] = LogitsProcessorList(),
stopping_criteria: Optional[StoppingCriteriaList] = StoppingCriteriaList(),
use_model_defaults: Optional[bool] = None,
**kwargs,
) -> torch.LongTensor:
"""
@ -1461,6 +1462,11 @@ class RagTokenForGeneration(RagPreTrainedModel, GenerationMixin):
Custom stopping criteria that complement the default stopping criteria built from arguments and a
model's config. If a stopping criteria is passed that is already created with the arguments or a
model's config an error is thrown.
use_model_defaults (`bool`, *optional*):
When it is `True`, unset parameters in `generation_config` will be set to the model-specific default
generation configuration (`model.generation_config`), as opposed to the global defaults
(`GenerationConfig()`). If unset, models saved starting from `v4.50` will consider this flag to be
`True`.
kwargs (`dict[str, Any]`, *optional*):
Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
forwarded to the `forward` function of the model.
@ -1471,10 +1477,24 @@ class RagTokenForGeneration(RagPreTrainedModel, GenerationMixin):
finished early due to the `eos_token_id`.
"""
# Handle `generation_config` and kwargs that might update it
if generation_config is None:
generation_config = self.generation_config
generation_config = copy.deepcopy(generation_config)
model_kwargs = generation_config.update(**kwargs) # All unused kwargs must be model kwargs
generation_mode_kwargs = self._extract_generation_mode_kwargs(None, kwargs, False, None, None)
generation_config, model_kwargs = self._prepare_generation_config(
generation_config, use_model_defaults, **kwargs
)
generation_mode = generation_config.get_generation_mode()
if generation_mode not in [
GenerationMode.SAMPLE,
GenerationMode.GREEDY_SEARCH,
GenerationMode.BEAM_SEARCH,
GenerationMode.BEAM_SAMPLE,
]:
raise ValueError(
f"RAG model is not compatible with {generation_mode} generation. Please check your generation parameters."
)
# type() required to access the unbound class-level method
decoding_method = getattr(type(self), GENERATION_MODES_MAPPING[generation_mode])
self._validate_model_kwargs(model_kwargs.copy())
self._validate_generation_mode(generation_mode, generation_config, generation_mode_kwargs)
kwargs_has_attention_mask = model_kwargs.get("attention_mask", None) is not None
self._prepare_special_tokens(generation_config, kwargs_has_attention_mask)
@ -1550,7 +1570,7 @@ class RagTokenForGeneration(RagPreTrainedModel, GenerationMixin):
model_kwargs["attention_mask"] = context_attention_mask
model_kwargs["n_docs"] = n_docs
pre_processor = self._get_logits_processor(
prepared_logits_processor = self._get_logits_processor(
generation_config=generation_config,
input_ids_seq_length=input_ids_seq_length,
encoder_input_ids=context_input_ids,
@ -1571,37 +1591,18 @@ class RagTokenForGeneration(RagPreTrainedModel, GenerationMixin):
max_cache_length=generation_config.max_length - 1,
)
if generation_config.num_beams == 1:
if generation_config.num_return_sequences > 1:
raise ValueError(
f"num_return_sequences has to be 1, but is {generation_config.num_return_sequences} when doing"
" greedy search."
)
return self._sample(
input_ids,
logits_processor=pre_processor,
stopping_criteria=prepared_stopping_criteria,
generation_config=generation_config,
synced_gpus=False,
streamer=None,
**model_kwargs,
)
elif generation_config.num_beams > 1:
if generation_config.num_return_sequences > generation_config.num_beams:
raise ValueError("`num_return_sequences` has to be smaller or equal to `num_beams`.")
# Prefill pass
generation_mode_kwargs["prefill_outputs"] = self._prefill(input_ids, generation_config, model_kwargs)
return self._beam_search(
input_ids,
logits_processor=pre_processor,
stopping_criteria=prepared_stopping_criteria,
generation_config=generation_config,
synced_gpus=False,
**model_kwargs,
)
else:
raise ValueError(
f"`num_beams` has to be an integer strictly superior to 0 (≥ 1), but is {generation_config.num_beams}"
)
return decoding_method(
self,
input_ids,
logits_processor=prepared_logits_processor,
stopping_criteria=prepared_stopping_criteria,
generation_config=generation_config,
**generation_mode_kwargs,
**model_kwargs,
)
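A hedged sketch of the dispatch idea: the generation mode computed from the config indexes `GENERATION_MODES_MAPPING` to pick the unbound decoding method (the exact mapped names are assumptions).
from transformers import GenerationConfig
from transformers.generation.utils import GENERATION_MODES_MAPPING

config = GenerationConfig(num_beams=4, do_sample=False)
mode = config.get_generation_mode()
print(mode, "->", GENERATION_MODES_MAPPING[mode])  # e.g. beam_search -> a "_beam_search"-style method name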
# Auxiliary functions for beam search
def _temporary_reorder_cache(self, past_key_values, beam_idx):

View File

@ -48,9 +48,9 @@ def load_wkv_cuda_kernel(context_length):
if not is_kernels_available():
raise ImportError("kernels is not installed, please install it with `pip install kernels`")
from ...integrations.hub_kernels import get_kernel_wrapper
from kernels import get_kernel
rwkv_cuda_kernel = get_kernel_wrapper("kernels-community/rwkv")
rwkv_cuda_kernel = get_kernel("kernels-community/rwkv")
rwkv_cuda_kernel.max_seq_length = context_length

View File

@ -172,8 +172,6 @@ class SmolVLMProcessor(ProcessorMixin):
def expand_text_with_image_tokens(self, text, image_rows, image_cols):
prompt_strings = []
image_rows = image_rows if image_rows is not None else [[0] * len(text)]
image_cols = image_cols if image_cols is not None else [[0] * len(text)]
for sample, sample_rows, sample_cols in zip(text, image_rows, image_cols):
# Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len`
image_prompt_strings = []
@ -330,6 +328,11 @@ class SmolVLMProcessor(ProcessorMixin):
raise ValueError(
f"The number of images in the text {n_images_in_text} and images {n_images_in_images} should be the same."
)
# Set default values for image_rows and image_cols if not provided
if image_rows is None:
image_rows = [[0] * n_images for n_images in n_images_in_text]
if image_cols is None:
image_cols = [[0] * n_images for n_images in n_images_in_text]
text = self.expand_text_with_image_tokens(text, image_rows=image_rows, image_cols=image_cols)
elif videos is not None:

View File

@ -53,9 +53,9 @@ def load_cuda_kernels():
global lsh_cumulation
if not is_kernels_available():
raise ImportError("kernels is not installed, please install it with `pip install kernels`")
from ...integrations.hub_kernels import get_kernel_wrapper
from kernels import get_kernel
yoso = get_kernel_wrapper("kernels-community/yoso")
yoso = get_kernel("kernels-community/yoso")
lsh_cumulation = yoso.lsh_cumulation

View File

@ -59,6 +59,7 @@ from .base import (
get_default_model_and_revision,
load_model,
)
from .deprecated import SummarizationPipeline, Text2TextGenerationPipeline, TranslationPipeline
from .depth_estimation import DepthEstimationPipeline
from .document_question_answering import DocumentQuestionAnsweringPipeline
from .feature_extraction import FeatureExtractionPipeline
@ -74,7 +75,6 @@ from .mask_generation import MaskGenerationPipeline
from .object_detection import ObjectDetectionPipeline
from .question_answering import QuestionAnsweringArgumentHandler, QuestionAnsweringPipeline
from .table_question_answering import TableQuestionAnsweringArgumentHandler, TableQuestionAnsweringPipeline
from .text2text_generation import SummarizationPipeline, Text2TextGenerationPipeline, TranslationPipeline
from .text_classification import TextClassificationPipeline
from .text_generation import TextGenerationPipeline
from .text_to_audio import TextToAudioPipeline

View File

@ -0,0 +1,16 @@
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .text2text_generation import SummarizationPipeline, Text2TextGenerationPipeline, TranslationPipeline
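Since the pipelines package `__init__` re-imports from `.deprecated`, both import paths should resolve to the same classes; a small sketch of that assumption.
from transformers.pipelines import SummarizationPipeline
from transformers.pipelines.deprecated import SummarizationPipeline as DeprecatedSummarizationPipeline

assert SummarizationPipeline is DeprecatedSummarizationPipeline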

View File

@ -2,14 +2,14 @@ import enum
import warnings
from typing import Any
from ..generation import GenerationConfig
from ..tokenization_utils import TruncationStrategy
from ..utils import add_end_docstrings, is_torch_available, logging
from .base import Pipeline, build_pipeline_init_args
from ...generation import GenerationConfig
from ...tokenization_utils import TruncationStrategy
from ...utils import add_end_docstrings, is_torch_available, logging
from ..base import Pipeline, build_pipeline_init_args
if is_torch_available():
from ..models.auto.modeling_auto import MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
from ...models.auto.modeling_auto import MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
logger = logging.get_logger(__name__)
@ -77,6 +77,12 @@ class Text2TextGenerationPipeline(Pipeline):
return_name = "generated"
def __init__(self, *args, **kwargs):
if self.return_name == "generated": # Check this isn't summarization/translation instead
logger.warning_once(
"The `Text2TextGenerationPipeline` is deprecated and no longer maintained. For most "
"purposes, we recommend using newer models with causal pipelines like "
"`TextGenerationPipeline` or `ImageTextToTextPipeline`."
)
super().__init__(*args, **kwargs)
self.check_model_type(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES)
@ -254,6 +260,14 @@ class SummarizationPipeline(Text2TextGenerationPipeline):
# Used in the return key of the pipeline.
return_name = "summary"
def __init__(self, *args, **kwargs):
logger.warning_once(
"The `SummarizationPipeline` is deprecated and no longer maintained. For most "
"summarization tasks, we recommend appropriately prompting modern general-purpose LLMs "
"via pipelines like `TextGenerationPipeline` or `ImageTextToTextPipeline`."
)
super().__init__(*args, **kwargs)
def __call__(self, *args, **kwargs):
r"""
Summarize the text(s) given as inputs.
@ -323,6 +337,14 @@ class TranslationPipeline(Text2TextGenerationPipeline):
# Used in the return key of the pipeline.
return_name = "translation"
def __init__(self, *args, **kwargs):
logger.warning_once(
"The `TranslationPipeline` is deprecated and no longer maintained. For most "
"translation tasks, we recommend appropriately prompting modern general-purpose LLMs "
"via pipelines like `TextGenerationPipeline` or `ImageTextToTextPipeline`."
)
super().__init__(*args, **kwargs)
def check_inputs(self, input_length: int, min_length: int, max_new_tokens: int):
"""
Removed input length check - unnecessary with max_new_tokens (previously relevant for max_length)
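A hedged sketch of the replacement the warnings point to: prompting a general-purpose chat model through `TextGenerationPipeline`. The model name is a placeholder.
from transformers import pipeline

generator = pipeline("text-generation", model="Qwen/Qwen2.5-0.5B-Instruct")  # placeholder model
messages = [{"role": "user", "content": "Summarize: Transformers provides thousands of pretrained models."}]
print(generator(messages, max_new_tokens=64)[0]["generated_text"])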

View File

@ -219,6 +219,9 @@ class ImagesKwargs(TypedDict, total=False):
- `'np'`: Return NumPy `np.ndarray` objects.
disable_grouping (`bool`, *optional*):
Whether to group images by shapes when processing or not, only relevant for fast image processing.
image_seq_length (`int`, *optional*):
The number of image tokens to be used for each image in the input.
Added for backward compatibility, but this should be set as a processor attribute in future models.
"""
do_convert_rgb: Optional[bool]
@ -239,6 +242,7 @@ class ImagesKwargs(TypedDict, total=False):
device: Annotated[Optional[str], device_validator()]
return_tensors: Annotated[Optional[Union[str, TensorType]], tensor_type_validator()]
disable_grouping: Optional[bool]
image_seq_length: Optional[int]
class VideosKwargs(TypedDict, total=False):
@ -1366,8 +1370,8 @@ class ProcessorMixin(PushToHubMixin):
if token is not None:
kwargs["token"] = token
processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
return cls.from_args_and_dict(args, processor_dict, **kwargs)
@classmethod

View File

@ -20,7 +20,7 @@ from .quantizers_utils import get_module_from_name
if TYPE_CHECKING:
from ..modeling_utils import PreTrainedModel
from ..utils import is_fp_quant_available, is_qutlass_available, is_torch_available, logging
from ..utils import is_fp_quant_available, is_qutlass_available, is_torch_available, is_torch_xpu_available, logging
from ..utils.quantization_config import QuantizationConfigMixin
@ -45,9 +45,9 @@ class FPQuantHfQuantizer(HfQuantizer):
self.quantization_config = quantization_config
def validate_environment(self, device_map, **kwargs):
if not torch.cuda.is_available():
if not torch.cuda.is_available() and not is_torch_xpu_available():
raise NotImplementedError(
"FPQuant quantization is only supported on GPU. Please use a different quantizer."
"FPQuant quantization is only supported on GPU or Intel XPU. Please use a different quantizer."
)
if not is_qutlass_available() and not self.quantization_config.pseudoquantization:

View File

@ -55,9 +55,9 @@ class Mxfp4HfQuantizer(HfQuantizer):
"""Lazy import and initialize kernels only when needed"""
if self.triton_kernels_hub is None:
try:
from ..integrations.hub_kernels import get_kernel_wrapper
from kernels import get_kernel
self.triton_kernels_hub = get_kernel_wrapper("kernels-community/triton_kernels")
self.triton_kernels_hub = get_kernel("kernels-community/triton_kernels")
except ImportError:
raise ImportError("kernels package is required for MXFP4 quantization")
return self.triton_kernels_hub

View File

@@ -885,6 +885,12 @@ class TrainingArguments:
)
},
)
logging_dir: Optional[str] = field(
default=None,
metadata={
"help": "Deprecated and will be removed in v5.2. Set env var `TENSORBOARD_LOGGING_DIR` instead. TensorBoard log directory."
},
)
logging_strategy: Union[IntervalStrategy, str] = field(
default="steps",
metadata={"help": "The logging strategy to use."},
@@ -1695,6 +1701,11 @@ class TrainingArguments:
if isinstance(self.include_num_input_tokens_seen, bool):
self.include_num_input_tokens_seen = "all" if self.include_num_input_tokens_seen else "no"
if self.logging_dir is not None:
logger.warning(
"`logging_dir` is deprecated and will be removed in v5.2. Please set `TENSORBOARD_LOGGING_DIR` instead."
)
def __str__(self):
self_as_dict = asdict(self)
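As the new warning advises, the TensorBoard directory moves from the `logging_dir` argument to an environment variable. A minimal sketch of the replacement; the paths are illustrative:

# Hedged sketch: set TENSORBOARD_LOGGING_DIR instead of passing logging_dir.
import os

os.environ["TENSORBOARD_LOGGING_DIR"] = "runs/my_experiment"  # illustrative path

from transformers import TrainingArguments

args = TrainingArguments(output_dir="out", report_to="tensorboard")  # no logging_dir argument needed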

View File

@@ -87,6 +87,7 @@ VPTQ_MIN_VERSION = "0.0.4"
TORCHAO_MIN_VERSION = "0.4.0"
AUTOROUND_MIN_VERSION = "0.5.0"
TRITON_MIN_VERSION = "1.0.0"
KERNELS_MIN_VERSION = "0.9.0"
@lru_cache
@@ -513,8 +514,9 @@ def is_kenlm_available() -> bool:
@lru_cache
def is_kernels_available() -> bool:
return _is_package_available("kernels")
def is_kernels_available(MIN_VERSION: str = KERNELS_MIN_VERSION) -> bool:
is_available, kernels_version = _is_package_available("kernels", return_version=True)
return is_available and version.parse(kernels_version) >= version.parse(MIN_VERSION)
@lru_cache
@@ -971,13 +973,13 @@ def is_quark_available() -> bool:
@lru_cache
def is_fp_quant_available():
is_available, fp_quant_version = _is_package_available("fp_quant", return_version=True)
return is_available and version.parse(fp_quant_version) >= version.parse("0.2.0")
return is_available and version.parse(fp_quant_version) >= version.parse("0.3.2")
@lru_cache
def is_qutlass_available():
is_available, qutlass_version = _is_package_available("qutlass", return_version=True)
return is_available and version.parse(qutlass_version) >= version.parse("0.1.0")
return is_available and version.parse(qutlass_version) >= version.parse("0.2.0")
@lru_cache
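A hedged usage sketch of the now version-gated availability check; the import path follows the `..utils` re-exports used elsewhere in this diff:

# Hedged sketch: is_kernels_available() is now True only when `kernels` is
# installed at or above the minimum version (KERNELS_MIN_VERSION = "0.9.0" above);
# a stricter floor can be passed positionally.
from transformers.utils import is_kernels_available

print(is_kernels_available())          # gated on KERNELS_MIN_VERSION
print(is_kernels_available("0.10.0"))  # hypothetical stricter requirement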

View File

@@ -12,12 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from ..utils import PushToHubMixin, is_kernels_available, is_torch_available
from ..utils import PushToHubMixin, is_torch_available
if is_kernels_available():
from kernels import LayerRepository, Mode
if is_torch_available():
import torch
@@ -58,6 +55,8 @@ def infer_device(model):
def add_to_mapping(layer_name, device, repo_name, mode, compatible_mapping):
from kernels import LayerRepository
if device not in ["cuda", "rocm", "xpu"]:
raise ValueError(f"Only cuda, rocm, and xpu devices supported, got: {device}")
repo_layer_name = repo_name.split(":")[1]
@@ -82,6 +81,8 @@ class KernelConfig(PushToHubMixin):
self.registered_layer_names = {}
def update_kernel(self, repo_id, registered_name, layer_name, device, mode, revision=None):
from kernels import LayerRepository
self.kernel_mapping[registered_name] = {
device: {
mode: LayerRepository(
@@ -204,6 +205,8 @@ class KernelConfig(PushToHubMixin):
The device is inferred from the model's parameters if not provided.
The Mode is inferred from the model's training state.
"""
from kernels import Mode
compatible_mapping = {}
for layer_name, kernel in self.kernel_mapping.items():
# Infer Mode: use Mode.TRAINING if model is training, else use Mode.INFERENCE

View File

@@ -1601,8 +1601,12 @@ class FPQuantConfig(QuantizationConfigMixin):
else:
raise ValueError("Only 'mxfp4' and 'nvfp4' are supported for forward_dtype for now.")
if self.backward_dtype != "bf16":
raise ValueError("Only 'bf16' is supported for backward_dtype for now.")
if self.backward_dtype not in ["bf16", "mxfp8", "mxfp4"]:
raise ValueError("Only 'bf16', 'mxfp8' and 'mxfp4' are supported for backward_dtype for now.")
if self.backward_dtype != "bf16" and self.forward_dtype != "mxfp4":
raise ValueError("Only 'mxfp4' forward is compatible with non-bf16 backwards for now.")
if self.transform_init not in ["hadamard", "identity", "gsr"]:
raise ValueError("Only 'hadamard', 'identity' and 'gsr' are supported for transform_init.")

View File

@@ -0,0 +1,224 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import sys
from pathlib import Path
from transformers import is_torch_available
from transformers.testing_utils import (
TestCasePlus,
execute_subprocess_async,
require_accelerate,
require_torch_multi_accelerator,
run_first,
slow,
)
if is_torch_available():
import torch
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
DataCollatorForLanguageModeling,
HfArgumentParser,
Trainer,
TrainingArguments,
)
class TestContextParallel(TestCasePlus):
"""Test Trainer with Torch context parallelism enabled via accelerate's ParallelismConfig."""
@require_torch_multi_accelerator
@require_accelerate
@slow
@run_first
def test_cp_equivalence(self):
"""Test that CP produces the same losses as without CP."""
# Shared setup
world_size = 2
script_path = __file__
# Step 1: Run with CP enabled (cp_size=world_size)
cp_yes_output_dir = Path(self.get_auto_remove_tmp_dir()).resolve()
cp_yes_config_path = cp_yes_output_dir / "context_parallel_config.yaml"
cp_yes_losses_path = cp_yes_output_dir / "cp_yes_losses.json"
# Write config file inline (self-contained test)
with open(cp_yes_config_path, "w") as f:
f.write(
f"""distributed_type: FSDP
fsdp_config:
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
fsdp_state_dict_type: SHARDED_STATE_DICT
fsdp_version: 2
mixed_precision: bf16
num_processes: {world_size}
parallelism_config:
parallelism_config_dp_replicate_size: 1
parallelism_config_dp_shard_size: 1
parallelism_config_tp_size: 1
parallelism_config_cp_size: {world_size}
parallelism_config_cp_comm_strategy: alltoall
"""
)
cmd_cp_yes = f"""
accelerate launch
--config_file {cp_yes_config_path}
{script_path}
--output_dir {cp_yes_output_dir}
--report_to none
--max_steps 10
--per_device_train_batch_size 1
--gradient_accumulation_steps 1
--logging_steps 1
--remove_unused_columns False
--seed 42
--loss_output_file {cp_yes_losses_path}
""".split()
execute_subprocess_async(cmd_cp_yes, env=self.get_env())
# Step 2: Run without CP (FSDP with num_processes=1, no parallelism_config)
cp_no_output_dir = Path(self.get_auto_remove_tmp_dir()).resolve()
cp_no_config_path = cp_no_output_dir / "context_parallel_config.yaml"
cp_no_losses_path = cp_no_output_dir / "cp_no_losses.json"
# Write config file inline (self-contained test)
with open(cp_no_config_path, "w") as f:
f.write(
"""distributed_type: FSDP
fsdp_config:
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
fsdp_state_dict_type: SHARDED_STATE_DICT
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
fsdp_version: 2
mixed_precision: bf16
num_processes: 1
"""
)
cmd_cp_no = f"""
accelerate launch
--config_file {cp_no_config_path}
{script_path}
--output_dir {cp_no_output_dir}
--report_to none
--max_steps 10
--per_device_train_batch_size 1
--gradient_accumulation_steps 1
--logging_steps 1
--remove_unused_columns False
--seed 42
--loss_output_file {cp_no_losses_path}
""".split()
execute_subprocess_async(cmd_cp_no, env=self.get_env())
# Compare losses - should be very close since CP just splits sequence computation
with open(cp_yes_losses_path) as f:
cp_yes_losses = json.load(f)
with open(cp_no_losses_path) as f:
cp_no_losses = json.load(f)
assert len(cp_yes_losses) == len(cp_no_losses), (
f"Different number of losses: CP has {len(cp_yes_losses)}, no-CP has {len(cp_no_losses)}"
)
# CP should produce very similar results (small numerical differences expected)
# The differences come from:
# - Different gradient reduction patterns in distributed training
# - BF16 mixed precision accumulated differences
# - Sequence splitting and gathering in CP mode
cp_yes_losses_tensor = torch.tensor(cp_yes_losses)
cp_no_losses_tensor = torch.tensor(cp_no_losses)
# Use torch.testing.assert_close with rtol=2% and atol=0.02
# Testing shows actual differences are typically <1.5%
torch.testing.assert_close(
cp_yes_losses_tensor,
cp_no_losses_tensor,
rtol=2e-2, # 2% relative tolerance
atol=2e-2, # 0.02 absolute tolerance
msg=f"CP losses {cp_yes_losses} do not match non-CP losses {cp_no_losses}",
)
if __name__ == "__main__":
# Parse custom arguments (not TrainingArguments parameters)
loss_output_file = None
if "--loss_output_file" in sys.argv:
idx = sys.argv.index("--loss_output_file")
loss_output_file = sys.argv[idx + 1]
sys.argv.pop(idx)
sys.argv.pop(idx)
parser = HfArgumentParser((TrainingArguments,))
training_args = parser.parse_args_into_dataclasses()[0]
# Use SmolLM (small Llama-based model that works with CP)
model_name = "HuggingFaceTB/SmolLM-135M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
model_name,
attn_implementation="sdpa", # CP requires SDPA
)
# Create simple dataset: just tokenize some text
texts = [
"The quick brown fox jumps over the lazy dog. " * 10,
"Hello world, this is a test sentence for training. " * 10,
] * 4 # 8 samples total
def tokenize_function(examples):
return tokenizer(examples, max_length=128, truncation=True, padding="max_length")
train_dataset = [tokenize_function(text) for text in texts]
# Use standard DataCollatorForLanguageModeling for causal LM
# pad_to_multiple_of=4 ensures sequences are divisible by cp_size * 2 (for cp_size=2)
# Trainer will automatically generate position_ids and shift_labels as needed
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer,
mlm=False, # Causal language modeling
pad_to_multiple_of=4,
)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
data_collator=data_collator,
)
# Train for a few steps
trainer.train()
# Verify training completed
assert trainer.state.global_step > 0, "Training should have completed at least one step"
# Save losses to file if requested (for equivalence testing)
if loss_output_file and training_args.process_index == 0:
losses = [log["loss"] for log in trainer.state.log_history if "loss" in log]
with open(loss_output_file, "w") as f:
json.dump(losses, f)
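The inline comment in the test ties `pad_to_multiple_of` to the context-parallel degree. A minimal sketch of that relationship; generalizing beyond cp_size=2 is an assumption based on that comment:

# Assumption, generalizing the comment above: context parallelism needs sequence
# lengths divisible by cp_size * 2, hence pad_to_multiple_of=4 for cp_size=2.
def pad_multiple_for_cp(cp_size: int) -> int:
    return cp_size * 2

assert pad_multiple_for_cp(2) == 4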

View File

@@ -350,9 +350,9 @@ class ContinuousBatchingTest(unittest.TestCase):
messages = [{"content": "What is the Transformers library known for?", "role": "user"}]
inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to(
model.device
)[0]
inputs = tokenizer.apply_chat_template(
messages, return_tensors="pt", add_generation_prompt=True, return_dict=False
).to(model.device)[0]
request_id = manager.add_request(inputs, max_new_tokens=max_new_tokens, streaming=True)
@@ -382,9 +382,9 @@ class ContinuousBatchingTest(unittest.TestCase):
messages = [{"content": "What is the Transformers library known for?", "role": "user"}]
inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to(
model.device
)[0]
inputs = tokenizer.apply_chat_template(
messages, return_tensors="pt", add_generation_prompt=True, return_dict=False
).to(model.device)[0]
request_id = manager.add_request(inputs, max_new_tokens=max_new_tokens, streaming=False)
@@ -409,9 +409,9 @@ class ContinuousBatchingTest(unittest.TestCase):
messages = [{"content": "What is the Transformers library known for?", "role": "user"}]
inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to(
model.device
)[0]
inputs = tokenizer.apply_chat_template(
messages, return_tensors="pt", add_generation_prompt=True, return_dict=False
).to(model.device)[0]
# Non-streaming request
request_id = manager.add_request(inputs, max_new_tokens=max_new_tokens, streaming=False)

View File

@@ -88,6 +88,15 @@ class AutoFeatureExtractorTest(unittest.TestCase):
self.assertIsInstance(processor, Wav2Vec2Processor)
def test_processor_from_local_subfolder_from_repo(self):
with tempfile.TemporaryDirectory() as tmpdirname:
processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
processor.save_pretrained(f"{tmpdirname}/processor_subfolder")
processor = Wav2Vec2Processor.from_pretrained(tmpdirname, subfolder="processor_subfolder")
self.assertIsInstance(processor, Wav2Vec2Processor)
def test_processor_from_local_directory_from_extractor_config(self):
with tempfile.TemporaryDirectory() as tmpdirname:
# copy relevant files

View File

@@ -335,12 +335,61 @@ class ColQwen2ModelIntegrationTest(unittest.TestCase):
[15.6562, 12.2656, 20.2969],
],
("cuda", 8): [
[15.0703, 8.7422, 15.0312],
[9.5078, 16.8906, 10.6250],
[15.6484, 12.3984, 20.4688],
[16.2812, 8.3672, 14.5703],
[9.4922, 17.1875, 10.3281],
[15.0312, 11.3984, 20.1719],
],
}
)
expected_scores = torch.tensor(expectations.get_expectation(), dtype=scores.dtype)
assert torch.allclose(scores, expected_scores, atol=1e-3), f"Expected scores {expected_scores}, got {scores}"
@slow
def test_model_integration_test_2(self):
"""
Test if the model is able to retrieve the correct pages for a small and easy dataset.
This test uses a ColQwen2.5 checkpoint that is compatible with the ColQwen2 architecture.
"""
model = ColQwen2ForRetrieval.from_pretrained(
"Sahil-Kabir/colqwen2.5-v0.2-hf",
device_map=torch_device,
dtype=torch.bfloat16,
).eval()
processor = ColQwen2Processor.from_pretrained("Sahil-Kabir/colqwen2.5-v0.2-hf", trust_remote_code=True)
# Load the test dataset
ds = load_dataset("hf-internal-testing/document-visual-retrieval-test", split="test")
# Preprocess the examples
batch_images = processor(images=list(ds["image"])).to(torch_device)
batch_queries = processor(text=list(ds["query"])).to(torch_device)
with torch.inference_mode():
image_embeddings = model(**batch_images).embeddings
query_embeddings = model(**batch_queries).embeddings
# Compute retrieval scores
scores = processor.score_retrieval(
query_embeddings=query_embeddings,
passage_embeddings=image_embeddings,
)
assert scores.ndim == 2, f"Expected 2D tensor, got {scores.ndim}"
assert scores.shape == (len(ds), len(ds)), f"Expected shape {(len(ds), len(ds))}, got {scores.shape}"
# Check if the maximum scores per row are in the diagonal of the matrix score
self.assertTrue((scores.argmax(axis=1) == torch.arange(len(ds), device=scores.device)).all())
# Further validation: fine-grained check, with a hardcoded score from the original Hf implementation.
expectations = Expectations(
{
("cuda", 8): [
[16.3750, 10.9375, 14.7500],
[11.3750, 16.8750, 12.0625],
[15.3125, 13.1250, 21.5000],
]
}
)
expected_scores = torch.tensor(expectations.get_expectation(), dtype=scores.dtype)
assert torch.allclose(scores, expected_scores, atol=0.15), f"Expected scores {expected_scores}, got {scores}"

View File

@@ -1,63 +1,466 @@
import io
import unittest
import httpx
import numpy as np
import pytest
from packaging import version
from transformers import is_torch_available, is_vision_available
from transformers.image_utils import SizeDict
from transformers.testing_utils import (
require_torch,
require_torch_accelerator,
require_torchvision,
require_vision,
slow,
torch_device,
)
from transformers.utils import is_torch_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin
if is_torch_available() and is_vision_available():
import torch
from transformers import FuyuImageProcessor
from transformers import FuyuImageProcessor, FuyuImageProcessorFast
if is_vision_available():
from PIL import Image
class FuyuImageProcessingTester:
def __init__(
self,
parent,
batch_size=3,
num_channels=3,
image_size=18,
min_resolution=30,
max_resolution=400,
do_resize=True,
size=None,
do_pad=True,
do_normalize=True,
image_mean=[0.5, 0.5, 0.5],
image_std=[0.5, 0.5, 0.5],
do_rescale=True,
rescale_factor=1 / 255,
patch_size=None,
):
size = size if size is not None else {"height": 180, "width": 360}
patch_size = patch_size if patch_size is not None else {"height": 30, "width": 30}
self.parent = parent
self.batch_size = batch_size
self.num_channels = num_channels
self.image_size = image_size
self.min_resolution = 30
self.max_resolution = 360
self.do_resize = do_resize
self.size = size
self.do_pad = do_pad
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.patch_size = patch_size
def prepare_image_processor_dict(self):
return {
"do_resize": self.do_resize,
"size": self.size,
"do_pad": self.do_pad,
"do_normalize": self.do_normalize,
"image_mean": self.image_mean,
"image_std": self.image_std,
"do_rescale": self.do_rescale,
"rescale_factor": self.rescale_factor,
"patch_size": self.patch_size,
}
def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
"""Prepares a batch of images for testing"""
if equal_resolution:
image_inputs = [
np.random.randint(
0, 256, (self.num_channels, self.max_resolution, self.max_resolution), dtype=np.uint8
)
for _ in range(self.batch_size)
]
else:
heights = [
h - (h % 30) for h in np.random.randint(self.min_resolution, self.max_resolution, self.batch_size)
]
widths = [
w - (w % 30) for w in np.random.randint(self.min_resolution, self.max_resolution, self.batch_size)
]
image_inputs = [
np.random.randint(0, 256, (self.num_channels, height, width), dtype=np.uint8)
for height, width in zip(heights, widths)
]
if not numpify and not torchify:
image_inputs = [Image.fromarray(np.moveaxis(img, 0, -1)) for img in image_inputs]
if torchify:
image_inputs = [torch.from_numpy(img) for img in image_inputs]
return image_inputs
def expected_output_image_shape(self, images):
return self.num_channels, self.size["height"], self.size["width"]
@require_torch
@require_vision
@require_torchvision
class TestFuyuImageProcessor(unittest.TestCase):
class FuyuImageProcessorTest(ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = FuyuImageProcessor
fast_image_processing_class = FuyuImageProcessorFast
# Skip tests that expect pixel_values output
test_cast_dtype = None
def setUp(self):
self.size = {"height": 160, "width": 320}
self.processor = FuyuImageProcessor(size=self.size, padding_value=1.0)
self.batch_size = 3
self.channels = 3
self.height = 300
self.width = 300
self.image_processor_tester = FuyuImageProcessingTester(self)
self.image_processor_dict = self.image_processor_tester.prepare_image_processor_dict()
self.image_input = torch.rand(self.batch_size, self.channels, self.height, self.width)
# Initialize image_processor_list (from ImageProcessingTestMixin)
image_processor_list = []
if self.test_slow_image_processor and self.image_processing_class:
image_processor_list.append(self.image_processing_class)
if self.test_fast_image_processor and self.fast_image_processing_class:
image_processor_list.append(self.fast_image_processing_class)
self.image_processor_list = image_processor_list
self.image_patch_dim_h = 30
self.image_patch_dim_w = 30
self.sample_image = np.zeros((450, 210, 3), dtype=np.uint8)
self.sample_image_pil = Image.fromarray(self.sample_image)
def test_call_pil(self):
"""Override to handle Fuyu's custom output structure"""
for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False)
for image in image_inputs:
self.assertIsInstance(image, Image.Image)
def test_patches(self):
expected_num_patches = self.processor.get_num_patches(image_height=self.height, image_width=self.width)
encoded_images = image_processing(image_inputs[0], return_tensors="pt")
self.assertIn("images", encoded_images)
self.assertEqual(len(encoded_images.images), 1)
patches_final = self.processor.patchify_image(image=self.image_input)
assert patches_final.shape[1] == expected_num_patches, (
f"Expected {expected_num_patches} patches, got {patches_final.shape[1]}."
encoded_images = image_processing(image_inputs, return_tensors="pt")
self.assertIn("images", encoded_images)
self.assertEqual(len(encoded_images.images), self.image_processor_tester.batch_size)
def test_call_numpy(self):
"""Override to handle Fuyu's custom output structure"""
for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
for image in image_inputs:
self.assertIsInstance(image, np.ndarray)
encoded_images = image_processing(image_inputs[0], return_tensors="pt")
self.assertIn("images", encoded_images)
self.assertEqual(len(encoded_images.images), 1)
encoded_images = image_processing(image_inputs, return_tensors="pt")
self.assertIn("images", encoded_images)
self.assertEqual(len(encoded_images.images), self.image_processor_tester.batch_size)
def test_call_pytorch(self):
"""Override to handle Fuyu's custom output structure"""
for image_processing_class in self.image_processor_list:
image_processing = image_processing_class(**self.image_processor_dict)
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
for image in image_inputs:
self.assertIsInstance(image, torch.Tensor)
encoded_images = image_processing(image_inputs[0], return_tensors="pt")
self.assertIn("images", encoded_images)
self.assertEqual(len(encoded_images.images), 1)
encoded_images = image_processing(image_inputs, return_tensors="pt")
self.assertIn("images", encoded_images)
self.assertEqual(len(encoded_images.images), self.image_processor_tester.batch_size)
def test_call_numpy_4_channels(self):
"""Skip this test as Fuyu doesn't support arbitrary channels"""
self.skipTest("Fuyu processor is designed for 3-channel RGB images")
def test_slow_fast_equivalence(self):
"""Override to handle Fuyu's custom output structure"""
if not self.test_slow_image_processor or not self.test_fast_image_processor:
self.skipTest(reason="Skipping slow/fast equivalence test")
if self.image_processing_class is None or self.fast_image_processing_class is None:
self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")
dummy_image = Image.open(
io.BytesIO(
httpx.get("http://images.cocodataset.org/val2017/000000039769.jpg", follow_redirects=True).content
)
)
image_processor_slow = self.image_processing_class(**self.image_processor_dict)
image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
encoding_slow = image_processor_slow(dummy_image, return_tensors="pt")
encoding_fast = image_processor_fast(dummy_image, return_tensors="pt")
self._assert_slow_fast_tensors_equivalence(encoding_slow.images[0][0], encoding_fast.images[0][0])
def test_slow_fast_equivalence_batched(self):
"""Override to handle Fuyu's custom output structure"""
if not self.test_slow_image_processor or not self.test_fast_image_processor:
self.skipTest(reason="Skipping slow/fast equivalence test")
if self.image_processing_class is None or self.fast_image_processing_class is None:
self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")
dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
image_processor_slow = self.image_processing_class(**self.image_processor_dict)
image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
encoding_slow = image_processor_slow(dummy_images, return_tensors="pt")
encoding_fast = image_processor_fast(dummy_images, return_tensors="pt")
# Compare each image tensor
for slow_img, fast_img in zip(encoding_slow.images, encoding_fast.images):
self._assert_slow_fast_tensors_equivalence(slow_img[0], fast_img[0])
@slow
@require_torch_accelerator
@require_vision
@pytest.mark.torch_compile_test
def test_can_compile_fast_image_processor(self):
if self.fast_image_processing_class is None:
self.skipTest("Skipping compilation test as fast image processor is not defined")
if version.parse(torch.__version__) < version.parse("2.3"):
self.skipTest(reason="This test requires torch >= 2.3 to run.")
torch.compiler.reset()
input_image = torch.randint(0, 255, (3, 224, 224), dtype=torch.uint8)
image_processor = self.fast_image_processing_class(**self.image_processor_dict)
output_eager = image_processor(input_image, device=torch_device, return_tensors="pt")
image_processor = torch.compile(image_processor, mode="reduce-overhead")
output_compiled = image_processor(input_image, device=torch_device, return_tensors="pt")
self._assert_slow_fast_tensors_equivalence(
output_eager.images[0][0], output_compiled.images[0][0], atol=1e-4, rtol=1e-4, mean_atol=1e-5
)
def test_image_processor_properties(self):
for image_processing_class in self.image_processor_list:
image_processor = image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processor, "do_resize"))
self.assertTrue(hasattr(image_processor, "size"))
self.assertTrue(hasattr(image_processor, "do_pad"))
self.assertTrue(hasattr(image_processor, "do_normalize"))
self.assertTrue(hasattr(image_processor, "image_mean"))
self.assertTrue(hasattr(image_processor, "image_std"))
self.assertTrue(hasattr(image_processor, "do_rescale"))
self.assertTrue(hasattr(image_processor, "rescale_factor"))
self.assertTrue(hasattr(image_processor, "patch_size"))
def test_patches(self):
"""Test that patchify_image produces the expected number of patches."""
for image_processing_class in self.image_processor_list:
image_processor = image_processing_class(**self.image_processor_dict)
batch_size = 3
channels = 3
height = 300
width = 300
image_input = torch.rand(batch_size, channels, height, width)
expected_num_patches = image_processor.get_num_patches(image_height=height, image_width=width)
patches_final = image_processor.patchify_image(image=image_input)
self.assertEqual(patches_final.shape[1], expected_num_patches)
def test_patches_match_slow_fast(self):
"""Test that fast processor produces same patches as slow processor."""
if not self.test_slow_image_processor or not self.test_fast_image_processor:
self.skipTest(reason="Skipping slow/fast patch equivalence test")
if self.image_processing_class is None or self.fast_image_processing_class is None:
self.skipTest(
reason="Skipping slow/fast patch equivalence test as one of the image processors is not defined"
)
batch_size = 3
channels = 3
height = 300
width = 300
image_input = torch.rand(batch_size, channels, height, width)
processor_slow = self.image_processing_class(**self.image_processor_dict)
processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
patches_fast = processor_fast.patchify_image(image=image_input)
patches_slow = processor_slow.patchify_image(image=image_input)
self.assertEqual(patches_fast.shape, patches_slow.shape)
torch.testing.assert_close(patches_fast, patches_slow, rtol=1e-4, atol=1e-4)
def test_scale_to_target_aspect_ratio(self):
# (h:450, w:210) fitting (160, 320) -> (160, 210*160/450)
scaled_image = self.processor.resize(self.sample_image, size=self.size)
self.assertEqual(scaled_image.shape[0], 160)
self.assertEqual(scaled_image.shape[1], 74)
"""Test that resize maintains aspect ratio correctly."""
sample_image = np.zeros((450, 210, 3), dtype=np.uint8)
if self.test_slow_image_processor and self.image_processing_class:
image_processor = self.image_processing_class(**self.image_processor_dict)
scaled_image = image_processor.resize(sample_image, size=self.image_processor_dict["size"])
self.assertEqual(scaled_image.shape[0], 180)
self.assertEqual(scaled_image.shape[1], 84)
if self.test_fast_image_processor and self.fast_image_processing_class:
image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
sample_tensor = torch.from_numpy(sample_image).permute(2, 0, 1).float()
size_dict = SizeDict(
height=self.image_processor_dict["size"]["height"], width=self.image_processor_dict["size"]["width"]
)
scaled_image = image_processor_fast.resize(sample_tensor, size=size_dict)
self.assertEqual(scaled_image.shape[1], 180)
self.assertEqual(scaled_image.shape[2], 84)
def test_apply_transformation_numpy(self):
transformed_image = self.processor.preprocess(self.sample_image).images[0][0]
self.assertEqual(transformed_image.shape[1], 160)
self.assertEqual(transformed_image.shape[2], 320)
"""Test preprocessing with numpy input."""
sample_image = np.zeros((450, 210, 3), dtype=np.uint8)
for image_processing_class in self.image_processor_list:
image_processor = image_processing_class(**self.image_processor_dict)
transformed_image = image_processor.preprocess(sample_image).images[0][0]
self.assertEqual(transformed_image.shape[1], 180)
self.assertEqual(transformed_image.shape[2], 360)
def test_apply_transformation_pil(self):
transformed_image = self.processor.preprocess(self.sample_image_pil).images[0][0]
self.assertEqual(transformed_image.shape[1], 160)
self.assertEqual(transformed_image.shape[2], 320)
"""Test preprocessing with PIL input."""
sample_image = np.zeros((450, 210, 3), dtype=np.uint8)
sample_image_pil = Image.fromarray(sample_image)
for image_processing_class in self.image_processor_list:
image_processor = image_processing_class(**self.image_processor_dict)
transformed_image = image_processor.preprocess(sample_image_pil).images[0][0]
self.assertEqual(transformed_image.shape[1], 180)
self.assertEqual(transformed_image.shape[2], 360)
def test_preprocess_output_structure(self):
"""Test that preprocess returns correct output structure."""
sample_image = np.zeros((450, 210, 3), dtype=np.uint8)
for image_processing_class in self.image_processor_list:
image_processor = image_processing_class(**self.image_processor_dict)
result = image_processor.preprocess(sample_image)
self.assertIn("images", result)
self.assertIn("image_unpadded_heights", result)
self.assertIn("image_unpadded_widths", result)
self.assertIn("image_scale_factors", result)
self.assertEqual(len(result.images), 1)
self.assertEqual(len(result.images[0]), 1)
self.assertEqual(len(result.image_unpadded_heights), 1)
self.assertEqual(len(result.image_unpadded_widths), 1)
self.assertEqual(len(result.image_scale_factors), 1)
def test_batch_processing(self):
"""Test processing multiple images."""
sample_image = np.zeros((450, 210, 3), dtype=np.uint8)
sample_image_pil = Image.fromarray(sample_image)
images = [sample_image, sample_image_pil]
for image_processing_class in self.image_processor_list:
image_processor = image_processing_class(**self.image_processor_dict)
result = image_processor.preprocess(images)
self.assertEqual(len(result.images), 2)
for img in result.images:
self.assertEqual(len(img), 1)
if hasattr(img[0], "shape"):
if len(img[0].shape) == 3:
self.assertEqual(img[0].shape[1], 180)
self.assertEqual(img[0].shape[2], 360)
def test_pad_image_fast(self):
"""Test that padding works correctly for fast processor."""
if not self.test_fast_image_processor or self.fast_image_processing_class is None:
self.skipTest(reason="Fast processor not available")
from transformers.image_utils import SizeDict
image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
small_image = torch.rand(3, 100, 100)
size_dict = SizeDict(height=180, width=360)
padded = image_processor_fast.pad([small_image], pad_size=size_dict, fill_value=1.0)[0]
self.assertEqual(padded.shape[1], 180)
self.assertEqual(padded.shape[2], 360)
self.assertTrue(torch.allclose(padded[:, 100:, :], torch.ones_like(padded[:, 100:, :])))
self.assertTrue(torch.allclose(padded[:, :, 100:], torch.ones_like(padded[:, :, 100:])))
def test_preprocess_with_tokenizer_info(self):
"""Test preprocess_with_tokenizer_info functionality."""
batch_size = 2
subseq_size = 1
channels = 3
image_input = torch.rand(batch_size, subseq_size, channels, 180, 360)
image_present = torch.ones(batch_size, subseq_size, dtype=torch.bool)
image_unpadded_h = torch.tensor([[180], [180]])
image_unpadded_w = torch.tensor([[360], [360]])
for image_processing_class in self.image_processor_list:
image_processor = image_processing_class(**self.image_processor_dict)
result = image_processor.preprocess_with_tokenizer_info(
image_input=image_input,
image_present=image_present,
image_unpadded_h=image_unpadded_h,
image_unpadded_w=image_unpadded_w,
image_placeholder_id=100,
image_newline_id=101,
variable_sized=True,
)
# Check output structure
self.assertIn("images", result)
self.assertIn("image_input_ids", result)
self.assertIn("image_patches", result)
self.assertIn("image_patch_indices_per_batch", result)
self.assertIn("image_patch_indices_per_subsequence", result)
# Check batch structure
self.assertEqual(len(result.images), batch_size)
self.assertEqual(len(result.image_input_ids), batch_size)
self.assertEqual(len(result.image_patches), batch_size)
def test_device_handling_fast(self):
"""Test that fast processor can handle device placement."""
if not self.test_fast_image_processor or self.fast_image_processing_class is None:
self.skipTest(reason="Fast processor not available")
sample_image = np.zeros((450, 210, 3), dtype=np.uint8)
image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
if torch.cuda.is_available():
result_cuda = image_processor_fast.preprocess(sample_image, device="cuda")
self.assertEqual(result_cuda.images[0][0].device.type, "cuda")
result_cpu = image_processor_fast.preprocess(sample_image, device="cpu")
self.assertEqual(result_cpu.images[0][0].device.type, "cpu")
def test_do_not_resize_if_smaller(self):
"""Test that images smaller than target size are not resized."""
if not self.test_fast_image_processor or self.fast_image_processing_class is None:
self.skipTest(reason="Fast processor not available")
image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
small_image = torch.rand(3, 100, 150)
size_dict = SizeDict(height=180, width=360)
resized = image_processor_fast.resize(small_image, size=size_dict)
self.assertEqual(resized.shape[1], 100)
self.assertEqual(resized.shape[2], 150)
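A hedged usage sketch of the new fast Fuyu processor, mirroring the expectations encoded in the tests above; the `size` value matches the test config, and the output structure follows the `images` field checked by those tests:

# Hedged sketch: the fast processor resizes/pads to `size` and returns a nested
# `images` structure, as exercised by the tests above.
import numpy as np
from transformers import FuyuImageProcessorFast

processor = FuyuImageProcessorFast(size={"height": 180, "width": 360})
image = np.zeros((450, 210, 3), dtype=np.uint8)
outputs = processor.preprocess(image)
print(len(outputs.images), outputs.images[0][0].shape)  # 1 image; tensor with height 180, width 360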

View File

@@ -18,7 +18,7 @@ import unittest
import numpy as np
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_vision_available
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
@@ -31,6 +31,9 @@ if is_vision_available():
from transformers import GLPNImageProcessor
if is_torchvision_available():
from transformers import GLPNImageProcessorFast
class GLPNImageProcessingTester:
def __init__(
@@ -87,19 +90,32 @@ class GLPNImageProcessingTester:
torchify=torchify,
)
def prepare_depth_outputs(self):
if not is_torch_available():
return None
depth_tensors = prepare_image_inputs(
batch_size=self.batch_size,
num_channels=1,
min_resolution=self.min_resolution,
max_resolution=self.max_resolution,
equal_resolution=True,
torchify=True,
)
depth_tensors = [depth_tensor.squeeze(0) for depth_tensor in depth_tensors]
stacked_depth_tensors = torch.stack(depth_tensors, dim=0)
return type("DepthOutput", (), {"predicted_depth": stacked_depth_tensors})
@require_torch
@require_vision
class GLPNImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = GLPNImageProcessor if is_vision_available() else None
fast_image_processing_class = GLPNImageProcessorFast if is_torchvision_available() else None
def setUp(self):
super().setUp()
self.image_processor_tester = GLPNImageProcessingTester(self)
@property
def image_processor_dict(self):
return self.image_processor_tester.prepare_image_processor_dict()
self.image_processor_dict = self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self):
image_processing = self.image_processing_class(**self.image_processor_dict)
@@ -115,7 +131,6 @@ class GLPNImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False)
for image in image_inputs:
self.assertIsInstance(image, Image.Image)
# Test not batched input (GLPNImageProcessor doesn't support batching)
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
@@ -161,3 +176,43 @@ class GLPNImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
self.assertTrue(tuple(encoded_images.shape) == (1, *expected_output_image_shape))
self.image_processing_class.num_channels = 3
# override as glpn image processors don't support heterogeneous batching
@require_vision
@require_torch
def test_slow_fast_equivalence_batched(self):
if not self.test_slow_image_processor or not self.test_fast_image_processor:
self.skipTest(reason="Skipping slow/fast equivalence test")
if self.image_processing_class is None or self.fast_image_processing_class is None:
self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")
dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True)
image_processor_slow = self.image_processing_class(**self.image_processor_dict)
image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
encoding_slow = image_processor_slow(dummy_images, return_tensors="pt")
encoding_fast = image_processor_fast(dummy_images, return_tensors="pt")
self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)
def test_post_process_depth_equivalence(self):
# Check that both processors produce equivalent post-processed depth maps
if self.fast_image_processing_class is None:
self.skipTest("TorchVision not available")
outputs = self.image_processor_tester.prepare_depth_outputs()
slow = self.image_processing_class(**self.image_processor_dict)
fast = self.fast_image_processing_class(**self.image_processor_dict)
# target_sizes simulate resized inference outputs
target_sizes = [(240, 320)] * self.image_processor_tester.batch_size
processed_slow = slow.post_process_depth_estimation(outputs, target_sizes=target_sizes)
processed_fast = fast.post_process_depth_estimation(outputs, target_sizes=target_sizes)
# Compare per-sample predicted depth tensors
for pred_slow, pred_fast in zip(processed_slow, processed_fast):
depth_slow = pred_slow["predicted_depth"]
depth_fast = pred_fast["predicted_depth"]
torch.testing.assert_close(depth_fast, depth_slow, atol=1e-1, rtol=1e-3)
self.assertLessEqual(torch.mean(torch.abs(depth_fast.float() - depth_slow.float())).item(), 5e-3)

View File

@@ -172,6 +172,7 @@ class Pix2StructProcessorTest(ProcessorTesterMixin, unittest.TestCase):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor", max_patches=1024, patch_size={"height": 8, "width": 8})
print("image_processor", image_processor)
tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)

View File

@@ -36,6 +36,7 @@ from transformers import (
is_torch_available,
)
from transformers.testing_utils import (
Expectations,
backend_empty_cache,
require_flash_attn,
require_torch,
@@ -831,7 +832,14 @@ class VideoLlama3IntegrationTest(unittest.TestCase):
torch.testing.assert_close(expected_pixel_slice, inputs.pixel_values[:6, :3], atol=1e-4, rtol=1e-4)
output = model.generate(**inputs, max_new_tokens=20, do_sample=False, repetition_penalty=None)
EXPECTED_DECODED_TEXT = "user\n\nDescribe the image.\nassistant\nThe image captures a vibrant nighttime scene on a bustling city street. A woman in a striking red dress"
# fmt: off
EXPECTED_DECODED_TEXT = Expectations(
{
("cuda", None): "user\n\nDescribe the image.\nassistant\nThe image captures a vibrant nighttime scene on a bustling city street. A woman in a striking red dress",
("xpu", None): "user\n\nDescribe the image.\nassistant\nThe image captures a vibrant night scene in a bustling Japanese city. A woman in a striking red dress",
}
).get_expectation()
# fmt: on
self.assertEqual(
self.processor.decode(output[0], skip_special_tokens=True),
@@ -874,11 +882,21 @@ class VideoLlama3IntegrationTest(unittest.TestCase):
# it should not matter whether two images are the same size or not
output = model.generate(**inputs, max_new_tokens=20, do_sample=False, repetition_penalty=None)
# fmt: off
EXPECTED_DECODED_TEXT = Expectations(
{
("cuda", None): [
"user\n\nDescribe the image.\nassistant\nThe image captures a vibrant nighttime scene on a bustling city street. A woman in a striking red dress",
"user\nWhat is relativity?\nassistant\nRelativity is a scientific theory that describes the relationship between space and time. It was first proposed by",
],
("xpu", None): [
"user\n\nDescribe the image.\nassistant\nThe image captures a vibrant night scene in a bustling Japanese city. A woman in a striking red dress",
"user\nWhat is relativity?\nassistant\nRelativity is a scientific theory that describes the relationship between space and time. It was first proposed by",
],
}
).get_expectation()
# fmt: on
EXPECTED_DECODED_TEXT = [
"user\n\nDescribe the image.\nassistant\nThe image captures a vibrant nighttime scene on a bustling city street. A woman in a striking red dress",
"user\nWhat is relativity?\nassistant\nRelativity is a scientific theory that describes the relationship between space and time. It was first proposed by",
] # fmt: skip
self.assertEqual(
self.processor.batch_decode(output, skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
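A hedged sketch of the `Expectations` lookup pattern adopted above; the import, the device-tuple keys, and `get_expectation()` are taken from the diff, while the strings are placeholders:

# Hedged sketch: device-dependent expected outputs keyed by tuples such as
# ("cuda", 8), ("cuda", None), or ("xpu", None); get_expectation() resolves
# the entry matching the current test device.
from transformers.testing_utils import Expectations

EXPECTED_TEXT = Expectations(
    {
        ("cuda", None): "output expected on CUDA",
        ("xpu", None): "output expected on Intel XPU",
    }
).get_expectation()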

View File

@@ -22,14 +22,14 @@ from transformers.testing_utils import (
require_accelerate,
require_fp_quant,
require_qutlass,
require_torch_gpu,
require_torch_multi_gpu,
require_torch_accelerator,
require_torch_multi_accelerator,
slow,
torch_device,
)
@require_torch_gpu
@require_torch_accelerator
class FPQuantConfigTest(unittest.TestCase):
def test_to_dict(self):
"""
@@ -53,7 +53,7 @@ class FPQuantConfigTest(unittest.TestCase):
@slow
@require_torch_gpu
@require_torch_accelerator
@require_fp_quant
@require_accelerate
class FPQuantBaseTest(unittest.TestCase):
@@ -64,7 +64,7 @@ class FPQuantBaseTest(unittest.TestCase):
EXPECTED_OUTPUT = "1 2 3 4 5 6"
device_map = "cuda"
device_map = torch_device
@classmethod
def getQuantizationConfig(cls):
@@ -77,10 +77,10 @@ class FPQuantBaseTest(unittest.TestCase):
Setup quantized model
"""
quantization_config = cls.getQuantizationConfig()
cls.quantization_config = cls.getQuantizationConfig()
cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
cls.quantized_model = AutoModelForCausalLM.from_pretrained(
cls.model_name, device_map=cls.device_map, quantization_config=quantization_config
cls.model_name, device_map=cls.device_map, quantization_config=cls.quantization_config
)
def tearDown(self):
@@ -111,24 +111,25 @@ class FPQuantBaseTest(unittest.TestCase):
output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
@require_torch_multi_gpu
def test_quantized_model_multi_gpu(self):
@require_torch_multi_accelerator
def test_quantized_model_multi_accelerator(self):
"""
Simple test that checks if the quantized model is working properly with multiple GPUs
set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 GPUs
Simple test that checks if the quantized model is working properly with multiple accelerators.
Set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 CUDA GPUs, or set ZE_AFFINITY_MASK=0,1
if you have more than 2 Intel XPUs.
"""
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
quantization_config = FPQuantConfig()
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name, device_map="auto", quantization_config=quantization_config
self.model_name, device_map="auto", quantization_config=self.quantization_config
)
self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})
output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
@require_torch_multi_gpu
def test_save_pretrained_multi_gpu(self):
@require_torch_multi_accelerator
def test_save_pretrained_multi_accelerator(self):
"""
Simple test that checks if the quantized model is working properly after being saved and loaded
"""
@@ -163,6 +164,13 @@ class FPQuantMXFP4Test(FPQuantBaseTest):
return FPQuantConfig(forward_dtype="mxfp4", pseudoquantization=False)
@require_qutlass
class FPQuantNVFP4Test(FPQuantBaseTest):
@classmethod
def getQuantizationConfig(cls):
return FPQuantConfig(forward_dtype="nvfp4", pseudoquantization=False)
@require_qutlass
class FPQuantMXFP4GS128Test(FPQuantBaseTest):
@classmethod

View File

@@ -81,6 +81,8 @@ if __name__ == "__main__":
for idx in range(args.num_splits):
start = end
end = start + num_jobs_per_splits + (1 if idx < num_jobs % args.num_splits else 0)
model_splits.append(d[start:end])
# Only add the slice if it is not an empty list
if len(d[start:end]) > 0:
model_splits.append(d[start:end])
print(model_splits)