Compare commits


24 Commits

Author SHA1 Message Date
b356fce1da solve unequal cropping 2025-08-11 19:20:28 +02:00
af7f75e682 use existing methods, add default image 2025-08-11 16:44:06 +02:00
34ba5909a2 add an unnormalize image method 2025-08-11 16:43:27 +02:00
fbec904fb0 Merge branch 'main' into vision_visualizer 2025-08-06 19:19:09 +02:00
a1263dfe7b fixup 2025-08-06 19:17:38 +02:00
1878d6c4ff add captions and better tiling detection 2025-08-06 19:16:14 +02:00
cb2e0df2ec [image processor] fix glm4v (#39964)
* fix glm4v image process

* Update src/transformers/models/glm4v/image_processing_glm4v.py

---------

Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>
2025-08-06 17:46:58 +01:00
9ab75fc428 fix typo (#39936)
* fix typo

* fix modular instead

* fix

---------

Co-authored-by: y.korobko <y.korobko@tbank.ru>
2025-08-06 16:21:24 +00:00
43b3f58875 Fix grammatical error in MoE variable name: expert_hitted → expert_hit, hitted_experts → hit_experts (#39959)
* Fix grammatical error: expert_hitted -> expert_hit in MoE implementations

* Fix grammatical error: hitted_experts -> hit_experts in MoE implementation
2025-08-06 15:45:19 +00:00
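
For reference, a self-contained sketch (illustrative shapes only) of the MoE dispatch pattern in which the renamed variable appears; `expert_hit` lists the experts that received at least one token:

```python
import torch

num_experts, top_k, num_tokens = 8, 2, 5
router_logits = torch.randn(num_tokens, num_experts)
routing_weights, selected_experts = torch.topk(router_logits.softmax(dim=-1), top_k, dim=-1)

# One-hot mask of shape (num_experts, top_k, num_tokens); an expert is "hit" if any token routes to it.
expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=num_experts).permute(2, 1, 0)
expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()  # formerly `expert_hitted`
for expert_idx in expert_hit:
    # `top_x` indexes the tokens routed to this expert, `idx` their slot within the top_k selection.
    idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))
```
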
dff6185d61 docs: fix typo in 'quantization-aware training' (#39904) 2025-08-06 14:52:43 +00:00
c7844c7a8e Enable gpt-oss mxfp4 on older hardware (sm75+) (#39940)
Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
2025-08-06 13:39:21 +00:00
dd70a8cb9d Fix MXFP4 quantizer validation to allow CPU inference with dequantize option (#39953)
* Fix MXFP4 quantizer validation to enable CPU dequantization

Move dequantize check before CUDA availability check to allow
CPU inference when quantization_config.dequantize is True.
This enables users to run MXFP4 models on CPU by automatically
converting them to BF16 format.

* Add tests for MXFP4 quantizer CPU dequantization validation

* fix: format mxfp4 test file with ruff
2025-08-06 15:20:41 +02:00
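
A minimal sketch of the behavior this commit enables, assuming `Mxfp4Config(dequantize=True)` as referenced in the quantizer diff further down; the repo id is only an example checkpoint:

```python
from transformers import AutoModelForCausalLM, Mxfp4Config

# With dequantize=True the MXFP4 weights are converted to bf16, so the model can run on CPU.
model = AutoModelForCausalLM.from_pretrained(
    "openai/gpt-oss-20b",  # example MXFP4-quantized checkpoint
    quantization_config=Mxfp4Config(dequantize=True),
    device_map="cpu",
)
```
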
82eb67e62a [docs] ko toc fix (#39927) 2025-08-06 10:12:34 +00:00
9e76a6bb54 circleci: pin torch 2.7.1 until torchcodec is updated (#39951)
circleci torch 2.7.1

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-08-06 11:18:00 +02:00
910b319357 Fix CI: Tests failing on CPU due to torch.device('cpu').index being None (#39933)
replace routing_weights.device.index with a
2025-08-06 10:22:43 +02:00
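
For context, `torch.device("cpu").index` is `None`, which is what made `int(routing_weights.device.index)` fail on CPU-only runs; a standalone sketch of the fallback applied in the diffs below:

```python
import torch

device = torch.device("cpu")
print(device.index)  # None, so int(device.index) raised a TypeError

# Fallback used in the load_balancing_loss_func fixes: treat a missing index as device 0.
device_index = device.index if device.index is not None else 0
rank = 2 * int(device_index)  # 2 stands in for routing_weights.shape[1] (top_k)
print(rank)  # 0
```
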
369c99d0ce Avoid utils/check_bad_commit.py failing due to rate limit (requesting api.github.com) (#39918)
fix

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-08-05 21:52:20 +02:00
b771e476a8 [CI] post-GptOss fixes for green CI (#39929) 2025-08-05 20:04:59 +02:00
eb6e26acf3 Dev version 2025-08-05 18:09:30 +02:00
a6a18efe53 better namings 2025-08-05 17:30:05 +02:00
e581d2f2ce fixup 2025-07-25 08:02:39 +00:00
1f6822d114 move processor visualizer 2025-07-25 07:58:35 +00:00
edb70ae15c Merge branch 'main' into vision_visualizer 2025-07-24 12:50:27 +00:00
27bc371bea Merge branch 'main' into vision_visualizer 2025-07-22 13:01:45 +02:00
58c619e809 draft the vision visualizer 2025-03-21 18:53:04 +01:00
91 changed files with 701 additions and 138 deletions

View File

@ -5,7 +5,7 @@ ARG REF=main
RUN apt-get update && apt-get install -y time git g++ pkg-config make git-lfs
ENV UV_PYTHON=/usr/local/bin/python
RUN pip install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools GitPython
RUN uv pip install --no-cache-dir --upgrade 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir --upgrade 'torch<2.8' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
# tensorflow pin matching setup.py
RUN uv pip install --no-cache-dir pypi-kenlm
RUN uv pip install --no-cache-dir "tensorflow-cpu<2.16" "tf-keras<2.16"

View File

@ -16,7 +16,7 @@ RUN cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
RUN make install -j 10
RUN uv pip install --no-cache --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache --upgrade 'torch<2.8' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]" unidic unidic-lite
# spacy is not used so not tested. Causes to failures. TODO fix later

View File

@ -5,7 +5,7 @@ USER root
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git ffmpeg
ENV UV_PYTHON=/usr/local/bin/python
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir 'torch<2.8' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer
RUN uv pip uninstall transformers

View File

@ -5,7 +5,7 @@ USER root
RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git libgl1-mesa-glx libgl1 g++ tesseract-ocr
ENV UV_PYTHON=/usr/local/bin/python
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir 'torch<2.8' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir --no-deps timm accelerate
RUN pip install -U --upgrade-strategy eager --no-cache-dir pytesseract python-Levenshtein opencv-python nltk
# RUN uv pip install --no-cache-dir natten==0.15.1+torch210cpu -f https://shi-labs.com/natten/wheels

View File

@ -5,7 +5,7 @@ USER root
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git ffmpeg
ENV UV_PYTHON=/usr/local/bin/python
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir 'torch<2.8' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]"
RUN uv pip uninstall transformers

View File

@ -6,7 +6,7 @@ RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git g++
ENV UV_PYTHON=/usr/local/bin/python
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-deps accelerate
RUN uv pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir 'torch<2.8' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir "scipy<1.13" "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,audio,sklearn,sentencepiece,vision,testing]"

View File

@ -5,7 +5,7 @@ USER root
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs ffmpeg
ENV UV_PYTHON=/usr/local/bin/python
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir 'torch<2.8' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken,num2words,video]"
RUN uv pip uninstall transformers

View File

@ -7,7 +7,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-de
ENV UV_PYTHON=/usr/local/bin/python
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-cache-dir --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir 'torch<2.8' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
RUN git lfs install
RUN uv pip install --no-cache-dir pypi-kenlm

View File

@ -511,6 +511,8 @@
title: GPT2
- local: model_doc/gpt_bigcode
title: GPTBigCode
- local: model_doc/gpt_oss
title: GptOss
- local: model_doc/gptsan-japanese
title: GPTSAN Japanese
- local: model_doc/gpt-sw3
@ -617,8 +619,6 @@
title: OLMoE
- local: model_doc/open-llama
title: Open-Llama
- local: model_doc/openai_moe
title: OpenAIMoe
- local: model_doc/opt
title: OPT
- local: model_doc/pegasus

View File

@ -65,6 +65,10 @@ Learn how to quantize models in the [Quantization](../quantization) guide.
[[autodoc]] HqqConfig
## Mxfp4Config
[[autodoc]] Mxfp4Config
## FbgemmFp8Config
[[autodoc]] FbgemmFp8Config

View File

@ -24,11 +24,11 @@ rendered properly in your Markdown viewer.
</div>
</div>
# OpenAIMoE
# GptOss
## Overview
The OpenAIMoE model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
The GptOss model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
<INSERT SHORT SUMMARY HERE>
The abstract from the paper is the following:
@ -43,16 +43,16 @@ This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface
The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).
## OpenAIMoeConfig
## GptOssConfig
[[autodoc]] OpenAIMoeConfig
[[autodoc]] GptOssConfig
## OpenAIMoeModel
## GptOssModel
[[autodoc]] OpenAIMoeModel
[[autodoc]] GptOssModel
- forward
## OpenAIMoeForCausalLM
## GptOssForCausalLM
[[autodoc]] OpenAIMoeForCausalLM
[[autodoc]] GptOssForCausalLM
- forward

View File

@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.
# FP-Quant
[FP-Quant](https://github.com/IST-DASLab/FP-Quant) is a family of quantization algorithms tailored for the Blackwell generation of Nvidia GPUs. The goal is to allow for efficient post-training quantization (PTQ) and quantization-aware trainin (QAT) of LLMs in the [MXFP4 and NVFP4 data-types](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf).
[FP-Quant](https://github.com/IST-DASLab/FP-Quant) is a family of quantization algorithms tailored for the Blackwell generation of Nvidia GPUs. The goal is to allow for efficient post-training quantization (PTQ) and quantization-aware training (QAT) of LLMs in the [MXFP4 and NVFP4 data-types](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf).
Currently, only PTQ with MXFP4 is supported. Models can either be quantized on the fly with `quantization_config=FPQuantConfig()`:
@ -63,4 +63,4 @@ model.forward = torch.compile(model.forward, mode="max-autotune", fullgraph=True
FP-Quant currently performs best for very large batch size processing.
See [QuTLASS README](https://github.com/IST-DASLab/qutlass/blob/main/README.md) for speedups.
See [QuTLASS README](https://github.com/IST-DASLab/qutlass/blob/main/README.md) for speedups.
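
A hedged usage sketch of the on-the-fly PTQ path mentioned above; the model id is a placeholder, `FPQuantConfig()` is assumed to default to MXFP4, and a Blackwell GPU with the QuTLASS kernels is required per the doc:

```python
from transformers import AutoModelForCausalLM, FPQuantConfig

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B",            # placeholder model id
    quantization_config=FPQuantConfig(),  # on-the-fly MXFP4 post-training quantization
    device_map="cuda",
    torch_dtype="auto",
)
```
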

View File

@ -327,8 +327,6 @@
title: (번역중) Contribute new quantization method
title: (번역중) 경량화 메소드
- sections:
- local: performance
title: 성능 및 확장성
- local: in_translation
title: (번역중) Quantization
- local: llm_optims
@ -348,8 +346,6 @@
title: CPU에서 훈련
- local: perf_train_cpu_many
title: 다중 CPU에서 훈련하기
- local: perf_train_tpu_tf
title: TensorFlow로 TPU에서 훈련하기
- local: perf_train_special
title: Apple 실리콘에서 PyTorch 학습
- local: perf_hardware
@ -363,12 +359,8 @@
- local: perf_infer_gpu_one
title: 하나의 GPU를 활용한 추론
title: 추론 최적화하기
- local: big_models
title: 대형 모델을 인스턴스화
- local: debugging
title: 디버깅
- local: tf_xla
title: TensorFlow 모델을 위한 XLA 통합
- local: in_translation
title: (번역중) Optimize inference using `torch.compile()`
title: (번역중) 성능 및 확장성

View File

@ -60,7 +60,7 @@ from transformers.utils import check_min_version, send_example_telemetry
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
Array = Any
Dataset = datasets.arrow_dataset.Dataset

View File

@ -59,7 +59,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risk.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/flax/speech-recognition/requirements.txt")

View File

@ -55,7 +55,7 @@ from transformers.utils import check_min_version, send_example_telemetry
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
Array = Any
Dataset = datasets.arrow_dataset.Dataset

View File

@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")

View File

@ -55,7 +55,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt")

View File

@ -63,7 +63,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt")

View File

@ -68,7 +68,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")

View File

@ -61,7 +61,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
logger = get_logger(__name__)

View File

@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

View File

@ -56,7 +56,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

View File

@ -61,7 +61,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

View File

@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt")

View File

@ -63,7 +63,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt")

View File

@ -69,7 +69,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

View File

@ -71,7 +71,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
logger = get_logger(__name__)

View File

@ -72,7 +72,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

View File

@ -74,7 +74,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
logger = get_logger(__name__)

View File

@ -68,7 +68,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

View File

@ -71,7 +71,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
logger = get_logger(__name__)
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

View File

@ -61,7 +61,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

View File

@ -57,7 +57,7 @@ from transformers.utils import check_min_version, send_example_telemetry
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
logger = logging.getLogger(__name__)

View File

@ -65,7 +65,7 @@ from transformers.utils import check_min_version, send_example_telemetry
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
logger = get_logger(__name__)
# You should update this to your particular problem to have better documentation of `model_type`

View File

@ -59,7 +59,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/object-detection/requirements.txt")

View File

@ -63,7 +63,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
logging.basicConfig(level=logging.INFO)
logger = get_logger(__name__)

View File

@ -49,7 +49,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

View File

@ -47,7 +47,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

View File

@ -54,7 +54,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

View File

@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

View File

@ -45,7 +45,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

View File

@ -62,7 +62,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt")

View File

@ -62,7 +62,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
logger = get_logger(__name__)

View File

@ -61,7 +61,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")

View File

@ -64,7 +64,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")

View File

@ -60,7 +60,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")

View File

@ -67,7 +67,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

View File

@ -71,7 +71,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
logger = get_logger(__name__)
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

View File

@ -61,7 +61,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

View File

@ -63,7 +63,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

View File

@ -63,7 +63,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
logger = get_logger(__name__)

View File

@ -62,7 +62,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

View File

@ -60,7 +60,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")

View File

@ -67,7 +67,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
logger = get_logger(__name__)
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")

View File

@ -66,7 +66,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")

View File

@ -71,7 +71,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
logger = get_logger(__name__)
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")

View File

@ -50,7 +50,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version(
"datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/contrastive-image-text/requirements.txt"

View File

@ -54,7 +54,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")

View File

@ -49,7 +49,7 @@ from transformers.utils import check_min_version, send_example_telemetry
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
logger = logging.getLogger(__name__)

View File

@ -61,7 +61,7 @@ except (ModuleNotFoundError, ImportError):
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
logger = logging.getLogger(__name__)

View File

@ -52,7 +52,7 @@ from transformers.utils.versions import require_version
# region Checking dependencies
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

View File

@ -46,7 +46,7 @@ from transformers.utils import check_min_version, send_example_telemetry
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
task_to_keys = {
"cola": ("sentence", None),

View File

@ -55,7 +55,7 @@ from transformers.utils.versions import require_version
# region Dependencies and constants
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.55.0.dev0")
check_min_version("4.56.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

View File

@ -463,7 +463,7 @@ install_requires = [
setup(
name="transformers",
version="4.55.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
version="4.56.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)",
author_email="transformers@huggingface.co",
description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow",

View File

@ -18,7 +18,7 @@
# to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
# in the namespace without actually importing anything (and especially none of the backends).
__version__ = "4.55.0.dev0"
__version__ = "4.56.0.dev0"
from pathlib import Path
from typing import TYPE_CHECKING

View File

@ -452,6 +452,105 @@ def normalize(
return image
def unnormalize(
image: Union[np.ndarray, "torch.Tensor"],
mean: Union[float, Collection[float]],
std: Union[float, Collection[float]],
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
):
"""
Inverse of `normalize`:
image = image * std + mean
Accepts NumPy arrays or PyTorch tensors and mirrors `normalize`'s API,
but also handles 4D/5D by broadcasting along the channel axis and
collapsing leading batch dims. Defaults to NHWC output for visualization.
"""
# type check
is_np = isinstance(image, np.ndarray)
is_torch = isinstance(image, torch.Tensor)
if not (is_np or is_torch):
raise TypeError("image must be a numpy array or a torch tensor")
# infer layout
if input_data_format is None:
input_data_format = infer_channel_dimension_format(image)
# cast policy (match normalize): cast only if not floating
if is_np:
if not np.issubdtype(image.dtype, np.floating):
image = image.astype(np.float32)
else:
if not image.is_floating_point():
image = image.float()
# channel axis and sizes
ch_axis = get_channel_dimension_axis(image, input_data_format=input_data_format)
num_channels = int(image.shape[ch_axis])
# normalize mean/std to per-channel vectors
def _as_seq(x, n):
if isinstance(x, Collection):
if len(x) != n:
raise ValueError(f"value must have {n} elements if it is an iterable, got {len(x)}")
return x
return [x] * n
mean_seq = _as_seq(mean, num_channels)
std_seq = _as_seq(std, num_channels)
# make broadcastable tensors/arrays shaped [1, ..., C (at ch_axis), ..., 1]
bshape = [1] * image.ndim
bshape[ch_axis] = num_channels
if is_np:
mean_arr = np.asarray(mean_seq, dtype=image.dtype).reshape(bshape)
std_arr = np.asarray(std_seq, dtype=image.dtype).reshape(bshape)
image = image * std_arr + mean_arr
else:
mean_arr = torch.as_tensor(mean_seq, dtype=image.dtype, device=image.device).view(bshape)
std_arr = torch.as_tensor(std_seq, dtype=image.dtype, device=image.device).view(bshape)
image = image * std_arr + mean_arr
# convert to numpy for plotting
if is_torch:
image = image.detach().cpu().numpy()
is_np = True # from here on
# target layout: default to NHWC so downstream viz works out of the box
target_format = data_format or ChannelDimension.LAST
# collapse any leading batch dims into one, preserving (C,H,W) or (H,W,C)
if input_data_format == ChannelDimension.FIRST:
# layout: [*, C, H, W]
lead = int(np.prod(image.shape[: image.ndim - 3])) if image.ndim > 3 else 1
if image.ndim == 3:
c, h, w = image.shape
image = image.reshape(1, c, h, w)
lead = 1
else:
c, h, w = image.shape[-3:]
image = image.reshape(lead, c, h, w)
if target_format == ChannelDimension.LAST:
image = np.transpose(image, (0, 2, 3, 1)) # -> [N, H, W, C]
else:
# layout: [*, H, W, C]
lead = int(np.prod(image.shape[: image.ndim - 3])) if image.ndim > 3 else 1
if image.ndim == 3:
h, w, c = image.shape
image = image.reshape(1, h, w, c)
lead = 1
else:
h, w, c = image.shape[-3:]
image = image.reshape(lead, h, w, c)
if target_format == ChannelDimension.FIRST:
image = np.transpose(image, (0, 3, 1, 2)) # -> [N, C, H, W]
return image
def center_crop(
image: np.ndarray,
size: tuple[int, int],

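A standalone NumPy check of the math implemented by `unnormalize` above: `image * std + mean` undoes `normalize` per channel (channels-first input here, values illustrative):

```python
import numpy as np

mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
std = np.array([0.229, 0.224, 0.225], dtype=np.float32)

original = np.random.rand(3, 4, 4).astype(np.float32)               # (C, H, W) pixel values in [0, 1]
normalized = (original - mean[:, None, None]) / std[:, None, None]  # what `normalize` does
restored = normalized * std[:, None, None] + mean[:, None, None]    # what `unnormalize` does

assert np.allclose(restored, original, atol=1e-5)
```
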
View File

@ -264,8 +264,8 @@ def routing_torch_dist(
expt_data = compute_expt_data_torch(hist, n_local_experts, n_gates_pad)
hitted_experts = n_expts_act
return RoutingData(gate_scal, hist, n_local_experts, hitted_experts, expt_data), gather_indx, scatter_indx
hit_experts = n_expts_act
return RoutingData(gate_scal, hist, n_local_experts, hit_experts, expt_data), gather_indx, scatter_indx
def mlp_forward(self, hidden_states):
@ -280,7 +280,10 @@ def mlp_forward(self, hidden_states):
batch_size = hidden_states.shape[0]
hidden_states = hidden_states.reshape(-1, self.router.hidden_dim)
router_logits = nn.functional.linear(hidden_states, self.router.weight, self.router.bias)
routing_data, gather_idx, scatter_idx = routing(router_logits, self.router.top_k)
with torch.cuda.device(router_logits.device):
routing_data, gather_idx, scatter_idx = routing(router_logits, self.router.top_k)
routed_out = self.experts(hidden_states, routing_data, gather_idx, scatter_idx)
routed_out = routed_out.reshape(batch_size, -1, self.router.hidden_dim)
return routed_out, router_logits

View File

@ -356,8 +356,8 @@ class Ernie4_5_MoeSparseMoeBlock(nn.Module):
expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
# Loop over all available experts in the model and perform the computation on each expert
expert_hitted = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
for expert_idx in expert_hitted:
expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
for expert_idx in expert_hit:
expert_layer = self.experts[expert_idx]
idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))

View File

@ -167,8 +167,8 @@ class Ernie4_5_MoeSparseMoeBlock(nn.Module):
expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
# Loop over all available experts in the model and perform the computation on each expert
expert_hitted = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
for expert_idx in expert_hitted:
expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
for expert_idx in expert_hit:
expert_layer = self.experts[expert_idx]
idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))

View File

@ -141,7 +141,7 @@ class Glm4vImageProcessor(BaseImageProcessor):
super().__init__(**kwargs)
if size is not None and ("shortest_edge" not in size or "longest_edge" not in size):
raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
else:
elif size is None:
size = {"shortest_edge": 112 * 112, "longest_edge": 28 * 28 * 15000}
self.size = size
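
A simplified, standalone sketch of why `elif size is None:` matters here: the previous bare `else:` overwrote a valid user-supplied `size` with the defaults:

```python
def resolve_size(size):
    if size is not None and ("shortest_edge" not in size or "longest_edge" not in size):
        raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
    elif size is None:  # old code: bare `else`, which replaced a valid size with the defaults
        size = {"shortest_edge": 112 * 112, "longest_edge": 28 * 28 * 15000}
    return size

assert resolve_size({"shortest_edge": 256, "longest_edge": 1024})["shortest_edge"] == 256
```
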

View File

@ -75,7 +75,7 @@ class GptOssExperts(nn.Module):
def forward(self, hidden_states: torch.Tensor, router_indices=None, routing_weights=None) -> torch.Tensor:
"""
When training is is more efficient to just loop over the experts and compute the output for each expert
When training it is more efficient to just loop over the experts and compute the output for each expert
as otherwise the memory would explode.
For inference we can sacrifice some memory and compute the output for all experts at once. By repeating the inputs.
@ -97,8 +97,8 @@ class GptOssExperts(nn.Module):
expert_mask = expert_mask.permute(2, 1, 0)
# we sum on the top_k and on the sequence lenght to get which experts
# are hit this time around
expert_hitted = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
for expert_idx in expert_hitted[:]:
expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
for expert_idx in expert_hit[:]:
with torch.no_grad():
_, token_idx = torch.where(expert_mask[expert_idx[0]])
current_state = hidden_states[token_idx]

View File

@ -73,7 +73,7 @@ class GptOssExperts(nn.Module):
def forward(self, hidden_states: torch.Tensor, router_indices=None, routing_weights=None) -> torch.Tensor:
"""
When training is is more efficient to just loop over the experts and compute the output for each expert
When training it is more efficient to just loop over the experts and compute the output for each expert
as otherwise the memory would explode.
For inference we can sacrifice some memory and compute the output for all experts at once. By repeating the inputs.
@ -95,8 +95,8 @@ class GptOssExperts(nn.Module):
expert_mask = expert_mask.permute(2, 1, 0)
# we sum on the top_k and on the sequence lenght to get which experts
# are hit this time around
expert_hitted = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
for expert_idx in expert_hitted[:]:
expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
for expert_idx in expert_hit[:]:
with torch.no_grad():
_, token_idx = torch.where(expert_mask[expert_idx[0]])
current_state = hidden_states[token_idx]

View File

@ -40,7 +40,7 @@ if is_torch_flex_attn_available():
logger = logging.get_logger(__name__)
# Copied from transformers.models.jetmoe.modeling_jetmoe.load_balancing_loss_func
# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func
def load_balancing_loss_func(
gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
num_experts: Optional[int] = None,
@ -119,7 +119,8 @@ def load_balancing_loss_func(
router_per_expert_attention_mask, dim=0
)
rank = routing_weights.shape[1] * int(routing_weights.device.index)
device_index = routing_weights.device.index if routing_weights.device.index is not None else 0
rank = routing_weights.shape[1] * int(device_index)
overall_loss = torch.sum(
tokens_per_expert[:, rank : rank + routing_weights.shape[1]] * router_prob_per_expert.unsqueeze(0)
)

View File

@ -1647,7 +1647,8 @@ def load_balancing_loss_func(
router_per_expert_attention_mask, dim=0
)
rank = routing_weights.shape[1] * int(routing_weights.device.index)
device_index = routing_weights.device.index if routing_weights.device.index is not None else 0
rank = routing_weights.shape[1] * int(device_index)
overall_loss = torch.sum(
tokens_per_expert[:, rank : rank + routing_weights.shape[1]] * router_prob_per_expert.unsqueeze(0)
)

View File

@ -918,7 +918,8 @@ def load_balancing_loss_func(
router_per_expert_attention_mask, dim=0
)
rank = routing_weights.shape[1] * int(routing_weights.device.index)
device_index = routing_weights.device.index if routing_weights.device.index is not None else 0
rank = routing_weights.shape[1] * int(device_index)
overall_loss = torch.sum(
tokens_per_expert[:, rank : rank + routing_weights.shape[1]] * router_prob_per_expert.unsqueeze(0)
)

View File

@ -67,7 +67,7 @@ is_fast_path_available = all(
logger = logging.get_logger(__name__)
# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func with gate->router
# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func with gate->router
def load_balancing_loss_func(
router_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
num_experts: Optional[int] = None,
@ -148,7 +148,8 @@ def load_balancing_loss_func(
router_per_expert_attention_mask, dim=0
)
rank = routing_weights.shape[1] * int(routing_weights.device.index)
device_index = routing_weights.device.index if routing_weights.device.index is not None else 0
rank = routing_weights.shape[1] * int(device_index)
overall_loss = torch.sum(
tokens_per_expert[:, rank : rank + routing_weights.shape[1]] * router_prob_per_expert.unsqueeze(0)
)

View File

@ -50,7 +50,7 @@ if is_flash_attn_available():
logger = logging.get_logger(__name__)
# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func
# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func
def load_balancing_loss_func(
gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
num_experts: Optional[int] = None,
@ -129,7 +129,8 @@ def load_balancing_loss_func(
router_per_expert_attention_mask, dim=0
)
rank = routing_weights.shape[1] * int(routing_weights.device.index)
device_index = routing_weights.device.index if routing_weights.device.index is not None else 0
rank = routing_weights.shape[1] * int(device_index)
overall_loss = torch.sum(
tokens_per_expert[:, rank : rank + routing_weights.shape[1]] * router_prob_per_expert.unsqueeze(0)
)

View File

@ -465,8 +465,8 @@ class MiniMaxSparseMoeBlock(nn.Module):
# this will be used to easily index which expert is going to be sollicitated
expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
expert_hitted = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
for expert_idx in expert_hitted:
expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
for expert_idx in expert_hit:
expert_layer = self.experts[expert_idx]
idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))
# Index the correct hidden states and compute the expert hidden state for

View File

@ -121,8 +121,8 @@ class MixtralSparseMoeBlock(nn.Module):
# this will be used to easily index which expert is going to be sollicitated
expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
expert_hitted = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
for expert_idx in expert_hitted:
expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
for expert_idx in expert_hit:
expert_layer = self.experts[expert_idx]
idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))
# Index the correct hidden states and compute the expert hidden state for

View File

@ -201,8 +201,8 @@ class MixtralSparseMoeBlock(nn.Module):
# this will be used to easily index which expert is going to be sollicitated
expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
expert_hitted = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
for expert_idx in expert_hitted:
expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
for expert_idx in expert_hit:
expert_layer = self.experts[expert_idx]
idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))
# Index the correct hidden states and compute the expert hidden state for

View File

@ -39,7 +39,7 @@ if is_flash_attn_available():
logger = logging.get_logger(__name__)
# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func
# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func
def load_balancing_loss_func(
gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
num_experts: Optional[int] = None,
@ -118,7 +118,8 @@ def load_balancing_loss_func(
router_per_expert_attention_mask, dim=0
)
rank = routing_weights.shape[1] * int(routing_weights.device.index)
device_index = routing_weights.device.index if routing_weights.device.index is not None else 0
rank = routing_weights.shape[1] * int(device_index)
overall_loss = torch.sum(
tokens_per_expert[:, rank : rank + routing_weights.shape[1]] * router_prob_per_expert.unsqueeze(0)
)

View File

@ -55,7 +55,7 @@ _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_m
logger = logging.get_logger(__name__)
# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func
# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func
def load_balancing_loss_func(
gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
num_experts: Optional[int] = None,
@ -134,7 +134,8 @@ def load_balancing_loss_func(
router_per_expert_attention_mask, dim=0
)
rank = routing_weights.shape[1] * int(routing_weights.device.index)
device_index = routing_weights.device.index if routing_weights.device.index is not None else 0
rank = routing_weights.shape[1] * int(device_index)
overall_loss = torch.sum(
tokens_per_expert[:, rank : rank + routing_weights.shape[1]] * router_prob_per_expert.unsqueeze(0)
)

View File

@ -59,7 +59,6 @@ if is_torch_flex_attn_available():
logger = logging.get_logger(__name__)
# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func
def load_balancing_loss_func(
gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
num_experts: Optional[int] = None,
@ -138,7 +137,8 @@ def load_balancing_loss_func(
router_per_expert_attention_mask, dim=0
)
rank = routing_weights.shape[1] * int(routing_weights.device.index)
device_index = routing_weights.device.index if routing_weights.device.index is not None else 0
rank = routing_weights.shape[1] * int(device_index)
overall_loss = torch.sum(
tokens_per_expert[:, rank : rank + routing_weights.shape[1]] * router_prob_per_expert.unsqueeze(0)
)
@ -621,8 +621,8 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
# Loop over all available experts in the model and perform the computation on each expert
expert_hitted = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
for expert_idx in expert_hitted:
expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
for expert_idx in expert_hit:
expert_layer = self.experts[expert_idx]
idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))

View File

@ -244,8 +244,8 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
# Loop over all available experts in the model and perform the computation on each expert
expert_hitted = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
for expert_idx in expert_hitted:
expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
for expert_idx in expert_hit:
expert_layer = self.experts[expert_idx]
idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))

View File

@ -100,8 +100,8 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
# Loop over all available experts in the model and perform the computation on each expert
expert_hitted = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
for expert_idx in expert_hitted:
expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
for expert_idx in expert_hit:
expert_layer = self.experts[expert_idx]
idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))

View File

@ -56,33 +56,44 @@ class Mxfp4HfQuantizer(HfQuantizer):
"Using mxfp4 quantization requires torch"
"Please install the latest version of torch ( pip install --upgrade torch )"
)
if self.quantization_config.dequantize:
return
if not torch.cuda.is_available():
raise RuntimeError("Using MXFP4 quantized models requires a GPU")
if not is_accelerate_available():
raise ImportError("Using mxfp4 requires Accelerate: `pip install accelerate`")
if self.quantization_config.dequantize:
return
compute_capability = torch.cuda.get_device_capability()
major, minor = compute_capability
gpu_is_supported = compute_capability >= (7, 5)
kernels_available = is_triton_available("3.4.0") and is_triton_kernels_availalble()
if not is_triton_available("3.4.0") or not is_triton_kernels_availalble():
if self.pre_quantized and not self.quantization_config.dequantize:
if self.pre_quantized:
# On unsupported GPUs or without kernels, we will dequantize the model to bf16
if not gpu_is_supported:
logger.warning_once(
"MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 (e.g T4, A100, L4, H100, or B200). "
"We will default to dequantizing the model to bf16."
)
self.quantization_config.dequantize = True
return
if not kernels_available:
logger.warning_once(
"MXFP4 quantization requires triton >= 3.4.0 and triton_kernels installed, we will default to dequantizing the model to bf16"
)
self.quantization_config.dequantize = True
return
else:
# we can't quantize the model in this case so we raise an error
raise ValueError("MXFP4 quantization requires triton >= 3.4.0 and triton_kernels installed")
if major < 9:
elif not gpu_is_supported:
# we can't quantize the model in this case so we raise an error
raise ValueError(
"MXFP4 quantized models is only supported on GPUs with compute capability >= 9.0 (e.g H100, or B100)"
"MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 (e.g T4, A100, L4, H100, or B200)"
)
elif not kernels_available:
# we can't quantize the model in this case so we raise an error
raise ValueError("MXFP4 quantization requires triton >= 3.4.0 and triton_kernels installed")
device_map = kwargs.get("device_map", None)
if device_map is None:

View File

@ -0,0 +1,373 @@
import re
from typing import Optional, Union
import matplotlib.pyplot as plt
import numpy as np
import requests
import torch
from PIL import Image
from ..image_transforms import convert_to_rgb, to_pil_image, unnormalize
from ..models.auto import AutoConfig, AutoProcessor
# Architectures that are incompatible with this util and should raise immediately:
INCOMPATIBLE_MODELS = [
"bit",
"colpali",
"colqwen2",
"convnext",
"d_fine",
"data2vec",
"efficientloftr",
"efficientnet",
"fuyu",
"gemma3",
"glm4v",
"glpn",
"hgnet_v2",
"hiera",
"internvl",
"janus",
"layoutlmv3",
"levit",
"lightglue",
"llama4",
"mistral3",
"mllama",
"mobilevit",
"mobilevitv2",
"musicgen",
"musicgen_melody",
"oneformer",
"perceiver",
"perception_lm",
"phi4_multimodal",
"qwen2_5_omni",
"qwen2_5_vl",
"qwen2_vl",
"regnet",
"resnet",
"superglue",
"superpoint",
"swin2sr",
"timm_wrapper",
"tvp",
"udop",
"vitmatte",
"vitpose",
"vjepa2",
"whisper",
]
DEFAULT_IMAGE_URL = (
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/hf-logo-224x224.png"
)
def _looks_like_global(tile: np.ndarray, base: Image.Image, *, mae_tol: float = 0.3) -> bool:
"""
Very simple heuristic: resize the base image to the tile's resolution and treat the tile as the
global/original view when the mean absolute error between them is below `mae_tol`.
"""
base_r = base.convert("RGB").resize(tile.shape[:2][::-1], Image.BILINEAR)
base_np = np.asarray(base_r).astype(np.float32) / 255.0
tile_f32 = tile.astype(np.float32)
if tile_f32.max() > 1.5:
tile_f32 /= 255.0
mae = np.abs(tile_f32 - base_np).mean()
return mae < mae_tol
class ImageVisualizer:
def __init__(self, repo_id: str):
self.processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=False)
self.config = AutoConfig.from_pretrained(repo_id, trust_remote_code=False)
if hasattr(self.processor, "image_processor"):
image_processor = self.processor.image_processor
elif hasattr(self.processor, "image_mean"):
image_processor = self.processor # weak test, but works most of the time
else:
raise ValueError(f"No image processor found for {repo_id}.")
self.channel_means = getattr(image_processor, "image_mean", [0.485, 0.456, 0.406])
self.channel_stds = getattr(image_processor, "image_std", [0.229, 0.224, 0.225])
if hasattr(self.processor, "image_token"):
self.image_token_marker = self.processor.image_token
elif hasattr(self.processor, "image_token_id"):
self.image_token_marker = self.processor.decode(self.processor.image_token_id)
else:
self.image_token_marker = "<image>"
self.default_prompt = f"{self.image_token_marker} How does it look?"
self.vision_config = getattr(self.config, "vision_config", None)
self.patch_size = getattr(self.vision_config, "patch_size", getattr(image_processor, "patch_size", 14))
self.merge_size = getattr(image_processor, "merge_size", 1)
def _pixel_values_as_tensor(
self, pixel_values: Union[torch.Tensor, np.ndarray, list[np.ndarray], list[torch.Tensor]]
):
"""
Normalize input to a 4D tensor with shape (batch, channels, height, width).
Supports input of shape:
- (B, C, H, W)
- (B, N, C, H, W) -> flattened to (B*N, C, H, W)
- (C, H, W) -> expanded to (1, C, H, W)
- list/tuple of arrays or tensors
"""
if isinstance(pixel_values, (list, tuple)):
tensor_list = [pv if isinstance(pv, torch.Tensor) else torch.tensor(pv) for pv in pixel_values]
pixel_values = torch.stack(tensor_list, dim=0)
if not isinstance(pixel_values, torch.Tensor):
pixel_values = torch.tensor(pixel_values)
if pixel_values.ndim == 5:
batch_size, num_images, num_channels, height, width = pixel_values.shape
pixel_values = pixel_values.view(batch_size * num_images, num_channels, height, width)
elif pixel_values.ndim == 4:
pass
elif pixel_values.ndim == 3:
pixel_values = pixel_values.unsqueeze(0)
else:
raise ValueError(f"Unexpected pixel tensor shape {pixel_values.shape}")
return pixel_values
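
Illustrative shapes for the normalization `_pixel_values_as_tensor` performs (random placeholder tensors):

```python
import torch

batched = torch.rand(2, 3, 3, 224, 224)        # (B, N, C, H, W): two samples, three tiles each
flat = batched.view(2 * 3, 3, 224, 224)        # -> (B*N, C, H, W), the layout the visualizer plots
single = torch.rand(3, 224, 224).unsqueeze(0)  # (C, H, W) -> (1, C, H, W)
print(flat.shape, single.shape)
```
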
def _display_single_image(self, image_array: np.ndarray, show_patch_grid: bool, figsize=(7, 7)):
plt.figure(figsize=figsize)
plt.imshow(image_array)
plt.xticks([])
plt.yticks([])
if show_patch_grid:
height, width = image_array.shape[:2]
step = max(1, min(height, width) // self.patch_size)
for x_pos in range(0, width, step):
plt.axvline(x_pos, color="red", linewidth=0.5)
for y_pos in range(0, height, step):
plt.axhline(y_pos, color="red", linewidth=0.5)
caption = f"{width}×{height} | mean={', '.join(f'{m:.3f}' for m in self.channel_means)} | std={', '.join(f'{s:.3f}' for s in self.channel_stds)}"
plt.tight_layout()
plt.figtext(0.5, -0.02, caption, ha="center", va="top", fontsize=12)
plt.show()
def _display_tiled_images(
self,
tiles_array: np.ndarray,
source_image: Image.Image,
rows: Optional[int] = None,
cols: Optional[int] = None,
aspect_ratio: float = 1.0,
add_grid: bool = True,
figsize=(7, 7),
):
"""
Display a grid of image tiles. Attempts to detect and preserve the original/global image tile,
which is then shown separately at the end.
"""
num_tiles = tiles_array.shape[0]
original_tile_index = None
saved_original_tile = None
for idx in (0, num_tiles - 1):
if _looks_like_global(tiles_array[idx], source_image):
original_tile_index = idx
break
if original_tile_index is not None:
saved_original_tile = tiles_array[original_tile_index]
tiles_array = np.delete(tiles_array, original_tile_index, axis=0)
num_tiles -= 1
# Infer grid if not specified
grid_rows, grid_cols = rows, cols
if grid_rows is None or grid_cols is None:
if aspect_ratio >= 1:
guessed_cols = int(np.ceil(np.sqrt(num_tiles * aspect_ratio)))
guessed_rows = int(np.ceil(num_tiles / max(guessed_cols, 1)))
else:
guessed_rows = int(np.ceil(np.sqrt(num_tiles / max(aspect_ratio, 1e-8))))
guessed_cols = int(np.ceil(num_tiles / max(guessed_rows, 1)))
grid_rows = grid_rows if grid_rows is not None else guessed_rows
grid_cols = grid_cols if grid_cols is not None else guessed_cols
fig, axes = plt.subplots(grid_rows, grid_cols, figsize=figsize, squeeze=False)
tile_index = 0
for row_idx in range(grid_rows):
for col_idx in range(grid_cols):
ax = axes[row_idx, col_idx]
if tile_index < num_tiles:
tile_image = tiles_array[tile_index]
ax.imshow(tile_image)
ax.set_xticks([])
ax.set_yticks([])
if add_grid:
height, width = tile_image.shape[:2]
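# Same patch-grid overlay as in `_display_single_image`, applied per tile.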
step = max(1, min(height, width) // self.patch_size)
for x_pos in range(0, width, step):
ax.axvline(x_pos, color="red", linewidth=0.5)
for y_pos in range(0, height, step):
ax.axhline(y_pos, color="red", linewidth=0.5)
else:
ax.axis("off")
tile_index += 1
unique = sorted({f"{t.shape[1]}×{t.shape[0]}" for t in tiles_array})
sizes = ", ".join(unique)
caption = f"{tiles_array.shape[0]} patches | {sizes} | mean={', '.join(f'{m:.3f}' for m in self.channel_means)} | std={', '.join(f'{s:.3f}' for s in self.channel_stds)}"
plt.tight_layout()
fig.text(0.5, 0.02, caption, ha="center", va="bottom", fontsize=12)
plt.show()
if saved_original_tile is not None:
fig2, ax2 = plt.subplots(figsize=figsize)
ax2.imshow(saved_original_tile)
ax2.set_xticks([])
ax2.set_yticks([])
ax2.set_aspect("equal", adjustable="box")
fig2.subplots_adjust(left=0, right=1, top=1, bottom=0) # no clipping
h0, w0 = saved_original_tile.shape[:2]
caption = f"{w0}×{h0} | mean={', '.join(f'{m:.3f}' for m in self.channel_means)} | std={', '.join(f'{s:.3f}' for s in self.channel_stds)}"
fig2.text(0.5, 0.02, caption, ha="center", va="bottom", fontsize=12)
plt.show()
def default_message(self, full_output: bool = False) -> str:
"""
Build a single formatted prompt string using the processor's chat template.
Contains one image (HF logo) and one user text message.
If available, adds the generation prompt as well.
Falls back to a minimal image-token prompt if no chat template is available.
"""
# ensure this is a multimodal processor with image + tokenizer
if not (
hasattr(self.processor, "attributes")
and "image_processor" in self.processor.attributes
and "tokenizer" in self.processor.attributes
):
raise RuntimeError(
"Processor does not expose both 'image_processor' and 'tokenizer'; cannot build multimodal example."
)
conversation = [
{
"role": "user",
"content": [
{
"type": "image",
"url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/hf-logo-224x224.png",
},
{"type": "text", "text": "Please describe this image."},
],
}
]
try:
print("For a 224x224 RGB png image: \n")
decoded_message = self.processor.batch_decode(
self.processor.apply_chat_template(
conversation,
add_generation_prompt=True,
tokenize=True,
return_dict=False,
truncation=False,
),
skip_special_tokens=False,
)[0]
image_token_string = getattr(self.processor, "image_token", "<image>")
token_escaped = re.escape(image_token_string)
image_token_run_pattern = re.compile(rf"(?:{token_escaped})(?:\s*{token_escaped}){{2,}}")
def compress_image_token_run(match: re.Match) -> str:
n_tokens = match.group(0).count(image_token_string)
return f"{image_token_string}[...{n_tokens} tokens...]{image_token_string}"
if full_output:
return decoded_message
else:
return image_token_run_pattern.sub(compress_image_token_run, decoded_message)
except ValueError:
image_token_string = getattr(
self.processor,
"image_token",
getattr(getattr(self.processor, "tokenizer", None), "image_token", "<image>"),
)
return f"{image_token_string} {'Please describe this image.'}"
def visualize(
self,
images: Optional[Union[Image.Image, np.ndarray, str, list[Union[Image.Image, np.ndarray, str]]]] = None,
rows: Optional[int] = None,
cols: Optional[int] = None,
add_grid: bool = True,
figsize=(12, 12),
):
"""
Visualize an image as the processor prepares it for the model. Only a single image is supported.
If the processor splits the image into multiple tiles, they are displayed in a grid with an optional patch-grid overlay.
"""
if images is None:
images = Image.open(requests.get(DEFAULT_IMAGE_URL, stream=True).raw)
if not isinstance(images, list):
images = [images]
else:
if len(images) > 1:
raise ValueError(
"You passed a list of several images. Only single images are accepted by the visualizer."
)
pil_images = [convert_to_rgb(to_pil_image(x)) for x in images]
img_width, img_height = pil_images[0].size
aspect_ratio = img_width / max(img_height, 1)
processed_inputs = self.processor(images=pil_images, text=self.default_prompt, return_tensors="pt")
pixel_values = processed_inputs["pixel_values"]
unnormalized = unnormalize(pixel_values, mean=self.channel_means, std=self.channel_stds)
if unnormalized.ndim == 3 or unnormalized.shape[0] == 1:
self._display_single_image(
unnormalized[0] if unnormalized.ndim == 4 else unnormalized,
show_patch_grid=add_grid,
figsize=figsize,
)
return
elif unnormalized.ndim != 4:
raise ValueError(f"Unsupported shape after unnormalization: {unnormalized.shape}")
num_tiles = unnormalized.shape[0]
if rows is None or cols is None:
tile_h, tile_w = unnormalized.shape[1:3]
tile_aspect = tile_w / tile_h if tile_h > 0 else 1.0
target_aspect = aspect_ratio / tile_aspect
best_rows, best_cols = 1, num_tiles
min_diff = float("inf")
for r in range(1, num_tiles + 1):
c = int(np.ceil(num_tiles / r))
diff = abs((c / r) - target_aspect)
if diff < min_diff:
min_diff = diff
best_rows, best_cols = r, c
rows = best_rows
cols = best_cols
self._display_tiled_images(
unnormalized,
pil_images[0],
rows=rows,
cols=cols,
aspect_ratio=aspect_ratio,
add_grid=add_grid,
figsize=figsize,
)
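# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only; the class name `ImageVisualizer`
# and the checkpoint id are assumptions, not part of the original file):
#
#   viz = ImageVisualizer("Qwen/Qwen2-VL-2B-Instruct")
#   print(viz.default_message())   # preview the templated multimodal prompt
#   viz.visualize()                # render the module's default image with the patch grid
# ---------------------------------------------------------------------------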

View File

@ -107,18 +107,31 @@ class Mxfp4QuantizerTest(unittest.TestCase):
def test_quantizer_validation_low_compute_capability(self):
"""Test quantizer validation with low compute capability"""
with patch("torch.cuda.get_device_capability", return_value=(8, 0)):
with patch("torch.cuda.get_device_capability", return_value=(7, 0)):
from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer
config = Mxfp4Config()
quantizer = Mxfp4HfQuantizer(config)
quantizer.pre_quantized = False
with self.assertRaises(ValueError):
quantizer.validate_environment()
def test_quantizer_validation_low_compute_capability_with_prequantized(self):
"""Test quantizer validation with low compute capability"""
with patch("torch.cuda.get_device_capability", return_value=(7, 0)):
from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer
config = Mxfp4Config()
quantizer = Mxfp4HfQuantizer(config)
with self.assertRaises(ValueError):
quantizer.validate_environment()
# Should automatically set dequantize=True and warn
quantizer.validate_environment()
self.assertTrue(quantizer.quantization_config.dequantize)
def test_quantizer_validation_low_compute_capability_with_dequantize(self):
"""Test quantizer validation with low compute capability but dequantize enabled"""
with patch("torch.cuda.get_device_capability", return_value=(8, 0)):
with patch("torch.cuda.get_device_capability", return_value=(7, 0)):
from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer
config = Mxfp4Config(dequantize=True)
@ -131,6 +144,52 @@ class Mxfp4QuantizerTest(unittest.TestCase):
if "compute capability" in str(e):
self.fail("Should not raise compute capability error when dequantize=True")
def test_quantizer_validation_dequantize_on_cpu(self):
"""Test quantizer validation with dequantize enabled on CPU-only environment"""
with patch("torch.cuda.is_available", return_value=False):
from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer
config = Mxfp4Config(dequantize=True)
quantizer = Mxfp4HfQuantizer(config)
# Should not raise error when dequantize=True even without CUDA
try:
quantizer.validate_environment()
except RuntimeError as e:
if "requires a GPU" in str(e):
self.fail("Should not raise GPU requirement error when dequantize=True on CPU")
def test_quantizer_validation_order_dequantize_before_cuda_check(self):
"""Test that dequantize check happens before CUDA availability check"""
# Mock both torch.cuda.is_available and is_accelerate_available to return False
with (
patch("torch.cuda.is_available", return_value=False),
patch(
"transformers.quantizers.quantizer_mxfp4.is_accelerate_available",
return_value=False,
),
):
from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer
# Test with dequantize=True - should pass even without CUDA and accelerate
config = Mxfp4Config(dequantize=True)
quantizer = Mxfp4HfQuantizer(config)
# This should not raise any error because dequantize check comes first
try:
quantizer.validate_environment()
except (RuntimeError, ImportError) as e:
if "requires a GPU" in str(e) or "requires Accelerate" in str(e):
self.fail(f"Should not raise error when dequantize=True: {e}")
# Test with dequantize=False - should still fail due to missing CUDA
config = Mxfp4Config(dequantize=False)
quantizer = Mxfp4HfQuantizer(config)
with self.assertRaises(RuntimeError) as context:
quantizer.validate_environment()
self.assertIn("requires a GPU", str(context.exception))
def test_quantizer_validation_missing_triton(self):
"""Test quantizer validation when triton is not available"""
with (

View File

@ -171,6 +171,12 @@ if __name__ == "__main__":
print(f"start_commit: {args.start_commit}")
print(f"end_commit: {args.end_commit}")
# `get_commit_info` uses `requests.get()` to fetch commit info from `api.github.com` without a token.
# If a workflow run has many new failed tests, this script may eventually fail with a `KeyError` at
# `pr_number = pr_info_for_commit[0]["number"]` once the rate limit is hit.
# Cache the commit info and reuse it whenever possible.
commit_info_cache = {}
if len({args.test is None, args.file is None}) != 2:
raise ValueError("Exactly one argument `test` or `file` must be specified.")
@ -191,7 +197,14 @@ if __name__ == "__main__":
for test in failed_tests:
commit = find_bad_commit(target_test=test, start_commit=args.start_commit, end_commit=args.end_commit)
info = {"test": test, "commit": commit}
info.update(get_commit_info(commit))
if commit in commit_info_cache:
commit_info = commit_info_cache[commit]
else:
commit_info = get_commit_info(commit)
commit_info_cache[commit] = commit_info
info.update(commit_info)
failed_tests_with_bad_commits.append(info)
# If no single-gpu test failures, remove the key

View File

@ -345,6 +345,8 @@ SPECIAL_CASES_TO_ALLOW.update(
"IdeficsConfig": True,
"IdeficsVisionConfig": True,
"IdeficsPerceiverConfig": True,
# TODO: @Arthur/Joao (`hidden_act` unused)
"GptOssConfig": True,
}
)