Mirror of https://github.com/huggingface/transformers.git
Synced 2025-10-27 14:54:34 +08:00
Compare commits (3 commits): add-fp8-ll ... fix_eetq_t
| Author | SHA1 | Date |
|---|---|---|
| | 29845c2460 | |
| | 4a7158d05c | |
| | b2e08a8466 | |
@@ -15,7 +15,6 @@
 # limitations under the License.
 import copy
 import inspect
-import os
 import warnings
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
@@ -1030,6 +1029,10 @@ class GenerationMixin:
                 "You have explicitly specified `forced_decoder_ids`. Please remove the `forced_decoder_ids` argument "
                 "in favour of `input_ids` or `decoder_input_ids` respectively.",
             )
+        if generation_config.watermarking_config is not None:
+            processors.append(
+                generation_config.watermarking_config.construct_processor(self.config.vocab_size, device)
+            )

         # TODO (joao): find a strategy to specify the order of the processors
         processors = self._merge_criteria_processor_list(processors, logits_processor)
@@ -1082,12 +1085,6 @@
                 )
             )

-        # Watermarking should be after all logits processing is finished (see #34630)
-        if generation_config.watermarking_config is not None:
-            processors.append(
-                generation_config.watermarking_config.construct_processor(self.config.vocab_size, device)
-            )
-
         # `LogitNormalization` should always be the last logit processor, when present
         if generation_config.renormalize_logits is True:
             processors.append(LogitNormalization())
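The two hunks above move the watermarking logits processor from the tail of the processor list (where #34630 had placed it) back to an earlier position. For orientation, here is a minimal, hedged sketch of the user-facing path that ends up in `construct_processor`; the checkpoint and config values are illustrative assumptions, not taken from these commits.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, WatermarkingConfig

model_id = "gpt2"  # placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

# Sampling with a green/red-list watermark; generate() builds the processor via
# generation_config.watermarking_config.construct_processor(vocab_size, device).
watermarking_config = WatermarkingConfig(bias=2.5, seeding_scheme="selfhash")
inputs = tokenizer("The quick brown fox", return_tensors="pt")
output = model.generate(
    **inputs, do_sample=True, max_new_tokens=20, watermarking_config=watermarking_config
)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```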
@@ -3225,16 +3222,6 @@ class GenerationMixin:
         unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
         model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)

-        def model_forward(model, *args, **kwargs):
-            return model.forward(*args, **kwargs)
-
-        if isinstance(model_kwargs.get("past_key_values"), StaticCache):
-            if self.device.type == "cuda":
-                logger.warning_once("Using `torch.compile`.")
-                os.environ["TOKENIZERS_PARALLELISM"] = "0"
-                model_forward = torch.compile(model_forward, mode="reduce-overhead", fullgraph=True)
-
-        i = 0
         while self._has_unfinished_sequences(
             this_peer_finished, synced_gpus, device=input_ids.device, cur_len=cur_len, max_length=max_length
         ):
@@ -3245,11 +3232,8 @@
             model_inputs.update({"output_attentions": output_attentions} if output_attentions else {})
             model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {})

-            if i == 0:
-                outputs = self(**model_inputs, return_dict=True)
-                i += 1
-            else:
-                outputs = model_forward(self, return_dict=True, **model_inputs)
+            # forward pass to get next token
+            outputs = self(**model_inputs, return_dict=True)

             # synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping
             model_kwargs = self._update_model_kwargs_for_generation(
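These two hunks drop the ad-hoc `torch.compile` wrapper that kicked in when `past_key_values` was a `StaticCache`. As a reference point, a hedged sketch of the user-facing static-cache path that branch targeted; the checkpoint is a placeholder, and the explicit `torch.compile` line at the end is optional.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-3.2-1B"  # placeholder; any model with static-cache support
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

# A static KV cache keeps decoding shapes fixed, which is what makes the step compilable.
inputs = tokenizer("Static caches keep tensor shapes fixed,", return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=16, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))

# Compiling the decoding step explicitly (instead of relying on the removed branch):
model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
```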
@@ -52,6 +52,7 @@ from .pytorch_utils import ( # noqa: F401
     find_pruneable_heads_and_indices,
     id_tensor_storage,
     is_torch_greater_or_equal_than_1_13,
+    is_torch_greater_or_equal_than_2_4,
     prune_conv1d_layer,
     prune_layer,
     prune_linear_layer,
@@ -89,7 +90,6 @@ from .utils import (
     is_peft_available,
     is_remote_url,
     is_safetensors_available,
-    is_torch_greater_or_equal,
     is_torch_sdpa_available,
     is_torch_xla_available,
     logging,
@@ -5032,7 +5032,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
             device_mesh (`torch.distributed.DeviceMesh`):
                 The device mesh to use for tensor parallelism.
         """
-        if not is_torch_greater_or_equal("2.5"):
+        if not is_torch_greater_or_equal_than_2_4:
             raise EnvironmentError("tensor parallel is only supported for `torch>=2.5`.")

         # Tensor parallelize a nn.Module based on the `_tp_plan` attribute of the module.
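The hunk above only swaps the torch version guard inside the tensor-parallel entry point whose docstring is shown. For context, a hedged sketch of driving that entry point, assuming it is exposed as `model.tensor_parallel(device_mesh)` (the method name and checkpoint are assumptions, not established by this diff); it would be launched with `torchrun`.

```python
import os

from torch.distributed.device_mesh import init_device_mesh
from transformers import AutoModelForCausalLM

world_size = int(os.environ.get("WORLD_SIZE", "1"))
device_mesh = init_device_mesh("cuda", (world_size,))  # 1-D mesh over all ranks

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")  # placeholder
model.tensor_parallel(device_mesh)  # assumed entry point; shards parameters per the model's `_tp_plan`
```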
@@ -38,7 +38,6 @@ from ...utils import TensorType, is_vision_available, logging


 logger = logging.get_logger(__name__)
-MAX_IMAGE_SIZE = 4096  # 4k resolution as absolute maximum


 if is_vision_available():
@@ -117,6 +116,7 @@ def _resize_output_size_scale_below_upper_bound(
 def get_resize_output_image_size(
     image,
     resolution_max_side: int,
+    max_image_size: int = 1820,
     input_data_format: Optional[Union[str, ChannelDimension]] = None,
 ) -> Tuple[int, int]:
     """
@@ -126,18 +126,24 @@ def get_resize_output_image_size(
             Image to resize.
         resolution_max_side (`int`):
             The longest edge of the image will be resized to this value. The shortest edge will be resized to keep the
-            input aspect ratio.
+            input aspect ratio, with a lower bound of `min_image_size`.
+        max_image_size (`int`, *optional*, defaults to 1820):
+            Maximum image resolution. If the image is larger than this size, the longest edge will be resized to this
+            value, with the shortest edge resized to keep the input aspect ratio, with a lower bound of `min_image_size`.
         input_data_format (`ChannelDimension` or `str`):
             The channel dimension format of the input image.
     Returns:
         The output size of the image after resizing.
     """
+    if resolution_max_side > max_image_size:
+        raise ValueError("`resolution_max_side` cannot be larger than `max_image_size`")
+
     height, width = get_image_size(image, channel_dim=input_data_format)

     # Find the output size, when rescaling the longest edge to max_len and preserving the aspect ratio
     height, width = _resize_output_size_rescale_to_max_len(height, width, max_len=resolution_max_side)
-    # Find the output size when scaling the image to be below the MAX_IMAGE_SIZE
-    height, width = _resize_output_size_scale_below_upper_bound(height, width, max_len=MAX_IMAGE_SIZE)
+    # Find the output size when scaling the image to be below the max_image_size
+    height, width = _resize_output_size_scale_below_upper_bound(height, width, max_len=max_image_size)
     return height, width
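The arithmetic behind this docstring change is simple but easy to misread, so here is a standalone sketch of it (the helper structure and the `min_image_size` lower bound mirror the names used in this file; the function below is illustrative, not the library code):

```python
def sketch_resize_output_size(height, width, resolution_max_side, max_image_size=1820, min_image_size=1):
    """Scale the longest edge to `resolution_max_side`, then clamp below `max_image_size`."""

    def rescale_to_max_len(h, w, max_len):
        scale = max_len / max(h, w)
        return max(round(h * scale), min_image_size), max(round(w * scale), min_image_size)

    h, w = rescale_to_max_len(height, width, resolution_max_side)
    if max(h, w) > max_image_size:  # enforce the upper bound while keeping the aspect ratio
        h, w = rescale_to_max_len(h, w, max_image_size)
    return h, w


print(sketch_resize_output_size(3000, 2000, resolution_max_side=2048))  # (1820, 1213)
```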
@@ -245,7 +251,7 @@ def convert_to_rgb(
     data_format = input_data_format if data_format is None else data_format

     mode = "P" if palette is not None else None
-    image = to_pil_image(image, image_mode=mode, input_data_format=input_data_format)
+    image = to_pil_image(image, image_mode=mode)
     if image.mode == "P" and palette is not None:
         image.putpalette(palette)
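For readers unfamiliar with palettized images, a small PIL-only illustration of the `"P"` mode plus `putpalette` handling used above (toy data, not from the diff):

```python
import numpy as np
from PIL import Image

indices = np.array([[0, 1], [1, 0]], dtype=np.uint8)      # palette indices, shape (2, 2)
palette = [255, 0, 0, 0, 0, 255] + [0] * (768 - 6)        # index 0 -> red, index 1 -> blue

image = Image.fromarray(indices, mode="P")
image.putpalette(palette)
rgb = image.convert("RGB")
print(rgb.getpixel((0, 0)))  # (255, 0, 0)
```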
@@ -398,7 +404,7 @@ class Idefics3ImageProcessor(BaseImageProcessor):
         image_mode = None
         if image.ndim == 2 or image.shape[-1] == 1:
             image_mode = "P"
-        image = to_pil_image(image, image_mode=image_mode, input_data_format=input_data_format)
+        image = to_pil_image(image, image_mode=image_mode)

         resized_image = image.resize((size[1], size[0]), resample=resample)
         resized_image = np.array(resized_image)
@@ -748,16 +754,6 @@ class Idefics3ImageProcessor(BaseImageProcessor):
         # All transformations expect numpy arrays.
         images_list = [[to_numpy_array(image) for image in images] for images in images_list]

-        # Extra channel dimension for grayscale images
-        if input_data_format in [ChannelDimension.LAST, None]:
-            images_list = [
-                [np.expand_dims(img, axis=-1) if img.ndim == 2 else img for img in images] for images in images_list
-            ]
-        elif input_data_format == ChannelDimension.FIRST:
-            images_list = [
-                [np.expand_dims(img, axis=0) if img.ndim == 2 else img for img in images] for images in images_list
-            ]
-
         if is_scaled_image(images_list[0][0]) and do_rescale:
             logger.warning_once(
                 "It looks like you are trying to rescale already rescaled images. If the input"
@@ -768,6 +764,18 @@ class Idefics3ImageProcessor(BaseImageProcessor):
         if input_data_format is None:
             input_data_format = infer_channel_dimension_format(images_list[0][0], num_channels=(1, 3, 4))

+        # Extra channel dimension for grayscale images
+        if input_data_format == ChannelDimension.LAST:
+            images_list = [
+                [np.expand_dims(img, axis=-1) if img.ndim == 2 else img for img in images] for images in images_list
+            ]
+        elif input_data_format == ChannelDimension.FIRST:
+            images_list = [
+                [np.expand_dims(img, axis=0) if img.ndim == 2 else img for img in images] for images in images_list
+            ]
+        else:
+            raise ValueError(f"Invalid channel dimension format {input_data_format}.")
+
         if do_resize:
             images_list = [
                 [
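The block added above only normalizes grayscale inputs so that downstream transforms always see a channel axis. A standalone numpy illustration of the same operation:

```python
import numpy as np

gray = np.zeros((32, 48), dtype=np.uint8)        # H x W image with no channel axis

channels_last = np.expand_dims(gray, axis=-1)    # (32, 48, 1), ChannelDimension.LAST
channels_first = np.expand_dims(gray, axis=0)    # (1, 32, 48), ChannelDimension.FIRST
print(channels_last.shape, channels_first.shape)
```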
@@ -1,36 +0,0 @@
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-
-from transformers import FbgemmFp8Config, LlamaForCausalLM
-
-
-modules_to_not_convert = []
-
-# As defined by Meta, we don't quantize the first and last layers as well as the lm_head. Also, we don't quantize the self_attn layers.
-modules_to_not_convert.append("model.layers.0")
-modules_to_not_convert.append("model.layers.125")
-modules_to_not_convert.append("lm_head")
-for layer_i in range(1, 125):
-    modules_to_not_convert.append(f"model.layers.{layer_i}.self_attn")
-
-quantization_config = FbgemmFp8Config(modules_to_not_convert=modules_to_not_convert)
-model_name = "meta-llama/Llama-3.1-405B"
-
-model = LlamaForCausalLM.from_pretrained(
-    model_name, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
-)
-
-model.save_pretrained(f"{model_name}-FP8")
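The deleted script above quantizes Llama 3.1 405B to FP8 with fbgemm and writes the result with `save_pretrained`. As a hedged follow-up (not part of the diff), the saved checkpoint would be reloaded from that output directory like any other quantized model, assuming `fbgemm-gpu` is installed:

```python
import torch
from transformers import AutoModelForCausalLM

# Path mirrors the script's f"{model_name}-FP8" output directory (an assumption, not from the diff).
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-405B-FP8", torch_dtype=torch.bfloat16, device_map="auto"
)
```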
@@ -21,7 +21,7 @@ from packaging import version
 from safetensors.torch import storage_ptr, storage_size
 from torch import nn

-from .utils import is_torch_greater_or_equal, is_torch_xla_available, logging
+from .utils import is_torch_xla_available, logging


 ALL_LAYERNORM_LAYERS = [nn.LayerNorm]
@@ -39,7 +39,7 @@ is_torch_greater_or_equal_than_1_13 = parsed_torch_version_base >= version.parse
 is_torch_greater_or_equal_than_1_12 = parsed_torch_version_base >= version.parse("1.12")


-if is_torch_greater_or_equal("2.5"):
+if is_torch_greater_or_equal_than_2_4:
     from torch.distributed.tensor import Replicate
     from torch.distributed.tensor.parallel import (
         ColwiseParallel,
@@ -215,9 +215,6 @@ class HfQuantizer(ABC):

         # Delete quantizer and quantization config
         del model.hf_quantizer
-        del model.config.quantization_config
-        del model.config._pre_quantization_dtype
-        model.is_quantized = False

         return model
@@ -53,20 +53,6 @@ class EetqHfQuantizer(HfQuantizer):
                 "Please install the latest version of eetq from : https://github.com/NetEase-FuXi/EETQ"
             )

-        try:
-            import eetq  # noqa: F401
-        except ImportError as exc:
-            if "shard_checkpoint" in str(exc):
-                # EETQ 1.0.0 is currently broken with the latest transformers because it tries to import the removed
-                # shard_checkpoint function, see https://github.com/NetEase-FuXi/EETQ/issues/34.
-                # TODO: Update message once eetq releases a fix
-                raise ImportError(
-                    "You are using a version of EETQ that is incompatible with the current transformers version. "
-                    "Either downgrade transformers to <= v4.46.3 or, if available, upgrade EETQ to > v1.0.0."
-                ) from exc
-            else:
-                raise
-
         if not is_accelerate_available():
             raise ImportError("Loading an EETQ quantized model requires accelerate (`pip install accelerate`)")
@@ -1143,17 +1143,7 @@ def require_eetq(test_case):
     """
     Decorator marking a test that requires eetq
     """
-    eetq_available = is_eetq_available()
-    if eetq_available:
-        try:
-            import eetq  # noqa: F401
-        except ImportError as exc:
-            if "shard_checkpoint" in str(exc):
-                # EETQ 1.0.0 is currently broken with the latest transformers because it tries to import the removed
-                # shard_checkpoint function, see https://github.com/NetEase-FuXi/EETQ/issues/34.
-                # TODO: Remove once eetq releases a fix and this release is used in CI
-                eetq_available = False
-    return unittest.skipUnless(eetq_available, "test requires eetq")(test_case)
+    return unittest.skipUnless(is_eetq_available(), "test requires eetq")(test_case)


 def require_av(test_case):
@@ -1006,6 +1006,17 @@ def is_auto_gptq_available():


 def is_eetq_available():
+    if not _eetq_available:
+        return _eetq_available
+
+    try:
+        from eetq import EetqLinear  # noqa: F401
+    except ImportError as exc:
+        if "shard_checkpoint" in str(exc):
+            # eetq is currently broken with newer transformers versions because it tries to import shard_checkpoint
+            # see https://github.com/NetEase-FuXi/EETQ/issues/34
+            # TODO: Remove once eetq releasees a fix and this release is used in CI
+            return False
     return _eetq_available
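Taken together, the three EETQ hunks move the `shard_checkpoint` workaround out of the quantizer and the test decorator and into `is_eetq_available()`, so callers only need the availability check. A hedged sketch of how a test module would consume it (the test class itself is a placeholder, not from this diff):

```python
import unittest

from transformers.testing_utils import require_eetq
from transformers.utils import is_eetq_available


@require_eetq
class EetqAvailabilityTest(unittest.TestCase):
    def test_eetq_is_importable(self):
        self.assertTrue(is_eetq_available())
```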
@@ -14,7 +14,6 @@
 # limitations under the License.


-import collections
 import copy
 import gc
 import inspect
@@ -2451,58 +2450,6 @@ class UtilsFunctionsTest(unittest.TestCase):
         self.assertTrue(n_matches.item() == 2)
         self.assertTrue(validated_tokens.tolist()[0] == [1, 4, 8])

-    def test_speculative_sampling_target_distribution(self):
-        """
-        Asserts that the target distribution is preserved.
-        Should help with catching issues like #32867.
-        """
-        # assume vocab size 10, input length 5 + 3 generated candidates
-        candidate_input_ids = torch.tensor([[8, 0, 3, 9, 8, 1, 4, 5]])  # input tokens
-        candidate_logits = torch.tensor(
-            [
-                [
-                    [-10.0, 10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0],  # generated 1
-                    [-10.0, -10.0, -10.0, -10.0, 10.0, -10.0, -10.0, -10.0, -10.0, -10.0],  # generated 4
-                    [-10.0, -10.0, -10.0, -10.0, -10.0, 10.0, -10.0, -10.0, -10.0, -10.0],  # generated 5
-                ]
-            ]
-        )
-        candidate_length = 3
-        inf = float("inf")
-        new_logits = torch.tensor(
-            [
-                [
-                    # accepts 1:
-                    [-inf, 10.0, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
-                    # accepts 4:
-                    [-inf, -inf, -inf, -inf, 10.0, -inf, -inf, -inf, -inf, -inf],
-                    # most likely to be 1 or 8, less likely to be 3, then 7, and should never be any other value:
-                    [-inf, 2.0, -inf, 1.0, -inf, -inf, -inf, -0.01, 2.0, -inf],
-                    # N/A:
-                    [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
-                ]
-            ]
-        )
-        last_assistant_token_is_eos = False
-        last_validated_token = []
-        for _ in range(10_000):
-            validated_tokens, n_matches = _speculative_sampling(
-                candidate_input_ids,
-                candidate_logits,
-                candidate_length,
-                new_logits,
-                last_assistant_token_is_eos,
-            )
-            self.assertTrue(n_matches.item() == 2)
-            self.assertTrue(validated_tokens.tolist()[0][0] == 1)
-            self.assertTrue(validated_tokens.tolist()[0][1] == 4)
-            self.assertTrue(validated_tokens.tolist()[0][2] in [1, 3, 7, 8])
-            last_validated_token.append(validated_tokens.tolist()[0][2])
-        # check that the most likely tokens are selected more often than the less likely ones
-        last_token_counts = collections.Counter(last_validated_token)
-        self.assertTrue(last_token_counts[1] > last_token_counts[3] > last_token_counts[7] > 0)
-        self.assertTrue(last_token_counts[8] > last_token_counts[3])

     @pytest.mark.generate
     @require_torch
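The removed test checks a statistical property of speculative decoding: accepted tokens must still follow the target distribution. For reference, a schematic sketch of the acceptance rule it exercises (standard speculative sampling, not the `_speculative_sampling` implementation itself):

```python
import torch


def accept_or_resample(p: torch.Tensor, q: torch.Tensor, draft_token: int) -> int:
    """p and q are 1-D probability vectors over the vocabulary; draft_token was sampled from q."""
    accept_prob = torch.clamp(p[draft_token] / q[draft_token], max=1.0)
    if torch.rand(()) < accept_prob:
        return draft_token
    # On rejection, resample from the normalized residual max(p - q, 0),
    # which is what preserves the target distribution p overall.
    residual = torch.clamp(p - q, min=0.0)
    return int(torch.multinomial(residual / residual.sum(), num_samples=1))
```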
@@ -65,7 +65,7 @@ class BitNetTest(unittest.TestCase):
         """
         Load the model
         """
-        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
+        cls.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
         cls.quantized_model = AutoModelForCausalLM.from_pretrained(cls.model_name, device_map=cls.device)

     def tearDown(self):