Compare commits

..

3 Commits

SHA1        Message                                    Date
29845c2460  fix eetq                                   2024-11-21 23:15:27 +00:00
4a7158d05c  skip eetq tests loading shard_checkpoint   2024-11-21 23:00:21 +00:00
b2e08a8466  New awq version                            2024-11-21 22:34:01 +00:00
11 changed files with 47 additions and 160 deletions

View File

@@ -15,7 +15,6 @@
# limitations under the License.
import copy
import inspect
import os
import warnings
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
@@ -1030,6 +1029,10 @@ class GenerationMixin:
"You have explicitly specified `forced_decoder_ids`. Please remove the `forced_decoder_ids` argument "
"in favour of `input_ids` or `decoder_input_ids` respectively.",
)
if generation_config.watermarking_config is not None:
processors.append(
generation_config.watermarking_config.construct_processor(self.config.vocab_size, device)
)
# TODO (joao): find a strategy to specify the order of the processors
processors = self._merge_criteria_processor_list(processors, logits_processor)
@@ -1082,12 +1085,6 @@
)
)
# Watermarking should be after all logits processing is finished (see #34630)
if generation_config.watermarking_config is not None:
processors.append(
generation_config.watermarking_config.construct_processor(self.config.vocab_size, device)
)
# `LogitNormalization` should always be the last logit processor, when present
if generation_config.renormalize_logits is True:
processors.append(LogitNormalization())
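The two hunks above move the watermarking processor relative to the other logits processors. A minimal sketch of the ordering idea, with a toy processor standing in for the watermarking one (the class, token ids, and bias value are invented for illustration; only `LogitsProcessorList`, `LogitsProcessor`, and `LogitNormalization` come from transformers):

```python
# Illustrative only: a toy processor standing in for the watermarking one,
# appended after the other processors, with LogitNormalization kept last.
import torch
from transformers import LogitNormalization, LogitsProcessor, LogitsProcessorList


class ToyBiasProcessor(LogitsProcessor):
    """Hypothetical watermark-style processor: bias a fixed set of token ids."""

    def __init__(self, greenlist_ids, bias=2.0):
        self.greenlist_ids = greenlist_ids
        self.bias = bias

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        scores[:, self.greenlist_ids] += self.bias
        return scores


processors = LogitsProcessorList()
# ... the default processors (repetition penalty, min length, ...) would be appended here ...
processors.append(ToyBiasProcessor(greenlist_ids=[42, 1337]))  # watermark-style biasing after the rest
processors.append(LogitNormalization())  # renormalization always comes last
```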
@@ -3225,16 +3222,6 @@ class GenerationMixin:
unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
def model_forward(model, *args, **kwargs):
return model.forward(*args, **kwargs)
if isinstance(model_kwargs.get("past_key_values"), StaticCache):
if self.device.type == "cuda":
logger.warning_once("Using `torch.compile`.")
os.environ["TOKENIZERS_PARALLELISM"] = "0"
model_forward = torch.compile(model_forward, mode="reduce-overhead", fullgraph=True)
i = 0
while self._has_unfinished_sequences(
this_peer_finished, synced_gpus, device=input_ids.device, cur_len=cur_len, max_length=max_length
):
@@ -3245,11 +3232,8 @@
model_inputs.update({"output_attentions": output_attentions} if output_attentions else {})
model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {})
if i == 0:
outputs = self(**model_inputs, return_dict=True)
i += 1
else:
outputs = model_forward(self, return_dict=True, **model_inputs)
# forward pass to get next token
outputs = self(**model_inputs, return_dict=True)
# synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping
model_kwargs = self._update_model_kwargs_for_generation(
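The hunks above remove the internal `torch.compile` wrapping of the forward pass that was keyed on `StaticCache`. As a hedged sketch (the checkpoint name and generation arguments are illustrative, not part of the diff), compiled decoding with a static cache can still be set up from user code:

```python
# Sketch, not the removed internal path: compile the forward pass yourself and ask
# generate() for a static cache so tensor shapes stay stable across decoding steps.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-3.2-1B"  # illustrative checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")

model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)

inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=32, cache_implementation="static")
print(tokenizer.decode(out[0], skip_special_tokens=True))
```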

View File

@@ -52,6 +52,7 @@ from .pytorch_utils import ( # noqa: F401
find_pruneable_heads_and_indices,
id_tensor_storage,
is_torch_greater_or_equal_than_1_13,
is_torch_greater_or_equal_than_2_4,
prune_conv1d_layer,
prune_layer,
prune_linear_layer,
@@ -89,7 +90,6 @@ from .utils import (
is_peft_available,
is_remote_url,
is_safetensors_available,
is_torch_greater_or_equal,
is_torch_sdpa_available,
is_torch_xla_available,
logging,
@@ -5032,7 +5032,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
device_mesh (`torch.distributed.DeviceMesh`):
The device mesh to use for tensor parallelism.
"""
if not is_torch_greater_or_equal("2.5"):
if not is_torch_greater_or_equal_than_2_4:
raise EnvironmentError("tensor parallel is only supported for `torch>=2.5`.")
# Tensor parallelize a nn.Module based on the `_tp_plan` attribute of the module.
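The check above gates tensor parallelism on the installed torch version. A self-contained sketch of both gating styles seen in the diff, re-implemented locally so it runs without transformers internals:

```python
# Both gating styles from the hunk, re-implemented locally so the sketch is self-contained.
import torch
from packaging import version

parsed_torch_version_base = version.parse(version.parse(torch.__version__).base_version)

# Precomputed boolean flag (the `is_torch_greater_or_equal_than_2_4` style):
is_torch_greater_or_equal_than_2_4 = parsed_torch_version_base >= version.parse("2.4")


# Callable check (the `is_torch_greater_or_equal("2.5")` style):
def is_torch_greater_or_equal(min_version: str) -> bool:
    return parsed_torch_version_base >= version.parse(min_version)


if not is_torch_greater_or_equal("2.5"):
    raise EnvironmentError("tensor parallel is only supported for `torch>=2.5`.")
```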

View File

@@ -38,7 +38,6 @@ from ...utils import TensorType, is_vision_available, logging
logger = logging.get_logger(__name__)
MAX_IMAGE_SIZE = 4096 # 4k resolution as absolute maximum
if is_vision_available():
@@ -117,6 +116,7 @@ def _resize_output_size_scale_below_upper_bound(
def get_resize_output_image_size(
image,
resolution_max_side: int,
max_image_size: int = 1820,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
"""
@@ -126,18 +126,24 @@ def get_resize_output_image_size(
Image to resize.
resolution_max_side (`int`):
The longest edge of the image will be resized to this value. The shortest edge will be resized to keep the
input aspect ratio.
input aspect ratio, with a lower bound of `min_image_size`.
max_image_size (`int`, *optional*, defaults to 1820):
Maximum image resolution. If the image is larger than this size, the longest edge will be resized to this
value, with the shortest edge resized to keep the input aspect ratio, with a lower bound of `min_image_size`.
input_data_format (`ChannelDimension` or `str`):
The channel dimension format of the input image.
Returns:
The output size of the image after resizing.
"""
if resolution_max_side > max_image_size:
raise ValueError("`resolution_max_side` cannot be larger than `max_image_size`")
height, width = get_image_size(image, channel_dim=input_data_format)
# Find the output size, when rescaling the longest edge to max_len and preserving the aspect ratio
height, width = _resize_output_size_rescale_to_max_len(height, width, max_len=resolution_max_side)
# Find the output size when scaling the image to be below the MAX_IMAGE_SIZE
height, width = _resize_output_size_scale_below_upper_bound(height, width, max_len=MAX_IMAGE_SIZE)
# Find the output size when scaling the image to be below the max_image_size
height, width = _resize_output_size_scale_below_upper_bound(height, width, max_len=max_image_size)
return height, width
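To make the docstring above concrete, here is a standalone sketch of the two-step size computation (longest edge to `resolution_max_side`, then cap at `max_image_size`). The helpers are illustrative re-implementations, not the library code, and the `min_image_size` lower bound is omitted:

```python
# Standalone sketch of the size computation described in the docstring above.
from typing import Tuple


def _rescale_to_max_len(height: int, width: int, max_len: int) -> Tuple[int, int]:
    # Scale so the longest edge equals max_len, preserving the aspect ratio.
    scale = max_len / max(height, width)
    return max(1, round(height * scale)), max(1, round(width * scale))


def _scale_below_upper_bound(height: int, width: int, max_len: int) -> Tuple[int, int]:
    # Only shrink if the longest edge exceeds max_len.
    if max(height, width) <= max_len:
        return height, width
    return _rescale_to_max_len(height, width, max_len)


def resize_output_size_sketch(
    height: int, width: int, resolution_max_side: int, max_image_size: int = 1820
) -> Tuple[int, int]:
    if resolution_max_side > max_image_size:
        raise ValueError("`resolution_max_side` cannot be larger than `max_image_size`")
    height, width = _rescale_to_max_len(height, width, max_len=resolution_max_side)
    height, width = _scale_below_upper_bound(height, width, max_len=max_image_size)
    return height, width


print(resize_output_size_sketch(3000, 2000, resolution_max_side=1456))  # (1456, 971)
```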
@@ -245,7 +251,7 @@ def convert_to_rgb(
data_format = input_data_format if data_format is None else data_format
mode = "P" if palette is not None else None
image = to_pil_image(image, image_mode=mode, input_data_format=input_data_format)
image = to_pil_image(image, image_mode=mode)
if image.mode == "P" and palette is not None:
image.putpalette(palette)
@@ -398,7 +404,7 @@ class Idefics3ImageProcessor(BaseImageProcessor):
image_mode = None
if image.ndim == 2 or image.shape[-1] == 1:
image_mode = "P"
image = to_pil_image(image, image_mode=image_mode, input_data_format=input_data_format)
image = to_pil_image(image, image_mode=image_mode)
resized_image = image.resize((size[1], size[0]), resample=resample)
resized_image = np.array(resized_image)
@@ -748,16 +754,6 @@ class Idefics3ImageProcessor(BaseImageProcessor):
# All transformations expect numpy arrays.
images_list = [[to_numpy_array(image) for image in images] for images in images_list]
# Extra channel dimension for grayscale images
if input_data_format in [ChannelDimension.LAST, None]:
images_list = [
[np.expand_dims(img, axis=-1) if img.ndim == 2 else img for img in images] for images in images_list
]
elif input_data_format == ChannelDimension.FIRST:
images_list = [
[np.expand_dims(img, axis=0) if img.ndim == 2 else img for img in images] for images in images_list
]
if is_scaled_image(images_list[0][0]) and do_rescale:
logger.warning_once(
"It looks like you are trying to rescale already rescaled images. If the input"
@@ -768,6 +764,18 @@ class Idefics3ImageProcessor(BaseImageProcessor):
if input_data_format is None:
input_data_format = infer_channel_dimension_format(images_list[0][0], num_channels=(1, 3, 4))
# Extra channel dimension for grayscale images
if input_data_format == ChannelDimension.LAST:
images_list = [
[np.expand_dims(img, axis=-1) if img.ndim == 2 else img for img in images] for images in images_list
]
elif input_data_format == ChannelDimension.FIRST:
images_list = [
[np.expand_dims(img, axis=0) if img.ndim == 2 else img for img in images] for images in images_list
]
else:
raise ValueError(f"Invalid channel dimension format {input_data_format}.")
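The reordered block above adds the missing channel axis for grayscale images according to the inferred channel-dimension format. A small NumPy sketch of the same idea (shapes are invented for illustration):

```python
# Sketch: add a channel axis to 2D (grayscale) arrays, placing it where the
# declared channel-dimension format expects it.
import numpy as np


def add_channel_axis(img: np.ndarray, channels_first: bool) -> np.ndarray:
    if img.ndim != 2:  # already has a channel dimension
        return img
    return np.expand_dims(img, axis=0 if channels_first else -1)


gray = np.zeros((224, 224), dtype=np.uint8)
print(add_channel_axis(gray, channels_first=False).shape)  # (224, 224, 1)
print(add_channel_axis(gray, channels_first=True).shape)   # (1, 224, 224)
```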
if do_resize:
images_list = [
[

View File

@@ -1,36 +0,0 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from transformers import FbgemmFp8Config, LlamaForCausalLM
modules_to_not_convert = []
# As defined by Meta, we don't quantize the first and last layers, nor the lm_head. We also don't quantize the self_attn layers.
modules_to_not_convert.append("model.layers.0")
modules_to_not_convert.append("model.layers.125")
modules_to_not_convert.append("lm_head")
for layer_i in range(1, 125):
modules_to_not_convert.append(f"model.layers.{layer_i}.self_attn")
quantization_config = FbgemmFp8Config(modules_to_not_convert=modules_to_not_convert)
model_name = "meta-llama/Llama-3.1-405B"
model = LlamaForCausalLM.from_pretrained(
model_name, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
)
model.save_pretrained(f"{model_name}-FP8")
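As a hedged follow-on to the deleted script above, the saved FP8 checkpoint could be reloaded like any other quantized checkpoint; the local path below simply mirrors the script's `save_pretrained` call and is not part of the diff:

```python
# Illustrative only: reload the FP8 checkpoint written by the script above.
# Requires the fbgemm-gpu kernels that FbgemmFp8Config depends on.
import torch
from transformers import AutoModelForCausalLM

quantized_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-405B-FP8",  # local path produced by `model.save_pretrained(f"{model_name}-FP8")`
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
```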

View File

@@ -21,7 +21,7 @@ from packaging import version
from safetensors.torch import storage_ptr, storage_size
from torch import nn
from .utils import is_torch_greater_or_equal, is_torch_xla_available, logging
from .utils import is_torch_xla_available, logging
ALL_LAYERNORM_LAYERS = [nn.LayerNorm]
@@ -39,7 +39,7 @@ is_torch_greater_or_equal_than_1_13 = parsed_torch_version_base >= version.parse
is_torch_greater_or_equal_than_1_12 = parsed_torch_version_base >= version.parse("1.12")
if is_torch_greater_or_equal("2.5"):
if is_torch_greater_or_equal_than_2_4:
from torch.distributed.tensor import Replicate
from torch.distributed.tensor.parallel import (
ColwiseParallel,

View File

@@ -215,9 +215,6 @@ class HfQuantizer(ABC):
# Delete quantizer and quantization config
del model.hf_quantizer
del model.config.quantization_config
del model.config._pre_quantization_dtype
model.is_quantized = False
return model

View File

@@ -53,20 +53,6 @@ class EetqHfQuantizer(HfQuantizer):
"Please install the latest version of eetq from : https://github.com/NetEase-FuXi/EETQ"
)
try:
import eetq # noqa: F401
except ImportError as exc:
if "shard_checkpoint" in str(exc):
# EETQ 1.0.0 is currently broken with the latest transformers because it tries to import the removed
# shard_checkpoint function, see https://github.com/NetEase-FuXi/EETQ/issues/34.
# TODO: Update message once eetq releases a fix
raise ImportError(
"You are using a version of EETQ that is incompatible with the current transformers version. "
"Either downgrade transformers to <= v4.46.3 or, if available, upgrade EETQ to > v1.0.0."
) from exc
else:
raise
if not is_accelerate_available():
raise ImportError("Loading an EETQ quantized model requires accelerate (`pip install accelerate`)")

View File

@@ -1143,17 +1143,7 @@ def require_eetq(test_case):
"""
Decorator marking a test that requires eetq
"""
eetq_available = is_eetq_available()
if eetq_available:
try:
import eetq # noqa: F401
except ImportError as exc:
if "shard_checkpoint" in str(exc):
# EETQ 1.0.0 is currently broken with the latest transformers because it tries to import the removed
# shard_checkpoint function, see https://github.com/NetEase-FuXi/EETQ/issues/34.
# TODO: Remove once eetq releases a fix and this release is used in CI
eetq_available = False
return unittest.skipUnless(eetq_available, "test requires eetq")(test_case)
return unittest.skipUnless(is_eetq_available(), "test requires eetq")(test_case)
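For reference, a short sketch of how the simplified decorator is applied; the test class and body are invented for illustration:

```python
# Illustrative usage of the simplified decorator; the test class and body are made up.
import unittest

from transformers.testing_utils import require_eetq


@require_eetq
class EetqSmokeTest(unittest.TestCase):
    def test_import(self):
        import eetq  # noqa: F401  # only runs when is_eetq_available() is True
```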
def require_av(test_case):

View File

@@ -1006,6 +1006,17 @@ def is_auto_gptq_available():
def is_eetq_available():
if not _eetq_available:
return _eetq_available
try:
from eetq import EetqLinear # noqa: F401
except ImportError as exc:
if "shard_checkpoint" in str(exc):
# eetq is currently broken with newer transformers versions because it tries to import shard_checkpoint
# see https://github.com/NetEase-FuXi/EETQ/issues/34
# TODO: Remove once eetq releases a fix and this release is used in CI
return False
return _eetq_available

View File

@@ -14,7 +14,6 @@
# limitations under the License.
import collections
import copy
import gc
import inspect
@@ -2451,58 +2450,6 @@ class UtilsFunctionsTest(unittest.TestCase):
self.assertTrue(n_matches.item() == 2)
self.assertTrue(validated_tokens.tolist()[0] == [1, 4, 8])
def test_speculative_sampling_target_distribution(self):
"""
Asserts that the target distribution is preserved.
Should help with catching issues like #32867.
"""
# assume vocab size 10, input length 5 + 3 generated candidates
candidate_input_ids = torch.tensor([[8, 0, 3, 9, 8, 1, 4, 5]]) # input tokens
candidate_logits = torch.tensor(
[
[
[-10.0, 10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0], # generated 1
[-10.0, -10.0, -10.0, -10.0, 10.0, -10.0, -10.0, -10.0, -10.0, -10.0], # generated 4
[-10.0, -10.0, -10.0, -10.0, -10.0, 10.0, -10.0, -10.0, -10.0, -10.0], # generated 5
]
]
)
candidate_length = 3
inf = float("inf")
new_logits = torch.tensor(
[
[
# accepts 1:
[-inf, 10.0, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
# accepts 4:
[-inf, -inf, -inf, -inf, 10.0, -inf, -inf, -inf, -inf, -inf],
# most likely to be 1 or 8, less likely to be 3, then 7, and should never be any other value:
[-inf, 2.0, -inf, 1.0, -inf, -inf, -inf, -0.01, 2.0, -inf],
# N/A:
[-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
]
]
)
last_assistant_token_is_eos = False
last_validated_token = []
for _ in range(10_000):
validated_tokens, n_matches = _speculative_sampling(
candidate_input_ids,
candidate_logits,
candidate_length,
new_logits,
last_assistant_token_is_eos,
)
self.assertTrue(n_matches.item() == 2)
self.assertTrue(validated_tokens.tolist()[0][0] == 1)
self.assertTrue(validated_tokens.tolist()[0][1] == 4)
self.assertTrue(validated_tokens.tolist()[0][2] in [1, 3, 7, 8])
last_validated_token.append(validated_tokens.tolist()[0][2])
# check that the most likely tokens are selected more often than the less likely ones
last_token_counts = collections.Counter(last_validated_token)
self.assertTrue(last_token_counts[1] > last_token_counts[3] > last_token_counts[7] > 0)
self.assertTrue(last_token_counts[8] > last_token_counts[3])
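The new test above checks the statistical property behind speculative sampling: when a draft token is rejected, the replacement must be drawn from the normalized residual distribution so the target distribution is preserved overall. A toy sketch of that acceptance rule (distributions are made up; this is not the internal `_speculative_sampling` implementation):

```python
# Toy sketch of the speculative-sampling acceptance rule the test exercises.
import torch

torch.manual_seed(0)
p_target = torch.tensor([0.1, 0.6, 0.3])  # target-model distribution (made up)
q_draft = torch.tensor([0.3, 0.3, 0.4])   # draft-model distribution (made up)

candidate = torch.multinomial(q_draft, 1).item()  # token proposed by the draft model
accept_prob = min(1.0, (p_target[candidate] / q_draft[candidate]).item())

if torch.rand(()).item() < accept_prob:
    token = candidate
else:
    # Rejected: resample from the normalized residual max(p - q, 0); this is what
    # keeps the accepted-or-resampled token distributed according to p_target.
    residual = torch.clamp(p_target - q_draft, min=0.0)
    token = torch.multinomial(residual / residual.sum(), 1).item()
print(token)
```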
@pytest.mark.generate
@require_torch

View File

@@ -65,7 +65,7 @@ class BitNetTest(unittest.TestCase):
"""
Load the model
"""
cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
cls.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
cls.quantized_model = AutoModelForCausalLM.from_pretrained(cls.model_name, device_map=cls.device)
def tearDown(self):