Mirror of https://github.com/huggingface/transformers.git
Synced 2025-10-27 14:54:34 +08:00
Compare commits (3 commits): add-fp8-ll ... fix_eetq_t
| Author | SHA1 | Date |
|---|---|---|
| | 29845c2460 | |
| | 4a7158d05c | |
| | b2e08a8466 | |
@@ -15,7 +15,6 @@
 # limitations under the License.
 import copy
 import inspect
-import os
 import warnings
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
@@ -1030,6 +1029,10 @@ class GenerationMixin:
                 "You have explicitly specified `forced_decoder_ids`. Please remove the `forced_decoder_ids` argument "
                 "in favour of `input_ids` or `decoder_input_ids` respectively.",
             )
+        if generation_config.watermarking_config is not None:
+            processors.append(
+                generation_config.watermarking_config.construct_processor(self.config.vocab_size, device)
+            )

         # TODO (joao): find a strategy to specify the order of the processors
         processors = self._merge_criteria_processor_list(processors, logits_processor)
@@ -1082,12 +1085,6 @@
                 )
             )

-        # Watermarking should be after all logits processing is finished (see #34630)
-        if generation_config.watermarking_config is not None:
-            processors.append(
-                generation_config.watermarking_config.construct_processor(self.config.vocab_size, device)
-            )
-
         # `LogitNormalization` should always be the last logit processor, when present
         if generation_config.renormalize_logits is True:
             processors.append(LogitNormalization())
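The two hunks above move the watermarking logits processor from the tail of the processor list (where #34630 had placed it) back to an earlier position. For orientation, here is a minimal, hedged sketch of the user-facing path that ends up in `construct_processor`; the checkpoint and config values are illustrative assumptions, not taken from these commits.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, WatermarkingConfig

model_id = "gpt2"  # placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

# Sampling with a green/red-list watermark; generate() builds the processor via
# generation_config.watermarking_config.construct_processor(vocab_size, device).
watermarking_config = WatermarkingConfig(bias=2.5, seeding_scheme="selfhash")
inputs = tokenizer("The quick brown fox", return_tensors="pt")
output = model.generate(
    **inputs, do_sample=True, max_new_tokens=20, watermarking_config=watermarking_config
)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```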
@@ -3225,16 +3222,6 @@ class GenerationMixin:
         unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
         model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)

-        def model_forward(model, *args, **kwargs):
-            return model.forward(*args, **kwargs)
-
-        if isinstance(model_kwargs.get("past_key_values"), StaticCache):
-            if self.device.type == "cuda":
-                logger.warning_once("Using `torch.compile`.")
-                os.environ["TOKENIZERS_PARALLELISM"] = "0"
-                model_forward = torch.compile(model_forward, mode="reduce-overhead", fullgraph=True)
-
-        i = 0
         while self._has_unfinished_sequences(
             this_peer_finished, synced_gpus, device=input_ids.device, cur_len=cur_len, max_length=max_length
         ):
@@ -3245,11 +3232,8 @@
             model_inputs.update({"output_attentions": output_attentions} if output_attentions else {})
             model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {})

-            if i == 0:
-                outputs = self(**model_inputs, return_dict=True)
-                i += 1
-            else:
-                outputs = model_forward(self, return_dict=True, **model_inputs)
+            # forward pass to get next token
+            outputs = self(**model_inputs, return_dict=True)

             # synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping
             model_kwargs = self._update_model_kwargs_for_generation(
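These two hunks drop the ad-hoc `torch.compile` wrapper that kicked in when `past_key_values` was a `StaticCache`. As a reference point, a hedged sketch of the user-facing static-cache path that branch targeted; the checkpoint is a placeholder, and the explicit `torch.compile` line at the end is optional.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-3.2-1B"  # placeholder; any model with static-cache support
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

# A static KV cache keeps decoding shapes fixed, which is what makes the step compilable.
inputs = tokenizer("Static caches keep tensor shapes fixed,", return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=16, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))

# Compiling the decoding step explicitly (instead of relying on the removed branch):
model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
```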
@@ -52,6 +52,7 @@ from .pytorch_utils import ( # noqa: F401
     find_pruneable_heads_and_indices,
     id_tensor_storage,
     is_torch_greater_or_equal_than_1_13,
+    is_torch_greater_or_equal_than_2_4,
     prune_conv1d_layer,
     prune_layer,
     prune_linear_layer,
@@ -89,7 +90,6 @@ from .utils import (
     is_peft_available,
     is_remote_url,
     is_safetensors_available,
-    is_torch_greater_or_equal,
     is_torch_sdpa_available,
     is_torch_xla_available,
     logging,
@@ -5032,7 +5032,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
             device_mesh (`torch.distributed.DeviceMesh`):
                 The device mesh to use for tensor parallelism.
         """
-        if not is_torch_greater_or_equal("2.5"):
+        if not is_torch_greater_or_equal_than_2_4:
             raise EnvironmentError("tensor parallel is only supported for `torch>=2.5`.")

         # Tensor parallelize a nn.Module based on the `_tp_plan` attribute of the module.
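The hunk above only swaps the torch version guard inside the tensor-parallel entry point whose docstring is shown. For context, a hedged sketch of driving that entry point, assuming it is exposed as `model.tensor_parallel(device_mesh)` (the method name and checkpoint are assumptions, not established by this diff); it would be launched with `torchrun`.

```python
import os

from torch.distributed.device_mesh import init_device_mesh
from transformers import AutoModelForCausalLM

world_size = int(os.environ.get("WORLD_SIZE", "1"))
device_mesh = init_device_mesh("cuda", (world_size,))  # 1-D mesh over all ranks

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")  # placeholder
model.tensor_parallel(device_mesh)  # assumed entry point; shards parameters per the model's `_tp_plan`
```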
@@ -38,7 +38,6 @@ from ...utils import TensorType, is_vision_available, logging


 logger = logging.get_logger(__name__)
-MAX_IMAGE_SIZE = 4096  # 4k resolution as absolute maximum


 if is_vision_available():
@@ -117,6 +116,7 @@ def _resize_output_size_scale_below_upper_bound(
 def get_resize_output_image_size(
     image,
     resolution_max_side: int,
+    max_image_size: int = 1820,
     input_data_format: Optional[Union[str, ChannelDimension]] = None,
 ) -> Tuple[int, int]:
     """
@@ -126,18 +126,24 @@ def get_resize_output_image_size(
             Image to resize.
         resolution_max_side (`int`):
             The longest edge of the image will be resized to this value. The shortest edge will be resized to keep the
-            input aspect ratio.
+            input aspect ratio, with a lower bound of `min_image_size`.
+        max_image_size (`int`, *optional*, defaults to 1820):
+            Maximum image resolution. If the image is larger than this size, the longest edge will be resized to this
+            value, with the shortest edge resized to keep the input aspect ratio, with a lower bound of `min_image_size`.
         input_data_format (`ChannelDimension` or `str`):
             The channel dimension format of the input image.
     Returns:
         The output size of the image after resizing.
     """
+    if resolution_max_side > max_image_size:
+        raise ValueError("`resolution_max_side` cannot be larger than `max_image_size`")
+
     height, width = get_image_size(image, channel_dim=input_data_format)

     # Find the output size, when rescaling the longest edge to max_len and preserving the aspect ratio
     height, width = _resize_output_size_rescale_to_max_len(height, width, max_len=resolution_max_side)
-    # Find the output size when scaling the image to be below the MAX_IMAGE_SIZE
-    height, width = _resize_output_size_scale_below_upper_bound(height, width, max_len=MAX_IMAGE_SIZE)
+    # Find the output size when scaling the image to be below the max_image_size
+    height, width = _resize_output_size_scale_below_upper_bound(height, width, max_len=max_image_size)
     return height, width
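The arithmetic behind this docstring change is simple but easy to misread, so here is a standalone sketch of it (the helper structure and the `min_image_size` lower bound mirror the names used in this file; the function below is illustrative, not the library code):

```python
def sketch_resize_output_size(height, width, resolution_max_side, max_image_size=1820, min_image_size=1):
    """Scale the longest edge to `resolution_max_side`, then clamp below `max_image_size`."""

    def rescale_to_max_len(h, w, max_len):
        scale = max_len / max(h, w)
        return max(round(h * scale), min_image_size), max(round(w * scale), min_image_size)

    h, w = rescale_to_max_len(height, width, resolution_max_side)
    if max(h, w) > max_image_size:  # enforce the upper bound while keeping the aspect ratio
        h, w = rescale_to_max_len(h, w, max_image_size)
    return h, w


print(sketch_resize_output_size(3000, 2000, resolution_max_side=2048))  # (1820, 1213)
```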
@@ -245,7 +251,7 @@ def convert_to_rgb(
     data_format = input_data_format if data_format is None else data_format

     mode = "P" if palette is not None else None
-    image = to_pil_image(image, image_mode=mode, input_data_format=input_data_format)
+    image = to_pil_image(image, image_mode=mode)
     if image.mode == "P" and palette is not None:
         image.putpalette(palette)
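For readers unfamiliar with palettized images, a small PIL-only illustration of the `"P"` mode plus `putpalette` handling used above (toy data, not from the diff):

```python
import numpy as np
from PIL import Image

indices = np.array([[0, 1], [1, 0]], dtype=np.uint8)      # palette indices, shape (2, 2)
palette = [255, 0, 0, 0, 0, 255] + [0] * (768 - 6)        # index 0 -> red, index 1 -> blue

image = Image.fromarray(indices, mode="P")
image.putpalette(palette)
rgb = image.convert("RGB")
print(rgb.getpixel((0, 0)))  # (255, 0, 0)
```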
@@ -398,7 +404,7 @@ class Idefics3ImageProcessor(BaseImageProcessor):
         image_mode = None
         if image.ndim == 2 or image.shape[-1] == 1:
             image_mode = "P"
-        image = to_pil_image(image, image_mode=image_mode, input_data_format=input_data_format)
+        image = to_pil_image(image, image_mode=image_mode)

         resized_image = image.resize((size[1], size[0]), resample=resample)
         resized_image = np.array(resized_image)
@@ -748,16 +754,6 @@ class Idefics3ImageProcessor(BaseImageProcessor):
         # All transformations expect numpy arrays.
         images_list = [[to_numpy_array(image) for image in images] for images in images_list]

-        # Extra channel dimension for grayscale images
-        if input_data_format in [ChannelDimension.LAST, None]:
-            images_list = [
-                [np.expand_dims(img, axis=-1) if img.ndim == 2 else img for img in images] for images in images_list
-            ]
-        elif input_data_format == ChannelDimension.FIRST:
-            images_list = [
-                [np.expand_dims(img, axis=0) if img.ndim == 2 else img for img in images] for images in images_list
-            ]
-
         if is_scaled_image(images_list[0][0]) and do_rescale:
             logger.warning_once(
                 "It looks like you are trying to rescale already rescaled images. If the input"
@@ -768,6 +764,18 @@ class Idefics3ImageProcessor(BaseImageProcessor):
         if input_data_format is None:
             input_data_format = infer_channel_dimension_format(images_list[0][0], num_channels=(1, 3, 4))

+        # Extra channel dimension for grayscale images
+        if input_data_format == ChannelDimension.LAST:
+            images_list = [
+                [np.expand_dims(img, axis=-1) if img.ndim == 2 else img for img in images] for images in images_list
+            ]
+        elif input_data_format == ChannelDimension.FIRST:
+            images_list = [
+                [np.expand_dims(img, axis=0) if img.ndim == 2 else img for img in images] for images in images_list
+            ]
+        else:
+            raise ValueError(f"Invalid channel dimension format {input_data_format}.")
+
         if do_resize:
             images_list = [
                 [
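The block added above only normalizes grayscale inputs so that downstream transforms always see a channel axis. A standalone numpy illustration of the same operation:

```python
import numpy as np

gray = np.zeros((32, 48), dtype=np.uint8)        # H x W image with no channel axis

channels_last = np.expand_dims(gray, axis=-1)    # (32, 48, 1), ChannelDimension.LAST
channels_first = np.expand_dims(gray, axis=0)    # (1, 32, 48), ChannelDimension.FIRST
print(channels_last.shape, channels_first.shape)
```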
@@ -1,36 +0,0 @@
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-
-from transformers import FbgemmFp8Config, LlamaForCausalLM
-
-
-modules_to_not_convert = []
-
-# As defined by Meta, we don't quantize the first and last layers as well as the lm_head. Also, we don't quantize the self_attn layers.
-modules_to_not_convert.append("model.layers.0")
-modules_to_not_convert.append("model.layers.125")
-modules_to_not_convert.append("lm_head")
-for layer_i in range(1, 125):
-    modules_to_not_convert.append(f"model.layers.{layer_i}.self_attn")
-
-quantization_config = FbgemmFp8Config(modules_to_not_convert=modules_to_not_convert)
-model_name = "meta-llama/Llama-3.1-405B"
-
-model = LlamaForCausalLM.from_pretrained(
-    model_name, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
-)
-
-model.save_pretrained(f"{model_name}-FP8")
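The deleted script above quantizes Llama 3.1 405B to FP8 with fbgemm and writes the result with `save_pretrained`. As a hedged follow-up (not part of the diff), the saved checkpoint would be reloaded from that output directory like any other quantized model, assuming `fbgemm-gpu` is installed:

```python
import torch
from transformers import AutoModelForCausalLM

# Path mirrors the script's f"{model_name}-FP8" output directory (an assumption, not from the diff).
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-405B-FP8", torch_dtype=torch.bfloat16, device_map="auto"
)
```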
@@ -21,7 +21,7 @@ from packaging import version
 from safetensors.torch import storage_ptr, storage_size
 from torch import nn

-from .utils import is_torch_greater_or_equal, is_torch_xla_available, logging
+from .utils import is_torch_xla_available, logging


 ALL_LAYERNORM_LAYERS = [nn.LayerNorm]
@@ -39,7 +39,7 @@ is_torch_greater_or_equal_than_1_13 = parsed_torch_version_base >= version.parse
 is_torch_greater_or_equal_than_1_12 = parsed_torch_version_base >= version.parse("1.12")


-if is_torch_greater_or_equal("2.5"):
+if is_torch_greater_or_equal_than_2_4:
     from torch.distributed.tensor import Replicate
     from torch.distributed.tensor.parallel import (
         ColwiseParallel,
@@ -215,9 +215,6 @@ class HfQuantizer(ABC):

         # Delete quantizer and quantization config
         del model.hf_quantizer
-        del model.config.quantization_config
-        del model.config._pre_quantization_dtype
-        model.is_quantized = False

         return model
@@ -53,20 +53,6 @@ class EetqHfQuantizer(HfQuantizer):
                 "Please install the latest version of eetq from : https://github.com/NetEase-FuXi/EETQ"
             )

-        try:
-            import eetq  # noqa: F401
-        except ImportError as exc:
-            if "shard_checkpoint" in str(exc):
-                # EETQ 1.0.0 is currently broken with the latest transformers because it tries to import the removed
-                # shard_checkpoint function, see https://github.com/NetEase-FuXi/EETQ/issues/34.
-                # TODO: Update message once eetq releases a fix
-                raise ImportError(
-                    "You are using a version of EETQ that is incompatible with the current transformers version. "
-                    "Either downgrade transformers to <= v4.46.3 or, if available, upgrade EETQ to > v1.0.0."
-                ) from exc
-            else:
-                raise
-
         if not is_accelerate_available():
             raise ImportError("Loading an EETQ quantized model requires accelerate (`pip install accelerate`)")
@@ -1143,17 +1143,7 @@ def require_eetq(test_case):
     """
     Decorator marking a test that requires eetq
     """
-    eetq_available = is_eetq_available()
-    if eetq_available:
-        try:
-            import eetq  # noqa: F401
-        except ImportError as exc:
-            if "shard_checkpoint" in str(exc):
-                # EETQ 1.0.0 is currently broken with the latest transformers because it tries to import the removed
-                # shard_checkpoint function, see https://github.com/NetEase-FuXi/EETQ/issues/34.
-                # TODO: Remove once eetq releases a fix and this release is used in CI
-                eetq_available = False
-    return unittest.skipUnless(eetq_available, "test requires eetq")(test_case)
+    return unittest.skipUnless(is_eetq_available(), "test requires eetq")(test_case)


 def require_av(test_case):
@@ -1006,6 +1006,17 @@ def is_auto_gptq_available():


 def is_eetq_available():
+    if not _eetq_available:
+        return _eetq_available
+
+    try:
+        from eetq import EetqLinear  # noqa: F401
+    except ImportError as exc:
+        if "shard_checkpoint" in str(exc):
+            # eetq is currently broken with newer transformers versions because it tries to import shard_checkpoint
+            # see https://github.com/NetEase-FuXi/EETQ/issues/34
+            # TODO: Remove once eetq releasees a fix and this release is used in CI
+            return False
     return _eetq_available
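Taken together, the three EETQ hunks move the `shard_checkpoint` workaround out of the quantizer and the test decorator and into `is_eetq_available()`, so callers only need the availability check. A hedged sketch of how a test module would consume it (the test class itself is a placeholder, not from this diff):

```python
import unittest

from transformers.testing_utils import require_eetq
from transformers.utils import is_eetq_available


@require_eetq
class EetqAvailabilityTest(unittest.TestCase):
    def test_eetq_is_importable(self):
        self.assertTrue(is_eetq_available())
```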
@@ -14,7 +14,6 @@
 # limitations under the License.


-import collections
 import copy
 import gc
 import inspect
@@ -2451,58 +2450,6 @@ class UtilsFunctionsTest(unittest.TestCase):
         self.assertTrue(n_matches.item() == 2)
         self.assertTrue(validated_tokens.tolist()[0] == [1, 4, 8])

-    def test_speculative_sampling_target_distribution(self):
-        """
-        Asserts that the target distribution is preserved.
-        Should help with catching issues like #32867.
-        """
-        # assume vocab size 10, input length 5 + 3 generated candidates
-        candidate_input_ids = torch.tensor([[8, 0, 3, 9, 8, 1, 4, 5]])  # input tokens
-        candidate_logits = torch.tensor(
-            [
-                [
-                    [-10.0, 10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0],  # generated 1
-                    [-10.0, -10.0, -10.0, -10.0, 10.0, -10.0, -10.0, -10.0, -10.0, -10.0],  # generated 4
-                    [-10.0, -10.0, -10.0, -10.0, -10.0, 10.0, -10.0, -10.0, -10.0, -10.0],  # generated 5
-                ]
-            ]
-        )
-        candidate_length = 3
-        inf = float("inf")
-        new_logits = torch.tensor(
-            [
-                [
-                    # accepts 1:
-                    [-inf, 10.0, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
-                    # accepts 4:
-                    [-inf, -inf, -inf, -inf, 10.0, -inf, -inf, -inf, -inf, -inf],
-                    # most likely to be 1 or 8, less likely to be 3, then 7, and should never be any other value:
-                    [-inf, 2.0, -inf, 1.0, -inf, -inf, -inf, -0.01, 2.0, -inf],
-                    # N/A:
-                    [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
-                ]
-            ]
-        )
-        last_assistant_token_is_eos = False
-        last_validated_token = []
-        for _ in range(10_000):
-            validated_tokens, n_matches = _speculative_sampling(
-                candidate_input_ids,
-                candidate_logits,
-                candidate_length,
-                new_logits,
-                last_assistant_token_is_eos,
-            )
-            self.assertTrue(n_matches.item() == 2)
-            self.assertTrue(validated_tokens.tolist()[0][0] == 1)
-            self.assertTrue(validated_tokens.tolist()[0][1] == 4)
-            self.assertTrue(validated_tokens.tolist()[0][2] in [1, 3, 7, 8])
-            last_validated_token.append(validated_tokens.tolist()[0][2])
-        # check that the most likely tokens are selected more often than the less likely ones
-        last_token_counts = collections.Counter(last_validated_token)
-        self.assertTrue(last_token_counts[1] > last_token_counts[3] > last_token_counts[7] > 0)
-        self.assertTrue(last_token_counts[8] > last_token_counts[3])

     @pytest.mark.generate
     @require_torch
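The removed test checks a statistical property of speculative decoding: accepted tokens must still follow the target distribution. For reference, a schematic sketch of the acceptance rule it exercises (standard speculative sampling, not the `_speculative_sampling` implementation itself):

```python
import torch


def accept_or_resample(p: torch.Tensor, q: torch.Tensor, draft_token: int) -> int:
    """p and q are 1-D probability vectors over the vocabulary; draft_token was sampled from q."""
    accept_prob = torch.clamp(p[draft_token] / q[draft_token], max=1.0)
    if torch.rand(()) < accept_prob:
        return draft_token
    # On rejection, resample from the normalized residual max(p - q, 0),
    # which is what preserves the target distribution p overall.
    residual = torch.clamp(p - q, min=0.0)
    return int(torch.multinomial(residual / residual.sum(), num_samples=1))
```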
@@ -65,7 +65,7 @@ class BitNetTest(unittest.TestCase):
         """
         Load the model
         """
-        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
+        cls.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
         cls.quantized_model = AutoModelForCausalLM.from_pretrained(cls.model_name, device_map=cls.device)

     def tearDown(self):