From 2ca5f82c2a8152ba67eaa033fbdb479d28f4cc3b Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Wed, 30 Jul 2025 14:54:18 +0800
Subject: [PATCH] [Misc] Remove redundant config definitions (#21891)

Signed-off-by: DarkLight1337
---
 vllm/model_executor/models/aimv2.py | 22 +-
 vllm/model_executor/models/dbrx.py | 14 +-
 vllm/model_executor/models/exaone.py | 8 +-
 vllm/model_executor/models/exaone4.py | 6 +-
 vllm/model_executor/models/keye.py | 3 -
 vllm/model_executor/models/minimax_vl_01.py | 7 +-
 vllm/model_executor/models/mpt.py | 8 +-
 vllm/model_executor/models/ovis.py | 13 +-
 vllm/transformers_utils/config.py | 28 +-
 vllm/transformers_utils/configs/__init__.py | 30 +-
 vllm/transformers_utils/configs/cohere2.py | 195 ------------
 vllm/transformers_utils/configs/dbrx.py | 280 ------------
 vllm/transformers_utils/configs/exaone.py | 190 ------------
 vllm/transformers_utils/configs/exaone4.py | 252 ----------------
 .../configs/minimax_text_01.py | 70 -----
 .../configs/minimax_vl_01.py | 71 -----
 vllm/transformers_utils/configs/mpt.py | 180 -----------
 vllm/transformers_utils/configs/nvlm_d.py | 31 --
 vllm/transformers_utils/configs/ovis.py | 184 ------------
 vllm/transformers_utils/configs/skyworkr1v.py | 54 ----
 vllm/transformers_utils/configs/solar.py | 247 ---------------
 vllm/transformers_utils/configs/telechat2.py | 64 ----
 .../transformers_utils/processors/__init__.py | 7 +
 23 files changed, 54 insertions(+), 1910 deletions(-)
 delete mode 100644 vllm/transformers_utils/configs/cohere2.py
 delete mode 100644 vllm/transformers_utils/configs/dbrx.py
 delete mode 100644 vllm/transformers_utils/configs/exaone.py
 delete mode 100644 vllm/transformers_utils/configs/exaone4.py
 delete mode 100644 vllm/transformers_utils/configs/minimax_text_01.py
 delete mode 100644 vllm/transformers_utils/configs/minimax_vl_01.py
 delete mode 100644 vllm/transformers_utils/configs/mpt.py
 delete mode 100644 vllm/transformers_utils/configs/nvlm_d.py
 delete mode 100644 vllm/transformers_utils/configs/ovis.py
 delete mode 100644 vllm/transformers_utils/configs/skyworkr1v.py
 delete mode 100644 vllm/transformers_utils/configs/solar.py
 delete mode 100644 vllm/transformers_utils/configs/telechat2.py

diff --git a/vllm/model_executor/models/aimv2.py b/vllm/model_executor/models/aimv2.py
index b13d863ebb7..d2307bb464b 100644
--- a/vllm/model_executor/models/aimv2.py
+++ b/vllm/model_executor/models/aimv2.py
@@ -8,6 +8,7 @@ from typing import Optional
 
 import torch
 import torch.nn as nn
+from transformers import PretrainedConfig
 
 from vllm.attention.layer import MultiHeadAttention
 from vllm.distributed import get_tensor_model_parallel_world_size
@@ -20,13 +21,12 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.transformers_utils.configs.ovis import AIMv2Config
 
 
 class AIMv2SwiGLUFFN(nn.Module):
 
-    def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig,
-                 prefix: str):
+    def __init__(self, config: PretrainedConfig,
+                 quant_config: QuantizationConfig, prefix: str):
         super().__init__()
         hidden_features = config.intermediate_size
         in_features = config.hidden_size
@@ -57,7 +57,7 @@ class AIMv2SwiGLUFFN(nn.Module):
 
 class AIMv2PatchEmbed(nn.Module):
 
-    def __init__(self, config: AIMv2Config):
+    def __init__(self, config: PretrainedConfig):
         super().__init__()
         self.proj = nn.Conv2d(
             config.num_channels,
@@ -75,7 +75,7 @@
class AIMv2PatchEmbed(nn.Module): class AIMv2ViTPreprocessor(nn.Module): - def __init__(self, config: AIMv2Config): + def __init__(self, config: PretrainedConfig): super().__init__() num_patches = (config.image_size // config.patch_size)**2 @@ -93,8 +93,8 @@ class AIMv2ViTPreprocessor(nn.Module): class AIMv2Attention(nn.Module): - def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig, - prefix: str): + def __init__(self, config: PretrainedConfig, + quant_config: QuantizationConfig, prefix: str): super().__init__() self.config = config self.embed_dim = config.hidden_size @@ -141,8 +141,8 @@ class AIMv2Attention(nn.Module): class AIMv2Block(nn.Module): - def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig, - prefix: str): + def __init__(self, config: PretrainedConfig, + quant_config: QuantizationConfig, prefix: str): super().__init__() self.attn = AIMv2Attention(config, quant_config=quant_config, @@ -163,7 +163,7 @@ class AIMv2Transformer(nn.Module): def __init__( self, - config: AIMv2Config, + config: PretrainedConfig, quant_config: QuantizationConfig, *, require_post_norm: Optional[bool] = None, @@ -193,7 +193,7 @@ class AIMv2Transformer(nn.Module): class AIMv2Model(torch.nn.Module): def __init__(self, - config: AIMv2Config, + config: PretrainedConfig, quant_config: QuantizationConfig, *, require_post_norm: Optional[bool] = None, diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 7a4dd69443a..360c7e66bf5 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -6,6 +6,7 @@ from typing import Optional, Union import torch import torch.nn as nn +from transformers import PretrainedConfig from vllm.attention import Attention from vllm.config import CacheConfig, VllmConfig @@ -24,7 +25,6 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.configs.dbrx import DbrxConfig from .interfaces import SupportsPP from .utils import (AutoWeightsLoader, is_pp_missing_parameter, @@ -39,7 +39,7 @@ class DbrxRouter(nn.Module): def __init__( self, - config: DbrxConfig, + config: PretrainedConfig, params_dtype: Optional[torch.dtype] = None, ): super().__init__() @@ -63,7 +63,7 @@ class DbrxExperts(FusedMoE): def __init__( self, - config: DbrxConfig, + config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None, params_dtype: Optional[torch.dtype] = None, prefix: str = "", @@ -138,7 +138,7 @@ class DbrxMoE(nn.Module): def __init__( self, - config: DbrxConfig, + config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None, params_dtype: Optional[torch.dtype] = None, prefix: str = "", @@ -169,7 +169,7 @@ class DbrxAttention(nn.Module): def __init__( self, - config: DbrxConfig, + config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -249,7 +249,7 @@ class DbrxFusedNormAttention(nn.Module): def __init__( self, - config: DbrxConfig, + config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -284,7 +284,7 @@ class DbrxBlock(nn.Module): def __init__( self, - config: DbrxConfig, + config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: 
str = "", diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index aaf105ec255..8052b6bb823 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -30,6 +30,7 @@ from typing import Any, Optional, Union import torch from torch import nn +from transformers import PretrainedConfig from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile @@ -49,7 +50,6 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.configs.exaone import ExaoneConfig from .interfaces import SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, @@ -99,7 +99,7 @@ class ExaoneAttention(nn.Module): def __init__( self, - config: ExaoneConfig, + config: PretrainedConfig, hidden_size: int, num_heads: int, num_kv_heads: int, @@ -194,7 +194,7 @@ class ExaoneBlockAttention(nn.Module): def __init__( self, - config: ExaoneConfig, + config: PretrainedConfig, hidden_size: int, num_heads: int, num_kv_heads: int, @@ -236,7 +236,7 @@ class ExaoneDecoderLayer(nn.Module): def __init__( self, - config: ExaoneConfig, + config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py index 97aeb6fd7b1..3d6ce3e8895 100644 --- a/vllm/model_executor/models/exaone4.py +++ b/vllm/model_executor/models/exaone4.py @@ -26,6 +26,7 @@ from typing import Any, Optional, Union import torch from torch import nn +from transformers import PretrainedConfig from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile @@ -45,7 +46,6 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.configs.exaone4 import Exaone4Config from .interfaces import SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index, @@ -96,7 +96,7 @@ class Exaone4Attention(nn.Module): def __init__( self, - config: Exaone4Config, + config: PretrainedConfig, hidden_size: int, num_heads: int, num_kv_heads: int, @@ -224,7 +224,7 @@ class Exaone4DecoderLayer(nn.Module): def __init__( self, - config: Exaone4Config, + config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index 36e57b5e4f4..892d970aaad 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -980,9 +980,6 @@ class KeyeMultiModalDataParser(MultiModalDataParser): class KeyeProcessingInfo(BaseProcessingInfo): - def get_hf_config(self): - return self.ctx.get_hf_config(PretrainedConfig) - def get_hf_processor( self, *, diff --git a/vllm/model_executor/models/minimax_vl_01.py b/vllm/model_executor/models/minimax_vl_01.py index 9aba82cb115..62a7d37ec9d 100644 --- a/vllm/model_executor/models/minimax_vl_01.py +++ b/vllm/model_executor/models/minimax_vl_01.py @@ -5,7 +5,7 @@ from typing import Literal, Optional, TypedDict, Union, cast 
import torch import torch.nn as nn -from transformers import BatchFeature +from transformers import BatchFeature, PretrainedConfig from vllm.config import VllmConfig from vllm.jsontree import json_map_leaves @@ -17,7 +17,6 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalFieldConfig from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.configs.minimax_vl_01 import MiniMaxVL01Config from .clip import CLIPVisionModel from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP @@ -90,8 +89,8 @@ class MiniMaxVL01DummyInputsBuilder(LlavaDummyInputsBuilder): class MiniMaxVL01ProcessingInfo(LlavaNextProcessingInfo): - def get_hf_config(self): - return self.ctx.get_hf_config(MiniMaxVL01Config) + def get_hf_config(self): # Need to override the config type + return self.ctx.get_hf_config(PretrainedConfig) def get_hf_processor(self, **kwargs: object): hf_processor = self.ctx.get_hf_processor(**kwargs) diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 0878ada34d1..c243f575ae5 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -8,6 +8,7 @@ from typing import Optional, Union import torch import torch.nn as nn +from transformers import PretrainedConfig from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile @@ -25,7 +26,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.configs.mpt import MPTConfig from .interfaces import SupportsPP from .utils import (AutoWeightsLoader, is_pp_missing_parameter, @@ -50,7 +50,7 @@ class MPTAttention(nn.Module): def __init__( self, - config: MPTConfig, + config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -144,7 +144,7 @@ class MPTMLP(nn.Module): def __init__( self, - config: MPTConfig, + config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None, ): super().__init__() @@ -176,7 +176,7 @@ class MPTBlock(nn.Module): def __init__( self, - config: MPTConfig, + config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py index 111628d8d18..c8b528048b5 100644 --- a/vllm/model_executor/models/ovis.py +++ b/vllm/model_executor/models/ovis.py @@ -25,7 +25,7 @@ import torch import torch.nn as nn from torch import Tensor from torch.nn.functional import gumbel_softmax, pad, softmax -from transformers import BaseImageProcessor, BatchFeature +from transformers import BaseImageProcessor, BatchFeature, PretrainedConfig from vllm.config import VllmConfig from vllm.model_executor.layers.linear import ReplicatedLinear @@ -48,8 +48,6 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.configs.ovis import (BaseVisualTokenizerConfig, - OvisConfig) from vllm.transformers_utils.processors.ovis import OvisProcessor from .interfaces 
import MultiModalEmbeddings, SupportsMultiModal, SupportsPP @@ -83,7 +81,7 @@ class VisualTokenizer(torch.nn.Module): def __init__( self, - config: BaseVisualTokenizerConfig, + config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ): @@ -107,7 +105,7 @@ class VisualTokenizer(torch.nn.Module): def _init_backbone( self, - config: BaseVisualTokenizerConfig, + config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ) -> nn.Module: @@ -247,9 +245,6 @@ class VisualEmbedding(torch.nn.Embedding): class OvisProcessingInfo(BaseProcessingInfo): - def get_hf_config(self): - return self.ctx.get_hf_config(OvisConfig) - def get_hf_processor(self, **kwargs): return self.ctx.get_hf_processor( OvisProcessor, @@ -417,7 +412,7 @@ class Ovis(nn.Module, SupportsMultiModal, SupportsPP): config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - self.config: OvisConfig = config + self.config: PretrainedConfig = config self.llm = init_vllm_registered_model( vllm_config=vllm_config.with_hf_config(config.get_text_config()), prefix=maybe_prefix(prefix, "llm"), diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 04ff08825bb..40a6a9118e5 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -29,19 +29,13 @@ from vllm import envs from vllm.logger import init_logger # yapf conflicts with isort for this block # yapf: disable -from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config, - DbrxConfig, DeepseekVLV2Config, - EAGLEConfig, Exaone4Config, - ExaoneConfig, JAISConfig, +from vllm.transformers_utils.configs import (ChatGLMConfig, DeepseekVLV2Config, + EAGLEConfig, JAISConfig, KimiVLConfig, MedusaConfig, - MiniMaxText01Config, - MiniMaxVL01Config, MllamaConfig, - MLPSpeculatorConfig, MPTConfig, + MllamaConfig, MLPSpeculatorConfig, Nemotron_Nano_VL_Config, - NemotronConfig, NVLM_D_Config, - OvisConfig, RWConfig, - SkyworkR1VChatConfig, SolarConfig, - Telechat2Config, UltravoxConfig) + NemotronConfig, RWConfig, + UltravoxConfig) # yapf: enable from vllm.transformers_utils.configs.mistral import adapt_config_dict from vllm.transformers_utils.utils import check_gguf_file @@ -77,28 +71,16 @@ _CONFIG_REGISTRY_OVERRIDE_HF: dict[str, type[PretrainedConfig]] = { _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = { "chatglm": ChatGLMConfig, - "cohere2": Cohere2Config, - "dbrx": DbrxConfig, "deepseek_vl_v2": DeepseekVLV2Config, "kimi_vl": KimiVLConfig, "Llama_Nemotron_Nano_VL": Nemotron_Nano_VL_Config, - "mpt": MPTConfig, "RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct) "RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct) "jais": JAISConfig, "mlp_speculator": MLPSpeculatorConfig, "medusa": MedusaConfig, "eagle": EAGLEConfig, - "exaone": ExaoneConfig, - "exaone4": Exaone4Config, - "minimax_text_01": MiniMaxText01Config, - "minimax_vl_01": MiniMaxVL01Config, "nemotron": NemotronConfig, - "NVLM_D": NVLM_D_Config, - "ovis": OvisConfig, - "solar": SolarConfig, - "skywork_chat": SkyworkR1VChatConfig, - "telechat": Telechat2Config, "ultravox": UltravoxConfig, **_CONFIG_REGISTRY_OVERRIDE_HF } diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 89303213a27..0fcb2beb8c7 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -1,13 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project +""" +Model configs may be defined in this directory for the following reasons: + +- There is no configuration file defined by HF Hub or Transformers library. +- There is a need to override the existing config to support vLLM. +""" from vllm.transformers_utils.configs.chatglm import ChatGLMConfig -from vllm.transformers_utils.configs.cohere2 import Cohere2Config -from vllm.transformers_utils.configs.dbrx import DbrxConfig from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config from vllm.transformers_utils.configs.eagle import EAGLEConfig -from vllm.transformers_utils.configs.exaone import ExaoneConfig -from vllm.transformers_utils.configs.exaone4 import Exaone4Config # RWConfig is for the original tiiuae/falcon-40b(-instruct) and # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the # `FalconConfig` class from the official HuggingFace transformers library. @@ -15,36 +17,21 @@ from vllm.transformers_utils.configs.falcon import RWConfig from vllm.transformers_utils.configs.jais import JAISConfig from vllm.transformers_utils.configs.kimi_vl import KimiVLConfig from vllm.transformers_utils.configs.medusa import MedusaConfig -from vllm.transformers_utils.configs.minimax_text_01 import MiniMaxText01Config -from vllm.transformers_utils.configs.minimax_vl_01 import MiniMaxVL01Config from vllm.transformers_utils.configs.mllama import MllamaConfig from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig from vllm.transformers_utils.configs.moonvit import MoonViTConfig -from vllm.transformers_utils.configs.mpt import MPTConfig from vllm.transformers_utils.configs.nemotron import NemotronConfig from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config -from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config -from vllm.transformers_utils.configs.ovis import OvisConfig -from vllm.transformers_utils.configs.skyworkr1v import SkyworkR1VChatConfig -from vllm.transformers_utils.configs.solar import SolarConfig -from vllm.transformers_utils.configs.telechat2 import Telechat2Config from vllm.transformers_utils.configs.ultravox import UltravoxConfig __all__ = [ "ChatGLMConfig", - "Cohere2Config", - "DbrxConfig", "DeepseekVLV2Config", - "MPTConfig", "RWConfig", "JAISConfig", "MedusaConfig", "EAGLEConfig", - "ExaoneConfig", - "Exaone4Config", - "MiniMaxText01Config", - "MiniMaxVL01Config", "MllamaConfig", "MLPSpeculatorConfig", "MoonViTConfig", @@ -52,10 +39,5 @@ __all__ = [ "NemotronConfig", "NemotronHConfig", "Nemotron_Nano_VL_Config", - "NVLM_D_Config", - "OvisConfig", - "SkyworkR1VChatConfig", - "SolarConfig", - "Telechat2Config", "UltravoxConfig", ] diff --git a/vllm/transformers_utils/configs/cohere2.py b/vllm/transformers_utils/configs/cohere2.py deleted file mode 100644 index e547a9c281c..00000000000 --- a/vllm/transformers_utils/configs/cohere2.py +++ /dev/null @@ -1,195 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# ruff: noqa - -# Adapted from -# https://github.com/huggingface/transformers/blob/main/src/transformers/models/cohere2/configuration_cohere2.py -from transformers import PretrainedConfig -from transformers.modeling_rope_utils import rope_config_validation - - -class Cohere2Config(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`CohereModel`]. 
It is used to instantiate an Cohere - model according to the specified arguments, defining the model architecture. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. Instantiating a configuration - with the defaults will yield a similar configuration to that of the [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01) model. - - - Args: - vocab_size (`int`, *optional*, defaults to 256000): - Vocabulary size of the Cohere model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`CohereModel`] - hidden_size (`int`, *optional*, defaults to 8192): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 22528): - Dimension of the MLP representations. - logit_scale (`float`, *optional*, defaults to 0.0625): - The scaling factor for the output logits. - num_hidden_layers (`int`, *optional*, defaults to 40): - Number of hidden layers in the Transformer decoder. - num_attention_heads (`int`, *optional*, defaults to 64): - Number of attention heads for each attention layer in the Transformer decoder. - num_key_value_heads (`int`, *optional*): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to - `num_attention_heads`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 8192): - The maximum sequence length that this model might ever be used with. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-05): - The epsilon used by the layer normalization. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - pad_token_id (`int`, *optional*, defaults to 0): - Padding token id. - bos_token_id (`int`, *optional*, defaults to 5): - Beginning of stream token id. - eos_token_id (`int`, *optional*, defaults to 255001): - End of stream token id. - tie_word_embeddings (`bool`, *optional*, defaults to `True`): - Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. 
Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE - attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): - Whether to use a bias in the query, key, value and output projection layers during self-attention. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - sliding_window (`int`, *optional*, defaults to 4096): - Size of the sliding window attention context. - sliding_window_pattern (`int`, *optional*, defaults to 4): - Pattern for the sliding window attention. - cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`. 
- - ```python - >>> from transformers import Cohere2Model, Cohere2Config - - >>> # Initializing a Cohere Nextmodel configuration - >>> configuration = Cohere2Config() - - >>> # Initializing a model from the Cohere2 configuration - >>> model = Cohere2Model(configuration) # doctest: +SKIP - - >>> # Accessing the model configuration - >>> configuration = model.config # doctest: +SKIP - ``` - """ - - model_type = "cohere2" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=256000, - hidden_size=8192, - intermediate_size=22528, - logit_scale=0.0625, - num_hidden_layers=40, - num_attention_heads=64, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=8192, - initializer_range=0.02, - layer_norm_eps=1e-5, - use_cache=True, - pad_token_id=0, - bos_token_id=5, - eos_token_id=255001, - tie_word_embeddings=True, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - sliding_window=4096, - sliding_window_pattern=4, - cache_implementation="hybrid", - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.logit_scale = logit_scale - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.sliding_window = sliding_window - self.sliding_window_pattern = sliding_window_pattern - # Need to specify head_dim in the config so it can be used in the attention forward functions - self.head_dim = hidden_size // num_attention_heads - self.cache_implementation = cache_implementation - - # Validate the correctness of rotary position embeddings parameters - rope_config_validation(self) - - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - - -__all__ = ["Cohere2Config"] diff --git a/vllm/transformers_utils/configs/dbrx.py b/vllm/transformers_utils/configs/dbrx.py deleted file mode 100644 index 7dbda99f85a..00000000000 --- a/vllm/transformers_utils/configs/dbrx.py +++ /dev/null @@ -1,280 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# yapf: disable -# ruff: noqa: E501 -# coding=utf-8 -# Copied from -# https://huggingface.co/databricks/dbrx-base/blob/main/configuration_dbrx.py -"""Dbrx configuration.""" - -from typing import Any, Optional - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - -logger = logging.get_logger(__name__) - -DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = {} # type: ignore - - -class DbrxAttentionConfig(PretrainedConfig): - """Configuration class for Dbrx Attention. - - [`DbrxAttention`] class. It is used to instantiate attention layers - according to the specified arguments, defining the layers architecture. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. 
Read the - documentation from [`PretrainedConfig`] for more information. - - Args: - attn_pdrop (`float`, *optional*, defaults to 0.0): - The dropout probability for the attention layers. - clip_qkv (`float`, *optional*, defaults to None): - If not `None`, clip the queries, keys, and values in the attention layer to this value. - kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads. - rope_theta (float): The base frequency for rope. - """ - - def __init__( - self, - attn_pdrop: float = 0, - clip_qkv: Optional[float] = None, - kv_n_heads: int = 1, - rope_theta: float = 10000.0, - **kwargs: Any, - ): - super().__init__(**kwargs) - self.attn_pdrop = attn_pdrop - self.clip_qkv = clip_qkv - self.kv_n_heads = kv_n_heads - self.rope_theta = rope_theta - - for k in ["model_type"]: - if k in kwargs: - kwargs.pop(k) - if len(kwargs) != 0: - raise ValueError(f"Found unknown {kwargs=}") - - @classmethod - def from_pretrained( - cls, pretrained_model_name_or_path: str, **kwargs: Any - ) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict( - pretrained_model_name_or_path, **kwargs - ) - - if config_dict.get("model_type") == "dbrx": - config_dict = config_dict["attn_config"] - - if ( - "model_type" in config_dict - and hasattr(cls, "model_type") - and config_dict["model_type"] != cls.model_type - ): - logger.warning( - "You are using a model of type %s to instantiate a model of " - "type %s. This is not supported for all configurations of " - "models and can yield errors.", - config_dict["model_type"], cls.model_type) - - return cls.from_dict(config_dict, **kwargs) - - -class DbrxFFNConfig(PretrainedConfig): - """Configuration class for Dbrx FFN. - - [`DbrxFFN`] class. It is used to instantiate feedforward layers according to - the specified arguments, defining the layers architecture. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - Args: - ffn_act_fn (dict, optional): A dict specifying activation function for the FFN. - The dict should have a key 'name' with the value being the name of - the activation function along with any additional keyword arguments. - ffn_hidden_size (int, optional): The hidden size of the feedforward network. - moe_num_experts (int, optional): The number of experts in the mixture of experts layer. - moe_top_k (int, optional): The number of experts to use in the mixture of experts layer. - moe_jitter_eps (float, optional): The jitter epsilon for the mixture of experts layer. - moe_loss_weight (float, optional): The loss weight for the mixture of experts layer. - moe_normalize_expert_weights (float, optional): The normalization factor for the expert weights. - uniform_expert_assignment (bool, optional): Whether to use uniform expert assignment. - This should only be used for benchmarking purposes. 
- """ - - def __init__( - self, - ffn_act_fn: Optional[dict] = None, - ffn_hidden_size: int = 3584, - moe_num_experts: int = 4, - moe_top_k: int = 1, - moe_jitter_eps: Optional[float] = None, - moe_loss_weight: float = 0.01, - moe_normalize_expert_weights: Optional[float] = 1, - uniform_expert_assignment: bool = False, - **kwargs: Any, - ): - super().__init__() - if ffn_act_fn is None: - ffn_act_fn = {"name": "silu"} - self.ffn_act_fn = ffn_act_fn - self.ffn_hidden_size = ffn_hidden_size - self.moe_num_experts = moe_num_experts - self.moe_top_k = moe_top_k - self.moe_jitter_eps = moe_jitter_eps - self.moe_loss_weight = moe_loss_weight - self.moe_normalize_expert_weights = moe_normalize_expert_weights - self.uniform_expert_assignment = uniform_expert_assignment - - for k in ["model_type"]: - if k in kwargs: - kwargs.pop(k) - if len(kwargs) != 0: - raise ValueError(f"Found unknown {kwargs=}") - - @classmethod - def from_pretrained( - cls, pretrained_model_name_or_path: str, **kwargs: Any - ) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict( - pretrained_model_name_or_path, **kwargs - ) - - if config_dict.get("model_type") == "dbrx": - config_dict = config_dict["ffn_config"] - - if ( - "model_type" in config_dict - and hasattr(cls, "model_type") - and config_dict["model_type"] != cls.model_type - ): - logger.warning( - "You are using a model of type %s to instantiate a model of " - "type %s. This is not supported for all " - "configurations of models and can yield errors.", config_dict["model_type"], cls.model_type) - - return cls.from_dict(config_dict, **kwargs) - - -class DbrxConfig(PretrainedConfig): - """Configuration class for Dbrx. - - [`DbrxModel`]. It is used to instantiate a Dbrx model according to the - specified arguments, defining the model architecture. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - d_model (`int`, *optional*, defaults to 6144): - Dimensionality of the embeddings and hidden states. - n_heads (`int`, *optional*, defaults to 48): - Number of attention heads for each attention layer in the Transformer encoder. - n_layers (`int`, *optional*, defaults to 40): - Number of hidden layers in the Transformer encoder. - max_seq_len (`int`, *optional*, defaults to 32768): - The maximum sequence length of the model. - vocab_size (`int`, *optional*, defaults to 100352): - Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by - the `inputs_ids` passed when calling [`DbrxModel`]. - resid_pdrop (`float`, *optional*, defaults to 0.0): - The dropout probability applied to the attention output before combining with residual. - emb_pdrop (`float`, *optional*, defaults to 0.0): - The dropout probability for the embedding layer. - attn_config (`dict`, *optional*): - A dictionary used to configure the model's attention module. - ffn_config (`dict`, *optional*): - A dictionary used to configure the model's FFN module. - use_cache (`bool`, *optional*, defaults to `False`): - Whether or not the model should return the last key/values attentions (not used by all models). - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
- output_router_logits (`bool`, *optional*, defaults to `False`): - Whether or not the router logits should be returned by the model. Enabling this will also allow the model to output the auxiliary loss. - router_aux_loss_coef (`float`, *optional*, defaults to 0.001): - The aux loss factor for the total loss. - - - Example: - ```python - >>> from transformers import DbrxConfig, DbrxModel - - >>> # Initializing a Dbrx configuration - >>> configuration = DbrxConfig() - - >>> # Initializing a model (with random weights) from the configuration - >>> model = DbrxModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ``` - """ - - model_type = "dbrx" - attribute_map = { - "num_attention_heads": "n_heads", - "hidden_size": "d_model", - "num_hidden_layers": "n_layers", - "max_position_embeddings": "max_seq_len", - } - - def __init__( - self, - d_model: int = 2048, - n_heads: int = 16, - n_layers: int = 24, - max_seq_len: int = 2048, - vocab_size: int = 32000, - resid_pdrop: float = 0.0, - emb_pdrop: float = 0.0, - attn_config: Optional[DbrxAttentionConfig] = None, - ffn_config: Optional[DbrxFFNConfig] = None, - use_cache: bool = True, - initializer_range: float = 0.02, - output_router_logits: bool = False, - router_aux_loss_coef: float = 0.05, - **kwargs: Any, - ): - if attn_config is None: - self.attn_config = DbrxAttentionConfig() - elif isinstance(attn_config, dict): - self.attn_config = DbrxAttentionConfig(**attn_config) - else: - self.attn_config = attn_config - - if ffn_config is None: - self.ffn_config = DbrxFFNConfig() - elif isinstance(ffn_config, dict): - self.ffn_config = DbrxFFNConfig(**ffn_config) - else: - self.ffn_config = ffn_config - - self.d_model = d_model - self.n_heads = n_heads - self.n_layers = n_layers - self.max_seq_len = max_seq_len - self.vocab_size = vocab_size - self.resid_pdrop = resid_pdrop - self.emb_pdrop = emb_pdrop - self.use_cache = use_cache - self.initializer_range = initializer_range - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef - - tie_word_embeddings = kwargs.pop("tie_word_embeddings", False) - if tie_word_embeddings: - raise ValueError( - "tie_word_embeddings is not supported for Dbrx models." - ) - - super().__init__( - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) diff --git a/vllm/transformers_utils/configs/exaone.py b/vllm/transformers_utils/configs/exaone.py deleted file mode 100644 index 7450904a15c..00000000000 --- a/vllm/transformers_utils/configs/exaone.py +++ /dev/null @@ -1,190 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Copied from -# https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/configuration_exaone.py -# Copyright 2021 The LG AI Research EXAONE Lab. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Exaone model configuration""" - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - -logger = logging.get_logger(__name__) - -EXAONE_PRETRAINED_CONFIG_ARCHIVE_MAP: dict[str, str] = {} - - -class ExaoneConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a :class: - `~transformers.ExaoneModel`. It is used to instantiate a GPT Lingvo model - according to the specified arguments, defining the model architecture. - Instantiating a configuration with the defaults will yield a similar - configuration to that of the Exaone - - Configuration objects inherit from {class}`~transformers.PretrainedConfig` - and can be used to control the model outputs. Read the documentation from : - class:`~transformers.PretrainedConfig` for more information. - - Args: - vocab_size ({obj}`int`, `optional`, defaults to 50257): - Vocabulary size of the GPT Lingvo model. Defines the number of - different tokens that can be represented by the {obj}`inputs_ids` - passed when calling {class}`~transformers.ExaoneModel`. Vocabulary - size of the model. - Defines the different tokens that can be represented by the - `inputs_ids` passed to the forward method of :class: - `~transformers.EXAONEModel`. - hidden_size ({obj}`int`, `optional`, defaults to 2048): - Dimensionality of the encoder layers and the pooler layer. - num_layers ({obj}`int`, `optional`, defaults to 24): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the - Transformer decoder. - num_key_value_heads (`int`, *optional*): - This is the number of key_value heads that should be used to - implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi - Head Attention (MHA), if `num_key_value_heads=1 the model will use - Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, - each group key and value head should be constructed by meanpooling - all the original heads within that group. For more details checkout - [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not - specified, will default to `num_attention_heads`. - rotary_pct (`float`, *optional*, defaults to 0.25): - percentage of hidden dimensions to allocate to rotary embeddings - intermediate_size ({obj}`int`, `optional`, defaults to 8192): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in - the Transformer encoder. - activation_function ({obj}`str` or {obj}`function`, `optional`, - defaults to {obj}`"gelu_new"`): - The non-linear activation function (function or string) in the - encoder and pooler. If string, {obj}`"gelu"`, {obj}`"relu"`, - {obj}`"selu"` and {obj}`"gelu_new"` are supported. - embed_dropout ({obj}`float`, `optional`, defaults to 0.0): - The dropout probabilitiy for all fully connected layers in the - embeddings, encoder, and pooler. - attention_dropout ({obj}`float`, `optional`, defaults to 0.0): - The dropout ratio for the attention probabilities. - max_position_embeddings ({obj}`int`, `optional`, defaults to 2048): - The maximum sequence length that this model might ever be used with. - Typically set this to something large just in case - (e.g., 512 or 1024 or 2048). - type_vocab_size ({obj}`int`, `optional`, defaults to 2): - The vocabulary size of the {obj}`token_type_ids` passed when calling - {class}`~transformers.EXAONEModel`. 
- initializer_range ({obj}`float`, `optional`, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for - initializing all weight matrices. - layer_norm_epsilon ({obj}`float`, `optional`, defaults to 1e-5): - The epsilon used by the layer normalization layers. - use_cache ({obj}`bool`, `optional`, defaults to {obj}`True`): - Whether or not the model should return the last key/values - attentions (not used by all models). - Only relevant if ``config.is_decoder=True``. - gradient_checkpointing ({obj}`bool`, `optional`, - defaults to {obj}`False`): - If True, use gradient checkpointing to save memory at the expense - of slower backward pass. - Example:: - - >>> from transformers import ExoneModel, ExaoneConfig - - >>> # Initializing a EXAONE configuration - >>> configuration = ExaoneConfig() - - >>> # Initializing a model from configuration - >>> model = ExoneModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - """ - - model_type = "exaone" - keys_to_ignore_at_inference = ["past_key_values"] - attribute_map = {"num_hidden_layers": "num_layers"} - - def __init__( - self, - vocab_size=102400, - max_position_embeddings=2048, - hidden_size=2048, - num_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - intermediate_size=None, - activation_function="silu", - rotary_pct=0.25, - resid_dropout=0.0, - embed_dropout=0.0, - attention_dropout=0.0, - layer_norm_epsilon=1e-6, - initializer_range=0.02, - use_cache=True, - bos_token_id=0, - eos_token_id=2, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__( - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.num_layers = num_layers - self.num_attention_heads = num_attention_heads - self.num_hidden_layers = num_layers - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - if intermediate_size: - self.intermediate_size = intermediate_size - else: - self.intermediate_size = hidden_size * 4 - self.activation_function = activation_function - self.resid_dropout = resid_dropout - self.embed_dropout = embed_dropout - self.attention_dropout = attention_dropout - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - self.use_cache = use_cache - self.rotary_pct = rotary_pct - - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - - self.use_logit_cap = kwargs.pop("use_logit_cap", False) - self.ln_no_scale = kwargs.pop("ln_no_scale", False) - self.use_gated = kwargs.pop("use_gated", False) - self.use_emb_norm = kwargs.pop("use_emb_norm", False) - self.use_rotary_pos = kwargs.pop("use_rotary_pos", False) - self.rotary_type = kwargs.pop("rotary_type", None) - self.scaling_factor = kwargs.pop("scaling_factor", 1) - self.use_absolute_pos = kwargs.pop("use_absolute_pos", True) - self.use_extra_logit = kwargs.pop("use_extra_logit", True) - self.rotary_expand_length = kwargs.pop("rotary_expand_length", None) - self.rotary_base = kwargs.pop("rotary_base", 10000.0) - self.use_qkv_fuse = kwargs.pop("use_qkv_fuse", False) - self.rescale_before_lm_head = kwargs.pop("rescale_before_lm_head", - (rotary_pct == 0.25)) - if self.use_rotary_pos: - self.use_absolute_pos = False diff --git a/vllm/transformers_utils/configs/exaone4.py 
b/vllm/transformers_utils/configs/exaone4.py deleted file mode 100644 index a22ebaa6bd6..00000000000 --- a/vllm/transformers_utils/configs/exaone4.py +++ /dev/null @@ -1,252 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# ruff: noqa: E501 - -# Copied from -# https://github.com/lgai-exaone/transformers/blob/add-exaone4/src/transformers/models/exaone4/configuration_exaone4.py -# Copyright 2025 The LG CNS Gen AI Solution Delivery Team. -# Copyright 2025 The LG AI Research and HuggingFace Inc. team. All rights reserved. -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from transformers.configuration_utils import (PretrainedConfig, - layer_type_validation) -from transformers.utils import logging - -logger = logging.get_logger(__name__) - - -def check_is_sliding(config, layer_idx): - """ - Check if the current layer is a sliding window attention (local attention) layer. - """ - if config.sliding_window is None: - return False - if config.layer_types is not None: - return config.layer_types[layer_idx] == "sliding_attention" - if isinstance(config.sliding_window_pattern, int): - return ((layer_idx + 1) % config.sliding_window_pattern) != 0 - elif isinstance(config.sliding_window_pattern, str): - assert isinstance(config.sliding_window, int), ( - f"Sliding window must be positive integer, but got {config.sliding_window}" - ) - return (layer_idx != config.num_hidden_layers - 1 - and config.sliding_window_pattern[layer_idx % len( - config.sliding_window_pattern)] == "L") - else: - logger.warning_once( - "Sliding window is set, but none of `sliding_window_pattern` or `layer_types` is set. " - "Defaulting to use 'full_attention' for all layers.") - return False - - -class Exaone4Config(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`Exaone4Model`]. It is used to - instantiate a EXAONE 4.0 model according to the specified arguments, defining the model architecture. Instantiating a - configuration with the defaults will yield a similar configuration to that of the EXAONE-4.0-Instruct [LGAI-EXAONE/EXAONE-4.0-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-Instruct) - NOTE: `EXAONE-4.0-Instruct` is a placeholder model ID. The exact model ID will be updated in the future. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model - outputs. Read the documentation from [`PretrainedConfig`] for more information. - - Args: - vocab_size (`int`, *optional*, defaults to 102400): - Vocabulary size of the EXAONE 4.0 model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`Exaone4Model`]. - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to `hidden_size * 4`): - Dimensionality of the MLP representations. 
- num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer decoder. - num_key_value_heads (`int`, *optional*): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to - `num_attention_heads`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 2048): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 32768 for EXAONE 3.5). - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-05): - The epsilon used by the layer normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if ``config.is_decoder=True``. - bos_token_id (`int`, *optional*, defaults to 0): - Beginning of stream token id. - eos_token_id (`int`, *optional*, defaults to 2): - End of stream token id. - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. 
Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - sliding_window (`int`, *optional*): - The size of the sliding window for the sliding window attention. - sliding_window_pattern (`str`, *optional*): - The pattern to use for sliding window attention. Can be one of: - - `None`: No sliding window attention is used - - `int`: Every `sliding_window` layers, use global attention, else use local attention. - - `str`: A sequence of "L" (local attention) and "G" (global attention) characters that defines the - attention pattern. The pattern starts from layer 0 and repeats every `sliding_window` layers. The - final layer always uses global attention regardless of the pattern. - For instance, sliding_window_pattern="LLLG" same as sliding_window=4, which means: - - Layer 0, 1, 2: local attention, - - Layer 3: global attention, - ...(repeated) - layer_types (`list`, *optional*): - Attention pattern for each layer. Prioritized over `sliding_window_pattern`. 
- - Example: - - ```python - >>> from transformers import Exaone4Model, Exaone4Config - - >>> # Initializing a EXAONE configuration - >>> configuration = Exaone4Config() - - >>> # Initializing a model from configuration - >>> model = Exaone4Model(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "exaone4" - keys_to_ignore_at_inference = ["past_key_values"] - # Default tensor parallel plan for base model `LlamaModel` - base_model_tp_plan = { - "layers.*.self_attn.q_proj": "colwise", - "layers.*.self_attn.k_proj": "colwise", - "layers.*.self_attn.v_proj": "colwise", - "layers.*.self_attn.o_proj": "rowwise", - "layers.*.mlp.gate_proj": "colwise", - "layers.*.mlp.up_proj": "colwise", - "layers.*.mlp.down_proj": "rowwise", - } - base_model_pp_plan = { - "embed_tokens": (["input_ids"], ["inputs_embeds"]), - "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), - "norm": (["hidden_states"], ["hidden_states"]), - } - - def __init__( - self, - vocab_size=102400, - hidden_size=4096, - intermediate_size=None, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - bos_token_id=0, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_dropout=0.0, - sliding_window=None, - sliding_window_pattern=None, - layer_types=None, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - if intermediate_size: - self.intermediate_size = intermediate_size - else: - self.intermediate_size = hidden_size * 4 - self.hidden_act = hidden_act - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling - self.sliding_window = sliding_window - self.sliding_window_pattern = sliding_window_pattern - - self.layer_types = layer_types - if self.layer_types is None: - self.layer_types = [ - "sliding_attention" - if check_is_sliding(self, i) else "full_attention" - for i in range(self.num_hidden_layers) - ] - layer_type_validation(self.layer_types) - - super().__init__(bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs) - - -__all__ = ["Exaone4Config"] diff --git a/vllm/transformers_utils/configs/minimax_text_01.py b/vllm/transformers_utils/configs/minimax_text_01.py deleted file mode 100644 index e3b63dfa003..00000000000 --- a/vllm/transformers_utils/configs/minimax_text_01.py +++ /dev/null @@ -1,70 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" MiniMaxText01 model configuration""" - -from transformers.configuration_utils import PretrainedConfig - - -class MiniMaxText01Config(PretrainedConfig): - model_type = "MiniMaxText01" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=14336, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=8, - hidden_act="silu", - 
max_position_embeddings=4096 * 32, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - pad_token_id=None, - bos_token_id=None, - eos_token_id=None, - tie_word_embeddings=False, - rope_theta=1e6, - sliding_window=None, - attention_dropout=0.0, - num_experts_per_tok=2, - num_local_experts=8, - output_router_logits=False, - router_aux_loss_coef=0.001, - router_jitter_noise=0.0, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.sliding_window = sliding_window - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.rope_theta = rope_theta - self.attention_dropout = attention_dropout - - self.num_experts_per_tok = num_experts_per_tok - self.num_local_experts = num_local_experts - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef - self.router_jitter_noise = router_jitter_noise - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) diff --git a/vllm/transformers_utils/configs/minimax_vl_01.py b/vllm/transformers_utils/configs/minimax_vl_01.py deleted file mode 100644 index c62497192cc..00000000000 --- a/vllm/transformers_utils/configs/minimax_vl_01.py +++ /dev/null @@ -1,71 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""MiniMaxVL01 model configuration""" - -from transformers.configuration_utils import PretrainedConfig -from transformers.models.auto import CONFIG_MAPPING - -from .minimax_text_01 import MiniMaxText01Config - - -class MiniMaxVL01Config(PretrainedConfig): - model_type = "minimax_vl_01" - - def __init__( - self, - vision_config=None, - text_config=None, - ignore_index=-100, - image_token_index=32000, - projector_hidden_act="gelu", - vision_feature_select_strategy="default", - vision_feature_layer=-2, - image_grid_pinpoints=None, - tie_word_embeddings=False, - image_seq_length=576, - **kwargs, - ): - self.ignore_index = ignore_index - self.image_token_index = image_token_index - self.projector_hidden_act = projector_hidden_act - self.image_seq_length = image_seq_length - - if vision_feature_select_strategy not in ["default", "full"]: - raise ValueError("vision_feature_select_strategy should " + - "be one of 'default', 'full'." 
+ - f"Got: {vision_feature_select_strategy}") - - self.vision_feature_select_strategy = vision_feature_select_strategy - self.vision_feature_layer = vision_feature_layer - image_grid_pinpoints = ( - image_grid_pinpoints if image_grid_pinpoints is not None else - [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]) - self.image_grid_pinpoints = image_grid_pinpoints - - if isinstance(vision_config, dict): - if "model_type" not in vision_config: - vision_config["model_type"] = "clip_vision_model" - vision_config = CONFIG_MAPPING[vision_config["model_type"]]( - **vision_config) - elif vision_config is None: - vision_config = CONFIG_MAPPING["clip_vision_model"]( - intermediate_size=4096, - hidden_size=1024, - patch_size=14, - image_size=336, - num_hidden_layers=24, - num_attention_heads=16, - vocab_size=32000, - projection_dim=768, - ) - - self.vision_config = vision_config - - if text_config is not None: - text_config = MiniMaxText01Config(**text_config) - else: - text_config = MiniMaxText01Config() - - self.text_config = text_config - - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/vllm/transformers_utils/configs/mpt.py b/vllm/transformers_utils/configs/mpt.py deleted file mode 100644 index 91316408dcd..00000000000 --- a/vllm/transformers_utils/configs/mpt.py +++ /dev/null @@ -1,180 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Copied from -# https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py -"""A HuggingFace-style model configuration.""" -import warnings -from typing import Any, Optional, Union - -from transformers import PretrainedConfig - -attn_config_defaults: dict = { - 'attn_type': 'multihead_attention', - 'attn_pdrop': 0.0, - 'attn_impl': 'triton', - 'qk_ln': False, - 'clip_qkv': None, - 'softmax_scale': None, - 'prefix_lm': False, - 'attn_uses_sequence_id': False, - 'alibi': False, - 'alibi_bias_max': 8 -} -ffn_config_defaults: dict = {'ffn_type': 'mptmlp'} -init_config_defaults: dict = { - 'name': 'kaiming_normal_', - 'fan_mode': 'fan_in', - 'init_nonlinearity': 'relu', - 'init_div_is_residual': True, - 'emb_init_std': None, - 'emb_init_uniform_lim': None, - 'init_std': None, - 'init_gain': 0.0 -} - - -class MPTConfig(PretrainedConfig): - model_type = 'mpt' - attribute_map = { - 'num_attention_heads': 'n_heads', - 'hidden_size': 'd_model', - 'num_hidden_layers': 'n_layers', - } - - # pylint: disable=dangerous-default-value - def __init__(self, - d_model: int = 2048, - n_heads: int = 16, - n_layers: int = 24, - expansion_ratio: int = 4, - max_seq_len: int = 2048, - vocab_size: int = 50368, - resid_pdrop: float = 0.0, - emb_pdrop: float = 0.0, - learned_pos_emb: bool = True, - attn_config: dict = attn_config_defaults, - ffn_config: dict = ffn_config_defaults, - init_device: str = 'cpu', - logit_scale: Optional[Union[float, str]] = None, - no_bias: bool = False, - embedding_fraction: float = 1.0, - norm_type: str = 'low_precision_layernorm', - use_cache: bool = False, - init_config: dict = init_config_defaults, - fc_type: str = 'torch', - verbose: Optional[int] = None, - **kwargs: Any): - self.d_model = d_model - self.n_heads = n_heads - self.n_layers = n_layers - self.expansion_ratio = expansion_ratio - self.max_seq_len = max_seq_len - self.vocab_size = vocab_size - self.resid_pdrop = resid_pdrop - self.emb_pdrop = emb_pdrop - self.learned_pos_emb = learned_pos_emb - self.attn_config = attn_config - self.ffn_config = ffn_config - self.init_device 
= init_device - self.logit_scale = logit_scale - self.no_bias = no_bias - self.embedding_fraction = embedding_fraction - self.norm_type = norm_type - self.use_cache = use_cache - self.init_config = init_config - self.fc_type = fc_type - if verbose is not None: - warnings.warn(DeprecationWarning( - 'verbose argument for MPTConfig is now ignored and ' - 'will be removed. Use python_log_level instead.'), - stacklevel=2) - if 'name' in kwargs: - del kwargs['name'] - if 'loss_fn' in kwargs: - del kwargs['loss_fn'] - if self.attn_config.get('alibi', False): - self.learned_pos_emb = False - warnings.warn( - f'alibi is turned on, setting `learned_pos_emb` ' - f'to {self.learned_pos_emb}`', - stacklevel=2) - super().__init__(**kwargs) - self._validate_config() - - def _set_config_defaults( - self, config: dict[str, Any], - config_defaults: dict[str, Any]) -> dict[str, Any]: - for (k, v) in config_defaults.items(): - if k not in config: - config[k] = v - return config - - def _validate_config(self) -> None: - self.attn_config = self._set_config_defaults(self.attn_config, - attn_config_defaults) - self.ffn_config = self._set_config_defaults(self.ffn_config, - ffn_config_defaults) - self.init_config = self._set_config_defaults(self.init_config, - init_config_defaults) - if self.d_model % self.n_heads != 0: - raise ValueError('d_model must be divisible by n_heads') - if any( - prob < 0 or prob > 1 for prob in - [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop - ]): - raise ValueError( - "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are " - "probabilities and must be between 0 and 1") - if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']: - raise ValueError( - f"Unknown attn_impl={self.attn_config['attn_impl']}") - if self.attn_config['prefix_lm'] and self.attn_config[ - 'attn_impl'] not in ['torch', 'triton']: - raise NotImplementedError( - 'prefix_lm only implemented with torch and triton attention.') - if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in [ - 'torch', 'triton' - ]: - raise NotImplementedError( - 'alibi only implemented with torch and triton attention.') - if self.attn_config['attn_uses_sequence_id'] and self.attn_config[ - 'attn_impl'] not in ['torch', 'triton']: - raise NotImplementedError( - 'attn_uses_sequence_id only implemented with torch ' - 'and triton attention.') - if self.embedding_fraction > 1 or self.embedding_fraction <= 0: - raise ValueError( - 'model.embedding_fraction must be between 0 (exclusive) ' - 'and 1 (inclusive)!') - if isinstance(self.logit_scale, - str) and self.logit_scale != 'inv_sqrt_d_model': - raise ValueError( - f"self.logit_scale={self.logit_scale!r} is not recognized as " - "an option; use numeric value or 'inv_sqrt_d_model'.") - if self.init_config.get('name', None) is None: - raise ValueError( - f"self.init_config={self.init_config!r} 'name' needs to be set." - ) - if not self.learned_pos_emb and (not self.attn_config['alibi']): - warnings.warn( - 'Positional information not being provided to the model.', - stacklevel=2) - if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp': - try: - # pylint: disable=import-outside-toplevel - import transformer_engine.pytorch as te - del te - except Exception as exc: - raise ImportError( - 'TransformerEngine import fail. `fc_type: te` requires ' - 'TransformerEngine be installed. 
' - 'The required version of transformer_engine also requires ' - 'FlashAttention v1.0.6 is installed:\n' - 'pip install flash-attn==1.0.6 --no-build-isolation \n' - 'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156' - ) from exc - if self.ffn_config['ffn_type'] == 'mptmlp': - self.ffn_config['fc_type'] = self.fc_type - elif self.ffn_config['ffn_type'] == 'te_ln_mlp': - self.ffn_config['bias'] = not self.no_bias diff --git a/vllm/transformers_utils/configs/nvlm_d.py b/vllm/transformers_utils/configs/nvlm_d.py deleted file mode 100644 index edfc506882f..00000000000 --- a/vllm/transformers_utils/configs/nvlm_d.py +++ /dev/null @@ -1,31 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Adapted from -# https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py -# -------------------------------------------------------- -# NVLM-D -# Copyright (c) 2024 NVIDIA -# Licensed under Apache 2.0 License [see LICENSE for details] -# -------------------------------------------------------- -from transformers import Qwen2Config -from transformers.configuration_utils import PretrainedConfig - - -class NVLM_D_Config(PretrainedConfig): - model_type = 'NVLM_D' - is_composition = True - - def __init__(self, vision_config=None, llm_config=None, **kwargs): - super().__init__(**kwargs) - - # Handle vision_config initialization - if vision_config is None: - vision_config = {} - - # Handle llm_config initialization - if llm_config is None: - llm_config = {} - - self.vision_config = PretrainedConfig(**vision_config) - self.text_config = Qwen2Config(**llm_config) diff --git a/vllm/transformers_utils/configs/ovis.py b/vllm/transformers_utils/configs/ovis.py deleted file mode 100644 index 021d402a71f..00000000000 --- a/vllm/transformers_utils/configs/ovis.py +++ /dev/null @@ -1,184 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# yapf: disable -# ruff: noqa: E501 -# copied from https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_aimv2.py -# and https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_ovis.py -from typing import Any, Optional, Union - -from transformers import AutoConfig, PretrainedConfig - - -class AIMv2Config(PretrainedConfig): - """This is the configuration class to store the configuration of an [`AIMv2Model`]. - - Instantiating a configuration with the defaults will yield a similar configuration - to that of the [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224). - - Args: - hidden_size: Dimension of the hidden representations. - intermediate_size: Dimension of the SwiGLU representations. - num_hidden_layers: Number of hidden layers in the Transformer. - num_attention_heads: Number of attention heads for each attention layer - in the Transformer. - num_channels: Number of input channels. - image_size: Image size. - patch_size: Patch size. - rms_norm_eps: Epsilon value used for the RMS normalization layer. - attention_dropout: Dropout ratio for attention probabilities. - projection_dropout: Dropout ratio for the projection layer after the attention. - qkv_bias: Whether to add a bias to the queries, keys and values. - use_bias: Whether to add a bias in the feed-forward and projection layers. - kwargs: Keyword arguments for the [`PretrainedConfig`]. 
- """ - - model_type: str = "aimv2" - - def __init__( - self, - hidden_size: int = 1024, - intermediate_size: int = 2816, - num_hidden_layers: int = 24, - num_attention_heads: int = 8, - num_channels: int = 3, - image_size: int = 224, - patch_size: int = 14, - rms_norm_eps: float = 1e-5, - attention_dropout: float = 0.0, - projection_dropout: float = 0.0, - qkv_bias: bool = False, - use_bias: bool = False, - **kwargs: Any, - ): - super().__init__(**kwargs) - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.attention_dropout = attention_dropout - self.rms_norm_eps = rms_norm_eps - - self.projection_dropout = projection_dropout - self.qkv_bias = qkv_bias - self.use_bias = use_bias - - -IGNORE_ID = -100 -IMAGE_TOKEN_ID = -200 -IMAGE_TOKEN = "" -IMAGE_ATOM_ID = -300 -IMAGE_INDICATOR_IDS = [-301, -302, -303, -304, -305] - - -# ---------------------------------------------------------------------- -# Visual Tokenizer Configuration -# ---------------------------------------------------------------------- -class BaseVisualTokenizerConfig(PretrainedConfig): - - def __init__(self, - vocab_size=16384, - tokenize_function="softmax", - tau=1.0, - depths=None, - drop_cls_token=False, - backbone_config: Optional[Union[PretrainedConfig, - dict]] = None, - hidden_stride: int = 1, - **kwargs): - super().__init__(**kwargs) - self.vocab_size = vocab_size - self.tokenize_function = tokenize_function - self.tau = tau - if isinstance(depths, str): - depths = [int(x) for x in depths.split('|')] - self.depths = depths - self.backbone_kwargs = dict[str, Any]() - self.drop_cls_token = drop_cls_token - if backbone_config is not None: - assert isinstance(backbone_config, (PretrainedConfig, dict)), \ - f"expect `backbone_config` to be instance of PretrainedConfig or dict, but got {type(backbone_config)} type" - if not isinstance(backbone_config, PretrainedConfig): - model_type = backbone_config['model_type'] - if model_type != "aimv2": - backbone_config.pop('model_type') - backbone_config = AutoConfig.for_model(model_type, **backbone_config) - else: - backbone_config = AIMv2Config(**backbone_config) - self.backbone_config = backbone_config - self.hidden_stride = hidden_stride - - -class Aimv2VisualTokenizerConfig(BaseVisualTokenizerConfig): - model_type = "aimv2_visual_tokenizer" - - def __init__(self, **kwargs): - super().__init__(**kwargs) - if self.drop_cls_token: - self.drop_cls_token = False - if self.depths: - assert len(self.depths) == 1 - self.backbone_kwargs['num_hidden_layers'] = self.depths[0] - - -class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig): - model_type = "siglip_visual_tokenizer" - - def __init__(self, **kwargs): - super().__init__(**kwargs) - if self.drop_cls_token: - self.drop_cls_token = False - if self.depths: - assert len(self.depths) == 1 - self.backbone_kwargs['num_hidden_layers'] = self.depths[0] - - -AutoConfig.register("siglip_visual_tokenizer", SiglipVisualTokenizerConfig) -AutoConfig.register("aimv2_visual_tokenizer", Aimv2VisualTokenizerConfig) - - -# ---------------------------------------------------------------------- -# Ovis Configuration -# ---------------------------------------------------------------------- -class OvisConfig(PretrainedConfig): - model_type = "ovis" - - def __init__(self, - llm_config: Optional[Union[PretrainedConfig, dict]] = 
None, - visual_tokenizer_config: Optional[Union[PretrainedConfig, - dict]] = None, - multimodal_max_length=8192, - hidden_size=None, - conversation_formatter_class=None, - llm_attn_implementation=None, - disable_tie_weight=False, - **kwargs): - super().__init__(**kwargs) - if llm_config is not None: - assert isinstance(llm_config, (PretrainedConfig, dict)), \ - f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type" - if not isinstance(llm_config, PretrainedConfig): - model_type = llm_config['model_type'] - llm_config.pop('model_type') - llm_config = AutoConfig.for_model(model_type, **llm_config) - - # map llm_config to text_config - self.text_config = llm_config - if visual_tokenizer_config is not None: - assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), \ - f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type" - if not isinstance(visual_tokenizer_config, PretrainedConfig): - model_type = visual_tokenizer_config['model_type'] - visual_tokenizer_config.pop('model_type') - visual_tokenizer_config = AutoConfig.for_model( - model_type, **visual_tokenizer_config) - - self.visual_tokenizer_config = visual_tokenizer_config - self.multimodal_max_length = multimodal_max_length - self.hidden_size = hidden_size - self.conversation_formatter_class = conversation_formatter_class - self.llm_attn_implementation = llm_attn_implementation - self.disable_tie_weight = disable_tie_weight diff --git a/vllm/transformers_utils/configs/skyworkr1v.py b/vllm/transformers_utils/configs/skyworkr1v.py deleted file mode 100644 index 33a45220e31..00000000000 --- a/vllm/transformers_utils/configs/skyworkr1v.py +++ /dev/null @@ -1,54 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Adapted from -# https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/configuration_skywork_chat.py -# -------------------------------------------------------- -# SkyworkR1V -# Copyright (c) 2025 Skywork -# Licensed under The MIT License [see LICENSE for details] -# -------------------------------------------------------- -from transformers.configuration_utils import PretrainedConfig - - -class SkyworkR1VChatConfig(PretrainedConfig): - model_type = 'internvl_chat' - is_composition = True - - def __init__(self, - vision_config=None, - llm_config=None, - use_backbone_lora=0, - use_llm_lora=0, - select_layer=-1, - force_image_size=None, - downsample_ratio=0.5, - template=None, - dynamic_image_size=False, - use_thumbnail=False, - ps_version='v1', - min_dynamic_patch=1, - max_dynamic_patch=6, - **kwargs): - super().__init__(**kwargs) - - if vision_config is None: - vision_config = {} - - if llm_config is None: - llm_config = {} - - self.vision_config = PretrainedConfig(**vision_config) - self.text_config = PretrainedConfig(**llm_config) - - self.use_backbone_lora = use_backbone_lora - self.use_llm_lora = use_llm_lora - self.select_layer = select_layer - self.force_image_size = force_image_size - self.downsample_ratio = downsample_ratio - self.template = template - self.dynamic_image_size = dynamic_image_size - self.use_thumbnail = use_thumbnail - self.ps_version = ps_version # pixel shuffle version - self.min_dynamic_patch = min_dynamic_patch - self.max_dynamic_patch = max_dynamic_patch diff --git a/vllm/transformers_utils/configs/solar.py b/vllm/transformers_utils/configs/solar.py deleted file mode 100644 index a83dfa40b43..00000000000 --- 
a/vllm/transformers_utils/configs/solar.py +++ /dev/null @@ -1,247 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Solar model configuration""" - -from transformers import PretrainedConfig -from transformers.utils import logging - -logger = logging.get_logger(__name__) - - -class SolarConfig(PretrainedConfig): - r""" - This is the configuration class to store - the configuration of a [`SolarModel`]. - It is used to instantiate an LLaMA model - according to the specified arguments, - defining the model architecture. - Instantiating a configuration with the - defaults will yield a similar - configuration to that of the LLaMA-7B. - Configuration objects inherit from [`PretrainedConfig`] - and can be used to control the model outputs. - Read the documentation from [`PretrainedConfig`] for more information. - Args: - vocab_size (`int`, *optional*, defaults to 32000): - Vocabulary size of the LLaMA model. - Defines the number of different tokens - that can be represented by the `inputs_ids` - passed when calling [`SolarModel`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 11008): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer decoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer - in the Transformer decoder. - num_key_value_heads (`int`, *optional*): - This is the number of key_value heads that - should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, - the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1` the model - will use Multi Query Attention (MQA) - otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, - each group key and value head should be constructed - by meanpooling all the original heads within that group. - For more details checkout [this paper] - (https://arxiv.org/pdf/2305.13245.pdf). - If it is not specified, will default to - `num_attention_heads`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) - in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 2048): - The maximum sequence length that this model might ever be used with. - Solar 1 supports up to 2048 tokens, - Solar 2 up to 4096, CodeSolar up to 16384. 
- initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of - the truncated_normal_initializer for initializing - all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return - the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - pad_token_id (`int`, *optional*): - Padding token id. - bos_token_id (`int`, *optional*, defaults to 1): - Beginning of stream token id. - eos_token_id (`int`, *optional*, defaults to 2): - End of stream token id. - pretraining_tp (`int`, *optional*, defaults to 1): - Experimental feature. Tensor parallelism rank - used during pretraining. - Please refer to [this - document](https://huggingface.co/docs/ - transformers/main/ - perf_train_gpu_many#tensor-parallelism) - to understand more about it. This value is - necessary to ensure exact reproducibility - of the pretraining results. - Please refer to [this - issue](https://github.com/pytorch/pytorch/issues/76232). - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`dict`, *optional*): - Dictionary containing the scaling configuration for - the RoPE embeddings. - Currently supports two scaling - strategies: linear and dynamic. - Their scaling factor must be a float greater than 1. - The expected format is - `{"type": strategy name, "factor": scaling factor}`. - When using this flag, don't update - `max_position_embeddings` to the expected new maximum. - See the following thread for more information on how - these scaling strategies behave: - https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/ - dynamically_scaled_rope_further_increases/. This is an - experimental feature, subject to breaking - API changes in future versions. - attention_bias (`bool`, *optional*, defaults to `False`): - Whether to use a bias in the query, key, value - and output projection layers during self-attention. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - mlp_bias (`bool`, *optional*, defaults to `False`): - Whether to use a bias in up_proj, down_proj and gate_proj - layers in the MLP layers. - sliding_window (`int`, *optional*, defaults to 2047): - Sliding window attention window size. If not specified, - will default to `2047`. 
- ```python - >>> from transformers import SolarModel, SolarConfig - >>> # Initializing a Solar-pro style configuration - >>> configuration = SolarConfig() - >>> # Initializing a model from the Solar-pro style configuration - >>> model = SolarModel(configuration) - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "solar" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - pretraining_tp=1, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - mlp_bias=False, - sliding_window=2047, - bskcn_1=None, - bskcn_2=None, - bskcn_3=None, - bskcn_4=None, - bskcn_tv=None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.pretraining_tp = pretraining_tp - self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling - self._rope_scaling_validation() - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.mlp_bias = mlp_bias - self.sliding_window = sliding_window - self.bskcn_1 = bskcn_1 if bskcn_1 is not None else [12, 20, 32, 44] - self.bskcn_2 = bskcn_2 if bskcn_2 is not None else [20, 32] - self.bskcn_3 = bskcn_3 if bskcn_3 is not None else [16, 24, 36, 48] - self.bskcn_4 = bskcn_4 if bskcn_4 is not None else [28, 40] - self.bskcn_tv = bskcn_tv if bskcn_tv is not None else [0.9, 0.8] - - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - - def _rope_scaling_validation(self): - """ - Validate the `rope_scaling` configuration. 
- """ - if self.rope_scaling is None: - return - - if (not isinstance(self.rope_scaling, dict) - or len(self.rope_scaling) != 2): - raise ValueError( - "`rope_scaling` must be a dictionary with two fields," - " `type` and `factor`, " - f"got {self.rope_scaling}") - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_factor = self.rope_scaling.get("factor", None) - if rope_scaling_type is None or rope_scaling_type not in [ - "linear", - "dynamic", - ]: - raise ValueError(f"`rope_scaling`'s type field must be one of " - f"['linear', 'dynamic'], got {rope_scaling_type}") - if (rope_scaling_factor is None - or not isinstance(rope_scaling_factor, float) - or rope_scaling_factor <= 1.0): - raise ValueError( - f"`rope_scaling`'s factor field must be a float > 1," - f" got {rope_scaling_factor}") diff --git a/vllm/transformers_utils/configs/telechat2.py b/vllm/transformers_utils/configs/telechat2.py deleted file mode 100644 index 050a7851d14..00000000000 --- a/vllm/transformers_utils/configs/telechat2.py +++ /dev/null @@ -1,64 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# adapted from https://www.modelscope.cn/models/TeleAI/TeleChat2-3B/resolve/master/configuration_telechat2.py -""" Telechat configuration compatible with LlamaConfig. """ - -from transformers.configuration_utils import PretrainedConfig - - -class Telechat2Config(PretrainedConfig): - - model_type = "telechat" - keys_to_ignore_at_inference = ["past_key_values"] - attribute_map = { - "num_hidden_layers": "n_layer", - "num_attention_heads": "n_head", - "intermediate_size": "ffn_hidden_size", - "rms_norm_eps": "layer_norm_epsilon" - } - - def __init__( - self, - vocab_size=160256, - hidden_size=4096, - n_layer=30, - n_head=32, - layer_norm_epsilon=1e-5, - initializer_range=0.02, - use_cache=True, - bos_token_id=1, - eos_token_id=2, - apply_residual_connection_post_layernorm=False, - hidden_dropout=0.0, - attention_dropout=0.0, - ffn_hidden_size=12288, - training_seqlen=8192, - logn=True, - embed_layernorm=False, - hidden_act="silu", - **kwargs, - ): - self.vocab_size = vocab_size - n_embed = kwargs.pop("n_embed", None) - self.hidden_size = hidden_size if n_embed is None else n_embed - self.n_layer = n_layer - self.n_head = n_head - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - self.use_cache = use_cache - self.apply_residual_connection_post_layernorm = ( - apply_residual_connection_post_layernorm) - self.hidden_dropout = hidden_dropout - self.attention_dropout = attention_dropout - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.logn = logn - self.training_seqlen = training_seqlen - self.embed_layernorm = embed_layernorm - self.num_key_value_heads = kwargs.pop("num_key_value_heads", None) - self.ffn_hidden_size = ffn_hidden_size - self.hidden_act = hidden_act - super().__init__(bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - **kwargs) diff --git a/vllm/transformers_utils/processors/__init__.py b/vllm/transformers_utils/processors/__init__.py index 14d15f2bc16..eca4d7c884d 100644 --- a/vllm/transformers_utils/processors/__init__.py +++ b/vllm/transformers_utils/processors/__init__.py @@ -1,5 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Multi-modal processors may be defined in this directory for the following +reasons: + +- There is no processing file defined by HF Hub or Transformers 
library. +- There is a need to override the existing processor to support vLLM. +""" from vllm.transformers_utils.processors.deepseek_vl2 import ( DeepseekVLV2Processor)
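
For reference, a minimal, hypothetical sketch of the two cases this docstring lists, using only public Transformers APIs; the `load_processor` helper, its exception handling, and the fallback wiring are assumptions for illustration, not code from this patch:

```python
from transformers import AutoProcessor

from vllm.transformers_utils.processors import DeepseekVLV2Processor


def load_processor(model_name: str):
    """Illustrative only: prefer an upstream processor, else a local one."""
    try:
        # Case 1: HF Hub or the Transformers library already ships a
        # processing file, so AutoProcessor can resolve it directly.
        return AutoProcessor.from_pretrained(model_name,
                                             trust_remote_code=True)
    except (OSError, ValueError):
        # Case 2: no usable processing file is defined upstream, so fall
        # back to a processor defined in this package.
        return DeepseekVLV2Processor.from_pretrained(model_name)
```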