[Misc] Remove redundant config definitions (#21891)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
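The whole diff follows one pattern: model code that only reads generic HF config attributes is re-annotated with `transformers.PretrainedConfig` instead of a vLLM-specific config class (AIMv2Config, DbrxConfig, ExaoneConfig, Exaone4Config, MPTConfig, OvisConfig, ...), which is what lets the duplicated config modules be deleted. A minimal sketch of the idea, assuming only that the loaded config exposes `hidden_size` and `intermediate_size`; the class below is illustrative and not part of vLLM:

```python
import torch.nn as nn
from transformers import PretrainedConfig


class ExampleSwiGLUFFN(nn.Module):
    """Toy FFN: only generic attributes are read from the config, so the
    broad ``PretrainedConfig`` annotation is enough (illustrative only)."""

    def __init__(self, config: PretrainedConfig) -> None:
        super().__init__()
        # hidden_size / intermediate_size are present on most HF text configs.
        self.gate_up = nn.Linear(config.hidden_size,
                                 2 * config.intermediate_size)
        self.down = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, x):
        gate, up = self.gate_up(x).chunk(2, dim=-1)
        return self.down(nn.functional.silu(gate) * up)
```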
@@ -8,6 +8,7 @@ from typing import Optional
 
 import torch
 import torch.nn as nn
+from transformers import PretrainedConfig
 
 from vllm.attention.layer import MultiHeadAttention
 from vllm.distributed import get_tensor_model_parallel_world_size
@@ -20,13 +21,12 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.transformers_utils.configs.ovis import AIMv2Config
 
 
 class AIMv2SwiGLUFFN(nn.Module):
 
-    def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig,
-                 prefix: str):
+    def __init__(self, config: PretrainedConfig,
+                 quant_config: QuantizationConfig, prefix: str):
         super().__init__()
         hidden_features = config.intermediate_size
         in_features = config.hidden_size
@@ -57,7 +57,7 @@ class AIMv2SwiGLUFFN(nn.Module):
 
 class AIMv2PatchEmbed(nn.Module):
 
-    def __init__(self, config: AIMv2Config):
+    def __init__(self, config: PretrainedConfig):
         super().__init__()
         self.proj = nn.Conv2d(
             config.num_channels,
@@ -75,7 +75,7 @@ class AIMv2PatchEmbed(nn.Module):
 
 class AIMv2ViTPreprocessor(nn.Module):
 
-    def __init__(self, config: AIMv2Config):
+    def __init__(self, config: PretrainedConfig):
         super().__init__()
         num_patches = (config.image_size // config.patch_size)**2
 
@@ -93,8 +93,8 @@ class AIMv2ViTPreprocessor(nn.Module):
 
 class AIMv2Attention(nn.Module):
 
-    def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig,
-                 prefix: str):
+    def __init__(self, config: PretrainedConfig,
+                 quant_config: QuantizationConfig, prefix: str):
         super().__init__()
         self.config = config
         self.embed_dim = config.hidden_size
@@ -141,8 +141,8 @@ class AIMv2Attention(nn.Module):
 
 class AIMv2Block(nn.Module):
 
-    def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig,
-                 prefix: str):
+    def __init__(self, config: PretrainedConfig,
+                 quant_config: QuantizationConfig, prefix: str):
         super().__init__()
         self.attn = AIMv2Attention(config,
                                    quant_config=quant_config,
@@ -163,7 +163,7 @@ class AIMv2Transformer(nn.Module):
 
     def __init__(
         self,
-        config: AIMv2Config,
+        config: PretrainedConfig,
         quant_config: QuantizationConfig,
         *,
         require_post_norm: Optional[bool] = None,
@@ -193,7 +193,7 @@ class AIMv2Transformer(nn.Module):
 class AIMv2Model(torch.nn.Module):
 
     def __init__(self,
-                 config: AIMv2Config,
+                 config: PretrainedConfig,
                  quant_config: QuantizationConfig,
                  *,
                  require_post_norm: Optional[bool] = None,

@@ -6,6 +6,7 @@ from typing import Optional, Union
 
 import torch
 import torch.nn as nn
+from transformers import PretrainedConfig
 
 from vllm.attention import Attention
 from vllm.config import CacheConfig, VllmConfig
@@ -24,7 +25,6 @@ from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.configs.dbrx import DbrxConfig
 
 from .interfaces import SupportsPP
 from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
@@ -39,7 +39,7 @@ class DbrxRouter(nn.Module):
 
     def __init__(
         self,
-        config: DbrxConfig,
+        config: PretrainedConfig,
         params_dtype: Optional[torch.dtype] = None,
     ):
         super().__init__()
@@ -63,7 +63,7 @@ class DbrxExperts(FusedMoE):
 
     def __init__(
         self,
-        config: DbrxConfig,
+        config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
         params_dtype: Optional[torch.dtype] = None,
         prefix: str = "",
@@ -138,7 +138,7 @@ class DbrxMoE(nn.Module):
 
     def __init__(
         self,
-        config: DbrxConfig,
+        config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
         params_dtype: Optional[torch.dtype] = None,
         prefix: str = "",
@@ -169,7 +169,7 @@ class DbrxAttention(nn.Module):
 
     def __init__(
         self,
-        config: DbrxConfig,
+        config: PretrainedConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
@@ -249,7 +249,7 @@ class DbrxFusedNormAttention(nn.Module):
 
     def __init__(
         self,
-        config: DbrxConfig,
+        config: PretrainedConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
@@ -284,7 +284,7 @@ class DbrxBlock(nn.Module):
 
     def __init__(
         self,
-        config: DbrxConfig,
+        config: PretrainedConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",

@@ -30,6 +30,7 @@ from typing import Any, Optional, Union
 
 import torch
 from torch import nn
+from transformers import PretrainedConfig
 
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
@@ -49,7 +50,6 @@ from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.configs.exaone import ExaoneConfig
 
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter,
@@ -99,7 +99,7 @@ class ExaoneAttention(nn.Module):
 
     def __init__(
         self,
-        config: ExaoneConfig,
+        config: PretrainedConfig,
         hidden_size: int,
         num_heads: int,
         num_kv_heads: int,
@@ -194,7 +194,7 @@ class ExaoneBlockAttention(nn.Module):
 
     def __init__(
         self,
-        config: ExaoneConfig,
+        config: PretrainedConfig,
         hidden_size: int,
         num_heads: int,
         num_kv_heads: int,
@@ -236,7 +236,7 @@ class ExaoneDecoderLayer(nn.Module):
 
     def __init__(
         self,
-        config: ExaoneConfig,
+        config: PretrainedConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",

@@ -26,6 +26,7 @@ from typing import Any, Optional, Union
 
 import torch
 from torch import nn
+from transformers import PretrainedConfig
 
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
@@ -45,7 +46,6 @@ from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.configs.exaone4 import Exaone4Config
 
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index,
@@ -96,7 +96,7 @@ class Exaone4Attention(nn.Module):
 
     def __init__(
         self,
-        config: Exaone4Config,
+        config: PretrainedConfig,
         hidden_size: int,
         num_heads: int,
         num_kv_heads: int,
@@ -224,7 +224,7 @@ class Exaone4DecoderLayer(nn.Module):
 
     def __init__(
         self,
-        config: Exaone4Config,
+        config: PretrainedConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",

@@ -980,9 +980,6 @@ class KeyeMultiModalDataParser(MultiModalDataParser):
 
 class KeyeProcessingInfo(BaseProcessingInfo):
 
-    def get_hf_config(self):
-        return self.ctx.get_hf_config(PretrainedConfig)
-
     def get_hf_processor(
         self,
         *,

@@ -5,7 +5,7 @@ from typing import Literal, Optional, TypedDict, Union, cast
 
 import torch
 import torch.nn as nn
-from transformers import BatchFeature
+from transformers import BatchFeature, PretrainedConfig
 
 from vllm.config import VllmConfig
 from vllm.jsontree import json_map_leaves
@@ -17,7 +17,6 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalFieldConfig
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.configs.minimax_vl_01 import MiniMaxVL01Config
 
 from .clip import CLIPVisionModel
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
@@ -90,8 +89,8 @@ class MiniMaxVL01DummyInputsBuilder(LlavaDummyInputsBuilder):
 
 class MiniMaxVL01ProcessingInfo(LlavaNextProcessingInfo):
 
-    def get_hf_config(self):
-        return self.ctx.get_hf_config(MiniMaxVL01Config)
+    def get_hf_config(self):  # Need to override the config type
+        return self.ctx.get_hf_config(PretrainedConfig)
 
     def get_hf_processor(self, **kwargs: object):
         hf_processor = self.ctx.get_hf_processor(**kwargs)

@@ -8,6 +8,7 @@ from typing import Optional, Union
 
 import torch
 import torch.nn as nn
+from transformers import PretrainedConfig
 
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
@@ -25,7 +26,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.configs.mpt import MPTConfig
 
 from .interfaces import SupportsPP
 from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
@@ -50,7 +50,7 @@ class MPTAttention(nn.Module):
 
     def __init__(
         self,
-        config: MPTConfig,
+        config: PretrainedConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
@@ -144,7 +144,7 @@ class MPTMLP(nn.Module):
 
     def __init__(
         self,
-        config: MPTConfig,
+        config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
     ):
         super().__init__()
@@ -176,7 +176,7 @@ class MPTBlock(nn.Module):
 
     def __init__(
         self,
-        config: MPTConfig,
+        config: PretrainedConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",

@@ -25,7 +25,7 @@ import torch
 import torch.nn as nn
 from torch import Tensor
 from torch.nn.functional import gumbel_softmax, pad, softmax
-from transformers import BaseImageProcessor, BatchFeature
+from transformers import BaseImageProcessor, BatchFeature, PretrainedConfig
 
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.linear import ReplicatedLinear
@@ -48,8 +48,6 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo, PromptReplacement)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.configs.ovis import (BaseVisualTokenizerConfig,
-                                                  OvisConfig)
 from vllm.transformers_utils.processors.ovis import OvisProcessor
 
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
@@ -83,7 +81,7 @@ class VisualTokenizer(torch.nn.Module):
 
     def __init__(
         self,
-        config: BaseVisualTokenizerConfig,
+        config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
     ):
@@ -107,7 +105,7 @@ class VisualTokenizer(torch.nn.Module):
 
     def _init_backbone(
         self,
-        config: BaseVisualTokenizerConfig,
+        config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
     ) -> nn.Module:
@@ -247,9 +245,6 @@ class VisualEmbedding(torch.nn.Embedding):
 
 class OvisProcessingInfo(BaseProcessingInfo):
 
-    def get_hf_config(self):
-        return self.ctx.get_hf_config(OvisConfig)
-
     def get_hf_processor(self, **kwargs):
         return self.ctx.get_hf_processor(
             OvisProcessor,
@@ -417,7 +412,7 @@ class Ovis(nn.Module, SupportsMultiModal, SupportsPP):
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
 
-        self.config: OvisConfig = config
+        self.config: PretrainedConfig = config
         self.llm = init_vllm_registered_model(
             vllm_config=vllm_config.with_hf_config(config.get_text_config()),
             prefix=maybe_prefix(prefix, "llm"),

@@ -29,19 +29,13 @@ from vllm import envs
 from vllm.logger import init_logger
 # yapf conflicts with isort for this block
 # yapf: disable
-from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config,
-                                             DbrxConfig, DeepseekVLV2Config,
-                                             EAGLEConfig, Exaone4Config,
-                                             ExaoneConfig, JAISConfig,
+from vllm.transformers_utils.configs import (ChatGLMConfig, DeepseekVLV2Config,
+                                             EAGLEConfig, JAISConfig,
                                              KimiVLConfig, MedusaConfig,
                                              MiniMaxText01Config,
-                                             MiniMaxVL01Config, MllamaConfig,
-                                             MLPSpeculatorConfig, MPTConfig,
+                                             MllamaConfig, MLPSpeculatorConfig,
                                              Nemotron_Nano_VL_Config,
-                                             NemotronConfig, NVLM_D_Config,
-                                             OvisConfig, RWConfig,
-                                             SkyworkR1VChatConfig, SolarConfig,
-                                             Telechat2Config, UltravoxConfig)
+                                             NemotronConfig, RWConfig,
+                                             UltravoxConfig)
 # yapf: enable
 from vllm.transformers_utils.configs.mistral import adapt_config_dict
 from vllm.transformers_utils.utils import check_gguf_file
@@ -77,28 +71,16 @@ _CONFIG_REGISTRY_OVERRIDE_HF: dict[str, type[PretrainedConfig]] = {
 
 _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = {
     "chatglm": ChatGLMConfig,
-    "cohere2": Cohere2Config,
-    "dbrx": DbrxConfig,
     "deepseek_vl_v2": DeepseekVLV2Config,
     "kimi_vl": KimiVLConfig,
     "Llama_Nemotron_Nano_VL": Nemotron_Nano_VL_Config,
-    "mpt": MPTConfig,
     "RefinedWeb": RWConfig,  # For tiiuae/falcon-40b(-instruct)
     "RefinedWebModel": RWConfig,  # For tiiuae/falcon-7b(-instruct)
    "jais": JAISConfig,
     "mlp_speculator": MLPSpeculatorConfig,
     "medusa": MedusaConfig,
     "eagle": EAGLEConfig,
-    "exaone": ExaoneConfig,
-    "exaone4": Exaone4Config,
     "minimax_text_01": MiniMaxText01Config,
-    "minimax_vl_01": MiniMaxVL01Config,
     "nemotron": NemotronConfig,
-    "NVLM_D": NVLM_D_Config,
-    "ovis": OvisConfig,
-    "solar": SolarConfig,
-    "skywork_chat": SkyworkR1VChatConfig,
-    "telechat": Telechat2Config,
     "ultravox": UltravoxConfig,
     **_CONFIG_REGISTRY_OVERRIDE_HF
 }

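The registry above only needs entries for configs that the Transformers library cannot resolve on its own (see the docstring added to `configs/__init__.py` in the next hunk), so rows whose config classes are no longer needed can simply be dropped; anything absent falls back to the standard `AutoConfig` path. A rough sketch of that lookup pattern, simplified and not vLLM's actual `get_config` implementation (`load_config` and `_CUSTOM_CONFIGS` are illustrative names):

```python
from transformers import AutoConfig, PretrainedConfig

# Hypothetical registry mapping model_type -> custom config class.
_CUSTOM_CONFIGS: dict[str, type[PretrainedConfig]] = {}


def load_config(model: str, **kwargs) -> PretrainedConfig:
    # Peek at the raw config dict just to read the model_type.
    config_dict, _ = PretrainedConfig.get_config_dict(model, **kwargs)
    model_type = config_dict.get("model_type", "")
    if model_type in _CUSTOM_CONFIGS:
        # Use the custom class only when Transformers has no definition.
        return _CUSTOM_CONFIGS[model_type].from_pretrained(model, **kwargs)
    # Everything else resolves through the upstream AutoConfig machinery.
    return AutoConfig.from_pretrained(model, **kwargs)
```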
@@ -1,13 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Model configs may be defined in this directory for the following reasons:
+
+- There is no configuration file defined by HF Hub or Transformers library.
+- There is a need to override the existing config to support vLLM.
+"""
 
 from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
-from vllm.transformers_utils.configs.cohere2 import Cohere2Config
-from vllm.transformers_utils.configs.dbrx import DbrxConfig
 from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config
 from vllm.transformers_utils.configs.eagle import EAGLEConfig
-from vllm.transformers_utils.configs.exaone import ExaoneConfig
-from vllm.transformers_utils.configs.exaone4 import Exaone4Config
 # RWConfig is for the original tiiuae/falcon-40b(-instruct) and
 # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
 # `FalconConfig` class from the official HuggingFace transformers library.
@@ -15,36 +17,21 @@ from vllm.transformers_utils.configs.falcon import RWConfig
 from vllm.transformers_utils.configs.jais import JAISConfig
 from vllm.transformers_utils.configs.kimi_vl import KimiVLConfig
 from vllm.transformers_utils.configs.medusa import MedusaConfig
 from vllm.transformers_utils.configs.minimax_text_01 import MiniMaxText01Config
-from vllm.transformers_utils.configs.minimax_vl_01 import MiniMaxVL01Config
 from vllm.transformers_utils.configs.mllama import MllamaConfig
 from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig
 from vllm.transformers_utils.configs.moonvit import MoonViTConfig
-from vllm.transformers_utils.configs.mpt import MPTConfig
 from vllm.transformers_utils.configs.nemotron import NemotronConfig
 from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig
 from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config
-from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config
-from vllm.transformers_utils.configs.ovis import OvisConfig
-from vllm.transformers_utils.configs.skyworkr1v import SkyworkR1VChatConfig
-from vllm.transformers_utils.configs.solar import SolarConfig
-from vllm.transformers_utils.configs.telechat2 import Telechat2Config
 from vllm.transformers_utils.configs.ultravox import UltravoxConfig
 
 __all__ = [
     "ChatGLMConfig",
-    "Cohere2Config",
-    "DbrxConfig",
     "DeepseekVLV2Config",
-    "MPTConfig",
     "RWConfig",
     "JAISConfig",
     "MedusaConfig",
     "EAGLEConfig",
-    "ExaoneConfig",
-    "Exaone4Config",
     "MiniMaxText01Config",
-    "MiniMaxVL01Config",
     "MllamaConfig",
     "MLPSpeculatorConfig",
     "MoonViTConfig",
@@ -52,10 +39,5 @@ __all__ = [
     "NemotronConfig",
     "NemotronHConfig",
     "Nemotron_Nano_VL_Config",
-    "NVLM_D_Config",
-    "OvisConfig",
-    "SkyworkR1VChatConfig",
-    "SolarConfig",
-    "Telechat2Config",
     "UltravoxConfig",
 ]

@@ -1,195 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# ruff: noqa
|
||||
|
||||
# Adapted from
|
||||
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/cohere2/configuration_cohere2.py
|
||||
from transformers import PretrainedConfig
|
||||
from transformers.modeling_rope_utils import rope_config_validation
|
||||
|
||||
|
||||
class Cohere2Config(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a [`CohereModel`]. It is used to instantiate an Cohere
|
||||
model according to the specified arguments, defining the model architecture.
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information. Instantiating a configuration
|
||||
with the defaults will yield a similar configuration to that of the [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01) model.
|
||||
|
||||
|
||||
Args:
|
||||
vocab_size (`int`, *optional*, defaults to 256000):
|
||||
Vocabulary size of the Cohere model. Defines the number of different tokens that can be represented by the
|
||||
`inputs_ids` passed when calling [`CohereModel`]
|
||||
hidden_size (`int`, *optional*, defaults to 8192):
|
||||
Dimension of the hidden representations.
|
||||
intermediate_size (`int`, *optional*, defaults to 22528):
|
||||
Dimension of the MLP representations.
|
||||
logit_scale (`float`, *optional*, defaults to 0.0625):
|
||||
The scaling factor for the output logits.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 40):
|
||||
Number of hidden layers in the Transformer decoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 64):
|
||||
Number of attention heads for each attention layer in the Transformer decoder.
|
||||
num_key_value_heads (`int`, *optional*):
|
||||
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
|
||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||
by meanpooling all the original heads within that group. For more details checkout [this
|
||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||
`num_attention_heads`.
|
||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||
The non-linear activation function (function or string) in the decoder.
|
||||
max_position_embeddings (`int`, *optional*, defaults to 8192):
|
||||
The maximum sequence length that this model might ever be used with.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
|
||||
The epsilon used by the layer normalization.
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
||||
relevant if `config.is_decoder=True`.
|
||||
pad_token_id (`int`, *optional*, defaults to 0):
|
||||
Padding token id.
|
||||
bos_token_id (`int`, *optional*, defaults to 5):
|
||||
Beginning of stream token id.
|
||||
eos_token_id (`int`, *optional*, defaults to 255001):
|
||||
End of stream token id.
|
||||
tie_word_embeddings (`bool`, *optional*, defaults to `True`):
|
||||
Whether to tie weight embeddings
|
||||
rope_theta (`float`, *optional*, defaults to 10000.0):
|
||||
The base period of the RoPE embeddings.
|
||||
rope_scaling (`dict`, *optional*):
|
||||
Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
|
||||
and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
|
||||
accordingly.
|
||||
Expected contents:
|
||||
`rope_type` (`str`):
|
||||
The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
|
||||
'llama3'], with 'default' being the original RoPE implementation.
|
||||
`factor` (`float`, *optional*):
|
||||
Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
|
||||
most scaling types, a `factor` of x will enable the model to handle sequences of length x *
|
||||
original maximum pre-trained length.
|
||||
`original_max_position_embeddings` (`int`, *optional*):
|
||||
Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
|
||||
pretraining.
|
||||
`attention_factor` (`float`, *optional*):
|
||||
Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
|
||||
computation. If unspecified, it defaults to value recommended by the implementation, using the
|
||||
`factor` field to infer the suggested value.
|
||||
`beta_fast` (`float`, *optional*):
|
||||
Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
|
||||
ramp function. If unspecified, it defaults to 32.
|
||||
`beta_slow` (`float`, *optional*):
|
||||
Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
|
||||
ramp function. If unspecified, it defaults to 1.
|
||||
`short_factor` (`list[float]`, *optional*):
|
||||
Only used with 'longrope'. The scaling factor to be applied to short contexts (<
|
||||
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
|
||||
size divided by the number of attention heads divided by 2
|
||||
`long_factor` (`list[float]`, *optional*):
|
||||
Only used with 'longrope'. The scaling factor to be applied to long contexts (<
|
||||
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
|
||||
size divided by the number of attention heads divided by 2
|
||||
`low_freq_factor` (`float`, *optional*):
|
||||
Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
|
||||
`high_freq_factor` (`float`, *optional*):
|
||||
Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
|
||||
attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
|
||||
Whether to use a bias in the query, key, value and output projection layers during self-attention.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
sliding_window (`int`, *optional*, defaults to 4096):
|
||||
Size of the sliding window attention context.
|
||||
sliding_window_pattern (`int`, *optional*, defaults to 4):
|
||||
Pattern for the sliding window attention.
|
||||
cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.
|
||||
|
||||
```python
|
||||
>>> from transformers import Cohere2Model, Cohere2Config
|
||||
|
||||
>>> # Initializing a Cohere Nextmodel configuration
|
||||
>>> configuration = Cohere2Config()
|
||||
|
||||
>>> # Initializing a model from the Cohere2 configuration
|
||||
>>> model = Cohere2Model(configuration) # doctest: +SKIP
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config # doctest: +SKIP
|
||||
```
|
||||
"""
|
||||
|
||||
model_type = "cohere2"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=256000,
|
||||
hidden_size=8192,
|
||||
intermediate_size=22528,
|
||||
logit_scale=0.0625,
|
||||
num_hidden_layers=40,
|
||||
num_attention_heads=64,
|
||||
num_key_value_heads=None,
|
||||
hidden_act="silu",
|
||||
max_position_embeddings=8192,
|
||||
initializer_range=0.02,
|
||||
layer_norm_eps=1e-5,
|
||||
use_cache=True,
|
||||
pad_token_id=0,
|
||||
bos_token_id=5,
|
||||
eos_token_id=255001,
|
||||
tie_word_embeddings=True,
|
||||
rope_theta=10000.0,
|
||||
rope_scaling=None,
|
||||
attention_bias=False,
|
||||
attention_dropout=0.0,
|
||||
sliding_window=4096,
|
||||
sliding_window_pattern=4,
|
||||
cache_implementation="hybrid",
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.logit_scale = logit_scale
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
|
||||
# for backward compatibility
|
||||
if num_key_value_heads is None:
|
||||
num_key_value_heads = num_attention_heads
|
||||
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.hidden_act = hidden_act
|
||||
self.initializer_range = initializer_range
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.use_cache = use_cache
|
||||
self.rope_theta = rope_theta
|
||||
self.rope_scaling = rope_scaling
|
||||
self.attention_bias = attention_bias
|
||||
self.attention_dropout = attention_dropout
|
||||
self.sliding_window = sliding_window
|
||||
self.sliding_window_pattern = sliding_window_pattern
|
||||
# Need to specify head_dim in the config so it can be used in the attention forward functions
|
||||
self.head_dim = hidden_size // num_attention_heads
|
||||
self.cache_implementation = cache_implementation
|
||||
|
||||
# Validate the correctness of rotary position embeddings parameters
|
||||
rope_config_validation(self)
|
||||
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
__all__ = ["Cohere2Config"]
|
@@ -1,280 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# yapf: disable
|
||||
# ruff: noqa: E501
|
||||
# coding=utf-8
|
||||
# Copied from
|
||||
# https://huggingface.co/databricks/dbrx-base/blob/main/configuration_dbrx.py
|
||||
"""Dbrx configuration."""
|
||||
|
||||
from typing import Any, Optional
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.utils import logging
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = {} # type: ignore
|
||||
|
||||
|
||||
class DbrxAttentionConfig(PretrainedConfig):
|
||||
"""Configuration class for Dbrx Attention.
|
||||
|
||||
[`DbrxAttention`] class. It is used to instantiate attention layers
|
||||
according to the specified arguments, defining the layers architecture.
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
Args:
|
||||
attn_pdrop (`float`, *optional*, defaults to 0.0):
|
||||
The dropout probability for the attention layers.
|
||||
clip_qkv (`float`, *optional*, defaults to None):
|
||||
If not `None`, clip the queries, keys, and values in the attention layer to this value.
|
||||
kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads.
|
||||
rope_theta (float): The base frequency for rope.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
attn_pdrop: float = 0,
|
||||
clip_qkv: Optional[float] = None,
|
||||
kv_n_heads: int = 1,
|
||||
rope_theta: float = 10000.0,
|
||||
**kwargs: Any,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
self.attn_pdrop = attn_pdrop
|
||||
self.clip_qkv = clip_qkv
|
||||
self.kv_n_heads = kv_n_heads
|
||||
self.rope_theta = rope_theta
|
||||
|
||||
for k in ["model_type"]:
|
||||
if k in kwargs:
|
||||
kwargs.pop(k)
|
||||
if len(kwargs) != 0:
|
||||
raise ValueError(f"Found unknown {kwargs=}")
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(
|
||||
cls, pretrained_model_name_or_path: str, **kwargs: Any
|
||||
) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(
|
||||
pretrained_model_name_or_path, **kwargs
|
||||
)
|
||||
|
||||
if config_dict.get("model_type") == "dbrx":
|
||||
config_dict = config_dict["attn_config"]
|
||||
|
||||
if (
|
||||
"model_type" in config_dict
|
||||
and hasattr(cls, "model_type")
|
||||
and config_dict["model_type"] != cls.model_type
|
||||
):
|
||||
logger.warning(
|
||||
"You are using a model of type %s to instantiate a model of "
|
||||
"type %s. This is not supported for all configurations of "
|
||||
"models and can yield errors.",
|
||||
config_dict["model_type"], cls.model_type)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class DbrxFFNConfig(PretrainedConfig):
|
||||
"""Configuration class for Dbrx FFN.
|
||||
|
||||
[`DbrxFFN`] class. It is used to instantiate feedforward layers according to
|
||||
the specified arguments, defining the layers architecture.
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
Args:
|
||||
ffn_act_fn (dict, optional): A dict specifying activation function for the FFN.
|
||||
The dict should have a key 'name' with the value being the name of
|
||||
the activation function along with any additional keyword arguments.
|
||||
ffn_hidden_size (int, optional): The hidden size of the feedforward network.
|
||||
moe_num_experts (int, optional): The number of experts in the mixture of experts layer.
|
||||
moe_top_k (int, optional): The number of experts to use in the mixture of experts layer.
|
||||
moe_jitter_eps (float, optional): The jitter epsilon for the mixture of experts layer.
|
||||
moe_loss_weight (float, optional): The loss weight for the mixture of experts layer.
|
||||
moe_normalize_expert_weights (float, optional): The normalization factor for the expert weights.
|
||||
uniform_expert_assignment (bool, optional): Whether to use uniform expert assignment.
|
||||
This should only be used for benchmarking purposes.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
ffn_act_fn: Optional[dict] = None,
|
||||
ffn_hidden_size: int = 3584,
|
||||
moe_num_experts: int = 4,
|
||||
moe_top_k: int = 1,
|
||||
moe_jitter_eps: Optional[float] = None,
|
||||
moe_loss_weight: float = 0.01,
|
||||
moe_normalize_expert_weights: Optional[float] = 1,
|
||||
uniform_expert_assignment: bool = False,
|
||||
**kwargs: Any,
|
||||
):
|
||||
super().__init__()
|
||||
if ffn_act_fn is None:
|
||||
ffn_act_fn = {"name": "silu"}
|
||||
self.ffn_act_fn = ffn_act_fn
|
||||
self.ffn_hidden_size = ffn_hidden_size
|
||||
self.moe_num_experts = moe_num_experts
|
||||
self.moe_top_k = moe_top_k
|
||||
self.moe_jitter_eps = moe_jitter_eps
|
||||
self.moe_loss_weight = moe_loss_weight
|
||||
self.moe_normalize_expert_weights = moe_normalize_expert_weights
|
||||
self.uniform_expert_assignment = uniform_expert_assignment
|
||||
|
||||
for k in ["model_type"]:
|
||||
if k in kwargs:
|
||||
kwargs.pop(k)
|
||||
if len(kwargs) != 0:
|
||||
raise ValueError(f"Found unknown {kwargs=}")
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(
|
||||
cls, pretrained_model_name_or_path: str, **kwargs: Any
|
||||
) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(
|
||||
pretrained_model_name_or_path, **kwargs
|
||||
)
|
||||
|
||||
if config_dict.get("model_type") == "dbrx":
|
||||
config_dict = config_dict["ffn_config"]
|
||||
|
||||
if (
|
||||
"model_type" in config_dict
|
||||
and hasattr(cls, "model_type")
|
||||
and config_dict["model_type"] != cls.model_type
|
||||
):
|
||||
logger.warning(
|
||||
"You are using a model of type %s to instantiate a model of "
|
||||
"type %s. This is not supported for all "
|
||||
"configurations of models and can yield errors.", config_dict["model_type"], cls.model_type)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class DbrxConfig(PretrainedConfig):
|
||||
"""Configuration class for Dbrx.
|
||||
|
||||
[`DbrxModel`]. It is used to instantiate a Dbrx model according to the
|
||||
specified arguments, defining the model architecture.
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
|
||||
Args:
|
||||
d_model (`int`, *optional*, defaults to 6144):
|
||||
Dimensionality of the embeddings and hidden states.
|
||||
n_heads (`int`, *optional*, defaults to 48):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
n_layers (`int`, *optional*, defaults to 40):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
max_seq_len (`int`, *optional*, defaults to 32768):
|
||||
The maximum sequence length of the model.
|
||||
vocab_size (`int`, *optional*, defaults to 100352):
|
||||
Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by
|
||||
the `inputs_ids` passed when calling [`DbrxModel`].
|
||||
resid_pdrop (`float`, *optional*, defaults to 0.0):
|
||||
The dropout probability applied to the attention output before combining with residual.
|
||||
emb_pdrop (`float`, *optional*, defaults to 0.0):
|
||||
The dropout probability for the embedding layer.
|
||||
attn_config (`dict`, *optional*):
|
||||
A dictionary used to configure the model's attention module.
|
||||
ffn_config (`dict`, *optional*):
|
||||
A dictionary used to configure the model's FFN module.
|
||||
use_cache (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models).
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
output_router_logits (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not the router logits should be returned by the model. Enabling this will also allow the model to output the auxiliary loss.
|
||||
router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
|
||||
The aux loss factor for the total loss.
|
||||
|
||||
|
||||
Example:
|
||||
```python
|
||||
>>> from transformers import DbrxConfig, DbrxModel
|
||||
|
||||
>>> # Initializing a Dbrx configuration
|
||||
>>> configuration = DbrxConfig()
|
||||
|
||||
>>> # Initializing a model (with random weights) from the configuration
|
||||
>>> model = DbrxModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```
|
||||
"""
|
||||
|
||||
model_type = "dbrx"
|
||||
attribute_map = {
|
||||
"num_attention_heads": "n_heads",
|
||||
"hidden_size": "d_model",
|
||||
"num_hidden_layers": "n_layers",
|
||||
"max_position_embeddings": "max_seq_len",
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
d_model: int = 2048,
|
||||
n_heads: int = 16,
|
||||
n_layers: int = 24,
|
||||
max_seq_len: int = 2048,
|
||||
vocab_size: int = 32000,
|
||||
resid_pdrop: float = 0.0,
|
||||
emb_pdrop: float = 0.0,
|
||||
attn_config: Optional[DbrxAttentionConfig] = None,
|
||||
ffn_config: Optional[DbrxFFNConfig] = None,
|
||||
use_cache: bool = True,
|
||||
initializer_range: float = 0.02,
|
||||
output_router_logits: bool = False,
|
||||
router_aux_loss_coef: float = 0.05,
|
||||
**kwargs: Any,
|
||||
):
|
||||
if attn_config is None:
|
||||
self.attn_config = DbrxAttentionConfig()
|
||||
elif isinstance(attn_config, dict):
|
||||
self.attn_config = DbrxAttentionConfig(**attn_config)
|
||||
else:
|
||||
self.attn_config = attn_config
|
||||
|
||||
if ffn_config is None:
|
||||
self.ffn_config = DbrxFFNConfig()
|
||||
elif isinstance(ffn_config, dict):
|
||||
self.ffn_config = DbrxFFNConfig(**ffn_config)
|
||||
else:
|
||||
self.ffn_config = ffn_config
|
||||
|
||||
self.d_model = d_model
|
||||
self.n_heads = n_heads
|
||||
self.n_layers = n_layers
|
||||
self.max_seq_len = max_seq_len
|
||||
self.vocab_size = vocab_size
|
||||
self.resid_pdrop = resid_pdrop
|
||||
self.emb_pdrop = emb_pdrop
|
||||
self.use_cache = use_cache
|
||||
self.initializer_range = initializer_range
|
||||
self.output_router_logits = output_router_logits
|
||||
self.router_aux_loss_coef = router_aux_loss_coef
|
||||
|
||||
tie_word_embeddings = kwargs.pop("tie_word_embeddings", False)
|
||||
if tie_word_embeddings:
|
||||
raise ValueError(
|
||||
"tie_word_embeddings is not supported for Dbrx models."
|
||||
)
|
||||
|
||||
super().__init__(
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
@@ -1,190 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Copied from
|
||||
# https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/configuration_exaone.py
|
||||
# Copyright 2021 The LG AI Research EXAONE Lab. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Exaone model configuration"""
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.utils import logging
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
EXAONE_PRETRAINED_CONFIG_ARCHIVE_MAP: dict[str, str] = {}
|
||||
|
||||
|
||||
class ExaoneConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a :class:
|
||||
`~transformers.ExaoneModel`. It is used to instantiate a GPT Lingvo model
|
||||
according to the specified arguments, defining the model architecture.
|
||||
Instantiating a configuration with the defaults will yield a similar
|
||||
configuration to that of the Exaone
|
||||
|
||||
Configuration objects inherit from {class}`~transformers.PretrainedConfig`
|
||||
and can be used to control the model outputs. Read the documentation from :
|
||||
class:`~transformers.PretrainedConfig` for more information.
|
||||
|
||||
Args:
|
||||
vocab_size ({obj}`int`, `optional`, defaults to 50257):
|
||||
Vocabulary size of the GPT Lingvo model. Defines the number of
|
||||
different tokens that can be represented by the {obj}`inputs_ids`
|
||||
passed when calling {class}`~transformers.ExaoneModel`. Vocabulary
|
||||
size of the model.
|
||||
Defines the different tokens that can be represented by the
|
||||
`inputs_ids` passed to the forward method of :class:
|
||||
`~transformers.EXAONEModel`.
|
||||
hidden_size ({obj}`int`, `optional`, defaults to 2048):
|
||||
Dimensionality of the encoder layers and the pooler layer.
|
||||
num_layers ({obj}`int`, `optional`, defaults to 24):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 32):
|
||||
Number of attention heads for each attention layer in the
|
||||
Transformer decoder.
|
||||
num_key_value_heads (`int`, *optional*):
|
||||
This is the number of key_value heads that should be used to
|
||||
implement Grouped Query Attention. If
|
||||
`num_key_value_heads=num_attention_heads`, the model will use Multi
|
||||
Head Attention (MHA), if `num_key_value_heads=1 the model will use
|
||||
Multi Query Attention (MQA) otherwise GQA is used. When
|
||||
converting a multi-head checkpoint to a GQA checkpoint,
|
||||
each group key and value head should be constructed by meanpooling
|
||||
all the original heads within that group. For more details checkout
|
||||
[this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not
|
||||
specified, will default to `num_attention_heads`.
|
||||
rotary_pct (`float`, *optional*, defaults to 0.25):
|
||||
percentage of hidden dimensions to allocate to rotary embeddings
|
||||
intermediate_size ({obj}`int`, `optional`, defaults to 8192):
|
||||
Dimensionality of the "intermediate" (i.e., feed-forward) layer in
|
||||
the Transformer encoder.
|
||||
activation_function ({obj}`str` or {obj}`function`, `optional`,
|
||||
defaults to {obj}`"gelu_new"`):
|
||||
The non-linear activation function (function or string) in the
|
||||
encoder and pooler. If string, {obj}`"gelu"`, {obj}`"relu"`,
|
||||
{obj}`"selu"` and {obj}`"gelu_new"` are supported.
|
||||
embed_dropout ({obj}`float`, `optional`, defaults to 0.0):
|
||||
The dropout probabilitiy for all fully connected layers in the
|
||||
embeddings, encoder, and pooler.
|
||||
attention_dropout ({obj}`float`, `optional`, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
max_position_embeddings ({obj}`int`, `optional`, defaults to 2048):
|
||||
The maximum sequence length that this model might ever be used with.
|
||||
Typically set this to something large just in case
|
||||
(e.g., 512 or 1024 or 2048).
|
||||
type_vocab_size ({obj}`int`, `optional`, defaults to 2):
|
||||
The vocabulary size of the {obj}`token_type_ids` passed when calling
|
||||
{class}`~transformers.EXAONEModel`.
|
||||
initializer_range ({obj}`float`, `optional`, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for
|
||||
initializing all weight matrices.
|
||||
layer_norm_epsilon ({obj}`float`, `optional`, defaults to 1e-5):
|
||||
The epsilon used by the layer normalization layers.
|
||||
use_cache ({obj}`bool`, `optional`, defaults to {obj}`True`):
|
||||
Whether or not the model should return the last key/values
|
||||
attentions (not used by all models).
|
||||
Only relevant if ``config.is_decoder=True``.
|
||||
gradient_checkpointing ({obj}`bool`, `optional`,
|
||||
defaults to {obj}`False`):
|
||||
If True, use gradient checkpointing to save memory at the expense
|
||||
of slower backward pass.
|
||||
Example::
|
||||
|
||||
>>> from transformers import ExoneModel, ExaoneConfig
|
||||
|
||||
>>> # Initializing a EXAONE configuration
|
||||
>>> configuration = ExaoneConfig()
|
||||
|
||||
>>> # Initializing a model from configuration
|
||||
>>> model = ExoneModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
"""
|
||||
|
||||
model_type = "exaone"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
attribute_map = {"num_hidden_layers": "num_layers"}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=102400,
|
||||
max_position_embeddings=2048,
|
||||
hidden_size=2048,
|
||||
num_layers=32,
|
||||
num_attention_heads=32,
|
||||
num_key_value_heads=None,
|
||||
intermediate_size=None,
|
||||
activation_function="silu",
|
||||
rotary_pct=0.25,
|
||||
resid_dropout=0.0,
|
||||
embed_dropout=0.0,
|
||||
attention_dropout=0.0,
|
||||
layer_norm_epsilon=1e-6,
|
||||
initializer_range=0.02,
|
||||
use_cache=True,
|
||||
bos_token_id=0,
|
||||
eos_token_id=2,
|
||||
tie_word_embeddings=True,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.num_layers = num_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.num_hidden_layers = num_layers
|
||||
if num_key_value_heads is None:
|
||||
num_key_value_heads = num_attention_heads
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
if intermediate_size:
|
||||
self.intermediate_size = intermediate_size
|
||||
else:
|
||||
self.intermediate_size = hidden_size * 4
|
||||
self.activation_function = activation_function
|
||||
self.resid_dropout = resid_dropout
|
||||
self.embed_dropout = embed_dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
self.layer_norm_epsilon = layer_norm_epsilon
|
||||
self.initializer_range = initializer_range
|
||||
self.use_cache = use_cache
|
||||
self.rotary_pct = rotary_pct
|
||||
|
||||
self.bos_token_id = bos_token_id
|
||||
self.eos_token_id = eos_token_id
|
||||
|
||||
self.use_logit_cap = kwargs.pop("use_logit_cap", False)
|
||||
self.ln_no_scale = kwargs.pop("ln_no_scale", False)
|
||||
self.use_gated = kwargs.pop("use_gated", False)
|
||||
self.use_emb_norm = kwargs.pop("use_emb_norm", False)
|
||||
self.use_rotary_pos = kwargs.pop("use_rotary_pos", False)
|
||||
self.rotary_type = kwargs.pop("rotary_type", None)
|
||||
self.scaling_factor = kwargs.pop("scaling_factor", 1)
|
||||
self.use_absolute_pos = kwargs.pop("use_absolute_pos", True)
|
||||
self.use_extra_logit = kwargs.pop("use_extra_logit", True)
|
||||
self.rotary_expand_length = kwargs.pop("rotary_expand_length", None)
|
||||
self.rotary_base = kwargs.pop("rotary_base", 10000.0)
|
||||
self.use_qkv_fuse = kwargs.pop("use_qkv_fuse", False)
|
||||
self.rescale_before_lm_head = kwargs.pop("rescale_before_lm_head",
|
||||
(rotary_pct == 0.25))
|
||||
if self.use_rotary_pos:
|
||||
self.use_absolute_pos = False
|
@@ -1,252 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# ruff: noqa: E501
|
||||
|
||||
# Copied from
|
||||
# https://github.com/lgai-exaone/transformers/blob/add-exaone4/src/transformers/models/exaone4/configuration_exaone4.py
|
||||
# Copyright 2025 The LG CNS Gen AI Solution Delivery Team.
|
||||
# Copyright 2025 The LG AI Research and HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from transformers.configuration_utils import (PretrainedConfig,
|
||||
layer_type_validation)
|
||||
from transformers.utils import logging
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
def check_is_sliding(config, layer_idx):
|
||||
"""
|
||||
Check if the current layer is a sliding window attention (local attention) layer.
|
||||
"""
|
||||
if config.sliding_window is None:
|
||||
return False
|
||||
if config.layer_types is not None:
|
||||
return config.layer_types[layer_idx] == "sliding_attention"
|
||||
if isinstance(config.sliding_window_pattern, int):
|
||||
return ((layer_idx + 1) % config.sliding_window_pattern) != 0
|
||||
elif isinstance(config.sliding_window_pattern, str):
|
||||
assert isinstance(config.sliding_window, int), (
|
||||
f"Sliding window must be positive integer, but got {config.sliding_window}"
|
||||
)
|
||||
return (layer_idx != config.num_hidden_layers - 1
|
||||
and config.sliding_window_pattern[layer_idx % len(
|
||||
config.sliding_window_pattern)] == "L")
|
||||
else:
|
||||
logger.warning_once(
|
||||
"Sliding window is set, but none of `sliding_window_pattern` or `layer_types` is set. "
|
||||
"Defaulting to use 'full_attention' for all layers.")
|
||||
return False
|
||||
|
||||
|
||||
class Exaone4Config(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a [`Exaone4Model`]. It is used to
|
||||
instantiate a EXAONE 4.0 model according to the specified arguments, defining the model architecture. Instantiating a
|
||||
configuration with the defaults will yield a similar configuration to that of the EXAONE-4.0-Instruct [LGAI-EXAONE/EXAONE-4.0-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-Instruct)
|
||||
NOTE: `EXAONE-4.0-Instruct` is a placeholder model ID. The exact model ID will be updated in the future.
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
|
||||
outputs. Read the documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
Args:
|
||||
vocab_size (`int`, *optional*, defaults to 102400):
|
||||
Vocabulary size of the EXAONE 4.0 model. Defines the number of different tokens that can be represented by the
|
||||
`inputs_ids` passed when calling [`Exaone4Model`].
|
||||
hidden_size (`int`, *optional*, defaults to 4096):
|
||||
Dimension of the hidden representations.
|
||||
intermediate_size (`int`, *optional*, defaults to `hidden_size * 4`):
|
||||
Dimensionality of the MLP representations.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 32):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 32):
|
||||
Number of attention heads for each attention layer in the Transformer decoder.
|
||||
num_key_value_heads (`int`, *optional*):
|
||||
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
|
||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||
`num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||
by meanpooling all the original heads within that group. For more details checkout [this
|
||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||
`num_attention_heads`.
|
||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||
The non-linear activation function (function or string) in the decoder.
|
||||
max_position_embeddings (`int`, *optional*, defaults to 2048):
|
||||
The maximum sequence length that this model might ever be used with. Typically set this to something large
|
||||
just in case (e.g., 32768 for EXAONE 3.5).
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
rms_norm_eps (`float`, *optional*, defaults to 1e-05):
|
||||
The epsilon used by the layer normalization layers.
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
||||
relevant if ``config.is_decoder=True``.
|
||||
bos_token_id (`int`, *optional*, defaults to 0):
|
||||
Beginning of stream token id.
|
||||
eos_token_id (`int`, *optional*, defaults to 2):
|
||||
End of stream token id.
|
||||
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
|
||||
Whether to tie weight embeddings
|
||||
rope_theta (`float`, *optional*, defaults to 10000.0):
|
||||
The base period of the RoPE embeddings.
|
||||
rope_scaling (`Dict`, *optional*):
|
||||
Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
|
||||
and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
|
||||
accordingly.
|
||||
Expected contents:
|
||||
`rope_type` (`str`):
|
||||
The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
|
||||
'llama3'], with 'default' being the original RoPE implementation.
|
||||
`factor` (`float`, *optional*):
|
||||
Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
|
||||
most scaling types, a `factor` of x will enable the model to handle sequences of length x *
|
||||
original maximum pre-trained length.
|
||||
`original_max_position_embeddings` (`int`, *optional*):
|
||||
Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
|
||||
pretraining.
|
||||
`attention_factor` (`float`, *optional*):
|
||||
Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
|
||||
computation. If unspecified, it defaults to value recommended by the implementation, using the
|
||||
`factor` field to infer the suggested value.
|
||||
`beta_fast` (`float`, *optional*):
|
||||
Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
|
||||
ramp function. If unspecified, it defaults to 32.
|
||||
`beta_slow` (`float`, *optional*):
|
||||
Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
|
||||
ramp function. If unspecified, it defaults to 1.
|
||||
`short_factor` (`List[float]`, *optional*):
|
||||
Only used with 'longrope'. The scaling factor to be applied to short contexts (<
|
||||
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
|
||||
size divided by the number of attention heads divided by 2
|
||||
`long_factor` (`List[float]`, *optional*):
|
||||
Only used with 'longrope'. The scaling factor to be applied to long contexts (>
|
||||
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
|
||||
size divided by the number of attention heads divided by 2
|
||||
`low_freq_factor` (`float`, *optional*):
|
||||
Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
|
||||
`high_freq_factor` (`float`, *optional*):
|
||||
Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
sliding_window (`int`, *optional*):
|
||||
The size of the sliding window for the sliding window attention.
|
||||
sliding_window_pattern (`int` or `str`, *optional*):
|
||||
The pattern to use for sliding window attention. Can be one of:
|
||||
- `None`: No sliding window attention is used.
|
||||
- `int`: Every `sliding_window` layers, use global attention; otherwise use local attention.
|
||||
- `str`: A sequence of "L" (local attention) and "G" (global attention) characters that defines the
|
||||
attention pattern. The pattern starts from layer 0 and repeats every `sliding_window` layers. The
|
||||
final layer always uses global attention regardless of the pattern.
|
||||
For instance, `sliding_window_pattern="LLLG"` is the same as `sliding_window=4`, which means:
|
||||
- Layers 0, 1, and 2: local attention,
|
||||
- Layer 3: global attention,
|
||||
...(repeated)
|
||||
layer_types (`list`, *optional*):
|
||||
Attention pattern for each layer. Takes priority over `sliding_window_pattern`.
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
>>> from transformers import Exaone4Model, Exaone4Config
|
||||
|
||||
>>> # Initializing an EXAONE configuration
|
||||
>>> configuration = Exaone4Config()
|
||||
|
||||
>>> # Initializing a model from configuration
|
||||
>>> model = Exaone4Model(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```"""
|
||||
|
||||
model_type = "exaone4"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
# Default tensor parallel plan for base model `LlamaModel`
|
||||
base_model_tp_plan = {
|
||||
"layers.*.self_attn.q_proj": "colwise",
|
||||
"layers.*.self_attn.k_proj": "colwise",
|
||||
"layers.*.self_attn.v_proj": "colwise",
|
||||
"layers.*.self_attn.o_proj": "rowwise",
|
||||
"layers.*.mlp.gate_proj": "colwise",
|
||||
"layers.*.mlp.up_proj": "colwise",
|
||||
"layers.*.mlp.down_proj": "rowwise",
|
||||
}
|
||||
base_model_pp_plan = {
|
||||
"embed_tokens": (["input_ids"], ["inputs_embeds"]),
|
||||
"layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
|
||||
"norm": (["hidden_states"], ["hidden_states"]),
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=102400,
|
||||
hidden_size=4096,
|
||||
intermediate_size=None,
|
||||
num_hidden_layers=32,
|
||||
num_attention_heads=32,
|
||||
num_key_value_heads=None,
|
||||
hidden_act="silu",
|
||||
max_position_embeddings=2048,
|
||||
initializer_range=0.02,
|
||||
rms_norm_eps=1e-5,
|
||||
use_cache=True,
|
||||
bos_token_id=0,
|
||||
eos_token_id=2,
|
||||
tie_word_embeddings=False,
|
||||
rope_theta=10000.0,
|
||||
rope_scaling=None,
|
||||
attention_dropout=0.0,
|
||||
sliding_window=None,
|
||||
sliding_window_pattern=None,
|
||||
layer_types=None,
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
self.hidden_size = hidden_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
if num_key_value_heads is None:
|
||||
num_key_value_heads = num_attention_heads
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
if intermediate_size:
|
||||
self.intermediate_size = intermediate_size
|
||||
else:
|
||||
self.intermediate_size = hidden_size * 4
|
||||
self.hidden_act = hidden_act
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.initializer_range = initializer_range
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
self.use_cache = use_cache
|
||||
self.attention_dropout = attention_dropout
|
||||
self.rope_theta = rope_theta
|
||||
self.rope_scaling = rope_scaling
|
||||
self.sliding_window = sliding_window
|
||||
self.sliding_window_pattern = sliding_window_pattern
|
||||
|
||||
self.layer_types = layer_types
|
||||
if self.layer_types is None:
|
||||
self.layer_types = [
|
||||
"sliding_attention"
|
||||
if check_is_sliding(self, i) else "full_attention"
|
||||
for i in range(self.num_hidden_layers)
|
||||
]
|
||||
layer_type_validation(self.layer_types)
|
||||
|
||||
super().__init__(bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs)
|
||||
|
||||
|
||||
__all__ = ["Exaone4Config"]
|
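The `sliding_window_pattern` semantics documented in the docstring above can be made concrete with a small sketch. `pattern_to_layer_types` is a hypothetical helper written only for illustration; it is not the `check_is_sliding` utility the class actually calls:

```python
# Hypothetical helper mirroring the "LLLG" example above: "L" -> local
# (sliding) attention, "G" -> global (full) attention, repeated per layer,
# with the final layer always global.
def pattern_to_layer_types(pattern: str, num_hidden_layers: int) -> list[str]:
    kinds = {"L": "sliding_attention", "G": "full_attention"}
    layer_types = [
        kinds[pattern[i % len(pattern)]] for i in range(num_hidden_layers)
    ]
    layer_types[-1] = "full_attention"
    return layer_types


assert pattern_to_layer_types("LLLG", 8) == [
    "sliding_attention", "sliding_attention", "sliding_attention",
    "full_attention",
] * 2
```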
@ -1,70 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
""" MiniMaxText01 model configuration"""
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
class MiniMaxText01Config(PretrainedConfig):
|
||||
model_type = "MiniMaxText01"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=32000,
|
||||
hidden_size=4096,
|
||||
intermediate_size=14336,
|
||||
num_hidden_layers=32,
|
||||
num_attention_heads=32,
|
||||
num_key_value_heads=8,
|
||||
hidden_act="silu",
|
||||
max_position_embeddings=4096 * 32,
|
||||
initializer_range=0.02,
|
||||
rms_norm_eps=1e-5,
|
||||
use_cache=True,
|
||||
pad_token_id=None,
|
||||
bos_token_id=None,
|
||||
eos_token_id=None,
|
||||
tie_word_embeddings=False,
|
||||
rope_theta=1e6,
|
||||
sliding_window=None,
|
||||
attention_dropout=0.0,
|
||||
num_experts_per_tok=2,
|
||||
num_local_experts=8,
|
||||
output_router_logits=False,
|
||||
router_aux_loss_coef=0.001,
|
||||
router_jitter_noise=0.0,
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.sliding_window = sliding_window
|
||||
|
||||
# for backward compatibility
|
||||
if num_key_value_heads is None:
|
||||
num_key_value_heads = num_attention_heads
|
||||
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.hidden_act = hidden_act
|
||||
self.initializer_range = initializer_range
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
self.use_cache = use_cache
|
||||
self.rope_theta = rope_theta
|
||||
self.attention_dropout = attention_dropout
|
||||
|
||||
self.num_experts_per_tok = num_experts_per_tok
|
||||
self.num_local_experts = num_local_experts
|
||||
self.output_router_logits = output_router_logits
|
||||
self.router_aux_loss_coef = router_aux_loss_coef
|
||||
self.router_jitter_noise = router_jitter_noise
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
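A minimal usage sketch of the class above, assuming it is still importable (it is being removed by this change, so treat the snippet as illustrative only). It shows the backward-compatibility fallback for `num_key_value_heads`:

```python
# Defaults from the signature above: 32 attention heads, 8 KV heads (GQA).
cfg = MiniMaxText01Config()
assert cfg.num_key_value_heads == 8

# Passing None explicitly falls back to num_attention_heads, i.e. plain MHA.
cfg_mha = MiniMaxText01Config(num_key_value_heads=None)
assert cfg_mha.num_key_value_heads == 32
```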
@ -1,71 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""MiniMaxVL01 model configuration"""
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.models.auto import CONFIG_MAPPING
|
||||
|
||||
from .minimax_text_01 import MiniMaxText01Config
|
||||
|
||||
|
||||
class MiniMaxVL01Config(PretrainedConfig):
|
||||
model_type = "minimax_vl_01"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vision_config=None,
|
||||
text_config=None,
|
||||
ignore_index=-100,
|
||||
image_token_index=32000,
|
||||
projector_hidden_act="gelu",
|
||||
vision_feature_select_strategy="default",
|
||||
vision_feature_layer=-2,
|
||||
image_grid_pinpoints=None,
|
||||
tie_word_embeddings=False,
|
||||
image_seq_length=576,
|
||||
**kwargs,
|
||||
):
|
||||
self.ignore_index = ignore_index
|
||||
self.image_token_index = image_token_index
|
||||
self.projector_hidden_act = projector_hidden_act
|
||||
self.image_seq_length = image_seq_length
|
||||
|
||||
if vision_feature_select_strategy not in ["default", "full"]:
|
||||
raise ValueError("vision_feature_select_strategy should " +
|
||||
"be one of 'default', 'full'. " +
|
||||
f"Got: {vision_feature_select_strategy}")
|
||||
|
||||
self.vision_feature_select_strategy = vision_feature_select_strategy
|
||||
self.vision_feature_layer = vision_feature_layer
|
||||
image_grid_pinpoints = (
|
||||
image_grid_pinpoints if image_grid_pinpoints is not None else
|
||||
[[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]])
|
||||
self.image_grid_pinpoints = image_grid_pinpoints
|
||||
|
||||
if isinstance(vision_config, dict):
|
||||
if "model_type" not in vision_config:
|
||||
vision_config["model_type"] = "clip_vision_model"
|
||||
vision_config = CONFIG_MAPPING[vision_config["model_type"]](
|
||||
**vision_config)
|
||||
elif vision_config is None:
|
||||
vision_config = CONFIG_MAPPING["clip_vision_model"](
|
||||
intermediate_size=4096,
|
||||
hidden_size=1024,
|
||||
patch_size=14,
|
||||
image_size=336,
|
||||
num_hidden_layers=24,
|
||||
num_attention_heads=16,
|
||||
vocab_size=32000,
|
||||
projection_dim=768,
|
||||
)
|
||||
|
||||
self.vision_config = vision_config
|
||||
|
||||
if text_config is not None:
|
||||
text_config = MiniMaxText01Config(**text_config)
|
||||
else:
|
||||
text_config = MiniMaxText01Config()
|
||||
|
||||
self.text_config = text_config
|
||||
|
||||
super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
|
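A short sketch of how the constructor above fills in defaults when only partial dictionaries are supplied; the field values are made up for illustration:

```python
# A vision_config dict without "model_type" is treated as a CLIP vision
# backbone; text_config=None falls back to a default MiniMaxText01Config.
cfg = MiniMaxVL01Config(vision_config={"hidden_size": 1024, "patch_size": 14})
assert cfg.vision_config.model_type == "clip_vision_model"
assert isinstance(cfg.text_config, MiniMaxText01Config)
assert [336, 672] in cfg.image_grid_pinpoints  # default grid pinpoints
```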
@ -1,180 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Copied from
|
||||
# https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py
|
||||
"""A HuggingFace-style model configuration."""
|
||||
import warnings
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
attn_config_defaults: dict = {
|
||||
'attn_type': 'multihead_attention',
|
||||
'attn_pdrop': 0.0,
|
||||
'attn_impl': 'triton',
|
||||
'qk_ln': False,
|
||||
'clip_qkv': None,
|
||||
'softmax_scale': None,
|
||||
'prefix_lm': False,
|
||||
'attn_uses_sequence_id': False,
|
||||
'alibi': False,
|
||||
'alibi_bias_max': 8
|
||||
}
|
||||
ffn_config_defaults: dict = {'ffn_type': 'mptmlp'}
|
||||
init_config_defaults: dict = {
|
||||
'name': 'kaiming_normal_',
|
||||
'fan_mode': 'fan_in',
|
||||
'init_nonlinearity': 'relu',
|
||||
'init_div_is_residual': True,
|
||||
'emb_init_std': None,
|
||||
'emb_init_uniform_lim': None,
|
||||
'init_std': None,
|
||||
'init_gain': 0.0
|
||||
}
|
||||
|
||||
|
||||
class MPTConfig(PretrainedConfig):
|
||||
model_type = 'mpt'
|
||||
attribute_map = {
|
||||
'num_attention_heads': 'n_heads',
|
||||
'hidden_size': 'd_model',
|
||||
'num_hidden_layers': 'n_layers',
|
||||
}
|
||||
|
||||
# pylint: disable=dangerous-default-value
|
||||
def __init__(self,
|
||||
d_model: int = 2048,
|
||||
n_heads: int = 16,
|
||||
n_layers: int = 24,
|
||||
expansion_ratio: int = 4,
|
||||
max_seq_len: int = 2048,
|
||||
vocab_size: int = 50368,
|
||||
resid_pdrop: float = 0.0,
|
||||
emb_pdrop: float = 0.0,
|
||||
learned_pos_emb: bool = True,
|
||||
attn_config: dict = attn_config_defaults,
|
||||
ffn_config: dict = ffn_config_defaults,
|
||||
init_device: str = 'cpu',
|
||||
logit_scale: Optional[Union[float, str]] = None,
|
||||
no_bias: bool = False,
|
||||
embedding_fraction: float = 1.0,
|
||||
norm_type: str = 'low_precision_layernorm',
|
||||
use_cache: bool = False,
|
||||
init_config: dict = init_config_defaults,
|
||||
fc_type: str = 'torch',
|
||||
verbose: Optional[int] = None,
|
||||
**kwargs: Any):
|
||||
self.d_model = d_model
|
||||
self.n_heads = n_heads
|
||||
self.n_layers = n_layers
|
||||
self.expansion_ratio = expansion_ratio
|
||||
self.max_seq_len = max_seq_len
|
||||
self.vocab_size = vocab_size
|
||||
self.resid_pdrop = resid_pdrop
|
||||
self.emb_pdrop = emb_pdrop
|
||||
self.learned_pos_emb = learned_pos_emb
|
||||
self.attn_config = attn_config
|
||||
self.ffn_config = ffn_config
|
||||
self.init_device = init_device
|
||||
self.logit_scale = logit_scale
|
||||
self.no_bias = no_bias
|
||||
self.embedding_fraction = embedding_fraction
|
||||
self.norm_type = norm_type
|
||||
self.use_cache = use_cache
|
||||
self.init_config = init_config
|
||||
self.fc_type = fc_type
|
||||
if verbose is not None:
|
||||
warnings.warn(DeprecationWarning(
|
||||
'verbose argument for MPTConfig is now ignored and '
|
||||
'will be removed. Use python_log_level instead.'),
|
||||
stacklevel=2)
|
||||
if 'name' in kwargs:
|
||||
del kwargs['name']
|
||||
if 'loss_fn' in kwargs:
|
||||
del kwargs['loss_fn']
|
||||
if self.attn_config.get('alibi', False):
|
||||
self.learned_pos_emb = False
|
||||
warnings.warn(
|
||||
f'alibi is turned on, setting `learned_pos_emb` '
|
||||
f'to {self.learned_pos_emb}',
|
||||
stacklevel=2)
|
||||
super().__init__(**kwargs)
|
||||
self._validate_config()
|
||||
|
||||
def _set_config_defaults(
|
||||
self, config: dict[str, Any],
|
||||
config_defaults: dict[str, Any]) -> dict[str, Any]:
|
||||
for (k, v) in config_defaults.items():
|
||||
if k not in config:
|
||||
config[k] = v
|
||||
return config
|
||||
|
||||
def _validate_config(self) -> None:
|
||||
self.attn_config = self._set_config_defaults(self.attn_config,
|
||||
attn_config_defaults)
|
||||
self.ffn_config = self._set_config_defaults(self.ffn_config,
|
||||
ffn_config_defaults)
|
||||
self.init_config = self._set_config_defaults(self.init_config,
|
||||
init_config_defaults)
|
||||
if self.d_model % self.n_heads != 0:
|
||||
raise ValueError('d_model must be divisible by n_heads')
|
||||
if any(
|
||||
prob < 0 or prob > 1 for prob in
|
||||
[self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop
|
||||
]):
|
||||
raise ValueError(
|
||||
"self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are "
|
||||
"probabilities and must be between 0 and 1")
|
||||
if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
|
||||
raise ValueError(
|
||||
f"Unknown attn_impl={self.attn_config['attn_impl']}")
|
||||
if self.attn_config['prefix_lm'] and self.attn_config[
|
||||
'attn_impl'] not in ['torch', 'triton']:
|
||||
raise NotImplementedError(
|
||||
'prefix_lm only implemented with torch and triton attention.')
|
||||
if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in [
|
||||
'torch', 'triton'
|
||||
]:
|
||||
raise NotImplementedError(
|
||||
'alibi only implemented with torch and triton attention.')
|
||||
if self.attn_config['attn_uses_sequence_id'] and self.attn_config[
|
||||
'attn_impl'] not in ['torch', 'triton']:
|
||||
raise NotImplementedError(
|
||||
'attn_uses_sequence_id only implemented with torch '
|
||||
'and triton attention.')
|
||||
if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
|
||||
raise ValueError(
|
||||
'model.embedding_fraction must be between 0 (exclusive) '
|
||||
'and 1 (inclusive)!')
|
||||
if isinstance(self.logit_scale,
|
||||
str) and self.logit_scale != 'inv_sqrt_d_model':
|
||||
raise ValueError(
|
||||
f"self.logit_scale={self.logit_scale!r} is not recognized as "
|
||||
"an option; use numeric value or 'inv_sqrt_d_model'.")
|
||||
if self.init_config.get('name', None) is None:
|
||||
raise ValueError(
|
||||
f"self.init_config={self.init_config!r} 'name' needs to be set."
|
||||
)
|
||||
if not self.learned_pos_emb and (not self.attn_config['alibi']):
|
||||
warnings.warn(
|
||||
'Positional information not being provided to the model.',
|
||||
stacklevel=2)
|
||||
if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp':
|
||||
try:
|
||||
# pylint: disable=import-outside-toplevel
|
||||
import transformer_engine.pytorch as te
|
||||
del te
|
||||
except Exception as exc:
|
||||
raise ImportError(
|
||||
'TransformerEngine import failed. `fc_type: te` requires '
|
||||
'TransformerEngine to be installed. '
|
||||
'The required version of transformer_engine also requires '
|
||||
'that FlashAttention v1.0.6 be installed:\n'
|
||||
'pip install flash-attn==1.0.6 --no-build-isolation \n'
|
||||
'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156'
|
||||
) from exc
|
||||
if self.ffn_config['ffn_type'] == 'mptmlp':
|
||||
self.ffn_config['fc_type'] = self.fc_type
|
||||
elif self.ffn_config['ffn_type'] == 'te_ln_mlp':
|
||||
self.ffn_config['bias'] = not self.no_bias
|
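For reference, a small sketch (with made-up sizes) of how the `attribute_map` above exposes HF-style names and of the divisibility check in `_validate_config`:

```python
cfg = MPTConfig(d_model=1024, n_heads=16, n_layers=12)
assert cfg.hidden_size == 1024        # resolved to d_model via attribute_map
assert cfg.num_attention_heads == 16  # resolved to n_heads
assert cfg.num_hidden_layers == 12    # resolved to n_layers

# d_model must be divisible by n_heads, otherwise _validate_config() raises.
try:
    MPTConfig(d_model=1000, n_heads=16)
except ValueError as err:
    print(err)
```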
@ -1,31 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Adapted from
|
||||
# https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py
|
||||
# --------------------------------------------------------
|
||||
# NVLM-D
|
||||
# Copyright (c) 2024 NVIDIA
|
||||
# Licensed under Apache 2.0 License [see LICENSE for details]
|
||||
# --------------------------------------------------------
|
||||
from transformers import Qwen2Config
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
class NVLM_D_Config(PretrainedConfig):
|
||||
model_type = 'NVLM_D'
|
||||
is_composition = True
|
||||
|
||||
def __init__(self, vision_config=None, llm_config=None, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
# Handle vision_config initialization
|
||||
if vision_config is None:
|
||||
vision_config = {}
|
||||
|
||||
# Handle llm_config initialization
|
||||
if llm_config is None:
|
||||
llm_config = {}
|
||||
|
||||
self.vision_config = PretrainedConfig(**vision_config)
|
||||
self.text_config = Qwen2Config(**llm_config)
|
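A minimal sketch of what the constructor above produces; the values are illustrative:

```python
from transformers import PretrainedConfig, Qwen2Config

cfg = NVLM_D_Config(llm_config={"hidden_size": 1024, "num_attention_heads": 16})
assert isinstance(cfg.text_config, Qwen2Config)          # llm_config -> Qwen2Config
assert cfg.text_config.hidden_size == 1024
assert cfg.vision_config.__class__ is PretrainedConfig   # empty dict -> bare config
```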
@ -1,184 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# yapf: disable
|
||||
# ruff: noqa: E501
|
||||
# copied from https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_aimv2.py
|
||||
# and https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_ovis.py
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
from transformers import AutoConfig, PretrainedConfig
|
||||
|
||||
|
||||
class AIMv2Config(PretrainedConfig):
|
||||
"""This is the configuration class to store the configuration of an [`AIMv2Model`].
|
||||
|
||||
Instantiating a configuration with the defaults will yield a similar configuration
|
||||
to that of the [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224).
|
||||
|
||||
Args:
|
||||
hidden_size: Dimension of the hidden representations.
|
||||
intermediate_size: Dimension of the SwiGLU representations.
|
||||
num_hidden_layers: Number of hidden layers in the Transformer.
|
||||
num_attention_heads: Number of attention heads for each attention layer
|
||||
in the Transformer.
|
||||
num_channels: Number of input channels.
|
||||
image_size: Image size.
|
||||
patch_size: Patch size.
|
||||
rms_norm_eps: Epsilon value used for the RMS normalization layer.
|
||||
attention_dropout: Dropout ratio for attention probabilities.
|
||||
projection_dropout: Dropout ratio for the projection layer after the attention.
|
||||
qkv_bias: Whether to add a bias to the queries, keys and values.
|
||||
use_bias: Whether to add a bias in the feed-forward and projection layers.
|
||||
kwargs: Keyword arguments for the [`PretrainedConfig`].
|
||||
"""
|
||||
|
||||
model_type: str = "aimv2"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
hidden_size: int = 1024,
|
||||
intermediate_size: int = 2816,
|
||||
num_hidden_layers: int = 24,
|
||||
num_attention_heads: int = 8,
|
||||
num_channels: int = 3,
|
||||
image_size: int = 224,
|
||||
patch_size: int = 14,
|
||||
rms_norm_eps: float = 1e-5,
|
||||
attention_dropout: float = 0.0,
|
||||
projection_dropout: float = 0.0,
|
||||
qkv_bias: bool = False,
|
||||
use_bias: bool = False,
|
||||
**kwargs: Any,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.num_channels = num_channels
|
||||
self.patch_size = patch_size
|
||||
self.image_size = image_size
|
||||
self.attention_dropout = attention_dropout
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
|
||||
self.projection_dropout = projection_dropout
|
||||
self.qkv_bias = qkv_bias
|
||||
self.use_bias = use_bias
|
||||
|
||||
|
||||
IGNORE_ID = -100
|
||||
IMAGE_TOKEN_ID = -200
|
||||
IMAGE_TOKEN = "<image>"
|
||||
IMAGE_ATOM_ID = -300
|
||||
IMAGE_INDICATOR_IDS = [-301, -302, -303, -304, -305]
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Visual Tokenizer Configuration
|
||||
# ----------------------------------------------------------------------
|
||||
class BaseVisualTokenizerConfig(PretrainedConfig):
|
||||
|
||||
def __init__(self,
|
||||
vocab_size=16384,
|
||||
tokenize_function="softmax",
|
||||
tau=1.0,
|
||||
depths=None,
|
||||
drop_cls_token=False,
|
||||
backbone_config: Optional[Union[PretrainedConfig,
|
||||
dict]] = None,
|
||||
hidden_stride: int = 1,
|
||||
**kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.vocab_size = vocab_size
|
||||
self.tokenize_function = tokenize_function
|
||||
self.tau = tau
|
||||
if isinstance(depths, str):
|
||||
depths = [int(x) for x in depths.split('|')]
|
||||
self.depths = depths
|
||||
self.backbone_kwargs = dict[str, Any]()
|
||||
self.drop_cls_token = drop_cls_token
|
||||
if backbone_config is not None:
|
||||
assert isinstance(backbone_config, (PretrainedConfig, dict)), \
|
||||
f"expected `backbone_config` to be an instance of PretrainedConfig or dict, but got {type(backbone_config)}"
|
||||
if not isinstance(backbone_config, PretrainedConfig):
|
||||
model_type = backbone_config['model_type']
|
||||
if model_type != "aimv2":
|
||||
backbone_config.pop('model_type')
|
||||
backbone_config = AutoConfig.for_model(model_type, **backbone_config)
|
||||
else:
|
||||
backbone_config = AIMv2Config(**backbone_config)
|
||||
self.backbone_config = backbone_config
|
||||
self.hidden_stride = hidden_stride
|
||||
|
||||
|
||||
class Aimv2VisualTokenizerConfig(BaseVisualTokenizerConfig):
|
||||
model_type = "aimv2_visual_tokenizer"
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
if self.drop_cls_token:
|
||||
self.drop_cls_token = False
|
||||
if self.depths:
|
||||
assert len(self.depths) == 1
|
||||
self.backbone_kwargs['num_hidden_layers'] = self.depths[0]
|
||||
|
||||
|
||||
class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig):
|
||||
model_type = "siglip_visual_tokenizer"
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
if self.drop_cls_token:
|
||||
self.drop_cls_token = False
|
||||
if self.depths:
|
||||
assert len(self.depths) == 1
|
||||
self.backbone_kwargs['num_hidden_layers'] = self.depths[0]
|
||||
|
||||
|
||||
AutoConfig.register("siglip_visual_tokenizer", SiglipVisualTokenizerConfig)
|
||||
AutoConfig.register("aimv2_visual_tokenizer", Aimv2VisualTokenizerConfig)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Ovis Configuration
|
||||
# ----------------------------------------------------------------------
|
||||
class OvisConfig(PretrainedConfig):
|
||||
model_type = "ovis"
|
||||
|
||||
def __init__(self,
|
||||
llm_config: Optional[Union[PretrainedConfig, dict]] = None,
|
||||
visual_tokenizer_config: Optional[Union[PretrainedConfig,
|
||||
dict]] = None,
|
||||
multimodal_max_length=8192,
|
||||
hidden_size=None,
|
||||
conversation_formatter_class=None,
|
||||
llm_attn_implementation=None,
|
||||
disable_tie_weight=False,
|
||||
**kwargs):
|
||||
super().__init__(**kwargs)
|
||||
if llm_config is not None:
|
||||
assert isinstance(llm_config, (PretrainedConfig, dict)), \
|
||||
f"expected `llm_config` to be an instance of PretrainedConfig or dict, but got {type(llm_config)}"
|
||||
if not isinstance(llm_config, PretrainedConfig):
|
||||
model_type = llm_config['model_type']
|
||||
llm_config.pop('model_type')
|
||||
llm_config = AutoConfig.for_model(model_type, **llm_config)
|
||||
|
||||
# map llm_config to text_config
|
||||
self.text_config = llm_config
|
||||
if visual_tokenizer_config is not None:
|
||||
assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), \
|
||||
f"expected `visual_tokenizer_config` to be an instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)}"
|
||||
if not isinstance(visual_tokenizer_config, PretrainedConfig):
|
||||
model_type = visual_tokenizer_config['model_type']
|
||||
visual_tokenizer_config.pop('model_type')
|
||||
visual_tokenizer_config = AutoConfig.for_model(
|
||||
model_type, **visual_tokenizer_config)
|
||||
|
||||
self.visual_tokenizer_config = visual_tokenizer_config
|
||||
self.multimodal_max_length = multimodal_max_length
|
||||
self.hidden_size = hidden_size
|
||||
self.conversation_formatter_class = conversation_formatter_class
|
||||
self.llm_attn_implementation = llm_attn_implementation
|
||||
self.disable_tie_weight = disable_tie_weight
|
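Because `OvisConfig` dispatches nested dict configs through `AutoConfig.for_model`, the `AutoConfig.register` calls above make the visual tokenizer resolvable by its `model_type`. A hedged sketch with made-up field values:

```python
cfg = OvisConfig(
    llm_config={"model_type": "llama", "hidden_size": 512,
                "num_hidden_layers": 2, "num_attention_heads": 4},
    visual_tokenizer_config={"model_type": "aimv2_visual_tokenizer",
                             "vocab_size": 16384},
)
assert cfg.text_config.model_type == "llama"
assert isinstance(cfg.visual_tokenizer_config, Aimv2VisualTokenizerConfig)
```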
@ -1,54 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Adapted from
|
||||
# https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/configuration_skywork_chat.py
|
||||
# --------------------------------------------------------
|
||||
# SkyworkR1V
|
||||
# Copyright (c) 2025 Skywork
|
||||
# Licensed under The MIT License [see LICENSE for details]
|
||||
# --------------------------------------------------------
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
class SkyworkR1VChatConfig(PretrainedConfig):
|
||||
model_type = 'internvl_chat'
|
||||
is_composition = True
|
||||
|
||||
def __init__(self,
|
||||
vision_config=None,
|
||||
llm_config=None,
|
||||
use_backbone_lora=0,
|
||||
use_llm_lora=0,
|
||||
select_layer=-1,
|
||||
force_image_size=None,
|
||||
downsample_ratio=0.5,
|
||||
template=None,
|
||||
dynamic_image_size=False,
|
||||
use_thumbnail=False,
|
||||
ps_version='v1',
|
||||
min_dynamic_patch=1,
|
||||
max_dynamic_patch=6,
|
||||
**kwargs):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
if vision_config is None:
|
||||
vision_config = {}
|
||||
|
||||
if llm_config is None:
|
||||
llm_config = {}
|
||||
|
||||
self.vision_config = PretrainedConfig(**vision_config)
|
||||
self.text_config = PretrainedConfig(**llm_config)
|
||||
|
||||
self.use_backbone_lora = use_backbone_lora
|
||||
self.use_llm_lora = use_llm_lora
|
||||
self.select_layer = select_layer
|
||||
self.force_image_size = force_image_size
|
||||
self.downsample_ratio = downsample_ratio
|
||||
self.template = template
|
||||
self.dynamic_image_size = dynamic_image_size
|
||||
self.use_thumbnail = use_thumbnail
|
||||
self.ps_version = ps_version # pixel shuffle version
|
||||
self.min_dynamic_patch = min_dynamic_patch
|
||||
self.max_dynamic_patch = max_dynamic_patch
|
@ -1,247 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
||||
# and OPT implementations in this library. It has been modified from its
|
||||
# original forms to accommodate minor architectural differences compared
|
||||
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Solar model configuration"""
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
from transformers.utils import logging
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class SolarConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store
|
||||
the configuration of a [`SolarModel`].
|
||||
It is used to instantiate an LLaMA model
|
||||
according to the specified arguments,
|
||||
defining the model architecture.
|
||||
Instantiating a configuration with the
|
||||
defaults will yield a similar
|
||||
configuration to that of the LLaMA-7B.
|
||||
Configuration objects inherit from [`PretrainedConfig`]
|
||||
and can be used to control the model outputs.
|
||||
Read the documentation from [`PretrainedConfig`] for more information.
|
||||
Args:
|
||||
vocab_size (`int`, *optional*, defaults to 32000):
|
||||
Vocabulary size of the LLaMA model.
|
||||
Defines the number of different tokens
|
||||
that can be represented by the `inputs_ids`
|
||||
passed when calling [`SolarModel`]
|
||||
hidden_size (`int`, *optional*, defaults to 4096):
|
||||
Dimension of the hidden representations.
|
||||
intermediate_size (`int`, *optional*, defaults to 11008):
|
||||
Dimension of the MLP representations.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 32):
|
||||
Number of hidden layers in the Transformer decoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 32):
|
||||
Number of attention heads for each attention layer
|
||||
in the Transformer decoder.
|
||||
num_key_value_heads (`int`, *optional*):
|
||||
This is the number of key_value heads that
|
||||
should be used to implement Grouped Query Attention. If
|
||||
`num_key_value_heads=num_attention_heads`,
|
||||
the model will use Multi Head Attention (MHA), if
|
||||
`num_key_value_heads=1`, the model
|
||||
will use Multi Query Attention (MQA);
|
||||
otherwise GQA is used. When
|
||||
converting a multi-head checkpoint to a GQA checkpoint,
|
||||
each group key and value head should be constructed
|
||||
by meanpooling all the original heads within that group.
|
||||
For more details, check out [this paper]
|
||||
(https://arxiv.org/pdf/2305.13245.pdf).
|
||||
If it is not specified, will default to
|
||||
`num_attention_heads`.
|
||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||
The non-linear activation function (function or string)
|
||||
in the decoder.
|
||||
max_position_embeddings (`int`, *optional*, defaults to 2048):
|
||||
The maximum sequence length that this model might ever be used with.
|
||||
Solar 1 supports up to 2048 tokens,
|
||||
Solar 2 up to 4096, CodeSolar up to 16384.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of
|
||||
the truncated_normal_initializer for initializing
|
||||
all weight matrices.
|
||||
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
|
||||
The epsilon used by the rms normalization layers.
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return
|
||||
the last key/values attentions (not used by all models). Only
|
||||
relevant if `config.is_decoder=True`.
|
||||
pad_token_id (`int`, *optional*):
|
||||
Padding token id.
|
||||
bos_token_id (`int`, *optional*, defaults to 1):
|
||||
Beginning of stream token id.
|
||||
eos_token_id (`int`, *optional*, defaults to 2):
|
||||
End of stream token id.
|
||||
pretraining_tp (`int`, *optional*, defaults to 1):
|
||||
Experimental feature. Tensor parallelism rank
|
||||
used during pretraining.
|
||||
Please refer to [this
|
||||
document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism)
|
||||
to understand more about it. This value is
|
||||
necessary to ensure exact reproducibility
|
||||
of the pretraining results.
|
||||
Please refer to [this
|
||||
issue](https://github.com/pytorch/pytorch/issues/76232).
|
||||
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
|
||||
Whether to tie the input and output word embeddings.
|
||||
rope_theta (`float`, *optional*, defaults to 10000.0):
|
||||
The base period of the RoPE embeddings.
|
||||
rope_scaling (`dict`, *optional*):
|
||||
Dictionary containing the scaling configuration for
|
||||
the RoPE embeddings.
|
||||
Currently supports two scaling
|
||||
strategies: linear and dynamic.
|
||||
Their scaling factor must be a float greater than 1.
|
||||
The expected format is
|
||||
`{"type": strategy name, "factor": scaling factor}`.
|
||||
When using this flag, don't update
|
||||
`max_position_embeddings` to the expected new maximum.
|
||||
See the following thread for more information on how
|
||||
these scaling strategies behave:
|
||||
https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
|
||||
experimental feature, subject to breaking
|
||||
API changes in future versions.
|
||||
attention_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use a bias in the query, key, value
|
||||
and output projection layers during self-attention.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
mlp_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use a bias in up_proj, down_proj and gate_proj
|
||||
layers in the MLP layers.
|
||||
sliding_window (`int`, *optional*, defaults to 2047):
|
||||
Sliding window attention window size. If not specified,
|
||||
will default to `2047`.
|
||||
```python
|
||||
>>> from transformers import SolarModel, SolarConfig
|
||||
>>> # Initializing a Solar-pro style configuration
|
||||
>>> configuration = SolarConfig()
|
||||
>>> # Initializing a model from the Solar-pro style configuration
|
||||
>>> model = SolarModel(configuration)
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```"""
|
||||
|
||||
model_type = "solar"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=32000,
|
||||
hidden_size=4096,
|
||||
intermediate_size=11008,
|
||||
num_hidden_layers=32,
|
||||
num_attention_heads=32,
|
||||
num_key_value_heads=None,
|
||||
hidden_act="silu",
|
||||
max_position_embeddings=2048,
|
||||
initializer_range=0.02,
|
||||
rms_norm_eps=1e-6,
|
||||
use_cache=True,
|
||||
pad_token_id=None,
|
||||
bos_token_id=1,
|
||||
eos_token_id=2,
|
||||
pretraining_tp=1,
|
||||
tie_word_embeddings=False,
|
||||
rope_theta=10000.0,
|
||||
rope_scaling=None,
|
||||
attention_bias=False,
|
||||
attention_dropout=0.0,
|
||||
mlp_bias=False,
|
||||
sliding_window=2047,
|
||||
bskcn_1=None,
|
||||
bskcn_2=None,
|
||||
bskcn_3=None,
|
||||
bskcn_4=None,
|
||||
bskcn_tv=None,
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
|
||||
# for backward compatibility
|
||||
if num_key_value_heads is None:
|
||||
num_key_value_heads = num_attention_heads
|
||||
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.hidden_act = hidden_act
|
||||
self.initializer_range = initializer_range
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
self.pretraining_tp = pretraining_tp
|
||||
self.use_cache = use_cache
|
||||
self.rope_theta = rope_theta
|
||||
self.rope_scaling = rope_scaling
|
||||
self._rope_scaling_validation()
|
||||
self.attention_bias = attention_bias
|
||||
self.attention_dropout = attention_dropout
|
||||
self.mlp_bias = mlp_bias
|
||||
self.sliding_window = sliding_window
|
||||
self.bskcn_1 = bskcn_1 if bskcn_1 is not None else [12, 20, 32, 44]
|
||||
self.bskcn_2 = bskcn_2 if bskcn_2 is not None else [20, 32]
|
||||
self.bskcn_3 = bskcn_3 if bskcn_3 is not None else [16, 24, 36, 48]
|
||||
self.bskcn_4 = bskcn_4 if bskcn_4 is not None else [28, 40]
|
||||
self.bskcn_tv = bskcn_tv if bskcn_tv is not None else [0.9, 0.8]
|
||||
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def _rope_scaling_validation(self):
|
||||
"""
|
||||
Validate the `rope_scaling` configuration.
|
||||
"""
|
||||
if self.rope_scaling is None:
|
||||
return
|
||||
|
||||
if (not isinstance(self.rope_scaling, dict)
|
||||
or len(self.rope_scaling) != 2):
|
||||
raise ValueError(
|
||||
"`rope_scaling` must be a dictionary with two fields,"
|
||||
" `type` and `factor`, "
|
||||
f"got {self.rope_scaling}")
|
||||
rope_scaling_type = self.rope_scaling.get("type", None)
|
||||
rope_scaling_factor = self.rope_scaling.get("factor", None)
|
||||
if rope_scaling_type is None or rope_scaling_type not in [
|
||||
"linear",
|
||||
"dynamic",
|
||||
]:
|
||||
raise ValueError(f"`rope_scaling`'s type field must be one of "
|
||||
f"['linear', 'dynamic'], got {rope_scaling_type}")
|
||||
if (rope_scaling_factor is None
|
||||
or not isinstance(rope_scaling_factor, float)
|
||||
or rope_scaling_factor <= 1.0):
|
||||
raise ValueError(
|
||||
f"`rope_scaling`'s factor field must be a float > 1,"
|
||||
f" got {rope_scaling_factor}")
|
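The `_rope_scaling_validation` helper above only accepts the legacy two-field format. A brief sketch of what passes and what is rejected (values are illustrative):

```python
# Accepted: type in {"linear", "dynamic"} and factor a float > 1.0.
SolarConfig(rope_scaling={"type": "linear", "factor": 2.0})

# Rejected: factor must be a float strictly greater than 1.0.
try:
    SolarConfig(rope_scaling={"type": "dynamic", "factor": 1.0})
except ValueError as err:
    print(err)
```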
@ -1,64 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# adapted from https://www.modelscope.cn/models/TeleAI/TeleChat2-3B/resolve/master/configuration_telechat2.py
|
||||
""" Telechat configuration compatible with LlamaConfig. """
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
class Telechat2Config(PretrainedConfig):
|
||||
|
||||
model_type = "telechat"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
attribute_map = {
|
||||
"num_hidden_layers": "n_layer",
|
||||
"num_attention_heads": "n_head",
|
||||
"intermediate_size": "ffn_hidden_size",
|
||||
"rms_norm_eps": "layer_norm_epsilon"
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=160256,
|
||||
hidden_size=4096,
|
||||
n_layer=30,
|
||||
n_head=32,
|
||||
layer_norm_epsilon=1e-5,
|
||||
initializer_range=0.02,
|
||||
use_cache=True,
|
||||
bos_token_id=1,
|
||||
eos_token_id=2,
|
||||
apply_residual_connection_post_layernorm=False,
|
||||
hidden_dropout=0.0,
|
||||
attention_dropout=0.0,
|
||||
ffn_hidden_size=12288,
|
||||
training_seqlen=8192,
|
||||
logn=True,
|
||||
embed_layernorm=False,
|
||||
hidden_act="silu",
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
n_embed = kwargs.pop("n_embed", None)
|
||||
self.hidden_size = hidden_size if n_embed is None else n_embed
|
||||
self.n_layer = n_layer
|
||||
self.n_head = n_head
|
||||
self.layer_norm_epsilon = layer_norm_epsilon
|
||||
self.initializer_range = initializer_range
|
||||
self.use_cache = use_cache
|
||||
self.apply_residual_connection_post_layernorm = (
|
||||
apply_residual_connection_post_layernorm)
|
||||
self.hidden_dropout = hidden_dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
self.bos_token_id = bos_token_id
|
||||
self.eos_token_id = eos_token_id
|
||||
self.logn = logn
|
||||
self.training_seqlen = training_seqlen
|
||||
self.embed_layernorm = embed_layernorm
|
||||
self.num_key_value_heads = kwargs.pop("num_key_value_heads", None)
|
||||
self.ffn_hidden_size = ffn_hidden_size
|
||||
self.hidden_act = hidden_act
|
||||
super().__init__(bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
**kwargs)
|
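As with `MPTConfig`, the `attribute_map` above lets Llama-style attribute names resolve to TeleChat's native fields; a brief illustrative check:

```python
cfg = Telechat2Config(n_layer=4, n_head=8, ffn_hidden_size=2048)
assert cfg.num_hidden_layers == 4     # -> n_layer
assert cfg.num_attention_heads == 8   # -> n_head
assert cfg.intermediate_size == 2048  # -> ffn_hidden_size
assert cfg.rms_norm_eps == 1e-5       # -> layer_norm_epsilon (default)
```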
@ -1,5 +1,12 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Multi-modal processors may be defined in this directory for the following
|
||||
reasons:
|
||||
|
||||
- There is no processing file defined by HF Hub or Transformers library.
|
||||
- There is a need to override the existing processor to support vLLM.
|
||||
"""
|
||||
|
||||
from vllm.transformers_utils.processors.deepseek_vl2 import (
|
||||
DeepseekVLV2Processor)
|
||||
|