mirror of
https://github.com/huggingface/transformers.git
synced 2025-10-21 17:48:57 +08:00
Compare commits
11 Commits
v4.48.2
...
cohere-dif
Author | SHA1 | Date | |
---|---|---|---|
9faf7b0665 | |||
4547337ba5 | |||
1eff33af54 | |||
1146aa0be7 | |||
df3226143b | |||
3dc5e14321 | |||
bdb4cc9f0f | |||
4f6104a25b | |||
6135a1fb72 | |||
dc78118531 | |||
dd518307e7 |
@ -1,10 +1,11 @@
|
|||||||
# coding=utf-8
|
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||||
# Copyright 2024 Cohere team. All rights reserved.
|
# This file was automatically generated from <path_to_modular_file.py>.
|
||||||
|
# Do NOT edit this file manually as any edits will be overwritten by the generation of
|
||||||
|
# the file from the modular. If any change should be done, please apply the change to the
|
||||||
|
# modular_xxx.py file directly. One of our CI enforces this
|
||||||
|
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||||
|
# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.
|
||||||
#
|
#
|
||||||
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
|
||||||
# and OPT implementations in this library. It has been modified from its
|
|
||||||
# original forms to accommodate minor architectural differences compared
|
|
||||||
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
|
||||||
#
|
#
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
# you may not use this file except in compliance with the License.
|
# you may not use this file except in compliance with the License.
|
||||||
@ -17,41 +18,32 @@
|
|||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""Cohere model configuration"""
|
|
||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
from ...modeling_rope_utils import rope_config_validation
|
|
||||||
from ...utils import logging
|
|
||||||
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class CohereConfig(PretrainedConfig):
|
class CohereConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
This is the configuration class to store the configuration of a [`CohereModel`]. It is used to instantiate a Cohere
|
This is the configuration class to store the configuration of a [`CohereModel`]. It is used to instantiate a Cohere
|
||||||
model according to the specified arguments, defining the model architecture.
|
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
|
||||||
|
defaults will yield a similar configuration to that of the Cohere-7B.
|
||||||
|
e.g. [google/cohere-7b](https://huggingface.co/google/cohere-7b)
|
||||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||||
documentation from [`PretrainedConfig`] for more information. Instantiating a configuration
|
documentation from [`PretrainedConfig`] for more information.
|
||||||
with the defaults will yield a similar configuration to that of the [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01) model.
|
|
||||||
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_size (`int`, *optional*, defaults to 256000):
|
vocab_size (`int`, *optional*, defaults to 256000):
|
||||||
Vocabulary size of the Cohere model. Defines the number of different tokens that can be represented by the
|
Vocabulary size of the Cohere model. Defines the number of different tokens that can be represented by the
|
||||||
`input_ids` passed when calling [`CohereModel`]
|
`input_ids` passed when calling [`CohereModel`]
|
||||||
hidden_size (`int`, *optional*, defaults to 8192):
|
hidden_size (`int`, *optional*, defaults to 3072):
|
||||||
Dimension of the hidden representations.
|
Dimension of the hidden representations.
|
||||||
intermediate_size (`int`, *optional*, defaults to 22528):
|
intermediate_size (`int`, *optional*, defaults to 24576):
|
||||||
Dimension of the MLP representations.
|
Dimension of the MLP representations.
|
||||||
logit_scale (`float`, *optional*, defaults to 0.0625):
|
num_hidden_layers (`int`, *optional*, defaults to 28):
|
||||||
The scaling factor for the output logits.
|
|
||||||
num_hidden_layers (`int`, *optional*, defaults to 40):
|
|
||||||
Number of hidden layers in the Transformer decoder.
|
Number of hidden layers in the Transformer decoder.
|
||||||
num_attention_heads (`int`, *optional*, defaults to 64):
|
num_attention_heads (`int`, *optional*, defaults to 16):
|
||||||
Number of attention heads for each attention layer in the Transformer decoder.
|
Number of attention heads for each attention layer in the Transformer decoder.
|
||||||
num_key_value_heads (`int`, *optional*):
|
num_key_value_heads (`int`, *optional*, defaults to 16):
|
||||||
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
|
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
|
||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
@ -59,83 +51,48 @@ class CohereConfig(PretrainedConfig):
|
|||||||
by mean-pooling all the original heads within that group. For more details, check out [this
|
by mean-pooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
head_dim (`int`, *optional*, defaults to 256):
|
||||||
The non-linear activation function (function or string) in the decoder.
|
The attention head dimension.
|
||||||
|
hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
|
||||||
|
The legacy activation function. It is overwritten by the `hidden_activation`.
|
||||||
|
hidden_activation (`str` or `function`, *optional*):
|
||||||
|
The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
|
||||||
|
if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
|
||||||
max_position_embeddings (`int`, *optional*, defaults to 8192):
|
max_position_embeddings (`int`, *optional*, defaults to 8192):
|
||||||
The maximum sequence length that this model might ever be used with.
|
The maximum sequence length that this model might ever be used with.
|
||||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||||
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
|
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
|
||||||
The epsilon used by the layer normalization.
|
The epsilon used by the rms normalization layers.
|
||||||
use_cache (`bool`, *optional*, defaults to `True`):
|
use_cache (`bool`, *optional*, defaults to `True`):
|
||||||
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
||||||
relevant if `config.is_decoder=True`.
|
relevant if `config.is_decoder=True`.
|
||||||
pad_token_id (`int`, *optional*, defaults to 0):
|
pad_token_id (`int`, *optional*, defaults to 0):
|
||||||
Padding token id.
|
Padding token id.
|
||||||
bos_token_id (`int`, *optional*, defaults to 5):
|
eos_token_id (`int`, *optional*, defaults to 1):
|
||||||
Beginning of stream token id.
|
|
||||||
eos_token_id (`int`, *optional*, defaults to 255001):
|
|
||||||
End of stream token id.
|
End of stream token id.
|
||||||
|
bos_token_id (`int`, *optional*, defaults to 2):
|
||||||
|
Beginning of stream token id.
|
||||||
tie_word_embeddings (`bool`, *optional*, defaults to `True`):
|
tie_word_embeddings (`bool`, *optional*, defaults to `True`):
|
||||||
Whether to tie weight embeddings
|
Whether to tie weight embeddings
|
||||||
rope_theta (`float`, *optional*, defaults to 10000.0):
|
rope_theta (`float`, *optional*, defaults to 10000.0):
|
||||||
The base period of the RoPE embeddings.
|
The base period of the RoPE embeddings.
|
||||||
rope_scaling (`Dict`, *optional*):
|
|
||||||
Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
|
|
||||||
and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
|
|
||||||
accordingly.
|
|
||||||
Expected contents:
|
|
||||||
`rope_type` (`str`):
|
|
||||||
The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
|
|
||||||
'llama3'], with 'default' being the original RoPE implementation.
|
|
||||||
`factor` (`float`, *optional*):
|
|
||||||
Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
|
|
||||||
most scaling types, a `factor` of x will enable the model to handle sequences of length x *
|
|
||||||
original maximum pre-trained length.
|
|
||||||
`original_max_position_embeddings` (`int`, *optional*):
|
|
||||||
Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
|
|
||||||
pretraining.
|
|
||||||
`attention_factor` (`float`, *optional*):
|
|
||||||
Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
|
|
||||||
computation. If unspecified, it defaults to value recommended by the implementation, using the
|
|
||||||
`factor` field to infer the suggested value.
|
|
||||||
`beta_fast` (`float`, *optional*):
|
|
||||||
Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
|
|
||||||
ramp function. If unspecified, it defaults to 32.
|
|
||||||
`beta_slow` (`float`, *optional*):
|
|
||||||
Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
|
|
||||||
ramp function. If unspecified, it defaults to 1.
|
|
||||||
`short_factor` (`List[float]`, *optional*):
|
|
||||||
Only used with 'longrope'. The scaling factor to be applied to short contexts (<
|
|
||||||
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
|
|
||||||
size divided by the number of attention heads divided by 2
|
|
||||||
`long_factor` (`List[float]`, *optional*):
|
|
||||||
Only used with 'longrope'. The scaling factor to be applied to long contexts (>
|
|
||||||
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
|
|
||||||
size divided by the number of attention heads divided by 2
|
|
||||||
`low_freq_factor` (`float`, *optional*):
|
|
||||||
Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
|
|
||||||
`high_freq_factor` (`float`, *optional*):
|
|
||||||
Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
|
|
||||||
attention_bias (`bool`, *optional*, defaults to `False`):
|
attention_bias (`bool`, *optional*, defaults to `False`):
|
||||||
Whether to use a bias in the query, key, value and output projection layers during self-attention.
|
Whether to use a bias in the query, key, value and output projection layers during self-attention.
|
||||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||||
The dropout ratio for the attention probabilities.
|
The dropout ratio for the attention probabilities.
|
||||||
use_qk_norm (`bool`, *optional*, defaults to `False`):
|
|
||||||
Whether to use query-key normalization in the attention
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
>>> from transformers import CohereModel, CohereConfig
|
>>> from transformers import CohereModel, CohereConfig
|
||||||
|
>>> # Initializing a Cohere cohere-7b style configuration
|
||||||
>>> # Initializing a Cohere model configuration
|
|
||||||
>>> configuration = CohereConfig()
|
>>> configuration = CohereConfig()
|
||||||
|
>>> # Initializing a model from the cohere-7b style configuration
|
||||||
>>> # Initializing a model from the Cohere configuration
|
>>> model = CohereModel(configuration)
|
||||||
>>> model = CohereModel(configuration) # doctest: +SKIP
|
|
||||||
|
|
||||||
>>> # Accessing the model configuration
|
>>> # Accessing the model configuration
|
||||||
>>> configuration = model.config # doctest: +SKIP
|
>>> configuration = model.config
|
||||||
```"""
|
```
|
||||||
|
use_qk_norm (`bool`, *optional*, defaults to `False`):
|
||||||
|
Whether to use query-key normalization in the attention
|
||||||
|
"""
|
||||||
|
|
||||||
model_type = "cohere"
|
model_type = "cohere"
|
||||||
keys_to_ignore_at_inference = ["past_key_values"]
|
keys_to_ignore_at_inference = ["past_key_values"]
|
||||||
@ -145,51 +102,57 @@ class CohereConfig(PretrainedConfig):
|
|||||||
vocab_size=256000,
|
vocab_size=256000,
|
||||||
hidden_size=8192,
|
hidden_size=8192,
|
||||||
intermediate_size=22528,
|
intermediate_size=22528,
|
||||||
logit_scale=0.0625,
|
|
||||||
num_hidden_layers=40,
|
num_hidden_layers=40,
|
||||||
num_attention_heads=64,
|
num_attention_heads=64,
|
||||||
num_key_value_heads=None,
|
num_key_value_heads=None,
|
||||||
|
head_dim=256,
|
||||||
hidden_act="silu",
|
hidden_act="silu",
|
||||||
|
hidden_activation=None,
|
||||||
max_position_embeddings=8192,
|
max_position_embeddings=8192,
|
||||||
initializer_range=0.02,
|
initializer_range=0.02,
|
||||||
layer_norm_eps=1e-5,
|
rms_norm_eps=1e-05,
|
||||||
use_cache=True,
|
use_cache=True,
|
||||||
pad_token_id=0,
|
pad_token_id=0,
|
||||||
bos_token_id=5,
|
|
||||||
eos_token_id=255001,
|
eos_token_id=255001,
|
||||||
|
bos_token_id=5,
|
||||||
tie_word_embeddings=True,
|
tie_word_embeddings=True,
|
||||||
rope_theta=10000.0,
|
rope_theta=10000.0,
|
||||||
rope_scaling=None,
|
|
||||||
attention_bias=False,
|
attention_bias=False,
|
||||||
attention_dropout=0.0,
|
attention_dropout=0.0,
|
||||||
use_qk_norm=False,
|
use_qk_norm=False,
|
||||||
|
layer_norm_eps=1e-05,
|
||||||
|
logit_scale=0.0625,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
self.vocab_size = vocab_size
|
self.vocab_size = vocab_size
|
||||||
self.max_position_embeddings = max_position_embeddings
|
self.max_position_embeddings = max_position_embeddings
|
||||||
|
|
||||||
self.hidden_size = hidden_size
|
self.hidden_size = hidden_size
|
||||||
self.logit_scale = logit_scale
|
|
||||||
self.intermediate_size = intermediate_size
|
self.intermediate_size = intermediate_size
|
||||||
self.num_hidden_layers = num_hidden_layers
|
self.num_hidden_layers = num_hidden_layers
|
||||||
self.num_attention_heads = num_attention_heads
|
self.num_attention_heads = num_attention_heads
|
||||||
|
self.head_dim = head_dim
|
||||||
# for backward compatibility
|
|
||||||
if num_key_value_heads is None:
|
|
||||||
num_key_value_heads = num_attention_heads
|
|
||||||
|
|
||||||
self.num_key_value_heads = num_key_value_heads
|
self.num_key_value_heads = num_key_value_heads
|
||||||
|
|
||||||
self.hidden_act = hidden_act
|
self.hidden_act = hidden_act
|
||||||
|
self.hidden_activation = hidden_activation
|
||||||
self.initializer_range = initializer_range
|
self.initializer_range = initializer_range
|
||||||
self.layer_norm_eps = layer_norm_eps
|
self.rms_norm_eps = layer_norm_eps
|
||||||
self.use_cache = use_cache
|
self.use_cache = use_cache
|
||||||
self.rope_theta = rope_theta
|
self.rope_theta = rope_theta
|
||||||
self.rope_scaling = rope_scaling
|
|
||||||
self.attention_bias = attention_bias
|
self.attention_bias = attention_bias
|
||||||
self.attention_dropout = attention_dropout
|
self.attention_dropout = attention_dropout
|
||||||
self.use_qk_norm = use_qk_norm
|
self.use_qk_norm = use_qk_norm
|
||||||
|
self.layer_norm_eps = layer_norm_eps
|
||||||
|
self.itermediate_size = intermediate_size
|
||||||
|
|
||||||
# Validate the correctness of rotary position embeddings parameters
|
self.bos_token_id = bos_token_id
|
||||||
rope_config_validation(self)
|
self.eos_token_id = eos_token_id
|
||||||
|
|
||||||
|
self.logit_scale = logit_scale
|
||||||
|
|
||||||
|
if num_key_value_heads is None:
|
||||||
|
self.num_key_value_heads = num_attention_heads
|
||||||
|
|
||||||
super().__init__(
|
super().__init__(
|
||||||
pad_token_id=pad_token_id,
|
pad_token_id=pad_token_id,
|
||||||
|
@ -1,10 +1,11 @@
|
|||||||
# coding=utf-8
|
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||||
# Copyright 2024 Cohere team. All rights reserved.
|
# This file was automatically generated from <path_to_modular_file.py>.
|
||||||
|
# Do NOT edit this file manually as any edits will be overwritten by the generation of
|
||||||
|
# the file from the modular. If any change should be done, please apply the change to the
|
||||||
|
# modular_xxx.py file directly. One of our CI enforces this
|
||||||
|
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||||
|
# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.
|
||||||
#
|
#
|
||||||
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
|
||||||
# and OPT implementations in this library. It has been modified from its
|
|
||||||
# original forms to accommodate minor architectural differences compared
|
|
||||||
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
|
||||||
#
|
#
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
# you may not use this file except in compliance with the License.
|
# you may not use this file except in compliance with the License.
|
||||||
@ -17,11 +18,6 @@
|
|||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
# This file is based on the LLama model definition file in transformers
|
|
||||||
|
|
||||||
"""PyTorch Cohere model."""
|
|
||||||
|
|
||||||
import math
|
import math
|
||||||
from typing import List, Optional, Tuple, Union
|
from typing import List, Optional, Tuple, Union
|
||||||
|
|
||||||
@ -38,31 +34,21 @@ from ...modeling_outputs import (
|
|||||||
BaseModelOutputWithPast,
|
BaseModelOutputWithPast,
|
||||||
CausalLMOutputWithPast,
|
CausalLMOutputWithPast,
|
||||||
)
|
)
|
||||||
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
|
|
||||||
from ...modeling_utils import PreTrainedModel
|
from ...modeling_utils import PreTrainedModel
|
||||||
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
|
|
||||||
from ...utils import (
|
from ...utils import (
|
||||||
add_start_docstrings,
|
add_start_docstrings,
|
||||||
add_start_docstrings_to_model_forward,
|
add_start_docstrings_to_model_forward,
|
||||||
is_flash_attn_2_available,
|
|
||||||
is_flash_attn_greater_or_equal_2_10,
|
|
||||||
is_torchdynamo_compiling,
|
|
||||||
logging,
|
logging,
|
||||||
replace_return_docstrings,
|
replace_return_docstrings,
|
||||||
)
|
)
|
||||||
from .configuration_cohere import CohereConfig
|
from .configuration_cohere import CohereConfig
|
||||||
|
|
||||||
|
|
||||||
if is_flash_attn_2_available():
|
|
||||||
from ...modeling_flash_attention_utils import _flash_attention_forward
|
|
||||||
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
_CONFIG_FOR_DOC = "CohereConfig"
|
_CONFIG_FOR_DOC = "CohereConfig"
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position
|
|
||||||
def _prepare_4d_causal_attention_mask_with_cache_position(
|
def _prepare_4d_causal_attention_mask_with_cache_position(
|
||||||
attention_mask: torch.Tensor,
|
attention_mask: torch.Tensor,
|
||||||
sequence_length: int,
|
sequence_length: int,
|
||||||
@ -116,7 +102,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
|
|||||||
return causal_mask
|
return causal_mask
|
||||||
|
|
||||||
|
|
||||||
class CohereLayerNorm(nn.Module):
|
class CohereRMSNorm(nn.Module):
|
||||||
def __init__(self, hidden_size=None, eps=1e-5, bias=False):
|
def __init__(self, hidden_size=None, eps=1e-5, bias=False):
|
||||||
"""The hidden size can be a tuple or an int. The tuple is used for QKNorm to normalize across head_dim"""
|
"""The hidden size can be a tuple or an int. The tuple is used for QKNorm to normalize across head_dim"""
|
||||||
super().__init__()
|
super().__init__()
|
||||||
@ -133,105 +119,45 @@ class CohereLayerNorm(nn.Module):
|
|||||||
return hidden_states.to(input_dtype)
|
return hidden_states.to(input_dtype)
|
||||||
|
|
||||||
|
|
||||||
ALL_LAYERNORM_LAYERS.append(CohereLayerNorm)
|
class CohereLayerNorm(CohereRMSNorm):
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
logger.warning_once("CohereLayerNorm is deprecated by CohereRMSNorm and will be removed in v4.45.")
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
class CohereRotaryEmbedding(nn.Module):
|
class CohereRotaryEmbedding(nn.Module):
|
||||||
# Note: the forward pass of this RoPE is slightly different from Llama's, resulting in different `sin`/`cos` for
|
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
|
||||||
# the same parameterization. The differences are highlighted with a comment.
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
dim=None,
|
|
||||||
max_position_embeddings=2048,
|
|
||||||
base=10000,
|
|
||||||
device=None,
|
|
||||||
scaling_factor=1.0,
|
|
||||||
rope_type="default",
|
|
||||||
config: Optional[CohereConfig] = None,
|
|
||||||
):
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
# TODO (joao): remove the `if` below, only used for BC
|
self.scaling_factor = scaling_factor
|
||||||
self.rope_kwargs = {}
|
self.dim = dim
|
||||||
if config is None:
|
self.max_position_embeddings = max_position_embeddings
|
||||||
logger.warning_once(
|
self.base = base
|
||||||
"`CohereRotaryEmbedding` can now be fully parameterized by passing the model config through the "
|
inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
|
||||||
"`config` argument. All other arguments will be removed in v4.46"
|
|
||||||
)
|
|
||||||
self.rope_kwargs = {
|
|
||||||
"rope_type": rope_type,
|
|
||||||
"factor": scaling_factor,
|
|
||||||
"dim": dim,
|
|
||||||
"base": base,
|
|
||||||
"max_position_embeddings": max_position_embeddings,
|
|
||||||
}
|
|
||||||
self.rope_type = rope_type
|
|
||||||
self.max_seq_len_cached = max_position_embeddings
|
|
||||||
self.original_max_seq_len = max_position_embeddings
|
|
||||||
else:
|
|
||||||
# BC: "rope_type" was originally "type"
|
|
||||||
if config.rope_scaling is not None:
|
|
||||||
self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
|
|
||||||
else:
|
|
||||||
self.rope_type = "default"
|
|
||||||
self.max_seq_len_cached = config.max_position_embeddings
|
|
||||||
self.original_max_seq_len = config.max_position_embeddings
|
|
||||||
|
|
||||||
self.config = config
|
|
||||||
self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
|
|
||||||
|
|
||||||
inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
|
|
||||||
self.register_buffer("inv_freq", inv_freq, persistent=False)
|
self.register_buffer("inv_freq", inv_freq, persistent=False)
|
||||||
self.original_inv_freq = self.inv_freq
|
|
||||||
|
|
||||||
def _dynamic_frequency_update(self, position_ids, device):
|
|
||||||
"""
|
|
||||||
dynamic RoPE layers should recompute `inv_freq` in the following situations:
|
|
||||||
1 - growing beyond the cached sequence length (allow scaling)
|
|
||||||
2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
|
|
||||||
"""
|
|
||||||
seq_len = torch.max(position_ids) + 1
|
|
||||||
if seq_len > self.max_seq_len_cached: # growth
|
|
||||||
inv_freq, self.attention_scaling = self.rope_init_fn(
|
|
||||||
self.config, device, seq_len=seq_len, **self.rope_kwargs
|
|
||||||
)
|
|
||||||
self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
|
|
||||||
self.max_seq_len_cached = seq_len
|
|
||||||
|
|
||||||
if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
|
|
||||||
self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
|
|
||||||
self.max_seq_len_cached = self.original_max_seq_len
|
|
||||||
|
|
||||||
@torch.no_grad()
|
@torch.no_grad()
|
||||||
def forward(self, x, position_ids):
|
def forward(self, x, position_ids):
|
||||||
if "dynamic" in self.rope_type:
|
# x: [bs, num_attention_heads, seq_len, head_size]
|
||||||
self._dynamic_frequency_update(position_ids, device=x.device)
|
|
||||||
|
|
||||||
# Core RoPE block
|
|
||||||
inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
|
inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
|
||||||
position_ids_expanded = position_ids[:, None, :].float()
|
position_ids_expanded = position_ids[:, None, :].float()
|
||||||
# Force float32 (see https://github.com/huggingface/transformers/pull/29285)
|
|
||||||
|
# Force float32 since bfloat16 loses precision on long contexts
|
||||||
|
# See https://github.com/huggingface/transformers/pull/29285
|
||||||
device_type = x.device.type
|
device_type = x.device.type
|
||||||
device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
|
device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
|
||||||
with torch.autocast(device_type=device_type, enabled=False):
|
with torch.autocast(device_type=device_type, enabled=False):
|
||||||
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
|
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
|
||||||
emb = torch.repeat_interleave(freqs, 2, dim=-1) # This line differs from Llama's implementation
|
emb = torch.repeat_interleave(freqs, 2, dim=-1)
|
||||||
cos = emb.cos()
|
cos = emb.cos()
|
||||||
sin = emb.sin()
|
sin = emb.sin()
|
||||||
|
return cos, sin
|
||||||
# Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
|
|
||||||
cos = cos * self.attention_scaling
|
|
||||||
sin = sin * self.attention_scaling
|
|
||||||
|
|
||||||
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
|
|
||||||
|
|
||||||
|
|
||||||
def rotate_half(x):
|
def rotate_half(x):
|
||||||
# Split and rotate. Note that this function is different from e.g. Llama.
|
"""Rotates half the hidden dims of the input."""
|
||||||
x1 = x[..., ::2]
|
x1 = x[..., : x.shape[-1] // 2]
|
||||||
x2 = x[..., 1::2]
|
x2 = x[..., x.shape[-1] // 2 :]
|
||||||
rot_x = torch.stack([-x2, x1], dim=-1).flatten(-2)
|
return torch.cat((-x2, x1), dim=-1)
|
||||||
return rot_x
|
|
||||||
|
|
||||||
|
|
||||||
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
|
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
|
||||||
@ -254,14 +180,11 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
|
|||||||
Returns:
|
Returns:
|
||||||
`tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
|
`tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
|
||||||
"""
|
"""
|
||||||
dtype = q.dtype
|
|
||||||
q = q.float()
|
|
||||||
k = k.float()
|
|
||||||
cos = cos.unsqueeze(unsqueeze_dim)
|
cos = cos.unsqueeze(unsqueeze_dim)
|
||||||
sin = sin.unsqueeze(unsqueeze_dim)
|
sin = sin.unsqueeze(unsqueeze_dim)
|
||||||
q_embed = (q * cos) + (rotate_half(q) * sin)
|
q_embed = (q * cos) + (rotate_half(q) * sin)
|
||||||
k_embed = (k * cos) + (rotate_half(k) * sin)
|
k_embed = (k * cos) + (rotate_half(k) * sin)
|
||||||
return q_embed.to(dtype=dtype), k_embed.to(dtype=dtype)
|
return q_embed, k_embed
|
||||||
|
|
||||||
|
|
||||||
class CohereMLP(nn.Module):
|
class CohereMLP(nn.Module):
|
||||||
@ -275,13 +198,11 @@ class CohereMLP(nn.Module):
|
|||||||
self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
|
self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
|
||||||
self.act_fn = ACT2FN[config.hidden_act]
|
self.act_fn = ACT2FN[config.hidden_act]
|
||||||
|
|
||||||
# Ignore copy
|
|
||||||
def forward(self, x):
|
def forward(self, x):
|
||||||
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
|
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
|
||||||
return down_proj
|
return down_proj
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.llama.modeling_llama.repeat_kv
|
|
||||||
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
|
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
|
||||||
"""
|
"""
|
||||||
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
|
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
|
||||||
@ -336,9 +257,14 @@ class CohereAttention(nn.Module):
|
|||||||
self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
|
self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
|
||||||
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
|
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
|
||||||
self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias)
|
self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias)
|
||||||
|
self._init_rope()
|
||||||
|
|
||||||
# TODO (joao): remove in v4.46 (RoPE is computed in the model, not in the decoder layers)
|
def _init_rope(self):
|
||||||
self.rotary_emb = CohereRotaryEmbedding(config=self.config)
|
self.rotary_emb = CohereRotaryEmbedding(
|
||||||
|
self.head_dim,
|
||||||
|
max_position_embeddings=self.max_position_embeddings,
|
||||||
|
base=self.rope_theta,
|
||||||
|
)
|
||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
@ -349,7 +275,6 @@ class CohereAttention(nn.Module):
|
|||||||
output_attentions: bool = False,
|
output_attentions: bool = False,
|
||||||
use_cache: bool = False,
|
use_cache: bool = False,
|
||||||
cache_position: Optional[torch.LongTensor] = None,
|
cache_position: Optional[torch.LongTensor] = None,
|
||||||
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
|
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
||||||
bsz, q_len, _ = hidden_states.size()
|
bsz, q_len, _ = hidden_states.size()
|
||||||
@ -368,16 +293,7 @@ class CohereAttention(nn.Module):
|
|||||||
key_states = key_states.transpose(1, 2)
|
key_states = key_states.transpose(1, 2)
|
||||||
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
|
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
|
||||||
|
|
||||||
if position_embeddings is None:
|
|
||||||
logger.warning_once(
|
|
||||||
"The attention layers in this model are transitioning from computing the RoPE embeddings internally "
|
|
||||||
"through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
|
|
||||||
"`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
|
|
||||||
"removed and `position_embeddings` will be mandatory."
|
|
||||||
)
|
|
||||||
cos, sin = self.rotary_emb(value_states, position_ids)
|
cos, sin = self.rotary_emb(value_states, position_ids)
|
||||||
else:
|
|
||||||
cos, sin = position_embeddings
|
|
||||||
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
|
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
|
||||||
|
|
||||||
if past_key_value is not None:
|
if past_key_value is not None:
|
||||||
@ -417,136 +333,7 @@ class CohereAttention(nn.Module):
|
|||||||
return attn_output, attn_weights, past_key_value
|
return attn_output, attn_weights, past_key_value
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Cohere
|
|
||||||
class CohereFlashAttention2(CohereAttention):
|
|
||||||
"""
|
|
||||||
Cohere flash attention module. This module inherits from `CohereAttention` as the weights of the module stays
|
|
||||||
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
|
|
||||||
flash attention and deal with padding tokens in case the input contains any of them.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
|
||||||
super().__init__(*args, **kwargs)
|
|
||||||
|
|
||||||
# TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
|
|
||||||
# flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
|
|
||||||
# Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
|
|
||||||
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
|
|
||||||
|
|
||||||
# Ignore copy
|
|
||||||
def forward(
|
|
||||||
self,
|
|
||||||
hidden_states: torch.Tensor,
|
|
||||||
attention_mask: Optional[torch.LongTensor] = None,
|
|
||||||
position_ids: Optional[torch.LongTensor] = None,
|
|
||||||
past_key_value: Optional[Cache] = None,
|
|
||||||
output_attentions: bool = False,
|
|
||||||
use_cache: bool = False,
|
|
||||||
cache_position: Optional[torch.LongTensor] = None,
|
|
||||||
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
|
|
||||||
**kwargs,
|
|
||||||
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
|
||||||
if isinstance(past_key_value, StaticCache):
|
|
||||||
raise ValueError(
|
|
||||||
"`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
|
|
||||||
"make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
|
|
||||||
)
|
|
||||||
output_attentions = False
|
|
||||||
|
|
||||||
bsz, q_len, _ = hidden_states.size()
|
|
||||||
|
|
||||||
query_states = self.q_proj(hidden_states)
|
|
||||||
key_states = self.k_proj(hidden_states)
|
|
||||||
value_states = self.v_proj(hidden_states)
|
|
||||||
|
|
||||||
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim)
|
|
||||||
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim)
|
|
||||||
if self.use_qk_norm:
|
|
||||||
query_states = self.q_norm(query_states)
|
|
||||||
key_states = self.k_norm(key_states)
|
|
||||||
|
|
||||||
query_states = query_states.transpose(1, 2)
|
|
||||||
key_states = key_states.transpose(1, 2)
|
|
||||||
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
|
|
||||||
|
|
||||||
if position_embeddings is None:
|
|
||||||
logger.warning_once(
|
|
||||||
"The attention layers in this model are transitioning from computing the RoPE embeddings internally "
|
|
||||||
"through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
|
|
||||||
"`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
|
|
||||||
"removed and `position_embeddings` will be mandatory."
|
|
||||||
)
|
|
||||||
cos, sin = self.rotary_emb(value_states, position_ids)
|
|
||||||
else:
|
|
||||||
cos, sin = position_embeddings
|
|
||||||
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
|
|
||||||
|
|
||||||
if past_key_value is not None:
|
|
||||||
# sin and cos are specific to RoPE models; position_ids needed for the static cache
|
|
||||||
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
|
|
||||||
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
|
|
||||||
|
|
||||||
# TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
|
|
||||||
# to be able to avoid many of these transpose/reshape/view.
|
|
||||||
query_states = query_states.transpose(1, 2)
|
|
||||||
key_states = key_states.transpose(1, 2)
|
|
||||||
value_states = value_states.transpose(1, 2)
|
|
||||||
|
|
||||||
dropout_rate = self.attention_dropout if self.training else 0.0
|
|
||||||
|
|
||||||
# In PEFT, usually we cast the layer norms in float32 for training stability reasons
|
|
||||||
# therefore the input hidden states gets silently casted in float32. Hence, we need
|
|
||||||
# cast them back in the correct dtype just to be sure everything works as expected.
|
|
||||||
# This might slowdown training & inference so it is recommended to not cast the LayerNorms
|
|
||||||
# in fp32. (CohereLayerNorm handles it correctly)
|
|
||||||
|
|
||||||
input_dtype = query_states.dtype
|
|
||||||
if input_dtype == torch.float32:
|
|
||||||
if torch.is_autocast_enabled():
|
|
||||||
target_dtype = torch.get_autocast_gpu_dtype()
|
|
||||||
# Handle the case where the model is quantized
|
|
||||||
elif hasattr(self.config, "_pre_quantization_dtype"):
|
|
||||||
target_dtype = self.config._pre_quantization_dtype
|
|
||||||
else:
|
|
||||||
target_dtype = self.q_proj.weight.dtype
|
|
||||||
|
|
||||||
logger.warning_once(
|
|
||||||
f"The input hidden states seems to be silently casted in float32, this might be related to"
|
|
||||||
f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
|
|
||||||
f" {target_dtype}."
|
|
||||||
)
|
|
||||||
|
|
||||||
query_states = query_states.to(target_dtype)
|
|
||||||
key_states = key_states.to(target_dtype)
|
|
||||||
value_states = value_states.to(target_dtype)
|
|
||||||
|
|
||||||
attn_output = _flash_attention_forward(
|
|
||||||
query_states,
|
|
||||||
key_states,
|
|
||||||
value_states,
|
|
||||||
attention_mask,
|
|
||||||
q_len,
|
|
||||||
dropout=dropout_rate,
|
|
||||||
use_top_left_mask=self._flash_attn_uses_top_left_mask,
|
|
||||||
is_causal=self.is_causal,
|
|
||||||
)
|
|
||||||
|
|
||||||
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
|
|
||||||
attn_output = self.o_proj(attn_output)
|
|
||||||
|
|
||||||
if not output_attentions:
|
|
||||||
attn_weights = None
|
|
||||||
|
|
||||||
return attn_output, attn_weights, past_key_value
|
|
||||||
|
|
||||||
|
|
||||||
class CohereSdpaAttention(CohereAttention):
|
class CohereSdpaAttention(CohereAttention):
|
||||||
"""
|
|
||||||
Cohere attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
|
|
||||||
`CohereAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
|
|
||||||
SDPA API.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
hidden_states: torch.Tensor,
|
hidden_states: torch.Tensor,
|
||||||
@ -556,7 +343,6 @@ class CohereSdpaAttention(CohereAttention):
|
|||||||
output_attentions: bool = False,
|
output_attentions: bool = False,
|
||||||
use_cache: bool = False,
|
use_cache: bool = False,
|
||||||
cache_position: Optional[torch.LongTensor] = None,
|
cache_position: Optional[torch.LongTensor] = None,
|
||||||
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
|
|
||||||
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
||||||
if output_attentions:
|
if output_attentions:
|
||||||
# TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
|
# TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
|
||||||
@ -590,16 +376,7 @@ class CohereSdpaAttention(CohereAttention):
|
|||||||
key_states = key_states.transpose(1, 2)
|
key_states = key_states.transpose(1, 2)
|
||||||
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
|
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
|
||||||
|
|
||||||
if position_embeddings is None:
|
|
||||||
logger.warning_once(
|
|
||||||
"The attention layers in this model are transitioning from computing the RoPE embeddings internally "
|
|
||||||
"through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
|
|
||||||
"`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
|
|
||||||
"removed and `position_embeddings` will be mandatory."
|
|
||||||
)
|
|
||||||
cos, sin = self.rotary_emb(value_states, position_ids)
|
cos, sin = self.rotary_emb(value_states, position_ids)
|
||||||
else:
|
|
||||||
cos, sin = position_embeddings
|
|
||||||
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
|
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
|
||||||
|
|
||||||
if past_key_value is not None:
|
if past_key_value is not None:
|
||||||
@ -643,6 +420,97 @@ class CohereSdpaAttention(CohereAttention):
|
|||||||
return attn_output, None, past_key_value
|
return attn_output, None, past_key_value
|
||||||
|
|
||||||
|
|
||||||
|
class CohereFlashAttention2(CohereAttention):
|
||||||
|
def forward(
|
||||||
|
self,
|
||||||
|
hidden_states: torch.Tensor,
|
||||||
|
attention_mask: Optional[torch.LongTensor] = None,
|
||||||
|
position_ids: Optional[torch.LongTensor] = None,
|
||||||
|
past_key_value: Optional[Cache] = None,
|
||||||
|
output_attentions: bool = False,
|
||||||
|
use_cache: bool = False,
|
||||||
|
cache_position: Optional[torch.LongTensor] = None,
|
||||||
|
**kwargs,
|
||||||
|
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
||||||
|
if isinstance(past_key_value, StaticCache):
|
||||||
|
raise ValueError(
|
||||||
|
"`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
|
||||||
|
"make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
|
||||||
|
)
|
||||||
|
output_attentions = False
|
||||||
|
|
||||||
|
bsz, q_len, _ = hidden_states.size()
|
||||||
|
|
||||||
|
query_states = self.q_proj(hidden_states)
|
||||||
|
key_states = self.k_proj(hidden_states)
|
||||||
|
value_states = self.v_proj(hidden_states)
|
||||||
|
|
||||||
|
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim)
|
||||||
|
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim)
|
||||||
|
if self.use_qk_norm:
|
||||||
|
query_states = self.q_norm(query_states)
|
||||||
|
key_states = self.k_norm(key_states)
|
||||||
|
|
||||||
|
query_states = query_states.transpose(1, 2)
|
||||||
|
key_states = key_states.transpose(1, 2)
|
||||||
|
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
|
||||||
|
|
||||||
|
cos, sin = self.rotary_emb(value_states, position_ids)
|
||||||
|
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
|
||||||
|
|
||||||
|
if past_key_value is not None:
|
||||||
|
# sin and cos are specific to RoPE models; position_ids needed for the static cache
|
||||||
|
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
|
||||||
|
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
|
||||||
|
|
||||||
|
# TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
|
||||||
|
# to be able to avoid many of these transpose/reshape/view.
|
||||||
|
query_states = query_states.transpose(1, 2)
|
||||||
|
key_states = key_states.transpose(1, 2)
|
||||||
|
value_states = value_states.transpose(1, 2)
|
||||||
|
|
||||||
|
dropout_rate = self.attention_dropout if self.training else 0.0
|
||||||
|
|
||||||
|
# Ignore copy
|
||||||
|
# In PEFT, usually we cast the layer norms in float32 for training stability reasons
|
||||||
|
# therefore the input hidden states gets silently casted in float32. Hence, we need
|
||||||
|
# cast them back in the correct dtype just to be sure everything works as expected.
|
||||||
|
# This might slowdown training & inference so it is recommended to not cast the LayerNorms
|
||||||
|
# in fp32. (CohereLayerNorm handles it correctly)
|
||||||
|
|
||||||
|
input_dtype = query_states.dtype
|
||||||
|
if input_dtype == torch.float32:
|
||||||
|
if torch.is_autocast_enabled():
|
||||||
|
target_dtype = torch.get_autocast_gpu_dtype()
|
||||||
|
# Handle the case where the model is quantized
|
||||||
|
elif hasattr(self.config, "_pre_quantization_dtype"):
|
||||||
|
target_dtype = self.config._pre_quantization_dtype
|
||||||
|
else:
|
||||||
|
target_dtype = self.q_proj.weight.dtype
|
||||||
|
|
||||||
|
logger.warning_once(
|
||||||
|
f"The input hidden states seems to be silently casted in float32, this might be related to"
|
||||||
|
f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
|
||||||
|
f" {target_dtype}."
|
||||||
|
)
|
||||||
|
|
||||||
|
query_states = query_states.to(target_dtype)
|
||||||
|
key_states = key_states.to(target_dtype)
|
||||||
|
value_states = value_states.to(target_dtype)
|
||||||
|
|
||||||
|
attn_output = self._flash_attention_forward(
|
||||||
|
query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate
|
||||||
|
)
|
||||||
|
|
||||||
|
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
|
||||||
|
attn_output = self.o_proj(attn_output)
|
||||||
|
|
||||||
|
if not output_attentions:
|
||||||
|
attn_weights = None
|
||||||
|
|
||||||
|
return attn_output, attn_weights, past_key_value
|
||||||
|
|
||||||
|
|
||||||
COHERE_ATTENTION_CLASSES = {
|
COHERE_ATTENTION_CLASSES = {
|
||||||
"eager": CohereAttention,
|
"eager": CohereAttention,
|
||||||
"flash_attention_2": CohereFlashAttention2,
|
"flash_attention_2": CohereFlashAttention2,
|
||||||
@ -669,7 +537,6 @@ class CohereDecoderLayer(nn.Module):
|
|||||||
output_attentions: Optional[bool] = False,
|
output_attentions: Optional[bool] = False,
|
||||||
use_cache: Optional[bool] = False,
|
use_cache: Optional[bool] = False,
|
||||||
cache_position: Optional[torch.LongTensor] = None,
|
cache_position: Optional[torch.LongTensor] = None,
|
||||||
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
|
|
||||||
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
|
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
|
||||||
"""
|
"""
|
||||||
Args:
|
Args:
|
||||||
@ -684,11 +551,6 @@ class CohereDecoderLayer(nn.Module):
|
|||||||
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
|
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
|
||||||
(see `past_key_values`).
|
(see `past_key_values`).
|
||||||
past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
|
past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
|
||||||
cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
|
|
||||||
Indices depicting the position of the input sequence tokens in the sequence
|
|
||||||
position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
|
|
||||||
Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
|
|
||||||
with `head_dim` being the embedding dimension of each attention head.
|
|
||||||
"""
|
"""
|
||||||
residual = hidden_states
|
residual = hidden_states
|
||||||
|
|
||||||
@ -703,7 +565,6 @@ class CohereDecoderLayer(nn.Module):
|
|||||||
output_attentions=output_attentions,
|
output_attentions=output_attentions,
|
||||||
use_cache=use_cache,
|
use_cache=use_cache,
|
||||||
cache_position=cache_position,
|
cache_position=cache_position,
|
||||||
position_embeddings=position_embeddings,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Fully Connected
|
# Fully Connected
|
||||||
@ -744,7 +605,6 @@ COHERE_START_DOCSTRING = r"""
|
|||||||
"The bare Cohere Model outputting raw hidden-states without any specific head on top.",
|
"The bare Cohere Model outputting raw hidden-states without any specific head on top.",
|
||||||
COHERE_START_DOCSTRING,
|
COHERE_START_DOCSTRING,
|
||||||
)
|
)
|
||||||
# Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel with Llama->Cohere
|
|
||||||
class CoherePreTrainedModel(PreTrainedModel):
|
class CoherePreTrainedModel(PreTrainedModel):
|
||||||
config_class = CohereConfig
|
config_class = CohereConfig
|
||||||
base_model_prefix = "model"
|
base_model_prefix = "model"
|
||||||
@ -837,6 +697,10 @@ COHERE_INPUTS_DOCSTRING = r"""
|
|||||||
more detail.
|
more detail.
|
||||||
return_dict (`bool`, *optional*):
|
return_dict (`bool`, *optional*):
|
||||||
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
||||||
|
cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
|
||||||
|
Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
|
||||||
|
this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
|
||||||
|
the complete sequence length.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
@ -844,7 +708,6 @@ COHERE_INPUTS_DOCSTRING = r"""
|
|||||||
"The bare Cohere Model outputting raw hidden-states without any specific head on top.",
|
"The bare Cohere Model outputting raw hidden-states without any specific head on top.",
|
||||||
COHERE_START_DOCSTRING,
|
COHERE_START_DOCSTRING,
|
||||||
)
|
)
|
||||||
# Copied from transformers.models.llama.modeling_llama.LlamaModel with Llama->Cohere, LLAMA->COHERE
|
|
||||||
class CohereModel(CoherePreTrainedModel):
|
class CohereModel(CoherePreTrainedModel):
|
||||||
"""
|
"""
|
||||||
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`CohereDecoderLayer`]
|
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`CohereDecoderLayer`]
|
||||||
@ -853,7 +716,6 @@ class CohereModel(CoherePreTrainedModel):
|
|||||||
config: CohereConfig
|
config: CohereConfig
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Ignore copy
|
|
||||||
def __init__(self, config: CohereConfig):
|
def __init__(self, config: CohereConfig):
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
self.padding_idx = config.pad_token_id
|
self.padding_idx = config.pad_token_id
|
||||||
@ -863,7 +725,7 @@ class CohereModel(CoherePreTrainedModel):
|
|||||||
self.layers = nn.ModuleList(
|
self.layers = nn.ModuleList(
|
||||||
[CohereDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
|
[CohereDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
|
||||||
)
|
)
|
||||||
self.norm = CohereLayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps)
|
self.norm = CohereRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||||
self.rotary_emb = CohereRotaryEmbedding(config=config)
|
self.rotary_emb = CohereRotaryEmbedding(config=config)
|
||||||
self.gradient_checkpointing = False
|
self.gradient_checkpointing = False
|
||||||
|
|
||||||
@ -1068,18 +930,15 @@ class CohereModel(CoherePreTrainedModel):
|
|||||||
return causal_mask
|
return causal_mask
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with Llama->Cohere
|
|
||||||
class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin):
|
class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin):
|
||||||
_tied_weights_keys = ["lm_head.weight"]
|
_tied_weights_keys = ["lm_head.weight"]
|
||||||
|
|
||||||
# Ignore copy
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
self.model = CohereModel(config)
|
self.model = CohereModel(config)
|
||||||
self.vocab_size = config.vocab_size
|
self.vocab_size = config.vocab_size
|
||||||
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
|
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
|
||||||
self.logit_scale = config.logit_scale
|
|
||||||
self.tie_word_embeddings = config.tie_word_embeddings
|
|
||||||
# Initialize weights and apply final processing
|
# Initialize weights and apply final processing
|
||||||
self.post_init()
|
self.post_init()
|
||||||
|
|
||||||
@ -1101,7 +960,6 @@ class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin):
|
|||||||
def get_decoder(self):
|
def get_decoder(self):
|
||||||
return self.model
|
return self.model
|
||||||
|
|
||||||
# Ignore copy
|
|
||||||
@add_start_docstrings_to_model_forward(COHERE_INPUTS_DOCSTRING)
|
@add_start_docstrings_to_model_forward(COHERE_INPUTS_DOCSTRING)
|
||||||
@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
|
@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
|
||||||
def forward(
|
def forward(
|
||||||
@ -1117,7 +975,6 @@ class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin):
|
|||||||
output_hidden_states: Optional[bool] = None,
|
output_hidden_states: Optional[bool] = None,
|
||||||
return_dict: Optional[bool] = None,
|
return_dict: Optional[bool] = None,
|
||||||
cache_position: Optional[torch.LongTensor] = None,
|
cache_position: Optional[torch.LongTensor] = None,
|
||||||
num_logits_to_keep: int = 0,
|
|
||||||
) -> Union[Tuple, CausalLMOutputWithPast]:
|
) -> Union[Tuple, CausalLMOutputWithPast]:
|
||||||
r"""
|
r"""
|
||||||
Args:
|
Args:
|
||||||
@ -1126,11 +983,6 @@ class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin):
|
|||||||
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
|
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
|
||||||
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
|
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
|
||||||
|
|
||||||
num_logits_to_keep (`int`, *optional*):
|
|
||||||
Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
|
|
||||||
`input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
|
|
||||||
token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
@ -1170,19 +1022,12 @@ class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin):
|
|||||||
)
|
)
|
||||||
|
|
||||||
hidden_states = outputs[0]
|
hidden_states = outputs[0]
|
||||||
if labels is None and not is_torchdynamo_compiling():
|
logits = self.lm_head(hidden_states)
|
||||||
logger.warning_once(
|
logits = logits * self.config.logit_scale
|
||||||
"Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)"
|
logits = logits.float()
|
||||||
)
|
|
||||||
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
|
|
||||||
# TODO: remove the float() operation in v4.46
|
|
||||||
logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float()
|
|
||||||
logits = logits * self.logit_scale
|
|
||||||
|
|
||||||
loss = None
|
loss = None
|
||||||
if labels is not None:
|
if labels is not None:
|
||||||
# Upcast to float if we need to compute the loss to avoid potential precision issues
|
|
||||||
logits = logits.float()
|
|
||||||
# Shift so that tokens < n predict n
|
# Shift so that tokens < n predict n
|
||||||
shift_logits = logits[..., :-1, :].contiguous()
|
shift_logits = logits[..., :-1, :].contiguous()
|
||||||
shift_labels = labels[..., 1:].contiguous()
|
shift_labels = labels[..., 1:].contiguous()
|
||||||
@ -1279,3 +1124,17 @@ class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin):
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
return model_inputs
|
return model_inputs
|
||||||
|
|
||||||
|
@property
|
||||||
|
def logit_scale(self):
|
||||||
|
logger.warning(
|
||||||
|
"`logit_scale` attribute is going to be deprecated in future versions, please use `model.config.logit_scale` instead."
|
||||||
|
)
|
||||||
|
return self.config.logit_scale
|
||||||
|
|
||||||
|
@property
|
||||||
|
def tie_word_embeddings(self):
|
||||||
|
logger.warning(
|
||||||
|
"`tie_word_embeddings` attribute is going to be deprecated in future versions, please use `model.config.tie_word_embeddings` instead."
|
||||||
|
)
|
||||||
|
return self.config.tie_word_embeddings
|
||||||
|
663
src/transformers/models/cohere/modular_cohere.py
Normal file
663
src/transformers/models/cohere/modular_cohere.py
Normal file
@ -0,0 +1,663 @@
|
|||||||
|
# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import math
|
||||||
|
from typing import List, Optional, Tuple, Union
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.utils.checkpoint
|
||||||
|
from torch import nn
|
||||||
|
from torch.nn import CrossEntropyLoss
|
||||||
|
|
||||||
|
from transformers.models.gemma.configuration_gemma import GemmaConfig
|
||||||
|
from transformers.models.llama.modeling_llama import (
|
||||||
|
LlamaFlashAttention2,
|
||||||
|
LlamaForCausalLM,
|
||||||
|
LlamaSdpaAttention,
|
||||||
|
repeat_kv,
|
||||||
|
)
|
||||||
|
|
||||||
|
from ...activations import ACT2FN
|
||||||
|
from ...cache_utils import Cache, StaticCache
|
||||||
|
from ...modeling_outputs import CausalLMOutputWithPast
|
||||||
|
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
|
||||||
|
from ...utils import logging
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class CohereConfig(GemmaConfig):
    r"""
    use_qk_norm (`bool`, *optional*, defaults to `False`):
        Whether to use query-key normalization in the attention
    """

    def __init__(
        self,
        hidden_size=8192,
        intermediate_size=22528,
        num_hidden_layers=40,
        hidden_act="silu",
        use_qk_norm=False,
        layer_norm_eps=1e-05,
        num_attention_heads=64,
        num_key_value_heads=None,
        logit_scale=0.0625,
        rms_norm_eps=1e-05,
        bos_token_id=5,
        eos_token_id=255001,
        **super_kwargs,
    ):
        # Cohere-specific overrides are applied on top of whatever the parent config
        # sets up from the remaining keyword arguments.
        # `super()` already binds `self`; only the keyword arguments are forwarded so that
        # the instance is not passed a second time as a positional argument.
        super().__init__(**super_kwargs)

        self.use_qk_norm = use_qk_norm
        self.layer_norm_eps = layer_norm_eps
        # NOTE(review): `rms_norm_eps` deliberately keeps mirroring `layer_norm_eps` here (the
        # `rms_norm_eps` argument itself is accepted but not used) -- presumably so configs that
        # only define `layer_norm_eps` still drive the norm layers; confirm before changing.
        self.rms_norm_eps = layer_norm_eps

        self.hidden_size = hidden_size
        # Attribute name fixed (was misspelled), so the requested MLP width is actually applied.
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers

        self.hidden_act = hidden_act

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        self.logit_scale = logit_scale

        # Store the head counts explicitly: the key/value head count falls back to the number of
        # attention heads when unset, and an explicitly supplied value is honoured instead of
        # being silently dropped.
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_attention_heads if num_key_value_heads is None else num_key_value_heads
|
||||||
|
|
||||||
|
|
||||||
|
class CohereRMSNorm(nn.Module):
    """Normalization layer with a learnable scale, computed in float32 over the last dimension."""

    def __init__(self, hidden_size=None, eps=1e-5, bias=False):
        """
        Create the scale parameter (initialised to ones) with shape `hidden_size`.

        `hidden_size` may be an int or a tuple; the tuple form is used for QK-norm so the
        normalization runs across `head_dim`. `eps` is added to the variance before the
        reciprocal square root. `bias` is accepted but not used by this layer.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        """Center and rescale `hidden_states` along the last axis, returning the input dtype."""
        original_dtype = hidden_states.dtype
        # All statistics are computed in float32 regardless of the incoming dtype.
        states_fp32 = hidden_states.to(torch.float32)
        centered = states_fp32 - states_fp32.mean(-1, keepdim=True)
        inv_std = torch.rsqrt(centered.pow(2).mean(-1, keepdim=True) + self.variance_epsilon)
        scaled = self.weight.to(torch.float32) * (centered * inv_std)
        return scaled.to(original_dtype)
|
||||||
|
|
||||||
|
|
||||||
|
# Register the Cohere norm class in the shared list of layer-norm layer types
# (presumably consulted elsewhere when norm parameters need special handling -- TODO confirm).
ALL_LAYERNORM_LAYERS.append(CohereRMSNorm)
|
||||||
|
|
||||||
|
|
||||||
|
class CohereLayerNorm(CohereRMSNorm):
    """Deprecated name for `CohereRMSNorm`.

    Construction forwards every argument to `CohereRMSNorm` after emitting a deprecation
    message through `logger.warning_once`.
    """

    def __init__(self, *args, **kwargs):
        deprecation_notice = "CohereLayerNorm is deprecated by CohereRMSNorm and will be removed in v4.45."
        logger.warning_once(deprecation_notice)
        super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class CohereRotaryEmbedding(nn.Module):
    """Computes the cosine / sine tables used for rotary position embeddings.

    The inverse frequencies are kept in a non-persistent buffer (`inv_freq`). `scaling_factor` is
    stored on the module but is not used by `forward`.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        super().__init__()
        self.scaling_factor = scaling_factor
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        # One frequency per pair of channels: base ** -(2i / dim) for i in [0, dim / 2).
        exponents = torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim
        self.register_buffer("inv_freq", 1.0 / (self.base**exponents), persistent=False)

    @torch.no_grad()
    def forward(self, x, position_ids):
        """Return `(cos, sin)` for `position_ids`; `x` is only used to pick the autocast device type."""
        # x: [bs, num_attention_heads, seq_len, head_size]
        batch_size = position_ids.shape[0]
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(batch_size, -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()

        # Force float32 since bfloat16 loses precision on long contexts
        # See https://github.com/huggingface/transformers/pull/29285
        device_type = x.device.type
        if not isinstance(device_type, str) or device_type == "mps":
            device_type = "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            # Each frequency is duplicated for the two channels of its pair (interleaved layout).
            emb = torch.repeat_interleave(freqs, 2, dim=-1)
            cos, sin = emb.cos(), emb.sin()
        return cos, sin
|
||||||
|
|
||||||
|
|
||||||
|
def rotate_half(x):
    """Rotate interleaved channel pairs of the last dimension: `(a, b)` becomes `(-b, a)`."""
    even_channels = x[..., 0::2]
    odd_channels = x[..., 1::2]
    # Stack as (-odd, even) pairs, then flatten the pair axis back into the last dimension.
    paired = torch.stack((-odd_channels, even_channels), dim=-1)
    return paired.flatten(-2)
|
||||||
|
|
||||||
|
|
||||||
|
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Rotate the query and key tensors with Rotary Position Embedding.

    The rotation is computed in float32 and the results are cast back to the dtype of `q`.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Dimension along which `cos` and `sin` are unsqueezed so that they broadcast against `q` and `k`.
            For example, with `cos` / `sin` of shape [batch_size, seq_len, head_dim] and `q` / `k` of shape
            [batch_size, heads, seq_len, head_dim], use `unsqueeze_dim=1`; if `q` / `k` have the shape
            [batch_size, seq_len, heads, head_dim], use `unsqueeze_dim=2`.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    output_dtype = q.dtype
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_fp32, k_fp32 = q.float(), k.float()
    q_embed = (q_fp32 * cos) + (rotate_half(q_fp32) * sin)
    k_embed = (k_fp32 * cos) + (rotate_half(k_fp32) * sin)
    return q_embed.to(dtype=output_dtype), k_embed.to(dtype=output_dtype)
|
||||||
|
|
||||||
|
|
||||||
|
class CohereMLP(nn.Module):
    """Gated feed-forward block: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`, all projections bias-free."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        hidden, inner = self.hidden_size, self.intermediate_size
        self.gate_proj = nn.Linear(hidden, inner, bias=False)
        self.up_proj = nn.Linear(hidden, inner, bias=False)
        self.down_proj = nn.Linear(inner, hidden, bias=False)
        # Activation is looked up by name from the config.
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        """Apply the gated MLP to `x`."""
        gate = self.act_fn(self.gate_proj(x))
        return self.down_proj(gate * self.up_proj(x))
|
||||||
|
|
||||||
|
|
||||||
|
class CohereAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper

    This is the explicit matmul + softmax implementation. It supports fewer key/value heads than
    query heads (keys/values are expanded with `repeat_kv`), optional normalization of the per-head
    queries and keys (`config.use_qk_norm`), and rotary position embeddings.
    """

    def __init__(self, config: CohereConfig, layer_idx: Optional[int] = None):
        """
        Args:
            config (`CohereConfig`): model configuration; the attributes read here are `attention_dropout`,
                `hidden_size`, `num_attention_heads`, `num_key_value_heads`, `max_position_embeddings`,
                `rope_theta`, `use_qk_norm`, `layer_norm_eps` and `attention_bias`.
            layer_idx (`int`, *optional*): index of this layer, passed to the cache in `forward`.

        Raises:
            ValueError: if `hidden_size` is not divisible by `num_attention_heads`.
        """
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        # Number of query heads sharing each key/value head.
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True
        self.use_qk_norm = config.use_qk_norm

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )

        if self.use_qk_norm:
            # When sharding the model using Tensor Parallelism, need to be careful to use n_local_heads
            self.q_norm = CohereLayerNorm(hidden_size=(self.num_heads, self.head_dim), eps=config.layer_norm_eps)
            self.k_norm = CohereLayerNorm(
                hidden_size=(self.num_key_value_heads, self.head_dim), eps=config.layer_norm_eps
            )

        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias)
        self._init_rope()

    def _init_rope(self):
        """Create the rotary embedding module sized for a single attention head."""
        self.rotary_emb = CohereRotaryEmbedding(
            self.head_dim,
            max_position_embeddings=self.max_position_embeddings,
            base=self.rope_theta,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Run eager attention over `hidden_states`.

        Args:
            hidden_states: tensor unpacked as `(batch, seq_len, hidden)`.
            attention_mask: optional additive mask; it is indexed with four dimensions, sliced on the last one
                to the key length, and added to the raw attention scores.
            position_ids: positions handed to the rotary embedding.
            past_key_value: optional cache; when given, its `update` method is called with the new key/value
                states and the returned states are used for attention.
            output_attentions: when falsy, the attention weights are replaced by `None` in the return value.
            use_cache: accepted for interface compatibility; not read in this method.
            cache_position: forwarded to the cache inside `cache_kwargs`.

        Returns:
            `(attn_output, attn_weights, past_key_value)`.

        Raises:
            ValueError: if the attention output does not have shape `(batch, num_heads, seq_len, head_dim)`.
        """
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # Split the projections into heads; normalization (if enabled) is applied before the transpose,
        # i.e. on the (batch, seq_len, heads, head_dim) layout.
        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim)
        if self.use_qk_norm:
            query_states = self.q_norm(query_states)
            key_states = self.k_norm(key_states)

        # Move to (batch, heads, seq_len, head_dim).
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = self.rotary_emb(value_states, position_ids)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # Expand key/value heads so they line up with the query heads.
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
            attn_weights = attn_weights + causal_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        # Back to (batch, seq_len, heads, head_dim), then merge the heads.
        attn_output = attn_output.transpose(1, 2).contiguous()

        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value
|
||||||
|
|
||||||
|
|
||||||
|
class CohereSdpaAttention(CohereAttention):
    """Attention variant that computes attention with `torch.nn.functional.scaled_dot_product_attention`.

    Projections, optional query/key normalization, rotary embedding and cache handling mirror
    `CohereAttention.forward`; only the attention computation itself differs. Attention weights are never
    returned by the SDPA path.
    """

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Run SDPA attention over `hidden_states`.

        When `output_attentions` is truthy, a warning is logged and the call is delegated to
        `CohereAttention.forward`. Otherwise returns `(attn_output, None, past_key_value)`.
        """
        if output_attentions:
            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
            logger.warning_once(
                "CohereModel is using CohereSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
            )

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # Split into heads; optional normalization happens on the (batch, seq_len, heads, head_dim) layout.
        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim)
        if self.use_qk_norm:
            query_states = self.q_norm(query_states)
            key_states = self.k_norm(key_states)

        # Move to (batch, heads, seq_len, head_dim).
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = self.rotary_emb(value_states, position_ids)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # Expand key/value heads so they line up with the query heads.
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        causal_mask = attention_mask
        # if attention_mask is not None and cache_position is not None:
        if attention_mask is not None:
            # Trim the mask's last dimension to the current key length.
            causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]

        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
        # Reference: https://github.com/pytorch/pytorch/issues/112577.
        if query_states.device.type == "cuda" and causal_mask is not None:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
        is_causal = True if causal_mask is None and q_len > 1 else False

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=causal_mask,
            dropout_p=self.attention_dropout if self.training else 0.0,
            is_causal=is_causal,
        )

        # Back to (batch, seq_len, heads, head_dim), then merge the heads.
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(bsz, q_len, self.hidden_size)

        attn_output = self.o_proj(attn_output)

        return attn_output, None, past_key_value
|
||||||
|
|
||||||
|
|
||||||
|
class CohereFlashAttention2(CohereAttention):
    """Attention variant that delegates the attention computation to `self._flash_attention_forward`.

    Projections, optional query/key normalization, rotary embedding and cache handling mirror
    `CohereAttention.forward`. Attention weights are never returned.

    NOTE(review): `_flash_attention_forward` is called as a method, but `CohereAttention` as visible in this
    file does not define it - confirm where it is provided, otherwise `forward` raises `AttributeError`.
    """

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Run flash attention over `hidden_states`.

        Returns `(attn_output, None, past_key_value)`; `output_attentions` is overridden to `False`.

        Raises:
            ValueError: if `past_key_value` is a `StaticCache`.
        """
        if isinstance(past_key_value, StaticCache):
            raise ValueError(
                "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
                "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
            )
        # The caller's value is ignored on this path, so `attn_weights` below is always set to None.
        output_attentions = False

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # Split into heads; optional normalization happens on the (batch, seq_len, heads, head_dim) layout.
        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim)
        if self.use_qk_norm:
            query_states = self.q_norm(query_states)
            key_states = self.k_norm(key_states)

        # Move to (batch, heads, seq_len, head_dim) for the rotary embedding and the cache update.
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = self.rotary_emb(value_states, position_ids)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
        # to be able to avoid many of these transpose/reshape/view.
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        dropout_rate = self.attention_dropout if self.training else 0.0

        # Ignore copy
        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
        # therefore the input hidden states gets silently casted in float32. Hence, we need
        # cast them back in the correct dtype just to be sure everything works as expected.
        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
        # in fp32. (CohereLayerNorm handles it correctly)

        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            # Pick the dtype to cast back to: autocast dtype, then the pre-quantization dtype, then the
            # dtype of the query projection weights.
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            # Handle the case where the model is quantized
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to"
                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        # NOTE(review): see the class docstring - `_flash_attention_forward` is not defined by the visible parent.
        attn_output = self._flash_attention_forward(
            query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate
        )

        # Merge the heads back into the hidden dimension.
        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value
|
||||||
|
|
||||||
|
|
||||||
|
# Maps an attention-implementation name to the class that implements it.
# `CohereDecoderLayer` indexes this mapping with `config._attn_implementation`.
COHERE_ATTENTION_CLASSES = {
    "eager": CohereAttention,
    "flash_attention_2": CohereFlashAttention2,
    "sdpa": CohereSdpaAttention,
}
|
||||||
|
|
||||||
|
|
||||||
|
class CohereDecoderLayer(nn.Module):
    """Decoder layer in which attention and the MLP both read the same normalized input.

    The layer output is `residual + attention(norm(x)) + mlp(norm(x))`; a single layer norm
    (`input_layernorm`) is used.
    """

    def __init__(self, config: CohereConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        # The attention class is selected by the configured implementation name.
        attention_cls = COHERE_ATTENTION_CLASSES[config._attn_implementation]
        self.self_attn = attention_cls(config=config, layer_idx=layer_idx)

        self.mlp = CohereMLP(config)
        self.input_layernorm = CohereLayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            position_ids (`torch.LongTensor`, *optional*): forwarded unchanged to the attention module.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                If truthy, the attention weights returned by the attention module are appended to the output tuple.
            use_cache (`bool`, *optional*):
                If truthy, the key/value state returned by the attention module is appended to the output tuple.
            cache_position (`torch.LongTensor`, *optional*): forwarded unchanged to the attention module.

        Returns:
            A tuple starting with the new hidden states, followed by the optional entries described above.
        """
        residual = hidden_states
        normed_states = self.input_layernorm(hidden_states)

        # Attention branch.
        attention_branch, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=normed_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
        )

        # Feed-forward branch, fed with the same normalized input as the attention branch.
        mlp_branch = self.mlp(normed_states)

        # Single residual connection over both branches.
        outputs = (residual + attention_branch + mlp_branch,)
        if output_attentions:
            outputs = outputs + (self_attn_weights,)
        if use_cache:
            outputs = outputs + (present_key_value,)
        return outputs
|
||||||
|
|
||||||
|
|
||||||
|
class CohereForCausalLM(LlamaForCausalLM):
    """Causal language-model head whose logits are multiplied by `config.logit_scale`."""

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, CohereForCausalLM

        >>> model = CohereForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01")
        >>> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        # Arguments left as None fall back to the values stored on the config.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        # Project to the vocabulary, apply the configured scale, then upcast to float32.
        scaled_logits = self.lm_head(outputs[0]) * self.config.logit_scale
        logits = scaled_logits.float()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n, then flatten the tokens
            flat_logits = logits[..., :-1, :].contiguous().view(-1, self.config.vocab_size)
            flat_labels = labels[..., 1:].contiguous().view(-1)
            # Enable model parallelism
            flat_labels = flat_labels.to(flat_logits.device)
            criterion = CrossEntropyLoss()
            loss = criterion(flat_logits, flat_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            if loss is None:
                return output
            return (loss,) + output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    @property
    def logit_scale(self):
        """Logs a deprecation warning and returns `config.logit_scale`."""
        logger.warning(
            "`logit_scale` attribute is going to be deprecated in future versions, please use `model.config.logit_scale` instead."
        )
        return self.config.logit_scale

    @property
    def tie_word_embeddings(self):
        """Logs a deprecation warning and returns `config.tie_word_embeddings`."""
        logger.warning(
            "`tie_word_embeddings` attribute is going to be deprecated in future versions, please use `model.config.tie_word_embeddings` instead."
        )
        return self.config.tie_word_embeddings
|
Reference in New Issue
Block a user