diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 4b591f07ca..e288770f2f 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -34,7 +34,7 @@ from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.model_executor.models.vision import get_vit_attn_backend
 from vllm.platforms import current_platform
-from vllm.utils import GiB_bytes, direct_register_custom_op
+from vllm.utils import direct_register_custom_op
 
 FP8_DTYPE = current_platform.fp8_dtype()
 logger = init_logger(__name__)
@@ -281,25 +281,10 @@ class Attention(nn.Module, AttentionLayerBase):
             )
         ]
 
-        try:
-            self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32)
-            self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32)
-            self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32)
-        except torch.cuda.OutOfMemoryError as e:
-            logger.error("Failed to initialize attention q/k/v range constants: %s", e)
-            if torch.cuda.is_available():
-                logger.debug("CUDA device: %s", torch.cuda.current_device())
-                logger.debug(
-                    "Allocated: %.2f GiB", torch.cuda.memory_allocated() / GiB_bytes
-                )
-                logger.debug(
-                    "Reserved: %.2f GiB", torch.cuda.memory_reserved() / GiB_bytes
-                )
-            raise RuntimeError(
-                "Failed to initialize q/k/v range constants. "
-                "This may be caused by insufficient memory to allocate "
-                "kv cache."
-            ) from e
+        # Initialize q/k/v range constants.
+        self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32)
+        self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32)
+        self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32)
 
         # for attn backends supporting query quantization
         self.query_quant = None
@@ -668,13 +653,9 @@ class MLAAttention(nn.Module, AttentionLayerBase):
         self.use_sparse = use_sparse
 
         # Initialize q/k/v range constants.
-        try:
-            self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32)
-            self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32)
-            self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32)
-        except torch.cuda.OutOfMemoryError:
-            # Keep defaults if allocation fails; not critical for init.
-            pass
+        self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32)
+        self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32)
+        self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32)
 
     def forward(
         self,
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 34bfcabc69..dfcc601a1c 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -34,7 +34,6 @@ from vllm.model_executor.parameter import (
 )
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
-from vllm.utils import GiB_bytes
 
 logger = init_logger(__name__)
 
@@ -211,33 +210,17 @@ class UnquantizedLinearMethod(LinearMethodBase):
         # The weights are not quantized, and they are not sharded.
         # The amount of memory allocated for the weights is
         # sum(output_partition_sizes) * input_size_per_partition.
-        try:
-            weight_loader = extra_weight_attrs.pop("weight_loader")
-            weight = ModelWeightParameter(
-                data=torch.empty(
-                    sum(output_partition_sizes),
-                    input_size_per_partition,
-                    dtype=params_dtype,
-                ),
-                input_dim=1,
-                output_dim=0,
-                weight_loader=weight_loader,
-            )
-        except torch.cuda.OutOfMemoryError as e:
-            logger.error("Failed to create unquantized linear weights: %s", e)
-            if torch.cuda.is_available():
-                logger.debug("CUDA device: %s", torch.cuda.current_device())
-                logger.debug(
-                    "Allocated: %.2f GiB", torch.cuda.memory_allocated() / GiB_bytes
-                )
-                logger.debug(
-                    "Reserved: %.2f GiB", torch.cuda.memory_reserved() / GiB_bytes
-                )
-            raise RuntimeError(
-                "Failed to create unquantized linear weights. "
-                "This may be caused by insufficient memory to allocate "
-                "the weight."
-            ) from e
+        weight_loader = extra_weight_attrs.pop("weight_loader")
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                sum(output_partition_sizes),
+                input_size_per_partition,
+                dtype=params_dtype,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
         layer.register_parameter("weight", weight)
         set_weight_attrs(weight, extra_weight_attrs)
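
For reference, the allocation size mentioned in the `create_weights` comment above is easy to work out by hand. The sketch below uses hypothetical partition sizes and dtype (none of these values come from this diff) just to show the arithmetic.

```python
# Hypothetical numbers for illustration only; not taken from the diff.
output_partition_sizes = [4096, 1024, 1024]  # e.g. a fused QKV projection shard
input_size_per_partition = 4096
bytes_per_element = 2                        # float16 / bfloat16

# The weight buffer holds sum(output_partition_sizes) * input_size_per_partition elements.
num_elements = sum(output_partition_sizes) * input_size_per_partition
size_gib = num_elements * bytes_per_element / 2**30
print(f"{num_elements} elements, {size_gib:.3f} GiB")  # 25165824 elements, 0.047 GiB
```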