Compare commits

...

1 Commit

3 changed files with 137 additions and 0 deletions

gelu.cpp

@@ -0,0 +1,30 @@
#include <torch/extension.h>

#include <cuda.h>
#include <cuda_fp16.h>

// Forward declaration of the CUDA launcher implemented in gelu.cu.
template <typename T>
void launch_bias_gelu(T* input,
                      const T* bias,
                      int intermediate_size,
                      int batch_size);
template <typename T>
at::Tensor ds_bias_gelu(at::Tensor& input, at::Tensor& bias)
{
    auto input_cont = input.contiguous();

    // Flatten [batch, seq_len, intermediate] into rows of intermediate_size;
    // the kernel adds the bias and applies GELU in place on input_cont.
    int bsz = input_cont.size(0) * input_cont.size(1);
    int intermediate_size = input_cont.size(2);

    launch_bias_gelu((T*)input_cont.data_ptr(),
                     (T*)bias.data_ptr(),
                     intermediate_size,
                     bsz);
    return input_cont;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
    m.def("bias_gelu_fp32", &ds_bias_gelu<float>, "DeepSpeed Gelu with fp32 (CUDA)");
    m.def("bias_gelu_fp16", &ds_bias_gelu<__half>, "DeepSpeed Gelu with fp16 (CUDA)");
}
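
The extension above mutates the contiguous input in place and returns it. A minimal usage sketch, assuming the files are JIT-compiled with torch.utils.cpp_extension.load as the model change below does (paths, shapes, and the out name are illustrative, not part of the commit):

import torch
from torch.utils.cpp_extension import load

# Hypothetical paths; the commit keeps gelu.cpp and gelu.cu next to the model file.
cuda_gelu = load(name="bias_gelu_fp16", sources=["gelu.cpp", "gelu.cu"])

x = torch.randn(8, 128, 4096, device="cuda", dtype=torch.float16)  # [batch, seq, intermediate]
bias = torch.randn(4096, device="cuda", dtype=torch.float16)

out = cuda_gelu.bias_gelu_fp16(x, bias)  # computed in place on the contiguous input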

gelu.cu

@@ -0,0 +1,100 @@
#include <cuda.h>
#include <cuda_fp16.h>

#define MAX_CAP 4
#define MAX_SEQ 2048

// The __half kernel below is guarded by this macro; define it for
// architectures with half-precision arithmetic (sm_53 and newer).
// Without it, the fp16 kernel body compiles to a no-op.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#define HALF_PRECISION_AVAILABLE
#endif
// Tanh approximation of GELU:
// gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
inline __device__ float gelu(const float x)
{
    const float sqrt_param = 0.79788456080286535587989211986876f;
    const float mul_param = 0.044715f;
    return x * 0.5f * (1.0f + tanhf(sqrt_param * (x + mul_param * x * x * x)));
}
// fp32 kernel: each thread loads one float4 (four activations), adds the
// broadcast bias, and applies GELU elementwise. total_count and
// intermediate_size are given in float4 units here.
__global__ void fused_bias_gelu(float* input,
                                const float* bias,
                                int total_count,
                                int intermediate_size)
{
    float4* input_cast = reinterpret_cast<float4*>(input);
    const float4* bias_cast = reinterpret_cast<const float4*>(bias);
    int offset = blockIdx.x * blockDim.x + threadIdx.x;

    if (offset < total_count) {
        float4 data = input_cast[offset];
        // offset % intermediate_size selects this thread's chunk of the bias row.
        float4 bias_data = bias_cast[offset % intermediate_size];
        data.x += bias_data.x;
        data.y += bias_data.y;
        data.z += bias_data.z;
        data.w += bias_data.w;

        data.x = gelu(data.x);
        data.y = gelu(data.y);
        data.z = gelu(data.z);
        data.w = gelu(data.w);

        input_cast[offset] = data;
    }
}
// fp16 kernel: each thread loads one float2 (four __half values), widens
// them to fp32 for the bias-add and GELU, then rounds back to half.
__global__ void fused_bias_gelu(__half* input,
                                const __half* bias,
                                int total_count,
                                int intermediate_size)
{
#ifdef HALF_PRECISION_AVAILABLE
    float2* input_cast = reinterpret_cast<float2*>(input);
    const float2* bias_cast = reinterpret_cast<const float2*>(bias);
    int offset = blockIdx.x * blockDim.x + threadIdx.x;

    if (offset < total_count) {
        float2 vals_vec = input_cast[offset];
        float2 bias_vec = bias_cast[offset % intermediate_size];

        // Reinterpret each 8-byte load as two __half2 values, then convert
        // to float2 pairs so the arithmetic runs in fp32.
        __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec);
        __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec);
        float2 low_data = __half22float2(vals_half[0]);
        float2 high_data = __half22float2(vals_half[1]);
        float2 low_bias = __half22float2(bias_half[0]);
        float2 high_bias = __half22float2(bias_half[1]);
        low_data.x += low_bias.x;
        low_data.y += low_bias.y;
        high_data.x += high_bias.x;
        high_data.y += high_bias.y;

        low_data.x = gelu(low_data.x);
        low_data.y = gelu(low_data.y);
        high_data.x = gelu(high_data.x);
        high_data.y = gelu(high_data.y);

        // Round back to half precision and store.
        vals_half[0] = __float22half2_rn(low_data);
        vals_half[1] = __float22half2_rn(high_data);
        input_cast[offset] = vals_vec;
    }
#endif
}
template <typename T>
void launch_bias_gelu(T* input,
                      const T* bias,
                      int intermediate_size,
                      int batch_size)
{
    // Each thread handles 4 elements (one float4 / two __half2), so both the
    // element count and the bias stride are given in 4-element chunks. This
    // assumes intermediate_size is a multiple of 4.
    int total_count = batch_size * (intermediate_size / 4);
    int threads = 1024;

    dim3 block_dims(threads);
    dim3 grid_dims((total_count + threads - 1) / threads);

    fused_bias_gelu<<<grid_dims, block_dims>>>(
        input, bias, total_count, intermediate_size / 4);
}
template void launch_bias_gelu<float>(float*, const float*, int, int);
template void launch_bias_gelu<__half>(__half*, const __half*, int, int);
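
Since both kernels implement the tanh approximation of GELU, the fused output can be sanity-checked against PyTorch's built-in approximate GELU. A sketch under the same assumptions as the usage example above (illustrative paths, not part of the commit):

import torch
from torch.utils.cpp_extension import load

cuda_gelu = load(name="bias_gelu_fp16", sources=["gelu.cpp", "gelu.cu"])

x = torch.randn(2, 16, 1024, device="cuda")
bias = torch.randn(1024, device="cuda")

# approximate="tanh" uses the same sqrt(2/pi) * (x + 0.044715 * x^3) formula as gelu() above.
expected = torch.nn.functional.gelu(x + bias, approximate="tanh")
actual = cuda_gelu.bias_gelu_fp32(x.clone(), bias)  # clone: the kernel writes in place

assert torch.allclose(actual, expected, atol=1e-5)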

modeling_bloom.py

@@ -15,12 +15,14 @@
"""PyTorch BLOOM model."""
import math
import os
from typing import Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
from torch.utils.cpp_extension import load
from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
from ...modeling_outputs import (
@@ -51,6 +53,11 @@ BLOOM_PRETRAINED_MODEL_ARCHIVE_LIST = [
]
dirname = os.path.dirname(__file__)
cuda_gelu = load(name="bias_gelu_fp16", sources=[os.path.join(dirname, "gelu.cpp"), os.path.join(dirname, "gelu.cu")])
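# NOTE: sketch, not part of this diff. The call site is not shown in these
# hunks; a plausible use (assuming the usual BloomMLP layout with a
# dense_h_to_4h projection) would replace the bias-add + GELU in the MLP,
# with hidden_states computed as F.linear(x, self.dense_h_to_4h.weight)
# so the bias is deferred to the fused kernel:
#
#     if hidden_states.dtype == torch.float16:
#         hidden_states = cuda_gelu.bias_gelu_fp16(hidden_states, self.dense_h_to_4h.bias)
#     else:
#         hidden_states = cuda_gelu.bias_gelu_fp32(hidden_states, self.dense_h_to_4h.bias)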
def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0):
    """
    Make causal mask used for bi-directional self-attention.