[inference] test suite for ds-kernels (bert, roberta, gpt2, gpt-neo, gpt-j) (#1992)
Co-authored-by: Reza Yazdani <reyazda@microsoft.com>
Co-authored-by: Michael Wyatt <michaelwyatt@microsoft.com>
Co-authored-by: Reza Yazdani <44502768+RezaYazdaniAminabadi@users.noreply.github.com>
.github/workflows/amd.yml (vendored, 2 changed lines)

@@ -63,5 +63,5 @@ jobs:
        run: |
          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
          cd tests
-         TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -x -n 4 -m 'not sequential' unit/
+         TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -x -n 4 unit/
          TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -x -m 'sequential' unit/
.github/workflows/nv-inference.yml (vendored, new file, 63 lines)

@@ -0,0 +1,63 @@
+name: nv-inference
+
+on:
+  push:
+    branches:
+      - 'master'
+      - 'staging**'
+    paths-ignore:
+      - 'docs/**'
+  pull_request:
+    paths-ignore:
+      - 'docs/**'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  unit-tests:
+    runs-on: [self-hosted, nvidia, cu111, v100]
+
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: environment
+        run: |
+          nvidia-smi
+          which python
+          python --version
+          which nvcc
+          nvcc --version
+          pip install --upgrade pip
+          pip uninstall --yes torch torchvision
+          pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
+          python -c "import torch; print('torch:', torch.__version__, torch)"
+          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+
+      - name: Install transformers
+        run: |
+          git clone https://github.com/huggingface/transformers
+          cd transformers
+          # if needed switch to the last known good SHA until transformers@master is fixed
+          # git checkout 1cc453d33
+          git rev-parse --short HEAD
+          pip uninstall --yes transformers
+          pip install .
+
+      - name: Python environment
+        run: |
+          pip list
+
+      - name: Install deepspeed
+        run: |
+          pip uninstall --yes deepspeed
+          pip install .[dev,1bit,autotuning,sparse_attn,inf]
+          ds_report
+
+      - name: Unit tests
+        run: |
+          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
+          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
+          cd tests
+          TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 -m 'inference' unit/
.github/workflows/nv-nightly.yml (vendored, new file, 52 lines)

@@ -0,0 +1,52 @@
+name: nv-nightly
+
+on:
+  schedule:
+    - cron: "0 0 * * *"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  unit-tests:
+    runs-on: [self-hosted, nvidia, cu111, v100]
+
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: environment
+        run: |
+          nvidia-smi
+          which python
+          python --version
+          which nvcc
+          nvcc --version
+          pip install --upgrade pip
+          pip uninstall --yes torch torchvision
+          pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
+          python -c "import torch; print('torch:', torch.__version__, torch)"
+          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+
+      - name: Install transformers
+        run: |
+          git clone https://github.com/huggingface/transformers
+          cd transformers
+          # if needed switch to the last known good SHA until transformers@master is fixed
+          # git checkout 1cc453d33
+          git rev-parse --short HEAD
+          pip uninstall --yes transformers
+          pip install .
+
+      - name: Install deepspeed
+        run: |
+          pip uninstall --yes deepspeed
+          pip install .[dev,1bit,autotuning,sparse_attn]
+          ds_report
+
+      - name: Unit tests
+        run: |
+          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
+          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
+          cd tests
+          TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'nightly' unit/
.github/workflows/nv-torch-latest-v100.yml (vendored, 2 changed lines)

@@ -60,5 +60,5 @@ jobs:
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
          cd tests
-         TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 -m 'not sequential' unit/
+         TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/
          TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/
.github/workflows/nv-torch-nightly-v100.yml (vendored, 2 changed lines)

@@ -53,5 +53,5 @@ jobs:
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
          cd tests
-         TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 -m 'not sequential' unit/
+         TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/
          TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/
.github/workflows/nv-torch18-v100.yml (vendored, 2 changed lines)

@@ -60,5 +60,5 @@ jobs:
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
          cd tests
-         TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 -m 'not sequential' unit/
+         TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/
          TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/
.github/workflows/nv-transformers-v100.yml (vendored, 2 changed lines)

@@ -57,6 +57,8 @@ jobs:
          pip install .[testing]
          # find reqs used in ds integration tests
          find examples/pytorch -regextype posix-egrep -regex '.*(language-modeling|question-answering|summarization|image-classification|text-classification|translation).*/requirements.txt' -exec grep -v 'torch' {} \; | xargs -I {} pip install --upgrade {}
+         # force datasets version due to issues
+         pip install datasets==2.2.2
          # force protobuf version due to issues
          pip install "protobuf<4.21.0"
          pip list
@@ -174,7 +174,8 @@ __global__ void fused_bias_residual(float* input,
                                     float* attnbias,
                                     int total_count,
                                     int intermediate_size,
-                                    int mp_size)
+                                    int mp_size,
+                                    bool preln)
 {
     float4* input_cast = reinterpret_cast<float4*>(input);
     float4* output_cast = reinterpret_cast<float4*>(output);
@@ -189,12 +190,17 @@ __global__ void fused_bias_residual(float* input,
         float4 res_vec = attn_cast[offset];
         float4 bias_data = bias_cast[offset % intermediate_size];
         float4 attn_bias = attnbias_cast[offset % intermediate_size];
+        if (preln) {
             data.x = (data.x + res_vec.x) * mp_size + (out.x + bias_data.x + attn_bias.x);
             data.y = (data.y + res_vec.y) * mp_size + (out.y + bias_data.y + attn_bias.y);
             data.z = (data.z + res_vec.z) * mp_size + (out.z + bias_data.z + attn_bias.z);
             data.w = (data.w + res_vec.w) * mp_size + (out.w + bias_data.w + attn_bias.w);
+        } else {
+            data.x = data.x + out.x + bias_data.x;
+            data.y = data.y + out.y + bias_data.y;
+            data.z = data.z + out.z + bias_data.z;
+            data.w = data.w + out.w + bias_data.w;
+        }
         output_cast[offset] = data;
     }
 }
@@ -206,7 +212,8 @@ __global__ void fused_bias_residual(__half* input,
                                     __half* attn_bias,
                                     int total_count,
                                     int intermediate_size,
-                                    int mp_size)
+                                    int mp_size,
+                                    bool preln)
 {
 #ifdef HALF_PRECISION_AVAILABLE
 
@@ -248,15 +255,21 @@ __global__ void fused_bias_residual(__half* input,
         float2 attn_low_bias = __half22float2(attnbias_half[0]);
         float2 attn_high_bias = __half22float2(attnbias_half[1]);
 
-        low_data.x =
-            (low_data.x + low_res.x) * mp_size + (low_out.x + (low_bias.x + attn_low_bias.x));
-        low_data.y =
-            (low_data.y + low_res.y) * mp_size + (low_out.y + (low_bias.y + attn_low_bias.y));
-        high_data.x =
-            (high_data.x + high_res.x) * mp_size + (high_out.x + (high_bias.x + attn_high_bias.x));
-        high_data.y =
-            (high_data.y + high_res.y) * mp_size + (high_out.y + (high_bias.y + attn_high_bias.y));
+        if (preln) {
+            low_data.x =
+                (low_data.x + low_res.x) * mp_size + (low_out.x + (low_bias.x + attn_low_bias.x));
+            low_data.y =
+                (low_data.y + low_res.y) * mp_size + (low_out.y + (low_bias.y + attn_low_bias.y));
+            high_data.x = (high_data.x + high_res.x) * mp_size +
+                          (high_out.x + (high_bias.x + attn_high_bias.x));
+            high_data.y = (high_data.y + high_res.y) * mp_size +
+                          (high_out.y + (high_bias.y + attn_high_bias.y));
+        } else {
+            low_data.x = (low_data.x + low_out.x + low_bias.x);
+            low_data.y = (low_data.y + low_out.y + low_bias.y);
+            high_data.x = (high_data.x + high_out.x + high_bias.x);
+            high_data.y = (high_data.y + high_out.y + high_bias.y);
+        }
         vals_half[0] = __float22half2_rn(low_data);
         vals_half[1] = __float22half2_rn(high_data);
 
@@ -274,6 +287,7 @@ void launch_bias_residual(T* input,
                           int batch,
                           int hidden_dim,
                           int mp_size,
+                          bool preln,
                           cudaStream_t stream)
 {
     int total_count = batch * hidden_dim / 4;
@@ -281,20 +295,13 @@ void launch_bias_residual(T* input,
     dim3 grid_dims((total_count - 1) / 1024 + 1);  // (batch_size);
 
     fused_bias_residual<<<grid_dims, block_dims, 0, stream>>>(
-        input, output, attn, bias, attn_bias, total_count, hidden_dim / 4, 1.0 / mp_size);
+        input, output, attn, bias, attn_bias, total_count, hidden_dim / 4, 1.0 / mp_size, preln);
 }
 
-template void
-launch_bias_residual<float>(float*, float*, float*, float*, float*, int, int, int, cudaStream_t);
-template void launch_bias_residual<__half>(__half*,
-                                           __half*,
-                                           __half*,
-                                           __half*,
-                                           __half*,
-                                           int,
-                                           int,
-                                           int,
-                                           cudaStream_t);
+template void launch_bias_residual<
+    float>(float*, float*, float*, float*, float*, int, int, int, bool, cudaStream_t);
+template void launch_bias_residual<
+    __half>(__half*, __half*, __half*, __half*, __half*, int, int, int, bool, cudaStream_t);
 
 __global__ void gptj_residual_add(float* input,
                                   float* output,
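The new preln flag selects between the pre-layernorm path (scaled residual sum plus both biases) and a plain bias add. A rough per-element reference of the float4 math above, written as a Python sketch for clarity (this helper is illustrative and not part of the commit; argument names follow the kernel parameters):

def fused_bias_residual_ref(data, out, res, bias, attn_bias, mp_size, preln):
    # Mirrors the float kernel above.
    if preln:
        # pre-layernorm: (data + res) scaled by mp_size, plus MLP output and both biases
        return (data + res) * mp_size + (out + bias + attn_bias)
    # post-layernorm: plain bias add; residual and attention bias are not applied here
    return data + out + bias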
@@ -787,17 +787,17 @@ at::Tensor ds_vector_matmul_int8(at::Tensor& input,
 }
 
 template <typename T>
-void mlp_unfused_cublas(at::Tensor& output,
+at::Tensor mlp_unfused_cublas(at::Tensor& output,
                         at::Tensor& input,
                         at::Tensor& residual,
                         at::Tensor& input_bias,
                         at::Tensor& weight,
                         at::Tensor& bias,
                         at::Tensor& gamma,
                         at::Tensor& beta,
                         const float epsilon,
                         bool preLayerNorm,
                         bool mlp_after_attn)
 {
     int bsz = input.size(0) * input.size(1);
     auto inp_norm = at::empty_like(input);
@@ -840,18 +840,19 @@ void mlp_unfused_cublas(at::Tensor& output,
                    weight.size(1),
                    bsz,
                    Context::Instance().GetCurrentStream());
+    return inp_norm;
 }
 template <typename T>
-at::Tensor ds_mlp_gemm(at::Tensor& input,
+std::vector<at::Tensor> ds_mlp_gemm(at::Tensor& input,
                        at::Tensor& residual,
                        at::Tensor& input_bias,
                        at::Tensor& weight,
                        at::Tensor& bias,
                        at::Tensor& gamma,
                        at::Tensor& beta,
                        const float epsilon,
                        bool preLayerNorm,
                        bool mlp_after_attn)
 {
     auto input_cont = input.contiguous();
     auto options = at::TensorOptions()
@@ -863,19 +864,19 @@ at::Tensor ds_mlp_gemm(at::Tensor& input,
     auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options);
     int bsz = input_cont.size(0) * input_cont.size(1);
 
-    mlp_unfused_cublas<T>(output,
+    auto res_add = mlp_unfused_cublas<T>(output,
                           mlp_after_attn ? input : residual,
                           residual,
                           input_bias,
                           weight,
                           bias,
                           gamma,
                           beta,
                           epsilon,
                           preLayerNorm,
                           mlp_after_attn);
 
-    return output;
+    return {output, res_add};
 }
 
 template <typename T>
@@ -1001,7 +1002,8 @@ void residual_add_bias(at::Tensor& output,
                        at::Tensor& attention_b,
                        int mp_size,
                        bool mlp_after_attn,
-                       bool add_bias)
+                       bool add_bias,
+                       bool preln)
 {
     int bsz = input.size(0) * input.size(1);
     int hidden_size = input.size(2);
@@ -1017,6 +1019,7 @@ void residual_add_bias(at::Tensor& output,
                                     bsz,
                                     hidden_size,
                                     mp_size,
+                                    preln,
                                     Context::Instance().GetCurrentStream());
     else
         launch_gptj_residual_add<float>((float*)input.data_ptr(),
@@ -1037,6 +1040,7 @@ void residual_add_bias(at::Tensor& output,
                                     bsz,
                                     hidden_size,
                                     mp_size,
+                                    preln,
                                     Context::Instance().GetCurrentStream());
     else
         launch_gptj_residual_add<__half>((__half*)input.data_ptr(),
@@ -58,6 +58,7 @@ void launch_bias_residual(T* input,
                           int batch,
                           int hidden_dim,
                           int mp_size,
+                          bool preln,
                           cudaStream_t stream);
 
 template <typename T>
@@ -3,21 +3,22 @@ Copyright 2021 The Microsoft DeepSpeed Team
 '''
 import torch
 import os
-from torch.nn.modules import Module
 import deepspeed.comm as dist
+import deepspeed.utils.groups as groups
+
+from torch.nn.modules import Module
+from packaging import version as pkg_version
+
 from ..runtime.state_dict_factory import SDLoaderFactory
 from ..runtime.weight_quantizer import WeightQuantization
 from ..module_inject.replace_module import replace_transformer_layer
 from ..utils import logger
 from ..comm.comm import init_distributed
 
 from ..pipe import PipelineModule
 from ..moe.utils import has_moe_layers
 from ..moe.layer import MoE
 
-import deepspeed.comm as dist
-import deepspeed.utils.groups as groups
-
 DS_INFERENCE_ENABLED = False
 
 
@@ -88,9 +89,13 @@ class InferenceEngine(Module):
         self.ep_group = ep_group
         self.expert_mp_group = expert_mp_group
         self.enable_cuda_graph = enable_cuda_graph
-        self.cuda_grah_created = False
+        self.cuda_graph_created = False
         self._init_quantization_setting(quantization_setting)
 
+        if enable_cuda_graph:
+            assert pkg_version.parse(torch.__version__) >= pkg_version.parse("1.10"), \
+                "If you want to use cuda graph, please upgrade torch to at least v1.10"
+
         if self.checkpoint:
             self._load_checkpoint(self.checkpoint)
 
@@ -372,7 +377,7 @@ class InferenceEngine(Module):
         with torch.cuda.graph(self._cuda_graphs):
             self.static_output = self.module(*self.static_inputs, **self.static_kwargs)
 
-        self.cuda_grah_created = True
+        self.cuda_graph_created = True
 
     def _graph_replay(self, *inputs, **kwargs):
         for i in range(len(inputs)):
@@ -409,7 +414,7 @@ class InferenceEngine(Module):
             outputs = self.model_orig_fwd(*inputs, **kwargs)
         else:
             if self.enable_cuda_graph:
-                if self.cuda_grah_created:
+                if self.cuda_graph_created:
                     outputs = self._graph_replay(*inputs, **kwargs)
                 else:
                     self._create_cuda_graph(*inputs, **kwargs)
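With the cuda_graph_created rename, the engine now also asserts torch >= 1.10 whenever enable_cuda_graph is requested. A minimal usage sketch, assuming a CUDA machine and the Hugging Face gpt2 checkpoint (the prompt and device index are illustrative, not part of the commit):

import torch
import deepspeed
from transformers import pipeline

# Build a standard HF pipeline, then swap its model for the DeepSpeed inference engine.
pipe = pipeline("text-generation", model="gpt2", device=0, framework="pt")
pipe.model = deepspeed.init_inference(
    pipe.model,
    mp_size=1,
    dtype=torch.half,
    replace_method="auto",
    replace_with_kernel_inject=True,
    enable_cuda_graph=True,  # after this commit, asserts torch >= 1.10
)
print(pipe("DeepSpeed is", do_sample=False))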
@@ -351,8 +351,11 @@ def replace_transformer_layer(orig_layer_impl,
             # linear layer is created with [input, output] shape
             # transpose it here to reduce inference cost!
             def transpose(data):
+                # temp move to cpu to avoid requiring extra GPU memory during the reshape
+                data = data.to('cpu')
                 data.reshape(-1).copy_(data.transpose(-1, -2).contiguous().reshape(-1))
                 data = data.reshape(data.shape[-1], data.shape[-2])
+                data.to(torch.cuda.current_device())
                 return data
 
             if attn_linear_layer:
@@ -460,7 +463,7 @@ def replace_transformer_layer(orig_layer_impl,
                 new_module.norm_b.data = input_nb.to(torch.cuda.current_device())
             else:
                 transformer_config = deepspeed.DeepSpeedTransformerConfig(
-                    batch_size=micro_batch_size,
+                    batch_size=micro_batch_size if micro_batch_size > 0 else 1,
                     hidden_size=config.hidden_size,
                     heads=config.num_attention_heads,
                     attn_dropout_ratio=config.attention_probs_dropout_prob,
@@ -471,7 +471,7 @@ class DeepSpeedMLPFunction(Function):
                                       config.pre_layer_norm,
                                       False)
         else:
-            intermediate = mlp_gemm_func(input,
+            intermediate, residual_add = mlp_gemm_func(input,
                                          residual,
                                          bias,
                                          inter_w,
@@ -482,14 +482,16 @@ class DeepSpeedMLPFunction(Function):
                                          config.pre_layer_norm,
                                          config.mlp_after_attn)
         output = vector_matmul_func(intermediate, output_w, False)
-        inference_cuda_module.residual_add(output,
-                                           residual,
-                                           input,
-                                           output_b,
-                                           bias if bias is not None else output_b,
-                                           config.mp_size,
-                                           config.mlp_after_attn,
-                                           bias is not None)
+        inference_cuda_module.residual_add(
+            output,
+            residual if config.pre_layer_norm else residual_add,
+            input,
+            output_b,
+            bias if bias is not None else output_b,
+            config.mp_size,
+            config.mlp_after_attn,
+            bias is not None,
+            config.pre_layer_norm)
         if mp_group is not None and dist.get_world_size(group=mp_group) > 1:
             dist.all_reduce(output, group=mp_group)
         return output
requirements/requirements-inf.txt (new file, 2 lines)

@@ -0,0 +1,2 @@
+lm-eval>=0.2.0
+transformers
setup.py (4 changed lines)

@@ -61,7 +61,8 @@ extras_require = {
     'dev': fetch_requirements('requirements/requirements-dev.txt'),
     'autotuning': fetch_requirements('requirements/requirements-autotuning.txt'),
     'autotuning_ml': fetch_requirements('requirements/requirements-autotuning-ml.txt'),
-    'sparse_attn': fetch_requirements('requirements/requirements-sparse_attn.txt')
+    'sparse_attn': fetch_requirements('requirements/requirements-sparse_attn.txt'),
+    'inf': fetch_requirements('requirements/requirements-inf.txt')
 }
 
 # Add specific cupy version to both onebit extension variants
@@ -291,6 +292,7 @@ setup(name='deepspeed',
           'bin/ds',
           'bin/ds_ssh',
           'bin/ds_report',
+          'bin/dsr',
           'bin/ds_elastic'
       ],
       classifiers=[
tests/pytest.ini (new file, 6 lines)

@@ -0,0 +1,6 @@
+[pytest]
+addopts = -m "not sequential and not nightly and not inference"
+markers =
+    sequential:Tests that need to be run sequentially
+    inference:Inference model tests
+    nightly:Tests that should be run nightly
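The addopts line deselects sequential, nightly, and inference tests by default, which is why the workflow changes above can drop the explicit -m 'not sequential' filter; each dedicated workflow then re-selects its own marker. A minimal sketch of how a test opts into one of the new markers (the test name and body are hypothetical):

import pytest

@pytest.mark.inference
def test_inference_smoke():
    # Skipped by the default addopts; runs only under an explicit
    # `pytest -m 'inference'`, as in nv-inference.yml above.
    assert True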
@@ -1,123 +1,323 @@
 import os
+import time
 import torch
 import pytest
+import itertools
 import deepspeed
+from deepspeed.git_version_info import torch_info
 from collections import defaultdict
-from transformers import pipeline
 from .common import distributed_test
 from packaging import version as pkg_version
+from deepspeed.ops.op_builder import OpBuilder
 
-pytest.task_query_dict = {
-    "fill-mask":
-    defaultdict(
-        lambda: "Hello I'm a [MASK] model.",
-        {"roberta-base": "Hello I'm a <mask> model."},
-    ),
-    "question-answering":
-    defaultdict(lambda: {
-        "question": "What is the greatest?",
-        "context": "DeepSpeed is the greatest",
-    }),
-    "text-classification":
-    defaultdict(lambda: "DeepSpeed is the greatest"),
-    "token-classification":
-    defaultdict(lambda: "My name is jean-baptiste and I live in montreal."),
-    "text-generation":
-    defaultdict(lambda: "DeepSpeed is the greatest"),
-}
-pytest.task_model_dict = {
-    "fill-mask": {
-        "bert": "bert-base-cased",
-        "roberta": "roberta-base"
-    },
-    "question-answering": {
-        "bert": "deepset/minilm-uncased-squad2",
-        "roberta": "deepset/roberta-base-squad2",
-    },
-    "text-classification": {
-        "bert": "cross-encoder/ms-marco-MiniLM-L-12-v2",
-        "roberta": "j-hartmann/emotion-english-distilroberta-base",
-    },
-    "token-classification": {
-        "bert": "dslim/bert-base-NER",
-        "roberta": "Jean-Baptiste/roberta-large-ner-english",
-    },
-    "text-generation": {
-        "gpt2": "distilgpt2",
-        "gpt_neo": "Norod78/hebrew-bad_wiki-gpt_neo-tiny",
-        "gptj": "EleutherAI/gpt-j-6B",
-    },
-}
+try:
+    import lm_eval
+    import lm_eval.models
+    import lm_eval.tasks
+    from lm_eval.evaluator import evaluate
+    from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
+    from huggingface_hub import HfApi
+except ImportError:
+    pytest.skip("please install w. [inf] extra to run this test",
+                allow_module_level=True)
+
+rocm_version = OpBuilder.installed_rocm_version()
+if rocm_version != (0, 0):
+    pytest.skip("skip inference tests on rocm for now", allow_module_level=True)
+
+_bert_models = [
+    "bert-base-cased",
+    "bert-base-uncased",
+    "bert-large-cased",
+    "bert-large-uncased",
+    "bert-base-multilingual-cased",
+    "bert-base-multilingual-uncased",
+    "deepset/minilm-uncased-squad2",
+    "cross-encoder/ms-marco-MiniLM-L-12-v2",
+    "dslim/bert-base-NER",
+    "bert-large-uncased-whole-word-masking-finetuned-squad",
+    "distilbert-base-cased-distilled-squad",
+]
+_roberta_models = [
+    "roberta-large",
+    "roberta-base",
+    "deepset/roberta-base-squad2",
+    "j-hartmann/emotion-english-distilroberta-base",
+    "Jean-Baptiste/roberta-large-ner-english",
+]
+_gpt_models = [
+    "gpt2",
+    "distilgpt2",
+    "Norod78/hebrew-bad_wiki-gpt_neo-tiny",
+    "EleutherAI/gpt-j-6B",
+]
+_all_models = HfApi().list_models()
+
+test_models = set(_bert_models + _roberta_models + _gpt_models)
+test_tasks = [
+    "fill-mask",
+    "question-answering",
+    "text-classification",
+    "token-classification",
+    "text-generation",
+]
+pytest.all_models = {
+    task: [m.modelId for m in _all_models if m.pipeline_tag == task]
+    for task in test_tasks
+}
+
+_model_w_tasks = itertools.product(*[test_models, test_tasks])
+
+
+def _valid_model_task(model_task):
+    m, t = model_task
+    return m in pytest.all_models[t]
+
+
+pytest.models_w_tasks = list(filter(_valid_model_task, _model_w_tasks))
+pytest.mt_names = [f"{m}-{t}" for m, t in pytest.models_w_tasks]
+"""
+These fixtures iterate all combinations of tasks and models, dtype, & cuda_graph
+"""
+
+
+@pytest.fixture(params=pytest.models_w_tasks, ids=pytest.mt_names)
+def model_w_task(request):
+    return request.param
+
+
+@pytest.fixture(params=[torch.float, torch.half], ids=["fp32", "fp16"])
+def dtype(request):
+    return request.param
+
+
+@pytest.fixture(params=[True, False], ids=["CG", "noCG"])
+def enable_cuda_graph(request):
+    return request.param
+
+
+"""
+This fixture will validate the configuration
+"""
+
+
+@pytest.fixture()
+def invalid_model_task_config(model_w_task, dtype, enable_cuda_graph):
+    model, task = model_w_task
+    if pkg_version.parse(torch.__version__) <= pkg_version.parse("1.2"):
+        msg = "DS inference injection doesn't work well on older torch versions"
+    elif model not in pytest.all_models[task]:
+        msg = f"Not a valid model / task combination: {model} / {task}"
+    elif enable_cuda_graph and (torch_info["cuda_version"] == "0.0"):
+        msg = "CUDA not detected, cannot use CUDA Graph"
+    elif enable_cuda_graph and pkg_version.parse(
+            torch.__version__) < pkg_version.parse("1.10"):
+        msg = "CUDA Graph is only available in torch versions >= 1.10"
+    elif ("gpt-j-6B" in model) and (dtype == torch.float):
+        msg = f"Not enough GPU memory to run {model} with dtype {dtype}"
+    else:
+        msg = ""
+    return msg
+
+
+"""
+These fixtures can be used to customize the query, inference args, and assert
+statement for each combination of model /task
+"""
 
 
 @pytest.fixture
-def model(task, model_family):
-    if model_family not in pytest.task_model_dict[task]:
-        pytest.skip(f"No models in family {model_family} for task {task}")
-    return pytest.task_model_dict[task][model_family]
+def query(model_w_task):
+    model, task = model_w_task
+    if task == "fill-mask":
+        if "roberta" in model:
+            return "Hello I'm a <mask> model."
+        else:
+            return "Hell I'm a [MASK] model."
+    elif task == "question-answering":
+        return {
+            "question": "What's my name?",
+            "context": "My name is Clara and I live in Berkeley",
+        }
+    elif task == "text-classification":
+        return "DeepSpeed is the greatest"
+    elif task == "token-classification":
+        return "My name is jean-baptiste and I live in montreal."
+    elif task == "text-generation":
+        return "DeepSpeed is the greatest"
+    else:
+        NotImplementedError(f'query for task "{task}" is not implemented')
 
 
 @pytest.fixture
-def query(task, model):
-    return pytest.task_query_dict[task][model]
+def inf_kwargs(model_w_task):
+    model, task = model_w_task
+    if task == "text-generation":
+        return {"do_sample": False}
+    else:
+        return {}
+
+
+@pytest.fixture
+def assert_fn(model_w_task):
+    model, task = model_w_task
+    if task == "fill-mask":
+        return lambda x, y: set(res["token_str"] for res in x) == set(
+            res["token_str"] for res in y
+        )
+    elif task == "question-answering":
+        return lambda x, y: x["answer"] == y["answer"]
+    elif task == "text-classification":
+        return lambda x, y: set(res["label"] for res in x) == set(
+            res["label"] for res in y
+        )
+    elif task == "token-classification":
+        return lambda x, y: set(ent["word"] for ent in x) == set(
+            ent["word"] for ent in y
+        )
+    elif task == "text-generation":
+        return lambda x, y: set(res["generated_text"] for res in x) == set(
+            res["generated_text"] for res in y
+        )
+    else:
+        NotImplementedError(f'assert_fn for task "{task}" is not implemented')
+
+
+"""
+Tests
+"""
+
+
+@pytest.mark.inference
+def test_model_task(
+    model_w_task,
+    dtype,
+    enable_cuda_graph,
+    query,
+    inf_kwargs,
+    assert_fn,
+    invalid_model_task_config,
+):
+    if invalid_model_task_config:
+        pytest.skip(invalid_model_task_config)
+
+    model, task = model_w_task
+
+    @distributed_test(world_size=[1])
+    def _go():
+        local_rank = int(os.getenv("LOCAL_RANK", "0"))
+
+        if "gpt-j-6B" in model and dtype == torch.half:
+            _model = AutoModelForCausalLM.from_pretrained(model)
+            tokenizer = AutoTokenizer.from_pretrained(model)
+            _model.half()
+            pipe = pipeline(
+                task,
+                model=_model,
+                tokenizer=tokenizer,
+                device=local_rank,
+                framework="pt",
+            )
+        else:
+            pipe = pipeline(task, model=model, device=local_rank, framework="pt")
+            if dtype == torch.half:
+                pipe.model.half()
+
+        # Warm-up queries for perf measurement
+        for i in range(10):
+            _ = pipe(query, **inf_kwargs)
+        torch.cuda.synchronize()
+        start = time.time()
+        bs_output = pipe(query, **inf_kwargs)
+        torch.cuda.synchronize()
+        bs_time = time.time() - start
+
+        pipe.model = deepspeed.init_inference(
+            pipe.model,
+            mp_size=1,
+            dtype=dtype,
+            replace_method="auto",
+            replace_with_kernel_inject=True,
+            enable_cuda_graph=enable_cuda_graph,
+        )
+        # Warm-up queries for perf measurement
+        for i in range(10):
+            _ = pipe(query, **inf_kwargs)
+        torch.cuda.synchronize()
+        start = time.time()
+        ds_output = pipe(query, **inf_kwargs)
+        torch.cuda.synchronize()
+        ds_time = time.time() - start
+
+        if task == "text-generation":
+            bs_output = pipe(query, **inf_kwargs)
+
+        # These performance tests are only measuring the time for a single
+        # inference request, we just want to check that performance isn't terrible
+        assert ds_time <= (bs_time * 1.1)
+        assert assert_fn(bs_output, ds_output)
+
+    _go()
 
 
+@pytest.mark.nightly
 @pytest.mark.parametrize(
-    "task",
+    "model_family, model_name",
     (
-        "fill-mask",
-        "question-answering",
-        "text-classification",
-        "token-classification",
-        "text-generation",
+        ["gpt2",
+         "EleutherAI/gpt-neo-2.7B"],
+        ["gpt2",
+         "EleutherAI/gpt-j-6B"],
+        ["gpt2",
+         "gpt2-xl"],
     ),
 )
-@pytest.mark.parametrize("model_family", ("bert", "roberta", "gpt2", "gpt_neo"))
-def test_model_task_inject(task, model, query, dtype=torch.float):
-    if pkg_version.parse(torch.__version__) <= pkg_version.parse('1.2'):
-        pytest.skip("DS inference injection doesn't work well on older torch versions")
-
+@pytest.mark.parametrize("task", ["lambada"])
+def test_lm_correctness(model_family, model_name, task):
     @distributed_test(world_size=[1])
     def _go():
-        local_rank = int(os.getenv("LOCAL_RANK", "0"))
-        world_size = int(os.getenv("WORLD_SIZE", "1"))
-        generator = pipeline(task, model=model, device=local_rank)
-
-        generator.model = deepspeed.init_inference(
-            generator.model,
-            mp_size=world_size,
+        local_rank = os.getenv("LOCAL_RANK", "0")
+        device = torch.device(f"cuda:{local_rank}")
+        dtype = torch.float
+        task_dict = lm_eval.tasks.get_task_dict([task])
+
+        if 'gpt-j-6B' in model_name:
+            dtype = torch.half
+            lm = lm_eval.models.get_model(model_family).create_from_arg_string(
+                f"pretrained={model_name}",
+                {"device": "cpu"})
+            setattr(lm, model_family, getattr(lm, model_family).half().to(device))
+            lm._device = device
+        else:
+            lm = lm_eval.models.get_model(model_family).create_from_arg_string(
+                f"pretrained={model_name}",
+                {"device": f"cuda:{local_rank}"})
+
+        torch.cuda.synchronize()
+        start = time.time()
+        bs_output = evaluate(lm=lm, task_dict=task_dict)
+        torch.cuda.synchronize()
+        bs_time = time.time() - start
+
+        ds_model = deepspeed.init_inference(
+            getattr(lm,
+                    model_family),
+            mp_size=1,
             dtype=dtype,
             replace_method="auto",
             replace_with_kernel_inject=True,
+            enable_cuda_graph=False,
         )
+        setattr(lm, model_family, ds_model)
+        torch.cuda.synchronize()
+        start = time.time()
+        ds_output = evaluate(lm=lm, task_dict=task_dict)
+        torch.cuda.synchronize()
+        ds_time = time.time() - start
 
-        response = generator(query)
-
-    _go()
-
-
-@pytest.mark.parametrize("dtype", [(torch.float), (torch.half)])
-def test_gpt2_inject(dtype):
-    if pkg_version.parse(torch.__version__) <= pkg_version.parse('1.2'):
-        pytest.skip("DS inference injection doesn't work well on older torch versions")
-
-    @distributed_test(world_size=[1])
-    def _go():
-        local_rank = int(os.getenv("LOCAL_RANK", "0"))
-        world_size = int(os.getenv("WORLD_SIZE", "1"))
-        generator = pipeline("text-generation", model="gpt2", device=local_rank)
-
-        generator.model = deepspeed.init_inference(
-            generator.model,
-            mp_size=world_size,
-            dtype=dtype,
-            replace_method="auto",
-            replace_with_kernel_inject=True,
-        )
-
-        prompt = "DeepSpeed is"
-        string_1 = generator(prompt, do_sample=False, max_length=128)
-        string_2 = generator(prompt, do_sample=False, max_length=128)
-        assert string_1 == string_2
+        ppl_diff = abs(bs_output["results"][task]["ppl"] -
+                       ds_output["results"][task]["ppl"])
+        assert ds_time <= bs_time
+        assert ppl_diff < 0.01
 
     _go()