# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import os
import time
import pytest
import torch
import deepspeed
from transformers import pipeline
from unit.common import DistributedTest
from deepspeed.accelerator import get_accelerator

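# Sanity check for DeepSpeed's inference profiling: the per-forward model time
# reported by the engine should never exceed the measured end-to-end latency.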
@pytest.mark.inference
@pytest.mark.parametrize("use_cuda_events", [True, False])
@pytest.mark.parametrize("enable_cuda_graph", [True, False])
class TestModelProfiling(DistributedTest):
    world_size = 1

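    # Runs the same fill-mask query repeatedly and compares wall-clock latency
    # against the engine-reported model time for each iteration.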
    def test(self, enable_cuda_graph, use_cuda_events):
        task = "fill-mask"
        model = "bert-base-cased"
        dtype = torch.float16
        query = "I am a [MASK] model"

        local_rank = int(os.getenv("LOCAL_RANK", "0"))
        world_size = int(os.getenv("WORLD_SIZE", "1"))

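        # Build a Hugging Face pipeline on the accelerator device and swap its
        # model for the DeepSpeed inference engine with kernel injection enabled.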
        pipe = pipeline(task, model, framework="pt", device=get_accelerator().device_name(local_rank))
        pipe.model = deepspeed.init_inference(pipe.model,
                                              dtype=dtype,
                                              mp_size=world_size,
                                              replace_with_kernel_inject=True,
                                              enable_cuda_graph=enable_cuda_graph)
        pipe.model.profile_model_time(use_cuda_events=use_cuda_events)

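        # Time 10 end-to-end pipeline calls, synchronizing before and after each
        # call so the wall-clock measurement covers all queued device work.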
        e2e_times = []
        model_times = []
        for _ in range(10):
            get_accelerator().synchronize()
            start = time.perf_counter_ns()

            r = pipe(query)

            get_accelerator().synchronize()
            end = time.perf_counter_ns()

            e2e_times.append((end - start) / 1e6)  # convert ns to ms
            model_times.extend(pipe.model.model_times())

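        # The model-only time is a subset of each end-to-end measurement, so it
        # must never be larger.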
        for e2e_t, model_t in zip(e2e_times, model_times):
            assert e2e_t >= model_t