Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 14:53:52 +08:00)

Compare commits (18 commits):
8fbd84bf78, 7d2dcce175, dc903e70ac, a9c8212895, c20ecb6a51, 5253edaacb,
017d9f1515, 181b27d881, 63e2a6419d, 264017a2bf, e433c115bc, 86fd8bb0ac,
ab3a5a8259, a61f0521b8, 537c9755a7, 786b7f18a5, 8f36444c4f, 185b2c29e2
@@ -11,8 +11,16 @@ steps:
- label: AsyncEngine Test
  command: pytest -v -s async_engine

- label: Distributed Test
  command: pytest -v -s test_comm_ops.py
- label: Basic Correctness Test
  command: pytest -v -s --forked basic_correctness

- label: Distributed Comm Ops Test
  command: pytest -v -s --forked test_comm_ops.py
  working_dir: "/vllm-workspace/tests/distributed"
  num_gpus: 2 # only support 1 or 2 for now.

- label: Distributed Correctness Test
  command: pytest -v -s --forked test_basic_distributed_correctness.py
  working_dir: "/vllm-workspace/tests/distributed"
  num_gpus: 2 # only support 1 or 2 for now.
@@ -60,6 +60,7 @@ vLLM seamlessly supports many Hugging Face models, including the following architectures:
- ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.)
- DeciLM (`Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.)
- Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
- Gemma (`google/gemma-2b`, `google/gemma-7b`, etc.)
- GPT-2 (`gpt2`, `gpt2-xl`, etc.)
- GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.)
- GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.)
@@ -70,6 +71,7 @@ vLLM seamlessly supports many Hugging Face models, including the following architectures:
- Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
- Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, etc.)
- MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
- OLMo (`allenai/OLMo-1B`, `allenai/OLMo-7B`, etc.)
- OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
- Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.)
- Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
@@ -49,4 +49,43 @@ the third parameter is the path to the LoRA adapter.

Check out `examples/multilora_inference.py <https://github.com/vllm-project/vllm/blob/main/examples/multilora_inference.py>`_
for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.

Serving LoRA Adapters
---------------------
LoRA-adapted models can also be served with the OpenAI-compatible vLLM server. To do so, we use
``--lora-modules {name}={path} {name}={path}`` to specify each LoRA module when we kick off the server:

.. code-block:: bash

    python -m vllm.entrypoints.openai.api_server \
        --model meta-llama/Llama-2-7b-hf \
        --enable-lora \
        --lora-modules sql-lora=~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/

The server entrypoint accepts all other LoRA configuration parameters (``max_loras``, ``max_lora_rank``, ``max_cpu_loras``,
etc.), which will apply to all forthcoming requests. Upon querying the ``/models`` endpoint, we should see our LoRA along
with its base model:

.. code-block:: bash

    curl localhost:8000/v1/models | jq .
    {
        "object": "list",
        "data": [
            {
                "id": "meta-llama/Llama-2-7b-hf",
                "object": "model",
                ...
            },
            {
                "id": "sql-lora",
                "object": "model",
                ...
            }
        ]
    }

Requests can specify the LoRA adapter as if it were any other model via the ``model`` request parameter. The requests will be
processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other
LoRA adapter requests if they were provided and ``max_loras`` is set high enough). An example request is sketched below.
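The request below is a hedged sketch rather than part of the original documentation: it assumes the server started
above is listening on ``localhost:8000`` and that ``sql-lora`` is the adapter name registered via ``--lora-modules``.
Selecting the adapter is just a matter of passing that name as ``model``:

.. code-block:: python

    # Minimal sketch: query a served LoRA adapter through the OpenAI-compatible API.
    # Assumes the server from the section above is running on localhost:8000.
    import openai

    client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
    completion = client.completions.create(
        model="sql-lora",  # the LoRA name registered with --lora-modules
        prompt="Hello, my name is",
        max_tokens=16,
        temperature=0.0,
    )
    print(completion.choices[0].text)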
@@ -32,6 +32,9 @@ Alongside each architecture, we include some popular models that use it.
  * - :code:`FalconForCausalLM`
    - Falcon
    - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc.
  * - :code:`GemmaForCausalLM`
    - Gemma
    - :code:`google/gemma-2b`, :code:`google/gemma-7b`, etc.
  * - :code:`GPT2LMHeadModel`
    - GPT-2
    - :code:`gpt2`, :code:`gpt2-xl`, etc.
@@ -62,6 +65,9 @@ Alongside each architecture, we include some popular models that use it.
  * - :code:`MPTForCausalLM`
    - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter
    - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc.
  * - :code:`OLMoForCausalLM`
    - OLMo
    - :code:`allenai/OLMo-1B`, :code:`allenai/OLMo-7B`, etc.
  * - :code:`OPTForCausalLM`
    - OPT, OPT-IML
    - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc.
@@ -12,7 +12,9 @@ from vllm import EngineArgs, LLMEngine, SamplingParams, RequestOutput
from vllm.lora.request import LoRARequest


def create_test_prompts(lora_path: str) -> List[Tuple[str, SamplingParams]]:
def create_test_prompts(
        lora_path: str
) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]:
    """Create a list of test prompts with their sampling parameters.

    2 requests for base model, 4 requests for the LoRA. We define 2
@@ -6,7 +6,7 @@ ray >= 2.9
sentencepiece  # Required for LLaMA tokenizer.
numpy
tokenizers>=0.15.0
transformers >= 4.37.0  # Required for Mixtral.
transformers >= 4.38.0  # Required for Gemma.
fastapi
uvicorn[standard]
pydantic >= 2.0  # Required for OpenAI server.
@@ -4,7 +4,7 @@ ray >= 2.9
sentencepiece  # Required for LLaMA tokenizer.
numpy
torch == 2.1.2
transformers >= 4.37.0  # Required for Qwen2
transformers >= 4.38.0  # Required for Gemma.
xformers == 0.0.23.post1  # Required for CUDA 12.1.
fastapi
uvicorn[standard]
setup.py (2 changes)
@@ -24,7 +24,7 @@ MAIN_CUDA_VERSION = "12.1"

# Supported NVIDIA GPU architectures.
NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx942", "gfx1100"}
ROCM_SUPPORTED_ARCHS = {"gfx908", "gfx90a", "gfx942", "gfx1100"}
# SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS)
@@ -64,7 +64,7 @@ def test_request_tracker():
    stream_5 = tracker.add_request("5")
    assert tracker.new_requests_event.flag
    tracker.process_request_output(
        RequestOutput("2", "output", [], [], [], finished=True))
        RequestOutput("2", "output", [], [], [], bool(finished)))
    new, finished = tracker.get_new_and_finished_requests()
    assert not tracker.new_requests_event.flag
    assert len(finished) == 1
tests/basic_correctness/test_basic_correctness.py (new file, 38 lines)
@@ -0,0 +1,38 @@
"""Compare the short outputs of HF and vLLM when using greedy sampling.

Run `pytest tests/basic_correctness/test_basic_correctness.py --forked`.
"""
import pytest

MODELS = [
    "facebook/opt-125m",
    "meta-llama/Llama-2-7b-hf",
]


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    hf_model = hf_runner(model, dtype=dtype)
    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
    del hf_model

    vllm_model = vllm_runner(model, dtype=dtype)
    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
    del vllm_model

    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
        vllm_output_ids, vllm_output_str = vllm_outputs[i]
        assert hf_output_str == vllm_output_str, (
            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
        assert hf_output_ids == vllm_output_ids, (
            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
@@ -13,12 +13,10 @@ _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]


def _read_prompts(filename: str) -> str:
    prompts = []
def _read_prompts(filename: str) -> List[str]:
    with open(filename, "r") as f:
        prompt = f.readline()
        prompts.append(prompt)
    return prompts
        prompts = f.readlines()
        return prompts


@pytest.fixture
@@ -165,6 +163,8 @@ class VllmRunner:
        model_name: str,
        tokenizer_name: Optional[str] = None,
        dtype: str = "half",
        disable_log_stats: bool = True,
        tensor_parallel_size: int = 1,
    ) -> None:
        self.model = LLM(
            model=model_name,
@@ -172,6 +172,8 @@ class VllmRunner:
            trust_remote_code=True,
            dtype=dtype,
            swap_space=0,
            disable_log_stats=disable_log_stats,
            tensor_parallel_size=tensor_parallel_size,
        )

    def generate(
tests/distributed/test_basic_distributed_correctness.py (new file, 41 lines)
@@ -0,0 +1,41 @@
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.

Run `pytest tests/distributed/test_basic_distributed_correctness.py --forked`.
"""
import pytest
import torch

MODELS = [
    "facebook/opt-125m",
    "meta-llama/Llama-2-7b-hf",
]


@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    hf_model = hf_runner(model, dtype=dtype)
    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
    del hf_model

    vllm_model = vllm_runner(model, dtype=dtype, tensor_parallel_size=2)
    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
    del vllm_model

    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
        vllm_output_ids, vllm_output_str = vllm_outputs[i]
        assert hf_output_str == vllm_output_str, (
            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
        assert hf_output_ids == vllm_output_ids, (
            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
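As a hedged aside (not part of the test above): the 2-GPU comparison hinges on the tensor_parallel_size=2 argument
that the vllm_runner fixture forwards to LLM. A standalone equivalent, assuming two visible CUDA devices, could look
like this:

# Sketch: greedy generation with 2-way tensor parallelism via the public LLM API.
# Requires at least 2 visible CUDA devices; model choice mirrors the test above.
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m", tensor_parallel_size=2)
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.0, max_tokens=5))
print(outputs[0].outputs[0].text)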
@ -7,9 +7,11 @@ import pytest
|
||||
import requests
|
||||
import ray # using Ray for overall ease of process management, parallel requests, and debugging.
|
||||
import openai # use the official client for correctness check
|
||||
from huggingface_hub import snapshot_download # downloading lora to test lora requests
|
||||
|
||||
MAX_SERVER_START_WAIT_S = 600  # wait up to 600 seconds for the server to start
|
||||
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" # any model with a chat template should work here
|
||||
LORA_NAME = "typeof/zephyr-7b-beta-lora" # technically this needs Mistral-7B-v0.1 as base, but we're not testing generation quality here
|
||||
|
||||
pytestmark = pytest.mark.asyncio
|
||||
|
||||
@ -54,7 +56,12 @@ class ServerRunner:
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def server():
|
||||
def zephyr_lora_files():
|
||||
return snapshot_download(repo_id=LORA_NAME)
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def server(zephyr_lora_files):
|
||||
ray.init()
|
||||
server_runner = ServerRunner.remote([
|
||||
"--model",
|
||||
@ -64,6 +71,17 @@ def server():
|
||||
"--max-model-len",
|
||||
"8192",
|
||||
"--enforce-eager",
|
||||
# lora config below
|
||||
"--enable-lora",
|
||||
"--lora-modules",
|
||||
f"zephyr-lora={zephyr_lora_files}",
|
||||
f"zephyr-lora2={zephyr_lora_files}",
|
||||
"--max-lora-rank",
|
||||
"64",
|
||||
"--max-cpu-loras",
|
||||
"2",
|
||||
"--max-num-seqs",
|
||||
"128"
|
||||
])
|
||||
ray.get(server_runner.ready.remote())
|
||||
yield server_runner
|
||||
@ -79,8 +97,25 @@ def client():
|
||||
yield client
|
||||
|
||||
|
||||
async def test_single_completion(server, client: openai.AsyncOpenAI):
|
||||
completion = await client.completions.create(model=MODEL_NAME,
|
||||
async def test_check_models(server, client: openai.AsyncOpenAI):
|
||||
models = await client.models.list()
|
||||
models = models.data
|
||||
served_model = models[0]
|
||||
lora_models = models[1:]
|
||||
assert served_model.id == MODEL_NAME
|
||||
assert all(model.root == MODEL_NAME for model in models)
|
||||
assert lora_models[0].id == "zephyr-lora"
|
||||
assert lora_models[1].id == "zephyr-lora2"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
# first test base model, then test loras
|
||||
"model_name",
|
||||
[MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
|
||||
)
|
||||
async def test_single_completion(server, client: openai.AsyncOpenAI,
|
||||
model_name: str):
|
||||
completion = await client.completions.create(model=model_name,
|
||||
prompt="Hello, my name is",
|
||||
max_tokens=5,
|
||||
temperature=0.0)
|
||||
@ -104,7 +139,13 @@ async def test_single_completion(server, client: openai.AsyncOpenAI):
|
||||
completion.choices[0].text) >= 5
|
||||
|
||||
|
||||
async def test_single_chat_session(server, client: openai.AsyncOpenAI):
|
||||
@pytest.mark.parametrize(
|
||||
# just test 1 lora hereafter
|
||||
"model_name",
|
||||
[MODEL_NAME, "zephyr-lora"],
|
||||
)
|
||||
async def test_single_chat_session(server, client: openai.AsyncOpenAI,
|
||||
model_name: str):
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
@ -115,7 +156,7 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI):
|
||||
|
||||
# test single completion
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
max_tokens=10,
|
||||
)
|
||||
@ -139,11 +180,17 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI):
|
||||
assert message.content is not None and len(message.content) >= 0
|
||||
|
||||
|
||||
async def test_completion_streaming(server, client: openai.AsyncOpenAI):
|
||||
@pytest.mark.parametrize(
|
||||
# just test 1 lora hereafter
|
||||
"model_name",
|
||||
[MODEL_NAME, "zephyr-lora"],
|
||||
)
|
||||
async def test_completion_streaming(server, client: openai.AsyncOpenAI,
|
||||
model_name: str):
|
||||
prompt = "What is an LLM?"
|
||||
|
||||
single_completion = await client.completions.create(
|
||||
model=MODEL_NAME,
|
||||
model=model_name,
|
||||
prompt=prompt,
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
@ -152,7 +199,7 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI):
|
||||
single_usage = single_completion.usage
|
||||
|
||||
stream = await client.completions.create(
|
||||
model=MODEL_NAME,
|
||||
model=model_name,
|
||||
prompt=prompt,
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
@ -166,7 +213,13 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI):
|
||||
assert "".join(chunks) == single_output
|
||||
|
||||
|
||||
async def test_chat_streaming(server, client: openai.AsyncOpenAI):
|
||||
@pytest.mark.parametrize(
|
||||
# just test 1 lora hereafter
|
||||
"model_name",
|
||||
[MODEL_NAME, "zephyr-lora"],
|
||||
)
|
||||
async def test_chat_streaming(server, client: openai.AsyncOpenAI,
|
||||
model_name: str):
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
@ -177,7 +230,7 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI):
|
||||
|
||||
# test single completion
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
max_tokens=10,
|
||||
temperature=0.0,
|
||||
@ -187,7 +240,7 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI):
|
||||
|
||||
# test streaming
|
||||
stream = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
max_tokens=10,
|
||||
temperature=0.0,
|
||||
@ -204,10 +257,16 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI):
|
||||
assert "".join(chunks) == output
|
||||
|
||||
|
||||
async def test_batch_completions(server, client: openai.AsyncOpenAI):
|
||||
@pytest.mark.parametrize(
|
||||
# just test 1 lora hereafter
|
||||
"model_name",
|
||||
[MODEL_NAME, "zephyr-lora"],
|
||||
)
|
||||
async def test_batch_completions(server, client: openai.AsyncOpenAI,
|
||||
model_name: str):
|
||||
# test simple list
|
||||
batch = await client.completions.create(
|
||||
model=MODEL_NAME,
|
||||
model=model_name,
|
||||
prompt=["Hello, my name is", "Hello, my name is"],
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
@ -217,7 +276,7 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI):
|
||||
|
||||
# test n = 2
|
||||
batch = await client.completions.create(
|
||||
model=MODEL_NAME,
|
||||
model=model_name,
|
||||
prompt=["Hello, my name is", "Hello, my name is"],
|
||||
n=2,
|
||||
max_tokens=5,
|
||||
@ -236,7 +295,7 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI):
|
||||
|
||||
# test streaming
|
||||
batch = await client.completions.create(
|
||||
model=MODEL_NAME,
|
||||
model=model_name,
|
||||
prompt=["Hello, my name is", "Hello, my name is"],
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
|
tests/metrics/test_metrics.py (new file, 33 lines)
@@ -0,0 +1,33 @@
|
||||
import pytest
|
||||
import vllm.engine.metrics
|
||||
|
||||
MODELS = [
|
||||
"facebook/opt-125m",
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
def test_metrics(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
) -> None:
|
||||
vllm_model = vllm_runner(model, dtype=dtype, disable_log_stats=False)
|
||||
tokenizer = vllm_model.model.get_tokenizer()
|
||||
prompt_token_counts = [len(tokenizer.encode(p)) for p in example_prompts]
|
||||
# This test needs at least 2 prompts in a batch of different lengths to verify their token count is correct despite padding.
|
||||
assert len(example_prompts) > 1, "at least 2 prompts are required"
|
||||
assert prompt_token_counts[0] != prompt_token_counts[1], (
|
||||
"prompts of different lengths are required")
|
||||
vllm_prompt_token_count = sum(prompt_token_counts)
|
||||
|
||||
_ = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
metric_count = vllm.engine.metrics.counter_prompt_tokens.get_value({})
|
||||
|
||||
assert vllm_prompt_token_count == metric_count, (
|
||||
f"prompt token count: {vllm_prompt_token_count!r}\nmetric: {metric_count!r}"
|
||||
)
|
@ -5,11 +5,20 @@ Run `pytest tests/models/test_models.py --forked`.
|
||||
import pytest
|
||||
|
||||
MODELS = [
|
||||
"facebook/opt-125m", "meta-llama/Llama-2-7b-hf",
|
||||
"mistralai/Mistral-7B-v0.1", "Deci/DeciLM-7b", "tiiuae/falcon-7b", "gpt2",
|
||||
"bigcode/tiny_starcoder_py", "EleutherAI/gpt-j-6b",
|
||||
"EleutherAI/pythia-70m", "bigscience/bloom-560m", "mosaicml/mpt-7b",
|
||||
"microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"
|
||||
"facebook/opt-125m",
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
"mistralai/Mistral-7B-v0.1",
|
||||
"Deci/DeciLM-7b",
|
||||
"tiiuae/falcon-7b",
|
||||
"gpt2",
|
||||
"bigcode/tiny_starcoder_py",
|
||||
"EleutherAI/gpt-j-6b",
|
||||
"EleutherAI/pythia-70m",
|
||||
"bigscience/bloom-560m",
|
||||
"mosaicml/mpt-7b",
|
||||
"microsoft/phi-2",
|
||||
"stabilityai/stablelm-3b-4e1t",
|
||||
"allenai/OLMo-1B",
|
||||
]
|
||||
|
||||
|
||||
|
@ -26,6 +26,7 @@ def test_beam_search_single_input(
|
||||
max_tokens: int,
|
||||
beam_width: int,
|
||||
) -> None:
|
||||
example_prompts = example_prompts[:1]
|
||||
hf_model = hf_runner(model, dtype=dtype)
|
||||
hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
|
||||
max_tokens)
|
||||
|
@ -1,10 +1,11 @@
|
||||
import random
|
||||
from typing import Tuple
|
||||
from typing import Tuple, List
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import GenerationConfig, GenerationMixin
|
||||
from typing import Optional
|
||||
|
||||
from vllm.model_executor.layers.sampler import Sampler
|
||||
from vllm.model_executor.utils import set_random_seed
|
||||
@ -46,6 +47,34 @@ CUDA_DEVICES = [
|
||||
]
|
||||
|
||||
|
||||
def _do_sample(
|
||||
batch_size: int,
|
||||
input_tensor: torch.Tensor,
|
||||
sampler: MockLogitsSampler,
|
||||
model_runner: ModelRunner,
|
||||
sampling_params: SamplingParams,
|
||||
):
|
||||
seq_group_metadata_list = []
|
||||
prompt_lens = []
|
||||
for i in range(batch_size):
|
||||
seq_group_metadata_list.append(
|
||||
SequenceGroupMetadata(
|
||||
request_id=f"test_{i}",
|
||||
is_prompt=True,
|
||||
seq_data={0: SequenceData([1, 2, 3])},
|
||||
sampling_params=sampling_params,
|
||||
block_tables={0: [1]},
|
||||
))
|
||||
prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
|
||||
|
||||
sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list,
|
||||
prompt_lens,
|
||||
subquery_lens=prompt_lens)
|
||||
return sampler(embedding=None,
|
||||
hidden_states=input_tensor,
|
||||
sampling_metadata=sampling_metadata)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seed", RANDOM_SEEDS)
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
def test_sampler_all_greedy(seed: int, device: str):
|
||||
@ -55,25 +84,9 @@ def test_sampler_all_greedy(seed: int, device: str):
|
||||
input_tensor, fake_logits, sampler, model_runner = _prepare_test(
|
||||
batch_size)
|
||||
|
||||
seq_group_metadata_list = []
|
||||
prompt_lens = []
|
||||
for i in range(batch_size):
|
||||
seq_group_metadata_list.append(
|
||||
SequenceGroupMetadata(
|
||||
request_id=f"test_{i}",
|
||||
is_prompt=True,
|
||||
seq_data={0: SequenceData([1, 2, 3])},
|
||||
sampling_params=SamplingParams(temperature=0, ),
|
||||
block_tables={0: [1]},
|
||||
))
|
||||
prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
|
||||
|
||||
sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list,
|
||||
prompt_lens,
|
||||
subquery_lens=prompt_lens)
|
||||
sampler_output = sampler(embedding=None,
|
||||
hidden_states=input_tensor,
|
||||
sampling_metadata=sampling_metadata)
|
||||
sampling_params = SamplingParams(temperature=0)
|
||||
sampler_output = _do_sample(batch_size, input_tensor, sampler,
|
||||
model_runner, sampling_params)
|
||||
expected = torch.argmax(fake_logits, dim=-1)
|
||||
for i, sequence_output in enumerate(sampler_output):
|
||||
for nth_output in sequence_output.samples:
|
||||
@ -94,28 +107,13 @@ def test_sampler_all_random(seed: int, device: str):
|
||||
for i in range(batch_size):
|
||||
fake_logits[i, i] = 1e2
|
||||
|
||||
seq_group_metadata_list = []
|
||||
prompt_lens = []
|
||||
for i in range(batch_size):
|
||||
seq_group_metadata_list.append(
|
||||
SequenceGroupMetadata(
|
||||
request_id=f"test_{i}",
|
||||
is_prompt=True,
|
||||
seq_data={0: SequenceData([1, 2, 3])},
|
||||
sampling_params=SamplingParams(
|
||||
temperature=1.0,
|
||||
n=random.randint(1, 10),
|
||||
),
|
||||
block_tables={0: [1]},
|
||||
))
|
||||
prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
n=random.randint(1, 10),
|
||||
)
|
||||
sampler_output = _do_sample(batch_size, input_tensor, sampler,
|
||||
model_runner, sampling_params)
|
||||
|
||||
sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list,
|
||||
prompt_lens,
|
||||
subquery_lens=prompt_lens)
|
||||
sampler_output = sampler(embedding=None,
|
||||
hidden_states=input_tensor,
|
||||
sampling_metadata=sampling_metadata)
|
||||
for i, sequence_output in enumerate(sampler_output):
|
||||
for nth_output in sequence_output.samples:
|
||||
assert nth_output.output_token == i
|
||||
@ -123,6 +121,58 @@ def test_sampler_all_random(seed: int, device: str):
|
||||
del model_runner
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seed", RANDOM_SEEDS)
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
def test_sampler_all_random_seed(seed: int, device: str):
|
||||
set_random_seed(seed)
|
||||
torch.set_default_device(device)
|
||||
batch_size = random.randint(1, 256)
|
||||
input_tensor, fake_logits, sampler, model_runner = _prepare_test(
|
||||
batch_size)
|
||||
|
||||
for i in range(batch_size):
|
||||
fake_logits[i, i] = 1e2
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
n=random.randint(1, 10),
|
||||
seed=random.randint(0, 10000),
|
||||
)
|
||||
sampler_output = _do_sample(batch_size, input_tensor, sampler,
|
||||
model_runner, sampling_params)
|
||||
|
||||
for i, sequence_output in enumerate(sampler_output):
|
||||
for nth_output in sequence_output.samples:
|
||||
assert nth_output.output_token == i
|
||||
|
||||
del model_runner
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seed", RANDOM_SEEDS)
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
def test_sampler_all_random_seed_deterministic(seed: int, device: str):
|
||||
set_random_seed(seed)
|
||||
torch.set_default_device(device)
|
||||
batch_size = random.randint(1, 256)
|
||||
input_tensor, fake_logits, sampler, model_runner = _prepare_test(
|
||||
batch_size)
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
n=random.randint(1, 10),
|
||||
seed=random.randint(0, 10000),
|
||||
)
|
||||
first_sampler_output = _do_sample(batch_size, input_tensor, sampler,
|
||||
model_runner, sampling_params)
|
||||
|
||||
second_sampler_output = _do_sample(batch_size, input_tensor, sampler,
|
||||
model_runner, sampling_params)
|
||||
|
||||
assert first_sampler_output == second_sampler_output
|
||||
|
||||
del model_runner
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seed", RANDOM_SEEDS)
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
def test_sampler_all_beam(seed: int, device: str):
|
||||
@ -131,29 +181,13 @@ def test_sampler_all_beam(seed: int, device: str):
|
||||
batch_size = random.randint(1, 256)
|
||||
input_tensor, _, sampler, model_runner = _prepare_test(batch_size)
|
||||
|
||||
seq_group_metadata_list = []
|
||||
prompt_lens = []
|
||||
for i in range(batch_size):
|
||||
seq_group_metadata_list.append(
|
||||
SequenceGroupMetadata(
|
||||
request_id=f"test_{i}",
|
||||
is_prompt=True,
|
||||
seq_data={0: SequenceData([1, 2, 3])},
|
||||
sampling_params=SamplingParams(
|
||||
temperature=0,
|
||||
best_of=2,
|
||||
use_beam_search=True,
|
||||
),
|
||||
block_tables={0: [1]},
|
||||
))
|
||||
prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
|
||||
|
||||
sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list,
|
||||
prompt_lens,
|
||||
subquery_lens=prompt_lens)
|
||||
sampler(embedding=None,
|
||||
hidden_states=input_tensor,
|
||||
sampling_metadata=sampling_metadata)
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
best_of=2,
|
||||
use_beam_search=True,
|
||||
)
|
||||
_do_sample(batch_size, input_tensor, sampler, model_runner,
|
||||
sampling_params)
|
||||
# no assertion here as I am not sure how to determine whether
|
||||
# the outputs are expected - in other words, this just tests
|
||||
# whether there are no exceptions in the sampler
|
||||
@ -171,14 +205,15 @@ def test_sampler_mixed(seed: int, device: str):
|
||||
batch_size)
|
||||
|
||||
seq_group_metadata_list = []
|
||||
expected_tokens = []
|
||||
expected_tokens: List[Optional[List[int]]] = []
|
||||
prompt_lens = []
|
||||
for i in range(batch_size):
|
||||
n = 1
|
||||
sampling_type = random.randint(0, 2)
|
||||
expected: Optional[List[int]] = None
|
||||
sampling_type = random.randint(0, 3)
|
||||
if sampling_type == 0:
|
||||
sampling_params = SamplingParams(temperature=0)
|
||||
elif sampling_type == 1:
|
||||
expected = [torch.argmax(fake_logits[i], dim=-1).item()]
|
||||
elif sampling_type in (1, 2):
|
||||
n = random.randint(1, 10)
|
||||
sampling_params = SamplingParams(
|
||||
temperature=random.random() + 0.1,
|
||||
@ -187,13 +222,17 @@ def test_sampler_mixed(seed: int, device: str):
|
||||
n=n,
|
||||
presence_penalty=random.randint(0, 1),
|
||||
)
|
||||
if sampling_type == 2:
|
||||
sampling_params.seed = random.randint(0, 10000)
|
||||
else:
|
||||
for idx in range(n):
|
||||
fake_logits[i, i + idx] = 1e2
|
||||
expected = list(range(i, i + n))
|
||||
else:
|
||||
sampling_params = SamplingParams(temperature=0,
|
||||
use_beam_search=True,
|
||||
best_of=2)
|
||||
for idx in range(n):
|
||||
fake_logits[i, i + idx] = 1e2
|
||||
expected_tokens.append(i + idx)
|
||||
expected_tokens.append(expected)
|
||||
seq_group_metadata_list.append(
|
||||
SequenceGroupMetadata(
|
||||
request_id=f"test_{i}",
|
||||
@ -204,17 +243,50 @@ def test_sampler_mixed(seed: int, device: str):
|
||||
))
|
||||
prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
|
||||
|
||||
sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list,
|
||||
prompt_lens,
|
||||
subquery_lens=prompt_lens)
|
||||
sampler_output = sampler(embedding=None,
|
||||
hidden_states=input_tensor,
|
||||
sampling_metadata=sampling_metadata)
|
||||
for i, sequence_output in enumerate(sampler_output):
|
||||
if seq_group_metadata_list[i].sampling_params.use_beam_search:
|
||||
continue
|
||||
for nth_output in sequence_output.samples:
|
||||
assert nth_output.output_token in expected_tokens
|
||||
def test_sampling(model_runner: ModelRunner):
|
||||
sampling_metadata = model_runner._prepare_sample(
|
||||
seq_group_metadata_list, prompt_lens, subquery_lens=prompt_lens)
|
||||
sampler_output = sampler(embedding=None,
|
||||
hidden_states=input_tensor,
|
||||
sampling_metadata=sampling_metadata)
|
||||
|
||||
for i, (sequence_output, metadata) in enumerate(
|
||||
zip(sampler_output, seq_group_metadata_list)):
|
||||
if metadata.sampling_params.use_beam_search:
|
||||
continue
|
||||
|
||||
if metadata.sampling_params.seed is not None \
|
||||
and expected_tokens[i] is None:
|
||||
# Record seeded random result to compare with results of second invocation
|
||||
expected_tokens[i] = [
|
||||
nth_output.output_token
|
||||
for nth_output in sequence_output.samples
|
||||
]
|
||||
continue
|
||||
|
||||
for n, nth_output in enumerate(sequence_output.samples):
|
||||
if metadata.sampling_params.temperature == 0 or metadata.sampling_params.seed is not None:
|
||||
# Ensure exact matches for greedy or random with seed
|
||||
assert nth_output.output_token == expected_tokens[i][n]
|
||||
else:
|
||||
# For non-seeded random check that one of the high-logit tokens were chosen
|
||||
assert nth_output.output_token in expected_tokens[i]
|
||||
|
||||
# Test batch
|
||||
test_sampling(model_runner)
|
||||
|
||||
# Shuffle the batch and resample
|
||||
target_index = list(range(batch_size))
|
||||
for list_to_shuffle in (target_index, seq_group_metadata_list,
|
||||
expected_tokens, prompt_lens):
|
||||
random.Random(seed).shuffle(list_to_shuffle)
|
||||
target_index = torch.tensor(target_index)
|
||||
input_tensor.data = input_tensor.index_select(0, target_index)
|
||||
fake_logits.data = fake_logits.index_select(0, target_index)
|
||||
|
||||
# This time, results of seeded random samples will be compared with the corresponding
|
||||
# sample in the pre-shuffled batch
|
||||
test_sampling(model_runner)
|
||||
|
||||
del model_runner
|
||||
|
||||
|
tests/samplers/test_seeded_generate.py (new file, 82 lines)
@@ -0,0 +1,82 @@
|
||||
"""Verify that seeded random sampling is deterministic.
|
||||
|
||||
Run `pytest tests/samplers/test_seeded_generate.py --forked`.
|
||||
"""
|
||||
import copy
|
||||
import random
|
||||
from itertools import combinations
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.model_executor.utils import set_random_seed
|
||||
from vllm import SamplingParams
|
||||
|
||||
MODEL = "facebook/opt-125m"
|
||||
RANDOM_SEEDS = list(range(5))
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def vllm_model(vllm_runner):
|
||||
vllm_model = vllm_runner(MODEL, dtype="half")
|
||||
yield vllm_model
|
||||
del vllm_model
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seed", RANDOM_SEEDS)
|
||||
def test_random_sample_with_seed(
|
||||
vllm_model,
|
||||
example_prompts,
|
||||
seed: int,
|
||||
) -> None:
|
||||
set_random_seed(seed)
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
# Parameters to ensure sufficient randomness
|
||||
temperature=2.0,
|
||||
top_p=min(random.random() + 0.3, 1),
|
||||
top_k=random.randint(5, 20),
|
||||
n=random.randint(1, 10),
|
||||
presence_penalty=random.randint(0, 1),
|
||||
max_tokens=8,
|
||||
ignore_eos=True,
|
||||
)
|
||||
|
||||
sampling_params_seed_1 = copy.deepcopy(sampling_params)
|
||||
sampling_params_seed_1.seed = 100
|
||||
sampling_params_seed_2 = copy.deepcopy(sampling_params)
|
||||
sampling_params_seed_2.seed = 200
|
||||
|
||||
llm = vllm_model.model
|
||||
|
||||
for prompt in example_prompts:
|
||||
for params in (
|
||||
sampling_params,
|
||||
sampling_params_seed_1,
|
||||
sampling_params_seed_2,
|
||||
sampling_params,
|
||||
sampling_params_seed_1,
|
||||
sampling_params_seed_2,
|
||||
):
|
||||
llm._add_request(
|
||||
prompt=prompt,
|
||||
prompt_token_ids=None,
|
||||
sampling_params=params,
|
||||
)
|
||||
|
||||
results = llm._run_engine(use_tqdm=False)
|
||||
all_outputs = [[out.token_ids for out in output.outputs]
|
||||
for output in results]
|
||||
|
||||
for i in range(0, len(example_prompts), 6):
|
||||
outputs = all_outputs[i:i + 6]
|
||||
|
||||
# verify all non-seeded requests differ
|
||||
for output_a, output_b in combinations(
|
||||
(outputs[0], outputs[1], outputs[2], outputs[3]),
|
||||
2,
|
||||
):
|
||||
assert output_a != output_b
|
||||
|
||||
# verify requests with the same seed match
|
||||
assert outputs[1] == outputs[4]
|
||||
assert outputs[2] == outputs[5]
|
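A hedged companion sketch (not part of the diff): the per-request seed added to SamplingParams makes repeated
generations with identical parameters reproducible, which is what the test above checks end to end.

# Sketch: two seeded generations with the same parameters should match token-for-token.
# Assumes the `seed` field on SamplingParams introduced by this change.
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")
params = SamplingParams(temperature=1.0, top_p=0.9, max_tokens=16, seed=1234)

first = llm.generate(["The quick brown fox"], params)
second = llm.generate(["The quick brown fox"], params)
assert first[0].outputs[0].token_ids == second[0].outputs[0].token_ids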
@ -8,7 +8,7 @@ from vllm.entrypoints.llm import LLM
|
||||
from vllm.outputs import CompletionOutput, RequestOutput
|
||||
from vllm.sampling_params import SamplingParams
|
||||
|
||||
__version__ = "0.3.1"
|
||||
__version__ = "0.3.2"
|
||||
|
||||
__all__ = [
|
||||
"LLM",
|
||||
|
@ -44,6 +44,9 @@ class ModelConfig:
|
||||
revision: The specific model version to use. It can be a branch name,
|
||||
a tag name, or a commit id. If unspecified, will use the default
|
||||
version.
|
||||
code_revision: The specific revision to use for the model code on
|
||||
Hugging Face Hub. It can be a branch name, a tag name, or a
|
||||
commit id. If unspecified, will use the default version.
|
||||
tokenizer_revision: The specific tokenizer version to use. It can be a
|
||||
branch name, a tag name, or a commit id. If unspecified, will use
|
||||
the default version.
|
||||
@ -70,6 +73,7 @@ class ModelConfig:
|
||||
dtype: Union[str, torch.dtype],
|
||||
seed: int,
|
||||
revision: Optional[str] = None,
|
||||
code_revision: Optional[str] = None,
|
||||
tokenizer_revision: Optional[str] = None,
|
||||
max_model_len: Optional[int] = None,
|
||||
quantization: Optional[str] = None,
|
||||
@ -84,6 +88,7 @@ class ModelConfig:
|
||||
self.load_format = load_format
|
||||
self.seed = seed
|
||||
self.revision = revision
|
||||
self.code_revision = code_revision
|
||||
self.tokenizer_revision = tokenizer_revision
|
||||
self.quantization = quantization
|
||||
self.enforce_eager = enforce_eager
|
||||
@ -103,7 +108,8 @@ class ModelConfig:
|
||||
self.download_dir = model_path
|
||||
self.tokenizer = model_path
|
||||
|
||||
self.hf_config = get_config(self.model, trust_remote_code, revision)
|
||||
self.hf_config = get_config(self.model, trust_remote_code, revision,
|
||||
code_revision)
|
||||
self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
|
||||
self.max_model_len = _get_and_verify_max_len(self.hf_config,
|
||||
max_model_len)
|
||||
|
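For orientation, a hedged sketch (not part of the diff): code_revision flows from EngineArgs into ModelConfig and on
to get_config, so the remote modeling code of a trust_remote_code model can be pinned independently of the weights
revision. The create_engine_configs() entry point and its return order are assumptions inferred from the surrounding
hunks, not confirmed by this diff.

# Sketch: pin the weights revision and the Hub modeling-code revision separately.
# create_engine_configs() and its return order are assumptions, not confirmed here.
from vllm.engine.arg_utils import EngineArgs

engine_args = EngineArgs(
    model="mosaicml/mpt-7b",
    trust_remote_code=True,
    revision="main",        # weights/tokenizer revision
    code_revision="main",   # revision of the modeling code on the Hub (new in this change)
)
model_config = engine_args.create_engine_configs()[0]
print(model_config.code_revision)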
@ -33,7 +33,7 @@ class FCFS(Policy):
|
||||
now: float,
|
||||
seq_group: SequenceGroup,
|
||||
) -> float:
|
||||
return now - seq_group.arrival_time
|
||||
return now - seq_group.metrics.arrival_time
|
||||
|
||||
|
||||
class PolicyFactory:
|
||||
|
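A toy illustration (not vLLM's actual classes) of what the FCFS priority above computes: priority is simply how long
a request has been waiting, so sorting by it in descending order serves the oldest request first.

# Toy sketch of FCFS ordering: higher priority means the request has waited longer.
def fcfs_priority(now: float, arrival_time: float) -> float:
    return now - arrival_time

arrivals = {"req-a": 100.0, "req-b": 103.5, "req-c": 101.2}
now = 110.0
ordered = sorted(arrivals, key=lambda r: fcfs_priority(now, arrivals[r]), reverse=True)
print(ordered)  # ['req-a', 'req-c', 'req-b'], the oldest request first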
@ -365,10 +365,13 @@ class Scheduler:
|
||||
# This function call changes the internal states of the scheduler
|
||||
# such as self.running, self.swapped, and self.waiting.
|
||||
scheduler_outputs = self._schedule()
|
||||
now = time.time()
|
||||
|
||||
# Create input data structures.
|
||||
seq_group_metadata_list: List[SequenceGroupMetadata] = []
|
||||
for seq_group in scheduler_outputs.scheduled_seq_groups:
|
||||
seq_group.maybe_set_first_scheduled_time(now)
|
||||
|
||||
seq_data: Dict[int, SequenceData] = {}
|
||||
block_tables: Dict[int, List[int]] = {}
|
||||
for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
|
||||
@ -384,6 +387,7 @@ class Scheduler:
|
||||
block_tables=block_tables,
|
||||
lora_request=seq_group.lora_request,
|
||||
prefix=seq_group.prefix,
|
||||
state=seq_group.state,
|
||||
)
|
||||
seq_group_metadata_list.append(seq_group_metadata)
|
||||
return seq_group_metadata_list, scheduler_outputs
|
||||
|
@ -32,6 +32,7 @@ class EngineArgs:
|
||||
max_paddings: int = 256
|
||||
disable_log_stats: bool = False
|
||||
revision: Optional[str] = None
|
||||
code_revision: Optional[str] = None
|
||||
tokenizer_revision: Optional[str] = None
|
||||
quantization: Optional[str] = None
|
||||
enforce_eager: bool = False
|
||||
@ -75,6 +76,13 @@ class EngineArgs:
|
||||
help='the specific model version to use. It can be a branch '
|
||||
'name, a tag name, or a commit id. If unspecified, will use '
|
||||
'the default version.')
|
||||
parser.add_argument(
|
||||
'--code-revision',
|
||||
type=str,
|
||||
default=None,
|
||||
help='the specific revision to use for the model code on '
|
||||
'Hugging Face Hub. It can be a branch name, a tag name, or a '
|
||||
'commit id. If unspecified, will use the default version.')
|
||||
parser.add_argument(
|
||||
'--tokenizer-revision',
|
||||
type=str,
|
||||
@ -165,7 +173,6 @@ class EngineArgs:
|
||||
default=EngineArgs.block_size,
|
||||
choices=[8, 16, 32],
|
||||
help='token block size')
|
||||
# TODO(woosuk): Support fine-grained seeds (e.g., seed per request).
|
||||
parser.add_argument('--seed',
|
||||
type=int,
|
||||
default=EngineArgs.seed,
|
||||
@ -279,13 +286,12 @@ class EngineArgs:
|
||||
) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig,
|
||||
DeviceConfig, Optional[LoRAConfig]]:
|
||||
device_config = DeviceConfig(self.device)
|
||||
model_config = ModelConfig(self.model, self.tokenizer,
|
||||
self.tokenizer_mode, self.trust_remote_code,
|
||||
self.download_dir, self.load_format,
|
||||
self.dtype, self.seed, self.revision,
|
||||
self.tokenizer_revision, self.max_model_len,
|
||||
self.quantization, self.enforce_eager,
|
||||
self.max_context_len_to_capture)
|
||||
model_config = ModelConfig(
|
||||
self.model, self.tokenizer, self.tokenizer_mode,
|
||||
self.trust_remote_code, self.download_dir, self.load_format,
|
||||
self.dtype, self.seed, self.revision, self.code_revision,
|
||||
self.tokenizer_revision, self.max_model_len, self.quantization,
|
||||
self.enforce_eager, self.max_context_len_to_capture)
|
||||
cache_config = CacheConfig(self.block_size,
|
||||
self.gpu_memory_utilization,
|
||||
self.swap_space, self.kv_cache_dtype,
|
||||
|
@ -464,6 +464,9 @@ class LLMEngine:
|
||||
prompt_token_ids[:prefix_pos], lora_request.lora_int_id
|
||||
if lora_request else 0) if prefix_pos is not None else None
|
||||
|
||||
# Defensive copy of SamplingParams, which are used by the sampler
|
||||
sampling_params = copy.deepcopy(sampling_params)
|
||||
|
||||
# Create the sequence group.
|
||||
seq_group = SequenceGroup(request_id, [seq], sampling_params,
|
||||
arrival_time, lora_request, prefix)
|
||||
@ -725,6 +728,7 @@ class LLMEngine:
|
||||
def _process_model_outputs(
|
||||
self, output: SamplerOutput,
|
||||
scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]:
|
||||
now = time.time()
|
||||
# Update the scheduled sequence groups with the model outputs.
|
||||
scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups
|
||||
for seq_group, outputs in zip(scheduled_seq_groups, output):
|
||||
@ -736,6 +740,7 @@ class LLMEngine:
|
||||
# Create the outputs.
|
||||
request_outputs: List[RequestOutput] = []
|
||||
for seq_group in scheduled_seq_groups:
|
||||
seq_group.maybe_set_first_token_time(now)
|
||||
request_output = RequestOutput.from_seq_group(seq_group)
|
||||
request_outputs.append(request_output)
|
||||
for seq_group in scheduler_outputs.ignored_seq_groups:
|
||||
@ -864,18 +869,21 @@ class LLMEngine:
|
||||
|
||||
# Number of Tokens.
|
||||
if prompt_run:
|
||||
num_prompt_tokens = scheduler_outputs.num_batched_tokens
|
||||
num_prompt_tokens = sum(
|
||||
len(seq_group.prompt_token_ids)
|
||||
for seq_group in scheduler_outputs.scheduled_seq_groups)
|
||||
else:
|
||||
num_generation_tokens = scheduler_outputs.num_batched_tokens
|
||||
|
||||
# Latency Timings.
|
||||
time_last_iters = []
|
||||
for seq_group in scheduler_outputs.scheduled_seq_groups:
|
||||
# Time since last token. (n.b. updates seq_group.last_token_time)
|
||||
# Time since last token. (n.b. updates seq_group.metrics.last_token_time)
|
||||
time_last_iters.append(seq_group.get_last_latency(now))
|
||||
# Time since arrival for all finished requests.
|
||||
if seq_group.is_finished():
|
||||
time_e2e_requests.append(now - seq_group.arrival_time)
|
||||
time_e2e_requests.append(now -
|
||||
seq_group.metrics.arrival_time)
|
||||
|
||||
time_to_first_tokens = time_last_iters if prompt_run else []
|
||||
time_per_output_tokens = [] if prompt_run else time_last_iters
|
||||
|
@ -1,3 +1,9 @@
|
||||
"""
|
||||
NOTE: This API server is used only for demonstrating usage of AsyncEngine and simple performance benchmarks.
|
||||
It is not intended for production use. For production use, we recommend using our OpenAI compatible server.
|
||||
We are also not going to accept PRs modifying this file, please change `vllm/entrypoints/openai/api_server.py` instead.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from typing import AsyncGenerator
|
||||
|
@ -23,6 +23,7 @@ from vllm.entrypoints.openai.protocol import CompletionRequest, ChatCompletionRe
|
||||
from vllm.logger import init_logger
|
||||
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
|
||||
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
|
||||
from vllm.entrypoints.openai.serving_engine import LoRA
|
||||
|
||||
TIMEOUT_KEEP_ALIVE = 5 # seconds
|
||||
|
||||
@ -48,6 +49,16 @@ async def lifespan(app: fastapi.FastAPI):
|
||||
app = fastapi.FastAPI(lifespan=lifespan)
|
||||
|
||||
|
||||
class LoRAParserAction(argparse.Action):
|
||||
|
||||
def __call__(self, parser, namespace, values, option_string=None):
|
||||
lora_list = []
|
||||
for item in values:
|
||||
name, path = item.split('=')
|
||||
lora_list.append(LoRA(name, path))
|
||||
setattr(namespace, self.dest, lora_list)
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="vLLM OpenAI-Compatible RESTful API server.")
|
||||
@ -81,6 +92,15 @@ def parse_args():
|
||||
help="The model name used in the API. If not "
|
||||
"specified, the model name will be the same as "
|
||||
"the huggingface name.")
|
||||
parser.add_argument(
|
||||
"--lora-modules",
|
||||
type=str,
|
||||
default=None,
|
||||
nargs='+',
|
||||
action=LoRAParserAction,
|
||||
help=
|
||||
"LoRA module configurations in the format name=path. Multiple modules can be specified."
|
||||
)
|
||||
parser.add_argument("--chat-template",
|
||||
type=str,
|
||||
default=None,
|
||||
@ -217,8 +237,10 @@ if __name__ == "__main__":
|
||||
engine = AsyncLLMEngine.from_engine_args(engine_args)
|
||||
openai_serving_chat = OpenAIServingChat(engine, served_model,
|
||||
args.response_role,
|
||||
args.lora_modules,
|
||||
args.chat_template)
|
||||
openai_serving_completion = OpenAIServingCompletion(engine, served_model)
|
||||
openai_serving_completion = OpenAIServingCompletion(
|
||||
engine, served_model, args.lora_modules)
|
||||
|
||||
# Register labels for metrics
|
||||
add_global_metrics_labels(model_name=engine_args.model)
|
||||
|
@ -60,6 +60,7 @@ class ChatCompletionRequest(BaseModel):
|
||||
top_p: Optional[float] = 1.0
|
||||
n: Optional[int] = 1
|
||||
max_tokens: Optional[int] = None
|
||||
seed: Optional[int] = None
|
||||
stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
|
||||
stream: Optional[bool] = False
|
||||
presence_penalty: Optional[float] = 0.0
|
||||
@ -90,6 +91,7 @@ class ChatCompletionRequest(BaseModel):
|
||||
temperature=self.temperature,
|
||||
top_p=self.top_p,
|
||||
min_p=self.min_p,
|
||||
seed=self.seed,
|
||||
stop=self.stop,
|
||||
stop_token_ids=self.stop_token_ids,
|
||||
max_tokens=self.max_tokens,
|
||||
@ -117,6 +119,7 @@ class CompletionRequest(BaseModel):
|
||||
logprobs: Optional[int] = None
|
||||
echo: Optional[bool] = False
|
||||
stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
|
||||
seed: Optional[int] = None
|
||||
presence_penalty: Optional[float] = 0.0
|
||||
frequency_penalty: Optional[float] = 0.0
|
||||
best_of: Optional[int] = None
|
||||
@ -147,6 +150,7 @@ class CompletionRequest(BaseModel):
|
||||
top_p=self.top_p,
|
||||
top_k=self.top_k,
|
||||
min_p=self.min_p,
|
||||
seed=self.seed,
|
||||
stop=self.stop,
|
||||
stop_token_ids=self.stop_token_ids,
|
||||
ignore_eos=self.ignore_eos,
|
||||
|
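A hedged illustration of the new request field (not part of the diff): with seed now accepted by both
CompletionRequest and ChatCompletionRequest, any HTTP client can ask the OpenAI-compatible server for reproducible
sampling. The host, port, and model name below are assumptions.

# Sketch: pass the new `seed` field in a /v1/completions request body.
# Assumes an OpenAI-compatible vLLM server on localhost:8000 serving facebook/opt-125m.
import requests

payload = {
    "model": "facebook/opt-125m",
    "prompt": "Hello, my name is",
    "max_tokens": 5,
    "temperature": 1.0,
    "seed": 42,  # new field introduced by this change
}
resp = requests.post("http://localhost:8000/v1/completions", json=payload, timeout=60)
print(resp.json()["choices"][0]["text"])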
@ -1,7 +1,7 @@
|
||||
import time
|
||||
import codecs
|
||||
from fastapi import Request
|
||||
from typing import AsyncGenerator, AsyncIterator, Union
|
||||
from typing import AsyncGenerator, AsyncIterator, Optional, List, Union
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import random_uuid
|
||||
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
||||
@ -11,7 +11,7 @@ from vllm.entrypoints.openai.protocol import (
|
||||
ChatCompletionStreamResponse, ChatMessage, DeltaMessage, ErrorResponse,
|
||||
UsageInfo)
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing, LoRA
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@ -22,8 +22,11 @@ class OpenAIServingChat(OpenAIServing):
|
||||
engine: AsyncLLMEngine,
|
||||
served_model: str,
|
||||
response_role: str,
|
||||
lora_modules: Optional[List[LoRA]] = None,
|
||||
chat_template=None):
|
||||
super().__init__(engine=engine, served_model=served_model)
|
||||
super().__init__(engine=engine,
|
||||
served_model=served_model,
|
||||
lora_modules=lora_modules)
|
||||
self.response_role = response_role
|
||||
self._load_chat_template(chat_template)
|
||||
|
||||
@ -64,11 +67,13 @@ class OpenAIServingChat(OpenAIServing):
|
||||
token_ids = self._validate_prompt_and_tokenize(request,
|
||||
prompt=prompt)
|
||||
sampling_params = request.to_sampling_params()
|
||||
lora_request = self._maybe_get_lora(request)
|
||||
except ValueError as e:
|
||||
return self.create_error_response(str(e))
|
||||
|
||||
result_generator = self.engine.generate(prompt, sampling_params,
|
||||
request_id, token_ids)
|
||||
request_id, token_ids,
|
||||
lora_request)
|
||||
# Streaming response
|
||||
if request.stream:
|
||||
return self.chat_completion_stream_generator(
|
||||
|
@ -15,7 +15,7 @@ from .protocol import (
|
||||
UsageInfo,
|
||||
)
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing, LoRA
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@ -249,8 +249,13 @@ def merge_async_iterators(*iterators):
|
||||
|
||||
class OpenAIServingCompletion(OpenAIServing):
|
||||
|
||||
def __init__(self, engine: AsyncLLMEngine, served_model: str):
|
||||
super().__init__(engine=engine, served_model=served_model)
|
||||
def __init__(self,
|
||||
engine: AsyncLLMEngine,
|
||||
served_model: str,
|
||||
lora_modules: Optional[List[LoRA]] = None):
|
||||
super().__init__(engine=engine,
|
||||
served_model=served_model,
|
||||
lora_modules=lora_modules)
|
||||
|
||||
async def create_completion(self, request: CompletionRequest,
|
||||
raw_request: Request):
|
||||
@ -284,6 +289,7 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
generators = []
|
||||
try:
|
||||
sampling_params = request.to_sampling_params()
|
||||
lora_request = self._maybe_get_lora(request)
|
||||
prompt_is_tokens, prompts = parse_prompt_format(request.prompt)
|
||||
|
||||
for i, prompt in enumerate(prompts):
|
||||
@ -298,7 +304,8 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
self.engine.generate(None,
|
||||
sampling_params,
|
||||
f"{request_id}-{i}",
|
||||
prompt_token_ids=input_ids))
|
||||
prompt_token_ids=input_ids,
|
||||
lora_request=lora_request))
|
||||
except ValueError as e:
|
||||
return self.create_error_response(str(e))
|
||||
|
||||
|
@ -1,4 +1,5 @@
|
||||
import asyncio
|
||||
from dataclasses import dataclass
|
||||
from http import HTTPStatus
|
||||
from typing import Dict, List, Optional, Union
|
||||
from vllm.logger import init_logger
|
||||
@ -9,15 +10,35 @@ from vllm.entrypoints.openai.protocol import (CompletionRequest,
|
||||
ErrorResponse, LogProbs,
|
||||
ModelCard, ModelList,
|
||||
ModelPermission)
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class LoRA:
|
||||
name: str
|
||||
local_path: str
|
||||
|
||||
|
||||
class OpenAIServing:
|
||||
|
||||
def __init__(self, engine: AsyncLLMEngine, served_model: str):
|
||||
def __init__(self,
|
||||
engine: AsyncLLMEngine,
|
||||
served_model: str,
|
||||
lora_modules=Optional[List[LoRA]]):
|
||||
self.engine = engine
|
||||
self.served_model = served_model
|
||||
if lora_modules is None:
|
||||
self.lora_requests = []
|
||||
else:
|
||||
self.lora_requests = [
|
||||
LoRARequest(
|
||||
lora_name=lora.name,
|
||||
lora_int_id=i,
|
||||
lora_local_path=lora.local_path,
|
||||
) for i, lora in enumerate(lora_modules, start=1)
|
||||
]
|
||||
|
||||
self.max_model_len = 0
|
||||
self.tokenizer = None
|
||||
@ -50,6 +71,13 @@ class OpenAIServing:
|
||||
root=self.served_model,
|
||||
permission=[ModelPermission()])
|
||||
]
|
||||
lora_cards = [
|
||||
ModelCard(id=lora.lora_name,
|
||||
root=self.served_model,
|
||||
permission=[ModelPermission()])
|
||||
for lora in self.lora_requests
|
||||
]
|
||||
model_cards.extend(lora_cards)
|
||||
return ModelList(data=model_cards)
|
||||
|
||||
def _create_logprobs(
|
||||
@ -99,11 +127,22 @@ class OpenAIServing:
|
||||
async def _check_model(self, request) -> Optional[ErrorResponse]:
|
||||
if request.model == self.served_model:
|
||||
return
|
||||
if request.model in [lora.lora_name for lora in self.lora_requests]:
|
||||
return
|
||||
return self.create_error_response(
|
||||
message=f"The model `{request.model}` does not exist.",
|
||||
err_type="NotFoundError",
|
||||
status_code=HTTPStatus.NOT_FOUND)
|
||||
|
||||
def _maybe_get_lora(self, request) -> Optional[LoRARequest]:
|
||||
if request.model == self.served_model:
|
||||
return
|
||||
for lora in self.lora_requests:
|
||||
if request.model == lora.lora_name:
|
||||
return lora
|
||||
# if _check_model has been called earlier, this will be unreachable
|
||||
raise ValueError("The model `{request.model}` does not exist.")
|
||||
|
||||
def _validate_prompt_and_tokenize(
|
||||
self,
|
||||
request: Union[ChatCompletionRequest, CompletionRequest],
|
||||
|
@ -5,6 +5,8 @@ import logging
|
||||
import sys
|
||||
import os
|
||||
|
||||
VLLM_CONFIGURE_LOGGING = int(os.getenv("VLLM_CONFIGURE_LOGGING", "1"))
|
||||
|
||||
_FORMAT = "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s"
|
||||
_DATE_FORMAT = "%m-%d %H:%M:%S"
|
||||
|
||||
@ -45,13 +47,15 @@ def _setup_logger():
|
||||
# The logger is initialized when the module is imported.
|
||||
# This is thread-safe as the module is only imported once,
|
||||
# guaranteed by the Python GIL.
|
||||
_setup_logger()
|
||||
if VLLM_CONFIGURE_LOGGING:
|
||||
_setup_logger()
|
||||
|
||||
|
||||
def init_logger(name: str):
|
||||
# Use the same settings as above for root logger
|
||||
logger = logging.getLogger(name)
|
||||
logger.setLevel(os.getenv("LOG_LEVEL", "DEBUG"))
|
||||
logger.addHandler(_default_handler)
|
||||
logger.propagate = False
|
||||
if VLLM_CONFIGURE_LOGGING:
|
||||
logger.addHandler(_default_handler)
|
||||
logger.propagate = False
|
||||
return logger
|
||||
|
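A hedged sketch of how the new VLLM_CONFIGURE_LOGGING switch might be used (not taken from the diff): with the
variable set to 0 before vllm is imported, init_logger attaches no default handler, so the application can install
its own.

# Sketch: opt out of vLLM's default logging setup and attach a custom handler.
# VLLM_CONFIGURE_LOGGING must be set before the vllm logger module is imported.
import os
os.environ["VLLM_CONFIGURE_LOGGING"] = "0"

import logging
from vllm.logger import init_logger

handler = logging.StreamHandler()
handler.setFormatter(
    logging.Formatter("%(levelname)s %(asctime)s %(name)s: %(message)s"))
logging.getLogger("vllm").addHandler(handler)

logger = init_logger("vllm.example")
logger.info("custom logging configuration in effect")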
@@ -342,7 +342,9 @@ def _beam_search_sample(

 def _multinomial(
     probs: torch.Tensor,
     num_samples: int,
-):
+    seq_groups: Optional[List[Tuple[List[int], SamplingParams]]] = None,
+    generators: Optional[List[torch.Generator]] = None,
+) -> torch.Tensor:
     if num_samples > 1:
         # This is equivalent to torch.repeat_interleaved (which also
         # forces a GPU<->CPU sync).
@@ -352,7 +354,15 @@ def _multinomial(
         probs = probs[:, None, :].expand(probs.shape[0], num_samples,
                                          probs.shape[1]).contiguous().view(
                                              -1, probs.shape[1])
-    q = torch.empty_like(probs).exponential_(1)
+    q = torch.empty_like(probs)
+    if seq_groups is None:
+        q.exponential_()
+    else:
+        sample_idx = 0
+        for (seq_ids, _), generator in zip(seq_groups, generators):
+            next_sample_idx = sample_idx + len(seq_ids) * num_samples
+            q[sample_idx:next_sample_idx].exponential_(generator=generator)
+            sample_idx = next_sample_idx
     return probs.div_(q).argmax(dim=1).view(-1, num_samples)


@@ -370,6 +380,7 @@ def _sample(
     sample_results_dict: Dict[int, Tuple[List[int], List[int]]] = {}
     sample_metadata = {}
+    multinomial_samples = {}

     # Counterintiutively, having two loops here is actually faster.
     # The first loop can run without waiting on GPU<->CPU sync.
@@ -385,14 +396,18 @@ def _sample(
                                                is_prompts, sample_indices)
         if sampling_type == SamplingType.GREEDY:
             greedy_samples = torch.argmax(logprobs[sample_indices], dim=-1)
-        elif sampling_type == SamplingType.RANDOM:
+        elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED):
             max_best_of = 1
             for seq_group, is_prompt in zip(seq_groups, is_prompts):
                 if is_prompt:
                     _, sampling_params = seq_group
                     max_best_of = max(max_best_of, sampling_params.best_of)
-            multinomial_samples = _multinomial(probs[sample_indices],
-                                               max_best_of)
+            seeded_args = {} if sampling_type == SamplingType.RANDOM else {
+                "seq_groups": seq_groups,
+                "generators": sampling_metadata.generators,
+            }
+            multinomial_samples[sampling_type] = _multinomial(
+                probs[sample_indices], max_best_of, **seeded_args)
         elif sampling_type == SamplingType.BEAM:
             beam_search_logprobs = logprobs[sample_indices]
         else:
@@ -407,9 +422,9 @@ def _sample(
                                                          sampling_type]
         if sampling_type == SamplingType.GREEDY:
             sample_results = _greedy_sample(seq_groups, greedy_samples)
-        elif sampling_type == SamplingType.RANDOM:
+        elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED):
             sample_results = _random_sample(seq_groups, is_prompts,
-                                            multinomial_samples)
+                                            multinomial_samples[sampling_type])
         elif sampling_type == SamplingType.BEAM:
             sample_results = _beam_search_sample(seq_groups, is_prompts,
                                                  sampling_metadata.seq_data,
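The _multinomial change above samples by dividing the probabilities by Exponential(1) noise and taking an argmax, optionally drawing that noise from per-sequence-group generators so seeded requests stay reproducible. A self-contained sketch of the same trick, simplified to a single generator; this is illustrative code, not the vLLM implementation:

import torch

def seeded_multinomial(probs: torch.Tensor, num_samples: int,
                       generator: torch.Generator) -> torch.Tensor:
    # Give every requested sample its own row, then race exponential noise:
    # argmax(probs / q) with q ~ Exp(1) picks index i with probability probs[i].
    expanded = probs[:, None, :].expand(-1, num_samples, -1).reshape(
        -1, probs.shape[-1])
    q = torch.empty_like(expanded).exponential_(generator=generator)
    return expanded.div(q).argmax(dim=-1).view(-1, num_samples)

gen = torch.Generator().manual_seed(42)
probs = torch.tensor([[0.10, 0.20, 0.70],
                      [0.50, 0.25, 0.25]])
print(seeded_multinomial(probs, num_samples=4, generator=gen))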
@@ -20,6 +20,7 @@ _MODELS = {
     "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"),
     "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"),
     "FalconForCausalLM": ("falcon", "FalconForCausalLM"),
+    "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"),
     "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"),
     "GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"),
     "GPTJForCausalLM": ("gpt_j", "GPTJForCausalLM"),
@@ -35,6 +36,7 @@ _MODELS = {
     # transformers's mpt class has lower case
     "MptForCausalLM": ("mpt", "MPTForCausalLM"),
     "MPTForCausalLM": ("mpt", "MPTForCausalLM"),
+    "OLMoForCausalLM": ("olmo", "OLMoForCausalLM"),
     "OPTForCausalLM": ("opt", "OPTForCausalLM"),
     "PhiForCausalLM": ("phi", "PhiForCausalLM"),
     "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
vllm/model_executor/models/gemma.py (new file, 333 lines)
@@ -0,0 +1,333 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023 The vLLM team.
|
||||
# Copyright (c) Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Inference-only Gemma model compatible with HuggingFace weights."""
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from transformers import GemmaConfig
|
||||
|
||||
from vllm.model_executor.input_metadata import InputMetadata
|
||||
from vllm.model_executor.layers.attention import PagedAttention
|
||||
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
||||
LinearMethodBase,
|
||||
QKVParallelLinear,
|
||||
RowParallelLinear)
|
||||
from vllm.model_executor.layers.rotary_embedding import get_rope
|
||||
from vllm.model_executor.layers.sampler import Sampler
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
||||
VocabParallelEmbedding)
|
||||
from vllm.model_executor.parallel_utils.parallel_state import (
|
||||
get_tensor_model_parallel_world_size)
|
||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||
from vllm.model_executor.weight_utils import (default_weight_loader,
|
||||
hf_model_weights_iterator)
|
||||
from vllm.sequence import SamplerOutput
|
||||
|
||||
KVCache = Tuple[torch.Tensor, torch.Tensor]
|
||||
|
||||
|
||||
class GemmaRMSNorm(nn.Module):
|
||||
|
||||
def __init__(self, dim: int, eps: float = 1e-6):
|
||||
super().__init__()
|
||||
self.eps = eps
|
||||
self.weight = nn.Parameter(torch.zeros(dim))
|
||||
|
||||
def _norm(self, x):
|
||||
return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
|
||||
|
||||
def forward(self, x):
|
||||
output = self._norm(x.float()).type_as(x)
|
||||
return output * (1 + self.weight)
|
||||
|
||||
|
||||
class GemmaMLP(nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
hidden_size: int,
|
||||
intermediate_size: int,
|
||||
linear_method: Optional[LinearMethodBase] = None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self.gate_proj = ColumnParallelLinear(hidden_size,
|
||||
intermediate_size,
|
||||
bias=False,
|
||||
linear_method=linear_method)
|
||||
self.up_proj = ColumnParallelLinear(hidden_size,
|
||||
intermediate_size,
|
||||
bias=False,
|
||||
linear_method=linear_method)
|
||||
self.down_proj = RowParallelLinear(intermediate_size,
|
||||
hidden_size,
|
||||
bias=False,
|
||||
linear_method=linear_method)
|
||||
self.act_fn = nn.GELU()
|
||||
|
||||
def forward(self, x):
|
||||
gate, _ = self.gate_proj(x)
|
||||
gate = self.act_fn(gate)
|
||||
up, _ = self.up_proj(x)
|
||||
fuse = gate * up
|
||||
outputs, _ = self.down_proj(fuse)
|
||||
return outputs
|
||||
|
||||
|
||||
class GemmaAttention(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
hidden_size: int,
|
||||
num_heads: int,
|
||||
num_kv_heads: int,
|
||||
head_dim: int,
|
||||
max_position_embeddings: int = 8192,
|
||||
rope_theta: float = 10000,
|
||||
linear_method: Optional[LinearMethodBase] = None) -> None:
|
||||
super().__init__()
|
||||
self.hidden_size = hidden_size
|
||||
tp_size = get_tensor_model_parallel_world_size()
|
||||
self.total_num_heads = num_heads
|
||||
assert self.total_num_heads % tp_size == 0
|
||||
self.num_heads = self.total_num_heads // tp_size
|
||||
self.total_num_kv_heads = num_kv_heads
|
||||
if self.total_num_kv_heads >= tp_size:
|
||||
# Number of KV heads is greater than TP size, so we partition
|
||||
# the KV heads across multiple tensor parallel GPUs.
|
||||
assert self.total_num_kv_heads % tp_size == 0
|
||||
else:
|
||||
# Number of KV heads is less than TP size, so we replicate
|
||||
# the KV heads across multiple tensor parallel GPUs.
|
||||
assert tp_size % self.total_num_kv_heads == 0
|
||||
self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
|
||||
self.head_dim = head_dim
|
||||
self.q_size = self.num_heads * self.head_dim
|
||||
self.kv_size = self.num_kv_heads * self.head_dim
|
||||
self.scaling = self.head_dim**-0.5
|
||||
self.rope_theta = rope_theta
|
||||
|
||||
self.qkv_proj = QKVParallelLinear(
|
||||
hidden_size,
|
||||
self.head_dim,
|
||||
self.total_num_heads,
|
||||
self.total_num_kv_heads,
|
||||
bias=False,
|
||||
linear_method=linear_method,
|
||||
)
|
||||
self.o_proj = RowParallelLinear(
|
||||
self.total_num_heads * self.head_dim,
|
||||
hidden_size,
|
||||
bias=False,
|
||||
linear_method=linear_method,
|
||||
)
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
base=self.rope_theta,
|
||||
is_neox_style=True,
|
||||
)
|
||||
self.attn = PagedAttention(self.num_heads,
|
||||
self.head_dim,
|
||||
self.scaling,
|
||||
num_kv_heads=self.num_kv_heads)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
kv_cache: KVCache,
|
||||
input_metadata: InputMetadata,
|
||||
) -> torch.Tensor:
|
||||
qkv, _ = self.qkv_proj(hidden_states)
|
||||
q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
|
||||
q, k = self.rotary_emb(positions, q, k)
|
||||
k_cache, v_cache = kv_cache
|
||||
attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata)
|
||||
output, _ = self.o_proj(attn_output)
|
||||
return output
|
||||
|
||||
|
||||
class GemmaDecoderLayer(nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: GemmaConfig,
|
||||
linear_method: Optional[LinearMethodBase] = None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self.hidden_size = config.hidden_size
|
||||
self.self_attn = GemmaAttention(
|
||||
hidden_size=self.hidden_size,
|
||||
num_heads=config.num_attention_heads,
|
||||
num_kv_heads=config.num_key_value_heads,
|
||||
head_dim=config.head_dim,
|
||||
max_position_embeddings=config.max_position_embeddings,
|
||||
rope_theta=config.rope_theta,
|
||||
linear_method=linear_method,
|
||||
)
|
||||
self.mlp = GemmaMLP(
|
||||
hidden_size=self.hidden_size,
|
||||
intermediate_size=config.intermediate_size,
|
||||
linear_method=linear_method,
|
||||
)
|
||||
self.input_layernorm = GemmaRMSNorm(config.hidden_size,
|
||||
eps=config.rms_norm_eps)
|
||||
self.post_attention_layernorm = GemmaRMSNorm(config.hidden_size,
|
||||
eps=config.rms_norm_eps)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
kv_cache: KVCache,
|
||||
input_metadata: InputMetadata,
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
# Self Attention
|
||||
residual = hidden_states
|
||||
hidden_states = self.input_layernorm(hidden_states)
|
||||
hidden_states = self.self_attn(
|
||||
positions=positions,
|
||||
hidden_states=hidden_states,
|
||||
kv_cache=kv_cache,
|
||||
input_metadata=input_metadata,
|
||||
)
|
||||
hidden_states = residual + hidden_states
|
||||
|
||||
# Fully Connected
|
||||
residual = hidden_states
|
||||
hidden_states = self.post_attention_layernorm(hidden_states)
|
||||
hidden_states = self.mlp(hidden_states)
|
||||
hidden_states = residual + hidden_states
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
class GemmaModel(nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: GemmaConfig,
|
||||
linear_method: Optional[LinearMethodBase] = None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self.config = config
|
||||
|
||||
self.embed_tokens = VocabParallelEmbedding(
|
||||
config.vocab_size,
|
||||
config.hidden_size,
|
||||
)
|
||||
self.layers = nn.ModuleList([
|
||||
GemmaDecoderLayer(config, linear_method)
|
||||
for _ in range(config.num_hidden_layers)
|
||||
])
|
||||
self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
kv_caches: List[KVCache],
|
||||
input_metadata: InputMetadata,
|
||||
) -> torch.Tensor:
|
||||
hidden_states = self.embed_tokens(input_ids)
|
||||
# Normalize the embedding by sqrt(hidden_size)
|
||||
hidden_states = hidden_states * (self.config.hidden_size**0.5)
|
||||
|
||||
for i in range(len(self.layers)):
|
||||
layer = self.layers[i]
|
||||
hidden_states = layer(
|
||||
positions,
|
||||
hidden_states,
|
||||
kv_caches[i],
|
||||
input_metadata,
|
||||
)
|
||||
hidden_states = self.norm(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
|
||||
class GemmaForCausalLM(nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: GemmaConfig,
|
||||
linear_method: Optional[LinearMethodBase] = None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self.config = config
|
||||
self.linear_method = linear_method
|
||||
self.model = GemmaModel(config, linear_method)
|
||||
self.sampler = Sampler(config.vocab_size)
|
||||
|
||||
@torch.no_grad()
|
||||
def forward(
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
kv_caches: List[KVCache],
|
||||
input_metadata: InputMetadata,
|
||||
) -> torch.Tensor:
|
||||
hidden_states = self.model(input_ids, positions, kv_caches,
|
||||
input_metadata)
|
||||
return hidden_states
|
||||
|
||||
def sample(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
sampling_metadata: SamplingMetadata,
|
||||
) -> Optional[SamplerOutput]:
|
||||
next_tokens = self.sampler(self.model.embed_tokens.weight,
|
||||
hidden_states, sampling_metadata)
|
||||
return next_tokens
|
||||
|
||||
def load_weights(self,
|
||||
model_name_or_path: str,
|
||||
cache_dir: Optional[str] = None,
|
||||
load_format: str = "auto",
|
||||
revision: Optional[str] = None):
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("qkv_proj", "q_proj", "q"),
|
||||
("qkv_proj", "k_proj", "k"),
|
||||
("qkv_proj", "v_proj", "v"),
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params = set()
|
||||
for name, loaded_weight in hf_model_weights_iterator(
|
||||
model_name_or_path, cache_dir, load_format, revision):
|
||||
for (param_name, shard_name, shard_id) in stacked_params_mapping:
|
||||
if shard_name not in name:
|
||||
continue
|
||||
name = name.replace(shard_name, param_name)
|
||||
param = params_dict[name]
|
||||
weight_loader = param.weight_loader
|
||||
weight_loader(param, loaded_weight, shard_id)
|
||||
break
|
||||
else:
|
||||
# Skip loading extra layer for lora models.
|
||||
if "lm_head" in name:
|
||||
continue
|
||||
param = params_dict[name]
|
||||
weight_loader = getattr(param, "weight_loader",
|
||||
default_weight_loader)
|
||||
weight_loader(param, loaded_weight)
|
||||
loaded_params.add(name)
|
||||
unloaded_params = params_dict.keys() - loaded_params
|
||||
if unloaded_params:
|
||||
raise RuntimeError(
|
||||
f"Some weights are not initialized from checkpoints: {unloaded_params}"
|
||||
)
|
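With GemmaForCausalLM registered above, the model should be loadable through the usual offline-inference entry point. A hedged usage sketch; the model name, prompt, and sampling values are illustrative:

from vllm import LLM, SamplingParams

# Assumes the Gemma weights are reachable from the Hugging Face Hub.
llm = LLM(model="google/gemma-2b")
params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=64)
outputs = llm.generate(["The capital of France is"], params)
print(outputs[0].outputs[0].text)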
vllm/model_executor/models/olmo.py (new file, 378 lines)
@@ -0,0 +1,378 @@
|
||||
# coding=utf-8
|
||||
# Adapted from
|
||||
# https://github.com/allenai/OLMo/blob/v0.2.4/olmo/model.py and
|
||||
# https://github.com/allenai/OLMo/blob/v0.2.4/hf_olmo/modeling_olmo.py
|
||||
# Copyright 2023 The vLLM team.
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT license.
|
||||
#
|
||||
# BSD 3-Clause License
|
||||
#
|
||||
# Copyright (c) 2022, Tri Dao, trid@cs.stanford.edu.
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# * Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
"""Inference-only OLMo model compatible with HuggingFace weights."""
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch import nn
|
||||
|
||||
from vllm.model_executor.input_metadata import InputMetadata
|
||||
from vllm.model_executor.layers.attention import PagedAttention
|
||||
from vllm.model_executor.layers.linear import (
|
||||
ColumnParallelLinear,
|
||||
LinearMethodBase,
|
||||
QKVParallelLinear,
|
||||
RowParallelLinear,
|
||||
)
|
||||
from vllm.model_executor.layers.rotary_embedding import get_rope
|
||||
from vllm.model_executor.layers.sampler import Sampler
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
|
||||
from vllm.model_executor.parallel_utils.parallel_state import (
|
||||
get_tensor_model_parallel_world_size, )
|
||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||
from vllm.model_executor.weight_utils import (
|
||||
default_weight_loader,
|
||||
hf_model_weights_iterator,
|
||||
)
|
||||
from vllm.sequence import SamplerOutput
|
||||
from vllm.transformers_utils.configs.olmo import OLMoConfig
|
||||
|
||||
KVCache = Tuple[torch.Tensor, torch.Tensor]
|
||||
|
||||
|
||||
class SwiGLU(nn.Module):
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
x, gate = x.chunk(2, dim=-1)
|
||||
return F.silu(gate) * x
|
||||
|
||||
@property
|
||||
def output_multiplier(self) -> float:
|
||||
return 0.5
|
||||
|
||||
|
||||
class OlmoAttention(nn.Module):
|
||||
"""
|
||||
This is the attention block where the output is computed as ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
|
||||
(plus another skip connection).
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: OLMoConfig,
|
||||
linear_method: Optional[LinearMethodBase] = None,
|
||||
):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
self.hidden_size = config.d_model
|
||||
assert config.d_model % config.n_heads == 0
|
||||
tensor_model_parallel_world_size = get_tensor_model_parallel_world_size(
|
||||
)
|
||||
self.total_num_heads = self.config.n_heads
|
||||
assert self.total_num_heads % tensor_model_parallel_world_size == 0
|
||||
self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
|
||||
self.head_dim = self.hidden_size // self.total_num_heads
|
||||
|
||||
# Layer norms.
|
||||
self.attn_norm = nn.LayerNorm(config.d_model,
|
||||
elementwise_affine=False,
|
||||
bias=False)
|
||||
# Attention input projection. Projects x -> (q, k, v)
|
||||
self.att_proj = QKVParallelLinear(
|
||||
config.d_model,
|
||||
self.head_dim,
|
||||
self.total_num_heads,
|
||||
bias=config.include_bias,
|
||||
linear_method=linear_method,
|
||||
)
|
||||
|
||||
# Rotary embeddings.
|
||||
if self.config.rope:
|
||||
rope_theta = getattr(config, "rope_theta", 10000)
|
||||
max_position_embeddings = getattr(config,
|
||||
"max_position_embeddings", 8192)
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
base=rope_theta,
|
||||
)
|
||||
self.scaling = self.head_dim**-0.5
|
||||
self.attn = PagedAttention(self.num_heads,
|
||||
self.head_dim,
|
||||
scale=self.scaling)
|
||||
|
||||
# Attention output projection.
|
||||
self.attn_out = RowParallelLinear(
|
||||
config.d_model,
|
||||
config.d_model,
|
||||
bias=config.include_bias,
|
||||
linear_method=linear_method,
|
||||
)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
kv_cache: KVCache,
|
||||
input_metadata: InputMetadata,
|
||||
) -> torch.Tensor:
|
||||
hidden_states = self.attn_norm(hidden_states)
|
||||
qkv, _ = self.att_proj(hidden_states)
|
||||
q, k, v = qkv.chunk(chunks=3, dim=-1)
|
||||
if self.config.rope:
|
||||
q, k = self.rotary_emb(positions, q, k)
|
||||
k_cache, v_cache = kv_cache
|
||||
attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata)
|
||||
output, _ = self.attn_out(attn_output)
|
||||
return output
|
||||
|
||||
|
||||
class OlmoMLP(nn.Module):
|
||||
"""
|
||||
This is the MLP block where the output is computed as ``MLP(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
|
||||
(plus another skip connection).
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: OLMoConfig,
|
||||
linear_method: Optional[LinearMethodBase] = None,
|
||||
):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
self.hidden_size = (config.mlp_hidden_size if config.mlp_hidden_size
|
||||
is not None else config.mlp_ratio * config.d_model)
|
||||
|
||||
# Layer norms.
|
||||
self.ff_norm = nn.LayerNorm(config.d_model,
|
||||
elementwise_affine=False,
|
||||
bias=False)
|
||||
|
||||
# Feed-forward input projection.
|
||||
self.ff_proj = ColumnParallelLinear(
|
||||
config.d_model,
|
||||
self.hidden_size,
|
||||
bias=config.include_bias,
|
||||
linear_method=linear_method,
|
||||
)
|
||||
|
||||
# Activation function.
|
||||
# self.act = SiluAndMul()
|
||||
# self.act.output_multiplier = 0.5
|
||||
self.act = SwiGLU()
|
||||
assert (self.act.output_multiplier * self.hidden_size) % 1 == 0
|
||||
|
||||
# Feed-forward output projection.
|
||||
self.ff_out = RowParallelLinear(
|
||||
int(self.act.output_multiplier * self.hidden_size),
|
||||
config.d_model,
|
||||
bias=config.include_bias,
|
||||
linear_method=linear_method,
|
||||
)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
# Add feed-forward projection.
|
||||
# shape: (batch_size, seq_len, d_model)
|
||||
og_x = x
|
||||
x = self.ff_norm(x)
|
||||
x, _ = self.ff_proj(x)
|
||||
x = self.act(x)
|
||||
x, _ = self.ff_out(x)
|
||||
x = og_x + x
|
||||
|
||||
return x
|
||||
|
||||
|
||||
class OlmoBlock(nn.Module):
|
||||
"""
|
||||
This is a typical transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))``
|
||||
(plus another skip connection).
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
config: OLMoConfig,
|
||||
linear_method: Optional[LinearMethodBase] = None):
|
||||
super().__init__()
|
||||
# Attention block.
|
||||
self.attn = OlmoAttention(config, linear_method)
|
||||
|
||||
# MLP block.
|
||||
self.mlp = OlmoMLP(config, linear_method)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
kv_cache: KVCache,
|
||||
input_metadata: InputMetadata,
|
||||
) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
|
||||
# Attention block.
|
||||
og_x = hidden_states
|
||||
x = self.attn(positions, hidden_states, kv_cache, input_metadata)
|
||||
x = x + og_x
|
||||
|
||||
# MLP block.
|
||||
hidden_states = self.mlp(x)
|
||||
return hidden_states
|
||||
|
||||
|
||||
class OlmoModel(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
config: OLMoConfig,
|
||||
linear_method: Optional[LinearMethodBase] = None):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
|
||||
self.transformer = nn.ModuleDict(
|
||||
dict(
|
||||
wte=VocabParallelEmbedding(
|
||||
config.embedding_size or config.vocab_size,
|
||||
config.d_model,
|
||||
),
|
||||
ln_f=nn.LayerNorm(config.d_model,
|
||||
elementwise_affine=False,
|
||||
bias=False),
|
||||
))
|
||||
|
||||
blocks = [
|
||||
OlmoBlock(config, linear_method) for i in range(config.n_layers)
|
||||
]
|
||||
if self.config.block_group_size > 1:
|
||||
raise NotImplementedError("Block group size > 1 not supported yet")
|
||||
else:
|
||||
self.transformer.update({"blocks": nn.ModuleList(blocks)})
|
||||
|
||||
if not config.weight_tying:
|
||||
self.transformer.update({
|
||||
"ff_out":
|
||||
ColumnParallelLinear(
|
||||
config.d_model,
|
||||
config.embedding_size or config.vocab_size,
|
||||
bias=config.include_bias,
|
||||
linear_method=linear_method,
|
||||
)
|
||||
})
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
kv_caches: List[KVCache],
|
||||
input_metadata: InputMetadata,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
:param input_ids: A tensor of shape `(batch_size, seq_len)`.
|
||||
"""
|
||||
# Get embeddings of input.
|
||||
# shape: (batch_size, seq_len, d_model)
|
||||
x = self.transformer.wte(input_ids) # type: ignore
|
||||
|
||||
# Apply blocks one-by-one.
|
||||
for block_idx, block in enumerate(self.transformer.blocks):
|
||||
# shape: (batch_size, seq_len, d_model)
|
||||
x = block(
|
||||
positions,
|
||||
x,
|
||||
kv_caches[block_idx],
|
||||
input_metadata,
|
||||
)
|
||||
|
||||
# Apply final layer norm.
|
||||
# shape: (batch_size, seq_len or 1, d_model)
|
||||
x = self.transformer.ln_f(x) # type: ignore
|
||||
return x
|
||||
|
||||
|
||||
class OLMoForCausalLM(nn.Module):
|
||||
"""
|
||||
Extremely barebones HF model wrapper.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
config: OLMoConfig,
|
||||
linear_method: Optional[LinearMethodBase] = None):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
self.linear_method = linear_method
|
||||
self.model = OlmoModel(config, linear_method)
|
||||
self.lm_head_weight = (self.model.transformer.wte.weight
|
||||
if config.weight_tying else
|
||||
self.model.transformer.ff_out.weight)
|
||||
self.sampler = Sampler(config.vocab_size)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
kv_caches: List[KVCache],
|
||||
input_metadata: InputMetadata,
|
||||
) -> torch.Tensor:
|
||||
hidden_states = self.model(
|
||||
input_ids=input_ids,
|
||||
positions=positions,
|
||||
kv_caches=kv_caches,
|
||||
input_metadata=input_metadata,
|
||||
)
|
||||
return hidden_states
|
||||
|
||||
def sample(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
sampling_metadata: SamplingMetadata,
|
||||
) -> Optional[SamplerOutput]:
|
||||
next_tokens = self.sampler(self.lm_head_weight, hidden_states,
|
||||
sampling_metadata)
|
||||
return next_tokens
|
||||
|
||||
def load_weights(
|
||||
self,
|
||||
model_name_or_path: str,
|
||||
cache_dir: Optional[str] = None,
|
||||
load_format: str = "auto",
|
||||
revision: Optional[str] = None,
|
||||
):
|
||||
params_dict = dict(self.named_parameters(remove_duplicate=False))
|
||||
for name, loaded_weight in hf_model_weights_iterator(
|
||||
model_name_or_path, cache_dir, load_format, revision):
|
||||
# attention
|
||||
if ".att" in name:
|
||||
name = name.replace(".att", ".attn.att")
|
||||
# mlp
|
||||
if ".ff" in name and "transformer.ff_out" not in name:
|
||||
name = name.replace(".ff", ".mlp.ff")
|
||||
# there is no bias in olmo
|
||||
param = params_dict[name]
|
||||
weight_loader = getattr(param, "weight_loader",
|
||||
default_weight_loader)
|
||||
weight_loader(param, loaded_weight)
|
@@ -19,6 +19,7 @@ class SamplingMetadata:
         prompt_lens: Lengths of prompts.
         selected_token_indices: Token indices selected for sampling.
         categorized_sample_indices: SamplingType -> token indices to sample.
+        generators: List of torch.Generators to use for seeded sampling
         perform_sampling: Whether to perform sampling. This option is used to
             make the sampling only happens in the driver worker, and disable
             sampling in other worker processes.
@@ -31,6 +32,7 @@ class SamplingMetadata:
         prompt_lens: Optional[List[int]],
         selected_token_indices: torch.Tensor,
         categorized_sample_indices: Optional[Dict[SamplingType, torch.Tensor]],
+        generators: Optional[List[torch.Generator]] = None,
         perform_sampling: bool = True,
     ) -> None:
         self.seq_groups = seq_groups
@@ -38,6 +40,7 @@ class SamplingMetadata:
         self.prompt_lens = prompt_lens
         self.selected_token_indices = selected_token_indices
         self.categorized_sample_indices = categorized_sample_indices
+        self.generators = generators
         self.perform_sampling = perform_sampling

         self.num_prompts = len(prompt_lens) if prompt_lens is not None else 0
@@ -1,7 +1,8 @@
 from typing import List, Optional
+import time

 from vllm.sequence import (PromptLogprobs, SampleLogprobs, SequenceGroup,
-                           SequenceStatus)
+                           SequenceStatus, RequestMetrics)
 from vllm.lora.request import LoRARequest


@@ -60,6 +61,7 @@ class RequestOutput:
         prompt_logprobs: The log probabilities to return per prompt token.
         outputs: The output sequences of the request.
         finished: Whether the whole request is finished.
+        metrics: Metrics associated with the request.
         lora_request: The LoRA request that was used to generate the output.
     """

@@ -71,6 +73,7 @@ class RequestOutput:
         prompt_logprobs: Optional[PromptLogprobs],
         outputs: List[CompletionOutput],
         finished: bool,
+        metrics: Optional[RequestMetrics] = None,
         lora_request: Optional[LoRARequest] = None,
     ) -> None:
         self.request_id = request_id
@@ -79,6 +82,7 @@ class RequestOutput:
         self.prompt_logprobs = prompt_logprobs
         self.outputs = outputs
         self.finished = finished
+        self.metrics = metrics
         self.lora_request = lora_request

     @classmethod
@@ -115,12 +119,15 @@ class RequestOutput:
         prompt_token_ids = seq_group.prompt_token_ids
         prompt_logprobs = seq_group.prompt_logprobs
         finished = seq_group.is_finished()
+        finished_time = time.time() if finished else None
+        seq_group.set_finished_time(finished_time)
         return cls(seq_group.request_id,
                    prompt,
                    prompt_token_ids,
                    prompt_logprobs,
                    outputs,
                    finished,
+                   seq_group.metrics,
                    lora_request=seq_group.lora_request)

     def __repr__(self) -> str:
@@ -130,4 +137,5 @@ class RequestOutput:
                 f"prompt_logprobs={self.prompt_logprobs}, "
                 f"outputs={self.outputs}, "
                 f"finished={self.finished}, "
+                f"metrics={self.metrics}, "
                 f"lora_request={self.lora_request})")
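Since RequestOutput now carries a metrics object, request-level latencies can be read directly off a finished output. A small sketch of the kind of bookkeeping this enables; the attribute names follow the RequestMetrics fields added in this change, and the helper itself is illustrative:

def summarize(output) -> dict:
    m = output.metrics
    ttft = (m.first_token_time - m.arrival_time
            if m.first_token_time is not None else None)
    e2e = (m.finished_time - m.arrival_time
           if m.finished_time is not None else None)
    return {
        "time_in_queue": m.time_in_queue,        # set when first scheduled
        "time_to_first_token": ttft,             # arrival -> first token
        "end_to_end_latency": e2e,               # arrival -> finished
    }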
@@ -11,7 +11,8 @@ _SAMPLING_EPS = 1e-5
 class SamplingType(IntEnum):
     GREEDY = 0
     RANDOM = 1
-    BEAM = 2
+    RANDOM_SEED = 2
+    BEAM = 3


 LogitsProcessor = Callable[[List[int], torch.Tensor], torch.Tensor]
@@ -56,6 +57,7 @@ class SamplingParams:
         min_p: Float that represents the minimum probability for a token to be
             considered, relative to the probability of the most likely token.
             Must be in [0, 1]. Set to 0 to disable this.
+        seed: Random seed to use for the generation.
         use_beam_search: Whether to use beam search instead of sampling.
         length_penalty: Float that penalizes sequences based on their length.
             Used in beam search.
@@ -101,6 +103,7 @@ class SamplingParams:
         top_p: float = 1.0,
         top_k: int = -1,
         min_p: float = 0.0,
+        seed: Optional[int] = None,
         use_beam_search: bool = False,
         length_penalty: float = 1.0,
         early_stopping: Union[bool, str] = False,
@@ -124,6 +127,7 @@ class SamplingParams:
         self.top_p = top_p
         self.top_k = top_k
         self.min_p = min_p
+        self.seed = seed
         self.use_beam_search = use_beam_search
         self.length_penalty = length_penalty
         self.early_stopping = early_stopping
@@ -229,6 +233,8 @@ class SamplingParams:
             return SamplingType.BEAM
         if self.temperature < _SAMPLING_EPS:
             return SamplingType.GREEDY
+        if self.seed is not None:
+            return SamplingType.RANDOM_SEED
         return SamplingType.RANDOM

     def __repr__(self) -> str:
@@ -242,6 +248,7 @@ class SamplingParams:
                 f"top_p={self.top_p}, "
                 f"top_k={self.top_k}, "
                 f"min_p={self.min_p}, "
+                f"seed={self.seed}, "
                 f"use_beam_search={self.use_beam_search}, "
                 f"length_penalty={self.length_penalty}, "
                 f"early_stopping={self.early_stopping}, "
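The new seed field routes a request into SamplingType.RANDOM_SEED (see the sampling_type property above), so two requests with the same prompt, parameters, and seed should produce the same tokens on the same setup. A hedged usage sketch; the model name and values are illustrative:

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # any supported model works here
params = SamplingParams(temperature=1.0, top_p=0.9, seed=1234, max_tokens=32)

first = llm.generate(["Once upon a time"], params)[0].outputs[0].text
second = llm.generate(["Once upon a time"], params)[0].outputs[0].text
assert first == second  # expected to hold for seeded sampling on the same hardware/config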
@@ -1,6 +1,7 @@
 """Sequence and its related classes."""
 import copy
 import enum
+from dataclasses import dataclass
 from typing import Dict, List, Optional, Union

 from vllm.block import LogicalTokenBlock
@@ -49,6 +50,25 @@ class SequenceStatus(enum.Enum):
         return finish_reason


+@dataclass
+class RequestMetrics:
+    """Metrics associated with a request.
+
+    Args:
+        arrival_time: The time when the request arrived.
+        first_scheduled_time: The time when the request was first scheduled.
+        first_token_time: The time when the first token was generated.
+        time_in_queue: The time the request spent in the queue.
+        finished_time: The time when the request was finished.
+    """
+    arrival_time: float
+    last_token_time: float
+    first_scheduled_time: Optional[float]
+    first_token_time: Optional[float]
+    time_in_queue: Optional[float]
+    finished_time: Optional[float] = None
+
+
 class SequenceData:
     """Data associated with a sequence.

@@ -228,6 +248,14 @@ class Sequence:
                 f"num_blocks={len(self.logical_token_blocks)})")


+@dataclass
+class SequenceGroupState:
+    """Mutable state tied to a specific sequence group"""
+
+    # torch.Generator used in seeded sampling
+    generator: Optional = None
+
+
 class SequenceGroup:
     """A group of sequences that are generated from the same prompt.

@@ -252,11 +280,15 @@ class SequenceGroup:
         self.request_id = request_id
         self.seqs_dict = {seq.seq_id: seq for seq in seqs}
         self.sampling_params = sampling_params
-        self.arrival_time = arrival_time
-        self.last_token_time = arrival_time
+        self.metrics = RequestMetrics(arrival_time=arrival_time,
+                                      last_token_time=arrival_time,
+                                      first_scheduled_time=None,
+                                      first_token_time=None,
+                                      time_in_queue=None)
         self.lora_request = lora_request
         self.prefix: Optional[Prefix] = prefix
         self.prompt_logprobs: Optional[PromptLogprobs] = None
+        self.state = SequenceGroupState()

     @property
     def prompt(self) -> str:
@@ -276,10 +308,25 @@ class SequenceGroup:

     def get_last_latency(self, now: float) -> float:
         """Gets last token latency for Request level timings."""
-        latency = now - self.last_token_time
-        self.last_token_time = now
+        latency = now - self.metrics.last_token_time
+        self.metrics.last_token_time = now
         return latency

+    def maybe_set_first_token_time(self, time: float) -> None:
+        """Sets the first token time for Request level timings."""
+        if self.metrics.first_token_time is None:
+            self.metrics.first_token_time = time
+
+    def maybe_set_first_scheduled_time(self, time: float) -> None:
+        """Sets the first scheduled time and time in queue for Request level timings."""
+        if self.metrics.first_scheduled_time is None:
+            self.metrics.first_scheduled_time = time
+            self.metrics.time_in_queue = time - self.metrics.arrival_time
+
+    def set_finished_time(self, time: Optional[float]) -> None:
+        """Sets the finished time for Request level timings."""
+        self.metrics.finished_time = time
+
     def get_max_num_running_seqs(self) -> int:
         """The maximum number of sequences running in parallel in the remaining
         lifetime of the request."""
@@ -359,6 +406,7 @@ class SequenceGroupMetadata:
         sampling_params: The sampling parameters used to generate the outputs.
         block_tables: The block tables. (Seq id -> list of physical block
             numbers)
+        state: Internal state tied to this sequence group.
         lora_request: LoRA request.
         prefix: The prefix of the prompt of the sequence group.
     """
@@ -372,6 +420,7 @@ class SequenceGroupMetadata:
         block_tables: Dict[int, List[int]],
         lora_request: Optional[LoRARequest] = None,
         prefix: Optional[Prefix] = None,
+        state: Optional[SequenceGroupState] = None,
     ) -> None:
         self.request_id = request_id
         self.is_prompt = is_prompt
@@ -380,6 +429,7 @@ class SequenceGroupMetadata:
         self.block_tables = block_tables
         self.lora_request = lora_request
         self.prefix = prefix
+        self.state = SequenceGroupState() if state is None else state

     @property
     def lora_int_id(self) -> int:
@@ -16,10 +16,14 @@ _CONFIG_REGISTRY = {

 def get_config(model: str,
                trust_remote_code: bool,
-               revision: Optional[str] = None) -> PretrainedConfig:
+               revision: Optional[str] = None,
+               code_revision: Optional[str] = None) -> PretrainedConfig:
     try:
         config = AutoConfig.from_pretrained(
-            model, trust_remote_code=trust_remote_code, revision=revision)
+            model,
+            trust_remote_code=trust_remote_code,
+            revision=revision,
+            code_revision=code_revision)
     except ValueError as e:
         if (not trust_remote_code and
                 "requires you to execute the configuration file" in str(e)):
@@ -33,5 +37,7 @@ def get_config(model: str,
             raise e
     if config.model_type in _CONFIG_REGISTRY:
         config_class = _CONFIG_REGISTRY[config.model_type]
-        config = config_class.from_pretrained(model, revision=revision)
+        config = config_class.from_pretrained(model,
+                                              revision=revision,
+                                              code_revision=code_revision)
     return config
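code_revision lets the model weights and any trust_remote_code configuration/modeling files be pinned to different revisions. A hedged example of calling the updated helper; the model id and revision identifiers are placeholders:

from vllm.transformers_utils.config import get_config

# Pin the weights to one revision and the remote code files to another.
config = get_config(
    "some-org/some-remote-code-model",  # placeholder model id
    trust_remote_code=True,
    revision="main",
    code_revision="main",
)
print(config.model_type)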
@@ -1,6 +1,7 @@
 from vllm.transformers_utils.configs.baichuan import BaiChuanConfig
 from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
 from vllm.transformers_utils.configs.mpt import MPTConfig
+from vllm.transformers_utils.configs.olmo import OLMoConfig
 from vllm.transformers_utils.configs.qwen import QWenConfig
 # RWConfig is for the original tiiuae/falcon-40b(-instruct) and
 # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
@@ -11,6 +12,7 @@ __all__ = [
     "BaiChuanConfig",
     "ChatGLMConfig",
     "MPTConfig",
+    "OLMoConfig",
     "QWenConfig",
     "RWConfig",
 ]
vllm/transformers_utils/configs/olmo.py (new file, 72 lines)
@@ -0,0 +1,72 @@
|
||||
# coding=utf-8
|
||||
# adapted from https://github.com/allenai/OLMo/blob/v0.2.4/hf_olmo/configuration_olmo.py
|
||||
"""OLMo configuration"""
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
|
||||
class OLMoConfig(PretrainedConfig):
|
||||
model_type = 'olmo'
|
||||
attribute_map = {
|
||||
'num_attention_heads': 'n_heads',
|
||||
'hidden_size': 'd_model',
|
||||
'num_hidden_layers': 'n_layers',
|
||||
}
|
||||
|
||||
# Note that the defaults for these attributes are equivalent to the base GPT2 model.
|
||||
def __init__(
|
||||
self,
|
||||
d_model=768,
|
||||
n_heads=12,
|
||||
n_layers=12,
|
||||
mlp_ratio=4,
|
||||
mlp_hidden_size=None,
|
||||
activation_type="swiglu",
|
||||
block_type="sequential",
|
||||
block_group_size=1,
|
||||
alibi=False,
|
||||
alibi_bias_max=8.0,
|
||||
rope=False,
|
||||
rope_full_precision=True,
|
||||
multi_query_attention=False,
|
||||
attention_layer_norm=False,
|
||||
layer_norm_type="default",
|
||||
layer_norm_with_affine=True,
|
||||
attention_layer_norm_with_affine=True,
|
||||
max_sequence_length=1024,
|
||||
include_bias=True,
|
||||
bias_for_layer_norm=None,
|
||||
scale_logits=False,
|
||||
vocab_size=50257,
|
||||
embedding_size=50304,
|
||||
weight_tying=True,
|
||||
eos_token_id=50256,
|
||||
pad_token_id=50256,
|
||||
**kwargs,
|
||||
):
|
||||
self.d_model = d_model
|
||||
self.n_heads = n_heads
|
||||
self.n_layers = n_layers
|
||||
self.mlp_ratio = mlp_ratio
|
||||
self.mlp_hidden_size = mlp_hidden_size
|
||||
self.activation_type = activation_type
|
||||
self.block_type = block_type
|
||||
self.block_group_size = block_group_size
|
||||
self.alibi = alibi
|
||||
self.alibi_bias_max = alibi_bias_max
|
||||
self.rope = rope
|
||||
self.rope_full_precision = rope_full_precision
|
||||
self.multi_query_attention = multi_query_attention
|
||||
self.attention_layer_norm = attention_layer_norm
|
||||
self.layer_norm_type = layer_norm_type
|
||||
self.layer_norm_with_affine = layer_norm_with_affine
|
||||
self.attention_layer_norm_with_affine = attention_layer_norm_with_affine
|
||||
self.max_sequence_length = max_sequence_length
|
||||
self.include_bias = include_bias
|
||||
self.bias_for_layer_norm = bias_for_layer_norm
|
||||
self.scale_logits = scale_logits
|
||||
self.vocab_size = vocab_size
|
||||
self.embedding_size = embedding_size
|
||||
self.weight_tying = weight_tying
|
||||
self.eos_token_id = eos_token_id
|
||||
self.pad_token_id = pad_token_id
|
||||
super().__init__(**kwargs)
|
@@ -389,6 +389,7 @@ class ModelRunner:
     ) -> SamplingMetadata:
         seq_groups: List[Tuple[List[int], SamplingParams]] = []
         selected_token_indices: List[int] = []
+        generators: List[torch.Generator] = []
         selected_token_start_idx = 0
         categorized_sample_indices = {t: [] for t in SamplingType}
         categorized_sample_indices_start_idx = 0
@@ -419,6 +420,10 @@ class ModelRunner:
                 selected_token_indices.append(selected_token_start_idx +
                                               subquery_len - 1)
                 selected_token_start_idx += max_subquery_len
+
+                if sampling_params.seed is not None:
+                    seq_group_metadata.state.generator = torch.Generator(
+                        device="cuda").manual_seed(sampling_params.seed)
             else:
                 num_seqs = len(seq_ids)
                 selected_token_indices.extend(
@@ -432,6 +437,9 @@ class ModelRunner:
                         categorized_sample_indices_start_idx + num_seqs))
                 categorized_sample_indices_start_idx += num_seqs

+            if sampling_params.seed is not None:
+                generators.append(seq_group_metadata.state.generator)
+
         selected_token_indices = _async_h2d(selected_token_indices,
                                             dtype=torch.long,
                                             target_device=self.device,
@@ -454,6 +462,7 @@ class ModelRunner:
             prompt_lens=prompt_lens,
             selected_token_indices=selected_token_indices,
             categorized_sample_indices=categorized_sample_indices,
+            generators=generators,
         )
         return sampling_metadata

@@ -536,6 +545,7 @@ class ModelRunner:
             prompt_lens=None,
             selected_token_indices=metadata_dict["selected_token_indices"],
             categorized_sample_indices=None,
+            generators=None,
             perform_sampling=False,
         )
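The per-request torch.Generator created above is what makes RANDOM_SEED sampling reproducible: the same seed yields the same exponential noise on every run. A quick standalone check of that property; a CPU generator is used here purely for portability:

import torch

def draw(seed: int) -> torch.Tensor:
    gen = torch.Generator().manual_seed(seed)
    return torch.empty(4, 8).exponential_(generator=gen)

assert torch.equal(draw(1234), draw(1234))    # same seed -> identical noise
assert not torch.equal(draw(1234), draw(42))  # different seed -> different noise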
@@ -93,8 +93,6 @@ class Worker:
         # Initialize the distributed environment.
         init_distributed_environment(self.parallel_config, self.rank,
                                      cupy_port, self.distributed_init_method)
-        if not self.parallel_config.disable_custom_all_reduce:
-            init_custom_ar()
         # Initialize the model.
         set_random_seed(self.model_config.seed)

@@ -288,6 +286,10 @@ def init_distributed_environment(
     ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
                                       parallel_config.pipeline_parallel_size)

+    # Initialize a custom fast all-reduce implementation.
+    if not parallel_config.disable_custom_all_reduce:
+        init_custom_ar()
+

 def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
     # Check if the GPU supports the dtype.