[init] feat: upload first open source version of verl

This commit is contained in:
shengguangming
2024-10-31 14:29:44 +08:00
commit 30911f133a
201 changed files with 29407 additions and 0 deletions

110
.gitignore vendored Normal file
View File

@ -0,0 +1,110 @@
**/*.pt
**/checkpoints
**/wget-log
**/_build/
**/*.ckpt
**/outputs
**/*.tar.gz
**/playground
**/wandb
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
dataset/*
tensorflow/my_graph/*
.idea/
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# IPython Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# dotenv
.env
# virtualenv
venv/
ENV/
# Spyder project settings
.spyderproject
# Rope project settings
.ropeproject
# vscode
.vscode
# Mac
.DS_Store
# output logs
tests/e2e/toy_examples/deepspeed/synchronous/output.txt

16
.readthedocs.yaml Normal file
View File

@ -0,0 +1,16 @@
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
version: 2
build:
os: ubuntu-22.04
tools:
python: "3.8"
sphinx:
configuration: docs/conf.py
python:
install:
- requirements: docs/requirements-docs.txt

5
.style.yapf Normal file
View File

@ -0,0 +1,5 @@
[style]
based_on_style = google
column_limit = 120
indent_width = 4
split_arguments_when_comma_terminated: true

202
LICENSE Normal file
View File

@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

1
Notice.txt Normal file
View File

@ -0,0 +1 @@
Copyright 2023-2024 Bytedance Ltd. and/or its affiliates

168
README.md Normal file
View File

@ -0,0 +1,168 @@
<div align=center>
<img src="docs/_static/logo.png" width = "20%" height = "20%" />
</div>
<h1 style="text-align: center;">veRL: Volcano Engine Reinforcement Learning for LLM</h1>
veRL (HybridFlow) is a flexible, efficient and industrial-level RL(HF) training framework designed for large language models (LLMs). veRL is the open-source version of [HybridFlow](https://arxiv.org/abs/2409.19256v2) paper.
veRL is flexible and easy to use with:
- **Easy to support diverse RL(HF) algorithms**: The Hybrid programming model combines the strengths of single-controller and multi-controller paradigms to enable flexible representation and efficient execution of complex Post-Training dataflows. Allowing users to build RL dataflows in a few lines of code.
- **Seamless integration of existing LLM infra with modular API design**: Decouples computation and data dependencies, enabling seamless integration with existing LLM frameworks, such as PyTorch FSDP, Megatron-LM and vLLM. Moreover, users can easily extend to other LLM training and inference frameworks.
- **Flexible device mapping**: Supports various placement of models onto different sets of GPUs for efficient resource utilization and scalability across different cluster sizes.
- Readily integration with popular Hugging Face models
veRL is fast with:
- **State-of-the-art throughput**: By seamlessly integrating existing SOTA LLM training and inference frameworks, veRL achieves high generation and training throughput.
- **Efficient actor model resharding with 3D-HybridEngine**: Eliminates memory redundancy and significantly reduces communication overhead during transitions between training and generation phases.
<p align="center">
| <a href="https://verl-doc.readthedocs.io/en/latest/index.html"><b>Documentation</b></a> | <a href="https://arxiv.org/abs/2409.19256v2"><b>Paper</b></a> |
<!-- <a href=""><b>Slides</b></a> | -->
</p>
## Installation
For installing the latest version of veRL, the best way is to clone and install it from source. Then you can modify our code to customize your own post-training jobs.
```bash
# install verl together with some lightweight dependencies in setup.py
git clone https://github.com/volcengine/verl.git
cd verl
pip3 install -e .
```
You can also install veRL using `pip3 install`
```bash
# directly install from pypi
pip3 install verl
```
### Dependencies
veRL requires Python >= 3.9 and CUDA >= 12.1.
veRL support various backend, we currently release FSDP and Megatron-LM for actor training and vLLM for rollout generation.
To install the dependencies, we recommend using conda:
```bash
conda create -n verl python==3.9
conda activate verl
```
The following dependencies are required for all backends.
```bash
# install torch [or you can skip this step and let vllm to install the correct version for you]
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121
# install vllm
pip3 install vllm==0.5.4
pip3 install ray==2.10 # other version may have bug
# flash attention 2
pip3 install flash-attn --no-build-isolation
```
**FSDP**
We recommend using FSDP backend to investigate, research and prototype different models, datasets and RL algorithms.
The pros, cons and extension guide for using FSDP backend can be found in fsdp.md
**Megatron-LM**
For users who pursue better scalability, we recommend using Megatron-LM backend. Please install the above dependencies first.
Currently, we support Megatron-LM@core_v0.4.0 and we fix some internal issues of Megatron-LM. Here's the additional installation guide.
The pros, cons and extension guide for using Megatron-LM backend can be found in megatron.md.
```bash
# FOR Megatron-LM Backend
# apex
pip3 install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
--config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" \
git+https://github.com/NVIDIA/apex
# transformer engine
pip3 install git+https://github.com/NVIDIA/TransformerEngine.git@v1.7
# megatron core v0.4.0
cd ..
git clone -b core_v0.4.0 https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
cp ../verl/patches/megatron_v4.patch .
git apply megatron_v4.patch
pip3 install -e .
export PYTHONPATH=$PYTHONPATH:$(pwd)
```
## Getting Started
Visit our [documentation](https://verl-doc.readthedocs.io/en/latest/index.html) to learn more.
**Running an PPO example should follow:**
- Preparation
- [Installation](https://verl-doc.readthedocs.io/en/latest/preparation/install.html)
- [Prepare Data (Parquet) for Post-Training](https://verl-doc.readthedocs.io/en/latest/preparation/prepare_data.html)
- [Implement Reward Function for Dataset](https://verl-doc.readthedocs.io/en/latest/preparation/reward_function.html)
- PPO Example (Run an example)
- [PPO Example Architecture](https://verl-doc.readthedocs.io/en/latest/examples/ppo_code_architecture.html)
- [Config Explanation](https://verl-doc.readthedocs.io/en/latest/examples/config.html)
- [Run GSM8K Example](https://verl-doc.readthedocs.io/en/latest/examples/gsm8k_example.html)
**For code explanation and advance usage (extension):**
- PPO Trainer and Workers
- [PPO Ray Trainer](https://verl-doc.readthedocs.io/en/latest/workers/ray_trainer.html)
- [PyTorch FSDP Backend](https://verl-doc.readthedocs.io/en/latest/workers/fsdp_workers.html)
- [Megatron-LM Backend](https://verl-doc.readthedocs.io/en/latest/index.html)
- Advance Usage and Extension
- [Ray API Design Tutorial](https://verl-doc.readthedocs.io/en/latest/advance/placement.html)
- [Extend to other RL(HF) algorithms](https://verl-doc.readthedocs.io/en/latest/advance/dpo_extension.html)
- [Add models to FSDP backend](https://verl-doc.readthedocs.io/en/latest/advance/fsdp_extension.html)
- [Add models to Megatron-LM backend](https://verl-doc.readthedocs.io/en/latest/advance/megatron_extension.html)
## Contribution
### Code formatting
We use yapf (Google style) to enforce strict code formatting when reviewing MRs. To reformat you code locally, make sure you installed `yapf`
```bash
pip3 install yapf
```
Then, make sure you are at top level of verl repo and run
```bash
yapf -ir -vv --style ./.style.yapf verl single_controller examples
```
## Citation
```tex
@article{sheng2024hybridflow,
title = {HybridFlow: A Flexible and Efficient RLHF Framework},
author = {Guangming Sheng and Chi Zhang and Zilingfeng Ye and Xibin Wu and Wang Zhang and Ru Zhang and Yanghua Peng and Haibin Lin and Chuan Wu},
year = {2024},
journal = {arXiv preprint arXiv: 2409.19256}
}
@inproceedings{zhang2024framework,
title={A Framework for Training Large Language Models for Code Generation via Proximal Policy Optimization},
author={Zhang, Chi and Sheng, Guangming and Liu, Siyao and Li, Jiahao and Feng, Ziyuan and Liu, Zherui and Liu, Xin and Jia, Xiaoying and Peng, Yanghua and Lin, Haibin and Wu, Chuan},
booktitle={In NL2Code Workshop of ACM KDD},
year={2024}
}
```

20
docs/Makefile Normal file
View File

@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SPHINXPROJ = verl
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

19
docs/README.md Normal file
View File

@ -0,0 +1,19 @@
# veRL documents
## Build the docs
```bash
# Install dependencies.
pip install -r requirements-docs.txt
# Build the docs.
make clean
make html
```
## Open the docs with your browser
```bash
python -m http.server -d _build/html/
```
Launch your browser and open localhost:8000.

BIN
docs/_static/logo.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 83 KiB

View File

@ -0,0 +1,271 @@
Extend to other RL(HF) algorithms
=================================
We already implemented the complete training pipeline of the PPO
algorithms. To extend to other algorithms, we analyze the high-level
principle to use veRL and provide a tutorial to implement the DPO
algorithm. Users can follow the similar paradigm to extend to other RL algorithms.
.. note:: **Key ideas**: Single process drives multi-process computation and data communication.
Overall Approach
----------------
Step 1: Consider what multi-machine multi-GPU computations are needed
for each model, such as ``generate_sequence`` , ``compute_log_prob`` and
``update_policy`` in the actor_rollout model. Implement distributed
single-process-multiple-data (SPMD) computation and encapsulate them
into APIs
Step 2: Based on different distributed scenarios, including FSDP and 3D
parallelism in Megatron-LM, implement single-process control of data
interaction among multi-process computations.
Step 3: Utilize the encapsulated APIs to implement the control flow
Example: Online DPO
-------------------
We use veRL to implement a simple online DPO algorithm. The algorithm
flow of Online DPO is as follows:
1. There is a prompt (rollout) generator which has the same weight as
the actor model. After a batch of prompts are fed into the generator,
it generates N responses for each prompt.
2. Send all the prompts + responses to a verifier for scoring, which can
be reward model or a rule-based function. Then sort them in pairs to
form a training batch.
3. Use this training batch to train the actor model using DPO. During
the process, a reference policy is needed.
Step 1: What are the multi-machine multi-GPU computations
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
**Sample Generator**
Implementation details:
.. code:: python
from single_controller.base import Worker
from single_controller.ray import RayWorkerGroup, RayClassWithInitArgs, RayResourcePool
import ray
@ray.remote
class SampleGenerator(Worker):
def __init__(self, config):
super().__init__()
self.config = config
def generate_sequences(self, data):
pass
Here, ``SampleGenerator`` can be viewed as a multi-process pulled up by
``torchrun``, with each process running the same code (SPMD).
``SampleGenerator`` needs to implement a ``generate_sequences`` API for
the control flow to call. The implementation details inside can use any
inference engine including vllm, sglang and huggingface. Users can
largely reuse the code in
verl/verl/trainer/ppo/rollout/vllm_rollout/vllm_rollout.py and we wont
go into details here.
**ReferencePolicy inference**
API: compute reference log probability
.. code:: python
from single_controller.base import Worker
import ray
@ray.remote
class ReferencePolicy(Worker):
def __init__(self):
super().__init__()
self.model = Model()
def infer(self, data):
return self.model(data)
**Actor update**
API: Update actor model parameters
.. code:: python
from single_controller.base import Worker
import ray
@ray.remote
class DPOActor(Worker):
def __init__(self):
super().__init__()
self.model = Model()
self.model = FSDP(self.model) # or other distributed strategy
self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)
self.loss_fn = xxx
def update(self, data):
self.optimizer.zero_grad()
logits = self.model(data)
loss = self.loss_fn(logits)
loss.backward()
self.optimizer.step()
**Notes: How to distinguish between control processes and distributed computation processes**
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- Control processes are generally functions directly decorated with
``@ray.remote``
- Computation processes are all wrapped into a ``RayWorkerGroup``.
Users can reuse most of the distribtued computation logics implemented
in PPO algorithm, including FSDP and Megatron-LM backend in
verl/verl/trainer/ppo.
Step 2: Based on different distributed scenarios, implement single-process control of multi-process data interaction
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
**The core problem to solve here is how a single process sends data to
multiple processes, drives multi-process computation, and how the
control process obtains the results of multi-process computation.**
First, we initialize the multi-process ``WorkerGroup`` in the control
process.
.. code:: python
@ray.remote(num_cpus=1)
def main_task(config):
# construct SampleGenerator
resource_pool = RayResourcePool(process_on_nodes=[8] * 2) # 16 GPUs
ray_cls = RayClassWithInitArgs(SampleGenerator, config=config)
# put SampleGenerator onto resource pool
worker_group = RayWorkerGroup(resource_pool, ray_cls)
# construct reference policy
As we can see, in the control process, multiple processes are wrapped
into a ``RayWorkerGroup``. Inside this ``WorkerGroup``, there is a
``self._workers`` member, where each worker is a RayActor
(https://docs.ray.io/en/latest/ray-core/actors.html) of SampleGenerator.
ray_trainer.md also provide an implementation of
``MegatronRayWorkerGroup``.
Assuming the model is distributed using FSDP, and there is a batch of
data on the control process, for data parallelism, the underlying
calling process is:
.. code:: python
data = xxx
data_list = data.chunk(dp_size)
output = []
for d in data_list:
# worker_group._workers[i] is a SampleGenerator
output.append(worker_group._workers[i].generate_sequences.remote(d))
output = ray.get(output)
output = torch.cat(output)
Single process calling multiple processes involves the following 3
steps:
1. Split the data into DP parts on the control process.
2. Send the data to remote, call the remote computation through RPC, and
utilize multi-process computation.
3. Obtain the computation results of each worker on the control process
and merge them.
Frequently calling these 3 steps on the controller process greatly hurts
code readability. **In veRL, we have abstracted and encapsulated these 3
steps, so that the workers method + dispatch + collect can be
registered into the worker_group**
.. code:: python
from single_controller.base.decorator import register
def dispatch_data(worker_group, data):
return data.chunk(worker_group.world_size)
def collect_data(worker_group, data):
return torch.cat(data)
dispatch_mode = {
'dispatch_fn': dispatch_data,
'collect_fn': collect_data
}
@register(dispatch_mode=dispatch_mode)
def generate_sequences(self, data):
pass
In this way, we can directly call the method inside the worker through
the ``worker_group`` on the control (driver) process (which is a single
process):
.. code:: python
output = worker_group.generate_sequences(data)
This single line includes data splitting, data distribution and
computation, and data collection.
Furthermore, the model parallelism size of each model is usually fixed,
including dp, tp, pp. So for these common distributed scenarios, we have
pre-implemented specific dispatch and collect methods,in `decorator.py <https://github.com/volcengine/verl/blob/main/single_controller/base/decorator.py>`_, which can be directly used to wrap the computations.
.. code:: python
from single_controller.base.decorator import register, Dispatch
@register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
def generate_sequences(self, data: DataProto) -> DataProto:
pass
Here it requires the data interface to be ``DataProto``. Definition of
``DataProto`` is in `protocol.py <https://github.com/volcengine/verl/blob/main/verl/protocol.py>`_.
Step 3: Main training loop
~~~~~~~~~~~~~~~~~~~~~~~~~~
With the above training flows, we can implement the algorithms control
flow. It is recommended that ``main_task`` is also a ray remote process.
.. code:: python
@ray.remote(num_cpus=1)
def main_task(config):
# construct SampleGenerator
resource_pool = RayResourcePool(process_on_nodes=[8] * 2) # 16 GPUs
ray_cls = RayClassWithInitArgs(SampleGenerator, config=config)
# put SampleGenerator onto resource pool
sample_gen = RayWorkerGroup(resource_pool, ray_cls)
# construct reference policy
ray_cls = RayClassWithInitArgs(ReferencePolicy)
ref_policy = RayWorkerGroup(resource_pool, ray_cls)
# construct actor
ray_cls = RayClassWithInitArgs(DPOActor)
dpo_policy = RayWorkerGroup(resource_pool, ray_cls)
dataloader = DataLoader()
for data in dataloader:
# generate data
data = sample_gen.generate_sequences(data)
# generate scores for each data
data = generate_scores(data)
# generate pairwise data using scores
data = generate_pairwise_data(data)
# generate ref_log_prob
data.batch['ref_log_prob'] = ref_policy.infer(data)
# update using dpo
dpo_policy.update(data)
# logging
Here, different ``WorkerGroups`` can be placed in the same resource pool or
in different resource pools using ``create_colocated_worker_cls``
similar as in `ray_trainer.py <https://github.com/volcengine/verl/blob/main/verl/trainer/ppo/ray_trainer.py>`_.

View File

@ -0,0 +1,94 @@
Add models to FSDP backend
===========================
Model
--------------------------
In principle, our FSDP backend can support any HF model and we can
sychronoize the actor model weight with vLLM using `hf_weight_loader.py <https://github.com/volcengine/verl/blob/main/verl/third_party/vllm/vllm_v_0_5_4/hf_weight_loader.py>`_.
However, ``hf_weight_loader`` is will gather the full state_dict of a
model during synchronization, which may cause OOM. We suggest using
``dtensor_weight_loader`` which gather the full model parameter layer by
layer to reduce the peak memory usage. We already support dtensor weight
loader for the models below in `dtensor_weight_loader.py <https://github.com/volcengine/verl/blob/main/verl/third_party/vllm/vllm_v_0_5_4/dtensor_weight_loader.py>`_.:
- ``GPT2LMHeadModel``
- ``LlamaForCausalLM``
- ``LLaMAForCausalLM``
- ``MistralForCausalLM``
- ``InternLMForCausalLM``
- ``AquilaModel``
- ``AquilaForCausalLM``
- ``Phi3ForCausalLM``
- ``GemmaForCausalLM``
- ``Gemma2ForCausalLM``
- ``GPTBigCodeForCausalLM``
- ``Starcoder2ForCausalLM``
- ``Qwen2ForCausalLM``
- ``DeepseekV2ForCausalLM``
To implement ``dtensor_weight_loader`` of a model thats supported in
vLLM, follow the guide of gemma model below:
1. Copy the
``load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]])`` from the vllm model class
to ``dtensor_weight_loaders.py``
2. Modify the arguments to
``(actor_weights: Dict, vllm_model: nn.Module)``
3. Replace the ``self`` to ``vllm_model``
4. Add the
``local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)``
before each ``param = params_dict[name]`` and modify the following
weight loading using ``local_loaded_weight``.
5. Register the implemented dtensor weight loader to ``__MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__``.
.. code-block:: diff
- def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+ def gemma_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("qkv_proj", "q_proj", "q"),
("qkv_proj", "k_proj", "k"),
("qkv_proj", "v_proj", "v"),
("gate_up_proj", "gate_proj", 0),
("gate_up_proj", "up_proj", 1),
]
- params_dict = dict(self.named_parameters())
+ params_dict = dict(vllm_model.named_parameters())
loaded_params = set()
- for name, loaded_weight in weights:
+ for name, loaded_weight in actor_weights.items():
for (param_name, shard_name, shard_id) in stacked_params_mapping:
if shard_name not in name:
continue
name = name.replace(shard_name, param_name)
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
param = params_dict[name]
weight_loader = param.weight_loader
- weight_loader(param, loaded_weight, shard_id)
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
break
else:
# lm_head is not used in vllm as it is tied with embed_token.
# To prevent errors, skip loading lm_head.weight.
if "lm_head.weight" in name:
continue
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
param = params_dict[name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
weight_loader(param, loaded_weight)
loaded_params.add(name)
unloaded_params = params_dict.keys() - loaded_params
if unloaded_params:
raise RuntimeError(
"Some weights are not initialized from checkpoints: "
f"{unloaded_params}")

View File

@ -0,0 +1,25 @@
Add models to Megatron-LM backend
===========
Model
-----------
The most challenging aspect to use Megatron-LM backend is implementing
the models for training. Currently, we implement Llama model that
support data parallelism, tensor parallelism, pipeline parallelism (also
vPP) and sequence parallelism. We also implement remove padding on Llama
model, which can be found in `modeling_llama_megatron.py <https://github.com/volcengine/verl/blob/main/verl/models/llama/megatron/modeling_llama_megatron.py>`_.
To support other model, users are required to implement:
1. Implemnt a model similar to ``modeling_llama_megatron.py`` that satisfy the
parallelism requirements of Megatron-LM. Then register your model in
the `registry.py <https://github.com/volcengine/verl/blob/main/verl/models/registry.py>`_.
2. Checkpoint utils that can load full checkpoint (e.g. huggingface
checkpoint) to partitioned models during the runtime. Then register
your loader to ``weight_loader_registry`` in `weight_loader_registry.py <https://github.com/volcengine/verl/blob/main/verl/models/weight_loader_registry.py>`_.
3. Weight loader that synchronize the weight from Megatron to rollout
(vLLM) model. Note that both the actor model and rollout model are
partitioned during runtime. So, its advisable to map the model name
in actor model implementation. Otherwise, you may need an additional
name mapping and even weight transformation.

View File

@ -0,0 +1,11 @@
Ray API Design Tutorial
=======================================
We provide a tutorial for our Ray API design, including:
- Ray basic concepts
- Resource Pool and RayWorkerGroup
- Data Dispatch, Execution and Collection
- Initialize the RayWorkerGroup and execute the distributed computation in the given Resource Pool
See details in `tutorial.ipynb <https://github.com/volcengine/verl/blob/main/examples/ray/tutorial.ipynb>`_.

67
docs/conf.py Normal file
View File

@ -0,0 +1,67 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
# -- Project information -----------------------------------------------------
project = u'veRL'
# pylint: disable=W0622
copyright = u'2024 ByteDance Seed Foundation MLSys Team'
author = u'Guangming Sheng, Chi Zhang, Yanghua Peng, Haibin Lin'
# -- General configuration ---------------------------------------------------
# The master toctree document.
master_doc = 'index'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = ['recommonmark',
'sphinx.ext.autosectionlabel',
]
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
source_suffix = ['.rst', 'rest', '.md']
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = u'en'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

339
docs/examples/config.rst Normal file
View File

@ -0,0 +1,339 @@
Config Explaination
===================
ppo_trainer.yaml for FSDP Backend
---------------------------------
Data
~~~~
.. code:: yaml
data:
tokenizer: null
train_files: ~/data/rlhf/gsm8k/train.parquet
val_files: ~/data/rlhf/gsm8k/test.parquet
prompt_key: prompt
max_prompt_length: 512
max_response_length: 512
train_batch_size: 1024
val_batch_size: 1312
return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs
return_raw_chat: False
- ``data.train_files``: Training set parquet. Can be a list or a single
file. The program will read all files into memory, so it cant be too
large (< 100GB). The path can be either local path or HDFS path. For
HDFS path, we provide utils to download it to DRAM and convert the
HDFS path to local path.
- ``data.val_files``: Validation parquet. Can be a list or a single
file.
- ``data.prompt_key``: The field in the dataset where the prompt is
located. Default is prompt.
- ``data.max_prompt_length``: Maximum prompt length. All prompts will be
left-padded to this length. An error will be reported if the length is
too long
- ``data.max_response_length``: Maximum response length. Rollout in RL
algorithms (e.g. PPO) generates up to this length
- ``data.train_batch_size``: Batch size sampled for one training
iteration of different RL algorithms.
- ``data.val_batch_size``: Batch size sampled for one validation
iteration.
- ``data.return_raw_input_ids``: Whether to return the original
input_ids without adding chat template. This is mainly used to
accommodate situations where the reward models chat template differs
from the policy. It needs to be decoded first, then apply the RMs
chat template. If using a model-based RM, and the policy and RM
chat_templates are different, this flag needs to be set
- ``data.return_raw_chat``:
- ``data.truncation``: Truncate the input_ids or prompt length if they
exceed max_prompt_length. Default is error, not allow exceed the
max_prompt_length. The users should increase the max_prompt_length if
throwing the error.
V1:
- data.prompt_id_key: The field in the dataset where the prompt_id is
located
- data.max_prompt_id_length: In data processing, the prompt_id will be
tokenized using the tokenizer and packaged with the prompt. This
specifies the maximum length of the tokenized prompt_id. **An error
will be reported if its not long enough**
Actor/Rollout/Reference Policy
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code:: yaml
actor_rollout_ref:
hybrid_engine: True
model:
path: ~/models/deepseek-llm-7b-chat
external_lib: null
override_config: {}
enable_gradient_checkpointing: False
actor:
strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 256
ppo_micro_batch_size: 64
grad_clip: 1.0
clip_ratio: 0.2
entropy_coeff: 0.001
ppo_epochs: 1
shuffle: True
optim:
lr: 1e-6
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
min_lr_ratio: null # only useful for warmup with cosine
warmup_style: constant # select from constant/cosine
total_training_steps: -1 # must be override by program
fsdp_config:
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
param_offload: False
grad_offload: False
optimizer_offload: False
ref:
fsdp_config:
param_offload: False
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
log_prob_micro_batch_size: 128
rollout:
name: vllm
temperature: 1.0
top_k: -1 # 0 for hf rollout, -1 for vllm rollout
top_p: 1
response_length: ${data.max_response_length}
# for vllm rollout
dtype: bfloat16 # should align with FSDP
gpu_memory_utilization: 0.5
ignore_eos: False
enforce_eager: True
free_cache_engine: True
load_format: dummy_dtensor # or dummy_hf or dummy_megatron
tensor_model_parallel_size: 2
max_num_batched_tokens: 8192
max_num_seqs: 1024
log_prob_micro_batch_size: 128
# for vllm and hf rollout
do_sample: True
**Common config for actor, rollout and reference model**
- ``actor_rollout_ref.hybrid_engine``: Whether its a hybrid engine,
currently only supports hybrid engine
- ``actor_rollout_ref.model.path``: Huggingface model path. This can be
either local path or HDFS path. For HDFS path, we provide utils to
download it to DRAM and convert the HDFS path to local path.
- ``actor_rollout_ref.model.external_libs``: Additional Python packages
that need to be imported. Used to register models or tokenizers into
the Huggingface system.
- ``actor_rollout_ref.model.override_config``: Used to override some of
the models original configurations, mainly dropout
- ``actor_rollout_ref.model.enable_gradient_checkpointing``: Whether to
enable gradient checkpointing for the actor
**Actor model**
- ``actor_rollout_ref.actor.strategy``: fsdp or megatron. In this
example, we use fsdp backend.
- ``actor_rollout_ref.actor.ppo_mini_batch_size``: One sample is split
into multiple sub-batches with batch_size=ppo_mini_batch_size for PPO
updates
- ``actor_rollout_ref.actor.ppo_micro_batch_size``: Similar to gradient
accumulation, the micro_batch_size for one forward pass, trading speed
for GPU memory
- ``actor_rollout_ref.actor.grad_clip``: Gradient clipping for actor
updates
- ``actor_rollout_ref.actor.clip_ratio``: PPO clip ratio
- ``actor_rollout_ref.actor.entropy_coeff``: The weight of entropy when
calculating PPO loss
- ``actor_rollout_ref.actor.ppo_epochs``: Number of epochs for PPO
updates on one set of sampled data
- ``actor_rollout_ref.actor.shuffle``: Whether to shuffle data when
there are multiple epochs
- ``actor_rollout_ref.actor.optim``: Actors optimizer parameters
- ``actor_rollout_ref.actor.fsdp_config``: FSDP config for actor
training
- ``wrap_policy``: FSDP wrap policy. By default, it uses Huggingfaces
wrap policy, i.e., wrapping by DecoderLayer
- No need to set transformer_layer_cls_to_wrap, so we comment it.
- ``*_offload``: Whether to enable parameter, gradient and optimizer
offload
- Trading speed for GPU memory.
**Reference Model**
- ``actor_rollout_ref.ref``: FSDP config same as actor. **For models
larger than 7B, its recommended to turn on offload for ref by
default**
- ``actor_rollout_ref.ref.log_prob_micro_batch_size``: The batch size
for one forward pass in the computation of ``ref_log_prob``.
**Rollout Model**
- ``actor_rollout_ref.rollout.name``: hf/vllm. We use vLLM by default
because its much efficient and our hybrid engine is implemented with
vLLM.
- Rollout (Auto-regressive) parameters. The key should be equal to the
property name in vLLMs ``SamplingParams``.
- ``temperature``, ``top_k``, ``top_p`` and others: Sampling
parameters in ``SamplingParams``.
- ``dtype``: Rollout model parameters type. This should be align with
the actor model parameter type in FSDP/Megatron backend.
- ``gpu_memory_utilization``: The proportion of the remaining GPU memory
allocated for kv cache after other models have initialized when using
vllm.
- ``tensor_model_parallel_size``: TP size for rollout. Only effective
for vllm.
- ``log_prob_micro_batch_size``: Micro_batch_size (The batch size for
one forward pass) for recalculating log_prob.
- ``do_sample``: Whether to sample. If set to False, the rollout model
will perform greedy sampling. We disable ``do_sample`` during
validation.
- ``actor_rollout_ref.rollout.ignore_eos``: Whether to ignore the EOS
token and continue generating tokens after the EOS token is generated.
- ``actor_rollout_ref.rollout.free_cache_engine``: Offload the KVCache
after rollout generation stage. Default is True. When set to True, we
need to disable the usage of CUDAGraph (set ``enforce_eager`` to
True.)
- ``actor_rollout_ref.rollout.enforce_eager``: Whether to use CUDAGraph
in vLLM generation. Default set to True to disable CUDAGraph.
- ``actor_rollout_ref.rollout.load_format``: Which weight loader to use
to load the actor model weights to the rollout model.
- ``auto``: Use Megatron weight loader.
- ``megatron``: Use Megatron weight loader. Deployed with Megatron
backend. The input model ``state_dict()`` is already partitioned
along TP dimension and already gathered along PP dimension. This
weight loader requires that the Rollout model and Actor models
parameters shape and name should be identical.
- ``dtensor``: Default solution when using Huggingface weight loader.
Deployed with FSDP backend and the state_dict_type is
``StateDictType.SHARDED_STATE_DICT``. Recommend to use this weight
loader
- ``hf``: Use Huggingface weight loader. Deployed with FSDP backend
and the state_dict_type is ``StateDictType.FULL_STATE_DICT``. This
solution doesnt need to rewrite the weight loader for each model
implemented in vLLM but it results in larger peak memory usage.
- ``dummy_hf``, ``dummy_megatron``, ``dummy_dtensor``: Random
initialization.
.. note:: **NOTED**: In this config field, users only need to select from ``dummy_megatron``, ``dummy_dtensor``, ``dummy_hf`` for rollout initialization and our hybrid engine will select the corresponding weight loader (i.e., ``megatron``, ``dtensor``, ``hf``) during actor/rollout weight synchronization.
Critic Model
~~~~~~~~~~~~
Most parameters for Critic are similar to Actor Model.
Reward Model
~~~~~~~~~~~~
.. code:: yaml
reward_model:
enable: False
model:
input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical
path: ~/models/Anomy-RM-v0.1
external_lib: ${actor_rollout_ref.model.external_lib}
fsdp_config:
min_num_params: 0
param_offload: False
micro_batch_size: 64
max_length: null
- ``reward_model.enable``: Whether to enable reward model. If False, we
compute the reward only with the user-defined reward functions. In
GSM8K and Math examples, we disable reward model. For RLHF alignment
example using full_hh_rlhf, we utilize reward model to assess the
responses. If False, the following parameters are not effective.
- ``reward_model.model``
- ``input_tokenizer``: Input tokenizer. If the reward models chat
template is inconsistent with the policy, we need to first decode to
plaintext, then apply the rms chat_template. Then score with RM. If
chat_templates are consistent, it can be set to null.
- ``path``: RMs HDFS path or local path. Note that RM only supports
AutoModelForSequenceClassification. Other model types need to define
their own RewardModelWorker and pass it from the code.
Algorithm
~~~~~~~~~
.. code:: yaml
algorithm:
gamma: 1.0
lam: 1.0
adv_estimator: gae
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
kl_coef: 0.005
- ``gemma``: discount factor
- ``lam``: Trade-off between bias and variance in the GAE estimator
- ``adv_estimator``: gae. Currently only supports gae, will support GRPO
in the future
- ``kl_penalty``\ Support ``kl``, ``abs``, ``mse`` and ``full``.How to
calculate the kl divergence between actor and reference policy. For
specific options, refer to `core_algos.py <https://github.com/volcengine/verl/blob/main/verl/trainer/ppo/core_algos.py#L192>`_ .
Trainer
~~~~~~~
.. code:: yaml
trainer:
total_epochs: 30
project_name: verl_examples
experiment_name: gsm8k
logger: ['console', 'tracking']
nnodes: 1
n_gpus_per_node: 8
save_freq: -1
test_freq: 2
critic_warmup: 0
default_hdfs_dir: ~/experiments/gsm8k/ppo/${trainer.experiment_name} # hdfs checkpoint path
default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} # local checkpoint path
- ``trainer.total_epochs``: Number of epochs in training.
- ``trainer.project_name``: For wandb
- ``trainer.experiment_name``: For wandb
- ``trainer.logger``: Support console and tracking. For tracking, we
will initialize a wandb
- ``trainer.nnodes``: Number of nodes used in the training.
- ``trainer.n_gpus_per_node``: Number of GPUs per node.
- ``trainer.save_freq``: The frequency (by iteration) to save checkpoint
of the actor and critic model.
- ``trainer.test_freq``: The validation frequency (by iteration).
- ``trainer.critic_warmup``: The number of iteration to train the critic
model before actual policy learning.

View File

@ -0,0 +1,165 @@
GSM8K Example
=============
Introduction
------------
In this example, we train an LLM to tackle the GSM8k task.
Paper: https://arxiv.org/pdf/2110.14168
Dataset: https://huggingface.co/datasets/gsm8k
Note that the original paper mainly focuses on training a verifier (a
reward model) to solve math problems via Best-of-N sampling. In this
example, we train an RLHF agent using a rule-based reward model.
Dataset Introduction
--------------------
GSM8k is a math problem dataset. The prompt is an elementary school
problem. The LLM model is required to answer the math problem.
The training set contains 7473 samples and the test set contains 1319
samples.
**An example**
Prompt
Katy makes coffee using teaspoons of sugar and cups of water in the
ratio of 7:13. If she used a total of 120 teaspoons of sugar and cups
of water, calculate the number of teaspoonfuls of sugar she used.
Solution
The total ratio representing the ingredients she used to make the
coffee is 7+13 = <<7+13=20>>20 Since the fraction representing the
number of teaspoons she used is 7/20, she used 7/20\ *120 =
<<7/20*\ 120=42>>42 #### 42
Step 1: Prepare dataset
-----------------------
.. code:: bash
cd examples/data_preprocess
python3 gsm8k.py --local_dir ~/data/gsm8k
Step 2: Download Model
----------------------
Therere three ways to prepare the model checkpoints for post-training:
- Download the required models from hugging face
.. code:: bash
huggingface-cli download deepseek-ai/deepseek-math-7b-instruct --local-dir ~/models/deepseek-math-7b-instruct --local-dir-use-symlinks False
- Already store your store model in the local directory or HDFS path.
- Also, you can directly use the model name in huggingface (e.g.,
deepseek-ai/deepseek-math-7b-instruct) in
``actor_rollout_ref.model.path`` and ``critic.model.path`` field in
the run script.
Noted that users should prepare checkpoints for actor, critic and reward
model.
[Optional] Step 3: SFT your Model
---------------------------------
We provide a SFT Trainer using PyTorch FSDP in
`fsdp_sft_trainer.py <https://github.com/volcengine/verl/blob/main/verl/trainer/fsdp_sft_trainer.py>`_.
Users can customize their own SFT
script using our FSDP SFT Trainer.
We also provide various training scripts for SFT on GSM8K dataset in `gsm8k sft directory <https://github.com/volcengine/verl/blob/main/examples/gsm8k/sft/>`_.
.. code:: shell
set -x
torchrun -m verl.trainer.fsdp_sft_trainer \
data.train_files=$HOME/data/gsm8k/train.parquet \
data.val_files=$HOME/data/gsm8k/test.parquet \
data.prompt_key=question \
data.response_key=answer \
data.micro_batch_size=8 \
model.partial_pretrain=deepseek-ai/deepseek-coder-6.7b-instruct \
trainer.default_hdfs_dir=hdfs://user/verl/experiments/gsm8k/deepseek-coder-6.7b-instruct/ \
trainer.project_name=gsm8k-sft \
trainer.experiment_name=gsm8k-sft-deepseek-coder-6.7b-instruct \
trainer.total_epochs=4 \
trainer.logger=['console','tracking']
Step 4: Perform PPO training with your model on GSM8K Dataset
-------------------------------------------------------------
- Prepare your own run.sh script. Heres an example for GSM8k dataset
and deepseek-llm-7b-chat model.
- Users could replace the ``data.train_files`` ,\ ``data.val_files``,
``actor_rollout_ref.model.path`` and ``critic.model.path`` based on
their environment.
- See :doc:`config` for detailed explaination of each config field.
**Reward Model/Function**
We use a rule-based reward model. We force the model to produce a final
answer following 4 “#” as shown in the solution. We extract the final
answer from both the solution and models output using regular
expression matching. We compare them and assign a reward of 1 to correct
answer, 0.1 to incorrect answer and 0 to no answer.
**Training Script**
The training script example for FSDP and Megatron-LM backend are stored in examples/ppo_trainer directory.
.. code:: bash
cd ../ppo_trainer
bash run_deepseek7b_llm.sh
The script of run_deepseek7b_llm.sh
.. code:: bash
set -x
python3 -m verl.trainer.main_ppo \
data.train_files=~/data/rlhf/gsm8k/train.parquet \
data.val_files=~/data/rlhf/gsm8k/test.parquet \
data.train_batch_size=1024 \
data.val_batch_size=1312 \
data.max_prompt_length=512 \
data.max_response_length=512 \
actor_rollout_ref.model.path=~/models/deepseek-llm-7b-chat \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=64 \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.grad_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.micro_batch_size=256 \
actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
critic.optim.lr=1e-5 \
critic.model.path=~/models/deepseek-llm-7b-chat \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=64 \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.grad_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
trainer.logger=['console','tracking'] \
trainer.project_name='verl_example_gsm8k' \
trainer.experiment_name='deepseek_llm_7b_function_rm' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.total_epochs=15

View File

@ -0,0 +1,207 @@
PPO Example Architecture
========================
Lets start with the Proximal Policy Optimization algorithm, which is
most widely used algorithm in LLM post-training.
The main entry point of the PPO algorithm example is:
`main_ppo.py <https://github.com/volcengine/verl/blob/main/verl/trainer/main_ppo.py>`_.
In this tutorial, we will go through the code architecture in `main_ppo.py <https://github.com/volcengine/verl/blob/main/verl/trainer/main_ppo.py>`_.
Define the data
---------------
Users need to preprocess and store the dataset in parquet files.
And we implement `RLHFDataset` to load and tokenize the parquet files.
For ``RLHFDataset`` (Default), at least 1 fields are required:
- ``prompt``: Contains the string prompt
We already provide some examples of processing the datasets to parquet
files in `data_preprocess directory <https://github.com/volcengine/verl/blob/main/examples/data_preprocess>`_. Currently, we support
preprocess of GSM8k, MATH, Hellasage, Full_hh_rlhf datasets. See :doc:`../preparation/prepare_data` for
more information.
Define the reward functions for different datasets
--------------------------------------------------
In this main entry point, the users only need to define their own reward
function based on the datasets (or applications) utilized in PPO
training.
For example, we already provide reward functions for `GSM8k <https://github.com/volcengine/verl/blob/main/verl/utils/reward_score/gsm8k.py>`_
and `MATH <https://github.com/volcengine/verl/blob/main/verl/utils/reward_score/math.py>`_
datasets in the ``_select_rm_score_fn``. In the ``RewardManager``, we
will compute the reward score based on the data_source to select
corresponding reward functions. For some RLHF datasets (e.g.,
full_hh_rlhf), the reward model is utilized to assess the responses
without any reward functions. In this case, the ``RewardManager`` will
return the ``rm_score`` computed by the reward model directly.
See `reward functions <https://github.com/volcengine/verl/blob/main/verl/utils/reward_score>`_ for detailed implementation.
Define worker classes
---------------------
.. code:: python
if config.actor_rollout_ref.actor.strategy == 'fsdp': # for FSDP backend
assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
from verl.trainer.ppo.workers.fsdp_workers import ActorRolloutRefWorker, CriticWorker
from single_controller.ray import RayWorkerGroup
ray_worker_group_cls = RayWorkerGroup
elif config.actor_rollout_ref.actor.strategy == 'megatron': # for Megatron backend
assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
from verl.trainer.ppo.workers.megatron_workers import ActorRolloutRefWorker, CriticWorker
from single_controller.ray.megatron import NVMegatronRayWorkerGroup
ray_worker_group_cls = NVMegatronRayWorkerGroup # Ray worker class for Megatron-LM
else:
raise NotImplementedError
from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role
role_worker_mapping = {
Role.ActorRollout: ActorRolloutRefWorker,
Role.Critic: CriticWorker,
Role.RefPolicy: ActorRolloutRefWorker
}
global_pool_id = 'global_pool'
resource_pool_spec = {
global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes,
}
mapping = {
Role.ActorRollout: global_pool_id,
Role.Critic: global_pool_id,
Role.RefPolicy: global_pool_id,
}
Step 1: Construct the mapping between roles and workers
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A role represents a group of workers in the same process. We have
pre-defined several roles in `ray_trainer.py <https://github.com/volcengine/verl/blob/main/verl/trainer/ppo/ray_trainer.py#L38>`_.
.. code:: python
class Role(Enum):
"""
To create more roles dynamically, you can subclass Role and add new members
"""
Actor = 0 # This worker only has Actor
Rollout = 1 # This worker only has Rollout
ActorRollout = 2 # This worker has both actor and rollout, it's a HybridEngine
Critic = 3 # This worker only has critic
RefPolicy = 4 # This worker only has reference policy
RewardModel = 5 # This worker only has reward model
ActorRolloutRef = 6 # This worker contains actor, rollout and reference policy simultaneously
Step 2: Define the worker class corresponding to this role
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- We have pre-implemented the ``ActorRolloutRefWorker``. Through
different configs, it can be a standalone actor, a standalone rollout,
an ActorRollout HybridEngine, or an ActorRolloutRef HybridEngine
- We also pre-implemented workers for ``Actor``, ``Rollout``,
``Critic``, ``Reward Model`` and ``Reference model`` on two different
backend: PyTorch FSDP
and Megatron-LM.
See `FSDP Workers <https://github.com/volcengine/verl/blob/main/verl/trainer/ppo/workers/fsdp_workers.py>`_
and `Megatron-LM Workers <https://github.com/volcengine/verl/blob/main/verl/trainer/ppo/workers/megatron_workers.py>`_
for more information.
Step 3: Define resource pool id and resource pool spec
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- Resource pool is a division of global GPU resources,
``resource_pool_spec`` is a dict, mapping from id to # of GPUs
- In the above example, we defined a global resource pool:
global_pool_id, and then put all roles on this one resource pool
with all the GPUs in this post-training task. This refers to
*co-locate* placement where all the models share the same set of
GPUs.
- See resource pool and placement for advance usage.
Defining reward model/function
------------------------------
.. code:: python
# we should adopt a multi-source reward function here
# - for rule-based rm, we directly call a reward score
# - for model-based rm, we call a model
# - for code related prompt, we send to a sandbox if there are test cases
# - finally, we combine all the rewards together
# - The reward type depends on the tag of the data
if config.reward_model.enable:
from verl.trainer.ppo.workers.fsdp_workers import RewardModelWorker
role_worker_mapping[Role.RewardModel] = RewardModelWorker
mapping[Role.RewardModel] = global_pool_id
reward_fn = RewardManager(tokenizer=tokenizer, num_examine=0)
# Note that we always use function-based RM for validation
val_reward_fn = RewardManager(tokenizer=tokenizer, num_examine=1)
resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
Since not all tasks use model-based RM, users need to define here
whether its a model-based RM or a function-based RM
- If its a model-based RM, directly add the ``RewardModel`` role in the
resource mapping and add it to the resource pool mapping.
- Note that the pre-defined ``RewardModelWorker`` only supports models
with the structure of huggingface
``AutoModelForSequenceClassification``. If its not this model, you
need to define your own RewardModelWorker in `FSDP Workers <https://github.com/volcengine/verl/blob/main/verl/trainer/ppo/workers/fsdp_workers.py>`_
and `Megatron-LM Workers <https://github.com/volcengine/verl/blob/main/verl/trainer/ppo/workers/megatron_workers.py>`_.
- If its a function-based RM, the users are required to classified the
reward function for each datasets.
.. code:: python
def _select_rm_score_fn(data_source):
if data_source == 'openai/gsm8k':
return gsm8k.compute_score
elif data_source == 'lighteval/MATH':
return math.compute_score
else:
raise NotImplementedError
See reward functions implemented in `directory <https://github.com/volcengine/verl/blob/main/verl/utils/reward_score/>`_
for more information.
Define, init and run the PPO Trainer
------------------------------------
.. code:: python
trainer = RayPPOTrainer(config=config,
tokenizer=tokenizer,
role_worker_mapping=role_worker_mapping,
resource_pool_manager=resource_pool_manager,
ray_worker_group_cls=ray_worker_group_cls,
reward_fn=reward_fn,
val_reward_fn=val_reward_fn)
trainer.init_workers()
trainer.fit()
- We first initialize the ``RayPPOTrainer`` with user config, tokenizer
and all the above worker mapping, resource pool, worker group and
reward functions
- We first call the ``trainer.init_workers()`` to initialize the models
on the allocated GPUs (in the resource pool)
- The actual PPO training will be executed in ``trainer.fit()``
veRL can be easily extended to other RL algorithms by reusing the Ray
model workers, resource pool and reward functions. See :doc:`extension<../advance/dpo_extension>` for
more information.
Details of the ``RayPPOTrainer`` is discussed in :doc:`Ray Trainer<../workers/ray_trainer>`.

73
docs/index.rst Normal file
View File

@ -0,0 +1,73 @@
Welcome to veRL/HybridFlow's documentation!
================================================
veRL (HybridFlow) is a flexible, efficient and industrial-level RL(HF) training framework designed for large language models (LLMs) Post-Training.
veRL is flexible and easy to use with:
- **Easy to support diverse RL(HF) algorithms**: The Hybrid programming model combines the strengths of single-controller and multi-controller paradigms to enable flexible representation and efficient execution of complex Post-Training dataflows. Allowing users to build RL dataflows in a few lines of code.
- **Seamless integration of existing LLM infra with modular API design**: Decouples computation and data dependencies, enabling seamless integration with existing LLM frameworks, such as PyTorch FSDP, Megatron-LM and vLLM. Moreover, users can easily extend to other LLM training and inference frameworks.
- **Flexible device mapping**: Supports various placement of models onto different sets of GPUs for efficient resource utilization and scalability across different cluster sizes.
- Readily integration with popular Hugging Face models
veRL is fast with:
- **State-of-the-art throughput**: By seamlessly integrating existing SOTA LLM training and inference frameworks, veRL achieves high generation and training throughput.
- **Efficient actor model resharding with 3D-HybridEngine**: Eliminates memory redundancy and significantly reduces communication overhead during transitions between training and generation phases.
--------------------------------------------
.. _Contents:
.. toctree::
:maxdepth: 5
:caption: Preparation
:titlesonly:
:numbered:
preparation/install
preparation/prepare_data
preparation/reward_function
.. toctree::
:maxdepth: 2
:caption: PPO Example
:titlesonly:
:numbered:
examples/ppo_code_architecture
examples/config
examples/gsm8k_example
.. toctree::
:maxdepth: 1
:caption: PPO Trainer and Workers
workers/ray_trainer
workers/fsdp_workers
workers/megatron_workers
.. toctree::
:maxdepth: 1
:caption: Advance Usage and Extension
advance/placement
advance/dpo_extension
advance/fsdp_extension
advance/megatron_extension
Contribution
-------------
veRL is free software; you can redistribute it and/or modify it under the terms
of the Apache License 2.0. We welcome contributions.
Join us on `GitHub <https://github.com/volcengine/verl>`_ .
.. and check out our
.. :doc:`contribution guidelines <contribute>`.

View File

@ -0,0 +1,81 @@
Installation
============
To install the veRL, we recommend using conda:
.. code:: bash
conda create -n verl python==3.9
conda activate verl
For installing the latest version of veRL, the best way is to clone and
install it from source. Then you can modify our code to customize your
own post-training jobs.
.. code:: bash
# install verl together with some lightweight dependencies in setup.py
git clone https://github.com/volcengine/verl.git
cd verl
pip3 install -e .
You can also install veRL using ``pip3 install``
.. code:: bash
# directly install from pypi
pip3 install verl
Dependencies
------------
veRL requires Python >= 3.9 and CUDA >= 12.1.
veRL support various backend, we currently release FSDP and Megatron-LM
for actor training and vLLM for rollout generation.
The following dependencies are required for all backends, PyTorch FSDP and Megatron-LM.
The pros, cons and extension guide for using PyTorch FSDP backend can be
found in :doc:`FSDP Workers<../workers/fsdp_workers>`.
.. code:: bash
# install torch [or you can skip this step and let vllm to install the correct version for you]
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121
# install vllm
pip3 install vllm==0.5.4
pip3 install ray==2.10 # other version may have bug
# flash attention 2
pip3 install flash-attn --no-build-isolation
For users who pursue better scalability, we recommend using Megatron-LM
backend. Please install the above dependencies first.
Currently, we support Megatron-LM\@core_v0.4.0 and we fix some internal
issues of Megatron-LM. Heres the additional installation guide.
The pros, cons and extension guide for using Megatron-LM backend can be
found in :doc:`Megatron-LM Workers<../workers/megatron_workers>`.
.. code:: bash
# FOR Megatron-LM Backend
# apex
pip3 install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
--config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" \
git+https://github.com/NVIDIA/apex
# transformer engine
pip3 install git+https://github.com/NVIDIA/TransformerEngine.git@v1.7
# megatron core v0.4.0
cd ..
git clone -b core_v0.4.0 https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
cp ../verl/patches/megatron_v4.patch .
git apply megatron_v4.patch
pip3 install -e .
export PYTHONPATH=$PYTHONPATH:$(pwd)

View File

@ -0,0 +1,126 @@
Prepare Data (Parquet) for Post-Training
========================================
Before starting the post-training job, we need to prepare the data for
the policy training. The data should be stored in the parquet format.
We provide several data preprocess scripts for different datasets,
including GSM8K, MATH, HelloSwag, Full_hh_rlhf. To prepare other datasets, we need
to follow the following steps: The data preprocess script can be divided
into two parts:
1. The first part is the common part, which loads the dataset from
huggingfaces ``datasets`` package. Then preprocess the datasets with
the ``make_map_fn`` and then store in the parquet format.
.. code:: python
import re
import os
import datasets
from verl.utils.hdfs_io import copy, makedirs
import argparse
# To extract the solution for each prompts in the dataset
# def extract_solution(solution_str):
# ...
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--local_dir', default='/opt/tiger/gsm8k')
parser.add_argument('--hdfs_dir', default='hdfs://haruna/home/byte_data_seed/lf_lq/user/zhangchi.usc1992/data/rlhf')
args = parser.parse_args()
num_few_shot = 5
data_source = 'openai/gsm8k'
dataset = datasets.load_dataset(data_source, 'main')
train_dataset = dataset['train']
test_dataset = dataset['test']
# Construct a `def make_map_fn(split)` for the corresponding datasets.
# ...
train_dataset = train_dataset.map(function=make_map_fn('train'), with_indices=True)
test_dataset = test_dataset.map(function=make_map_fn('test'), with_indices=True)
local_dir = args.local_dir
hdfs_dir = args.hdfs_dir
train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet'))
test_dataset.to_parquet(os.path.join(local_dir, 'test.parquet'))
makedirs(hdfs_dir)
copy(src=local_dir, dst=hdfs_dir)
2. The users are required to implement the ``make_map_fn()`` function
(as well as the ``extract_solution``) on their own to support
different datasets or tasks.
We already implemented the data preprocess of GSM8k, MATH, Hellaswag and Full_hh_rlhf
datasets. And we take the GSM8k dataset as an example:
**GSM8K**
In the ``make_map_fn``, each data field should consist of the following
5 fields:
1. ``data_source``: The name of the dataset. To index the corresponding
reward function in the ``RewardModule``
2. ``prompt``: This field should be constructed in the format of
huggingface chat_template. The tokenizer in ``RLHFDataset`` will
apply chat template and tokenize the prompt.
3. ``ability``: Define the task category.
4. ``reward_model``: Currently, we only utilize the ``ground_truth``
field during evaluation. The ``ground_truth`` is computed by the
``extract_solution`` function. **NOTED** that the implementation of
the corresponding reward function should align with this extracted
``ground_truth``.
5. ``extra_info``: Record some information of the current prompt. Not
use for now.
.. code:: python
def extract_solution(solution_str):
solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str) # extract the solution after ####
assert solution is not None
final_solution = solution.group(0)
final_solution = final_solution.split('#### ')[1].replace(',', '')
return final_solution
instruction_following = "Let's think step by step and output the final answer after \"####\"."
# add a row to each data item that represents a unique id
def make_map_fn(split):
def process_fn(example, idx):
question = example.pop('question')
question = question + ' ' + instruction_following
answer = example.pop('answer')
solution = extract_solution(answer)
data = {
"data_source": data_source,
"prompt": [{
"role": "user",
"content": question
}],
"ability": "math",
"reward_model": {
"style": "rule",
"ground_truth": solution
},
"extra_info": {
'split': split,
'index': idx
}
}
return data
return process_fn

View File

@ -0,0 +1,46 @@
Implment Reward Function for Dataset
=======================
For each dataset, we need to implement a reward function or utilize a reward model to compute the rewards for the generated responses.
We already pre-implemented some reward functions in `reward_score directory <https://github.com/volcengine/verl/blob/main/verl/utils/reward_score>`_.
Currently, we support reward functions for GSM8k and MATH datasets. For RLHF datasets (e.g.,
full_hh_rlhf) and Code Generation (e.g., APPS), we utilize reward model
and SandBox (will opensource soon) for evaluation respectively.
RewardManager
-------------
In the entrypoint of the PPO Post-Training script `main_ppo.py <https://github.com/volcengine/verl/blob/main/verl/trainer/main_ppo.py#L33>`_,
we implement a ``RewardManager`` that utilze pre-implemented reward functions to compute the scores for each response.
In the ``RewardManager``, we implemented a ``__call__`` function to
compute the score for each response.
All the reward functions are executed by ``compute_score_fn``.
The input is a ``DataProto``, which includes:
- ``input_ids``, ``attention_mask``: ``input_ids`` and ``attention_mask`` after applying
chat_template, including prompt and response
- ``responses``: response tokens
- ``ground_truth``: The ground truth string of the current prompt.
Stored in ``non_tensor_batch`` in the ``DataProto``, which should be
preprocessed in the parquet files.
- ``data_source``: The dataset name of the current prompt. Stored in
``non_tensor_batch`` in the ``DataProto``, which should be
preprocessed in the parquet files.
After detokenize the responses, the responses string and the ground
truth string will be input to the ``compute_score_fn`` to compute the
score for each response.
Reward Functions
----------------
We already pre-implemented some reward functions in `reward_score directory <https://github.com/volcengine/verl/blob/main/verl/utils/reward_score>`_.
- In the `GSM8k example <https://github.com/volcengine/verl/blob/main/verl/utils/reward_score/gsm8k.py>`_, we
force the response to output the final answer after four ####, then
use string matching to compare with the ground truth. If completely
correct, score 1 point; if the format is correct, score 0.1 points; if
the format is incorrect, score 0 points.
- In the `MATH example <https://github.com/volcengine/verl/blob/main/verl/utils/reward_score/math.py>`_, we follow
the implementation in `lm-evaluation-harness repository <https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/hendrycks_math/utils.py>`_.

View File

@ -0,0 +1,9 @@
# markdown suport
recommonmark
# markdown table suport
sphinx-markdown-tables
# theme default rtd
# crate-docs-theme
sphinx-rtd-theme

View File

@ -0,0 +1,142 @@
PyTorch FSDP Backend
============
We support PyTorch FSDP Backend by implementing various workers for
actor, critic, reference, rollout and reward models. We also implement
the ``FSDPVLLMShardingManager`` that reshard weight between FSDP and
vLLM in `fsdp_vllm.py <https://github.com/volcengine/verl/blob/main/verl/trainer/ppo/hybrid_engine/fsdp_vllm.py>`_.
**Pros**
- Readily support various models.
- Users only need to implement the corresponding
``dtensor_weight_loader`` for weight synchronization between FSDP
and vLLM. While for ``hf_weight_loader``, users can directly apply
any models supported both in HF and vLLM without any code change.
- Easy to organize the forward and backward computation for each model.
**Cons**
- Poor scalability when it comes to large-scale models (e.g. Llama 70B
and 405B)
- The resharding overhead between actor and rollout could be larger than
Megatron-LM backend.
Due to the simplicity, we recommend using FSDP backend for algorithm
research and prototyping.
FSDP Workers
------------
ActorRolloutRefWorker
^^^^^^^^^^^^^^^^^^^^^
Actor/Rollout HybridEngine
''''''''''''''''''''''''''
1. HybridEngine, Actor and Rollout initialization API.
.. code:: python
@register(dispatch_mode=Dispatch.ONE_TO_ALL)
def init_model(self):
``ONE_TO_ALL``: when calling the ``init_model`` function from the driver
process, each worker (on a GPU) will execute the following model
initialization process.
The initialization details of HybridEngine, Actor and Rollout are
highlighted below:
1. ``DataParallelPPOActor`` implements the simple PPO computation logics
when the model is built with FSDP, including compute log prob, model
update.
2. ``vLLMRollout`` support generation with vLLM. We modify the vLLM
Engine and make it executed under SPMD to fit into our
``WorkerGroup`` design.
3. ``FSDPVLLMShardingManager`` a context manager to perform actual
resharding between actor and rollout.
See `source code <https://github.com/volcengine/verl/blob/main/verl/trainer/ppo/workers/fsdp_workers.py#L42>`_. for more information.
1. Generate sequence and recompute log prob
.. code:: python
@register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
def generate_sequences(self, prompts: DataProto):
- ``Dispatch.DP_COMPUTE_PROTO``: The data will be dispatched and
collected along the DP dimension
- In this function, the rollout model will perform auto-regressive
generation and the actor model will recompute the old log prob for the
generetad response.
3. Update actor model
.. code:: python
@register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
def update_actor(self, data: DataProto):
- Update the actor model weight using PPO & entropy loss.
ReferenceModel
''''''''''''''
1. Reference model initialization
The reference model is initialized using the same function as the actor
model without initializing the HybridEngine and Optimizer. Then the
actor model is also wrapped by the ``DataParallelPPOActor``.
2. Compute reference log prob
.. code:: python
@register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
def compute_ref_log_prob(self, data: DataProto):
- In this function, the reference model will call the compute log prob
function in ``DataParallelPPOActor`` to compute the reference log
prob.
CriticWorker and RewardWorker
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1. Model initialization
Quite similar to reference model. The CriticWorker will perform
additional initialization for the Optimizer.
2. Compute Values for CriticWorker
.. code:: python
@register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
def compute_values(self, data: DataProto):
3. Update Critic
.. code:: python
@register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
def update_critic(self, data: DataProto):
4. Compute Reward
.. code:: python
@register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
def compute_rm_score(self, data: DataProto):
HybridShard
------------
We didnt support FSDP `HybridShard`. To support this, we may need to
construct a 2D device mesh and test the corresponding
``dtensor_weight_loader`` and ``hf_weight_loader`` for each model.

View File

@ -0,0 +1,200 @@
Megatron-LM Backend
================
We support Megatron Backend by implementing various workers for actor,
critic, reference, rollout and reward models. We also implement the
``3DHybridEngine`` using Megatron-LM and vLLM in `megatron_vllm.py <https://github.com/volcengine/verl/blob/main/verl/trainer/ppo/hybrid_engine/megatron_vllm.py>`_.
**Pros**
- Support 3D parallelism and sequence parallelism for best scalablility
and throughput.
- 3D HybridEngine can significantly reduce peak memory usage and reduce
weight synchronize overhead between actor and rollout.
**Cons**
- Users should implement their own models for Megatron-LM
- Users should implement the corresponding weight_loader to
- synchronize the model weight between actor (in Megatron) and rollout
(in vLLM).
- load weights from checkpoints to corresponding model in Megatron-LM
Megatron Workers
----------------
MegatronWorker
^^^^^^^^^^^^^^
``MegatronWorker`` is the base class of different megatron worker
classes. In this class, ``get_megatron_global_info`` and
``get_megatron_rank_info`` function to retrive the 3D parallel world
size and rank of each ``Worker`` running on specific GPU. These information
will be used in transfer protocol for Megatron Backend.
The following ``Worker`` class for different models will be utilized to
construct the ``WorkerGroup`` .
We implement various of APIs for each ``Worker`` class decorated by the
``@register(dispatch_mode=)`` . These APIs can be called by the ray
driver process. The data can be correctly collect and dispatch following
the ``dispatch_mode`` on each function. The supported dispatch_model
(i.e., transfer protocols) can be found in `decorator.py <https://github.com/volcengine/verl/blob/main/single_controller/base/decorator.py>`_.
ActorRolloutRefWorker
^^^^^^^^^^^^^^^^^^^^^
This class is implemented for Actor/Rollout HybridEngine or for the
reference model to initialize their model and perform computation.
Actor/Rollout HybridEngine
''''''''''''''''''''''''''
1. HybridEngine, Actor and Rollout initialization API.
.. code:: python
@register(dispatch_mode=Dispatch.ONE_TO_ALL)
def init_model(self):
``ONE_TO_ALL``: when calling the ``init_model`` function from the driver
process, each worker (on a GPU) will execute the following model
initialization process.
The initialization details of HybridEngine, Actor and Rollout are
highlighted below:
1. ``AllGatherPPModel`` holds memory buffer for both Actor and Rollout
and support weight resharding between actor and rollout.
2. ``MegatronPPOActor`` implements the simple PPO computation logics
when the model is built with Megatron, including compute log prob,
model update.
3. ``vLLMRollout`` support generation with vLLM. We modify the vLLM
Engine and make it executed under SPMD to fit into our
``WorkerGroup`` design.
4. ``MegatronVLLMShardingManager`` a context manager to perform actual
resharding between actor and rollout.
See `source code <https://github.com/volcengine/verl/blob/main/verl/trainer/ppo/workers/megatron_workers.py#L63>`_ for more information.
.. code:: python
# Initialize the 3D HybridEngine
hybrid_engine = AllGatherPPModel(model_provider=megatron_actor_model_provider)
# Fetch the model at current rank
actor_module = hybrid_engine.this_rank_models
...
# build actor model
self.actor = MegatronPPOActor(config=self.config.actor,
model_config=self.actor_model_config,
megatron_config=megatron_config,
actor_module=self.actor_module,
actor_optimizer=self.actor_optimizer,
actor_optimizer_config=self.actor_optim_config)
# build rollout
# rollout initialization
rollout = vLLMRollout(actor_module=params,
config=self.config.rollout,
tokenizer=self.tokenizer,
model_hf_config=self.actor_model_config,
train_tp=mpu.get_tensor_model_parallel_world_size())
# perform weight resharding between actor and rollout
sharding_manager = MegatronVLLMShardingManager(module=self.hybrid_engine,
inference_engine=rollout.inference_engine,
model_config=self.actor_model_config,
layer_name_mapping=layer_name_mapping)
...
2. Generate sequence and recompute log prob
.. code:: python
@register(dispatch_mode=Dispatch.MEGATRON_PP_AS_DP_PROTO)
def generate_sequences(self, prompts: DataProto):
- ``Dispatch.MEGATRON_PP_AS_DP_PROTO``: The PP dimension of the actor
model will be regarded as DP dimension. Then the driver process will
dispatch and collect the data according to this reorganization. This
is because, in HybridEngine, the actor weight, which usually applied
larger 3D parallel sizes, will be gathered along the PP dimension and
TP dimension. Therefore, the corresponding data should be dispatched
and collected through the 3D parallel group of the rollout model,
rather than the actor model. However, the world_size and rank
information can only be retrived from ``get_megatron_global_info`` and
``get_megatron_rank_info``, which records the 3D information for the
actor model. Moreover, the data resharding inside TP dimension will be
processed within the HybridEngine.
- In this function, the rollout model will perform auto-regressive
generation and the actor model will recompute the old log prob for the
generetad response.
3. Update actor model
.. code:: python
@register(dispatch_mode=Dispatch.MEGATRON_COMPUTE_PROTO)
def update_actor(self, data: DataProto):
- ``Dispatch.MEGATRON_COMPUTE_PROTO``: User passes the data partitioned
by DP dimension. The data is dispatched to all tp/pp ranks within the
same dp group, and ultimately only collects output data from tp=0 and
the last pp.
- Update the actor model weight using PPO & entropy loss.
ReferenceModel
''''''''''''''
1. Reference model initialization
The reference model is initialized using the same function as the actor
model without initializing the HybridEngine and Optimizer. Then the
actor model is also wrapped by the ``MegatronPPOActor``.
2. Compute reference log prob
.. code:: python
@register(dispatch_mode=Dispatch.MEGATRON_COMPUTE_PROTO)
def compute_ref_log_prob(self, data: DataProto):
- In this function, the reference model will call the compute log prob
function in ``MegatronPPOActor`` to compute the reference log prob.
CriticWorker and RewardWorker
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1. Model initialization
Quite similar to reference model. The CriticWorker will perform
additional initialization for the Optimizer.
2. Compute Values for CriticWorker
.. code:: python
@register(dispatch_mode=Dispatch.MEGATRON_COMPUTE_PROTO)
def compute_values(self, data: DataProto):
3. Update Critic
.. code:: python
@register(dispatch_mode=Dispatch.MEGATRON_COMPUTE_PROTO)
def update_critic(self, data: DataProto):
4. Compute Reward
.. code:: python
@register(dispatch_mode=Dispatch.MEGATRON_COMPUTE_PROTO)
def compute_rm_score(self, data: DataProto):
Context Parallel
----------------
This require the developer/contributor to implement the context parallel
both in Megatron-LM and models.

View File

@ -0,0 +1,243 @@
PPO Ray Trainer
===============
We implement the RayPPOTrainer, which is a trainer runs on the driver
process on a single CPU/GPU node (default is CPU).
The PPORayTrainer include 3 core functions for data preparation,
WorkerGroup initialization and PPO training loop.
Data Preparation
----------------
The ``PPORayTrainer``, as a single process, is responsible for loading a
complete batch of samples (prompts) from the dataset and then dispatch
to different worker_groups runnning on different GPUs.
To generalize the data loading, we implement the ``RLHFDataset`` class
to load the preprocessed parquet files, apply chat templates to the
prompts, add padding, truncate prompts that exceed max prompt length and
then tokenize.
.. code:: python
self.train_dataset = RLHFDataset(parquet_files=self.config.data.train_files,
tokenizer=self.tokenizer,
prompt_key=self.config.data.prompt_key,
max_prompt_length=self.config.data.max_prompt_length,
filter_prompts=True,
return_raw_chat=self.config.data.get('return_raw_chat', False),
truncation='error')
Then, the dataloader will iterate the dataset under PPO mini batch size.
WorkerGroup Initialization
--------------------------
We first introduce a basic implementation of initializing the
``WorkerGroup`` of the actor model on a given set of GPUs.
.. code:: python
# Due to the Ray issue, we can only support max_colocate_count=1 for now.
# This means that each GPU can only have one process.
# We can support max_colocate > 1 when applying this pull request: https://github.com/ray-project/ray/pull/44385
resource_pool = RayResourcePool(process_on_nodes=[config.trainer.n_gpus_per_node] * config.trainer.nnodes,
use_gpu=True,
max_colocate_count=1)
# define actor rollout cls to be init on remote
actor_rollout_cls = RayClassWithInitArgs(cls=ActorRolloutWorker)
# define actor_rollout worker group
actor_rollout_worker_group = MegatronRayWorkerGroup(resource_pool=resource_pool,
ray_cls_with_init=actor_rollout_cls,
default_megatron_kwargs=config.actor_rollout.megatron)
Different WorkerGroups, like ``actor_rollout_worker_group`` ,
``critic_worker_group`` and ``ref_worker_group`` lies on a separate
process in the above implementation.
The driver process can then call the distributed compute function within
the ``actor_rollout_worker_group`` and other roles to construct the RL
training loop.
For models colocated in the same set of GPUs, we further provide a
fine-grain optimization, which merge the ``worker_group`` of different roles
in the same process. This optimization can save the redundant
CUDA/distributed context in different processes.
.. code:: python
# initialize WorkerGroup
# NOTE: if you want to use a different resource pool for each role, which can support different parallel size,
# you should not use `create_colocated_worker_cls`. Instead, directly pass different resource pool to different worker groups.
# See TODO(url) for more information.
all_wg = {}
for resource_pool, class_dict in self.resource_pool_to_cls.items():
worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict)
wg_dict = self.ray_worker_group_cls(resource_pool=resource_pool, ray_cls_with_init=worker_dict_cls)
spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys())
all_wg.update(spawn_wg)
if self.use_critic:
self.critic_wg = all_wg['critic']
self.critic_wg.init_model()
if self.use_reference_policy:
self.ref_policy_wg = all_wg['ref']
self.ref_policy_wg.init_model()
if self.use_rm:
self.rm_wg = all_wg['rm']
self.rm_wg.init_model()
# we should create rollout at the end so that vllm can have a better estimation of kv cache memory
self.actor_rollout_wg = all_wg['actor_rollout']
self.actor_rollout_wg.init_model()
.. note:: For megatron backend, if we merge the ``worker_groups`` into the same processes, all the roles will utilize the same 3D parallel size. To optimize this, we may need to maintain several 3D process groups for each role in the same distributed context. If you want to use different 3D parallel size for different roles, please follow the similar architecture of the first code block to initialize each roles ``worker_group``
PPO Training Loop
-----------------
We implement the PPO training loop by calling the functions in
worker_group of each role. The input and output data of each function is
a ``DataProto`` object implemented in `protocol.py <https://github.com/volcengine/verl/blob/main/verl/protocol.py>`_. In the training
loop, trainer will dispatch/collect the data to/from different GPUs
following the transfer protocols wrapped in the workers functions. The
computation of PPO micro batches is processed in ``update_actor`` and
``update_critic`` functions.
To extend to other RLHF algorithms, such as DPO, GRPO, please refer to
:doc:`../advance/dpo_extension`.
.. code:: python
def fit(self):
"""
The training loop of PPO.
The driver process only need to call the compute functions of the worker group through RPC to construct the PPO dataflow.
The light-weight advantage computation is done on the driver process.
"""
from verl.utils.tracking import Tracking
from omegaconf import OmegaConf
logger = Tracking(project_name=self.config.trainer.project_name,
experiment_name=self.config.trainer.experiment_name,
default_backend=self.config.trainer.logger,
config=OmegaConf.to_container(self.config, resolve=True))
global_steps = 0
# perform validation before training
# currently, we only support validation using the reward_function.
if self.val_reward_fn is not None:
val_metrics = self._validate()
pprint(f'Initial validation metrics: {val_metrics}')
for epoch in range(self.config.trainer.total_epochs):
for batch_dict in self.train_dataloader:
metrics = {}
batch: DataProto = DataProto.from_single_dict(batch_dict)
# batch = batch.to('cuda')
# pop those keys for generation
gen_batch = batch.pop(batch_keys=['input_ids', 'attention_mask', 'position_ids'])
# generate a batch
with Timer(name='gen', logger=None) as timer:
gen_batch_output = self.actor_rollout_wg.generate_sequences(gen_batch)
metrics['timing/gen'] = timer.last
batch = batch.union(gen_batch_output)
if self.use_reference_policy:
# compute reference log_prob
with Timer(name='ref', logger=None) as timer:
ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch)
batch = batch.union(ref_log_prob)
metrics['timing/ref'] = timer.last
# compute values
with Timer(name='values', logger=None) as timer:
values = self.critic_wg.compute_values(batch)
batch = batch.union(values)
metrics['timing/values'] = timer.last
with Timer(name='adv', logger=None) as timer:
# compute scores. Support both model and function-based.
# We first compute the scores using reward model. Then, we call reward_fn to combine
# the results from reward model and rule-based results.
if self.use_rm:
# we first compute reward model score
reward_tensor = self.rm_wg.compute_rm_score(batch)
batch = batch.union(reward_tensor)
# we combine with rule-based rm
reward_tensor = self.reward_fn(batch)
batch.batch['token_level_scores'] = reward_tensor
# compute rewards. apply_kl_penalty if available
batch, kl_metrics = apply_kl_penalty(batch,
kl_ctrl=self.kl_ctrl,
kl_penalty=self.config.algorithm.kl_penalty)
metrics.update(kl_metrics)
# compute advantages, executed on the driver process
batch = compute_advantage(batch,
self.config.algorithm.gamma,
self.config.algorithm.lam,
adv_estimator=self.config.algorithm.adv_estimator)
metrics['timing/adv'] = timer.last
# update critic
if self.use_critic:
with Timer(name='update_critic', logger=None) as timer:
critic_output = self.critic_wg.update_critic(batch)
metrics['timing/update_critic'] = timer.last
critic_output_metrics = reduce_metrics(critic_output.meta_info['metrics'])
metrics.update(critic_output_metrics)
# implement critic warmup
if self.config.trainer.critic_warmup <= global_steps:
# update actor
with Timer(name='update_actor', logger=None) as timer:
actor_output = self.actor_rollout_wg.update_actor(batch)
metrics['timing/update_actor'] = timer.last
actor_output_metrics = reduce_metrics(actor_output.meta_info['metrics'])
metrics.update(actor_output_metrics)
# validate
if self.val_reward_fn is not None and (global_steps + 1) % self.config.trainer.test_freq == 0:
with Timer(name='testing', logger=None) as timer:
val_metrics: dict = self._validate()
val_metrics = {f'val/{key}': val for key, val in val_metrics.items()}
metrics['timing/testing'] = timer.last
metrics.update(val_metrics)
# collect metrics
data_metrics = compute_data_metrics(batch=batch)
metrics.update(data_metrics)
# TODO: make a canonical logger that supports various backend
logger.log(data=metrics, step=global_steps)
if self.config.trainer.save_freq > 0 and (global_steps + 1) % self.config.trainer.save_freq == 0:
actor_local_path = os.path.join(self.config.trainer.default_local_dir, 'actor',
f'global_step_{global_steps}')
actor_remote_path = os.path.join(self.config.trainer.default_hdfs_dir, 'actor')
self.actor_rollout_wg.save_checkpoint(actor_local_path, actor_remote_path)
if self.use_critic:
critic_local_path = os.path.join(self.config.trainer.default_local_dir, 'critic',
f'global_step_{global_steps}')
critic_remote_path = os.path.join(self.config.trainer.default_hdfs_dir, 'critic')
self.critic_wg.save_checkpoint(critic_local_path, critic_remote_path)
global_steps += 1
# perform validation after training
if self.val_reward_fn is not None:
val_metrics = self._validate()
pprint(f'Final validation metrics: {val_metrics}')

View File

@ -0,0 +1,146 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
- Preprocess data and split the training set into 75% for training RM and 25% for validting RM.
- All the training data is used to train SFT and RL.
- Both chosen and rejected is used to train SFT
"""
import argparse
import os
import pandas as pd
from datasets import load_dataset
from tqdm.auto import tqdm
from verl.utils.fs import copy, makedirs
def generate_sft_dataset(target_hdfs_path_dir, local_dir='~/data/full_hh_rlh/sft'):
dataset = load_dataset('Dahoas/full-hh-rlhf')
output = {'prompt': [], 'response': []}
for data in tqdm(dataset['train']):
# add chosen
output['prompt'].append(data['prompt'])
output['response'].append(data['chosen'])
# add rejection
output['prompt'].append(data['prompt'])
output['response'].append(data['rejected'])
df = pd.DataFrame(output)
local_dir = os.path.expanduser(local_dir)
os.makedirs(local_dir, exist_ok=True)
local_path = os.path.join(local_dir, 'train.parquet')
df.to_parquet(path=local_path)
if target_hdfs_path_dir is not None:
hdfs_dir = target_hdfs_path_dir + '/' + 'train.parquet'
makedirs(hdfs_dir)
copy(local_path, hdfs_dir)
def generate_rm_dataset(target_hdfs_path_dir, local_dir='~/data/full_hh_rlh/rm'):
train_dataset = load_dataset('Dahoas/full-hh-rlhf', split='train[:75%]')
test_dataset = load_dataset('Dahoas/full-hh-rlhf', split='train[-25%:]')
local_dir = os.path.expanduser(local_dir)
os.makedirs(local_dir, exist_ok=True)
for dataset, name in zip([train_dataset, test_dataset], ['train', 'test']):
output = {'prompt': [], 'chosen': [], 'rejected': []}
for data in tqdm(dataset):
# add chosen
output['prompt'].append(data['prompt'])
output['chosen'].append(data['chosen'])
output['rejected'].append(data['rejected'])
df = pd.DataFrame(output)
local_path = os.path.join(local_dir, name + '.parquet')
df.to_parquet(path=local_path)
if target_hdfs_path_dir is not None:
hdfs_dir = target_hdfs_path_dir + '/' + name + '.parquet'
makedirs(hdfs_dir)
copy(local_path, hdfs_dir)
def generate_rl_dataset(target_hdfs_path_dir, local_dir='~/data/full_hh_rlhf/rl'):
dataset = load_dataset('Dahoas/full-hh-rlhf')
train_dataset = dataset['train']
data_source = 'Dahoas/full-hh-rlhf'
# add a row to each data item that represents a unique id
def make_map_fn(split):
def process_fn(example, idx):
prompt = example.pop('prompt')
response = example.pop('response')
data = {
"data_source": data_source,
"prompt": [{
"role": "user",
"content": prompt
}],
"ability": "alignment",
"reward_model": {
"style": "model",
"ground_truth": response # should not be used
},
"extra_info": {
'split': split,
'index': idx
}
}
return data
return process_fn
train_dataset = train_dataset.map(function=make_map_fn('train'), with_indices=True)
local_dir = os.path.expanduser(local_dir)
local_path = os.path.join(local_dir, 'train.parquet')
train_dataset.to_parquet(local_path)
if target_hdfs_path_dir is not None:
hdfs_dir = target_hdfs_path_dir + '/' + 'train.parquet'
makedirs(hdfs_dir)
copy(local_path, hdfs_dir)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--split', type=str, choices=['sft', 'rm', 'rl'], required=True)
parser.add_argument('--local_dir', type=str, default='~/data/full_hh_rlhf')
parser.add_argument('--hdfs_dir', type=str, required=False, default=None)
args = parser.parse_args()
if args.split == 'sft':
generate_sft_dataset(args.hdfs_dir, os.path.join(args.local_dir, args.split))
elif args.split == 'rm':
generate_rm_dataset(args.hdfs_dir, os.path.join(args.local_dir, args.split))
elif args.split == 'rl':
generate_rl_dataset(args.hdfs_dir, os.path.join(args.local_dir, args.split))
else:
raise NotImplementedError

View File

@ -0,0 +1,93 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess the GSM8k dataset to parquet format
"""
import re
import os
import datasets
from verl.utils.hdfs_io import copy, makedirs
import argparse
def extract_solution(solution_str):
solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str)
assert solution is not None
final_solution = solution.group(0)
final_solution = final_solution.split('#### ')[1].replace(',', '')
return final_solution
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--local_dir', default='~/data/gsm8k')
parser.add_argument('--hdfs_dir', default=None)
args = parser.parse_args()
num_few_shot = 5
data_source = 'openai/gsm8k'
dataset = datasets.load_dataset(data_source, 'main')
train_dataset = dataset['train']
test_dataset = dataset['test']
instruction_following = "Let's think step by step and output the final answer after \"####\"."
# add a row to each data item that represents a unique id
def make_map_fn(split):
def process_fn(example, idx):
question = example.pop('question')
question = question + ' ' + instruction_following
answer = example.pop('answer')
solution = extract_solution(answer)
data = {
"data_source": data_source,
"prompt": [{
"role": "user",
"content": question
}],
"ability": "math",
"reward_model": {
"style": "rule",
"ground_truth": solution
},
"extra_info": {
'split': split,
'index': idx
}
}
return data
return process_fn
train_dataset = train_dataset.map(function=make_map_fn('train'), with_indices=True)
test_dataset = test_dataset.map(function=make_map_fn('test'), with_indices=True)
local_dir = args.local_dir
hdfs_dir = args.hdfs_dir
train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet'))
test_dataset.to_parquet(os.path.join(local_dir, 'test.parquet'))
if hdfs_dir is not None:
makedirs(hdfs_dir)
copy(src=local_dir, dst=hdfs_dir)

View File

@ -0,0 +1,102 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess Hellaswag dataset.
"""
import re
import os
import datasets
from verl.utils.hdfs_io import copy, makedirs
import argparse
def preprocess(text):
text = text.strip()
# NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
text = text.replace(" [title]", ". ")
text = re.sub("\\[.*?\\]", "", text)
text = text.replace(" ", " ")
return text
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--local_dir', default='/opt/tiger/hellaswag')
parser.add_argument('--hdfs_dir', default=None)
args = parser.parse_args()
data_source = 'Rowan/hellaswag'
dataset = datasets.load_dataset(data_source, trust_remote_code=True)
train_dataset = dataset['train']
val_dataset = dataset['validation']
test_dataset = dataset['test']
instruction = 'Please complete the following sentence.\n'
def make_map_fn(split):
def process_fn(doc, idx):
ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
query = preprocess(doc["activity_label"] + ": " + ctx)
choices = [preprocess(ending) for ending in doc["endings"]]
gold = int(doc["label"])
data = {
"data_source": data_source,
"prompt": [{
"role": "user",
"content": query
}],
"ability": "nlp",
"reward_model": {
"style": "model",
"eval": "multiple_choice", # using loglikelihood
"ground_truth": gold,
"choices": choices
},
"extra_info": {
'split': split,
'index': idx
}
}
return data
return process_fn
# filter data that doesn't have a label
train_dataset = train_dataset.filter(lambda x: len(x['label']) > 0)
val_dataset = val_dataset.filter(lambda x: len(x['label']) > 0)
test_dataset = test_dataset.filter(lambda x: len(x['label']) > 0)
train_dataset = train_dataset.map(function=make_map_fn('train'), with_indices=True)
val_dataset = val_dataset.map(function=make_map_fn('validation'), with_indices=True)
test_dataset = test_dataset.map(function=make_map_fn('test'), with_indices=True)
local_dir = args.local_dir
hdfs_dir = args.hdfs_dir
train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet'))
val_dataset.to_parquet(os.path.join(local_dir, 'validation.parquet'))
test_dataset.to_parquet(os.path.join(local_dir, 'test.parquet'))
if hdfs_dir is not None:
makedirs(hdfs_dir)
copy(src=local_dir, dst=hdfs_dir)

View File

@ -0,0 +1,89 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess the GSM8k dataset to parquet format
"""
import os
import datasets
from verl.utils.hdfs_io import copy, makedirs
import argparse
from verl.utils.reward_score.math import remove_boxed, last_boxed_only_string
def extract_solution(solution_str):
return remove_boxed(last_boxed_only_string(solution_str))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--local_dir', default='~/data/math')
parser.add_argument('--hdfs_dir', default=None)
args = parser.parse_args()
data_source = 'lighteval/MATH'
dataset = datasets.load_dataset(data_source, trust_remote_code=True)
train_dataset = dataset['train']
test_dataset = dataset['test']
instruction_following = "Let's think step by step and output the final answer within \\boxed{}."
# add a row to each data item that represents a unique id
def make_map_fn(split):
def process_fn(example, idx):
question = example.pop('problem')
question = question + ' ' + instruction_following
answer = example.pop('solution')
solution = extract_solution(answer)
data = {
"data_source": data_source,
"prompt": [{
"role": "user",
"content": question
}],
"ability": "math",
"reward_model": {
"style": "rule",
"ground_truth": solution
},
"extra_info": {
'split': split,
'index': idx
}
}
return data
return process_fn
train_dataset = train_dataset.map(function=make_map_fn('train'), with_indices=True)
test_dataset = test_dataset.map(function=make_map_fn('test'), with_indices=True)
local_dir = args.local_dir
hdfs_dir = args.hdfs_dir
train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet'))
test_dataset.to_parquet(os.path.join(local_dir, 'test.parquet'))
if hdfs_dir is not None:
makedirs(hdfs_dir)
copy(src=local_dir, dst=hdfs_dir)

View File

@ -0,0 +1,16 @@
python3 -m verl.trainer.main_generation \
trainer.nnodes=1 \
trainer.n_gpus_per_node=8 \
data.path=~/data/rlhf/gsm8k/test.parquet \
data.prompt_key=prompt \
data.n_samples=1 \
data.output_path=~/data/rlhf/math/deepseek_v2_lite_gen_test.parquet \
model.path=deepseek-ai/deepseek-llm-7b-chat \
+model.trust_remote_code=True \
rollout.temperature=1.0 \
rollout.top_k=50 \
rollout.top_p=0.7 \
rollout.prompt_length=2048 \
rollout.response_length=1024 \
rollout.tensor_model_parallel_size=2 \
rollout.gpu_memory_utilization=0.8

View File

@ -0,0 +1,38 @@
set -x
python3 -m verl.trainer.main_ppo \
data.train_files=$HOME/data/gsm8k/train.parquet \
data.val_files=$HOME/data/gsm8k/test.parquet \
data.train_batch_size=1024 \
data.val_batch_size=1312 \
data.max_prompt_length=512 \
data.max_response_length=512 \
actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=32 \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.grad_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
critic.optim.lr=1e-5 \
critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=32 \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.grad_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
trainer.logger=['console','tracking'] \
trainer.project_name='verl_example_gsm8k' \
trainer.experiment_name='deepseek_llm_7b_function_rm' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.total_epochs=15

View File

@ -0,0 +1,41 @@
set -x
train_files=$HOME/data/full_hh_rlhf/rl/train.parquet
test_files=$HOME/data/full_hh_rlhf/rl/train.parquet # no use
python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer'\
data.train_files="$train_files" \
data.val_files="$test_files" \
data.train_batch_size=512 \
data.val_batch_size=128 \
data.max_prompt_length=128 \
data.max_response_length=128 \
actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
actor_rollout_ref.actor.ppo_micro_batch_size=16 \
actor_rollout_ref.rollout.log_prob_micro_batch_size=16 \
actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.ref.log_prob_micro_batch_size=16 \
actor_rollout_ref.ref.param_offload=False \
critic.optim.lr=1e-5 \
critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=16 \
reward_model.enable=True \
reward_model.megatron.tensor_model_parallel_size=4 \
reward_model.model.path=deepseek-ai/deepseek-llm-7b-chat \
reward_model.micro_batch_size=16 \
reward_model.param_offload=False \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
trainer.logger=['console','tracking'] \
trainer.project_name='verl_megatron_full_hh_rlhf_examples' \
trainer.experiment_name='deepseek_llm_7b_model_rm' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=5 \
trainer.total_epochs=100

View File

@ -0,0 +1,40 @@
set -x
gsm8k_train_path=$HOME/data/gsm8k/train.parquet
gsm8k_test_path=$HOME/data/gsm8k/test.parquet
math_train_path=$HOME/data/math/train.parquet
math_test_path=$HOME/data/math/test.parquet
train_files="['$gsm8k_train_path', '$math_train_path']"
test_files="['$gsm8k_test_path', '$math_test_path']"
python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer'\
data.train_files="$train_files" \
data.val_files="$test_files" \
data.train_batch_size=1024 \
data.val_batch_size=6312 \
data.max_prompt_length=1024 \
data.max_response_length=512 \
actor_rollout_ref.model.path=deepseek-ai/deepseek-coder-6.7b-instruct \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=32 \
actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
critic.optim.lr=1e-5 \
critic.model.path=deepseek-ai/deepseek-coder-6.7b-instruct \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=32 \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
trainer.logger=['console','tracking'] \
trainer.project_name='verl_megatron_math_gsm8k_examples' \
trainer.experiment_name='deepseek_llm_7b_function_rm' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=5 \
trainer.total_epochs=100

View File

@ -0,0 +1,31 @@
set -x
python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer'\
data.train_files=$HOME/data/gsm8k/train.parquet \
data.val_files=$HOME/data/gsm8k/test.parquet \
data.train_batch_size=1024 \
data.val_batch_size=1312 \
data.max_prompt_length=512 \
data.max_response_length=512 \
actor_rollout_ref.model.path=deepseek-ai/deepseek-coder-6.7b-instruct \
actor_rollout_ref.actor.optim.lr=2e-6 \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=64 \
actor_rollout_ref.rollout.log_prob_micro_batch_size=64 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
critic.optim.lr=2e-5 \
critic.model.path=deepseek-ai/deepseek-coder-6.7b-instruct \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=64 \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
trainer.logger=['console','tracking'] \
trainer.project_name='verl_megatron_gsm8k_examples' \
trainer.experiment_name='deepseek_llm_7b_function_rm' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.total_epochs=15

View File

@ -0,0 +1,47 @@
set -x
gsm8k_train_path=$HOME/data/gsm8k/train.parquet
gsm8k_test_path=$HOME/data/gsm8k/test.parquet
math_train_path=$HOME/data/math/train.parquet
math_test_path=$HOME/data/math/test.parquet
train_files="['$gsm8k_train_path', '$math_train_path']"
test_files="['$gsm8k_test_path', '$math_test_path']"
python3 -m verl.trainer.main_ppo \
data.train_files="$train_files" \
data.val_files="$test_files" \
data.train_batch_size=1024 \
data.val_batch_size=6312 \
data.max_prompt_length=1024 \
data.max_response_length=512 \
actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=16 \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.grad_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size=16 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.ref.log_prob_micro_batch_size=16 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
critic.optim.lr=1e-5 \
critic.model.path=Qwen/Qwen2-7B-Instruct \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=16 \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.grad_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
trainer.logger=['console','tracking'] \
trainer.project_name='verl_example' \
trainer.experiment_name='Qwen2-7B-Instruct_function_rm' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=10 \
trainer.total_epochs=15

View File

@ -0,0 +1,54 @@
set -x
# Discliamer: the model used in the script is only for academic example,
gsm8k_train_path=$HOME/data/gsm8k/train.parquet
gsm8k_test_path=$HOME/data/gsm8k/test.parquet
math_train_path=$HOME/data/math/train.parquet
math_test_path=$HOME/data/math/test.parquet
train_files="['$gsm8k_train_path', '$math_train_path']"
test_files="['$gsm8k_test_path', '$math_test_path']"
python3 -m verl.trainer.main_ppo \
data.train_files="$train_files" \
data.val_files="$test_files" \
data.train_batch_size=1024 \
data.val_batch_size=6312 \
data.max_prompt_length=1024 \
data.max_response_length=512 \
data.return_raw_chat=True \
actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=16 \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.grad_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size=16 \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
actor_rollout_ref.ref.log_prob_micro_batch_size=16 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
critic.optim.lr=1e-5 \
critic.optim.lr_warmup_steps_ratio=0.05 \
critic.model.path=Qwen/Qwen2-7B-Instruct \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=16 \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.grad_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
reward_model.enable=True \
reward_model.model.path=sfairXC/FsfairX-Gemma2-RM-v0.1\
reward_model.model.fsdp_config.param_offload=True \
reward_model.micro_batch_size=16 \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
trainer.logger=['console','tracking'] \
trainer.project_name='verl_example' \
trainer.experiment_name='Qwen2-7B-Instruct_hybrid_rm' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=5 \
trainer.total_epochs=15

View File

@ -0,0 +1,48 @@
set -x
gsm8k_train_path=$HOME/data/gsm8k/train.parquet
gsm8k_test_path=$HOME/data/gsm8k/test.parquet
math_train_path=$HOME/data/math/train.parquet
math_test_path=$HOME/data/math/test.parquet
train_files="['$gsm8k_train_path', '$math_train_path']"
test_files="['$gsm8k_test_path', '$math_test_path']"
python3 -m verl.trainer.main_ppo \
data.train_files="$train_files" \
data.val_files="$test_files" \
data.train_batch_size=1024 \
data.val_batch_size=6304 \
data.max_prompt_length=1024 \
data.max_response_length=1024 \
actor_rollout_ref.model.path=Qwen/Qwen2.5-32B-Instruct \
actor_rollout_ref.model.enable_gradient_checkpointing=False \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=16 \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.grad_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \
actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
critic.optim.lr=1e-5 \
critic.model.path=Qwen/Qwen2.5-32B-Instruct \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=32 \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.grad_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
algorithm.kl_ctrl.kl_coef=0.0001 \
trainer.critic_warmup=0 \
trainer.logger=['console','tracking'] \
trainer.project_name='verl_example' \
trainer.experiment_name='Qwen2.5-32B-Instruct_function_rm' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=4 \
trainer.save_freq=-1 \
trainer.test_freq=10 \
trainer.total_epochs=15

949
examples/ray/tutorial.ipynb Normal file
View File

@ -0,0 +1,949 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "ed65145e-1dcc-4fad-85cd-8cc683fde91b",
"metadata": {},
"source": [
"# VeRL Ray API Tutorial"
]
},
{
"cell_type": "markdown",
"id": "5f90dfa8-3285-41e4-ba44-12abcb76b3ce",
"metadata": {
"tags": []
},
"source": [
"## Chapter 1: Ray Basics"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "cafc9be5-614b-4380-9b69-31525ea4e73a",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# turn off Megatron timer\n",
"os.environ['MEGATRON_USE_CUDA_TIMER'] = '0'\n",
"os.environ['MEGATRON_START_PROCESS_TIMER'] = 'False'\n",
"os.environ['NCCL_DEBUG'] = 'WARN'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "3fb65288-4410-43b4-9ffc-8538197ae039",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import ray\n",
"import torch"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "ca7f0778-f676-4f67-8c1f-9f6c8eee74e2",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-01-22 14:52:10,398\tINFO worker.py:1655 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32m127.0.0.1:8265 \u001b[39m\u001b[22m\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
" <div style=\"margin-left: 50px;display: flex;flex-direction: row;align-items: center\">\n",
" <h3 style=\"color: var(--jp-ui-font-color0)\">Ray</h3>\n",
" <svg version=\"1.1\" id=\"ray\" width=\"3em\" viewBox=\"0 0 144.5 144.6\" style=\"margin-left: 3em;margin-right: 3em\">\n",
" <g id=\"layer-1\">\n",
" <path fill=\"#00a2e9\" class=\"st0\" d=\"M97.3,77.2c-3.8-1.1-6.2,0.9-8.3,5.1c-3.5,6.8-9.9,9.9-17.4,9.6S58,88.1,54.8,81.2c-1.4-3-3-4-6.3-4.1\n",
" c-5.6-0.1-9.9,0.1-13.1,6.4c-3.8,7.6-13.6,10.2-21.8,7.6C5.2,88.4-0.4,80.5,0,71.7c0.1-8.4,5.7-15.8,13.8-18.2\n",
" c8.4-2.6,17.5,0.7,22.3,8c1.3,1.9,1.3,5.2,3.6,5.6c3.9,0.6,8,0.2,12,0.2c1.8,0,1.9-1.6,2.4-2.8c3.5-7.8,9.7-11.8,18-11.9\n",
" c8.2-0.1,14.4,3.9,17.8,11.4c1.3,2.8,2.9,3.6,5.7,3.3c1-0.1,2,0.1,3,0c2.8-0.5,6.4,1.7,8.1-2.7s-2.3-5.5-4.1-7.5\n",
" c-5.1-5.7-10.9-10.8-16.1-16.3C84,38,81.9,37.1,78,38.3C66.7,42,56.2,35.7,53,24.1C50.3,14,57.3,2.8,67.7,0.5\n",
" C78.4-2,89,4.7,91.5,15.3c0.1,0.3,0.1,0.5,0.2,0.8c0.7,3.4,0.7,6.9-0.8,9.8c-1.7,3.2-0.8,5,1.5,7.2c6.7,6.5,13.3,13,19.8,19.7\n",
" c1.8,1.8,3,2.1,5.5,1.2c9.1-3.4,17.9-0.6,23.4,7c4.8,6.9,4.6,16.1-0.4,22.9c-5.4,7.2-14.2,9.9-23.1,6.5c-2.3-0.9-3.5-0.6-5.1,1.1\n",
" c-6.7,6.9-13.6,13.7-20.5,20.4c-1.8,1.8-2.5,3.2-1.4,5.9c3.5,8.7,0.3,18.6-7.7,23.6c-7.9,5-18.2,3.8-24.8-2.9\n",
" c-6.4-6.4-7.4-16.2-2.5-24.3c4.9-7.8,14.5-11,23.1-7.8c3,1.1,4.7,0.5,6.9-1.7C91.7,98.4,98,92.3,104.2,86c1.6-1.6,4.1-2.7,2.6-6.2\n",
" c-1.4-3.3-3.8-2.5-6.2-2.6C99.8,77.2,98.9,77.2,97.3,77.2z M72.1,29.7c5.5,0.1,9.9-4.3,10-9.8c0-0.1,0-0.2,0-0.3\n",
" C81.8,14,77,9.8,71.5,10.2c-5,0.3-9,4.2-9.3,9.2c-0.2,5.5,4,10.1,9.5,10.3C71.8,29.7,72,29.7,72.1,29.7z M72.3,62.3\n",
" c-5.4-0.1-9.9,4.2-10.1,9.7c0,0.2,0,0.3,0,0.5c0.2,5.4,4.5,9.7,9.9,10c5.1,0.1,9.9-4.7,10.1-9.8c0.2-5.5-4-10-9.5-10.3\n",
" C72.6,62.3,72.4,62.3,72.3,62.3z M115,72.5c0.1,5.4,4.5,9.7,9.8,9.9c5.6-0.2,10-4.8,10-10.4c-0.2-5.4-4.6-9.7-10-9.7\n",
" c-5.3-0.1-9.8,4.2-9.9,9.5C115,72.1,115,72.3,115,72.5z M19.5,62.3c-5.4,0.1-9.8,4.4-10,9.8c-0.1,5.1,5.2,10.4,10.2,10.3\n",
" c5.6-0.2,10-4.9,9.8-10.5c-0.1-5.4-4.5-9.7-9.9-9.6C19.6,62.3,19.5,62.3,19.5,62.3z M71.8,134.6c5.9,0.2,10.3-3.9,10.4-9.6\n",
" c0.5-5.5-3.6-10.4-9.1-10.8c-5.5-0.5-10.4,3.6-10.8,9.1c0,0.5,0,0.9,0,1.4c-0.2,5.3,4,9.8,9.3,10\n",
" C71.6,134.6,71.7,134.6,71.8,134.6z\"/>\n",
" </g>\n",
" </svg>\n",
" <table>\n",
" <tr>\n",
" <td style=\"text-align: left\"><b>Python version:</b></td>\n",
" <td style=\"text-align: left\"><b>3.9.2</b></td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left\"><b>Ray version:</b></td>\n",
" <td style=\"text-align: left\"><b> 2.3.0</b></td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left\"><b>Dashboard:</b></td>\n",
" <td style=\"text-align: left\"><b><a href=\"http://127.0.0.1:8265\" target=\"_blank\">http://127.0.0.1:8265</a></b></td>\n",
"</tr>\n",
"\n",
" </table>\n",
" </div>\n",
"</div>\n"
],
"text/plain": [
"RayContext(dashboard_url='127.0.0.1:8265', python_version='3.9.2', ray_version='2.3.0', ray_commit='{{RAY_COMMIT_SHA}}', address_info={'node_ip_address': '10.122.229.29', 'raylet_ip_address': '10.122.229.29', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2024-01-22_14-52-08_389091_1470202/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2024-01-22_14-52-08_389091_1470202/sockets/raylet', 'webui_url': '127.0.0.1:8265', 'session_dir': '/tmp/ray/session_2024-01-22_14-52-08_389091_1470202', 'metrics_export_port': 46547, 'gcs_address': '10.122.229.29:57281', 'address': '10.122.229.29:57281', 'dashboard_agent_listen_port': 52365, 'node_id': 'ec81d96ec29985ce0b98441aabb46fc8dcb49b0e1dec1088f973977f'})"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(GPUAccumulator pid=1489193)\u001b[0m rank 0, value: tensor([1.], device='cuda:0')\n",
"\u001b[2m\u001b[36m(GPUAccumulator pid=1489357)\u001b[0m rank 2, value: tensor([3.], device='cuda:0')\n",
"\u001b[2m\u001b[36m(GPUAccumulator pid=1489358)\u001b[0m rank 3, value: tensor([4.], device='cuda:0')\n",
"\u001b[2m\u001b[36m(GPUAccumulator pid=1489356)\u001b[0m rank 1, value: tensor([2.], device='cuda:0')\n",
"\u001b[2m\u001b[36m(GPUAccumulator pid=1489837)\u001b[0m rank 0, value: tensor([2.], device='cuda:0')\n",
"\u001b[2m\u001b[36m(GPUAccumulator pid=1489999)\u001b[0m rank 3, value: tensor([5.], device='cuda:0')\n",
"\u001b[2m\u001b[36m(GPUAccumulator pid=1489998)\u001b[0m rank 2, value: tensor([4.], device='cuda:0')\n",
"\u001b[2m\u001b[36m(GPUAccumulator pid=1489997)\u001b[0m rank 1, value: tensor([3.], device='cuda:0')\n",
"\u001b[2m\u001b[36m(GPUAccumulator pid=1490000)\u001b[0m rank 0, value: tensor([3.], device='cuda:0')\n",
"\u001b[2m\u001b[36m(GPUAccumulator pid=1490633)\u001b[0m rank 3, value: tensor([6.], device='cuda:0')\n",
"\u001b[2m\u001b[36m(GPUAccumulator pid=1490635)\u001b[0m rank 5, value: tensor([8.], device='cuda:0')\n",
"\u001b[2m\u001b[36m(GPUAccumulator pid=1490636)\u001b[0m rank 6, value: tensor([9.], device='cuda:0')\n",
"\u001b[2m\u001b[36m(GPUAccumulator pid=1490631)\u001b[0m rank 1, value: tensor([4.], device='cuda:0')\n",
"\u001b[2m\u001b[36m(GPUAccumulator pid=1490637)\u001b[0m rank 7, value: tensor([10.], device='cuda:0')\n",
"\u001b[2m\u001b[36m(GPUAccumulator pid=1490634)\u001b[0m rank 4, value: tensor([7.], device='cuda:0')\n",
"\u001b[2m\u001b[36m(GPUAccumulator pid=1490632)\u001b[0m rank 2, value: tensor([5.], device='cuda:0')\n",
"\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491761)\u001b[0m 10\n",
"\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491761)\u001b[0m rank 0, value: tensor([10.], device='cuda:0')\n",
"\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491923)\u001b[0m 10\n",
"\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491922)\u001b[0m 10\n",
"\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491921)\u001b[0m 10\n",
"\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491924)\u001b[0m 10\n",
"\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491919)\u001b[0m 10\n",
"\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491925)\u001b[0m 10\n",
"\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491920)\u001b[0m 10\n",
"\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491923)\u001b[0m rank 5, value: tensor([15.], device='cuda:0')\n",
"\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491921)\u001b[0m rank 3, value: tensor([13.], device='cuda:0')\n",
"\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491924)\u001b[0m rank 7, value: tensor([17.], device='cuda:0')\n",
"\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491925)\u001b[0m rank 6, value: tensor([16.], device='cuda:0')\n",
"\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491920)\u001b[0m rank 2, value: tensor([12.], device='cuda:0')\n",
"\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491919)\u001b[0m rank 1, value: tensor([11.], device='cuda:0')\n",
"\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491922)\u001b[0m rank 4, value: tensor([14.], device='cuda:0')\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m WARNING: overriding default argument timing_log_level:2 with timing_log_level:0\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m WARNING: overriding default argument tensor_model_parallel_size:1 with tensor_model_parallel_size:4\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m WARNING: overriding default argument use_distributed_optimizer:False with use_distributed_optimizer:True\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m WARNING: overriding default argument weight_decay:0.01 with weight_decay:0.0\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m WARNING: overriding default argument sgd_momentum:0.9 with sgd_momentum:0.0\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m WARNING: overriding default argument bf16:False with bf16:True\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m WARNING: overriding default argument clip_grad:1.0 with clip_grad:0.0\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m WARNING: overriding default argument sequence_parallel:False with sequence_parallel:True\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m using world size: 4, data-parallel-size: 1, tensor-model-parallel size: 4, pipeline-model-parallel size: 1 \n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m setting global batch size to 1\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m accumulate gradients in fp32 and all-reduce gradients in fp32 for bfloat16 data type. \n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m using torch.bfloat16 for parameters ...\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m setting number of micro-batches to constant 1\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [2024-01-22 14:53:47] > initializing torch distributed ...\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group1]\tnew_group dp\thigh_stream False\tranks: [0]\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group2]\tnew_group dp\thigh_stream False\tranks: [1]\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group3]\tnew_group dp\thigh_stream False\tranks: [2]\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group4]\tnew_group dp\thigh_stream False\tranks: [3]\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group5]\tnew_group mp\thigh_stream False\tranks: [0, 1, 2, 3]\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group6]\tnew_group tp\thigh_stream False\tranks: [0, 1, 2, 3]\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group8]\tnew_group pp\thigh_stream False\tranks: [0]\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group9]\tnew_group emb\thigh_stream False\tranks: [0]\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group10]\tnew_group posemb\thigh_stream False\tranks: [0]\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group11]\tnew_group pp\thigh_stream False\tranks: [1]\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group12]\tnew_group emb\thigh_stream False\tranks: [1]\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group13]\tnew_group posemb\thigh_stream False\tranks: [1]\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group14]\tnew_group pp\thigh_stream False\tranks: [2]\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group15]\tnew_group emb\thigh_stream False\tranks: [2]\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group16]\tnew_group posemb\thigh_stream False\tranks: [2]\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group17]\tnew_group pp\thigh_stream False\tranks: [3]\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group18]\tnew_group emb\thigh_stream False\tranks: [3]\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group19]\tnew_group posemb\thigh_stream False\tranks: [3]\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m NCCL version 2.18.6+cuda12.1\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m this torch version does not have hires nccl profiling. nccl profiling is not enabled this time. this is only a kindly notice, not a error message\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [2024-01-22 14:53:52] > initialized tensor model parallel with size 4\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [2024-01-22 14:53:52] > initialized pipeline model parallel with size 1\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m > setting random seeds to 1234 ...\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m > compiling dataset index builder ...\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m already compiled, skip compiling\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m >>> done with dataset index builder. Compilation time: 0.005 seconds\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m > compiling and loading fused kernels ...\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m will compile object fused_mix_prec_layer_norm_cuda with extra cuda flags ['-maxrregcount=50', '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__']\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [2024-01-22 14:53:52] >>> done with compiling and loading fused kernels. Compilation time: 0.039 seconds\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493654)\u001b[0m this torch version does not have hires nccl profiling. nccl profiling is not enabled this time. this is only a kindly notice, not a error message\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493654)\u001b[0m > compiling and loading fused kernels ...\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493654)\u001b[0m will compile object fused_mix_prec_layer_norm_cuda with extra cuda flags ['-maxrregcount=50', '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__']\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493654)\u001b[0m [2024-01-22 14:53:52] >>> done with compiling and loading fused kernels. Compilation time: 0.044 seconds\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493653)\u001b[0m this torch version does not have hires nccl profiling. nccl profiling is not enabled this time. this is only a kindly notice, not a error message\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493653)\u001b[0m > compiling and loading fused kernels ...\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493653)\u001b[0m will compile object fused_mix_prec_layer_norm_cuda with extra cuda flags ['-maxrregcount=50', '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__']\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493653)\u001b[0m [2024-01-22 14:53:52] >>> done with compiling and loading fused kernels. Compilation time: 0.044 seconds\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493652)\u001b[0m this torch version does not have hires nccl profiling. nccl profiling is not enabled this time. this is only a kindly notice, not a error message\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493652)\u001b[0m > compiling and loading fused kernels ...\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493652)\u001b[0m will compile object fused_mix_prec_layer_norm_cuda with extra cuda flags ['-maxrregcount=50', '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__']\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493652)\u001b[0m [2024-01-22 14:53:52] >>> done with compiling and loading fused kernels. Compilation time: 0.045 seconds\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m No modifications detected for re-loaded extension module fused_mix_prec_layer_norm_cuda, skipping build step...\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m Loading extension module fused_mix_prec_layer_norm_cuda...\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493654)\u001b[0m No modifications detected for re-loaded extension module fused_mix_prec_layer_norm_cuda, skipping build step...\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493654)\u001b[0m Loading extension module fused_mix_prec_layer_norm_cuda...\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493653)\u001b[0m No modifications detected for re-loaded extension module fused_mix_prec_layer_norm_cuda, skipping build step...\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493653)\u001b[0m Loading extension module fused_mix_prec_layer_norm_cuda...\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493652)\u001b[0m No modifications detected for re-loaded extension module fused_mix_prec_layer_norm_cuda, skipping build step...\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493652)\u001b[0m Loading extension module fused_mix_prec_layer_norm_cuda...\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m Using `is_flash_attn_available` is deprecated and will be removed in v4.38. Please use `is_flash_attn_2_available` instead.\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493654)\u001b[0m Using `is_flash_attn_available` is deprecated and will be removed in v4.38. Please use `is_flash_attn_2_available` instead.\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493653)\u001b[0m Using `is_flash_attn_available` is deprecated and will be removed in v4.38. Please use `is_flash_attn_2_available` instead.\n",
"\u001b[2m\u001b[36m(MLPLayerWorker pid=1493652)\u001b[0m Using `is_flash_attn_available` is deprecated and will be removed in v4.38. Please use `is_flash_attn_2_available` instead.\n"
]
}
],
"source": [
"# Build a local ray cluster. The head node and worker node are on this machine\n",
"ray.init()"
]
},
{
"cell_type": "markdown",
"id": "a0d35eb2-cf28-411a-b39f-f45e97c2edba",
"metadata": {
"tags": []
},
"source": [
"Implement an Accumulator class."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "86c98e9c-26a5-44d0-bbe9-064f8809708c",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"@ray.remote\n",
"class Accumulator:\n",
" def __init__(self):\n",
" self.value = 0\n",
" \n",
" def add(self, x):\n",
" self.value += x\n",
" \n",
" def get_value(self):\n",
" return self.value"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "9d5aa406-23c3-4270-a6f8-d4af20769ed9",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Instantiate an accumulator. Accumulator can be viewed as a process, acting as an RPC service.\n",
"accumulator = Accumulator.remote()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "56850d3a-3803-4395-a73e-a27e99dc6905",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n"
]
}
],
"source": [
"value_ref = accumulator.get_value.remote() # Check the current value. Note that this function returns immediately and does not actually wait for the remote execution to complete.\n",
"# Get the value\n",
"value = ray.get(value_ref)\n",
"print(value)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "932807f1-792b-42af-aa44-6a6202718919",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10\n"
]
}
],
"source": [
"# Accumulate, then check the result.\n",
"accumulator.add.remote(10) # Similarly, the 'add' here will return immediately.\n",
"new_value = ray.get(accumulator.get_value.remote())\n",
"print(new_value)"
]
},
{
"cell_type": "markdown",
"id": "ca31e450-e02c-44d0-86a1-a3f9eeadd311",
"metadata": {},
"source": [
"## Chapter 2: Resource Pool and RayWorkerGroup\n",
"In the previous example, it was a simple single-process worker. \n",
"In this example, we implement a worker with a GPU and form a RayWorkerGroup. Within this RayWorkerGroup, we implement a simple operation of an accumulator."
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "84727b3d-1351-4550-9009-48526d18dc2c",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from single_controller.ray.base import RayResourcePool, RayClassWithInitArgs, RayWorkerGroup, merge_resource_pool\n",
"from single_controller.base import Worker"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "aab5eeb7-6485-4911-94cc-3f5ca52b6634",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"resource_pool = RayResourcePool([4], use_gpu=True)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "54e4d531-ca50-43e9-ab79-d24bb66ffe0e",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"@ray.remote\n",
"class GPUAccumulator(Worker):\n",
"\n",
" def __init__(self) -> None:\n",
" super().__init__()\n",
" # The initial value of each rank is the same as the rank\n",
" self.value = torch.zeros(size=(1,), device='cuda') + self.rank\n",
"\n",
" def add(self, x):\n",
" self.value += x\n",
" print(f'rank {self.rank}, value: {self.value}')\n",
" return self.value.cpu()\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "73e0658b-a6a7-4a86-a53f-5442c8cd56fe",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[tensor([1.]), tensor([2.]), tensor([3.]), tensor([4.])]\n"
]
}
],
"source": [
"# Each worker's initial value is its rank, and then each rank's value is incremented by 1, so the values obtained on each rank are [1, 2, 3, 4]\n",
"class_with_args = RayClassWithInitArgs(cls=GPUAccumulator)\n",
"worker_group = RayWorkerGroup(resource_pool, class_with_args)\n",
"print(worker_group.execute_all_sync('add', x=[1,1,1,1]))"
]
},
{
"cell_type": "markdown",
"id": "e36f73e8-4f38-459f-a09b-f6ffb95fd42b",
"metadata": {},
"source": [
"The principle of parameter passing: The input parameter is a list of length world_size, where each element in the list is dispatched respectively to each worker in the RayWorkerGroup. \n",
"The return parameter is also a list, corresponding to the return value of each worker."
]
},
{
"cell_type": "markdown",
"id": "3873383b-e89d-4283-ae3e-af367e6160a5",
"metadata": {},
"source": [
"### GPU Resource Sharing"
]
},
{
"cell_type": "markdown",
"id": "cf62933e-143e-4a15-90c3-3d0d0568d89c",
"metadata": {
"tags": []
},
"source": [
"RayWorkerGroups mapped to the same resource pool share the GPU. In this example, we implement three resource pools: the first occupies 4 GPUs, the second also occupies 4 GPUs, and the last occupies all 8 GPUs. Among them, the first resource pool reuses the resource pool mentioned above."
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "100e75f1-6f2b-44b6-b4e1-f703eae8e5bf",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Create a new resource pool and then merge the newly created resource pool with the previous one.\n",
"resource_pool_1 = RayResourcePool([4], use_gpu=True, name_prefix='a')\n",
"resource_pool_merge = merge_resource_pool(resource_pool, resource_pool_1)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "59393509-037b-475a-800a-5e6de9ea7b66",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Establish a RayWorkerGroup on the newly created resource pool.\n",
"worker_group_1 = RayWorkerGroup(resource_pool_1, class_with_args)\n",
"worker_group_merge = RayWorkerGroup(resource_pool_merge, class_with_args)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "558dc5bf-75aa-4675-b8de-99944e088522",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[tensor([2.]), tensor([3.]), tensor([4.]), tensor([5.])]\n"
]
}
],
"source": [
"# Run 'add' on the second set of 4 GPUs; the result should be [2, 3, 4, 5].\n",
"output_1 = worker_group_1.execute_all_sync('add', x=[2,2,2,2])\n",
"print(output_1)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "30228667-f8ef-448c-a2cc-44241ede86d9",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[tensor([3.]), tensor([4.]), tensor([5.]), tensor([6.]), tensor([7.]), tensor([8.]), tensor([9.]), tensor([10.])]\n"
]
}
],
"source": [
"# Run 'add' on the merged set of 8 GPUs; the result should be [3, 4, 5, 6, 7, 8, 9, 10].\n",
"output_merge = worker_group_merge.execute_all_sync('add', x=[3,3,3,3,3,3,3,3])\n",
"print(output_merge)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "041329de-0e27-4e31-ac11-f0dd54f4f4b3",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4 4 8\n"
]
}
],
"source": [
"print(worker_group.world_size, worker_group_1.world_size, worker_group_merge.world_size)"
]
},
{
"cell_type": "markdown",
"id": "8cb9fc59-6035-4b72-9863-67bd4d48f3fe",
"metadata": {},
"source": [
"## Chapter 3: Data Dispatch, Execution and Collection"
]
},
{
"cell_type": "markdown",
"id": "0e1fedea-16d1-4dec-b269-c402747e11e8",
"metadata": {},
"source": [
"In the above example, we used the `execute_all_sync` function in the RayWorkerGroup to dispatch data from the driver to each worker. This is very inconvenient for coding. \n",
"In this chapter, we use the form of function decorators to allow RayWorkerGroup to directly call functions written in the Worker, and to greatly simplify parameter passing."
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "c70d1b33-030a-4681-aeb2-16ab48b6445b",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from single_controller.ray.decorator import register, Dispatch, Execute"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "bb6b76c1-8caf-4749-b20d-c3f842751aa4",
"metadata": {},
"outputs": [],
"source": [
"@ray.remote\n",
"class GPUAccumulatorDecorator(Worker):\n",
"\n",
" def __init__(self) -> None:\n",
" super().__init__()\n",
" # The initial value of each rank is the same as the rank\n",
" self.value = torch.zeros(size=(1,), device='cuda') + self.rank\n",
" \n",
" # map from a single input to all the worker\n",
" @register(Dispatch.ONE_TO_ALL)\n",
" def add(self, x):\n",
" print(x)\n",
" self.value = self.value + x\n",
" print(f'rank {self.rank}, value: {self.value}')\n",
" return self.value.cpu()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "2d2bb460-4e9d-4fc5-a767-e7c8b9a4d7fe",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"class_with_args = RayClassWithInitArgs(cls=GPUAccumulatorDecorator)\n",
"gpu_accumulator_decorator = RayWorkerGroup(resource_pool_merge, class_with_args)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "e4db17f7-8896-4ea6-a732-2146400ff2da",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[tensor([10.]), tensor([11.]), tensor([12.]), tensor([13.]), tensor([14.]), tensor([15.]), tensor([16.]), tensor([17.])]\n"
]
}
],
"source": [
"# As we can see, 10 is automatically dispatched to each Worker in this RayWorkerGroup.\n",
"print(gpu_accumulator_decorator.add(x=10))"
]
},
{
"cell_type": "markdown",
"id": "f8b54ccb-0064-4d06-b74b-fbf39f8c5faf",
"metadata": {},
"source": [
"### Custom Dispatch, Collection\n",
"Users can customize `dispatch` and `collection` function. You only need to write the `dispatch_fn` and `collect_fn` functions yourself. We also support executing RPC only on rank_zero, with specific examples provided below."
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "9ad68e0f-372e-4792-b4db-c2406f211113",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from single_controller.ray.decorator import register, Dispatch, collect_all_to_all, Execute"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "74f91242-3176-4ea8-adce-104a58d21874",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"def two_to_all_dispatch_fn(worker_group, *args, **kwargs):\n",
" \"\"\"\n",
" Assume the input is a list of 2. Duplicate the input interleaved and pass to each worker.\n",
" \"\"\"\n",
" for arg in args:\n",
" assert len(arg) == 2\n",
" for i in range(worker_group.world_size - 2):\n",
" arg.append(arg[i % 2])\n",
" for k, v in kwargs.items():\n",
" assert len(v) == 2\n",
" for i in range(worker_group.world_size - 2):\n",
" v.append(v[i % 2])\n",
" return args, kwargs\n",
"\n",
"\n",
"@ray.remote\n",
"class TestActor(Worker):\n",
" # TODO: pass *args and **kwargs is bug prone and not very convincing\n",
" def __init__(self, x) -> None:\n",
" super().__init__()\n",
" self._x = x\n",
"\n",
" def foo(self, y):\n",
" return self._x + y\n",
"\n",
" @register(dispatch_mode=Dispatch.ALL_TO_ALL, execute_mode=Execute.RANK_ZERO)\n",
" def foo_rank_zero(self, x, y):\n",
" return self._x + y + x\n",
"\n",
" @register(dispatch_mode={'dispatch_fn': two_to_all_dispatch_fn, 'collect_fn': collect_all_to_all})\n",
" def foo_custom(self, x, y):\n",
" return self._x + y + x"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "00f386a8-c5f7-49c4-87ec-6e2ab3547bf1",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"class_with_args = RayClassWithInitArgs(cls=TestActor, x=2)\n",
"worker_group = RayWorkerGroup(resource_pool, class_with_args)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "45ae935c-4f92-4c28-9e35-add439dcd2a0",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"output_ref = worker_group.foo_custom(x=[1, 2], y=[5, 6])\n",
"assert output_ref == [8, 10, 8, 10]\n",
"\n",
"output_ref = worker_group.foo_rank_zero(x=1, y=2)\n",
"assert output_ref == 5"
]
},
{
"cell_type": "markdown",
"id": "165ae826-7b20-4305-a5c6-d7f236eab410",
"metadata": {},
"source": [
"## Chapter 4: MegatronRayWorkerGroup"
]
},
{
"cell_type": "markdown",
"id": "d0855a6b-41d7-41bb-95e6-d9395b175d81",
"metadata": {},
"source": [
"Finally, we implement a `MegatronRayWorkerGroup`, within which we create a Megatron and then run a tensor parallel (tp) split Llama mlp layer. Here, we use a complex dispatch mode, `Megatron_COMPUTE`. This dispatch mode assumes that user passes the data partitioned by DP dimension. The data is dispatched to all tp/pp ranks within the same dp group, and ultimately only collects output data from tp=0 and the last pp. In this way, for users that only write code on the driver, the Megatron behind the RPC becomes transparent."
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "d36b4606-0e28-4fd6-8e04-498408ca161a",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from single_controller.ray import MegatronRayWorkerGroup\n",
"from single_controller.megatron.worker import MegatronWorker\n",
"from omegaconf import OmegaConf"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "430fa41d-7874-49fd-b7f0-2c173a6c849b",
"metadata": {},
"outputs": [],
"source": [
"@ray.remote\n",
"class MLPLayerWorker(MegatronWorker):\n",
" @register(Dispatch.ONE_TO_ALL)\n",
" def init_model(self, config):\n",
" from omegaconf import OmegaConf\n",
" from verl.models.llama.megatron.layers import ParallelLlamaMLP\n",
" megatron_config = OmegaConf.create({'sequence_parallel_enabled': False})\n",
" self.parallel_layer = ParallelLlamaMLP(config=config, megatron_config=megatron_config)\n",
" \n",
" @register(Dispatch.ONE_TO_ALL)\n",
" def get_weights(self):\n",
" output = {}\n",
" for key, val in self.parallel_layer.named_parameters():\n",
" output[key] = val\n",
" return output\n",
" \n",
" @register(Dispatch.MEGATRON_COMPUTE)\n",
" def run_layer(self, x):\n",
" x = x.to('cuda')\n",
" y = self.parallel_layer(x)\n",
" return y"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "2b749128-4190-454f-96a2-72477a7a77bd",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"layer_cls = RayClassWithInitArgs(cls=MLPLayerWorker)\n",
"layer_worker_group = MegatronRayWorkerGroup(resource_pool=resource_pool,\n",
" ray_cls_with_init=layer_cls,\n",
" default_megatron_kwargs={\n",
" 'tensor_model_parallel_size': 4,\n",
" 'pipeline_model_parallel_size': 1,\n",
" 'num_layers_per_virtual_pipeline_stage': None\n",
" })\n"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "f134f99e-678a-4b6f-8d70-fce2a5c9ac3e",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4 4 1 1\n"
]
}
],
"source": [
"print(layer_worker_group.world_size, layer_worker_group.tp_size, layer_worker_group.pp_size, layer_worker_group.dp_size)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "204ce866-868a-4b0b-900b-0dda6164a995",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"ffn_hidden_size = 11008\n",
"batch_size = 16\n",
"seq_len = 2048\n",
"hidden_size = 4096\n",
"\n",
"config = OmegaConf.create({\n",
" 'hidden_size': hidden_size,\n",
" 'intermediate_size': ffn_hidden_size,\n",
" 'hidden_act': 'silu',\n",
" 'pretraining_tp': 1,\n",
" 'tp': layer_worker_group.tp_size,\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "d35a0216-7787-4ea7-86ab-c87f8e98f13d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"x = torch.rand(size=(seq_len, batch_size, hidden_size), dtype=torch.float32)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "0f84eaf8-adcd-4f89-a1ce-800db07df242",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"[None, None, None, None]"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"layer_worker_group.init_model(config)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "5203bb40-f831-4898-832c-5f65596ba0f6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([2048, 16, 4096])\n"
]
}
],
"source": [
"output = layer_worker_group.run_layer([x]) # This must be a list of size 1, ensuring that the input equals the data parallel (dp).\n",
"print(output[0].shape)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "6a159884-7409-4dfe-a367-3c31090b09a1",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n"
]
}
],
"source": [
"print(gpu_accumulator_decorator.world_size)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "806c36c3-a6a6-4c99-88fb-f2e2468b12db",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Shutdown ray cluster\n",
"ray.shutdown()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ea54e858-5221-493e-b53a-4765b918a0fa",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"fileId": "e862b5a9-13b2-48bf-9eda-96bd0c76e37c",
"kernelspec": {
"display_name": "Python 3.9.2 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
},
"vscode": {
"interpreter": {
"hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,16 @@
set -x
hdfs_path=hdfs://user/verl/experiments/gsm8k/deepseek-coder-6.7b-instruct/ # replace to your own hdfs/local path
TORCHRUN -m verl.trainer.fsdp_sft_trainer \
data.train_files=$HOME/data/gsm8k/train.parquet \
data.val_files=$HOME/data/gsm8k/test.parquet \
data.prompt_key=question \
data.response_key=answer \
data.micro_batch_size=8 \
model.partial_pretrain=deepseek-ai/deepseek-coder-6.7b-instruct \
trainer.default_hdfs_dir=$hdfs_path \
trainer.project_name=gsm8k-sft \
trainer.experiment_name=gsm8k-sft-deepseek-coder-6.7b-instruct \
trainer.total_epochs=4 \
trainer.logger=['console','tracking']

View File

@ -0,0 +1,18 @@
# Tested in 4 GPUs
set -x
hdfs_path=hdfs://user/verl/experiments/gsm8k/gemma-2b-it/ # replace to your own hdfs/local path
TORCHRUN -m verl.trainer.fsdp_sft_trainer \
data.train_files=$HOME/data/gsm8k/train.parquet \
data.val_files=$HOME/data/gsm8k/test.parquet \
data.prompt_key=question \
data.response_key=answer \
data.micro_batch_size=32 \
model.partial_pretrain=google/gemma-2b-it \
trainer.default_hdfs_dir=$hdfs_path \
trainer.project_name=gsm8k-sft \
trainer.experiment_name=gsm8k-sft-gemma-2b-it \
trainer.total_epochs=3 \
trainer.logger=['console','tracking']

View File

@ -0,0 +1,16 @@
set -x
hdfs_path=hdfs://user/verl/experiments/gsm8k/gemma-1.1-7b-it/ # replace to your own hdfs/local path
TORCHRUN -m verl.trainer.fsdp_sft_trainer \
data.train_files=$HOME/data/gsm8k/train.parquet \
data.val_files=$HOME/data/gsm8k/test.parquet \
data.prompt_key=question \
data.response_key=answer \
data.micro_batch_size=8 \
model.partial_pretrain=google/gemma-1.1-7b-it \
trainer.default_hdfs_dir=$hdfs_path \
trainer.project_name=gsm8k-sft \
trainer.experiment_name=gsm8k-sft-gemma-1.1-7b-it \
trainer.total_epochs=4 \
trainer.logger=['console','tracking']

568
patches/megatron_v4.patch Normal file
View File

@ -0,0 +1,568 @@
diff --git a/.gitignore b/.gitignore
index 5955b349..ade0cd51 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,5 @@ build
slurm*
logs
.vscode
+tests/*
+examples/*
diff --git a/build.sh b/build.sh
new file mode 100644
index 00000000..49d5361f
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,4 @@
+#! /bin/bash
+
+export PYTHONPATH=$PYTHONPATH:$(pwd)
+pip3 install regex ninja
diff --git a/megatron/__init__.py b/megatron/__init__.py
index c35de282..60896b47 100644
--- a/megatron/__init__.py
+++ b/megatron/__init__.py
@@ -2,7 +2,7 @@
import torch
-from .global_vars import get_args, get_retro_args
+from .global_vars import get_args, update_args, fork_args_namespace, get_retro_args
from .global_vars import get_current_global_batch_size
from .global_vars import get_num_microbatches
from .global_vars import get_signal_handler
diff --git a/megatron/arguments.py b/megatron/arguments.py
index 0ca8776e..9ef67624 100644
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -59,6 +59,16 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
return args
def validate_args(args, defaults={}):
+ # Set input defaults.
+ for key in defaults:
+ if getattr(args, key, None) is not None:
+ if args.rank == 0 and defaults[key] != getattr(args, key):
+ print('WARNING: overriding default argument {key}:{v2} \
+ with {key}:{v}'.format(key=key, v=defaults[key],
+ v2=getattr(args, key)),
+ flush=True)
+
+ setattr(args, key, defaults[key])
# Tensor model parallel size.
args.tensor_model_parallel_size = min(
args.tensor_model_parallel_size, args.world_size)
@@ -125,19 +135,19 @@ def validate_args(args, defaults={}):
args.recompute_granularity = 'selective'
del args.recompute_activations
- # Set input defaults.
- for key in defaults:
- # For default to be valid, it should not be provided in the
- # arguments that are passed to the program. We check this by
- # ensuring the arg is set to None.
- if getattr(args, key, None) is not None:
- if args.rank == 0:
- print('WARNING: overriding default arguments for {key}:{v} \
- with {key}:{v2}'.format(key=key, v=defaults[key],
- v2=getattr(args, key)),
- flush=True)
- else:
- setattr(args, key, defaults[key])
+ # # Set input defaults.
+ # for key in defaults:
+ # # For default to be valid, it should not be provided in the
+ # # arguments that are passed to the program. We check this by
+ # # ensuring the arg is set to None.
+ # if getattr(args, key, None) is not None:
+ # if args.rank == 0:
+ # print('WARNING: overriding default arguments for {key}:{v} \
+ # with {key}:{v2}'.format(key=key, v=defaults[key],
+ # v2=getattr(args, key)),
+ # flush=True)
+ # else:
+ # setattr(args, key, defaults[key])
# Batch size.
assert args.micro_batch_size is not None
diff --git a/megatron/core/pipeline_parallel/p2p_communication.py b/megatron/core/pipeline_parallel/p2p_communication.py
index 29ee34df..fa590b16 100644
--- a/megatron/core/pipeline_parallel/p2p_communication.py
+++ b/megatron/core/pipeline_parallel/p2p_communication.py
@@ -130,32 +130,28 @@ def _batched_p2p_ops(
send_prev_op = torch.distributed.P2POp(
torch.distributed.isend,
tensor_send_prev,
- get_pipeline_model_parallel_prev_rank(),
- group,
+ get_pipeline_model_parallel_prev_rank()
)
ops.append(send_prev_op)
if tensor_recv_prev is not None:
recv_prev_op = torch.distributed.P2POp(
torch.distributed.irecv,
tensor_recv_prev,
- get_pipeline_model_parallel_prev_rank(),
- group,
+ get_pipeline_model_parallel_prev_rank()
)
ops.append(recv_prev_op)
if tensor_send_next is not None:
send_next_op = torch.distributed.P2POp(
torch.distributed.isend,
tensor_send_next,
- get_pipeline_model_parallel_next_rank(),
- group,
+ get_pipeline_model_parallel_next_rank()
)
ops.append(send_next_op)
if tensor_recv_next is not None:
recv_next_op = torch.distributed.P2POp(
torch.distributed.irecv,
tensor_recv_next,
- get_pipeline_model_parallel_next_rank(),
- group,
+ get_pipeline_model_parallel_next_rank()
)
ops.append(recv_next_op)
if len(ops) > 0:
diff --git a/megatron/core/pipeline_parallel/schedules.py b/megatron/core/pipeline_parallel/schedules.py
index 992da781..2eb78d52 100644
--- a/megatron/core/pipeline_parallel/schedules.py
+++ b/megatron/core/pipeline_parallel/schedules.py
@@ -78,6 +78,8 @@ def get_forward_backward_func():
transformer, this is the encoder's sequence length. This is ignored if variable_seq_lengths
in the config is True. Otherwise, each microbatch in the current global batch size must use
this sequence length.
+
+ hidden_size (int, required): hidden size of the model
micro_batch_size (int, required): The number of sequences in a microbatch.
@@ -287,6 +289,7 @@ def forward_backward_no_pipelining(
model: Union[torch.nn.Module, List[torch.nn.Module]],
num_microbatches: int,
seq_length: int, # unused
+ hidden_size: int, # unused
micro_batch_size: int, # unused
decoder_seq_length: int = None, # unused
forward_only: bool = False,
@@ -370,8 +373,10 @@ def forward_backward_pipelining_with_interleaving(
data_iterator: Union[Iterator, List[Iterator]],
model: Union[torch.nn.Module, List[torch.nn.Module]],
num_microbatches: int,
- seq_length: int,
- micro_batch_size: int,
+ seq_length: int = None,
+ hidden_size: int = None,
+ micro_batch_size: int = None,
+ input_shapes: list = None,
decoder_seq_length: int = None,
forward_only: bool = False,
collect_non_loss_data: bool = False,
@@ -457,7 +462,7 @@ def forward_backward_pipelining_with_interleaving(
"Interleaving is not supported with a different decoder sequence length."
)
- tensor_shape = [seq_length, micro_batch_size, config.hidden_size]
+ tensor_shape = [seq_length, micro_batch_size, hidden_size]
if config.sequence_parallel:
tensor_shape[0] = tensor_shape[0] // parallel_state.get_tensor_model_parallel_world_size()
@@ -944,6 +949,7 @@ def get_tensor_shapes(
rank: int,
model_type: ModelType,
seq_length: int,
+ hidden_size: int,
micro_batch_size: int,
decoder_seq_length: int,
config,
@@ -967,12 +973,12 @@ def get_tensor_shapes(
if model_type == ModelType.encoder_and_decoder:
if parallel_state.is_pipeline_stage_before_split(rank):
- tensor_shapes.append((seq_length, micro_batch_size, config.hidden_size))
+ tensor_shapes.append((seq_length, micro_batch_size, hidden_size))
else:
- tensor_shapes.append((decoder_seq_length, micro_batch_size, config.hidden_size))
- tensor_shapes.append((seq_length, micro_batch_size, config.hidden_size))
+ tensor_shapes.append((decoder_seq_length, micro_batch_size, hidden_size))
+ tensor_shapes.append((seq_length, micro_batch_size, hidden_size))
else:
- tensor_shapes.append((seq_length, micro_batch_size, config.hidden_size))
+ tensor_shapes.append((seq_length, micro_batch_size, hidden_size))
return tensor_shapes
@@ -1050,8 +1056,10 @@ def forward_backward_pipelining_without_interleaving(
data_iterator: Union[Iterator, List[Iterator]],
model: Union[torch.nn.Module, List[torch.nn.Module]],
num_microbatches: int,
- seq_length: int,
- micro_batch_size: int,
+ seq_length: int = None,
+ hidden_size: int = None,
+ micro_batch_size: int = None,
+ input_shapes: list = None,
decoder_seq_length: int = None,
forward_only: bool = False,
collect_non_loss_data: bool = False,
@@ -1127,22 +1135,34 @@ def forward_backward_pipelining_without_interleaving(
model_type = get_model_type(model)
rank = parallel_state.get_pipeline_model_parallel_rank()
- recv_tensor_shapes = get_tensor_shapes(
- rank=rank - 1,
- model_type=model_type,
- seq_length=seq_length,
- micro_batch_size=micro_batch_size,
- decoder_seq_length=decoder_seq_length,
- config=config,
- )
- send_tensor_shapes = get_tensor_shapes(
- rank=rank,
- model_type=model_type,
- seq_length=seq_length,
- micro_batch_size=micro_batch_size,
- decoder_seq_length=decoder_seq_length,
- config=config,
- )
+
+ def get_recv_tensor_shapes(microbatch_id):
+ if input_shapes:
+ return [input_shapes[microbatch_id]]
+ recv_tensor_shapes = get_tensor_shapes(
+ rank=rank - 1,
+ model_type=model_type,
+ seq_length=seq_length,
+ hidden_size=hidden_size,
+ micro_batch_size=micro_batch_size,
+ decoder_seq_length=decoder_seq_length,
+ config=config,
+ )
+ return recv_tensor_shapes
+
+ def get_send_tensor_shapes(microbatch_id):
+ if input_shapes:
+ return [input_shapes[microbatch_id]]
+ send_tensor_shapes = get_tensor_shapes(
+ rank=rank,
+ model_type=model_type,
+ seq_length=seq_length,
+ hidden_size=hidden_size,
+ micro_batch_size=micro_batch_size,
+ decoder_seq_length=decoder_seq_length,
+ config=config,
+ )
+ return send_tensor_shapes
# Input, output tensors only need to be saved when doing backward passes
input_tensors = None
@@ -1163,7 +1183,12 @@ def forward_backward_pipelining_without_interleaving(
else:
checkpoint_activations_microbatch = None
+ # if torch.cuda.current_device() == 0 or torch.cuda.current_device() == 4:
+ # print(f'rank {torch.cuda.current_device()}: micro batch {i}: warmup recv_forward begin...')
+ recv_tensor_shapes = get_recv_tensor_shapes(i) # fwd recv shape
input_tensor = recv_forward(recv_tensor_shapes, config)
+ # if torch.cuda.current_device() == 0 or torch.cuda.current_device() == 4:
+ # print(f'rank {torch.cuda.current_device()}: micro batch {i}: warmup recv_forward end & forward begin...')
output_tensor = forward_step(
forward_step_func,
data_iterator,
@@ -1175,7 +1200,13 @@ def forward_backward_pipelining_without_interleaving(
collect_non_loss_data,
checkpoint_activations_microbatch,
)
+ # if torch.cuda.current_device() == 0 or torch.cuda.current_device() == 4:
+ # print(f'rank {torch.cuda.current_device()}: output tensor shape = {output_tensor[0].shape}, send_tensor_shapes={send_tensor_shapes}')
+ # print(f'rank {torch.cuda.current_device()}: micro batch {i}: warmup forward end & send_forward begin...')
+ send_tensor_shapes = get_send_tensor_shapes(i) # fwd send shape
send_forward(output_tensor, send_tensor_shapes, config)
+ # if torch.cuda.current_device() == 0 or torch.cuda.current_device() == 4:
+ # print(f'rank {torch.cuda.current_device()}: micro batch {i}: warmup send_forward end...')
if not forward_only:
input_tensors.append(input_tensor)
@@ -1186,11 +1217,16 @@ def forward_backward_pipelining_without_interleaving(
# If all microbatches are run in warmup / cooldown phase, then no need to
# receive this tensor here.
if num_microbatches_remaining > 0:
- input_tensor = recv_forward(recv_tensor_shapes, config)
+ # if torch.cuda.current_device() == 0 or torch.cuda.current_device() == 4:
+ # print(f'rank {torch.cuda.current_device()}: micro batch {num_warmup_microbatches}: 1f1b recv_forward begin...')
+ recv_tensor_shapes = get_recv_tensor_shapes(num_warmup_microbatches) # fwd recv shape
+ input_tensor = recv_forward(recv_tensor_shapes, config)
# Run 1F1B in steady state.
for i in range(num_microbatches_remaining):
last_iteration = i == (num_microbatches_remaining - 1)
+ next_forward_k = num_warmup_microbatches + i + 1
+ backward_k = i
# Decide to checkpoint all layers' activations of the current micro-batch
if max_outstanding_backprops is not None:
@@ -1199,7 +1235,8 @@ def forward_backward_pipelining_without_interleaving(
) >= config.num_microbatches_with_partial_activation_checkpoints
else:
checkpoint_activations_microbatch = None
-
+ # if torch.cuda.current_device() == 0 or torch.cuda.current_device() == 4:
+ # print(f'rank {torch.cuda.current_device()}: micro batch {num_warmup_microbatches + i}: 1f1b recv_forward end & forward begin...')
output_tensor = forward_step(
forward_step_func,
data_iterator,
@@ -1213,12 +1250,23 @@ def forward_backward_pipelining_without_interleaving(
)
if forward_only:
+ # if torch.cuda.current_device() == 0 or torch.cuda.current_device() == 4:
+ # print(f'rank {torch.cuda.current_device()}: micro batch {num_warmup_microbatches + i}: 1f1b forward end & send forward begin...')
+ send_tensor_shapes = get_send_tensor_shapes(next_forward_k - 1) # fwd send shape
send_forward(output_tensor, send_tensor_shapes, config)
if not last_iteration:
+ # if torch.cuda.current_device() == 0 or torch.cuda.current_device() == 4:
+ # print(f'rank {torch.cuda.current_device()}: micro batch {num_warmup_microbatches + i}: 1f1b send forward end & recv forward begin...')
+ recv_tensor_shapes = get_recv_tensor_shapes(next_forward_k) # fwd recv shape
input_tensor = recv_forward(recv_tensor_shapes, config)
+ else:
+ pass
+ # if torch.cuda.current_device() == 0 or torch.cuda.current_device() == 4:
+ # print(f'rank {torch.cuda.current_device()}: micro batch {num_warmup_microbatches + i}: 1f1b send forward end...')
else:
+ send_tensor_shapes = get_send_tensor_shapes(backward_k) # bwd recv shape
output_tensor_grad = send_forward_recv_backward(
output_tensor, send_tensor_shapes, config
)
@@ -1245,8 +1293,10 @@ def forward_backward_pipelining_without_interleaving(
if last_iteration:
input_tensor = None
+ recv_tensor_shapes = get_recv_tensor_shapes(backward_k) # bwd send shape
send_backward(input_tensor_grad, recv_tensor_shapes, config)
else:
+ recv_tensor_shapes = get_recv_tensor_shapes(next_forward_k) # fwd recv shape
input_tensor = send_backward_recv_forward(
input_tensor_grad, recv_tensor_shapes, config
)
@@ -1254,7 +1304,7 @@ def forward_backward_pipelining_without_interleaving(
# Run cooldown backward passes.
if not forward_only:
for i in range(num_warmup_microbatches):
-
+ backward_k = num_microbatches_remaining + i
# Enable async grad reduction in the last backward pass
# Note: If grad sync function is provided, only enable
# async grad reduction in first pipeline stage. Other
@@ -1267,12 +1317,14 @@ def forward_backward_pipelining_without_interleaving(
input_tensor = input_tensors.pop(0)
output_tensor = output_tensors.pop(0)
+ send_tensor_shapes = get_send_tensor_shapes(backward_k) # bwd recv shape
output_tensor_grad = recv_backward(send_tensor_shapes, config)
input_tensor_grad = backward_step(
input_tensor, output_tensor, output_tensor_grad, model_type, config
)
+ recv_tensor_shapes = get_recv_tensor_shapes(backward_k) # bwd send shape
send_backward(input_tensor_grad, recv_tensor_shapes, config)
# Launch any remaining grad reductions.
diff --git a/megatron/core/utils.py b/megatron/core/utils.py
index d4e042b2..c480d14e 100644
--- a/megatron/core/utils.py
+++ b/megatron/core/utils.py
@@ -55,8 +55,9 @@ def get_model_type(model):
return get_attr_wrapped_model(model, 'model_type')
+# walkaround: get_model_config to get megatron config (ModelParallelConfig)
def get_model_config(model):
- return get_attr_wrapped_model(model, 'config', allow_none=False)
+ return get_attr_wrapped_model(model, 'megatron_config', allow_none=False)
class GlobalMemoryBuffer:
diff --git a/megatron/global_vars.py b/megatron/global_vars.py
index b1b4b043..9e23dea5 100644
--- a/megatron/global_vars.py
+++ b/megatron/global_vars.py
@@ -21,11 +21,48 @@ _GLOBAL_ADLR_AUTORESUME = None
_GLOBAL_TIMERS = None
_GLOBAL_SIGNAL_HANDLER = None
-def get_args():
+DEFAULT_NAMESPACE = 'default'
+import contextlib
+
+@contextlib.contextmanager
+def fork_args_namespace(namespace):
+ """
+ Usage example:
+ update_args('vit', vit_config)
+ with fork_args_namespace('vit'):
+ do vit stuff here
+ """
+ # Check if we have added the args namespace
+ if namespace not in _GLOBAL_ARGS:
+ raise Exception('args namespace {} is not added'.format(namespace))
+ # Store current args namespace.
+ tmp = _GLOBAL_ARGS[DEFAULT_NAMESPACE]
+ # Set args namespace to the desired one
+ _GLOBAL_ARGS[DEFAULT_NAMESPACE] = _GLOBAL_ARGS[namespace]
+ # Do the stuff we wanted to do.
+ try:
+ yield
+ finally:
+ _GLOBAL_ARGS[DEFAULT_NAMESPACE] = tmp
+
+def get_args(namespace=DEFAULT_NAMESPACE):
"""Return arguments."""
_ensure_var_is_initialized(_GLOBAL_ARGS, 'args')
- return _GLOBAL_ARGS
+ return _GLOBAL_ARGS[namespace]
+def set_args(args):
+ global _GLOBAL_ARGS
+ if _GLOBAL_ARGS is None:
+ _GLOBAL_ARGS = {}
+ _GLOBAL_ARGS[DEFAULT_NAMESPACE] = args
+
+def update_args(namespace, args):
+ _ensure_var_is_initialized(_GLOBAL_ARGS, 'args')
+ if namespace not in _GLOBAL_ARGS:
+ import copy
+ _GLOBAL_ARGS[namespace] = copy.deepcopy(_GLOBAL_ARGS[DEFAULT_NAMESPACE])
+ for k, v in args.items():
+ setattr(_GLOBAL_ARGS[namespace], k, v)
def get_retro_args():
"""Return retro arguments."""
@@ -87,7 +124,7 @@ def _set_signal_handler():
-def set_global_variables(args, build_tokenizer=True):
+def set_global_variables(args):
"""Set args, tokenizer, tensorboard-writer, adlr-autoresume, and timers."""
assert args is not None
@@ -96,7 +133,7 @@ def set_global_variables(args, build_tokenizer=True):
set_args(args)
_build_num_microbatches_calculator(args)
- if build_tokenizer:
+ if args.vocab_file:
_ = _build_tokenizer(args)
_set_tensorboard_writer(args)
_set_wandb_writer(args)
@@ -107,11 +144,6 @@ def set_global_variables(args, build_tokenizer=True):
_set_signal_handler()
-def set_args(args):
- global _GLOBAL_ARGS
- _GLOBAL_ARGS = args
-
-
def set_retro_args(retro_args):
global _GLOBAL_RETRO_ARGS
_GLOBAL_RETRO_ARGS = retro_args
diff --git a/megatron/initialize.py b/megatron/initialize.py
index fb7866ab..01999622 100644
--- a/megatron/initialize.py
+++ b/megatron/initialize.py
@@ -39,7 +39,7 @@ def initialize_megatron(
if not allow_no_cuda:
# Make sure cuda is available.
assert torch.cuda.is_available(), "Megatron requires CUDA."
-
+ print('use open-source megatron initialize...')
# Parse arguments
args = parse_args(extra_args_provider, ignore_unknown_args)
diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py
index c91a674e..bcb7bd7e 100644
--- a/megatron/model/fused_layer_norm.py
+++ b/megatron/model/fused_layer_norm.py
@@ -81,7 +81,7 @@ class MixedFusedLayerNorm(torch.nn.Module):
if self.no_persist_layer_norm:
assert FusedLayerNormAffineFunction is not None, \
"FusedLayerNormAffineFunction is not available, please install apex from https://github.com/NVIDIA/apex"
- return FusedLayerNormAffineFunction.apply(input, weight, self.bias, self.normalized_shape, self.eps)
+ return FusedLayerNormAffineFunction.apply(input, weight, self.bias, self.normalized_shape, self.eps, False)
else:
output = FastLayerNormFN.apply(input, weight, self.bias, self.eps)
diff --git a/megatron/optimizer/distrib_optimizer.py b/megatron/optimizer/distrib_optimizer.py
index a04ae478..b64d22a5 100644
--- a/megatron/optimizer/distrib_optimizer.py
+++ b/megatron/optimizer/distrib_optimizer.py
@@ -366,7 +366,7 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad,
check_for_nan_in_grad, params_have_main_grad, fp16,
- bf16, params_dtype, grad_scaler, models):
+ bf16, params_dtype, grad_scaler, models, overlap_param_gather=False):
"""
See top of class definition for argument descriptions.
@@ -382,8 +382,11 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
check_for_nan_in_grad, params_have_main_grad,
fp16, bf16, params_dtype, grad_scaler, models)
- assert isinstance(optimizer, Adam), \
- "Only Adam currently supported, due to checkpointing requirements."
+ # assert isinstance(optimizer, Adam), \
+ # "Only Adam currently supported, due to checkpointing requirements."
+
+ if not isinstance(optimizer, Adam):
+ print("WARNING: the optimizer type is not Adam, and now Only Adam currently support checkpointing requirements!")
# Model grad buffer ranges.
self.model_gbuf_ranges = []
@@ -476,7 +479,7 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
self.param_buffer_copied.append(False)
self.num_all_gather_handles = len(self.all_gather_handle_index_to_bucket_index_map)
- self.overlap_param_gather = get_args().overlap_param_gather
+ self.overlap_param_gather = overlap_param_gather
if self.overlap_param_gather:
self.remove_pre_hook_handle = torch.nn.modules.module.register_module_forward_pre_hook(
self._make_forward_pre_hook())
diff --git a/megatron/training.py b/megatron/training.py
index 36f6c52e..73664509 100644
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -430,6 +430,7 @@ def train_step(forward_step_func, data_iterator,
model=model,
num_microbatches=get_num_microbatches(),
seq_length=args.seq_length,
+ hidden_size=args.hidden_size,
micro_batch_size=args.micro_batch_size,
decoder_seq_length=args.decoder_seq_length,
forward_only=False)
diff --git a/tools/prebuild_kernels.py b/tools/prebuild_kernels.py
new file mode 100644
index 00000000..6f891b9e
--- /dev/null
+++ b/tools/prebuild_kernels.py
@@ -0,0 +1,13 @@
+import os
+from megatron.fused_kernels import load
+
+
+class FakeArgs:
+ rank = 0
+
+
+# 7.0 for V100
+# 8.0 for A100/A800
+os.environ["TORCH_CUDA_ARCH_LIST"] = "7.0+PTX;8.0+PTX"
+
+load(FakeArgs)
\ No newline at end of file

12
requirements.txt Normal file
View File

@ -0,0 +1,12 @@
transformers
hydra-core
tensordict < 0.3.1
numpy
pytest
deepspeed
pybind11
codetiming
yapf
wandb
git+https://github.com/NVIDIA/TransformerEngine.git@stable
# vllm==0.5.4 # vllm is installed in image building to avoid ray conflicts

62
setup.py Normal file
View File

@ -0,0 +1,62 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from setuptools import setup, find_packages
import os
version_folder = os.path.dirname(os.path.join(os.path.abspath(__file__)))
with open(os.path.join(version_folder, 'verl/version/version')) as f:
__version__ = f.read().strip()
# TODO: add version info to requirements
install_requires = [
'tensordict',
'transformers<4.45',
'codetiming',
'pybind11',
'hydra-core',
'numpy',
'pytest',
'yapf',
"dill",
"accelerate"
]
install_optional = [
'vllm==0.5.4',
'liger-kernel'
]
extras_require = {
'demo': ['hydra-core', 'transformers', ''],
'single-controller': ['ray', 'kubernetes'],
'single-controller-ray': ['ray'],
}
setup(
name='verl',
version=__version__,
package_dir={'': '.'},
packages=find_packages(where='.'),
url='https://github.com/volcengine/verl',
license='Apache 2.0',
author='Bytedance - Seed - MLSys',
author_email='zhangchi.usc1992@bytedance.com, gmsheng@connect.hku.hk',
description='veRL: Volcano Engine Reinforcement Learning for LLM',
install_requires=install_requires,
extras_require=extras_require,
package_data={'': ['version/*']},
include_package_data=True,
)

View File

@ -0,0 +1,20 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
version_folder = os.path.dirname(os.path.join(os.path.abspath(__file__)))
with open(os.path.join(version_folder, 'version/version')) as f:
__version__ = f.read().strip()

View File

@ -0,0 +1,16 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .worker import Worker
from .worker_group import WorkerGroup, ClassWithInitArgs, ResourcePool

View File

@ -0,0 +1,410 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from enum import Enum
from functools import wraps
from typing import Dict, List, Tuple
from types import FunctionType
from verl.protocol import DataProtoFuture
# here we add a magic number of avoid user-defined function already have this attribute
MAGIC_ATTR = 'attrs_3141562937'
class Dispatch(Enum):
RANK_ZERO = 0
ONE_TO_ALL = 1
ALL_TO_ALL = 2
MEGATRON_COMPUTE = 3
MEGATRON_PP_AS_DP = 4
MEGATRON_PP_ONLY = 5
MEGATRON_COMPUTE_PROTO = 6
MEGATRON_PP_AS_DP_PROTO = 7
DP_COMPUTE = 8
DP_COMPUTE_PROTO = 9
DP_COMPUTE_PROTO_WITH_FUNC = 10
DP_COMPUTE_METRIC = 11
class Execute(Enum):
ALL = 0
RANK_ZERO = 1
def _split_args_kwargs_data_proto(chunks, *args, **kwargs):
from verl.protocol import DataProto, DataProtoFuture
splitted_args = []
for arg in args:
assert isinstance(arg, (DataProto, DataProtoFuture))
splitted_args.append(arg.chunk(chunks=chunks))
splitted_kwargs = {}
for key, val in kwargs.items():
assert isinstance(val, (DataProto, DataProtoFuture))
splitted_kwargs[key] = val.chunk(chunks=chunks)
return splitted_args, splitted_kwargs
def dispatch_one_to_all(worker_group, *args, **kwargs):
args = tuple([arg] * worker_group.world_size for arg in args)
kwargs = {k: [v] * worker_group.world_size for k, v in kwargs.items()}
return args, kwargs
def dispatch_all_to_all(worker_group, *args, **kwargs):
return args, kwargs
def collect_all_to_all(worker_group, output):
return output
def dispatch_megatron_compute(worker_group, *args, **kwargs):
"""
User passes in dp data. The data is dispatched to all tp/pp ranks with the same dp
"""
from single_controller.base.megatron.worker_group import MegatronWorkerGroup
assert isinstance(worker_group,
MegatronWorkerGroup), f'worker_group must be MegatronWorkerGroup, Got {type(worker_group)}'
all_args = []
for arg in args:
assert isinstance(arg, (Tuple, List)) and len(arg) == worker_group.dp_size
transformed_args = []
for i in range(worker_group.world_size):
local_dp_rank = worker_group.get_megatron_rank_info(rank=i).dp_rank
transformed_args.append(arg[local_dp_rank])
all_args.append(transformed_args)
all_args = tuple(all_args)
all_kwargs = {}
for k, v in kwargs.items():
assert isinstance(v, (Tuple, List)) and len(v) == worker_group.dp_size
transformed_v = []
for i in range(worker_group.world_size):
local_dp_rank = worker_group.get_megatron_rank_info(rank=i).dp_rank
transformed_v.append(v[local_dp_rank])
all_kwargs[k] = transformed_v
return all_args, all_kwargs
def collect_megatron_compute(worker_group, output):
"""
Only collect the data from the tp=0 and pp=last and every dp ranks
"""
from single_controller.base.megatron.worker_group import MegatronWorkerGroup
assert isinstance(worker_group, MegatronWorkerGroup)
output_in_dp = []
pp_size = worker_group.get_megatron_global_info().pp_size
for global_rank in range(worker_group.world_size):
local_rank_info = worker_group.get_megatron_rank_info(rank=global_rank)
if local_rank_info.tp_rank == 0 and local_rank_info.pp_rank == pp_size - 1:
output_in_dp.append(output[global_rank])
return output_in_dp
def dispatch_megatron_compute_data_proto(worker_group, *args, **kwargs):
"""
All the args and kwargs must be DataProto. The batch will be chunked by dp_size and passed to each rank
"""
from single_controller.base.megatron.worker_group import MegatronWorkerGroup
assert isinstance(worker_group, MegatronWorkerGroup)
splitted_args, splitted_kwargs = _split_args_kwargs_data_proto(worker_group.dp_size, *args, **kwargs)
return dispatch_megatron_compute(worker_group, *splitted_args, **splitted_kwargs)
def _concat_data_proto_or_future(output: List):
from verl.protocol import DataProto, DataProtoFuture
import ray
# make sure all the elements in output has the same type
for o in output:
assert type(o) == type(output[0])
o = output[0]
if isinstance(o, DataProto):
return DataProto.concat(output)
elif isinstance(o, ray.ObjectRef):
return DataProtoFuture.concat(output)
else:
raise NotImplementedError
def collect_megatron_compute_data_proto(worker_group, output):
"""
Each output must be a DataProto. We concat the dim=0 of output
"""
from verl.protocol import DataProto
import ray
output = collect_megatron_compute(worker_group, output)
for o in output:
assert isinstance(o, (DataProto, ray.ObjectRef)), f"expecting {o} to be DataProto, but got {type(o)}"
return _concat_data_proto_or_future(output)
def dispatch_megatron_pp_as_dp(worker_group, *args, **kwargs):
"""
treat pp as dp.
"""
from single_controller.base.megatron.worker_group import MegatronWorkerGroup
assert isinstance(worker_group, MegatronWorkerGroup)
pp_size = worker_group.pp_size
dp_size = worker_group.dp_size
pp_dp_size = pp_size * dp_size
all_args = []
for arg in args:
assert isinstance(arg, (List, Tuple)) and len(arg) == pp_dp_size
transformed_args = []
for i in range(worker_group.world_size):
local_dp_rank = worker_group.get_megatron_rank_info(rank=i).dp_rank
local_pp_rank = worker_group.get_megatron_rank_info(rank=i).pp_rank
# compute the rank in arg. Note that the order is dp then pp
# Also note that the outputs within a pp group will be firstly allgathered, then only the output of pp0 will be collected.
# For pp=2 dp=4, a batch of data "ABCDEFGH" should be dispatched and collected in below order:
# dispatch: pp_allgther: collect:
# dp 0 1 2 3 dp 0 1 2 3
# pp +---------+ pp +-------------+
# 0 | A C E G | 0 | AB CD EF GH | ABCDEFGH
# 1 | B D F H | 1 | AB CD EF GH |
# +---------+ +-------------+
arg_rank = local_dp_rank * worker_group.pp_size + local_pp_rank
transformed_args.append(arg[arg_rank])
all_args.append(transformed_args)
all_args = tuple(all_args)
all_kwargs = {}
for k, v in kwargs.items():
assert isinstance(v, (List, Tuple)) and len(v) == pp_dp_size, f'expect len(v)=={pp_dp_size}, got {len(v)}'
transformed_v = []
for i in range(worker_group.world_size):
local_dp_rank = worker_group.get_megatron_rank_info(rank=i).dp_rank
local_pp_rank = worker_group.get_megatron_rank_info(rank=i).pp_rank
# compute the rank in arg. Note that the order is dp then pp
arg_rank = local_dp_rank * worker_group.pp_size + local_pp_rank
transformed_v.append(v[arg_rank])
all_kwargs[k] = transformed_v
return all_args, all_kwargs
def collect_megatron_pp_as_dp(worker_group, output):
"""
treat pp as dp. Only collect data on tp=0
"""
from single_controller.base.megatron.worker_group import MegatronWorkerGroup
assert isinstance(worker_group, MegatronWorkerGroup)
output_in_dp = []
for global_rank in range(worker_group.world_size):
local_rank_info = worker_group.get_megatron_rank_info(rank=global_rank)
if local_rank_info.tp_rank == 0 and local_rank_info.pp_rank == 0:
output_in_dp.append(output[global_rank])
return output_in_dp
def collect_megatron_pp_only(worker_group, output):
"""
Only collect output of megatron pp. This is useful when examine weight names as they are identical in tp/dp
"""
from single_controller.base.megatron.worker_group import MegatronWorkerGroup
assert isinstance(worker_group, MegatronWorkerGroup)
output_in_pp = []
for global_rank in range(worker_group.world_size):
local_rank_info = worker_group.get_megatron_rank_info(rank=global_rank)
if local_rank_info.tp_rank == 0 and local_rank_info.dp_rank == 0:
output_in_pp.append(output[global_rank])
return output_in_pp
def dispatch_megatron_pp_as_dp_data_proto(worker_group, *args, **kwargs):
from single_controller.base.megatron.worker_group import MegatronWorkerGroup
assert isinstance(worker_group, MegatronWorkerGroup)
pp_dp_size = worker_group.dp_size * worker_group.pp_size
splitted_args, splitted_kwargs = _split_args_kwargs_data_proto(pp_dp_size, *args, **kwargs)
return dispatch_megatron_pp_as_dp(worker_group, *splitted_args, **splitted_kwargs)
def collect_megatron_pp_as_dp_data_proto(worker_group, output):
from verl.protocol import DataProto
from single_controller.base.megatron.worker_group import MegatronWorkerGroup
assert isinstance(worker_group, MegatronWorkerGroup)
output = collect_megatron_pp_as_dp(worker_group, output)
return _concat_data_proto_or_future(output)
def dispatch_dp_compute(worker_group, *args, **kwargs):
from single_controller.base.worker_group import WorkerGroup
assert isinstance(worker_group, WorkerGroup)
for arg in args:
assert isinstance(arg, (Tuple, List)) and len(arg) == worker_group.world_size
for k, v in kwargs.items():
assert isinstance(v, (Tuple, List)) and len(v) == worker_group.world_size
return args, kwargs
def collect_dp_compute(worker_group, output):
from single_controller.base.worker_group import WorkerGroup
assert isinstance(worker_group, WorkerGroup)
assert len(output) == worker_group.world_size
return output
def dispatch_dp_compute_data_proto(worker_group, *args, **kwargs):
from single_controller.base.worker_group import WorkerGroup
assert isinstance(worker_group, WorkerGroup)
splitted_args, splitted_kwargs = _split_args_kwargs_data_proto(worker_group.world_size, *args, **kwargs)
return splitted_args, splitted_kwargs
def dispatch_dp_compute_data_proto_with_func(worker_group, *args, **kwargs):
from single_controller.base.worker_group import WorkerGroup
assert isinstance(worker_group, WorkerGroup)
assert type(args[0]) == FunctionType # NOTE: The first one args is a function!
splitted_args, splitted_kwargs = _split_args_kwargs_data_proto(worker_group.world_size, *args[1:], **kwargs)
splitted_args_with_func = [[args[0]] * worker_group.world_size] + splitted_args
return splitted_args_with_func, splitted_kwargs
def collect_dp_compute_data_proto(worker_group, output):
from verl.protocol import DataProto
import ray
for o in output:
assert isinstance(o, (DataProto, ray.ObjectRef)), f"expecting {o} to be DataProto, but got {type(o)}"
output = collect_dp_compute(worker_group, output)
return _concat_data_proto_or_future(output)
def get_predefined_dispatch_fn(dispatch_mode):
predefined_dispatch_mode_fn = {
Dispatch.ONE_TO_ALL: {
'dispatch_fn': dispatch_one_to_all,
'collect_fn': collect_all_to_all,
},
Dispatch.ALL_TO_ALL: {
'dispatch_fn': dispatch_all_to_all,
'collect_fn': collect_all_to_all,
},
Dispatch.MEGATRON_COMPUTE: {
'dispatch_fn': dispatch_megatron_compute,
'collect_fn': collect_megatron_compute,
},
Dispatch.MEGATRON_PP_AS_DP: {
'dispatch_fn': dispatch_megatron_pp_as_dp,
'collect_fn': collect_megatron_pp_as_dp,
},
Dispatch.MEGATRON_PP_ONLY: {
'dispatch_fn': dispatch_one_to_all,
'collect_fn': collect_megatron_pp_only
},
Dispatch.MEGATRON_COMPUTE_PROTO: {
'dispatch_fn': dispatch_megatron_compute_data_proto,
'collect_fn': collect_megatron_compute_data_proto
},
Dispatch.MEGATRON_PP_AS_DP_PROTO: {
'dispatch_fn': dispatch_megatron_pp_as_dp_data_proto,
'collect_fn': collect_megatron_pp_as_dp_data_proto
},
Dispatch.DP_COMPUTE: {
'dispatch_fn': dispatch_dp_compute,
'collect_fn': collect_dp_compute
},
Dispatch.DP_COMPUTE_PROTO: {
'dispatch_fn': dispatch_dp_compute_data_proto,
'collect_fn': collect_dp_compute_data_proto
},
Dispatch.DP_COMPUTE_PROTO_WITH_FUNC: {
'dispatch_fn': dispatch_dp_compute_data_proto_with_func,
'collect_fn': collect_dp_compute_data_proto
},
Dispatch.DP_COMPUTE_METRIC: {
'dispatch_fn': dispatch_dp_compute_data_proto,
'collect_fn': collect_dp_compute
}
}
return predefined_dispatch_mode_fn[dispatch_mode]
def get_predefined_execute_fn(execute_mode):
"""
Note that here we only asks execute_all and execute_rank_zero to be implemented
Leave the choice of how these two functions handle argument 'blocking' to users
"""
predefined_execute_mode_fn = {
Execute.ALL: {
'execute_fn_name': 'execute_all'
},
Execute.RANK_ZERO: {
'execute_fn_name': 'execute_rank_zero'
}
}
return predefined_execute_mode_fn[execute_mode]
def _check_dispatch_mode(dispatch_mode):
assert isinstance(dispatch_mode,
(Dispatch, Dict)), f'dispatch_mode must be a Dispatch or a Dict. Got {dispatch_mode}'
if isinstance(dispatch_mode, Dict):
necessary_keys = ['dispatch_fn', 'collect_fn']
for key in necessary_keys:
assert key in dispatch_mode, f'key {key} should be in dispatch_mode if it is a dictionary'
def _check_execute_mode(execute_mode):
assert isinstance(execute_mode, Execute), f'execute_mode must be a Execute. Got {execute_mode}'
def _materialize_futures(*args, **kwargs):
new_args = []
for arg in args:
if isinstance(arg, DataProtoFuture):
arg = arg.get()
# add more type to materialize
new_args.append(arg)
for k, v in kwargs.items():
if isinstance(v, DataProtoFuture):
kwargs[k] = v.get()
new_args = tuple(new_args)
return new_args, kwargs
def register(dispatch_mode=Dispatch.ALL_TO_ALL, execute_mode=Execute.ALL, blocking=True, materialize_futures=True):
_check_dispatch_mode(dispatch_mode=dispatch_mode)
_check_execute_mode(execute_mode=execute_mode)
def decorator(func):
@wraps(func)
def inner(*args, **kwargs):
if materialize_futures:
args, kwargs = _materialize_futures(*args, **kwargs)
return func(*args, **kwargs)
attrs = {'dispatch_mode': dispatch_mode, 'execute_mode': execute_mode, 'blocking': blocking}
setattr(inner, MAGIC_ATTR, attrs)
return inner
return decorator

View File

@ -0,0 +1,47 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from single_controller.base.worker import Worker
class DPEngineWorker(Worker):
def __init__(self, *args, **kwargs):
# todo: extract _world_size etc. from kwargs and inject in super().__init__()
Worker.__init__(self, *args, **kwargs)
def init(self):
raise NotImplementedError
def add_engine(self, model, dp_config):
raise NotImplementedError
def execute_engine(self, method_name, *args, **kwargs):
print(f"execute_engine called with method={method_name}")
func = getattr(self._engine, method_name)
return func(*args, **kwargs)
def execute_module(self, method_name, *args, **kwargs):
print(f"execute_module called with method={method_name}")
func = getattr(self._engine.module, method_name)
return func(*args, **kwargs)
def get_model_size_on_rank_zero(self):
import torch
from verl.utils.model import get_model_size
if torch.distributed.get_rank() == 0:
# print("model print on rank 0: ", self._model)
module_size, module_size_scale = get_model_size(self._model)
return module_size, module_size_scale
return None

View File

@ -0,0 +1,13 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -0,0 +1,39 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from dataclasses import dataclass
from single_controller.base.worker import Worker, DistRankInfo, DistGlobalInfo
class MegatronWorker(Worker):
def __init__(self, cuda_visible_devices=None) -> None:
super().__init__(cuda_visible_devices)
def get_megatron_global_info(self):
from megatron.core import parallel_state as mpu
tp_size = mpu.get_tensor_model_parallel_world_size()
dp_size = mpu.get_data_parallel_world_size()
pp_size = mpu.get_pipeline_model_parallel_world_size()
info = DistGlobalInfo(tp_size=tp_size, dp_size=dp_size, pp_size=pp_size)
return info
def get_megatron_rank_info(self):
from megatron.core import parallel_state as mpu
tp_rank = mpu.get_tensor_model_parallel_rank()
dp_rank = mpu.get_data_parallel_rank()
pp_rank = mpu.get_pipeline_model_parallel_rank()
info = DistRankInfo(tp_rank=tp_rank, dp_rank=dp_rank, pp_rank=pp_rank)
return info

View File

@ -0,0 +1,51 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict
from .worker import DistRankInfo, DistGlobalInfo
from single_controller.base import ResourcePool, WorkerGroup
class MegatronWorkerGroup(WorkerGroup):
def __init__(self, resource_pool: ResourcePool, **kwargs):
super().__init__(resource_pool=resource_pool, **kwargs)
self._megatron_rank_info = None
self._megatron_global_info: DistGlobalInfo = None
def init_megatron(self, default_megatron_kwargs: Dict = None):
raise NotImplementedError(f"MegatronWorkerGroup.init_megatron should be overwritten")
def get_megatron_rank_info(self, rank: int) -> DistRankInfo:
assert 0 <= rank < self.world_size, f'rank must be from [0, world_size), Got {rank}'
return self._megatron_rank_info[rank]
@property
def tp_size(self):
assert self._megatron_global_info is not None, "MegatronWorkerGroup._megatron_global_info must be initialized"
return self._megatron_global_info.tp_size
@property
def dp_size(self):
assert self._megatron_global_info is not None, "MegatronWorkerGroup._megatron_global_info must be initialized"
return self._megatron_global_info.dp_size
@property
def pp_size(self):
assert self._megatron_global_info is not None, "MegatronWorkerGroup._megatron_global_info must be initialized"
return self._megatron_global_info.pp_size
def get_megatron_global_info(self):
return self._megatron_global_info

View File

@ -0,0 +1,13 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -0,0 +1,29 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ray
@ray.remote
class WorkerGroupRegisterCenter:
def __init__(self, rank_zero_info):
self.rank_zero_info = rank_zero_info
def get_rank_zero_info(self):
return self.rank_zero_info
def create_worker_group_register_center(name, info):
return WorkerGroupRegisterCenter.options(name=name).remote(info)

View File

@ -0,0 +1,181 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
the class for Worker
"""
import os
import socket
from dataclasses import dataclass
from single_controller.base.decorator import register, Dispatch
@dataclass
class DistRankInfo:
tp_rank: int
dp_rank: int
pp_rank: int
@dataclass
class DistGlobalInfo:
tp_size: int
dp_size: int
pp_size: int
class WorkerHelper:
def _get_node_ip(self):
def get_node_ip_by_sdk():
if os.getenv("WG_BACKEND", None) == "ray":
import ray
return ray._private.services.get_node_ip_address()
elif os.getenv("WG_BACKEND", None) == "torch_rpc":
from single_controller.torchrpc.k8s_client import get_ip_addr
return get_ip_addr()
return None
host_ipv4 = os.getenv("MY_HOST_IP", None)
host_ipv6 = os.getenv("MY_HOST_IPV6", None)
host_ip_by_env = host_ipv4 or host_ipv6
host_ip_by_sdk = get_node_ip_by_sdk()
host_ip = host_ip_by_env or host_ip_by_sdk
return host_ip
def _get_free_port(self):
with socket.socket() as sock:
sock.bind(('', 0))
return sock.getsockname()[1]
def get_availale_master_addr_port(self):
return self._get_node_ip(), str(self._get_free_port())
def _get_pid(self):
return
class WorkerMeta:
keys = [
"WORLD_SIZE", "RANK", "LOCAL_WORLD_SIZE", "LOCAL_RANK", "MASTER_ADDR", "MASTER_PORT", "CUDA_VISIBLE_DEVICES"
]
def __init__(self, store) -> None:
self._store = store
def to_dict(self):
return {f"_{key.lower()}": self._store.get(f"_{key.lower()}", None) for key in WorkerMeta.keys}
# we assume that in each WorkerGroup, there is a Master Worker
class Worker(WorkerHelper):
def __new__(cls, *args, **kwargs):
instance = super().__new__(cls)
# note that here we use int to distinguish
disable_worker_init = int(os.environ.get('DISABLE_WORKER_INIT', 0))
if disable_worker_init:
return instance
rank = os.environ.get("RANK", None)
worker_group_prefix = os.environ.get("WG_PREFIX", None)
# when decorator @ray.remote applies, __new__ will be called while we don't want to apply _configure_before_init
if None not in [rank, worker_group_prefix] and 'ActorClass(' not in cls.__name__:
instance._configure_before_init(f"{worker_group_prefix}_register_center", int(rank))
return instance
def _configure_before_init(self, register_center_name: str, rank: int):
assert isinstance(rank, int), f"rank must be int, instead of {type(rank)}"
if rank == 0:
master_addr, master_port = self.get_availale_master_addr_port()
rank_zero_info = {
"MASTER_ADDR": master_addr,
"MASTER_PORT": master_port,
}
if os.getenv("WG_BACKEND", None) == "ray":
from single_controller.base.register_center.ray import create_worker_group_register_center
self.register_center = create_worker_group_register_center(name=register_center_name,
info=rank_zero_info)
os.environ.update(rank_zero_info)
def __init__(self, cuda_visible_devices=None) -> None:
# construct a meta from envrionment variable. Note that the import must be inside the class because it is executed remotely
import os
world_size = int(os.environ['WORLD_SIZE'])
rank = int(os.environ['RANK'])
self._rank = rank
self._world_size = world_size
master_addr = os.environ["MASTER_ADDR"]
master_port = os.environ["MASTER_PORT"]
local_world_size = int(os.getenv("LOCAL_WORLD_SIZE", "1"))
local_rank = int(os.getenv("LOCAL_RANK", "0"))
store = {
'_world_size': world_size,
'_rank': rank,
'_local_world_size': local_world_size,
'_local_rank': local_rank,
'_master_addr': master_addr,
'_master_port': master_port
}
if cuda_visible_devices is not None:
store['_cuda_visible_devices'] = cuda_visible_devices
meta = WorkerMeta(store=store)
self._configure_with_meta(meta=meta)
def _configure_with_meta(self, meta: WorkerMeta):
"""
This function should only be called inside by WorkerGroup
"""
assert isinstance(meta, WorkerMeta)
self.__dict__.update(meta.to_dict()) # this is hacky
# print(f"__dict__: {self.__dict__}")
for key in WorkerMeta.keys:
val = self.__dict__.get(f"_{key.lower()}", None)
if val is not None:
# print(f"set {key} to {val}")
os.environ[key] = str(val)
os.environ["REDIS_STORE_SERVER_HOST"] = str(self._master_addr).replace("[", "").replace(
"]", "") if self._master_addr else ""
def get_master_addr_port(self):
return self._master_addr, self._master_port
def get_cuda_visible_devices(self):
import os
cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "not set")
return cuda_visible_devices
@property
def world_size(self):
return self._world_size
@property
def rank(self):
return self._rank
@register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO_WITH_FUNC)
def execute_with_func_generator(self, func, *args, **kwargs):
ret_proto = func(self, *args, **kwargs)
return ret_proto

View File

@ -0,0 +1,196 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
the class of WorkerGroup
"""
import logging
import threading
import signal
import time
from typing import List, Any, Callable, Dict
from single_controller.base.decorator import MAGIC_ATTR, Dispatch, get_predefined_dispatch_fn, get_predefined_execute_fn
class ResourcePool:
def __init__(self, process_on_nodes=None, max_collocate_count: int = 10, n_gpus_per_node=8) -> None:
if process_on_nodes is None:
process_on_nodes = []
self._store = process_on_nodes
self.max_collocate_count = max_collocate_count
self.n_gpus_per_node = n_gpus_per_node # this is left for future huawei GPU that contains 16 GPUs per node
def add_node(self, process_count):
self._store.append(process_count)
@property
def world_size(self):
return sum(self._store)
def __call__(self) -> Any:
return self._store
@property
def store(self):
return self._store
def local_world_size_list(self) -> List[int]:
nested_local_world_size_list = [
[local_world_size for _ in range(local_world_size)] for local_world_size in self._store
]
return [item for row in nested_local_world_size_list for item in row]
def local_rank_list(self) -> List[int]:
nested_local_rank_list = [[i for i in range(local_world_size)] for local_world_size in self._store]
return [item for row in nested_local_rank_list for item in row]
class ClassWithInitArgs:
"""
This class stores a class constructor and the args/kwargs to construct the class.
It is used to instantiate the remote class.
"""
def __init__(self, cls, *args, **kwargs) -> None:
self.cls = cls
self.args = args
self.kwargs = kwargs
# def add_arg(self, arg):
# self.args += (arg,)
# def add_kwarg(self, key, value):
# self.kwargs[key] = value
def __call__(self) -> Any:
return self.cls(*self.args, **self.kwargs)
def check_workers_alive(workers: List, is_alive: Callable, gap_time: float = 1) -> None:
import time
while True:
for worker in workers:
if not is_alive(worker):
logging.warning(f"worker {worker} is not alive" + " sending signal to main thread")
signal.raise_signal(signal.SIGABRT)
time.sleep(gap_time)
class WorkerGroup:
def __init__(self, resource_pool: ResourcePool, **kwargs) -> None:
self._is_init_with_detached_workers = True if resource_pool is None else False
if resource_pool is not None:
# handle the case when WorkGroup is attached to an existing one
self._procecss_dispatch_config = resource_pool()
else:
self._procecss_dispatch_config = None
self._workers = []
self._worker_names = []
self._master_addr = None
self._master_port = None
self._checker_thread: threading.Thread = None
def _is_worker_alive(self, worker):
raise NotImplementedError(f"WorkerGroup._is_worker_alive called, should be implemented in derived class.")
def _block_until_all_workers_alive(self) -> None:
while True:
all_state = [self._is_worker_alive(worker) for worker in self._workers]
if False in all_state:
time.sleep(1)
else:
break
def start_worker_aliveness_check(self, every_n_seconds=1) -> None:
# before starting checking worker aliveness, make sure all workers are already alive
self._block_until_all_workers_alive()
self._checker_thread = threading.Thread(target=check_workers_alive,
args=(self._workers, self._is_worker_alive, every_n_seconds))
self._checker_thread.start()
@property
def world_size(self):
return len(self._workers)
# execute_all_async and execute_rank_zero_async should be implemented by RayWorkerGroup, TorchRPCWorkerGroup,
# MegatronWorkerGroup, XperfWorkerGroup should skip
def _bind_worker_method(self, user_defined_cls, func_generator):
"""
Bind the worker method to the WorkerGroup
"""
for method_name in dir(user_defined_cls):
try:
method = getattr(user_defined_cls, method_name)
assert callable(method), f"{method_name} in {user_defined_cls} is not callable"
except Exception as e:
# if it is a property, it will fail because Class doesn't have instance property
continue
if hasattr(method, MAGIC_ATTR):
# this method is decorated by register
attribute = getattr(method, MAGIC_ATTR)
assert isinstance(attribute, Dict), f'attribute must be a dictionary. Got {type(attribute)}'
assert 'dispatch_mode' in attribute, f'attribute must contain dispatch_mode in its key'
dispatch_mode = attribute['dispatch_mode']
execute_mode = attribute['execute_mode']
blocking = attribute['blocking']
# get dispatch fn
if isinstance(dispatch_mode, Dispatch):
# get default dispatch fn
fn = get_predefined_dispatch_fn(dispatch_mode=dispatch_mode)
dispatch_fn = fn['dispatch_fn']
collect_fn = fn['collect_fn']
else:
assert isinstance(dispatch_mode, dict)
assert 'dispatch_fn' in dispatch_mode
assert 'collect_fn' in dispatch_mode
dispatch_fn = dispatch_mode['dispatch_fn']
collect_fn = dispatch_mode['collect_fn']
# get execute_fn_name
execute_mode = get_predefined_execute_fn(execute_mode=execute_mode)
wg_execute_fn_name = execute_mode['execute_fn_name']
# get execute_fn from string
try:
execute_fn = getattr(self, wg_execute_fn_name)
assert callable(execute_fn), 'execute_fn must be callable'
except Exception as e:
print(f'execute_fn {wg_execute_fn_name} is invalid')
raise
# bind a new method to the RayWorkerGroup
func = func_generator(self,
method_name,
dispatch_fn=dispatch_fn,
collect_fn=collect_fn,
execute_fn=execute_fn,
blocking=blocking)
try:
setattr(self, method_name, func)
except Exception as e:
raise ValueError(f'Fail to set method_name {method_name}')

View File

@ -0,0 +1,16 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .base import RayResourcePool, RayClassWithInitArgs, RayWorkerGroup, create_colocated_worker_cls
from .megatron import (MegatronRayWorkerGroup, DistRankInfo, DistGlobalInfo)

View File

@ -0,0 +1,455 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from typing import Dict, List, Any, Tuple
import ray
from ray.util import list_named_actors
from ray.util.placement_group import placement_group, PlacementGroup
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy, NodeAffinitySchedulingStrategy
from ray.experimental.state.api import get_actor
from single_controller.base import WorkerGroup, ResourcePool, ClassWithInitArgs, Worker
__all__ = ['Worker']
def get_random_string(length: int) -> str:
import random
import string
letters_digits = string.ascii_letters + string.digits
return ''.join(random.choice(letters_digits) for _ in range(length))
def func_generator(self, method_name, dispatch_fn, collect_fn, execute_fn, blocking):
def func(*args, **kwargs):
args, kwargs = dispatch_fn(self, *args, **kwargs)
output = execute_fn(method_name, *args, **kwargs)
if blocking:
output = ray.get(output)
output = collect_fn(self, output)
return output
return func
class RayResourcePool(ResourcePool):
def __init__(self,
process_on_nodes: List[int] = None,
use_gpu: bool = True,
name_prefix: str = "",
max_colocate_count: int = 5,
detached=False) -> None:
super().__init__(process_on_nodes, max_colocate_count)
self.use_gpu = use_gpu
# print(f"in RayProcessDispatchConfiguration: name_prefix = {name_prefix}")
self.name_prefix = name_prefix
self.pgs = None
self.detached = detached
def get_placement_groups(self, strategy="STRICT_PACK", name=None):
if self.pgs is not None:
return self.pgs
pg_name_prefix = name if name else \
f"{self.name_prefix}verl_group_{'_'.join([str(count) for count in self._store])}:"
# print(f"pg_name_prefix = {pg_name_prefix}")
pg_scheme = [[{
"CPU": self.max_collocate_count,
"GPU": 1
} if self.use_gpu else {
"CPU": self.max_collocate_count
} for _ in range(process_count)] for process_count in self._store]
lifetime = 'detached' if self.detached else None
pgs = [
placement_group(bundles=bundles, strategy=strategy, name=pg_name_prefix + str(idx), lifetime=lifetime)
for idx, bundles in enumerate(pg_scheme)
]
ray.get([pg.ready() for pg in pgs])
self.pgs = pgs
return pgs
def extract_pg_from_exist(resource_pools: Dict[str, RayResourcePool], src_role_names: List[str],
resource_pool: RayResourcePool) -> List:
src_pgs = [
pg for role_name, resource_pool in resource_pools.items() for pg in resource_pool.get_placement_groups()
if role_name in src_role_names
]
sorted_src_pgs = sorted(src_pgs, key=lambda pg: pg.bundle_count, reverse=True)
sorted_process_on_nodes = sorted([(val, idx) for idx, val in enumerate(resource_pool.store)], reverse=True)
unsorted_pgs: List[Tuple[int, PlacementGroup]] = []
searching_idx = 0
for request_process, original_idx in sorted_process_on_nodes:
assert searching_idx < len(sorted_src_pgs), f"no enough nodes for request: searching {searching_idx} th node"
assert request_process <= sorted_src_pgs[searching_idx].bundle_count, \
f"requesting {request_process} processes, bundle count cannot satisfy"
unsorted_pgs.append((original_idx, sorted_src_pgs[searching_idx]))
searching_idx += 1
return [pg for _, pg in sorted(unsorted_pgs)]
def merge_resource_pool(rp1: RayResourcePool, rp2: RayResourcePool) -> RayResourcePool:
assert rp1.use_gpu == rp2.use_gpu, 'Both RayResourcePool must either use_gpu or not'
assert rp1.max_collocate_count == rp2.max_collocate_count, 'Both RayResourcePool must has the same max_collocate_count'
assert rp1.n_gpus_per_node == rp2.n_gpus_per_node, 'Both RayResourcePool must has the same n_gpus_per_node'
assert rp1.detached == rp2.detached, 'Detached ResourcePool cannot be merged with non-detached ResourcePool'
new_store = rp1.store + rp2.store
merged = RayResourcePool(new_store, rp1.use_gpu, f"{rp1.name_prefix}_{rp2.name_prefix}")
merged.pgs = rp1.get_placement_groups() + rp2.get_placement_groups()
return merged
class RayClassWithInitArgs(ClassWithInitArgs):
def __init__(self, cls, *args, **kwargs) -> None:
# self._options = kwargs.pop('options', dict())
super().__init__(cls, *args, **kwargs)
self._options = {}
self._additional_resource = {}
def set_additional_resource(self, additional_resource):
self._additional_resource = additional_resource
def update_options(self, options: Dict):
self._options.update(options)
def __call__(self,
placement_group,
placement_group_bundle_idx,
use_gpu: bool = True,
num_gpus=1,
sharing_with=None) -> Any:
if sharing_with is not None:
target_node_id = ray.get(sharing_with.get_node_id.remote())
cuda_visible_devices = ray.get(sharing_with.get_cuda_visible_devices.remote())
options = {"scheduling_strategy": NodeAffinitySchedulingStrategy(node_id=target_node_id, soft=False)}
return self.cls.options(**options).remote(*self.args,
cuda_visible_devices=cuda_visible_devices,
**self.kwargs)
options = {
"scheduling_strategy":
PlacementGroupSchedulingStrategy(placement_group=placement_group,
placement_group_bundle_index=placement_group_bundle_idx)
}
options.update(self._options)
if use_gpu:
options["num_gpus"] = num_gpus
if len(self._additional_resource) > 1:
for k, v in self._additional_resource.items():
options[k] = v
# print("cls:", self.cls)
# print("args: ", self.args)
# print("kwargs: ", self.kwargs)
return self.cls.options(**options).remote(*self.args, **self.kwargs)
class RayWorkerGroup(WorkerGroup):
def __init__(self,
resource_pool: RayResourcePool = None,
ray_cls_with_init: RayClassWithInitArgs = None,
bin_pack: bool = True,
name_prefix: str = None,
detached=False,
worker_names=None,
**kwargs) -> None:
super().__init__(resource_pool=resource_pool, **kwargs)
self.ray_cls_with_init = ray_cls_with_init
self.name_prefix = get_random_string(length=6) if name_prefix is None else name_prefix
if self._is_init_with_detached_workers:
self._init_with_detached_workers(worker_names=worker_names)
else:
self._init_with_resource_pool(resource_pool=resource_pool,
ray_cls_with_init=ray_cls_with_init,
bin_pack=bin_pack,
detached=detached)
if ray_cls_with_init is not None:
self._bind_worker_method(self.ray_cls_with_init.cls, func_generator)
def _is_worker_alive(self, worker: ray.actor.ActorHandle):
worker_state_dict = get_actor(worker._actor_id.hex())
return worker_state_dict.get("state", "undefined") == "ALIVE" if worker_state_dict is not None else False
def _init_with_detached_workers(self, worker_names):
workers = [ray.get_actor(name=name) for name in worker_names]
self._workers = workers
self._worker_names = worker_names
self._world_size = len(worker_names)
def _init_with_resource_pool(self, resource_pool, ray_cls_with_init, bin_pack, detached):
use_gpu = resource_pool.use_gpu
strategy = "PACK"
if bin_pack:
strategy = "STRICT_PACK"
pgs = resource_pool.get_placement_groups(strategy=strategy)
world_size = resource_pool.world_size
self._world_size = world_size
# cia.add_kwarg("_world_size", world_size)
num_gpus = 1 / resource_pool.max_collocate_count
rank = -1
for pg_idx, local_world_size in enumerate(resource_pool.store):
pg = pgs[pg_idx]
assert local_world_size <= pg.bundle_count, \
f"when generating for {self.name_prefix}, for the "
for local_rank in range(local_world_size):
rank += 1
# we pass in environment variable at option so that Worker can use environment variable to set
env_vars = {
'WORLD_SIZE': str(world_size),
'RANK': str(rank),
'WG_PREFIX': self.name_prefix,
'WG_BACKEND': 'ray',
'RAY_LOCAL_WORLD_SIZE': str(local_world_size),
'RAY_LOCAL_RANK': str(local_rank),
}
if rank != 0:
env_vars['MASTER_ADDR'] = self._master_addr
env_vars['MASTER_PORT'] = self._master_port
import re
cia_name = type(ray_cls_with_init.cls).__name__
match = re.search(r"ActorClass\(([^)]+)\)", cia_name) # ray.remote(Obj) -> "ActorClass(Obj)"
cia_name = match.group(1) if match else cia_name # "ActorClass(Obj)" -> "Obj"
name = f"{self.name_prefix}{cia_name}_{pg_idx}:{local_rank}" # e.g. Worker_2:5
ray_cls_with_init.update_options({'runtime_env': {'env_vars': env_vars}, 'name': name})
if detached:
ray_cls_with_init.update_options({'lifetime': 'detached'})
# create a worker
worker = ray_cls_with_init(placement_group=pg,
placement_group_bundle_idx=local_rank,
use_gpu=use_gpu,
num_gpus=num_gpus)
self._workers.append(worker)
self._worker_names.append(name)
if rank == 0:
register_center_actor = None
for _ in range(120):
if f"{self.name_prefix}_register_center" not in list_named_actors():
time.sleep(1)
else:
register_center_actor = ray.get_actor(f"{self.name_prefix}_register_center")
assert register_center_actor is not None, f"failed to get register_center_actor: {self.name_prefix}_register_center in {list_named_actors(all_namespaces=True)}"
rank_zero_info = ray.get(register_center_actor.get_rank_zero_info.remote())
self._master_addr, self._master_port = rank_zero_info['MASTER_ADDR'], rank_zero_info['MASTER_PORT']
# print(f"rank_zero_info: {rank_zero_info}")
# print(f"master_addr: {self._master_addr}, master_port: {self._master_port}")
@property
def worker_names(self):
return self._worker_names
@classmethod
def from_detached(cls, worker_names=None, ray_cls_with_init=None):
worker_group = cls(resource_pool=None,
ray_cls_with_init=ray_cls_with_init,
name_prefix=None,
worker_names=worker_names)
return worker_group
def spawn(self, prefix_set):
"""
spawn to a dictionary of worker groups, each with a subset of method with prefix.
"""
def _rebind_actor_methods(worker_group, actor_name):
"""
bind the method with actor_prefix to its original name
"""
prefix: str = actor_name + '_'
for method_name in dir(worker_group):
if method_name.startswith(prefix):
# only valid when Python >= 3.9
original_method_name = method_name.removeprefix(prefix)
method = getattr(worker_group, method_name)
setattr(worker_group, original_method_name, method)
new_worker_group_dict = {}
for prefix in prefix_set:
new_worker_group = self.from_detached(worker_names=self._worker_names,
ray_cls_with_init=self.ray_cls_with_init)
_rebind_actor_methods(new_worker_group, prefix)
new_worker_group_dict[prefix] = new_worker_group
return new_worker_group_dict
def execute_rank_zero_sync(self, method_name: str, *args, **kwargs):
return ray.get(self.execute_all_async(method_name, **args, **kwargs))
def execute_rank_zero_async(self, method_name: str, *args, **kwargs):
remote_call = getattr(self._workers[0], method_name)
return remote_call.remote(*args, **kwargs)
def execute_rank_zero(self, method_name: str, *args, **kwargs):
return self.execute_rank_zero_async(method_name, *args, **kwargs)
def execute_all(self, method_name: str, *args, **kwargs):
return self.execute_all_async(method_name, *args, **kwargs)
def execute_all_sync(self, method_name: str, *args, **kwargs):
return ray.get(self.execute_all_async(method_name, *args, **kwargs))
def execute_all_async(self, method_name: str, *args, **kwargs):
# 这里我们假设,如果 args 和 kwargs 里面所有的参数都是 list且所有的 list 长度都与 len(self._workers) 一致的话,我们会把
# list 中的每一个分别发到对应的 worker 上去
# print(f"execute_all_async: method {method_name}({args}, {kwargs})")
length = len(self._workers)
if all(isinstance(arg, list) for arg in args) and all(isinstance(kwarg, list) for kwarg in kwargs.values()):
if all(len(arg) == length for arg in args) and all(len(kwarg) == length for kwarg in kwargs.values()):
# print(f"splitting args and kwargs into {length} shards")
result = []
for i in range(length):
sliced_args = tuple(arg[i] for arg in args)
sliced_kwargs = {k: v[i] for k, v in kwargs.items()}
remote_call = getattr(self._workers[i], method_name)
result.append(remote_call.remote(*sliced_args, **sliced_kwargs))
return result
return [getattr(worker, method_name).remote(*args, **kwargs) for worker in self._workers]
@property
def master_address(self):
return self._master_addr
@property
def master_port(self):
return self._master_port
@property
def workers(self):
return self._workers
@property
def world_size(self):
return self._world_size
"""
Utilities that enables creating workers inside the same ray.Actor,
with code written in separate ray.Actors.
"""
from unittest.mock import patch
from single_controller.base.decorator import MAGIC_ATTR
import os
def _bind_workers_method_to_parent(cls, key, user_defined_cls):
"""
Binds the methods of each worker to the WorkerDict.
Note that we only bind public methods that are decorated by register
"""
for method_name in dir(user_defined_cls):
try:
method = getattr(user_defined_cls, method_name)
assert callable(method), f"{method_name} in {user_defined_cls} is not callable"
except Exception as e:
# if it is a property, it will fail because Class doesn't have instance property
continue
if hasattr(method, MAGIC_ATTR):
def generate_function(name):
def func(self, *args, **kwargs):
# dispatch to the actual worker
return getattr(self.worker_dict[key], name)(*args, **kwargs)
return func
func = generate_function(method_name)
# pass MAGIC_ATTR for outer worker group
setattr(func, MAGIC_ATTR, getattr(method, MAGIC_ATTR))
try:
method_name_with_prefix = key + '_' + method_name
setattr(cls, method_name_with_prefix, func)
# print(f'Binding {method_name_with_prefix}')
except Exception as e:
raise ValueError(f'Fail to set method_name {method_name}')
def _unwrap_ray_remote(cls):
if hasattr(cls, '__ray_actor_class__'):
cls = cls.__ray_actor_class__
return cls
def create_colocated_worker_cls(class_dict: dict[str, RayClassWithInitArgs]):
"""
This function should return a class instance that delegates the calls to every
cls in cls_dict
"""
cls_dict = {}
init_args_dict = {}
worker_cls = None
for key, cls in class_dict.items():
if worker_cls == None:
worker_cls = cls.cls.__ray_actor_class__.__base__
else:
assert worker_cls == cls.cls.__ray_actor_class__.__base__, \
'the worker class should be the same when share the same process'
cls_dict[key] = cls.cls
init_args_dict[key] = {'args': cls.args, 'kwargs': cls.kwargs}
assert cls_dict.keys() == init_args_dict.keys()
# TODO: create a class with customizable name
class WorkerDict(worker_cls):
def __init__(self):
super().__init__()
self.worker_dict = {}
for key, user_defined_cls in cls_dict.items():
user_defined_cls = _unwrap_ray_remote(user_defined_cls)
# directly instantiate the class without remote
with patch.dict(os.environ, {'DISABLE_WORKER_INIT': '1'}):
self.worker_dict[key] = user_defined_cls(*init_args_dict[key].get('args', ()),
**init_args_dict[key].get('kwargs', {}))
# now monkey-patch the methods from inner class to WorkerDict
for key, user_defined_cls in cls_dict.items():
user_defined_cls = _unwrap_ray_remote(user_defined_cls)
_bind_workers_method_to_parent(WorkerDict, key, user_defined_cls)
remote_cls = ray.remote(WorkerDict)
remote_cls = RayClassWithInitArgs(cls=remote_cls)
return remote_cls

View File

@ -0,0 +1,65 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import functools
import json
import os
import ray
# compatiblity cern
from single_controller.base.decorator import *
def maybe_remote(main):
"""Schedule main function as ray remote task if VERL_DRIVER_NUM_GPUS or VERL_DRIVER_RESOURCES specified in config.
- VERL_DRIVER_NUM_GPUS: number of GPUs for driver task.
- VERL_DRIVER_RESOURCES: custom resources for driver task, e.g {"verl_driver": 1.0}.
For job submission to ray cluster, you can specify these two envs in runtime.yaml.
```yaml
working_dir: "."
env_vars:
VERL_DRIVER_NUM_GPUS: "1"
VERL_DRIVER_RESOURCES: '{"verl_driver": 1.0}'
```
ray job submit --runtime-env=runtime.yaml -- python3 test.py
Args:
main (Callable): main function to be schedule.
"""
num_gpus = 0
resources = {}
env_num_gpus = os.getenv("VERL_DRIVER_NUM_GPUS")
if env_num_gpus:
num_gpus = int(env_num_gpus)
env_resources = os.getenv("VERL_DRIVER_RESOURCES")
if env_resources:
resources = json.loads(env_resources)
print(f"verl driver num_gpus: {num_gpus}, resources={resources}")
assert isinstance(resources, dict), f"resources must be dict, got {type(resources)}"
@functools.wraps(main)
def _main(*args, **kwargs):
# Run main function locally.
if num_gpus == 0 and len(resources) == 0:
return main(*args, **kwargs)
# Run main function remotely as ray task.
f = ray.remote(num_gpus=num_gpus, resources=resources)(main)
return ray.get(f.remote(*args, **kwargs))
return _main

View File

@ -0,0 +1,23 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from tensordict import TensorDict
import ray
from verl import DataProto
class DistDataProto(DataProto, ray.ObjectRef):
...
# skip for prototype, assuming dp size kept among all Roles

View File

@ -0,0 +1,93 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ray
from single_controller.ray.base import RayWorkerGroup, RayResourcePool, RayClassWithInitArgs
@ray.remote
class RefBasicRayActor:
...
class DPEngineRayWorkerGroup(RayWorkerGroup):
class DummyModule:
def __init__(self, core, methods_names) -> None:
self.core = core
def func_generator(method_name):
def func(*args, **kwargs):
return self.core.execute_all_async("execute_engine", method_name, *args, **kwargs)
return func
for method_name in methods_names:
setattr(self, method_name, func_generator(method_name))
def __init__(self, name_prefix, process_dispatch_scheme, use_gpu, engine_type, *args, **kwargs) -> None:
from torch import nn
# print(f"in DataParallelEngineWrapper, name_prefix = {name_prefix}")
if isinstance(process_dispatch_scheme, RayResourcePool):
rpdc = process_dispatch_scheme
else:
rpdc = RayResourcePool(process_on_nodes=process_dispatch_scheme,
use_gpu=use_gpu,
name_prefix=name_prefix,
max_colocate_count=1)
rcia = RayClassWithInitArgs(cls=engine_type, *args, **kwargs)
self._engine_type = engine_type
super().__init__(rpdc, rcia)
nn_module_methods = [
method_name for method_name in dir(nn.Module)
if callable(getattr(nn.Module, method_name)) and not method_name.startswith("__")
]
nn_module_methods += ["__call__"]
def func_generator(method_name):
def func(*args, **kwargs):
return self.execute_all_async(method_name, *args, **kwargs)
return func
print(f"{engine_type} has methods: {dir(engine_type)}")
for method_name in dir(engine_type):
try:
is_callable = callable(getattr(engine_type, method_name))
except Exception as _:
pass
else:
if is_callable and method_name not in dir(RefBasicRayActor):
print(f"register method: {method_name}")
setattr(self, method_name, func_generator(method_name))
self.module = DPEngineRayWorkerGroup.DummyModule(self, nn_module_methods)
@property
def engine(self):
return self.module
def get_model_size_on_rank_zero(self):
results = ray.get([worker.get_model_size_on_rank_zero.remote() for worker in self._workers])
for result in results:
if result is not None:
return result

View File

@ -0,0 +1,62 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict, Optional
import ray
from .base import RayWorkerGroup, RayResourcePool, RayClassWithInitArgs
from single_controller.base.megatron.worker import DistRankInfo, DistGlobalInfo
from single_controller.base.megatron.worker_group import MegatronWorkerGroup
# NOTE(sgm): for opensource megatron-core
class NVMegatronRayWorkerGroup(RayWorkerGroup, MegatronWorkerGroup):
"""
MegatronWorkerGroup will query each worker of its megatron rank info and store it inside the WorkerGroup
so that the dispatcher can use it to dispatch data.
"""
def __init__(self, resource_pool: RayResourcePool, ray_cls_with_init: RayClassWithInitArgs, **kwargs):
super().__init__(resource_pool=resource_pool, ray_cls_with_init=ray_cls_with_init, **kwargs)
self._megatron_rank_info: DistRankInfo = self.execute_all_sync(method_name='get_megatron_rank_info')
self._megatron_global_info: DistGlobalInfo = ray.get(
self.execute_rank_zero_async(method_name='get_megatron_global_info'))
class MegatronRayWorkerGroup(RayWorkerGroup, MegatronWorkerGroup):
"""
MegatronWorkerGroup will query each worker of its megatron rank info and store it inside the WorkerGroup
so that the dispatcher can use it to dispatch data.
"""
def __init__(self,
resource_pool: RayResourcePool,
ray_cls_with_init: RayClassWithInitArgs,
default_megatron_kwargs: Dict = None,
**kwargs):
super().__init__(resource_pool=resource_pool,
ray_cls_with_init=ray_cls_with_init,
default_megatron_kwargs=default_megatron_kwargs,
**kwargs)
self.init_megatron(default_megatron_kwargs=default_megatron_kwargs)
self._megatron_rank_info: DistRankInfo = self.execute_all_sync(method_name='get_megatron_rank_info')
self._megatron_global_info: DistGlobalInfo = ray.get(
self.execute_rank_zero_async(method_name='get_megatron_global_info'))
def init_megatron(self, default_megatron_kwargs: Optional[Dict] = None):
# after super, we will call init of each worker
if not self._is_init_with_detached_workers:
# only init_megatron if the WorkerGroup is created from scratch
self.execute_all_sync(method_name='init_megatron', default_megatron_kwargs=default_megatron_kwargs)

View File

@ -0,0 +1 @@
0.0.2

27
verl/__init__.py Normal file
View File

@ -0,0 +1,27 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
version_folder = os.path.dirname(os.path.join(os.path.abspath(__file__)))
with open(os.path.join(version_folder, 'version/version')) as f:
__version__ = f.read().strip()
from .protocol import DataProto
from .utils.logging_utils import set_basic_config
import logging
set_basic_config(level=logging.WARNING)

35
verl/models/README.md Normal file
View File

@ -0,0 +1,35 @@
# Models
Common modelzoo such as huggingface/transformers stuggles when using Pytorch native model parallelism. Following the design principle of vLLM, we keep a simple, parallelizable, highly-optimized with packed inputs in verl.
## Adding a New Huggingface Model
### Step 1: Copy the model file from HF to verl
- Add a new file under verl/models/hf
- Copy ONLY the model file from huggingface/transformers/models to verl/models/hf
### Step 2: Modify the model file to use packed inputs
- Remove all the code related to inference (kv cache)
- Modify the inputs to include only
- input_ids (total_nnz,)
- cu_seqlens (total_nnz + 1,)
- max_seqlen_in_batch: int
- Note that this requires using flash attention with causal mask.
### Step 2.5: Add tests
- Add a test to compare this version and the huggingface version
- Following the infrastructure and add tests to tests/models/hf
### Step 3: Add a function to apply tensor parallelism
- Please follow
- https://pytorch.org/docs/stable/distributed.tensor.parallel.html
- https://pytorch.org/tutorials/intermediate/TP_tutorial.html
- General comments
- Tensor Parallelism in native Pytorch is NOT auto-parallelism. The way it works is to specify how model parameters and input/output reshards using configs. These configs are then registered as hooks to perform input/output resharding before/after model forward.
### Step 4: Add a function to apply data parallelism
- Please use FSDP2 APIs
- See demo here https://github.com/pytorch/torchtitan/blob/main/torchtitan/parallelisms/parallelize_llama.py#L413
### Step 5: Add a function to apply pipeline parallelism
- Comes in Pytorch 2.4
- Currently only in alpha in nightly version
- Check torchtitan for more details

13
verl/models/__init__.py Normal file
View File

@ -0,0 +1,13 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -0,0 +1,24 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .modeling_llama_megatron import (
# original model with megatron
ParallelLlamaModel,
ParallelLlamaForCausalLM,
# rmpad with megatron
ParallelLlamaForCausalLMRmPad,
ParallelLlamaForValueRmPad,
# rmpad with megatron and pipeline parallelism
ParallelLlamaForCausalLMRmPadPP,
ParallelLlamaForValueRmPadPP)

View File

@ -0,0 +1,13 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -0,0 +1,446 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import time
from typing import Dict, Any, Callable, Optional
import torch.distributed as dist
def _megatron_calc_layer_map(config):
"""Calculate the mapping of global layer_idx to local layer_idx
Returns:
layer_map (Dict: int -> tuple(int, int, int)):
mapping from the global layer index to
a tuple of (pp_rank, virtual_pp_rank, layer_idx inside model)
"""
import megatron
from megatron.core import mpu
pp_size = mpu.get_pipeline_model_parallel_world_size()
virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
layer_map = dict()
num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers
for pp_rank_idx in range(pp_size):
for virtual_pp_rank_idx in range(virtual_pp_size):
layer_offset = (virtual_pp_rank_idx * (config.num_hidden_layers // virtual_pp_size) +
pp_rank_idx * num_layers_per_model)
for layer_idx in range(num_layers_per_model):
layer_map[layer_offset + layer_idx] = (
pp_rank_idx,
virtual_pp_rank_idx,
layer_idx,
)
return layer_map
def load_state_dict_to_megatron_llama(state_dict, wrapped_models, config, params_dtype, is_value_model=False):
"""Load merged state_dict to sharded Megatron module in training.
"""
import megatron
from megatron.core import mpu
from megatron.utils import print_rank_0, unwrap_model
from megatron.core.transformer.module import Float16Module
from megatron.core import DistributedDataParallel as LocalDDP
from torch.nn.parallel import DistributedDataParallel as torchDDP
start_time = time.time()
def _get_gpt_model(model):
return model
def broadcast_params(module):
for param in module.parameters():
torch.distributed.broadcast(param.data,
src=mpu.get_data_parallel_src_rank(),
group=mpu.get_data_parallel_group())
dp_rank = mpu.get_data_parallel_rank()
pp_rank = mpu.get_pipeline_model_parallel_rank()
pp_size = mpu.get_pipeline_model_parallel_world_size()
virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
mp_group = mpu.get_model_parallel_group()
if torch.distributed.get_rank() == 0:
assert mp_group.rank() == 0, f"mp_rank:[{mp_group.rank}] != 0 on rank #0"
assert pp_rank == 0, f"pp_rank:[{pp_rank}] != 0 on rank #0"
assert dp_rank == 0, f"dp_rank:[{dp_rank}] != 0 on rank #0"
if not isinstance(wrapped_models, (list, tuple)):
wrapped_models = list(wrapped_models)
assert len(wrapped_models) == virtual_pp_size
num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers
models = [None] * len(wrapped_models)
for i, wrapped_model in enumerate(wrapped_models):
models[i] = unwrap_model(wrapped_model, (torchDDP, LocalDDP, Float16Module))
gpt_model_module = _get_gpt_model(models[i])
assert len(gpt_model_module.model.layers) == num_layers_per_model
def _broadcast_tensor(tensor, name) -> torch.Tensor:
"""broadcast tensor from rank0 across mp_group"""
nonlocal state_dict
nonlocal mp_group
if torch.distributed.get_rank() == 0:
if name in state_dict:
weight = state_dict[name]
tensor_shape = weight.shape
else:
tensor_shape = None
else:
weight = None
tensor_shape = None
obj_list = [tensor_shape]
dist.broadcast_object_list(obj_list, src=0, group=mp_group)
tensor_shape = obj_list[0]
if tensor_shape is None:
# all or none ranks in the mp_group should reach here
print_rank_0(f"tensor:[{name}] not in state_dict, skip load")
return
if tensor is None:
tensor = torch.empty(
tensor_shape,
dtype=params_dtype,
device=torch.cuda.current_device(),
requires_grad=False,
)
if torch.distributed.get_rank() == 0:
tensor.data.copy_(weight)
dist.broadcast(tensor, src=0, group=mp_group)
def _broadcast_tp_shard_tensor_vocab(tensor, name, chunk_dim=0, mutate_func=None) -> torch.Tensor:
"""broadcast tensor in tp shards across mp_group"""
nonlocal state_dict
nonlocal mp_group
tp_rank = mpu.get_tensor_model_parallel_rank()
tp_size = mpu.get_tensor_model_parallel_world_size()
if torch.distributed.get_rank() == 0:
if name in state_dict:
full_weight = state_dict[name]
if mutate_func is not None:
full_weight = mutate_func(full_weight)
tensor_chunk = torch.chunk(full_weight, tp_size, dim=chunk_dim)
chunk_shape = tensor_chunk[0].shape
else:
chunk_shape = None
else:
chunk_shape = None
obj_list = [chunk_shape]
dist.broadcast_object_list(obj_list, src=0, group=mp_group)
chunk_shape = obj_list[0]
if chunk_shape is None:
# all or none ranks in the mp_group should reach here
print_rank_0(f"tp_shard tensor:[{name}] not in state_dict, skip loading")
return
if tensor is None:
sync_tensor = torch.empty(
chunk_shape,
dtype=params_dtype,
device=torch.cuda.current_device(),
requires_grad=False,
)
else:
assert (tensor.shape == chunk_shape
), f"rank #{torch.distributed.get_rank()} tensor {name} shape {tensor.shape} != {chunk_shape}"
sync_tensor = torch.empty_like(tensor, device=torch.cuda.current_device(), requires_grad=False)
for i in range(tp_size):
if torch.distributed.get_rank() == 0:
sync_tensor.data.copy_(tensor_chunk[i])
dist.broadcast(sync_tensor, src=0, group=mp_group)
if (i == tp_rank) and (tensor is not None):
tensor.data.copy_(sync_tensor)
def _broadcast_tp_shard_tensor(tensor, name, chunk_dim=0, mutate_func=None) -> torch.Tensor:
"""broadcast tensor in tp shards across mp_group"""
nonlocal state_dict
nonlocal mp_group
tp_rank = mpu.get_tensor_model_parallel_rank()
tp_size = mpu.get_tensor_model_parallel_world_size()
if torch.distributed.get_rank() == 0:
if name in state_dict:
full_weight = state_dict[name]
if mutate_func is not None:
full_weight = mutate_func(full_weight)
tensor_chunk = torch.chunk(full_weight, tp_size, dim=chunk_dim)
chunk_shape = tensor_chunk[0].shape
else:
chunk_shape = None
else:
chunk_shape = None
obj_list = [chunk_shape]
dist.broadcast_object_list(obj_list, src=0, group=mp_group)
chunk_shape = obj_list[0]
if chunk_shape is None:
# all or none ranks in the mp_group should reach here
print_rank_0(f"tp_shard tensor:[{name}] not in state_dict, skip loading")
return
if tensor is None:
sync_tensor = torch.empty(
chunk_shape,
dtype=params_dtype,
device=torch.cuda.current_device(),
requires_grad=False,
)
else:
assert (tensor.shape == chunk_shape
), f"rank #{torch.distributed.get_rank()} tensor {name} shape {tensor.shape} != {chunk_shape}"
sync_tensor = torch.empty_like(tensor, device=torch.cuda.current_device(), requires_grad=False)
for i in range(tp_size):
if torch.distributed.get_rank() == 0:
sync_tensor.data.copy_(tensor_chunk[i])
dist.broadcast(sync_tensor, src=0, group=mp_group)
if (i == tp_rank) and (tensor is not None):
tensor.data.copy_(sync_tensor)
def _broadcast_tp_shard_tensor_gate_up(tensor, gate_name, up_name) -> torch.Tensor:
"""broadcast tensor in tp shards across mp_group"""
nonlocal state_dict
nonlocal mp_group
tp_rank = mpu.get_tensor_model_parallel_rank()
tp_size = mpu.get_tensor_model_parallel_world_size()
if torch.distributed.get_rank() == 0:
gate_weight = state_dict[gate_name]
up_weight = state_dict[up_name]
new_gate_up_weight = torch.empty(config.intermediate_size * 2,
config.hidden_size,
dtype=params_dtype,
device=torch.cuda.current_device())
for i in range(tp_size):
intermediate_size_tp = config.intermediate_size // tp_size
gate_weight_tp = gate_weight[i * intermediate_size_tp:(i + 1) * intermediate_size_tp]
up_weight_tp = up_weight[i * intermediate_size_tp:(i + 1) * intermediate_size_tp]
new_gate_up_weight[intermediate_size_tp * 2 * i:intermediate_size_tp * 2 * (i + 1)].copy_(
torch.cat([gate_weight_tp, up_weight_tp], dim=0))
tensor_chunk = torch.chunk(new_gate_up_weight, tp_size, dim=0)
chunk_shape = tensor_chunk[0].shape
else:
chunk_shape = None
obj_list = [chunk_shape]
dist.broadcast_object_list(obj_list, src=0, group=mp_group)
chunk_shape = obj_list[0]
if chunk_shape is None:
# all or none ranks in the mp_group should reach here
print_rank_0(f"tp_shard tensor:[{gate_name, up_name}] not in state_dict, skip loading")
return
if tensor is None:
sync_tensor = torch.empty(
chunk_shape,
dtype=params_dtype,
device=torch.cuda.current_device(),
requires_grad=False,
)
else:
assert (
tensor.shape == chunk_shape
), f"rank #{torch.distributed.get_rank() == 0:} tensor {gate_name, up_name} shape {tensor.shape} != {chunk_shape}"
sync_tensor = torch.empty_like(tensor, device=torch.cuda.current_device(), requires_grad=False)
for i in range(tp_size):
if torch.distributed.get_rank() == 0:
sync_tensor.data.copy_(tensor_chunk[i])
dist.broadcast(sync_tensor, src=0, group=mp_group)
if (i == tp_rank) and (tensor is not None):
tensor.data.copy_(sync_tensor)
def _broadcast_tp_shard_tensor_qkv(tensor, q_name, k_name, v_name) -> torch.Tensor:
"""broadcast tensor in tp shards across mp_group"""
nonlocal state_dict
nonlocal mp_group
tp_rank = mpu.get_tensor_model_parallel_rank()
tp_size = mpu.get_tensor_model_parallel_world_size()
if torch.distributed.get_rank() == 0:
assert (q_name in state_dict and k_name in state_dict and v_name in state_dict)
full_weight_q = state_dict[q_name]
full_weight_k = state_dict[k_name]
full_weight_v = state_dict[v_name]
hidden_size_per_head = config.hidden_size // config.num_attention_heads
if config.num_key_value_heads >= tp_size:
q_size_tp = config.hidden_size // tp_size
kv_size_tp = hidden_size_per_head * config.num_key_value_heads // tp_size
total_size = q_size_tp + 2 * kv_size_tp
new_weight_qkv = torch.empty(total_size * tp_size,
config.hidden_size,
dtype=params_dtype,
device=torch.cuda.current_device())
for i in range(tp_size):
q_part = full_weight_q[i * q_size_tp:(i + 1) * q_size_tp]
k_part = full_weight_k[i * kv_size_tp:(i + 1) * kv_size_tp]
v_part = full_weight_v[i * kv_size_tp:(i + 1) * kv_size_tp]
new_weight_qkv[i * total_size:(i + 1) * total_size].copy_(torch.cat([q_part, k_part, v_part],
dim=0))
else:
q_size_tp = config.hidden_size // tp_size
kv_size_tp = hidden_size_per_head
total_size = q_size_tp + 2 * kv_size_tp
new_weight_qkv = torch.empty(total_size * tp_size,
config.hidden_size,
dtype=params_dtype,
device=torch.cuda.current_device())
for i in range(tp_size):
q_part = full_weight_q[i * q_size_tp:(i + 1) * q_size_tp]
start_idx = i * config.num_key_value_heads // tp_size * hidden_size_per_head
end_idx = (i * config.num_key_value_heads // tp_size + 1) * hidden_size_per_head
k_part = full_weight_k[start_idx:end_idx]
v_part = full_weight_v[start_idx:end_idx]
new_weight_qkv[i * total_size:(i + 1) * total_size].copy_(torch.cat([q_part, k_part, v_part],
dim=0))
tensor_chunk = torch.chunk(new_weight_qkv, tp_size, dim=0)
chunk_shape = tensor_chunk[0].shape
else:
chunk_shape = None
obj_list = [chunk_shape]
dist.broadcast_object_list(obj_list, src=0, group=mp_group)
chunk_shape = obj_list[0]
if chunk_shape is None:
# all or none ranks in the mp_group should reach here
print_rank_0(f"tp_shard tensor:[{name}] not in state_dict, skip loading")
return
if tensor is None:
sync_tensor = torch.empty(
chunk_shape,
dtype=params_dtype,
device=torch.cuda.current_device(),
requires_grad=False,
)
else:
assert (tensor.shape == chunk_shape
), f"rank #{torch.distributed.get_rank()} tensor {q_name} shape {tensor.shape} != {chunk_shape}"
sync_tensor = torch.empty_like(tensor, device=torch.cuda.current_device(), requires_grad=False)
for i in range(tp_size):
if torch.distributed.get_rank() == 0:
sync_tensor.data.copy_(tensor_chunk[i])
dist.broadcast(sync_tensor, src=0, group=mp_group)
if (i == tp_rank) and (tensor is not None):
tensor.data.copy_(sync_tensor)
if dp_rank == 0:
# Embeddings
# -------------------
print_rank_0("loading embeddings...")
gpt_model_module = _get_gpt_model(models[0])
embed_tokens_weight = None
if pp_rank == 0:
embed_tokens_weight = gpt_model_module.model.embed_tokens.weight
_broadcast_tp_shard_tensor_vocab(embed_tokens_weight, "model.embed_tokens.weight")
# Transformer layers
# -------------------
layer_map = _megatron_calc_layer_map(config)
for layer in range(config.num_hidden_layers):
print_rank_0(f"loading layer #{layer}...")
layer_name = f"model.layers.{layer}"
dst_pp_rank, dst_virtual_pp_rank, dst_layer_idx = layer_map[layer]
gpt_model_module = _get_gpt_model(models[dst_virtual_pp_rank])
sync_layer = gpt_model_module.model.layers[dst_layer_idx]
_broadcast_tensor(
sync_layer.input_layernorm.weight if dst_pp_rank == pp_rank else None,
f"{layer_name}.input_layernorm.weight",
)
_broadcast_tp_shard_tensor_qkv(
sync_layer.self_attn.qkv_proj.weight if dst_pp_rank == pp_rank else None,
f"{layer_name}.self_attn.q_proj.weight",
f"{layer_name}.self_attn.k_proj.weight",
f"{layer_name}.self_attn.v_proj.weight",
)
_broadcast_tp_shard_tensor(
sync_layer.self_attn.o_proj.weight if dst_pp_rank == pp_rank else None,
f"{layer_name}.self_attn.o_proj.weight",
chunk_dim=1,
)
_broadcast_tensor(
sync_layer.post_attention_layernorm.weight if dst_pp_rank == pp_rank else None,
f"{layer_name}.post_attention_layernorm.weight",
)
_broadcast_tp_shard_tensor_gate_up(sync_layer.mlp.gate_up_proj.weight if dst_pp_rank == pp_rank else None,
f"{layer_name}.mlp.gate_proj.weight", f"{layer_name}.mlp.up_proj.weight")
_broadcast_tp_shard_tensor(
sync_layer.mlp.down_proj.weight if dst_pp_rank == pp_rank else None,
f"{layer_name}.mlp.down_proj.weight",
chunk_dim=1,
)
# Final Layernorm
# -------------------
print_rank_0("loading final layernorm...")
gpt_model_module = _get_gpt_model(models[-1])
_broadcast_tensor(
getattr(gpt_model_module.model.norm, "weight", None),
"model.norm.weight",
)
print_rank_0("loading lm_head...")
lm_head_weight = None
if pp_rank + 1 == pp_size:
lm_head_weight = gpt_model_module.lm_head.weight
if is_value_model:
# if torch.distributed.get_rank() == 0:
if 'lm_head.weight' in state_dict and state_dict['lm_head.weight'].shape[0] == 1:
_broadcast_tensor(lm_head_weight, "lm_head.weight")
elif 'reward_head.weight' in state_dict and state_dict['reward_head.weight'].shape[0] == 1:
_broadcast_tensor(lm_head_weight, "reward_head.weight")
print_rank_0('load lm_head from value_head weight')
else:
_broadcast_tensor(None, "lm_head.weight")
print_rank_0('fail to match lm_head in value_model')
# else:
# _broadcast_tensor(lm_head_weight, "lm_head.weight")
else:
_broadcast_tp_shard_tensor(lm_head_weight, "lm_head.weight")
dist.barrier()
# Broadcast weights inside data parallel groups
for wrapped_model in wrapped_models:
broadcast_params(wrapped_model)
torch.cuda.empty_cache()
print_rank_0(f"loading megatron ckpt done, time elapsed {time.time() - start_time}s")

View File

@ -0,0 +1,449 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import megatron
from megatron.core import mpu
from megatron.utils import print_rank_0, unwrap_model
from megatron.model import Float16Module
from megatron.model import DistributedDataParallel as LocalDDP
from torch.nn.parallel import DistributedDataParallel as torchDDP
import torch
import time
from typing import Optional
import torch.distributed as dist
from megatron import get_args
def _megatron_calc_global_rank(tp_rank: int = 0, dp_rank: int = 0, pp_rank: int = 0):
"""given TP,DP,PP rank to get the global rank."""
args = get_args()
tp_size = mpu.get_tensor_model_parallel_world_size()
dp_size = mpu.get_data_parallel_world_size()
pp_size = mpu.get_pipeline_model_parallel_world_size()
assert (tp_size * dp_size * pp_size == torch.distributed.get_world_size()
), f"{tp_size} x {dp_size} x {pp_size} != {torch.distributed.get_world_size()}"
if args.switch_dp_and_pp_grouping:
# TP-PP-DP grouping
return (dp_rank * pp_size + pp_rank) * tp_size + tp_rank
else:
# TP-DP-PP grouping
return (pp_rank * dp_size + dp_rank) * tp_size + tp_rank
def _megatron_calc_layer_map(config):
"""Calculate the mapping of global layer_idx to local layer_idx
Returns:
layer_map (Dict: int -> tuple(int, int, int)):
mapping from the global layer index to
a tuple of (pp_rank, virtual_pp_rank, layer_idx inside model)
"""
import megatron
from megatron.core import mpu
pp_size = mpu.get_pipeline_model_parallel_world_size()
virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
args = megatron.get_args()
layer_map = dict()
num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers
for pp_rank_idx in range(pp_size):
for virtual_pp_rank_idx in range(virtual_pp_size):
layer_offset = (virtual_pp_rank_idx * (config.num_hidden_layers // virtual_pp_size) +
pp_rank_idx * num_layers_per_model)
for layer_idx in range(num_layers_per_model):
layer_map[layer_offset + layer_idx] = (
pp_rank_idx,
virtual_pp_rank_idx,
layer_idx,
)
return layer_map
def merge_megatron_ckpt_llama(wrapped_models, config, is_value_model=False, dtype='bf16'):
"""Merge sharded parameters of a Megatron module into a merged checkpoint.
Args:
wrapped_modelss (list of megatron.model.DistributedDataParallel):
The local DDP wrapped megatron modules.
dtype (str or None):
The data type of state_dict. if None, the data type of the original parameters
is used.
gpt_model_key: key to access model
Returns:
state_dict (dict):
The merged state_dict in rank 0, and an empty dictionary in other ranks.
"""
start_time = time.time()
args = megatron.get_args()
def _get_gpt_model(model):
return model
dp_rank = mpu.get_data_parallel_rank()
pp_size = mpu.get_pipeline_model_parallel_world_size()
pp_rank = mpu.get_pipeline_model_parallel_rank()
virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
mp_group = mpu.get_model_parallel_group()
if dist.get_rank() == 0:
assert mp_group.rank() == 0, f"mp_rank:[{mp_group.rank}] != 0 on rank #0"
assert pp_rank == 0, f"pp_rank:[{pp_rank}] != 0 on rank #0"
assert dp_rank == 0, f"dp_rank:[{dp_rank}] != 0 on rank #0"
if not isinstance(wrapped_models, (list, tuple)):
wrapped_models = list(wrapped_models)
assert len(wrapped_models) == virtual_pp_size
num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers
models = [None] * len(wrapped_models)
for i, wrapped_model in enumerate(wrapped_models):
models[i] = unwrap_model(wrapped_model, (torchDDP, LocalDDP, Float16Module))
assert len(models[i].model.layers
) == num_layers_per_model, 'len model layers {} not equal to num_layers_per_model {}'.format(
len(models[i].model.layers), num_layers_per_model)
state_dict = dict()
def _get_cpu_tensor(tensor: torch.Tensor):
if tensor is None:
return None
if tensor.device == torch.device("cpu"):
return tensor.detach().clone()
return tensor.detach().cpu()
def _broadcast_tensor(tensor, name, src_pp_rank) -> torch.Tensor:
"""broadcast tensor across mp_group"""
nonlocal state_dict
nonlocal mp_group
src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank)
if torch.distributed.get_rank() == src_rank:
if tensor is None:
weight = None
tensor_shape = None
else:
weight = tensor
tensor_shape = weight.shape
else:
weight = None
tensor_shape = None
obj_list = [tensor_shape]
dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
tensor_shape = obj_list[0]
if tensor_shape is None:
# all or none ranks in the mp_group should reach here
print_rank_0(f"tensor:[{name}] not exist, skip collect")
return
if weight is None:
weight = torch.empty(
tensor_shape,
dtype=args.params_dtype,
device=torch.cuda.current_device(),
requires_grad=False,
)
dist.broadcast(weight, src=src_rank, group=mp_group)
if torch.distributed.get_rank() == 0:
state_dict[name] = _get_cpu_tensor(weight)
def _broadcast_tp_shard_tensor(tensor, name, src_pp_rank, concat_dim=0, mutate_func=None) -> torch.Tensor:
"""broadcast tensor in tp shards across mp_group"""
nonlocal state_dict
nonlocal mp_group
tp_rank = mpu.get_tensor_model_parallel_rank()
tp_size = mpu.get_tensor_model_parallel_world_size()
src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank)
if torch.distributed.get_rank() == src_rank:
chunk_shape = tensor.shape
else:
chunk_shape = None
obj_list = [chunk_shape]
dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
chunk_shape = obj_list[0]
if chunk_shape is None:
# all or none ranks in the mp_group should reach here
print_rank_0(f"tp_shard tensor:[{name}] not exist, skip collecting")
return
buffer_tensor = torch.empty(
chunk_shape,
dtype=args.params_dtype,
device=torch.cuda.current_device(),
requires_grad=False,
)
chunk_tensors = [None] * tp_size
for i in range(tp_size):
cur_src_rank = _megatron_calc_global_rank(tp_rank=i, dp_rank=0, pp_rank=src_pp_rank)
sync_tensor = tensor if torch.distributed.get_rank() == cur_src_rank else buffer_tensor
dist.broadcast(sync_tensor, src=cur_src_rank, group=mp_group)
if torch.distributed.get_rank() == 0:
chunk_tensors[i] = _get_cpu_tensor(sync_tensor)
if torch.distributed.get_rank() == 0:
full_tensor = torch.concat(chunk_tensors, dim=concat_dim)
if mutate_func is not None:
full_tensor = mutate_func(full_tensor)
state_dict[name] = full_tensor
def _broadcast_tp_shard_tensor_gate_up(tensor, gate_name, up_name, src_pp_rank) -> torch.Tensor:
"""broadcast tensor in tp shards across mp_group"""
nonlocal state_dict
nonlocal mp_group
tp_rank = mpu.get_tensor_model_parallel_rank()
tp_size = mpu.get_tensor_model_parallel_world_size()
src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank)
if torch.distributed.get_rank() == src_rank:
chunk_shape = tensor.shape
else:
chunk_shape = None
obj_list = [chunk_shape]
dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
chunk_shape = obj_list[0]
if chunk_shape is None:
# all or none ranks in the mp_group should reach here
print_rank_0(f"tp_shard tensor:[{gate_name, up_name}] not exist, skip collecting")
return
buffer_tensor = torch.empty(
chunk_shape,
dtype=args.params_dtype,
device=torch.cuda.current_device(),
requires_grad=False,
)
chunk_tensors = [None] * tp_size
for i in range(tp_size):
cur_src_rank = _megatron_calc_global_rank(tp_rank=i, dp_rank=0, pp_rank=src_pp_rank)
sync_tensor = tensor if torch.distributed.get_rank() == cur_src_rank else buffer_tensor
dist.broadcast(sync_tensor, src=cur_src_rank, group=mp_group)
if torch.distributed.get_rank() == 0:
chunk_tensors[i] = _get_cpu_tensor(sync_tensor)
if torch.distributed.get_rank() == 0:
full_tensor = torch.concat(chunk_tensors, dim=0)
intermediate_size_tp = config.intermediate_size // tp_size
gate_weight_list = []
up_weight_list = []
for i in range(tp_size):
gate_up_weight_tp = full_tensor[intermediate_size_tp * 2 * i:intermediate_size_tp * 2 * (i + 1)]
gate_weight_tp = gate_up_weight_tp[:intermediate_size_tp]
up_weight_tp = gate_up_weight_tp[intermediate_size_tp:]
gate_weight_list.append(gate_weight_tp)
up_weight_list.append(up_weight_tp)
state_dict[gate_name] = torch.cat(gate_weight_list, dim=0)
state_dict[up_name] = torch.cat(up_weight_list, dim=0)
def _broadcast_tp_shard_tensor_qkv(tensor, q_name, k_name, v_name, src_pp_rank):
"""broadcast tensor in tp shards across mp_group"""
nonlocal state_dict
nonlocal mp_group
tp_rank = mpu.get_tensor_model_parallel_rank()
tp_size = mpu.get_tensor_model_parallel_world_size()
src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank)
if torch.distributed.get_rank() == src_rank:
chunk_shape = tensor.shape
else:
chunk_shape = None
obj_list = [chunk_shape]
dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
chunk_shape = obj_list[0]
if chunk_shape is None:
# all or none ranks in the mp_group should reach here
print_rank_0(f"tp_shard tensor:[{q_name}] not exist, skip collecting")
return
buffer_tensor = torch.empty(
chunk_shape,
dtype=args.params_dtype,
device=torch.cuda.current_device(),
requires_grad=False,
)
chunk_tensors = [None] * tp_size
for i in range(tp_size):
cur_src_rank = _megatron_calc_global_rank(tp_rank=i, dp_rank=0, pp_rank=src_pp_rank)
sync_tensor = tensor if torch.distributed.get_rank() == cur_src_rank else buffer_tensor
dist.broadcast(sync_tensor, src=cur_src_rank, group=mp_group)
if torch.distributed.get_rank() == 0:
chunk_tensors[i] = _get_cpu_tensor(sync_tensor)
if torch.distributed.get_rank() == 0:
full_tensor = torch.concat(chunk_tensors, dim=0)
q_weight_list = []
k_weight_list = []
v_weight_list = []
hidden_size_per_head = config.hidden_size // config.num_attention_heads
if config.num_key_value_heads >= tp_size:
q_size_tp = config.hidden_size // tp_size
kv_size_tp = hidden_size_per_head * config.num_key_value_heads // tp_size
total_size = q_size_tp + 2 * kv_size_tp
for i in range(tp_size):
qkv_part = full_tensor[i * total_size:(i + 1) * total_size]
q_part = qkv_part[:q_size_tp]
k_part = qkv_part[q_size_tp:q_size_tp + kv_size_tp]
v_part = qkv_part[q_size_tp + kv_size_tp:total_size]
q_weight_list.append(q_part)
k_weight_list.append(k_part)
v_weight_list.append(v_part)
else:
q_size_tp = config.hidden_size // tp_size
kv_size_tp = hidden_size_per_head
total_size = q_size_tp + 2 * kv_size_tp
for i in range(tp_size):
qkv_part = full_tensor[i * total_size:(i + 1) * total_size]
q_part = qkv_part[:q_size_tp]
k_part = qkv_part[q_size_tp:q_size_tp + kv_size_tp]
v_part = qkv_part[q_size_tp + kv_size_tp:total_size]
q_weight_list.append(q_part)
if i * config.num_key_value_heads % tp_size == 0:
k_weight_list.append(k_part)
v_weight_list.append(v_part)
state_dict[q_name] = torch.cat(q_weight_list, dim=0)
state_dict[k_name] = torch.cat(k_weight_list, dim=0)
state_dict[v_name] = torch.cat(v_weight_list, dim=0)
# empty cache before collecting weights
torch.cuda.empty_cache()
# Embeddings
# -------------------
if dp_rank == 0:
# Embeddings
# -------------------
print_rank_0("collecting embeddings...")
gpt_model_module = _get_gpt_model(models[0])
_broadcast_tp_shard_tensor(
gpt_model_module.model.embed_tokens.weight if pp_rank == 0 else None,
"model.embed_tokens.weight",
src_pp_rank=0,
)
# Transformer layers
# -------------------
layer_map = _megatron_calc_layer_map(config)
for layer in range(config.num_hidden_layers):
print_rank_0(f"collecting layer #{layer}...")
layer_name = f"model.layers.{layer}"
src_pp_rank, src_virtual_pp_rank, src_layer_idx = layer_map[layer]
gpt_model_module = _get_gpt_model(models[src_virtual_pp_rank])
sync_layer = gpt_model_module.model.layers[src_layer_idx]
_broadcast_tensor(
sync_layer.input_layernorm.weight,
f"{layer_name}.input_layernorm.weight",
src_pp_rank=src_pp_rank,
)
_broadcast_tp_shard_tensor_qkv(
sync_layer.self_attn.qkv_proj.weight,
f"{layer_name}.self_attn.q_proj.weight",
f"{layer_name}.self_attn.k_proj.weight",
f"{layer_name}.self_attn.v_proj.weight",
src_pp_rank=src_pp_rank,
)
_broadcast_tp_shard_tensor(
sync_layer.self_attn.o_proj.weight,
f"{layer_name}.self_attn.o_proj.weight",
concat_dim=1,
src_pp_rank=src_pp_rank,
)
_broadcast_tensor(
sync_layer.post_attention_layernorm.weight,
f"{layer_name}.post_attention_layernorm.weight",
src_pp_rank=src_pp_rank,
)
_broadcast_tp_shard_tensor_gate_up(sync_layer.mlp.gate_up_proj.weight,
f"{layer_name}.mlp.gate_proj.weight",
f"{layer_name}.mlp.up_proj.weight",
src_pp_rank=src_pp_rank)
_broadcast_tp_shard_tensor(
sync_layer.mlp.down_proj.weight,
f"{layer_name}.mlp.down_proj.weight",
concat_dim=1,
src_pp_rank=src_pp_rank,
)
# Final Layernorm
# -------------------
print_rank_0("collecting final layernorm...")
gpt_model_module = _get_gpt_model(models[-1])
_broadcast_tensor(
getattr(gpt_model_module.model.norm, "weight", None),
"model.norm.weight",
src_pp_rank=pp_size - 1,
)
print_rank_0("collecting lm_head...")
if is_value_model:
_broadcast_tensor(getattr(gpt_model_module.lm_head, "weight", None) if pp_rank == pp_size - 1 else None,
"reward_head.weight",
src_pp_rank=pp_size - 1)
else:
_broadcast_tp_shard_tensor(
getattr(gpt_model_module.lm_head, "weight", None) if pp_rank == pp_size - 1 else None,
"lm_head.weight",
src_pp_rank=pp_size - 1,
)
dist.barrier()
torch.cuda.empty_cache()
if torch.distributed.get_rank() == 0:
if dtype == "fp16":
dtype = torch.float16
elif dtype == "bf16":
dtype = torch.bfloat16
elif dtype is None or dtype == "fp32":
dtype = torch.float32
else:
print(f'Unknown/unsupported dtype to save: {dtype}"')
exit(1)
for k, v in state_dict.items():
if dtype != v.dtype:
state_dict[k] = v.to(dtype)
print_rank_0(f"merge megatron ckpt done, time elapsed {time.time() - start_time}s")
return state_dict

View File

@ -0,0 +1,18 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .parallel_attention import ParallelLlamaAttention
from .parallel_decoder import ParallelLlamaDecoderLayer, ParallelLlamaDecoderLayerRmPad
from .parallel_mlp import ParallelLlamaMLP
from .parallel_rmsnorm import ParallelLlamaRMSNorm

View File

@ -0,0 +1,418 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import Optional, Tuple
import torch
from megatron.core import parallel_state as mpu
from megatron.core import tensor_parallel
from megatron.core import ModelParallelConfig
from torch import nn
from transformers import LlamaConfig
from verl.models.llama.megatron.layers.parallel_linear import QKVParallelLinear
from verl.utils.megatron import tensor_parallel as tp_utils
class LlamaRotaryEmbedding(nn.Module):
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
super().__init__()
self.dim = dim
self.max_position_embeddings = max_position_embeddings
self.base = base
inv_freq = 1.0 / (self.base**(torch.arange(0, self.dim, 2).float().to(device) / self.dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
# Build here to make `torch.jit.trace` work.
self._set_cos_sin_cache(seq_len=max_position_embeddings,
device=self.inv_freq.device,
dtype=torch.get_default_dtype())
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
freqs = torch.einsum("i,j->ij", t, self.inv_freq)
# Different from paper, but it uses a different permutation in order to obtain the same calculation
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
def forward(self, x, seq_len=None):
# x: [bs, num_attention_heads, seq_len, head_size]
if seq_len > self.max_seq_len_cached:
self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
return (
self.cos_cached[:seq_len].to(dtype=x.dtype),
self.sin_cached[:seq_len].to(dtype=x.dtype),
)
class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding):
"""LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
self.scaling_factor = scaling_factor
super().__init__(dim, max_position_embeddings, base, device)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
t = t / self.scaling_factor
freqs = torch.einsum("i,j->ij", t, self.inv_freq)
# Different from paper, but it uses a different permutation in order to obtain the same calculation
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding):
"""LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
self.scaling_factor = scaling_factor
super().__init__(dim, max_position_embeddings, base, device)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
if seq_len > self.max_position_embeddings:
base = self.base * ((self.scaling_factor * seq_len / self.max_position_embeddings) -
(self.scaling_factor - 1))**(self.dim / (self.dim - 2))
inv_freq = 1.0 / (base**(torch.arange(0, self.dim, 2).float().to(device) / self.dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
freqs = torch.einsum("i,j->ij", t, self.inv_freq)
# Different from paper, but it uses a different permutation in order to obtain the same calculation
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
def rotate_half(x):
"""Rotates half the hidden dims of the input."""
x1 = x[..., :x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2:]
return torch.cat((-x2, x1), dim=-1)
def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
return q_embed, k_embed
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
"""
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
"""
batch, num_key_value_heads, slen, head_dim = hidden_states.shape
if n_rep == 1:
return hidden_states
hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
class ParallelLlamaAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig):
super().__init__()
self.config = config
self.megatron_config = megatron_config
self.hidden_size = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.hidden_size // self.num_heads
self.num_key_value_heads = config.num_key_value_heads
self.num_key_value_groups = self.num_heads // self.num_key_value_heads
self.max_position_embeddings = config.max_position_embeddings
self.rope_theta = config.rope_theta
# assign values after tp
tp_size = mpu.get_tensor_model_parallel_world_size()
assert self.num_heads % tp_size == 0, f'num_head must be divisible by tp_size. Got num_head={self.num_heads}, tp_size={tp_size}'
assert self.num_key_value_heads % tp_size == 0, \
f'num_key_value_heads must be divisible by tp_size. Got num_key_value_heads={self.num_key_value_heads}, tp_size={tp_size}'
self.num_heads_per_tp = self.num_heads // tp_size
self.num_key_value_heads_per_tp = self.num_key_value_heads // tp_size
self.hidden_size_per_tp = self.hidden_size // tp_size
if (self.head_dim * self.num_heads) != self.hidden_size:
raise ValueError(f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
f" and `num_heads`: {self.num_heads}).")
column_kwargs = tp_utils.get_default_kwargs_for_column_parallel_linear()
row_kwargs = tp_utils.get_default_kwargs_for_row_parallel_linear()
if megatron_config is not None:
assert column_kwargs.get('config', False), 'must have ModelParallelConfig'
assert row_kwargs.get('config', False), 'must have ModelParallelConfig'
tp_utils.update_kwargs_with_config(column_kwargs, megatron_config)
tp_utils.update_kwargs_with_config(row_kwargs, megatron_config)
# [self.q_size, self.k_size, self.v_size]
self.qkv_proj = QKVParallelLinear(input_size=self.hidden_size,
num_heads=self.num_heads,
num_key_value_heads=self.num_key_value_heads,
head_dim=self.head_dim,
bias=config.attention_bias,
gather_output=False,
skip_bias_add=False,
**column_kwargs)
self.q_size = self.num_heads_per_tp * self.head_dim
self.k_size = self.num_key_value_heads_per_tp * self.head_dim
self.v_size = self.num_key_value_heads_per_tp * self.head_dim
self.o_proj = tensor_parallel.RowParallelLinear(input_size=self.num_heads * self.head_dim,
output_size=self.hidden_size,
bias=config.attention_bias,
input_is_parallel=True,
skip_bias_add=False,
**row_kwargs)
self._init_rope()
def _init_rope(self):
if self.config.rope_scaling is None:
self.rotary_emb = LlamaRotaryEmbedding(
self.head_dim,
max_position_embeddings=self.max_position_embeddings,
base=self.rope_theta,
)
else:
scaling_type = self.config.rope_scaling["type"]
scaling_factor = self.config.rope_scaling["factor"]
if scaling_type == "linear":
self.rotary_emb = LlamaLinearScalingRotaryEmbedding(
self.head_dim,
max_position_embeddings=self.max_position_embeddings,
scaling_factor=scaling_factor,
base=self.rope_theta,
)
elif scaling_type == "dynamic":
self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding(
self.head_dim,
max_position_embeddings=self.max_position_embeddings,
scaling_factor=scaling_factor,
base=self.rope_theta,
)
else:
raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
bsz, q_len, _ = hidden_states.size()
qkv = self.qkv_proj(hidden_states)[0]
query_states, key_states, value_states = qkv.split([self.q_size, self.k_size, self.v_size], dim=-1)
query_states = query_states.view(bsz, q_len, self.num_heads_per_tp, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads_per_tp, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads_per_tp, self.head_dim).transpose(1, 2)
kv_seq_len = key_states.shape[-2]
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
if attn_weights.size() != (bsz, self.num_heads_per_tp, q_len, kv_seq_len):
raise ValueError(
f"Attention weights should be of size {(bsz, self.num_heads_per_tp, q_len, kv_seq_len)}, but is"
f" {attn_weights.size()}")
if attention_mask is not None:
if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
raise ValueError(
f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}")
attn_weights = attn_weights + attention_mask
# upcast attention to fp32
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
attn_output = torch.matmul(attn_weights, value_states)
if attn_output.size() != (bsz, self.num_heads_per_tp, q_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(bsz, self.num_heads_per_tp, q_len, self.head_dim)}, but is"
f" {attn_output.size()}")
attn_output = attn_output.transpose(1, 2).contiguous()
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size_per_tp)
attn_output = self.o_proj(attn_output)[0]
return attn_output
"""
Remove padding Attention
- Using Flash-attn 2
- Compatible with sequence parallel
"""
from transformers.utils import is_flash_attn_2_available
import torch.nn.functional as F
from einops import rearrange
if is_flash_attn_2_available():
from flash_attn import flash_attn_varlen_func
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
def apply_rotary_pos_emb_rmpad(q, k, cos, sin, position_ids, indices, sequence_length):
batch_size = position_ids.shape[0]
q = pad_input(q, indices, batch_size, sequence_length) # (batch_size, seqlen, num_head, head_dim)
k = pad_input(k, indices, batch_size, sequence_length)
cos = cos[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim]
sin = sin[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim]
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
q_embed = index_first_axis(rearrange(q_embed, "b s ... -> (b s) ..."), indices)
k_embed = index_first_axis(rearrange(k_embed, "b s ... -> (b s) ..."), indices)
return q_embed, k_embed
from flash_attn.layers.rotary import apply_rotary_emb
# use flash-attn rotary embeddings with rmpad
# cos/sin shoudl be: (seq_length, rotary_dim / 2)
def apply_rotary_pos_emb_rmpad_flash(q, k, cos, sin, cu_seqlens, max_seqlen):
q_embed = apply_rotary_emb(q,
cos,
sin,
interleaved=False,
inplace=False,
cu_seqlens=cu_seqlens,
max_seqlen=max_seqlen)
k_embed = apply_rotary_emb(k,
cos,
sin,
interleaved=False,
inplace=False,
cu_seqlens=cu_seqlens,
max_seqlen=max_seqlen)
return q_embed, k_embed
class ParallelLlamaAttentionRmPad(ParallelLlamaAttention):
def forward(self,
hidden_states: torch.Tensor,
position_ids: Optional[torch.LongTensor] = None,
sequence_length: int = None,
indices: torch.Tensor = None,
cu_seqlens: torch.Tensor = None,
max_seqlen_in_batch: int = None):
total_nnz, _, _ = hidden_states.size() # This is the total_nnz padded after sequence parallel
if self.megatron_config.sequence_parallel:
total_nnz = total_nnz * mpu.get_tensor_model_parallel_world_size()
qkv = self.qkv_proj(hidden_states)[0]
query_states, key_states, value_states = qkv.split([self.q_size, self.k_size, self.v_size],
dim=-1) # (total_nnz, 1, hidden_size)
if self.megatron_config.sequence_parallel:
sequence_parallel_pad = total_nnz - cu_seqlens[-1]
total_nnz = cu_seqlens[-1] # total_nnz before sp padding
query_states = query_states[:total_nnz]
key_states = key_states[:total_nnz]
value_states = value_states[:total_nnz]
# Flash attention requires the input to have the shape
# batch_size x seq_length x head_dime x hidden_dim
# therefore we just need to keep the original shape
query_states = query_states.view(total_nnz, self.num_heads_per_tp, self.head_dim)
key_states = key_states.view(total_nnz, self.num_key_value_heads_per_tp, self.head_dim)
value_states = value_states.view(total_nnz, self.num_key_value_heads_per_tp, self.head_dim)
cos, sin = self.rotary_emb(value_states, seq_len=sequence_length)
cos, sin = cos[:, :cos.shape[1] // 2], sin[:, :sin.shape[1] // 2] # flash attn only needs half
query_states, key_states = apply_rotary_pos_emb_rmpad_flash(query_states,
key_states,
cos,
sin,
cu_seqlens=cu_seqlens,
max_seqlen=max_seqlen_in_batch)
# query_states, key_states = apply_rotary_pos_emb_rmpad(query_states, key_states, cos, sin, position_ids, indices,
# TODO: llama does not have dropout in the config??
# It is recommended to use dropout with FA according to the docs
# when training.
dropout_rate = 0.0 # if not self.training else self.attn_dropout
# In PEFT, usually we cast the layer norms in float32 for training stability reasons
# therefore the input hidden states gets silently casted in float32. Hence, we need
# cast them back in float16 just to be sure everything works as expected.
# This might slowdown training & inference so it is recommended to not cast the LayerNorms
# in fp32. (LlamaRMSNorm handles it correctly)
input_dtype = query_states.dtype
if input_dtype == torch.float32:
query_states = query_states.to(torch.float16)
key_states = key_states.to(torch.float16)
value_states = value_states.to(torch.float16)
attn_output_unpad = flash_attn_varlen_func(
query_states,
key_states,
value_states,
cu_seqlens_q=cu_seqlens,
cu_seqlens_k=cu_seqlens,
max_seqlen_q=max_seqlen_in_batch,
max_seqlen_k=max_seqlen_in_batch,
dropout_p=dropout_rate,
softmax_scale=None,
causal=True,
)
attn_output_unpad = attn_output_unpad.to(input_dtype)
attn_output_unpad = attn_output_unpad.reshape(total_nnz, 1, self.hidden_size_per_tp).contiguous()
# sequence parallel reduce_scatter is performed inside RowColumnParallel if enabled
# Here we need to repad
if self.megatron_config.sequence_parallel:
attn_output_unpad = F.pad(attn_output_unpad, pad=(0, 0, 0, 0, 0, sequence_parallel_pad))
attn_output_unpad = self.o_proj(attn_output_unpad)[0]
return attn_output_unpad

View File

@ -0,0 +1,146 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional, Tuple
import torch
from torch import nn
from transformers import LlamaConfig
from megatron.core import ModelParallelConfig
from .parallel_attention import ParallelLlamaAttention, ParallelLlamaAttentionRmPad
from .parallel_mlp import ParallelLlamaMLP
from .parallel_rmsnorm import ParallelLlamaRMSNorm
class ParallelLlamaDecoderLayer(nn.Module):
def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig):
super().__init__()
self.hidden_size = config.hidden_size
self.self_attn = ParallelLlamaAttention(config=config, megatron_config=megatron_config)
self.mlp = ParallelLlamaMLP(config, megatron_config=megatron_config)
self.input_layernorm = ParallelLlamaRMSNorm(config, megatron_config)
self.post_attention_layernorm = ParallelLlamaRMSNorm(config, megatron_config)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
(see `past_key_values`).
past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
"""
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
# Note: sequence parallel is hidden inside ColumnParallelLinear
# reduce scatter is hidden inside RowParallelLinear
# Self Attention
hidden_states = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
)
# TODO: add sequence parallel operator reduce_scatter here
hidden_states = residual + hidden_states
# Fully Connected
residual = hidden_states
hidden_states = self.post_attention_layernorm(hidden_states)
# TODO: add sequence parallel operator all_gather here
hidden_states = self.mlp(hidden_states)
# TODO: add sequence parallel operator reduce_scatter here
hidden_states = residual + hidden_states
outputs = hidden_states
return outputs
class ParallelLlamaDecoderLayerRmPad(nn.Module):
def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig):
super().__init__()
self.config = config
self.megatron_config = megatron_config
self.hidden_size = config.hidden_size
self.self_attn = ParallelLlamaAttentionRmPad(config=config, megatron_config=megatron_config)
self.mlp = ParallelLlamaMLP(config, megatron_config=megatron_config)
self.input_layernorm = ParallelLlamaRMSNorm(config, megatron_config)
self.post_attention_layernorm = ParallelLlamaRMSNorm(config, megatron_config)
def forward(
self,
hidden_states: torch.Tensor,
position_ids: Optional[torch.LongTensor] = None,
sequence_length: int = None,
indices: torch.Tensor = None,
cu_seqlens: int = None,
max_seqlen_in_batch: int = None
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
residual = hidden_states # (total_nnz // sp, 1, hidden_size)
hidden_states = self.input_layernorm(hidden_states)
# Self Attention
# (total_nnz // sp, 1, hidden_size) -> all-gather (total_nnz, 1, hidden_size)
# -> col + row -> reduce-scatter -> (total_nnz // sp, 1, hidden_size)
hidden_states = self.self_attn(hidden_states=hidden_states,
position_ids=position_ids,
sequence_length=sequence_length,
indices=indices,
cu_seqlens=cu_seqlens,
max_seqlen_in_batch=max_seqlen_in_batch)
hidden_states = residual + hidden_states
# Fully Connected
# shape changes same as attn
residual = hidden_states
hidden_states = self.post_attention_layernorm(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
outputs = hidden_states
return outputs

View File

@ -0,0 +1,74 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/linear.py
from typing import Optional, Tuple
from megatron.core import tensor_parallel
class QKVParallelLinear(tensor_parallel.ColumnParallelLinear):
def __init__(self,
input_size,
num_heads,
num_key_value_heads,
head_dim,
*,
bias=True,
gather_output=True,
skip_bias_add=False,
**kwargs):
# Keep input parameters, and already restrict the head numbers
self.input_size = input_size
self.q_output_size = num_heads * head_dim
self.kv_output_size = num_key_value_heads * head_dim
self.head_dim = head_dim
self.gather_output = gather_output
self.skip_bias_add = skip_bias_add
input_size = self.input_size
output_size = (num_heads + 2 * num_key_value_heads) * self.head_dim
super().__init__(input_size=input_size,
output_size=output_size,
bias=bias,
gather_output=gather_output,
skip_bias_add=skip_bias_add,
**kwargs)
class MergedColumnParallelLinear(tensor_parallel.ColumnParallelLinear):
def __init__(self,
input_size,
gate_ouput_size,
up_output_size,
*,
bias=True,
gather_output=True,
skip_bias_add=False,
**kwargs):
# Keep input parameters, and already restrict the head numbers
self.input_size = input_size
self.output_size = gate_ouput_size + up_output_size
self.gather_output = gather_output
self.skip_bias_add = skip_bias_add
super().__init__(input_size=self.input_size,
output_size=self.output_size,
bias=bias,
gather_output=gather_output,
skip_bias_add=skip_bias_add,
**kwargs)

View File

@ -0,0 +1,74 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from megatron.core import parallel_state as mpu
from megatron.core import tensor_parallel
from megatron.core import ModelParallelConfig
from torch import nn
from transformers.activations import ACT2FN
from verl.models.llama.megatron.layers.parallel_linear import MergedColumnParallelLinear
from verl.utils.megatron import tensor_parallel as tp_utils
class ParallelLlamaMLP(nn.Module):
def __init__(self, config, megatron_config: ModelParallelConfig = None) -> None:
super().__init__()
self.config = config
self.hidden_size = config.hidden_size
self.intermediate_size = config.intermediate_size
# The weight is only [hidden_size, intermediate_size // model_parallel_world_size]
column_kwargs = tp_utils.get_default_kwargs_for_column_parallel_linear()
row_kwargs = tp_utils.get_default_kwargs_for_row_parallel_linear()
if megatron_config is not None:
assert column_kwargs.get('config', False), 'must have ModelParallelConfig'
assert row_kwargs.get('config', False), 'must have ModelParallelConfig'
tp_utils.update_kwargs_with_config(row_kwargs, megatron_config)
tp_utils.update_kwargs_with_config(column_kwargs, megatron_config)
tp_size = mpu.get_tensor_model_parallel_world_size()
self.gate_up_proj = MergedColumnParallelLinear(
input_size=self.hidden_size,
gate_ouput_size=self.intermediate_size,
up_output_size=self.intermediate_size,
bias=False,
gather_output=False,
skip_bias_add=False,
**column_kwargs,
)
self.gate_size = self.intermediate_size // tp_size
self.down_proj = tensor_parallel.RowParallelLinear(input_size=self.intermediate_size,
output_size=self.hidden_size,
bias=False,
input_is_parallel=True,
skip_bias_add=False,
**row_kwargs)
self.act_fn = ACT2FN[config.hidden_act]
def forward(self, x):
gate_up = self.gate_up_proj(x)[0]
gate, up = gate_up.split(self.gate_size, dim=-1)
return self.down_proj(self.act_fn(gate) * up)[0]

View File

@ -0,0 +1,46 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numbers
import torch
from megatron.core import ModelParallelConfig
from torch import nn
from transformers import LlamaConfig
from apex.normalization.fused_layer_norm import fused_rms_norm_affine
from verl.utils.megatron import sequence_parallel as sp_utils
class ParallelLlamaRMSNorm(nn.Module):
def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig):
"""
LlamaRMSNorm is equivalent to T5LayerNorm
"""
super().__init__()
if isinstance(config.hidden_size, numbers.Integral):
normalized_shape = (config.hidden_size,)
self.normalized_shape = torch.Size(normalized_shape)
self.weight = nn.Parameter(torch.ones(self.normalized_shape))
self.variance_epsilon = config.rms_norm_eps
if megatron_config.sequence_parallel:
sp_utils.mark_parameter_as_sequence_parallel(self.weight)
def forward(self, hidden_states):
return fused_rms_norm_affine(input=hidden_states,
weight=self.weight,
normalized_shape=self.normalized_shape,
eps=self.variance_epsilon,
memory_efficient=True)

View File

@ -0,0 +1,663 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch LLaMA model."""
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from megatron.core import tensor_parallel
from megatron.core import ModelParallelConfig
from torch import nn
from torch.nn import init
from transformers.modeling_outputs import BaseModelOutputWithPast
from transformers.models.llama.configuration_llama import LlamaConfig
from transformers.models.llama.modeling_llama import CausalLMOutputWithPast
from verl.utils.megatron import sequence_parallel as sp_utils
from verl.utils.megatron import tensor_parallel as tp_utils
from .layers import ParallelLlamaDecoderLayer, ParallelLlamaRMSNorm, ParallelLlamaDecoderLayerRmPad
"""
TODO:
1. Add weight initialization. Here we need to be careful on TP weight init.
2. Add sequence parallel
3. Load checkpoint from meta LLama pretrained checkpoint
"""
# Copied from transformers.models.bart.modeling_bart._make_causal_mask
def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device):
"""
Make causal mask used for bi-directional self-attention.
"""
bsz, tgt_len = input_ids_shape
mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
mask_cond = torch.arange(mask.size(-1), device=device)
mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
mask = mask.to(dtype)
return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len)
# Copied from transformers.models.bart.modeling_bart._expand_mask
def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
"""
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
"""
bsz, src_len = mask.size()
tgt_len = tgt_len if tgt_len is not None else src_len
expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
inverted_mask = 1.0 - expanded_mask
return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
class ParallelLlamaModel(nn.Module):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
Args:
config: LlamaConfig
"""
def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig):
super().__init__()
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
embedding_kwargs = tp_utils.get_default_kwargs_for_parallel_embedding()
if megatron_config is not None:
assert embedding_kwargs.get('config', False), 'must have ModelParallelConfig'
tp_utils.update_kwargs_with_config(embedding_kwargs, self.megatron_config)
self.embed_tokens = tensor_parallel.VocabParallelEmbedding(num_embeddings=config.vocab_size,
embedding_dim=config.hidden_size,
**embedding_kwargs)
self.layers = nn.ModuleList(
[ParallelLlamaDecoderLayer(config, megatron_config) for _ in range(config.num_hidden_layers)])
self.norm = ParallelLlamaRMSNorm(config, megatron_config)
# Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds):
# create causal mask
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
combined_attention_mask = None
if input_shape[-1] > 1:
combined_attention_mask = _make_causal_mask(
input_shape,
inputs_embeds.dtype,
device=inputs_embeds.device,
)
if attention_mask is not None:
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype,
tgt_len=input_shape[-1]).to(inputs_embeds.device)
combined_attention_mask = (expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask +
combined_attention_mask)
return combined_attention_mask
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
) -> Union[Tuple, BaseModelOutputWithPast]:
"""
Args:
input_ids: input ids. shape (batch_size, seq_length)
attention_mask: attention_mask. shape (batch_size, seq_length)
position_ids: position ids. shape (batch_size, seq_length)
Returns:
"""
batch_size, seq_length = input_ids.shape
inputs_embeds = self.embed_tokens(input_ids)
# embed positions
attention_mask = self._prepare_decoder_attention_mask(attention_mask, (batch_size, seq_length), inputs_embeds)
hidden_states = inputs_embeds
for idx, decoder_layer in enumerate(self.layers):
layer_outputs = decoder_layer(
hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
)
hidden_states = layer_outputs
hidden_states = self.norm(hidden_states)
return hidden_states
class ParallelLlamaForCausalLM(nn.Module):
def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig):
super().__init__()
self.model = ParallelLlamaModel(config, megatron_config=megatron_config)
self.vocab_size = config.vocab_size
column_kwargs = tp_utils.get_default_kwargs_for_column_parallel_linear()
if megatron_config is not None:
assert column_kwargs.get('config', False), 'must have ModelParallelConfig'
tp_utils.update_kwargs_with_config(column_kwargs, self.megatron_config)
self.lm_head = tensor_parallel.ColumnParallelLinear(input_size=config.hidden_size,
output_size=config.vocab_size,
bias=False,
gather_output=False,
skip_bias_add=False,
**column_kwargs)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
) -> Union[Tuple, CausalLMOutputWithPast]:
r"""
Args:
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
```"""
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
outputs = self.model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
)
hidden_states = outputs
logits = self.lm_head(hidden_states)[0]
logits = tensor_parallel.gather_from_tensor_model_parallel_region(logits)
logits = logits.float()
return CausalLMOutputWithPast(
loss=None,
logits=logits,
past_key_values=None,
hidden_states=None,
attentions=None,
)
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
class ParallelLlamaModelRmPad(nn.Module):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
Args:
config: LlamaConfig
"""
def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig):
super().__init__()
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
embedding_kwargs = tp_utils.get_default_kwargs_for_parallel_embedding()
self.megatron_config = megatron_config
if megatron_config is not None:
assert embedding_kwargs.get('config', False), 'must have ModelParallelConfig'
tp_utils.update_kwargs_with_config(embedding_kwargs, self.megatron_config)
self.embed_tokens = tensor_parallel.VocabParallelEmbedding(num_embeddings=config.vocab_size,
embedding_dim=config.hidden_size,
**embedding_kwargs)
self.layers = nn.ModuleList(
[ParallelLlamaDecoderLayerRmPad(config, megatron_config) for _ in range(config.num_hidden_layers)])
self.norm = ParallelLlamaRMSNorm(config, megatron_config)
def forward(self,
input_ids: torch.Tensor,
position_ids: Optional[torch.LongTensor] = None,
sequence_length: int = None,
indices: torch.Tensor = None,
cu_seqlens: int = None,
max_seqlen_in_batch: int = None) -> Union[Tuple, BaseModelOutputWithPast]:
"""
Args:
input_ids: input ids. shape (1, totol_nnz)
position_ids: position ids. shape (batch_size, seq_length)
Returns:
"""
inputs_embeds = self.embed_tokens(input_ids) # (1, total_nnz) -> (1, total_nnz, hidden_size)
# (1, total_nnz, hidden_size) -> (total_nnz, 1, hidden_size) -> (total_nnz // sp, 1, hidden_size)
inputs_embeds = inputs_embeds.transpose(0, 1)
if self.megatron_config.sequence_parallel:
inputs_embeds = tensor_parallel.scatter_to_sequence_parallel_region(inputs_embeds)
hidden_states = inputs_embeds
for idx, decoder_layer in enumerate(self.layers):
layer_outputs = decoder_layer(hidden_states,
position_ids=position_ids,
sequence_length=sequence_length,
indices=indices,
cu_seqlens=cu_seqlens,
max_seqlen_in_batch=max_seqlen_in_batch)
hidden_states = layer_outputs
hidden_states = self.norm(hidden_states)
return hidden_states
class ParallelLlamaForCausalLMRmPad(nn.Module):
def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig):
super().__init__()
self.config = config
self.megatron_config = megatron_config
self.model = ParallelLlamaModelRmPad(config, megatron_config=megatron_config)
self.vocab_size = config.vocab_size
self._init_head()
def _init_head(self):
column_kwargs = tp_utils.get_default_kwargs_for_column_parallel_linear()
if self.megatron_config is not None:
assert column_kwargs.get('config', False), 'must have ModelParallelConfig'
tp_utils.update_kwargs_with_config(column_kwargs, self.megatron_config)
self.lm_head = tensor_parallel.ColumnParallelLinear(input_size=self.config.hidden_size,
output_size=self.config.vocab_size,
bias=False,
gather_output=False,
skip_bias_add=False,
**column_kwargs)
def _forward_head(self, hidden_states):
# all_gather from sequence parallel region is performed inside lm_head
logits = self.lm_head(hidden_states)[0]
logits = logits.float() # (total_nnz_padded, 1, vocab_size // tp)
logits = tensor_parallel.gather_from_tensor_model_parallel_region(logits) # (total_nnz_padded, 1, vocab_size)
return logits
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
) -> Union[Tuple, CausalLMOutputWithPast]:
r"""
Args:
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
```"""
batch_size, sequence_length = input_ids.shape
# remove padding here
input_ids, indices, cu_seqlens, max_seqlen_in_batch = unpad_input(input_ids.unsqueeze(dim=-1),
attention_mask) # (total_nnz, 1)
# pad input_ids to multiple of tp for all tp ranks
# TODO: for better performance, the sp padding should be removed at each layer. Not sure the performance gap
if self.megatron_config.sequence_parallel:
input_ids = sp_utils.pad_to_sequence_parallel(input_ids)
input_ids = input_ids.transpose(0, 1) # (1, total_nnz+pad)
outputs = self.model(input_ids=input_ids,
position_ids=position_ids,
sequence_length=sequence_length,
indices=indices,
cu_seqlens=cu_seqlens,
max_seqlen_in_batch=max_seqlen_in_batch)
hidden_states = outputs
logits = self._forward_head(hidden_states)
# remove padding from sequence parallel
if self.megatron_config.sequence_parallel:
totol_nnz = cu_seqlens[-1]
logits = logits[:totol_nnz] # (total_nnz_padded)
logits = torch.squeeze(logits, dim=1) # remove the artificial batch dimension
# add removed padding back
logits = pad_input(logits, indices, batch_size,
seqlen=sequence_length) # (batch_size, sequence_length, vocab_size)
return CausalLMOutputWithPast(
loss=None,
logits=logits,
past_key_values=None,
hidden_states=None,
attentions=None,
)
class ParallelLlamaForValueRmPad(ParallelLlamaForCausalLMRmPad):
def _init_head(self):
column_kwargs = tp_utils.get_default_kwargs_for_column_parallel_linear()
if self.megatron_config is not None:
assert column_kwargs.get('config', False), 'must have ModelParallelConfig'
tp_utils.update_kwargs_with_config(column_kwargs, self.megatron_config)
self.lm_head = nn.Linear(in_features=self.config.hidden_size, out_features=1, bias=False)
# lm_head is effectively the same as sequence parallel
sp_utils.mark_parameter_as_sequence_parallel(self.lm_head.weight)
def _forward_head(self, hidden_states):
logits = self.lm_head(hidden_states) # (total_nnz_padded // tp, 1, 1)
logits = logits.float()
if self.megatron_config.sequence_parallel:
logits = tensor_parallel.gather_from_sequence_parallel_region(logits, tensor_parallel_output_grad=False)
return logits
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
) -> Union[Tuple, CausalLMOutputWithPast]:
output = super().forward(input_ids, attention_mask, position_ids)
output.logits = torch.squeeze(output.logits, dim=-1)
return output
"""
Support pipeline parallelism
"""
class ParallelLlamaModelRmPadPP(nn.Module):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
This model definition supports pipeline parallelism. To support pp and vpp,
- This model only contains layer in this pp stage and vpp chunk
- When calling get_model in Megatron, this rank will instantiate all the vpp chunks in this pp.
Args:
config: LlamaConfig
"""
def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig, pre_process, post_process):
super().__init__()
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
self.pre_process = pre_process
self.post_process = post_process
self.megatron_config = megatron_config
embedding_kwargs = tp_utils.get_default_kwargs_for_parallel_embedding()
if megatron_config is not None:
assert embedding_kwargs.get('config', False), 'must have ModelParallelConfig'
tp_utils.update_kwargs_with_config(embedding_kwargs, self.megatron_config)
if pre_process:
self.embed_tokens = tensor_parallel.VocabParallelEmbedding(num_embeddings=config.vocab_size,
embedding_dim=config.hidden_size,
**embedding_kwargs)
else:
self.embed_tokens = None
# pp_rank = megatron_config.pipeline_model_parallel_rank
pp_size = megatron_config.pipeline_model_parallel_size
self.num_layer_per_pp = config.num_hidden_layers // pp_size
vpp_size = megatron_config.virtual_pipeline_model_parallel_size
if vpp_size is not None:
self.num_layer_vpp_chunk = self.num_layer_per_pp // vpp_size
self.num_layer_this_model = self.num_layer_vpp_chunk
# vpp_rank = megatron_config.virtual_pipeline_model_parallel_rank
# self.offset = vpp_rank * (
# config.num_hidden_layers // megatron_config.virtual_pipeline_model_parallel_size) + \
# (megatron_config.pipeline_model_parallel_rank * self.num_layer_vpp_chunk)
else:
self.num_layer_this_model = self.num_layer_per_pp
# self.offset = pp_rank * self.num_layer_per_pp
layers = []
for i in range(self.num_layer_this_model):
layer = ParallelLlamaDecoderLayerRmPad(config, megatron_config)
# setattr(layer, 'hidden_layer_index', self.offset + i)
layers.append(layer)
self.layers = nn.ModuleList(layers)
if post_process:
self.norm = ParallelLlamaRMSNorm(config, megatron_config)
else:
self.norm = None
def set_input_tensor(self, input_tensor):
"""Set input tensor to be used instead of forward()'s input.
When doing pipeline parallelism the input from the previous
stage comes from communication, not from the input, so the
model's forward_step_func won't have it. This function is thus
used by internal code to bypass the input provided by the
forward_step_func"""
self.input_tensor = input_tensor
def forward(self,
input_ids: torch.Tensor,
position_ids: Optional[torch.LongTensor] = None,
sequence_length: int = None,
indices: torch.Tensor = None,
cu_seqlens: int = None,
max_seqlen_in_batch: int = None) -> Union[Tuple, BaseModelOutputWithPast]:
"""
Args:
input_ids: input ids. shape (1, totol_nnz)
position_ids: position ids. shape (batch_size, seq_length)
Returns:
"""
if self.pre_process:
# if torch.cuda.current_device() == 0:
# print(f'rank {torch.cuda.current_device()}: input_ids shape before embedding: {input_ids.shape}')
inputs_embeds = self.embed_tokens(input_ids) # (1, total_nnz) -> (1, total_nnz, hidden_size)
# vocab parallel embedding will not do sequence parallel reduce-scatter in open source megatron
# so need to deal with it by handle here:
# (1, total_nnz, hidden_size) -> (total_nnz, 1, hidden_size) -> (total_nnz // sp, 1, hidden_size)
inputs_embeds = inputs_embeds.transpose(0, 1)
if self.megatron_config.sequence_parallel:
inputs_embeds = tensor_parallel.scatter_to_sequence_parallel_region(inputs_embeds)
# if torch.cuda.current_device() == 0:
# print(f'rank {torch.cuda.current_device()}: input_embeds shape after embedding: {inputs_embeds.shape}')
hidden_states = inputs_embeds
else:
# self.hidden_states should be passed by Megatron
hidden_states = self.input_tensor
for idx, decoder_layer in enumerate(self.layers):
layer_outputs = decoder_layer(hidden_states,
position_ids=position_ids,
sequence_length=sequence_length,
indices=indices,
cu_seqlens=cu_seqlens,
max_seqlen_in_batch=max_seqlen_in_batch)
hidden_states = layer_outputs
if self.post_process:
hidden_states = self.norm(hidden_states)
return hidden_states
class ParallelLlamaForCausalLMRmPadPP(nn.Module):
def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig, pre_process, post_process):
super().__init__()
self.config = config
self.megatron_config = megatron_config
self.model = ParallelLlamaModelRmPadPP(config,
megatron_config=megatron_config,
pre_process=pre_process,
post_process=post_process)
self.share_embeddings_and_output_weights = None # workaround, megatron requires this attr
self.vocab_size = config.vocab_size
self.pre_process = pre_process
self.post_process = post_process
if post_process:
self._init_head()
def set_input_tensor(self, input_tensor):
"""Set input tensor to be used instead of forward()'s input.
When doing pipeline parallelism the input from the previous
stage comes from communication, not from the input, so the
model's forward_step_func won't have it. This function is thus
used by internal code to bypass the input provided by the
forward_step_func"""
assert len(input_tensor) == 1
self.model.set_input_tensor(input_tensor[0])
def _init_head(self):
column_kwargs = tp_utils.get_default_kwargs_for_column_parallel_linear()
if self.megatron_config is not None:
assert column_kwargs.get('config', False), 'must have ModelParallelConfig'
tp_utils.update_kwargs_with_config(column_kwargs, self.megatron_config)
self.lm_head = tensor_parallel.ColumnParallelLinear(input_size=self.config.hidden_size,
output_size=self.config.vocab_size,
bias=False,
gather_output=False,
skip_bias_add=False,
**column_kwargs)
def _forward_head(self, hidden_states):
# all_gather from sequence parallel region is performed inside lm_head
# print(f'logits shape before forward_head: {hidden_states.shape}, vocab_size = {self.config.vocab_size}') # [4, 32, 4096]
logits = self.lm_head(hidden_states)[0]
# print(f'logits shape after forward_head: {logits.shape}') # [8, 32, 8]
logits = logits.float() # (total_nnz_padded, 1, vocab_size // tp)
return logits
def forward(
self,
# original input
*,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
) -> Union[Tuple, CausalLMOutputWithPast]:
r"""
Args:
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
```"""
# Note that input_ids, attention_mask and position_ids should be passed to every pp layer.
# In the first pp, input_ids will be used, in other pp layers hidden_states will be used inside self.model
batch_size, sequence_length = input_ids.shape
# remove padding here
input_ids_rmpad, indices, cu_seqlens, max_seqlen_in_batch = unpad_input(input_ids.unsqueeze(dim=-1),
attention_mask) # (total_nnz, 1)
# print(f'input_ids.shape = {input_ids.shape}, input_ids_rmpad.shape = {input_ids_rmpad.shape}, indices.shape = {indices.shape}, cu_seqlens[-1] = {cu_seqlens[-1]}')
# pad input_ids to multiple of tp for all tp ranks
# TODO: for better performance, the sp padding should be removed at each layer. Not sure the performance gap
if self.megatron_config.sequence_parallel:
input_ids_rmpad = sp_utils.pad_to_sequence_parallel(input_ids_rmpad)
input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # (1, total_nnz+pad)
outputs = self.model(input_ids=input_ids_rmpad,
position_ids=position_ids,
sequence_length=sequence_length,
indices=indices,
cu_seqlens=cu_seqlens,
max_seqlen_in_batch=max_seqlen_in_batch)
if self.post_process:
hidden_states = outputs
# print(f'hidden_states.shape = {hidden_states.shape}') # torch.Size([4, 32, 4096])
logits = self._forward_head(hidden_states)
# print(f'logits.shape = {logits.shape}')
logits = torch.squeeze(logits, dim=1) # remove the artificial batch dimension # torch.Size([8, 32, 16])
# remove padding from sequence parallel
if self.megatron_config.sequence_parallel:
totol_nnz = cu_seqlens[-1]
logits = logits[:totol_nnz] # (total_nnz_padded)
# add removed padding back. If input is already rmpad, we let the caller pad_input
# print(f'logits.shape = {logits.shape}, indices.shape = {indices.shape}, batch_size = {batch_size}, seq_len = {sequence_length}')
logits = pad_input(logits, indices, batch_size,
seqlen=sequence_length) # (batch_size, sequence_length, vocab_size)
return CausalLMOutputWithPast(
loss=None,
logits=logits,
past_key_values=None,
hidden_states=None,
attentions=None,
)
else:
return outputs
class ParallelLlamaForValueRmPadPP(ParallelLlamaForCausalLMRmPadPP):
def _init_head(self):
column_kwargs = tp_utils.get_default_kwargs_for_column_parallel_linear()
if self.megatron_config is not None:
assert column_kwargs.get('config', False), 'must have ModelParallelConfig'
tp_utils.update_kwargs_with_config(column_kwargs, self.megatron_config)
self.lm_head = nn.Linear(in_features=self.config.hidden_size, out_features=1, bias=False)
# lm_head is effectively the same as sequence parallel
sp_utils.mark_parameter_as_sequence_parallel(self.lm_head.weight)
def _forward_head(self, hidden_states):
logits = self.lm_head(hidden_states) # (total_nnz_padded // tp, 1, 1)
logits = logits.float()
if self.megatron_config.sequence_parallel:
logits = tensor_parallel.gather_from_sequence_parallel_region(logits, tensor_parallel_output_grad=False)
return logits
def forward(
self,
*,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
) -> Union[Tuple, CausalLMOutputWithPast]:
output = super().forward(input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids)
if self.post_process:
output.logits = torch.squeeze(output.logits, dim=-1)
return output
else:
return output

50
verl/models/registry.py Normal file
View File

@ -0,0 +1,50 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib
from typing import List, Optional, Type
import torch.nn as nn
# Architecture -> (module, class).
_MODELS = {
"LlamaForCausalLM":
("llama", ("ParallelLlamaForCausalLMRmPadPP", "ParallelLlamaForValueRmPadPP", "ParallelLlamaForCausalLMRmPad")),
"MistralForCausalLM": ("mistral", ("ParallelMistralForCausalLMRmPadPP", "ParallelMistralForValueRmPadPP",
"ParallelMistralForCausalLMRmPad"))
}
# return model class
class ModelRegistry:
@staticmethod
def load_model_cls(model_arch: str, value=False) -> Optional[Type[nn.Module]]:
if model_arch not in _MODELS:
return None
megatron = "megatron"
module_name, model_cls_name = _MODELS[model_arch]
if not value: # actor/ref
model_cls_name = model_cls_name[0]
elif value: # critic/rm
model_cls_name = model_cls_name[1]
module = importlib.import_module(f"verl.models.{module_name}.{megatron}.modeling_{module_name}_megatron")
return getattr(module, model_cls_name, None)
@staticmethod
def get_supported_archs() -> List[str]:
return list(_MODELS.keys())

View File

@ -0,0 +1,23 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
def get_weight_loader(arch: str):
from verl.models.llama.megatron.checkpoint_utils.llama_loader import load_state_dict_to_megatron_llama
_MODEL_WEIGHT_MEGATRON_LOADER_REGISTRY = {'LlamaForCausalLM': load_state_dict_to_megatron_llama}
if arch in _MODEL_WEIGHT_MEGATRON_LOADER_REGISTRY:
return _MODEL_WEIGHT_MEGATRON_LOADER_REGISTRY[arch]
raise ValueError(f"Model architectures {arch} are not supported for now. "
f"Supported architectures: {_MODEL_WEIGHT_MEGATRON_LOADER_REGISTRY.keys()}")

487
verl/protocol.py Normal file
View File

@ -0,0 +1,487 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Implement base data transfer protocol between any two functions, modules.
We can subclass Protocol to define more detailed batch info with specific keys
"""
import numpy as np
import copy
from dataclasses import dataclass, field
from typing import Callable, Dict, List, Union
import torch
import tensordict
from tensordict import TensorDict
from torch.utils.data import DataLoader, Dataset
from verl.utils.py_functional import union_two_dict
__all__ = ['DataProto', 'union_tensor_dict']
try:
tensordict.set_lazy_legacy(False).set()
except:
pass
def union_tensor_dict(tensor_dict1: TensorDict, tensor_dict2: TensorDict) -> TensorDict:
"""Union two tensordicts."""
assert tensor_dict1.batch_size == tensor_dict2.batch_size, \
f'Two tensor dict must have identical batch size. Got {tensor_dict1.batch_size} and {tensor_dict2.batch_size}'
for key in tensor_dict2.keys():
if key not in tensor_dict1.keys():
tensor_dict1[key] = tensor_dict2[key]
else:
assert tensor_dict1[key].equal(tensor_dict2[key]), \
f'{key} in tensor_dict1 and tensor_dict2 are not the same object'
return tensor_dict1
def union_numpy_dict(tensor_dict1: dict[np.ndarray], tensor_dict2: dict[np.ndarray]) -> dict[np.ndarray]:
for key, val in tensor_dict2.items():
if key in tensor_dict1:
assert isinstance(tensor_dict2[key], np.ndarray)
assert isinstance(tensor_dict1[key], np.ndarray)
assert np.all(tensor_dict2[key] == tensor_dict1[key]), \
f'{key} in tensor_dict1 and tensor_dict2 are not the same object'
tensor_dict1[key] = val
return tensor_dict1
def list_of_dict_to_dict_of_list(list_of_dict: list[dict]):
if len(list_of_dict) == 0:
return {}
keys = list_of_dict[0].keys()
output = {key: [] for key in keys}
for data in list_of_dict:
for key, item in data.items():
assert key in output
output[key].append(item)
return output
def collate_fn(x: list['DataProtoItem']):
batch = []
non_tensor_batch = []
for data in x:
batch.append(data.batch)
non_tensor_batch.append(data.non_tensor_batch)
batch = torch.stack(batch).contiguous()
non_tensor_batch = list_of_dict_to_dict_of_list(non_tensor_batch)
for key, val in non_tensor_batch.items():
non_tensor_batch[key] = np.array(val, dtype=object)
return DataProto(batch=batch, non_tensor_batch=non_tensor_batch)
@dataclass
class DataProtoItem:
# TODO(zhangchi.usc1992) add consistency check
batch: TensorDict = None
non_tensor_batch: Dict = field(default_factory=dict)
meta_info: Dict = field(default_factory=dict)
@dataclass
class DataProto:
"""
A DataProto is a data structure that aims to provide a standard protocol for data exchange between functions.
It contains a batch (TensorDict) and a meta_info (Dict). The batch is a TensorDict https://pytorch.org/tensordict/.
TensorDict allows you to manipulate a dictionary of Tensors like a single Tensor. Ideally, the tensors with the
same batch size should be put inside batch.
"""
batch: TensorDict = None
non_tensor_batch: Dict = field(default_factory=dict)
meta_info: Dict = field(default_factory=dict)
def __post_init__(self):
# perform necessary checking
self.check_consistency()
def __len__(self):
return self.batch.batch_size[0]
def __getitem__(self, item):
tensor_data = self.batch[item]
non_tensor_data = {key: val[item] for key, val in self.non_tensor_batch.items()}
return DataProtoItem(batch=tensor_data, non_tensor_batch=non_tensor_data, meta_info=self.meta_info)
def __getstate__(self):
import io
buffer = io.BytesIO()
if tensordict.__version__ >= '0.5.0' and self.batch is not None:
self.batch = self.batch.contiguous()
self.batch = self.batch.consolidate()
torch.save(self.batch, buffer)
return buffer, self.non_tensor_batch, self.meta_info
def __setstate__(self, data):
batch_deserialized, non_tensor_batch, meta_info = data
batch_deserialized.seek(0)
batch = torch.load(batch_deserialized,
weights_only=False,
map_location='cpu' if not torch.cuda.is_available() else None)
self.batch = batch
self.non_tensor_batch = non_tensor_batch
self.meta_info = meta_info
def check_consistency(self):
"""Check the consistency of the DataProto. Mainly for batch and non_tensor_batch
We expose this function as a public one so that user can call themselves directly
"""
if self.batch is not None:
assert len(self.batch.batch_size) == 1, 'only support num_batch_dims=1'
if len(self.non_tensor_batch) != 0:
# TODO: we can actually lift this restriction if needed
assert len(self.batch.batch_size) == 1, 'only support num_batch_dims=1 when non_tensor_batch is not empty.'
batch_size = self.batch.batch_size[0]
for key, val in self.non_tensor_batch.items():
assert isinstance(
val, np.ndarray
) and val.dtype == object, 'data in the non_tensor_batch must be a numpy.array with dtype=object'
assert val.shape[
0] == batch_size, f'key {key} length {len(val)} is not equal to batch size {batch_size}'
@classmethod
def from_single_dict(cls, data: Dict[str, Union[torch.Tensor, np.ndarray]], meta_info=None):
tensors = {}
non_tensors = {}
for key, val in data.items():
if isinstance(val, torch.Tensor):
tensors[key] = val
elif isinstance(val, np.ndarray):
non_tensors[key] = val
else:
raise ValueError(f'Unsupported type in data {type(val)}')
return DataProto.from_dict(tensors=tensors, non_tensors=non_tensors, meta_info=meta_info)
@classmethod
def from_dict(cls, tensors: Dict[str, torch.Tensor], non_tensors=None, meta_info=None, num_batch_dims=1):
"""Create a DataProto from a dict of tensors. This assumes that
1. All the tensor in tensors have the same dim0
2. Only dim0 is the batch dim
"""
assert len(tensors) > 0, 'tensors must not be empty'
assert num_batch_dims > 0, 'num_batch_dims must be greater than zero'
if non_tensors is not None:
assert num_batch_dims == 1, 'only support num_batch_dims=1 when non_tensors is not None.'
if meta_info is None:
meta_info = {}
if non_tensors is None:
non_tensors = {}
assert isinstance(non_tensors, dict)
# get and check batch size
batch_size = None
pivot_key = None
for key, tensor in tensors.items():
if batch_size is None:
batch_size = tensor.shape[:num_batch_dims]
pivot_key = key
else:
current_batch = tensor.shape[:num_batch_dims]
assert batch_size == current_batch, \
f'Not all the tensor in tensors have the same batch size with batch_dims={num_batch_dims}. Got {pivot_key} has {batch_size}, {key} has {current_batch}'
for key, val in non_tensors.items():
non_tensors[key] = np.array(val, dtype=object)
tensor_dict = TensorDict(source=tensors, batch_size=batch_size)
return cls(batch=tensor_dict, non_tensor_batch=non_tensors, meta_info=meta_info)
def to(self, device) -> 'DataProto':
"""move the batch to device
Args:
device (torch.device, str): torch device
Returns:
DataProto: the current DataProto
"""
if self.batch is not None:
self.batch = self.batch.to(device)
return self
def select(self, batch_keys=None, non_tensor_batch_keys=None, meta_info_keys=None, deepcopy=False) -> 'DataProto':
"""Select a subset of the DataProto via batch_keys and meta_info_keys
Args:
batch_keys (list, optional): a list of strings indicating the keys in batch to select
meta_info_keys (list, optional): a list of keys indicating the meta info to select
Returns:
DataProto: the DataProto with the selected batch_keys and meta_info_keys
"""
# TODO (zhangchi.usc1992) whether to copy
if batch_keys is not None:
batch_keys = tuple(batch_keys)
sub_batch = self.batch.select(*batch_keys)
else:
sub_batch = self.batch
if non_tensor_batch_keys is not None:
non_tensor_batch = {key: val for key, val in self.non_tensor_batch.items() if key in non_tensor_batch_keys}
else:
non_tensor_batch = self.non_tensor_batch
if deepcopy:
non_tensor_batch = copy.deepcopy(non_tensor_batch)
if meta_info_keys is not None:
sub_meta_info = {key: val for key, val in self.meta_info.items() if key in meta_info_keys}
else:
sub_meta_info = self.meta_info
if deepcopy:
sub_meta_info = copy.deepcopy(sub_meta_info)
return DataProto(batch=sub_batch, non_tensor_batch=non_tensor_batch, meta_info=sub_meta_info)
def pop(self, batch_keys=None, non_tensor_batch_keys=None, meta_info_keys=None) -> 'DataProto':
"""Pop a subset of the DataProto via `batch_keys` and `meta_info_keys`
Args:
batch_keys (list, optional): a list of strings indicating the keys in batch to pop
meta_info_keys (list, optional): a list of keys indicating the meta info to pop
Returns:
DataProto: the DataProto with the poped batch_keys and meta_info_keys
"""
assert batch_keys is not None
if meta_info_keys is None:
meta_info_keys = []
if non_tensor_batch_keys is None:
non_tensor_batch_keys = []
tensors = {}
# tensor batch
for key in batch_keys:
assert key in self.batch.keys()
tensors[key] = self.batch.pop(key)
non_tensors = {}
# non tensor batch
for key in non_tensor_batch_keys:
assert key in self.non_tensor_batch.keys()
non_tensors[key] = self.non_tensor_batch.pop(key)
meta_info = {}
for key in meta_info_keys:
assert key in self.meta_info.keys()
meta_info[key] = self.meta_info.pop(key)
return DataProto.from_dict(tensors=tensors, non_tensors=non_tensors, meta_info=meta_info)
def rename(self, old_keys=None, new_keys=None) -> 'DataProto':
"""
Note that this function only rename the key in the batch
"""
def validate_input(keys):
if keys is not None:
if isinstance(keys, str):
keys = [keys]
elif isinstance(keys, list):
pass
else:
raise TypeError(f'keys must be a list or a string, but got {type(keys)}')
return keys
old_keys = validate_input(old_keys)
new_keys = validate_input(new_keys)
if len(new_keys) != len(old_keys):
raise ValueError(
f'new_keys and old_keys must have the same length, but got {len(new_keys)} and {len(old_keys)}')
self.batch.rename_key_(tuple(old_keys), tuple(new_keys))
return self
def union(self, other: 'DataProto') -> 'DataProto':
"""Union with another DataProto. Union batch and meta_info separately.
Throw an error if
- there are conflict keys in batch and they are not equal
- the batch size of two data batch is not the same
- there are conflict keys in meta_info and they are not the same.
Args:
other (DataProto): another DataProto to union
Returns:
DataProto: the DataProto after union
"""
self.batch = union_tensor_dict(self.batch, other.batch)
self.non_tensor_batch = union_numpy_dict(self.non_tensor_batch, other.non_tensor_batch)
self.meta_info = union_two_dict(self.meta_info, other.meta_info)
return self
def make_iterator(self, mini_batch_size, epochs, seed=None, dataloader_kwargs=None):
"""Make an iterator from the DataProto. This is built upon that TensorDict can be used as a normal Pytorch
dataset. See https://pytorch.org/tensordict/tutorials/data_fashion for more details.
Args:
mini_batch_size (int): mini-batch size when iterating the dataset. We require that
``batch.batch_size[0] % mini_batch_size == 0``
epochs (int): number of epochs when iterating the dataset.
dataloader_kwargs: internally, it returns a DataLoader over the batch.
The dataloader_kwargs is the kwargs passed to the DataLoader
Returns:
Iterator: an iterator that yields a mini-batch data at a time. The total number of iteration steps is
``self.batch.batch_size * epochs // mini_batch_size``
"""
assert self.batch.batch_size[0] % mini_batch_size == 0, f"{self.batch.batch_size[0]} % {mini_batch_size} != 0"
# we can directly create a dataloader from TensorDict
if dataloader_kwargs is None:
dataloader_kwargs = {}
if seed is not None:
generator = torch.Generator()
generator.manual_seed(seed)
else:
generator = None
assert isinstance(dataloader_kwargs, Dict)
train_dataloader = DataLoader(dataset=self,
batch_size=mini_batch_size,
collate_fn=collate_fn,
generator=generator,
**dataloader_kwargs)
def get_data():
for _ in range(epochs):
for d in train_dataloader:
d.meta_info = self.meta_info
yield d
return iter(get_data())
def chunk(self, chunks: int) -> List['DataProto']:
"""Split the batch among dim=0 into chunks. The meta_info is passed to each DataProto after split.
Args:
chunks (int): the number of chunks to split on dim=0
Returns:
List[DataProto]: a list of DataProto after splitting
"""
if self.batch is not None:
batch_lst = self.batch.chunk(chunks=chunks, dim=0)
else:
batch_lst = [None for _ in range(chunks)]
non_tensor_batch_lst = [{} for _ in range(chunks)]
for key, val in self.non_tensor_batch.items():
assert isinstance(val, np.ndarray)
non_tensor_lst = np.array_split(val, chunks)
assert len(non_tensor_lst) == chunks
for i in range(chunks):
non_tensor_batch_lst[i][key] = non_tensor_lst[i]
output = []
for i in range(chunks):
output.append(
DataProto(batch=batch_lst[i], non_tensor_batch=non_tensor_batch_lst[i], meta_info=self.meta_info))
return output
@staticmethod
def concat(data: List['DataProto']) -> 'DataProto':
"""Concat a list of DataProto. The batch is concatenated among dim=0.
The meta_info is assumed to be identical and will use the first one.
Args:
data (List[DataProto]): list of DataProto
Returns:
DataProto: concatenated DataProto
"""
batch_lst = []
for batch in data:
batch_lst.append(batch.batch)
if batch_lst[0] is not None:
new_batch = torch.cat(batch_lst, dim=0)
else:
new_batch = None
non_tensor_batch = list_of_dict_to_dict_of_list(list_of_dict=[d.non_tensor_batch for d in data])
for key, val in non_tensor_batch.items():
non_tensor_batch[key] = np.concatenate(val, axis=0)
return DataProto(batch=new_batch, non_tensor_batch=non_tensor_batch, meta_info=data[0].meta_info)
def reorder(self, indices):
"""
Note that this operation is in-place
"""
indices_np = indices.detach().numpy()
self.batch = self.batch[indices]
self.non_tensor_batch = {key: val[indices_np] for key, val in self.non_tensor_batch.items()}
import ray
@dataclass
class DataProtoFuture:
"""
DataProtoFuture aims to eliminate actual data fetching on driver. By doing so, the driver doesn't have to wait
for data so that asynchronous execution becomes possible.
DataProtoFuture contains a list of futures from another WorkerGroup of size world_size.
- collect_fn is a Callable that reduces the list of futures to a DataProto
- dispatch_fn is a Callable that partitions the DataProto into a list of DataProto of size world_size and then select
Potential issue: we can optimize dispatch_fn(collect_fn) such that only needed data is fetched on destination
- DataProtoFuture only supports directly passing from the output of a method to another input. You can't perform any
operation on the DataProtoFuture in driver.
"""
collect_fn: Callable
futures: List[ray.ObjectRef]
dispatch_fn: Callable = None
@staticmethod
def concat(data: List[ray.ObjectRef]) -> 'DataProtoFuture':
output = DataProtoFuture(collect_fn=DataProto.concat, futures=data)
return output
def chunk(self, chunks: int) -> List['DataProtoFuture']:
from functools import partial
arg_future_lst = []
for i in range(chunks):
# note that we can't directly pass i and chunks
def dispatch_fn(x, i, chunks):
return x.chunk(chunks=chunks)[i]
arg_future = DataProtoFuture(collect_fn=self.collect_fn,
dispatch_fn=partial(dispatch_fn, i=i, chunks=chunks),
futures=self.futures)
arg_future_lst.append(arg_future)
return arg_future_lst
def get(self):
output = ray.get(self.futures) # dp_size.
for o in output:
assert isinstance(o, DataProto)
output = self.collect_fn(output) # select dp, concat
if self.dispatch_fn is not None:
output = self.dispatch_fn(output) # split in batch dim, select using dp
return output

13
verl/third_party/__init__.py vendored Normal file
View File

@ -0,0 +1,13 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

45
verl/third_party/vllm/__init__.py vendored Normal file
View File

@ -0,0 +1,45 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from importlib.metadata import version, PackageNotFoundError
def get_version(pkg):
try:
return version(pkg)
except PackageNotFoundError:
return None
package_name = 'vllm'
package_version = get_version(package_name)
if package_version == '0.3.1':
vllm_version = '0.3.1'
from .vllm_v_0_3_1.llm import LLM
from .vllm_v_0_3_1.llm import LLMEngine
from .vllm_v_0_3_1 import parallel_state
elif package_version == '0.4.2':
vllm_version = '0.4.2'
from .vllm_v_0_4_2.llm import LLM
from .vllm_v_0_4_2.llm import LLMEngine
from .vllm_v_0_4_2 import parallel_state
elif package_version == '0.5.4':
vllm_version = '0.5.4'
from .vllm_v_0_5_4.llm import LLM
from .vllm_v_0_5_4.llm import LLMEngine
from .vllm_v_0_5_4 import parallel_state
else:
raise ValueError(
f'vllm version {package_version} not supported. Currently supported versions are 0.3.1, 0.4.2, and 0.5.4.')

View File

@ -0,0 +1,13 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -0,0 +1,228 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/engine/arg_utils.py
import argparse
import dataclasses
from dataclasses import dataclass
from typing import Dict, Optional, Tuple
import torch.nn as nn
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, ParallelConfig, SchedulerConfig, LoRAConfig)
from transformers import PretrainedConfig
from .config import ModelConfig
@dataclass
class EngineArgs:
"""Arguments for vLLM engine."""
model_hf_config: PretrainedConfig = None
dtype: str = 'auto'
kv_cache_dtype: str = 'auto'
seed: int = 0
max_model_len: Optional[int] = None
worker_use_ray: bool = False
pipeline_parallel_size: int = 1
tensor_parallel_size: int = 1
max_parallel_loading_workers: Optional[int] = None
block_size: int = 16
swap_space: int = 4 # GiB
gpu_memory_utilization: float = 0.90
max_num_batched_tokens: Optional[int] = None
max_num_seqs: int = 256
max_paddings: int = 256
disable_log_stats: bool = False
revision: Optional[str] = None
tokenizer_revision: Optional[str] = None
quantization: Optional[str] = None
load_format: str = 'model'
enforce_eager: bool = False
max_context_len_to_capture: int = 8192
disable_custom_all_reduce: bool = False
enable_lora: bool = False
max_loras: int = 1
max_lora_rank: int = 16
lora_extra_vocab_size: int = 256
lora_dtype = 'auto'
max_cpu_loras: Optional[int] = None
device: str = 'cuda'
@staticmethod
def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
"""Shared CLI arguments for vLLM engine."""
# Model arguments
# TODO(shengguangming): delete the unused args
parser.add_argument('--model',
type=str,
default='facebook/opt-125m',
help='name or path of the huggingface model to use')
parser.add_argument('--tokenizer',
type=str,
default=EngineArgs.tokenizer,
help='name or path of the huggingface tokenizer to use')
parser.add_argument('--revision',
type=str,
default=None,
help='the specific model version to use. It can be a branch '
'name, a tag name, or a commit id. If unspecified, will use '
'the default version.')
parser.add_argument('--tokenizer-revision',
type=str,
default=None,
help='the specific tokenizer version to use. It can be a branch '
'name, a tag name, or a commit id. If unspecified, will use '
'the default version.')
parser.add_argument('--tokenizer-mode',
type=str,
default=EngineArgs.tokenizer_mode,
choices=['auto', 'slow'],
help='tokenizer mode. "auto" will use the fast '
'tokenizer if available, and "slow" will '
'always use the slow tokenizer.')
parser.add_argument('--trust-remote-code', action='store_true', help='trust remote code from huggingface')
parser.add_argument('--download-dir',
type=str,
default=EngineArgs.download_dir,
help='directory to download and load the weights, '
'default to the default cache dir of '
'huggingface')
parser.add_argument('--load-format',
type=str,
default=EngineArgs.load_format,
choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'],
help='The format of the model weights to load. '
'"auto" will try to load the weights in the safetensors format '
'and fall back to the pytorch bin format if safetensors format '
'is not available. '
'"pt" will load the weights in the pytorch bin format. '
'"safetensors" will load the weights in the safetensors format. '
'"npcache" will load the weights in pytorch format and store '
'a numpy cache to speed up the loading. '
'"dummy" will initialize the weights with random values, '
'which is mainly for profiling.')
parser.add_argument('--dtype',
type=str,
default=EngineArgs.dtype,
choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
help='data type for model weights and activations. '
'The "auto" option will use FP16 precision '
'for FP32 and FP16 models, and BF16 precision '
'for BF16 models.')
parser.add_argument('--max-model-len',
type=int,
default=None,
help='model context length. If unspecified, '
'will be automatically derived from the model.')
# Parallel arguments
parser.add_argument('--worker-use-ray',
action='store_true',
help='use Ray for distributed serving, will be '
'automatically set when using more than 1 GPU')
parser.add_argument('--pipeline-parallel-size',
'-pp',
type=int,
default=EngineArgs.pipeline_parallel_size,
help='number of pipeline stages')
parser.add_argument('--tensor-parallel-size',
'-tp',
type=int,
default=EngineArgs.tensor_parallel_size,
help='number of tensor parallel replicas')
# KV cache arguments
parser.add_argument('--block-size',
type=int,
default=EngineArgs.block_size,
choices=[8, 16, 32],
help='token block size')
# TODO(woosuk): Support fine-grained seeds (e.g., seed per request).
parser.add_argument('--seed', type=int, default=EngineArgs.seed, help='random seed')
parser.add_argument('--swap-space',
type=int,
default=EngineArgs.swap_space,
help='CPU swap space size (GiB) per GPU')
parser.add_argument('--gpu-memory-utilization',
type=float,
default=EngineArgs.gpu_memory_utilization,
help='the percentage of GPU memory to be used for'
'the model executor')
parser.add_argument('--max-num-batched-tokens',
type=int,
default=EngineArgs.max_num_batched_tokens,
help='maximum number of batched tokens per '
'iteration')
parser.add_argument('--max-num-seqs',
type=int,
default=EngineArgs.max_num_seqs,
help='maximum number of sequences per iteration')
parser.add_argument('--disable-log-stats', action='store_true', help='disable logging statistics')
# Quantization settings.
parser.add_argument('--quantization',
'-q',
type=str,
choices=['awq', None],
default=None,
help='Method used to quantize the weights')
return parser
@classmethod
def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
# Get the list of attributes of this dataclass.
attrs = [attr.name for attr in dataclasses.fields(cls)]
# Set the attributes from the parsed arguments.
engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
return engine_args
def create_engine_configs(
self,
) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig]:
device_config = DeviceConfig(self.device)
model_config = ModelConfig(self.model_hf_config, self.dtype, self.seed, self.load_format, self.revision,
self.tokenizer_revision, self.max_model_len, self.quantization, self.enforce_eager,
self.max_context_len_to_capture)
cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype,
model_config.get_sliding_window())
parallel_config = ParallelConfig(self.pipeline_parallel_size, self.tensor_parallel_size, self.worker_use_ray,
self.max_parallel_loading_workers, self.disable_custom_all_reduce)
scheduler_config = SchedulerConfig(self.max_num_batched_tokens, self.max_num_seqs, model_config.max_model_len,
self.max_paddings)
lora_config = LoRAConfig(max_lora_rank=self.max_lora_rank,
max_loras=self.max_loras,
lora_extra_vocab_size=self.lora_extra_vocab_size,
lora_dtype=self.lora_dtype,
max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras and self.max_cpu_loras > 0 else
None) if self.enable_lora else None
return (model_config, cache_config, parallel_config, scheduler_config, device_config, lora_config)
@dataclass
class AsyncEngineArgs(EngineArgs):
"""Arguments for asynchronous vLLM engine."""
engine_use_ray: bool = False
disable_log_requests: bool = False
max_log_len: Optional[int] = None
@staticmethod
def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
parser = EngineArgs.add_cli_args(parser)
parser.add_argument('--engine-use-ray',
action='store_true',
help='use Ray to start the LLM engine in a '
'separate process as the server process.')
parser.add_argument('--disable-log-requests', action='store_true', help='disable logging requests')
parser.add_argument('--max-log-len',
type=int,
default=None,
help='max number of prompt characters or prompt '
'ID numbers being printed in log. '
'Default: unlimited.')
return parser

View File

@ -0,0 +1,578 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py
from typing import Optional, Union, ClassVar
from dataclasses import dataclass
import torch
from transformers import PretrainedConfig
from packaging.version import Version
from vllm.logger import init_logger
from vllm.transformers_utils.config import get_config
from vllm.utils import get_cpu_memory, is_hip, get_nvcc_cuda_version
logger = init_logger(__name__)
_GB = 1 << 30
class ModelConfig:
"""Configuration for the model.
Args:
model: Name or path of the huggingface model to use.
tokenizer: Name or path of the huggingface tokenizer to use.
tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if
available, and "slow" will always use the slow tokenizer.
trust_remote_code: Trust remote code (e.g., from HuggingFace) when
downloading the model and tokenizer.
download_dir: Directory to download and load the weights, default to the
default cache directory of huggingface.
load_format: The format of the model weights to load:
"auto" will try to load the weights in the safetensors format and
fall back to the pytorch bin format if safetensors format is
not available.
"pt" will load the weights in the pytorch bin format.
"safetensors" will load the weights in the safetensors format.
"npcache" will load the weights in pytorch format and store
a numpy cache to speed up the loading.
"dummy" will initialize the weights with random values, which is
mainly for profiling.
dtype: Data type for model weights and activations. The "auto" option
will use FP16 precision for FP32 and FP16 models, and BF16 precision
for BF16 models.
seed: Random seed for reproducibility.
revision: The specific model version to use. It can be a branch name,
a tag name, or a commit id. If unspecified, will use the default
version.
tokenizer_revision: The specific tokenizer version to use. It can be a
branch name, a tag name, or a commit id. If unspecified, will use
the default version.
max_model_len: Maximum length of a sequence (including prompt and
output). If None, will be derived from the model.
quantization: Quantization method that was used to quantize the model
weights. If None, we assume the model weights are not quantized.
enforce_eager: Whether to enforce eager execution. If True, we will
disable CUDA graph and always execute the model in eager mode.
If False, we will use CUDA graph and eager execution in hybrid.
max_context_len_to_capture: Maximum context len covered by CUDA graphs.
When a sequence has context length larger than this, we fall back
to eager mode.
"""
def __init__(
self,
hf_config: PretrainedConfig,
dtype: str,
seed: int,
load_format: str = 'model',
revision: Optional[str] = None,
tokenizer_revision: Optional[str] = None,
max_model_len: Optional[int] = None,
quantization: Optional[str] = None,
trust_remote_code: Optional[bool] = True,
enforce_eager: bool = False,
max_context_len_to_capture: Optional[int] = None,
) -> None:
self.model = hf_config._name_or_path
self.tokenizer = hf_config._name_or_path
self.load_format = load_format
self.seed = seed
self.revision = revision
self.tokenizer_revision = tokenizer_revision
self.quantization = quantization
self.trust_remote_code = trust_remote_code
self.enforce_eager = enforce_eager
self.max_context_len_to_capture = max_context_len_to_capture
# self.hf_config = get_config(model, trust_remote_code, revision)
self.hf_config = hf_config
self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
self.max_model_len = _get_and_verify_max_len(self.hf_config, max_model_len)
# self._verify_load_format()
# self._verify_tokenizer_mode()
self._verify_quantization()
self._verify_cuda_graph()
def _verify_load_format(self) -> None:
load_format = self.load_format.lower()
if load_format not in ["auto", "pt", "safetensors", "npcache", "dummy", "model"]:
raise ValueError(f"Unknown load format: {self.load_format}. Must be one of "
"'auto', 'pt', 'safetensors', 'npcache', 'dummy' or 'model'.")
self.load_format = load_format
# def _verify_tokenizer_mode(self) -> None:
# tokenizer_mode = self.tokenizer_mode.lower()
# if tokenizer_mode not in ["auto", "slow"]:
# raise ValueError(
# f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be "
# "either 'auto' or 'slow'.")
# self.tokenizer_mode = tokenizer_mode
def _verify_quantization(self) -> None:
supported_quantization = ["awq", "gptq", "squeezellm"]
rocm_not_supported_quantization = ["awq", "gptq"]
if self.quantization is not None:
self.quantization = self.quantization.lower()
# Parse quantization method from the HF model config, if available.
hf_quant_config = getattr(self.hf_config, "quantization_config", None)
if hf_quant_config is not None:
hf_quant_method = str(hf_quant_config["quant_method"]).lower()
if self.quantization is None:
self.quantization = hf_quant_method
elif self.quantization != hf_quant_method:
raise ValueError("Quantization method specified in the model config "
f"({hf_quant_method}) does not match the quantization "
f"method specified in the `quantization` argument "
f"({self.quantization}).")
if self.quantization is not None:
if self.quantization not in supported_quantization:
raise ValueError(f"Unknown quantization method: {self.quantization}. Must "
f"be one of {supported_quantization}.")
if is_hip() and self.quantization in rocm_not_supported_quantization:
raise ValueError(f"{self.quantization} quantization is currently not supported "
f"in ROCm.")
logger.warning(f"{self.quantization} quantization is not fully "
"optimized yet. The speed can be slower than "
"non-quantized models.")
def _verify_cuda_graph(self) -> None:
if self.max_context_len_to_capture is None:
self.max_context_len_to_capture = self.max_model_len
self.max_context_len_to_capture = min(self.max_context_len_to_capture, self.max_model_len)
if (self.quantization in ["gptq", "squeezellm"] and not self.enforce_eager):
# Related issue: https://github.com/vllm-project/vllm/issues/2147
logger.warning(f"{self.quantization} does not support CUDA graph "
"yet. Disabling CUDA graph.")
self.enforce_eager = True
def verify_with_parallel_config(
self,
parallel_config: "ParallelConfig",
) -> None:
total_num_attention_heads = self.hf_config.num_attention_heads
tensor_parallel_size = parallel_config.tensor_parallel_size
if total_num_attention_heads % tensor_parallel_size != 0:
raise ValueError(f"Total number of attention heads ({total_num_attention_heads})"
" must be divisible by tensor parallel size "
f"({tensor_parallel_size}).")
total_num_hidden_layers = self.hf_config.num_hidden_layers
pipeline_parallel_size = parallel_config.pipeline_parallel_size
if total_num_hidden_layers % pipeline_parallel_size != 0:
raise ValueError(f"Total number of hidden layers ({total_num_hidden_layers}) "
"must be divisible by pipeline parallel size "
f"({pipeline_parallel_size}).")
def get_sliding_window(self) -> Optional[int]:
return getattr(self.hf_config, "sliding_window", None)
def get_vocab_size(self) -> int:
return self.hf_config.vocab_size
def get_hidden_size(self) -> int:
return self.hf_config.hidden_size
def get_head_size(self) -> int:
# FIXME(woosuk): This may not be true for all models.
return self.hf_config.hidden_size // self.hf_config.num_attention_heads
def get_total_num_kv_heads(self) -> int:
"""Returns the total number of KV heads."""
# For GPTBigCode & Falcon:
# NOTE: for falcon, when new_decoder_architecture is True, the
# multi_query flag is ignored and we use n_head_kv for the number of
# KV heads.
falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"]
new_decoder_arch_falcon = (self.hf_config.model_type in falcon_model_types and
getattr(self.hf_config, "new_decoder_architecture", False))
if not new_decoder_arch_falcon and getattr(self.hf_config, "multi_query", False):
# Multi-query attention, only one KV head.
# Currently, tensor parallelism is not supported in this case.
return 1
attributes = [
# For Falcon:
"n_head_kv",
"num_kv_heads",
# For LLaMA-2:
"num_key_value_heads",
# For ChatGLM:
"multi_query_group_num",
]
for attr in attributes:
num_kv_heads = getattr(self.hf_config, attr, None)
if num_kv_heads is not None:
return num_kv_heads
# For non-grouped-query attention models, the number of KV heads is
# equal to the number of attention heads.
return self.hf_config.num_attention_heads
def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int:
"""Returns the number of KV heads per GPU."""
total_num_kv_heads = self.get_total_num_kv_heads()
# If tensor parallelism is used, we divide the number of KV heads by
# the tensor parallel size. We will replicate the KV heads in the
# case where the number of KV heads is smaller than the tensor
# parallel size so each GPU has at least one KV head.
return max(1, total_num_kv_heads // parallel_config.tensor_parallel_size)
def get_num_layers(self, parallel_config: "ParallelConfig") -> int:
total_num_hidden_layers = self.hf_config.num_hidden_layers
return total_num_hidden_layers // parallel_config.pipeline_parallel_size
class CacheConfig:
"""Configuration for the KV cache.
Args:
block_size: Size of a cache block in number of tokens.
gpu_memory_utilization: Fraction of GPU memory to use for the
vLLM execution.
swap_space: Size of the CPU swap space per GPU (in GiB).
cache_dtype: Data type for kv cache storage.
"""
def __init__(
self,
block_size: int,
gpu_memory_utilization: float,
swap_space: int,
cache_dtype: str,
sliding_window: Optional[int] = None,
) -> None:
self.block_size = block_size
self.gpu_memory_utilization = gpu_memory_utilization
self.swap_space_bytes = swap_space * _GB
self.cache_dtype = cache_dtype
self.sliding_window = sliding_window
self._verify_args()
self._verify_cache_dtype()
# Will be set after profiling.
self.num_gpu_blocks = None
self.num_cpu_blocks = None
def _verify_args(self) -> None:
if self.gpu_memory_utilization > 1.0:
raise ValueError("GPU memory utilization must be less than 1.0. Got "
f"{self.gpu_memory_utilization}.")
def _verify_cache_dtype(self) -> None:
if self.cache_dtype == "auto":
pass
elif self.cache_dtype == "fp8_e5m2":
nvcc_cuda_version = get_nvcc_cuda_version()
if nvcc_cuda_version < Version("11.8"):
raise ValueError("FP8 is not supported when cuda version is lower than 11.8.")
device_name = torch.cuda.get_device_name()
if "AMD" in device_name:
raise NotImplementedError("FP8_E5M2 KV Cache on AMD GPU has not been supported yet.")
logger.info("Using fp8_e5m2 data type to store kv cache. It reduces "
"the GPU memory footprint and boosts the performance. "
"But it may cause slight accuracy drop. "
"Currently we only support fp8 without scaling factors and "
"make e5m2 as a default format.")
else:
raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}")
def verify_with_parallel_config(
self,
parallel_config: "ParallelConfig",
) -> None:
total_cpu_memory = get_cpu_memory()
# FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel
# group are in the same node. However, the GPUs may span multiple nodes.
num_gpus_per_node = parallel_config.tensor_parallel_size
cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node
msg = (f"{cpu_memory_usage / _GB:.2f} GiB out of "
f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is "
"allocated for the swap space.")
if cpu_memory_usage > 0.7 * total_cpu_memory:
raise ValueError("Too large swap space. " + msg)
elif cpu_memory_usage > 0.4 * total_cpu_memory:
logger.warning("Possibly too large swap space. " + msg)
class ParallelConfig:
"""Configuration for the distributed execution.
Args:
pipeline_parallel_size: Number of pipeline parallel groups.
tensor_parallel_size: Number of tensor parallel groups.
worker_use_ray: Whether to use Ray for model workers. Will be set to
True if either pipeline_parallel_size or tensor_parallel_size is
greater than 1.
max_parallel_loading_workers: Maximum number of multiple batches
when load model sequentially. To avoid RAM OOM when using tensor
parallel and large models.
disable_custom_all_reduce: Disable the custom all-reduce kernel and
fall back to NCCL.
"""
def __init__(
self,
pipeline_parallel_size: int,
tensor_parallel_size: int,
worker_use_ray: bool,
max_parallel_loading_workers: Optional[int] = None,
disable_custom_all_reduce: bool = False,
) -> None:
self.pipeline_parallel_size = pipeline_parallel_size
self.tensor_parallel_size = tensor_parallel_size
self.worker_use_ray = worker_use_ray
self.max_parallel_loading_workers = max_parallel_loading_workers
self.disable_custom_all_reduce = disable_custom_all_reduce
self.world_size = pipeline_parallel_size * tensor_parallel_size
if self.world_size > 1:
self.worker_use_ray = True
self._verify_args()
def _verify_args(self) -> None:
if self.pipeline_parallel_size > 1:
raise NotImplementedError("Pipeline parallelism is not supported yet.")
if not self.disable_custom_all_reduce and self.world_size > 1:
if is_hip():
self.disable_custom_all_reduce = True
logger.info("Disabled the custom all-reduce kernel because it is not "
"supported on AMD GPUs.")
elif self.pipeline_parallel_size > 1:
self.disable_custom_all_reduce = True
logger.info("Disabled the custom all-reduce kernel because it is not "
"supported with pipeline parallelism.")
# FIXME(woosuk): Fix the stability issues and re-enable the custom
# all-reduce kernel.
if not self.disable_custom_all_reduce and self.world_size > 1:
self.disable_custom_all_reduce = True
logger.info("Custom all-reduce kernels are temporarily disabled due to "
"stability issues. We will re-enable them once the issues are "
"resolved.")
class SchedulerConfig:
"""Scheduler configuration.
Args:
max_num_batched_tokens: Maximum number of tokens to be processed in
a single iteration.
max_num_seqs: Maximum number of sequences to be processed in a single
iteration.
max_model_len: Maximum length of a sequence (including prompt
and generated text).
max_paddings: Maximum number of paddings to be added to a batch.
"""
def __init__(
self,
max_num_batched_tokens: Optional[int],
max_num_seqs: int,
max_model_len: int,
max_paddings: int,
) -> None:
if max_num_batched_tokens is not None:
self.max_num_batched_tokens = max_num_batched_tokens
else:
# If max_model_len is too short, use 2048 as the default value for
# higher throughput.
self.max_num_batched_tokens = max(max_model_len, 2048)
self.max_num_seqs = max_num_seqs
self.max_model_len = max_model_len
self.max_paddings = max_paddings
self._verify_args()
def _verify_args(self) -> None:
if self.max_num_batched_tokens < self.max_model_len:
raise ValueError(f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
f"smaller than max_model_len ({self.max_model_len}). "
"This effectively limits the maximum sequence length to "
"max_num_batched_tokens and makes vLLM reject longer "
"sequences. Please increase max_num_batched_tokens or "
"decrease max_model_len.")
if self.max_num_batched_tokens < self.max_num_seqs:
raise ValueError(f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
"be greater than or equal to max_num_seqs "
f"({self.max_num_seqs}).")
class DeviceConfig:
def __init__(self, device: str = "cuda") -> None:
self.device = torch.device(device)
@dataclass
class LoRAConfig:
max_lora_rank: int
max_loras: int
max_cpu_loras: Optional[int] = None
lora_dtype: Optional[torch.dtype] = None
lora_extra_vocab_size: int = 256
# This is a constant.
lora_vocab_padding_size: ClassVar[int] = 256
def __post_init__(self):
# Keep this in sync with csrc/punica/bgmv/bgmv_config.h
possible_max_ranks = (8, 16, 32, 64)
possible_lora_extra_vocab_size = (0, 256, 512)
if self.max_lora_rank not in possible_max_ranks:
raise ValueError(f"max_lora_rank ({self.max_lora_rank}) must be one of "
f"{possible_max_ranks}.")
if self.lora_extra_vocab_size not in possible_lora_extra_vocab_size:
raise ValueError(f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) "
f"must be one of {possible_lora_extra_vocab_size}.")
if self.max_loras < 1:
raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.")
if self.max_cpu_loras is None:
self.max_cpu_loras = self.max_loras
elif self.max_cpu_loras < self.max_loras:
raise ValueError(f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
f"max_loras ({self.max_loras})")
def verify_with_model_config(self, model_config: ModelConfig):
if self.lora_dtype in (None, "auto"):
self.lora_dtype = model_config.dtype
elif isinstance(self.lora_dtype, str):
self.lora_dtype = getattr(torch, self.lora_dtype)
if model_config.quantization is not None:
raise ValueError("LoRA is not supported with quantized models yet.")
def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
if scheduler_config.max_num_batched_tokens > 65528:
raise ValueError("Due to limitations of the custom LoRA CUDA kernel, "
"max_num_batched_tokens must be <= 65528 when "
"LoRA is enabled.")
_STR_DTYPE_TO_TORCH_DTYPE = {
"half": torch.float16,
"float16": torch.float16,
"float": torch.float32,
"float32": torch.float32,
"bfloat16": torch.bfloat16,
}
_ROCM_NOT_SUPPORTED_DTYPE = ["float", "float32"]
def _get_and_verify_dtype(
config: PretrainedConfig,
dtype: Union[str, torch.dtype],
) -> torch.dtype:
# NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
# because config.torch_dtype can be None.
config_dtype = getattr(config, "torch_dtype", None)
if config_dtype is None:
config_dtype = torch.float32
if isinstance(dtype, str):
dtype = dtype.lower()
if dtype == "auto":
if config_dtype == torch.float32:
# Following the common practice, we use float16 for float32
# models.
torch_dtype = torch.float16
else:
torch_dtype = config_dtype
else:
if dtype not in _STR_DTYPE_TO_TORCH_DTYPE:
raise ValueError(f"Unknown dtype: {dtype}")
torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
elif isinstance(dtype, torch.dtype):
torch_dtype = dtype
else:
raise ValueError(f"Unknown dtype: {dtype}")
if is_hip() and torch_dtype == torch.float32:
rocm_supported_dtypes = [
k for k, v in _STR_DTYPE_TO_TORCH_DTYPE.items() if (k not in _ROCM_NOT_SUPPORTED_DTYPE)
]
raise ValueError(f"dtype \'{dtype}\' is not supported in ROCm. "
f"Supported dtypes are {rocm_supported_dtypes}")
# Verify the dtype.
if torch_dtype != config_dtype:
if torch_dtype == torch.float32:
# Upcasting to float32 is allowed.
pass
elif config_dtype == torch.float32:
# Downcasting from float32 to float16 or bfloat16 is allowed.
pass
else:
# Casting between float16 and bfloat16 is allowed with a warning.
logger.warning(f"Casting {config_dtype} to {torch_dtype}.")
return torch_dtype
def _get_and_verify_max_len(
hf_config: PretrainedConfig,
max_model_len: Optional[int],
) -> int:
"""Get and verify the model's maximum length."""
derived_max_model_len = float("inf")
possible_keys = [
# OPT
"max_position_embeddings",
# GPT-2
"n_positions",
# MPT
"max_seq_len",
# ChatGLM2
"seq_length",
# Others
"max_sequence_length",
"max_seq_length",
"seq_len",
]
for key in possible_keys:
max_len_key = getattr(hf_config, key, None)
if max_len_key is not None:
derived_max_model_len = min(derived_max_model_len, max_len_key)
if derived_max_model_len == float("inf"):
if max_model_len is not None:
# If max_model_len is specified, we use it.
return max_model_len
default_max_len = 2048
logger.warning("The model's config.json does not contain any of the following "
"keys to determine the original maximum length of the model: "
f"{possible_keys}. Assuming the model's maximum length is "
f"{default_max_len}.")
derived_max_model_len = default_max_len
rope_scaling = getattr(hf_config, "rope_scaling", None)
if rope_scaling is not None:
assert "factor" in rope_scaling
scaling_factor = rope_scaling["factor"]
if rope_scaling["type"] == "yarn":
derived_max_model_len = rope_scaling["original_max_position_embeddings"]
derived_max_model_len *= scaling_factor
if max_model_len is None:
max_model_len = derived_max_model_len
elif max_model_len > derived_max_model_len:
raise ValueError(f"User-specified max_model_len ({max_model_len}) is greater than "
f"the derived max_model_len ({max_len_key}={derived_max_model_len}"
" in model's config.json). This may lead to incorrect model "
"outputs or CUDA errors. Make sure the value is correct and "
"within the model context size.")
return int(max_model_len)

View File

@ -0,0 +1,275 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py
from typing import Dict, List, Optional, Tuple, Union
from tqdm import tqdm
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
from transformers import PretrainedConfig
import torch.nn as nn
from .arg_utils import EngineArgs
from .llm_engine_sp import LLMEngine
from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
from vllm.utils import Counter
import torch
from torch.nn.utils.rnn import pad_sequence
from verl.trainer.ppo.rollout.tokenizer import HybridEngineBaseTokenizer
class LLM:
"""An LLM for generating texts from given prompts and sampling parameters.
This class includes a tokenizer, a language model (possibly distributed
across multiple GPUs), and GPU memory space allocated for intermediate
states (aka KV cache). Given a batch of prompts and sampling parameters,
this class generates texts from the model, using an intelligent batching
mechanism and efficient memory management.
NOTE: This class is intended to be used for offline inference. For online
serving, use the `AsyncLLMEngine` class instead.
NOTE: For the comprehensive list of arguments, see `EngineArgs`.
Args:
model: A HuggingFace Transformers model instance.
tokenizer: A HuggingFace Transformers tokenizer instance.
tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer
if available, and "slow" will always use the slow tokenizer.
trust_remote_code: Trust remote code (e.g., from HuggingFace) when
downloading the model and tokenizer.
tensor_parallel_size: The number of GPUs to use for distributed
execution with tensor parallelism.
dtype: The data type for the model weights and activations. Currently,
we support `float32`, `float16`, and `bfloat16`. If `auto`, we use
the `torch_dtype` attribute specified in the model config file.
However, if the `torch_dtype` in the config is `float32`, we will
use `float16` instead.
quantization: The method used to quantize the model weights. Currently,
we support "awq". If None, we assume the model weights are not
quantized and use `dtype` to determine the data type of the weights.
revision: The specific model version to use. It can be a branch name,
a tag name, or a commit id.
tokenizer_revision: The specific tokenizer version to use. It can be a
branch name, a tag name, or a commit id.
seed: The seed to initialize the random number generator for sampling.
gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to
reserve for the model weights, activations, and KV cache. Higher
values will increase the KV cache size and thus improve the model's
throughput. However, if the value is too high, it may cause out-of-
memory (OOM) errors.
swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
This can be used for temporarily storing the states of the requests
when their `best_of` sampling parameters are larger than 1. If all
requests will have `best_of=1`, you can safely set this to 0.
Otherwise, too small values may cause out-of-memory (OOM) errors.
enforce_eager: Whether to enforce eager execution. If True, we will
disable CUDA graph and always execute the model in eager mode.
If False, we will use CUDA graph and eager execution in hybrid.
max_context_len_to_capture: Maximum context len covered by CUDA graphs.
When a sequence has context length larger than this, we fall back
to eager mode.
disable_custom_all_reduce: See ParallelConfig
"""
def __init__(
self,
model: Union[nn.Module, Dict], # model itself or its parameter dict
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast, HybridEngineBaseTokenizer],
model_hf_config: PretrainedConfig,
tokenizer_mode: str = "auto",
trust_remote_code: bool = False,
tensor_parallel_size: int = 1,
dtype: str = "auto",
quantization: Optional[str] = None,
revision: Optional[str] = None,
tokenizer_revision: Optional[str] = None,
seed: int = 0,
gpu_memory_utilization: float = 0.9,
swap_space: int = 4,
enforce_eager: bool = False,
max_context_len_to_capture: int = 8192,
disable_custom_all_reduce: bool = False,
**kwargs,
) -> None:
if "disable_log_stats" not in kwargs:
kwargs["disable_log_stats"] = True
engine_args = EngineArgs(
model_hf_config=model_hf_config,
tensor_parallel_size=tensor_parallel_size,
dtype=dtype,
quantization=quantization,
revision=revision,
tokenizer_revision=tokenizer_revision,
seed=seed,
gpu_memory_utilization=gpu_memory_utilization,
swap_space=swap_space,
enforce_eager=enforce_eager,
max_context_len_to_capture=max_context_len_to_capture,
disable_custom_all_reduce=disable_custom_all_reduce,
**kwargs,
)
tokenizer_cls = (PreTrainedTokenizer, PreTrainedTokenizerFast, HybridEngineBaseTokenizer)
if not isinstance(tokenizer, tokenizer_cls):
raise ValueError(
f"Unexpected tokenizer type: {type(tokenizer)}. Must be"
"one of the following: PreTrainedTokenizer, PreTrainedTokenizerFast, verl.trainer.ppo.rollout.HybridEngineBaseTokenizer"
)
self.llm_engine = LLMEngine.from_engine_args(model, tokenizer, engine_args)
self.request_counter = Counter()
def init_cache_engine(self):
self.llm_engine.init_cache_engine()
def free_cache_engine(self):
self.llm_engine.free_cache_engine()
def get_tokenizer(self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
return self.llm_engine.tokenizer
def set_tokenizer(
self,
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
) -> None:
self.llm_engine.tokenizer = tokenizer
def generate(
self,
prompts: Optional[Union[str, List[str]]] = None,
sampling_params: Optional[SamplingParams] = None,
prompt_token_ids: Optional[List[List[int]]] = None,
prefix_pos: Optional[Union[int, List[int]]] = None,
use_tqdm: bool = True,
lora_request: Optional[LoRARequest] = None,
) -> List[RequestOutput]:
"""Generates the completions for the input prompts.
NOTE: This class automatically batches the given prompts, considering
the memory constraint. For the best performance, put all of your prompts
into a single list and pass it to this method.
Args:
prompts: A list of prompts to generate completions for.
sampling_params: The sampling parameters for text generation. If
None, we use the default sampling parameters.
prompt_token_ids: A list of token IDs for the prompts. If None, we
use the tokenizer to convert the prompts to token IDs.
use_tqdm: Whether to use tqdm to display the progress bar.
Returns:
A list of `RequestOutput` objects containing the generated
completions in the same order as the input prompts.
"""
if prompts is None and prompt_token_ids is None:
raise ValueError("Either prompts or prompt_token_ids must be "
"provided.")
if isinstance(prompts, str):
# Convert a single prompt to a list.
prompts = [prompts]
if prompts is not None and prompt_token_ids is not None:
if len(prompts) != len(prompt_token_ids):
raise ValueError("The lengths of prompts and prompt_token_ids "
"must be the same.")
if sampling_params is None:
# Use default sampling params.
sampling_params = SamplingParams()
# Add requests to the engine.
num_requests = len(prompts) if prompts is not None else len(prompt_token_ids)
for i in range(num_requests):
prompt = prompts[i] if prompts is not None else None
prefix_pos_i = prefix_pos[i] if prefix_pos is not None else None
token_ids = None if prompt_token_ids is None else prompt_token_ids[i]
if not isinstance(token_ids, list):
# NOTE(shengguangming): convert the rollout input into List[str]
token_ids = self._pre_process_inputs(token_ids)
self._add_request(prompt, sampling_params, token_ids, lora_request=lora_request, prefix_pos=prefix_pos_i)
return self._run_engine(use_tqdm)
def _add_request(
self,
prompt: Optional[str],
sampling_params: SamplingParams,
prompt_token_ids: Optional[List[int]],
lora_request: Optional[LoRARequest] = None,
prefix_pos: Optional[int] = None,
) -> None:
request_id = str(next(self.request_counter))
self.llm_engine.add_request(request_id,
prompt,
sampling_params,
prompt_token_ids,
lora_request=lora_request,
prefix_pos=prefix_pos)
def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]:
# Initialize tqdm.
if use_tqdm:
num_requests = self.llm_engine.get_num_unfinished_requests()
pbar = tqdm(total=num_requests, desc="Processed prompts")
# Run the engine.
outputs: List[RequestOutput] = []
while self.llm_engine.has_unfinished_requests():
step_outputs = self.llm_engine.step()
for output in step_outputs:
if output.finished:
outputs.append(output)
if use_tqdm:
pbar.update(1)
if use_tqdm:
pbar.close()
# Sort the outputs by request ID.
# This is necessary because some requests may be finished earlier than
# its previous requests.
outputs = sorted(outputs, key=lambda x: int(x.request_id))
# TODO(shengguangming): maybe we can hack the autoregressive logics without only apply post process for better performance
return self._post_process_outputs(outputs)
# NOTE(shengguangming): add for verl
# TODO(sgm): we can optimize it by making the dataloader yield List[int] without padding.
def _pre_process_inputs(self, prompt_token_ids: torch.Tensor) -> List[int]:
# remove the left padding in the prompt token_id
pad_token_id = self.llm_engine.tokenizer.pad_token_id if self.llm_engine.tokenizer.pad_token_id is not None else self.llm_engine.tokenizer.eos_token_id
non_pad_index = torch.nonzero(prompt_token_ids != pad_token_id, as_tuple=False)[0][0]
token_ids = prompt_token_ids[non_pad_index:].tolist()
return token_ids
# NOTE(shengguangming): add for verl
def _post_process_outputs(self, outputs: List[RequestOutput]) -> Tuple[torch.Tensor, torch.Tensor]:
output_token_ids = []
logprobs = []
for output in outputs: # List[RequestOutput]
output = output.outputs
for output in output: # List[CompletionOutput], usually len == 1
output_token_ids.append(torch.tensor(output.token_ids))
# TODO(shengguangming): can be optimzied by rewrite the Sampler._get_logprobs() logits
logprobs_dicts = output.logprobs
if logprobs_dicts is not None:
logprob = []
for logprobs_dict, id in zip(logprobs_dicts, output.token_ids):
logprob.append(logprobs_dict[id])
logprobs.append(torch.tensor(logprob))
pad_token_id = self.llm_engine.tokenizer.pad_token_id if self.llm_engine.tokenizer.pad_token_id is not None else self.llm_engine.tokenizer.eos_token_id
output_token_ids = pad_sequence(output_token_ids, batch_first=True, padding_value=pad_token_id)
if len(logprobs) > 0:
logprobs = pad_sequence(logprobs, batch_first=True, padding_value=pad_token_id)
return output_token_ids, logprobs
def sync_model_weights(self, actor_weights: Dict[str, torch.Tensor]) -> None:
self.llm_engine.sync_model_weights(actor_weights=actor_weights)
def offload_model_weights(self) -> None:
self.llm_engine.offload_model_weights()

View File

@ -0,0 +1,765 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/engine/llm_engine.py
import os
import socket
import time
import torch
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union
from vllm.lora.request import LoRARequest
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, ParallelConfig, SchedulerConfig, LoRAConfig)
from vllm.core.scheduler import Scheduler, SchedulerOutputs
from vllm.logger import init_logger
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
from vllm.sequence import (SamplerOutput, Sequence, SequenceGroup, SequenceGroupMetadata, SequenceGroupOutput,
SequenceOutput, SequenceStatus)
from vllm.transformers_utils.tokenizer import detokenize_incrementally
from vllm.engine.metrics import StatLogger, Stats
from vllm.utils import Counter
import torch.nn as nn
from .arg_utils import EngineArgs
from .tokenizer import TokenizerGroup
logger = init_logger(__name__)
_LOCAL_LOGGING_INTERVAL_SEC = 5
class LLMEngine:
"""An LLM engine that receives requests and generates texts.
This is the main class for the vLLM engine. It receives requests
from clients and generates texts from the LLM. It includes a tokenizer, a
language model (possibly distributed across multiple GPUs), and GPU memory
space allocated for intermediate states (aka KV cache). This class utilizes
iteration-level scheduling and efficient memory management to maximize the
serving throughput.
The `LLM` class wraps this class for offline batched inference and the
`AsyncLLMEngine` class wraps this class for online serving.
NOTE: The config arguments are derived from the `EngineArgs` class. For the
comprehensive list of arguments, see `EngineArgs`.
Args:
model_config: The configuration related to the LLM model.
cache_config: The configuration related to the KV cache memory
management.
parallel_config: The configuration related to distributed execution.
scheduler_config: The configuration related to the request scheduler.
distributed_init_method: The initialization method for distributed
execution. See `torch.distributed.init_process_group` for details.
placement_group: Ray placement group for distributed execution.
Required for distributed execution.
log_stats: Whether to log statistics.
"""
def __init__(
self,
model: Union[nn.Module, Dict], # model itself or its parameter dict
tokenizer: nn.Module,
model_config: ModelConfig,
cache_config: CacheConfig,
parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
device_config: DeviceConfig,
lora_config: Optional[LoRAConfig],
distributed_init_method: str,
placement_group: Optional[None],
log_stats: bool,
) -> None:
logger.info("Initializing an LLM engine with config: "
f"model={model_config.model!r}, "
f"tokenizer={model_config.tokenizer!r}, "
# f"tokenizer_mode={model_config.tokenizer_mode}, "
f"revision={model_config.revision}, "
f"tokenizer_revision={model_config.tokenizer_revision}, "
# f"trust_remote_code={model_config.trust_remote_code}, "
f"dtype={model_config.dtype}, "
f"max_seq_len={model_config.max_model_len}, "
# f"download_dir={model_config.download_dir!r}, "
# f"load_format={model_config.load_format}, "
f"disable_custom_all_reduce={parallel_config.disable_custom_all_reduce}, "
f"tensor_parallel_size={parallel_config.tensor_parallel_size}, "
f"quantization={model_config.quantization}, "
f"seed={model_config.seed})")
# TODO(woosuk): Print more configs in debug mode.
self.model_config = model_config # TODO: currently is hfconfig
self.cache_config = cache_config
self.lora_config = lora_config
assert self.cache_config.sliding_window == getattr(self.model_config.hf_config, "sliding_window", None)
self.parallel_config = parallel_config
self.scheduler_config = scheduler_config
self.device_config = device_config
self.log_stats = log_stats
self._verify_args()
# self.model = model # should not store the model, it should be deleted
# TODO(shengguangming): maybe we can choose init here or from arguments
self._init_tokenizer(tokenizer)
self.seq_counter = Counter()
# Create the parallel GPU workers.
self._init_workers_sp(model, distributed_init_method)
# Profile the memory usage and initialize the cache.
self._init_cache_sp()
# Create the scheduler.
# NOTE(shengguangming): each process will have independent scheduler
self.scheduler = Scheduler(scheduler_config, cache_config, lora_config)
# Metric Logging.
if self.log_stats:
self.stat_logger = StatLogger(local_interval=_LOCAL_LOGGING_INTERVAL_SEC)
# Logging.
self.last_logging_time = 0.0
# List of (timestamp, num_tokens)
self.num_prompt_tokens: List[Tuple[float, int]] = []
# List of (timestamp, num_tokens)
self.num_generation_tokens: List[Tuple[float, int]] = []
def _init_tokenizer(self, tokenizer, **tokenizer_init_kwargs):
init_kwargs = dict(enable_lora=bool(self.lora_config),
max_num_seqs=self.scheduler_config.max_num_seqs,
max_input_length=None)
init_kwargs.update(tokenizer_init_kwargs)
self.tokenizer: TokenizerGroup = TokenizerGroup(tokenizer, **init_kwargs)
# TODO: check get_lora_tokenizer func
def get_tokenizer_for_seq(self, sequence: Sequence):
return self.tokenizer.get_lora_tokenizer(sequence.lora_request)
def _init_workers_sp(self, model, distributed_init_method: str):
# Lazy import the Worker to avoid importing torch.cuda/xformers
# before CUDA_VISIBLE_DEVICES is set in the Worker
from .worker import Worker # pylint: disable=import-outside-toplevel
rank = int(os.getenv("RANK"))
self.worker = Worker(
model,
self.model_config,
self.parallel_config,
self.scheduler_config,
self.device_config,
rank,
distributed_init_method,
lora_config=self.lora_config,
kv_cache_dtype=self.cache_config.cache_dtype,
)
# NOTE(shengguangming): torch.distributed.init_process_group will be called inside the init_model()
self.worker.init_model()
self.worker.load_model()
def _verify_args(self) -> None:
self.model_config.verify_with_parallel_config(self.parallel_config)
self.cache_config.verify_with_parallel_config(self.parallel_config)
def _init_cache_sp(self) -> None:
"""Profiles the memory usage and initializes the KV cache."""
# Get the maximum number of blocks that can be allocated on GPU and CPU.
num_blocks = self.worker.profile_num_available_blocks(
block_size=self.cache_config.block_size,
gpu_memory_utilization=self.cache_config.gpu_memory_utilization,
cpu_swap_space=self.cache_config.swap_space_bytes,
cache_dtype=self.cache_config.cache_dtype,
)
# NOTE(shengguangming): Now we don't use a shared centralized controler but each process will
# have its own scheduler
num_gpu_blocks = num_blocks[0]
num_cpu_blocks = num_blocks[1]
# FIXME(woosuk): Change to debug log.
logger.info(f"# GPU blocks: {num_gpu_blocks}, "
f"# CPU blocks: {num_cpu_blocks}")
if num_gpu_blocks <= 0:
raise ValueError("No available memory for the cache blocks. "
"Try increasing `gpu_memory_utilization` when "
"initializing the engine.")
max_seq_len = self.cache_config.block_size * num_gpu_blocks
if self.model_config.max_model_len > max_seq_len:
raise ValueError(f"The model's max seq len ({self.model_config.max_model_len}) "
"is larger than the maximum number of tokens that can be "
f"stored in KV cache ({max_seq_len}). Try increasing "
"`gpu_memory_utilization` or decreasing `max_model_len` when "
"initializing the engine.")
self.cache_config.num_gpu_blocks = num_gpu_blocks
self.cache_config.num_cpu_blocks = num_cpu_blocks
# Initialize the cache.
self.worker.init_cache_engine(cache_config=self.cache_config)
self.worker.warm_up_model()
def init_cache_engine(self):
self.worker.init_cache_engine(cache_config=self.cache_config)
def free_cache_engine(self):
self.worker.free_cache_engine()
@classmethod
def from_engine_args(cls, model, tokenizer, engine_args: EngineArgs) -> "LLMEngine":
"""Creates an LLM engine from the engine arguments."""
# Create the engine configs.
engine_configs = engine_args.create_engine_configs()
parallel_config = engine_configs[2]
# Initialize the cluster.
distributed_init_method, placement_group = initialize_cluster(parallel_config)
# Create the LLM engine.
engine = cls(model,
tokenizer,
*engine_configs,
distributed_init_method,
placement_group,
log_stats=not engine_args.disable_log_stats)
return engine
def add_request(
self,
request_id: str,
prompt: Optional[str],
sampling_params: SamplingParams,
prompt_token_ids: Optional[List[int]] = None,
arrival_time: Optional[float] = None,
lora_request: Optional[LoRARequest] = None,
prefix_pos: Optional[int] = None,
) -> None:
"""Add a request to the engine's request pool.
The request is added to the request pool and will be processed by the
scheduler as `engine.step()` is called. The exact scheduling policy is
determined by the scheduler.
Args:
request_id: The unique ID of the request.
prompt: The prompt string. Can be None if prompt_token_ids is
provided.
sampling_params: The sampling parameters for text generation.
prompt_token_ids: The token IDs of the prompt. If None, we
use the tokenizer to convert the prompts to token IDs.
arrival_time: The arrival time of the request. If None, we use
the current monotonic time.
prefix_pos: If not None, we use the given position as the prefix
position for each prompt. We will cache the prefix's KV
cache and reuse it for the next request with the same prefix.
This is an experimental feature, and may be replaced with
automatic prefix caching in the future.
Details:
- Set arrival_time to the current time if it is None.
- Set prompt_token_ids to the encoded prompt if it is None.
- Create `best_of` number of :class:`~vllm.Sequence` objects.
- Create a :class:`~vllm.SequenceGroup` object
from the list of :class:`~vllm.Sequence`.
- Add the :class:`~vllm.SequenceGroup` object to the scheduler.
Example:
>>> # initialize engine
>>> engine = LLMEngine.from_engine_args(engine_args)
>>> # set request arguments
>>> example_prompt = "Who is the president of the United States?"
>>> sampling_params = SamplingParams(temperature=0.0)
>>> request_id = 0
>>>
>>> # add the request to the engine
>>> engine.add_request(
>>> str(request_id),
>>> example_prompt,
>>> SamplingParams(temperature=0.0))
>>> # continue the request processing
>>> ...
"""
if lora_request is not None and not self.lora_config:
raise ValueError(f"Got lora_request {lora_request} but LoRA is "
"not enabled!")
if arrival_time is None:
arrival_time = time.monotonic()
if prompt_token_ids is None:
assert prompt is not None
prompt_token_ids = self.tokenizer.encode(prompt)
# Create the sequences.
block_size = self.cache_config.block_size
seq_id = next(self.seq_counter)
seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, lora_request)
# Check whether the input specifies prefix
prefix = self.scheduler.prefix_pool.add_or_get_prefix(prompt_token_ids[:prefix_pos], lora_request.lora_int_id if
lora_request else 0) if prefix_pos is not None else None
# Create the sequence group.
seq_group = SequenceGroup(request_id, [seq], sampling_params, arrival_time, lora_request, prefix)
# Add the sequence group to the scheduler.
self.scheduler.add_seq_group(seq_group)
def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:
"""Aborts a request(s) with the given ID.
Args:
request_id: The ID(s) of the request to abort.
Details:
- Refer to the
:meth:`~vllm.core.scheduler.Scheduler.abort_seq_group`
from class :class:`~vllm.core.scheduler.Scheduler`.
Example:
>>> # initialize engine and add a request with request_id
>>> request_id = str(0)
>>> # abort the request
>>> engine.abort_request(request_id)
"""
self.scheduler.abort_seq_group(request_id)
def get_model_config(self) -> ModelConfig:
"""Gets the model configuration."""
return self.model_config
def get_num_unfinished_requests(self) -> int:
"""Gets the number of unfinished requests."""
return self.scheduler.get_num_unfinished_seq_groups()
def has_unfinished_requests(self) -> bool:
"""Returns True if there are unfinished requests."""
return self.scheduler.has_unfinished_seqs()
def _check_beam_search_early_stopping(
self,
early_stopping: Union[bool, str],
sampling_params: SamplingParams,
best_running_seq: Sequence,
current_worst_seq: Sequence,
) -> bool:
assert sampling_params.use_beam_search
length_penalty = sampling_params.length_penalty
if early_stopping is True:
return True
current_worst_score = (current_worst_seq.get_beam_search_score(
length_penalty=length_penalty, eos_token_id=self.get_tokenizer_for_seq(current_worst_seq).eos_token_id))
if early_stopping is False:
highest_attainable_score = (best_running_seq.get_beam_search_score(
length_penalty=length_penalty, eos_token_id=self.get_tokenizer_for_seq(best_running_seq).eos_token_id))
else:
assert early_stopping == "never"
if length_penalty > 0.0:
# If length_penalty > 0.0, beam search will prefer longer
# sequences. The highest attainable score calculation is
# based on the longest possible sequence length in this case.
max_possible_length = max(best_running_seq.get_prompt_len() + sampling_params.max_tokens,
self.scheduler_config.max_model_len)
highest_attainable_score = (best_running_seq.get_beam_search_score(
length_penalty=length_penalty,
eos_token_id=self.get_tokenizer_for_seq(best_running_seq).eos_token_id,
seq_len=max_possible_length))
else:
# Otherwise, beam search will prefer shorter sequences. The
# highest attainable score calculation is based on the current
# sequence length.
highest_attainable_score = (best_running_seq.get_beam_search_score(
length_penalty=length_penalty,
eos_token_id=self.get_tokenizer_for_seq(best_running_seq).eos_token_id))
def _process_sequence_group_outputs(self, seq_group: SequenceGroup, outputs: SequenceGroupOutput) -> None:
# Process prompt logprobs
prompt_logprobs = outputs.prompt_logprobs
if prompt_logprobs is not None:
seq_group.prompt_logprobs = prompt_logprobs
# Process samples
samples = outputs.samples
parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
existing_finished_seqs = seq_group.get_finished_seqs()
parent_child_dict = {parent_seq.seq_id: [] for parent_seq in parent_seqs}
for sample in samples:
parent_child_dict[sample.parent_seq_id].append(sample)
# List of (child, parent)
child_seqs: List[Tuple[Sequence, Sequence]] = []
# Process the child samples for each parent sequence
for parent in parent_seqs:
child_samples: List[SequenceOutput] = parent_child_dict[parent.seq_id]
if len(child_samples) == 0:
# This parent sequence has no children samples. Remove
# the parent sequence from the sequence group since it will
# not be used in the future iterations.
parent.status = SequenceStatus.FINISHED_ABORTED
seq_group.remove(parent.seq_id)
self.scheduler.free_seq(parent)
continue
# Fork the parent sequence if there are multiple child samples.
for child_sample in child_samples[:-1]:
new_child_seq_id = next(self.seq_counter)
child = parent.fork(new_child_seq_id)
child.append_token_id(child_sample.output_token, child_sample.logprobs)
child_seqs.append((child, parent))
# Continue the parent sequence for the last child sample.
# We reuse the parent sequence here to reduce redundant memory
# copies, especially when using non-beam search sampling methods.
last_child_sample = child_samples[-1]
parent.append_token_id(last_child_sample.output_token, last_child_sample.logprobs)
child_seqs.append((parent, parent))
for seq, _ in child_seqs:
# self._decode_sequence(seq, seq_group.sampling_params)
self._check_stop(seq, seq_group.sampling_params)
# Non-beam search case
if not seq_group.sampling_params.use_beam_search:
# For newly created child sequences, add them to the sequence group
# and fork them in block manager if they are not finished.
for seq, parent in child_seqs:
if seq is not parent:
seq_group.add(seq)
if not seq.is_finished():
self.scheduler.fork_seq(parent, seq)
# Free the finished and selected parent sequences' memory in block
# manager. Keep them in the sequence group as candidate output.
# NOTE: we need to fork the new sequences before freeing the
# old sequences.
for seq, parent in child_seqs:
if seq is parent and seq.is_finished():
self.scheduler.free_seq(seq)
return
# Beam search case
# Select the child sequences to keep in the sequence group.
selected_child_seqs = []
unselected_child_seqs = []
beam_width = seq_group.sampling_params.best_of
length_penalty = seq_group.sampling_params.length_penalty
# Select the newly finished sequences with the highest scores
# to replace existing finished sequences.
# Tuple of (seq, parent, is_new)
existing_finished_seqs = [(seq, None, False) for seq in existing_finished_seqs]
new_finished_seqs = [(seq, parent, True) for seq, parent in child_seqs if seq.is_finished()]
all_finished_seqs = existing_finished_seqs + new_finished_seqs
# Sort the finished sequences by their scores.
all_finished_seqs.sort(key=lambda x: x[0].get_beam_search_score(
length_penalty=length_penalty, eos_token_id=self.get_tokenizer_for_seq(x[0]).eos_token_id),
reverse=True)
for seq, parent, is_new in all_finished_seqs[:beam_width]:
if is_new:
# A newly generated child sequence finishes and has a high
# score, so we will add it into the sequence group.
selected_child_seqs.append((seq, parent))
for seq, parent, is_new in all_finished_seqs[beam_width:]:
if is_new:
# A newly generated child sequence finishes but has a low
# score, so we will not add it into the sequence group.
# Additionally, if this sequence is a continuation of a
# parent sequence, we will need remove the parent sequence
# from the sequence group.
unselected_child_seqs.append((seq, parent))
else:
# An existing finished sequence has a low score, so we will
# remove it from the sequence group.
seq_group.remove(seq.seq_id)
# select the top beam_width sequences from the running
# sequences for the next iteration to continue the beam
# search.
running_child_seqs = [(seq, parent) for seq, parent in child_seqs if not seq.is_finished()]
# Sort the running sequences by their scores.
running_child_seqs.sort(key=lambda x: x[0].get_beam_search_score(
length_penalty=length_penalty, eos_token_id=self.get_tokenizer_for_seq(x[0]).eos_token_id),
reverse=True)
# Check if we can stop the beam search.
if len(running_child_seqs) == 0:
# No running sequences, stop the beam search.
stop_beam_search = True
elif len(all_finished_seqs) < beam_width:
# Not enough finished sequences, continue the beam search.
stop_beam_search = False
else:
# Check the early stopping criteria
best_running_seq = running_child_seqs[0][0]
current_worst_seq = all_finished_seqs[beam_width - 1][0]
stop_beam_search = self._check_beam_search_early_stopping(seq_group.sampling_params.early_stopping,
seq_group.sampling_params, best_running_seq,
current_worst_seq)
if stop_beam_search:
# Stop the beam search and remove all the running sequences from
# the sequence group.
unselected_child_seqs.extend(running_child_seqs)
else:
# Continue the beam search and select the top beam_width sequences
# to continue the beam search.
selected_child_seqs.extend(running_child_seqs[:beam_width])
# The remaining running sequences will not be used in the next
# iteration. Again, if these sequences are continuations of
# parent sequences, we will need to remove the parent sequences
# from the sequence group.
unselected_child_seqs.extend(running_child_seqs[beam_width:])
# For newly created child sequences, add them to the sequence group
# and fork them in block manager if they are not finished.
for seq, parent in selected_child_seqs:
if seq is not parent:
seq_group.add(seq)
if not seq.is_finished():
self.scheduler.fork_seq(parent, seq)
# Free the finished and selected parent sequences' memory in block
# manager. Keep them in the sequence group as candidate output.
for seq, parent in selected_child_seqs:
if seq is parent and seq.is_finished():
self.scheduler.free_seq(seq)
# Remove the unselected parent sequences from the sequence group and
# free their memory in block manager.
for seq, parent in unselected_child_seqs:
if seq is parent:
# Remove the parent sequence if it is not selected for next
# iteration
seq_group.remove(seq.seq_id)
self.scheduler.free_seq(seq)
def _process_model_outputs(self, output: SamplerOutput, scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]:
# Update the scheduled sequence groups with the model outputs.
scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups
for seq_group, outputs in zip(scheduled_seq_groups, output):
self._process_sequence_group_outputs(seq_group, outputs)
# Free the finished sequence groups.
self.scheduler.free_finished_seq_groups()
# Create the outputs.
request_outputs: List[RequestOutput] = []
for seq_group in scheduled_seq_groups:
request_output = RequestOutput.from_seq_group(seq_group)
request_outputs.append(request_output)
for seq_group in scheduler_outputs.ignored_seq_groups:
request_output = RequestOutput.from_seq_group(seq_group)
request_outputs.append(request_output)
# Update prefix state, now all the uncomputed prefixes are computed.
for seq_group in scheduled_seq_groups:
if (seq_group.prefix is not None and seq_group.prefix.allocated and not seq_group.prefix.computed):
seq_group.prefix.computed = True
# Log stats.
if self.log_stats:
self.stat_logger.log(self._get_stats(scheduler_outputs))
return request_outputs
def step(self) -> List[RequestOutput]:
"""Performs one decoding iteration and returns newly generated results.
This function performs one decoding iteration of the engine. It first
schedules the sequences to be executed in the next iteration and the
token blocks to be swapped in/out/copy. Then, it executes the model
and updates the scheduler with the model outputs. Finally, it decodes
the sequences and returns the newly generated results.
"""
seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule()
if not scheduler_outputs.is_empty():
output = self.worker.execute_model(
seq_group_metadata_list=seq_group_metadata_list, # TODO: check this input
blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out,
blocks_to_copy=scheduler_outputs.blocks_to_copy,)
else:
return [RequestOutput.from_seq_group(seq_group) for seq_group in scheduler_outputs.ignored_seq_groups]
return self._process_model_outputs(output, scheduler_outputs)
def do_log_stats(self) -> None:
"""Forced log when no requests active."""
if self.log_stats:
self.stat_logger.log(self._get_stats(scheduler_outputs=None))
def _get_stats(self, scheduler_outputs: Optional[SchedulerOutputs]) -> Stats:
"""Get Stats to be Logged to Prometheus."""
now = time.monotonic()
# KV Cache Usage in %.
num_total_gpu = self.cache_config.num_gpu_blocks
num_free_gpu = self.scheduler.block_manager.get_num_free_gpu_blocks()
gpu_cache_usage = 1.0 - (num_free_gpu / num_total_gpu)
num_total_cpu = self.cache_config.num_cpu_blocks
cpu_cache_usage = 0.
if num_total_cpu > 0:
num_free_cpu = self.scheduler.block_manager.get_num_free_cpu_blocks()
cpu_cache_usage = 1.0 - (num_free_cpu / num_total_cpu)
# Scheduler State
num_running = len(self.scheduler.running)
num_swapped = len(self.scheduler.swapped)
num_waiting = len(self.scheduler.waiting)
# Iteration stats if we have scheduler output.
num_prompt_tokens = 0
num_generation_tokens = 0
time_to_first_tokens = []
time_per_output_tokens = []
time_e2e_requests = []
if scheduler_outputs is not None:
prompt_run = scheduler_outputs.prompt_run
# Number of Tokens.
if prompt_run:
num_prompt_tokens = scheduler_outputs.num_batched_tokens
else:
num_generation_tokens = scheduler_outputs.num_batched_tokens
# Latency Timings.
time_last_iters = []
for seq_group in scheduler_outputs.scheduled_seq_groups:
# Time since last token. (n.b. updates seq_group.last_token_time)
time_last_iters.append(seq_group.get_last_latency(now))
# Time since arrival for all finished requests.
if seq_group.is_finished():
time_e2e_requests.append(now - seq_group.arrival_time)
time_to_first_tokens = time_last_iters if prompt_run else []
time_per_output_tokens = [] if prompt_run else time_last_iters
return Stats(
now=now,
num_running=num_running,
num_swapped=num_swapped,
num_waiting=num_waiting,
gpu_cache_usage=gpu_cache_usage,
cpu_cache_usage=cpu_cache_usage,
num_prompt_tokens=num_prompt_tokens,
num_generation_tokens=num_generation_tokens,
time_to_first_tokens=time_to_first_tokens,
time_per_output_tokens=time_per_output_tokens,
time_e2e_requests=time_e2e_requests,
)
# TODO: we may not need to decode
def _decode_sequence(self, seq: Sequence, prms: SamplingParams) -> None:
"""Decodes the new token for a sequence."""
(new_tokens, new_output_text, prefix_offset, read_offset) = detokenize_incrementally(
self.get_tokenizer_for_seq(seq),
all_input_ids=seq.get_token_ids(),
prev_tokens=seq.tokens,
prefix_offset=seq.prefix_offset,
read_offset=seq.read_offset,
skip_special_tokens=prms.skip_special_tokens,
spaces_between_special_tokens=prms.spaces_between_special_tokens,
)
if seq.tokens is None:
seq.tokens = new_tokens
else:
seq.tokens.extend(new_tokens)
seq.prefix_offset = prefix_offset
seq.read_offset = read_offset
seq.output_text += new_output_text
def _check_stop(self, seq: Sequence, sampling_params: SamplingParams) -> None:
"""Stop the finished sequences."""
# for stop_str in sampling_params.stop:
# if seq.output_text.endswith(stop_str):
# self._finalize_sequence(seq, sampling_params, stop_str)
# seq.status = SequenceStatus.FINISHED_STOPPED
# return
# if seq.get_last_token_id() in sampling_params.stop_token_ids:
# stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens(seq.get_last_token_id())
# self._finalize_sequence(seq, sampling_params, stop_str)
# seq.status = SequenceStatus.FINISHED_STOPPED
# return
# Check if the sequence has reached max_model_len.
if seq.get_len() > self.scheduler_config.max_model_len:
seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
return
# Check if the sequence has reached max_tokens.
if seq.get_output_len() == sampling_params.max_tokens:
seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
return
# Check if the sequence has generated the EOS token.
if ((not sampling_params.ignore_eos) and
seq.get_last_token_id() == self.get_tokenizer_for_seq(seq).eos_token_id):
seq.status = SequenceStatus.FINISHED_STOPPED
return
def _finalize_sequence(self, seq: Sequence, sampling_params: SamplingParams, stop_string: str) -> None:
if not sampling_params.include_stop_str_in_output and stop_string:
# Truncate the output text so that the stop string is
# not included in the output.
seq.output_text = seq.output_text[:-len(stop_string)]
def add_lora(self, lora_request: LoRARequest) -> bool:
assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
return self.worker.add_lora(lora_request)
def remove_lora(self, lora_id: int) -> bool:
assert lora_id > 0, "lora_id must be greater than 0."
return self.worker.remove_lora(lora_id)
def list_loras(self) -> List[int]:
return self.worker.list_loras()
def sync_model_weights(self, actor_weights: Dict[str, torch.Tensor]) -> None:
self.worker.sync_model_weights(actor_weights=actor_weights)
def offload_model_weights(self) -> None:
self.worker.offload_model_weights()
def initialize_cluster(
parallel_config: ParallelConfig,
engine_use_ray: bool = False,
ray_address: Optional[str] = None,
) -> Tuple[str, Optional[None]]:
"""Initialize the distributed cluster probably with Ray.
Args:
parallel_config: The configurations for parallel execution.
engine_use_ray: Whether to use Ray for async engine.
ray_address: The address of the Ray cluster. If None, uses
the default Ray cluster address.
Returns:
A tuple of (`distributed_init_method`, `placement_group`). The
`distributed_init_method` is the address for initializing the
distributed backend. `placement_group` includes the specification
of the resources for each distributed worker.
"""
# Initialize cluster locally.
port = get_open_port()
# We need to setup the distributed init method to make sure
# the distributed megatron code (e.g., get world size) works correctly.
distributed_init_method = f"tcp://localhost:{port}"
return distributed_init_method, None
def get_open_port():
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(("", 0))
return s.getsockname()[1]

View File

@ -0,0 +1,275 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/model_loader
"""Utilities for selecting and loading models."""
import contextlib
from typing import Dict, Type, Union
import torch
import torch.nn as nn
from transformers import PretrainedConfig, PreTrainedModel
from megatron.core.tensor_parallel.utils import VocabUtility
from vllm.model_executor.models import ModelRegistry
from vllm.model_executor.weight_utils import (get_quant_config, initialize_dummy_weights)
from .config import ModelConfig
from vllm.config import DeviceConfig, LoRAConfig
from .weight_loaders import *
from vllm.model_executor.sampling_metadata import SamplingMetadata, SamplingTensors
from vllm.sequence import SamplerOutput
from typing import Optional
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.sampler import _prune_hidden_states, _apply_logits_processors, _apply_penalties, _apply_top_k_top_p, _apply_min_p, _apply_penalties, _sample, _get_logprobs, _build_sampler_output
@contextlib.contextmanager
def _set_default_torch_dtype(dtype: torch.dtype):
"""Sets the default torch dtype to the given dtype."""
old_dtype = torch.get_default_dtype()
torch.set_default_dtype(dtype)
yield
torch.set_default_dtype(old_dtype)
def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]:
architectures = getattr(config, "architectures", [])
for arch in architectures:
model_cls = ModelRegistry.load_model_cls(arch)
if model_cls is not None:
return model_cls
raise ValueError(f"Model architectures {architectures} are not supported for now. "
f"Supported architectures: {ModelRegistry.get_supported_archs()}")
from vllm.model_executor.layers.linear import *
from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead
from vllm.model_executor.layers.activation import ScaledActivation
__LAYER_WEIGHT_LOADER_REGISTRY__ = {
ColumnParallelLinear: parallel_weight_loader,
MergedColumnParallelLinear: parallel_weight_loader,
QKVParallelLinear: parallel_weight_loader,
RowParallelLinear: parallel_weight_loader,
VocabParallelEmbedding: parallel_weight_loader,
ParallelLMHead: parallel_weight_loader
# "ScaledActivation.weight_loader": ScaledActivation, # TODO(shengguangming): latest commit in vllm fix awq for this function and add load_weights
# "default_weight_loader": default_weight_loader
}
# NOTE(gmsheng): change the weight_loader function in runtime
for layer_class, weight_loader in __LAYER_WEIGHT_LOADER_REGISTRY__.items():
layer_class.weight_loader = weight_loader
__MODEL_WEIGHT_LOADER_REGISTRY__ = {
'GPT2LMHeadModel': gpt2_weight_loader,
'LlamaForCausalLM': llama_weight_loader,
'LLaMAForCausalLM': llama_weight_loader,
'MistralForCausalLM': mistral_weight_loader,
}
# FIXME(shengguangming): the vLLM vocab will pad to 64, which may incur out of bounds
# so we need to rewrite the init function of vocab
DEFAULT_VOCAB_PADDING_SIZE = 64
def vocab_init(self,
num_embeddings: int,
embedding_dim: int,
params_dtype: Optional[torch.dtype] = None,
org_num_embeddings: Optional[int] = None,
padding_size: int = DEFAULT_VOCAB_PADDING_SIZE):
super(VocabParallelEmbedding, self).__init__()
# Keep the input dimensions.
# TODO (pad to be divided by 4)
self.num_embeddings = num_embeddings
self.org_vocab_size = org_num_embeddings or num_embeddings
# self.num_embeddings_padded = pad_vocab_size(num_embeddings,
# padding_size)
self.embedding_dim = embedding_dim
if params_dtype is None:
params_dtype = torch.get_default_dtype()
self.tp_size = get_tensor_model_parallel_world_size()
# Divide the weight matrix along the vocaburaly dimension.
self.vocab_start_index, self.vocab_end_index = (VocabUtility.vocab_range_from_global_vocab_size(
self.num_embeddings, get_tensor_model_parallel_rank(), self.tp_size))
self.num_embeddings_per_partition = (self.vocab_end_index - self.vocab_start_index)
self.weight = Parameter(
torch.empty(
self.num_embeddings_per_partition,
self.embedding_dim,
# device=torch.cuda.current_device(),
dtype=params_dtype))
set_weight_attrs(self.weight, {"parallel_dim": 0, "weight_loader": self.weight_loader})
VocabParallelEmbedding.__init__ = vocab_init
def _get_model_weight_loader(arch: str):
if arch in __MODEL_WEIGHT_LOADER_REGISTRY__:
return __MODEL_WEIGHT_LOADER_REGISTRY__[arch]
raise ValueError(f"Model architectures {arch} are not supported for now. "
f"Supported architectures: {ModelRegistry.get_supported_archs()}")
def get_model(actor_model: Union[PreTrainedModel, Dict],
model_config: ModelConfig,
device_config: DeviceConfig,
lora_config: Optional[LoRAConfig] = None) -> nn.Module:
model_class = _get_model_architecture(model_config.hf_config)
# Get the quantization config.
linear_method = None
quant_config = None
if model_config.quantization is not None:
quant_config = get_quant_config(model_config.quantization, model_config.model, model_config.hf_config,
model_config.download_dir)
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
if capability < quant_config.get_min_capability():
raise ValueError(f"The quantization method {model_config.quantization} is not "
"supported for the current GPU. "
f"Minimum capability: {quant_config.get_min_capability()}. "
f"Current capability: {capability}.")
supported_dtypes = quant_config.get_supported_act_dtypes()
if model_config.dtype not in supported_dtypes:
raise ValueError(f"{model_config.dtype} is not supported for quantization "
f"method {model_config.quantization}. Supported dtypes: "
f"{supported_dtypes}")
linear_method = quant_config.get_linear_method()
with _set_default_torch_dtype(model_config.dtype):
# Create a model instance.
# The weights will be initialized as empty tensors.
# with torch.device(device_config.device):
# NOTE(sgm): init the model in cpu
model = model_class(model_config.hf_config, linear_method)
if model_config.load_format == "dummy":
model = model.cuda()
# NOTE(woosuk): For accurate performance evaluation, we assign
# random values to the weights.
initialize_dummy_weights(model)
elif model_config.load_format == 'model' or model_config.load_format == 'auto':
# NOTE(shengguangming) Load the weights from the actor model
if isinstance(actor_model, nn.Module):
load_weights(actor_weights=dict(actor_model.named_parameters(remove_duplicate=False)), vllm_model=model)
else:
load_weights(actor_weights=actor_model, vllm_model=model)
# NOTE(sgm) Some weights are point to gpu, but still need this.
model = model.cuda() # NOTE (zhangchi.usc1992) We need this for vllm to profile memory usage
return model.eval()
# the actor model is .state_dict()
def load_weights(actor_weights: Dict, vllm_model: nn.Module):
weight_loader = _get_model_weight_loader(vllm_model.__class__.__name__)
weight_loader(actor_weights, vllm_model)
# NOTE(sgm) to reduce peak memory usage, we offload vllm model to cpu
# after init, and we need this after sync model weights for in first iter.
vllm_model = vllm_model.cuda()
# FIXME(sgm): hack the Sampler function in vllm v0.3.1
# as they use ray, the sampler result will only need to return to the driver node,
# therefore gather is enough. However, we use SPMD instead of a central scheduler,
# all_gather is required (aligned with v0.2.6)
def _get_logits(self, hidden_states: torch.Tensor, embedding: torch.Tensor,
embedding_bias: Optional[torch.Tensor]) -> torch.Tensor:
# Get the logits for the next tokens.
logits = torch.matmul(hidden_states, embedding.t())
if embedding_bias is not None:
logits += embedding_bias
logits = tensor_model_parallel_all_gather(logits)
# Remove paddings in vocab (if any).
if logits is not None:
logits = logits[:, :self.org_vocab_size]
return logits
def forward(
self,
embedding: torch.Tensor,
hidden_states: torch.Tensor,
sampling_metadata: SamplingMetadata,
embedding_bias: Optional[torch.Tensor] = None,
) -> Optional[SamplerOutput]:
# Get the hidden states that we use for sampling.
hidden_states = _prune_hidden_states(hidden_states, sampling_metadata)
# Get the logits for the next tokens.
logits = self._get_logits(hidden_states, embedding, embedding_bias)
# save origin logprobs for sampler_output
origin_logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float)
# Only perform sampling in the driver worker.
# Note: `_get_logits` is still distributed across TP workers because
# the `embedding` weight is distributed across TP workers.
# TODO(zhuohan): Change the get_logits part to a separate stage.
if not sampling_metadata.perform_sampling:
return None
assert logits is not None
_, vocab_size = logits.shape
# Apply logits processors (if any).
logits = _apply_logits_processors(logits, sampling_metadata)
# Prepare sampling tensors with pinned memory to avoid blocking.
(sampling_tensors, do_penalties, do_top_p_top_k,
do_min_p) = SamplingTensors.from_sampling_metadata(sampling_metadata, vocab_size, logits.device, logits.dtype)
# Apply presence and frequency penalties.
if do_penalties:
logits = _apply_penalties(logits, sampling_tensors.prompt_tokens, sampling_tensors.output_tokens,
sampling_tensors.presence_penalties, sampling_tensors.frequency_penalties,
sampling_tensors.repetition_penalties)
# Apply temperature scaling.
# Use in-place division to avoid creating a new tensor.
logits.div_(sampling_tensors.temperatures.unsqueeze_(dim=1))
if do_top_p_top_k:
logits = _apply_top_k_top_p(logits, sampling_tensors.top_ps, sampling_tensors.top_ks)
if do_min_p:
logits = _apply_min_p(logits, sampling_tensors.min_ps)
# We use float32 for probabilities and log probabilities.
# Compute the probabilities.
probs = torch.softmax(logits, dim=-1, dtype=torch.float)
# Compute the log probabilities.
# Use log_softmax to ensure numerical stability.
logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float)
# Sample the next tokens.
sample_results = _sample(probs, logprobs, sampling_metadata)
# Get the logprobs query results.
# prompt_logprobs, sample_logprobs = _get_logprobs(
# logprobs, sampling_metadata, sample_results)
prompt_logprobs, sample_logprobs = _get_logprobs(origin_logprobs, sampling_metadata, sample_results)
return _build_sampler_output(sample_results, sampling_metadata, prompt_logprobs, sample_logprobs)
from vllm.model_executor.layers.sampler import Sampler
Sampler._get_logits = _get_logits
Sampler.forward = forward

View File

@ -0,0 +1,285 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/worker/model_runner.py
from typing import Dict, List, Optional, Tuple, Set, Union
import contextlib
import time
import numpy as np
import torch
import torch.nn as nn
from vllm.config import (DeviceConfig, ModelConfig, LoRAConfig, ParallelConfig, SchedulerConfig)
from vllm.logger import init_logger
from vllm.model_executor import InputMetadata, SamplingMetadata
from vllm.sampling_params import SamplingParams, SamplingType
from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
from vllm.lora.layers import LoRAMapping
from vllm.lora.request import LoRARequest
from vllm.utils import in_wsl
from vllm.worker.model_runner import ModelRunner, CUDAGraphRunner, _async_h2d
from .model_loader import get_model
logger = init_logger(__name__)
KVCache = Tuple[torch.Tensor, torch.Tensor]
_PAD_SLOT_ID = -1
LORA_WARMUP_RANK = 8
# Capture graphs for batch size 1, 2, 4, 8, 16, 24, 32, 40, ..., 256.
# NOTE: _get_graph_batch_size needs to be updated if this list is changed.
_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [8 * i for i in range(1, 33)]
class ModelRunner(ModelRunner):
def __init__(
self,
model: Union[nn.Module, Dict], # model itself or its parameter dict
model_config: ModelConfig,
parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
device_config: DeviceConfig,
lora_config: Optional[LoRAConfig],
kv_cache_dtype: Optional[str] = "auto",
):
self.model_config = model_config
self.parallel_config = parallel_config
self.scheduler_config = scheduler_config
self.lora_config = lora_config
# model_config can be None in tests/samplers/test_sampler.py.
# FIXME(woosuk): This is a hack to make the tests work. Refactor this.
self.sliding_window = (model_config.get_sliding_window() if model_config is not None else None)
self.device_config = (device_config if device_config is not None else DeviceConfig())
self.device = self.device_config.device
self.model = model # this will be replaced by get_model()
self.block_size = None # Set after initial profiling.
self.lora_manager = None
self.graph_runners: Dict[int, CUDAGraphRunner] = {}
self.graph_memory_pool = None # Set during graph capture.
self.max_context_len_to_capture = (self.model_config.max_context_len_to_capture
if self.model_config is not None else 0)
# When using CUDA graph, the input block tables must be padded to
# max_context_len_to_capture. However, creating the block table in
# Python can be expensive. To optimize this, we cache the block table
# in numpy and only copy the actual input content at every iteration.
# The shape of the cached block table will be
# (max batch size to capture, max context len to capture / block size).
self.graph_block_tables = None # Set after initial profiling.
# cache in_wsl result
self.in_wsl = in_wsl()
self.kv_cache_dtype = kv_cache_dtype
def load_model(self) -> None:
self.model = get_model(actor_model=self.model,
model_config=self.model_config,
device_config=self.device_config,
lora_config=self.lora_config)
vocab_size = self.model.config.vocab_size
if self.lora_config:
assert hasattr(
self.model,
"supported_lora_modules") and self.model.supported_lora_modules, "Model does not support LoRA"
assert hasattr(self.model, "embedding_modules"), "Model does not have embedding_modules"
assert hasattr(self.model, "embedding_padding_modules"), "Model does not have embedding_padding_modules"
self.lora_manager = LRUCacheWorkerLoRAManager(
self.scheduler_config.max_num_seqs,
self.scheduler_config.max_num_batched_tokens + self.scheduler_config.max_paddings, vocab_size,
self.lora_config, self.device, self.model.embedding_modules, self.model.embedding_padding_modules)
self.model = self.lora_manager.create_lora_manager(self.model)
def _prepare_sample(
self,
seq_group_metadata_list: List[SequenceGroupMetadata],
prompt_lens: List[int],
subquery_lens: Optional[List[int]],
) -> SamplingMetadata:
seq_groups: List[Tuple[List[int], SamplingParams]] = []
selected_token_indices: List[int] = []
selected_token_start_idx = 0
categorized_sample_indices = {t: [] for t in SamplingType}
categorized_sample_indices_start_idx = 0
max_subquery_len = max(subquery_lens) if subquery_lens else 1
for i, seq_group_metadata in enumerate(seq_group_metadata_list):
seq_ids = list(seq_group_metadata.seq_data.keys())
sampling_params = seq_group_metadata.sampling_params
seq_groups.append((seq_ids, sampling_params))
if seq_group_metadata.is_prompt:
assert len(seq_ids) == 1
assert subquery_lens is not None
subquery_len = subquery_lens[i]
if sampling_params.prompt_logprobs is not None:
# NOTE: prompt token positions do not need sample, skip
categorized_sample_indices_start_idx += subquery_len - 1
categorized_sample_indices[sampling_params.sampling_type].append(categorized_sample_indices_start_idx)
categorized_sample_indices_start_idx += 1
if sampling_params.prompt_logprobs is not None:
selected_token_indices.extend(
range(selected_token_start_idx, selected_token_start_idx + subquery_len - 1))
selected_token_indices.append(selected_token_start_idx + subquery_len - 1)
selected_token_start_idx += max_subquery_len
else:
num_seqs = len(seq_ids)
selected_token_indices.extend(range(selected_token_start_idx, selected_token_start_idx + num_seqs))
selected_token_start_idx += num_seqs
categorized_sample_indices[sampling_params.sampling_type].extend(
range(categorized_sample_indices_start_idx, categorized_sample_indices_start_idx + num_seqs))
categorized_sample_indices_start_idx += num_seqs
selected_token_indices = _async_h2d(selected_token_indices,
dtype=torch.long,
target_device=self.device,
pin_memory=not self.in_wsl)
categorized_sample_indices = {
t: _async_h2d(seq_ids, dtype=torch.int, target_device=self.device, pin_memory=not self.in_wsl)
for t, seq_ids in categorized_sample_indices.items()
}
seq_data: Dict[int, SequenceData] = {}
for seq_group_metadata in seq_group_metadata_list:
seq_data.update(seq_group_metadata.seq_data)
sampling_metadata = SamplingMetadata(
seq_groups=seq_groups,
seq_data=seq_data,
prompt_lens=prompt_lens,
selected_token_indices=selected_token_indices,
categorized_sample_indices=categorized_sample_indices,
)
return sampling_metadata
def prepare_input_tensors(
self,
seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
) -> Tuple[torch.Tensor, torch.Tensor, InputMetadata, SamplingMetadata, Set[int], LoRAMapping]:
# NOTE: We assume that all sequences in the group are all prompts or
# all decodes.
is_prompt = seq_group_metadata_list[0].is_prompt
# Prepare input tensors.
if is_prompt:
(input_tokens, input_positions, input_metadata, prompt_lens, subquery_lens, lora_index_mapping,
lora_prompt_mapping, lora_requests) = self._prepare_prompt(seq_group_metadata_list)
else:
(input_tokens, input_positions, input_metadata, lora_index_mapping, lora_prompt_mapping,
lora_requests) = self._prepare_decode(seq_group_metadata_list)
prompt_lens = []
subquery_lens = None
sampling_metadata = self._prepare_sample(seq_group_metadata_list, prompt_lens, subquery_lens)
if self.lora_config:
flat_lora_index_mapping = [item for sublist in lora_index_mapping for item in sublist]
lora_mapping = LoRAMapping(
flat_lora_index_mapping,
lora_prompt_mapping,
)
else:
lora_mapping = None
return (input_tokens, input_positions, input_metadata, sampling_metadata, lora_requests, lora_mapping)
@torch.inference_mode()
def execute_model(
self,
seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
kv_caches: List[Tuple[torch.Tensor, torch.Tensor]],
) -> Optional[SamplerOutput]:
(input_tokens, input_positions, input_metadata, sampling_metadata, lora_requests,
lora_mapping) = self.prepare_input_tensors(seq_group_metadata_list)
if self.lora_config:
self.set_active_loras(lora_requests, lora_mapping)
# Execute the model.
if input_metadata.use_cuda_graph:
graph_batch_size = input_tokens.shape[0]
model_executable = self.graph_runners[graph_batch_size]
else:
model_executable = self.model
hidden_states = model_executable(
input_ids=input_tokens,
positions=input_positions,
kv_caches=kv_caches,
input_metadata=input_metadata,
)
# Sample the next token.
output = self.model.sample(
hidden_states=hidden_states,
sampling_metadata=sampling_metadata,
)
return output
@torch.inference_mode()
def profile_run(self) -> None:
# Enable top-k sampling to reflect the accurate memory usage.
vocab_size = self.model_config.get_vocab_size()
# FIXME(sgm): this sampling params will call cumsum(), causing the
# deterministic cumsum throw error
sampling_params = SamplingParams(top_p=0.99, top_k=vocab_size - 1)
max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
max_num_seqs = self.scheduler_config.max_num_seqs
# This represents the maximum number of different requests
# that will have unique loras, an therefore the max amount of memory
# consumption create dummy lora request copies from the lora request
# passed in, which contains a lora from the lora warmup path.
dummy_lora_requests = []
dummy_lora_requests_per_seq = []
if self.lora_config:
for idx in range(self.lora_config.max_loras):
lora_id = idx + 1
dummy_lora_request = LoRARequest(
lora_name=f"warmup_{lora_id}",
lora_int_id=lora_id,
lora_local_path="/not/a/real/path",
)
self.lora_manager.add_dummy_lora(dummy_lora_request, rank=LORA_WARMUP_RANK)
dummy_lora_requests.append(dummy_lora_request)
dummy_lora_requests_per_seq = [
dummy_lora_requests[idx % len(dummy_lora_requests)] for idx in range(max_num_seqs)
]
# Profile memory usage with max_num_sequences sequences and the total
# number of tokens equal to max_num_batched_tokens.
seqs: List[SequenceGroupMetadata] = []
for group_id in range(max_num_seqs):
seq_len = (max_num_batched_tokens // max_num_seqs + (group_id < max_num_batched_tokens % max_num_seqs))
seq_data = SequenceData([0] * seq_len)
seq = SequenceGroupMetadata(
request_id=str(group_id),
is_prompt=True,
seq_data={group_id: seq_data},
sampling_params=sampling_params,
block_tables=None,
lora_request=dummy_lora_requests_per_seq[group_id] if dummy_lora_requests_per_seq else None,
)
seqs.append(seq)
# Run the model with the dummy inputs.
num_layers = self.model_config.get_num_layers(self.parallel_config)
kv_caches = [(None, None)] * num_layers
self.execute_model(seqs, kv_caches)
torch.cuda.synchronize()
return

View File

@ -0,0 +1,147 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Adapted from
# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Model and data parallel groups."""
import torch
import torch.distributed
import vllm.model_executor.parallel_utils.parallel_state as ps
"""
This version is strongly tied with Megatron to implement HybridEngine and weight sharing between vllm and Megatron.
- We assume the Megatron tp+dp+pp world is already established before calling this function.
"""
# Tensor model parallel group that the current rank belongs to.
_TENSOR_MODEL_PARALLEL_GROUP = None
# Micro Data parallel group. Micro data parallel group is additional dp group that origins from splitting training tp
# into infer_tp and micro_tp. By default, we use order micro_dp - tp
_MICRO_DATA_PARALLEL_GROUP = None
def initialize_model_parallel_from_megatron(
tensor_model_parallel_size=None # we set None for backward compatibility to set infer_tp = train_tp
) -> None:
from megatron.core import parallel_state as mpu
from megatron.distributed import new_group
# Get world size and rank. Ensure some consistencies.
assert torch.distributed.is_initialized()
if tensor_model_parallel_size is None:
tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
else:
assert isinstance(tensor_model_parallel_size, int)
# Build the tensor model-parallel groups.
assert ps._TENSOR_MODEL_PARALLEL_GROUP is None, ("tensor model parallel group is already initialized")
assert tensor_model_parallel_size <= mpu.get_tensor_model_parallel_world_size(
), 'Not implemented for infer_tp > train_tp'
global _TENSOR_MODEL_PARALLEL_GROUP
global _MICRO_DATA_PARALLEL_GROUP
assert mpu.get_tensor_model_parallel_world_size() % tensor_model_parallel_size == 0
micro_dp_size = mpu.get_tensor_model_parallel_world_size() // tensor_model_parallel_size
world_size: int = torch.distributed.get_world_size()
num_micro_dp_groups = world_size // micro_dp_size
rank = torch.distributed.get_rank()
# Build the micro dp groups.
assert _MICRO_DATA_PARALLEL_GROUP is None, ("micro data parallel group is already initialized")
for i in range(num_micro_dp_groups):
ranks = range(i * micro_dp_size, (i + 1) * micro_dp_size)
group = new_group(rank=rank, ranks=ranks, group_type='micro_dp')
if rank in ranks:
_MICRO_DATA_PARALLEL_GROUP = group
if tensor_model_parallel_size == mpu.get_tensor_model_parallel_world_size():
# using the same tp group as Megatron
ps._TENSOR_MODEL_PARALLEL_GROUP = mpu.get_tensor_model_parallel_group()
_TENSOR_MODEL_PARALLEL_GROUP = mpu.get_tensor_model_parallel_group()
# no _MICRO_DATA_PARALLEL_GROUP
else:
# initialize a micro_dp group and a tp group
# assume training tp=4, infer tp=2, then, weight is partitioned as
# [1], [2], [3], [4] for training and [1,2], [1,2], [3,4], [3,4] for inference
# Build the inference tp groups
train_tp = mpu.get_tensor_model_parallel_world_size()
num_tensor_model_parallel_groups_per_train_tp = train_tp // tensor_model_parallel_size
num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size
assert _TENSOR_MODEL_PARALLEL_GROUP is None, ("tensor model parallel group is already initialized")
for i in range(num_tensor_model_parallel_groups // num_tensor_model_parallel_groups_per_train_tp):
start = train_tp * i
end = train_tp * (i + 1)
for j in range(num_tensor_model_parallel_groups_per_train_tp):
ranks = list(range(start, end, num_tensor_model_parallel_groups_per_train_tp))
for i in range(len(ranks)):
ranks[i] += j
# group = torch.distributed.new_group(ranks)
group = new_group(rank=rank, ranks=ranks, group_type='infer_tp')
if rank in ranks:
_TENSOR_MODEL_PARALLEL_GROUP = group
ps._TENSOR_MODEL_PARALLEL_GROUP = _TENSOR_MODEL_PARALLEL_GROUP
# Build the pipeline model-parallel groups.
# global _PIPELINE_MODEL_PARALLEL_GROUP
# global _PIPELINE_GLOBAL_RANKS
# assert ps._PIPELINE_MODEL_PARALLEL_GROUP is None, ("pipeline model parallel group is already initialized")
# ps._PIPELINE_MODEL_PARALLEL_GROUP = mpu.get_pipeline_model_parallel_group()
# ps._PIPELINE_GLOBAL_RANKS = mpu.get_pipeline_model_parallel_ranks()
"""
Tensor model parallel utilities
"""
def get_tensor_model_parallel_group():
"""Get the tensor model parallel group the caller rank belongs to."""
assert _TENSOR_MODEL_PARALLEL_GROUP is not None, ("tensor model parallel group is not initialized")
return _TENSOR_MODEL_PARALLEL_GROUP
def get_tensor_model_parallel_world_size():
"""Return world size for the tensor model parallel group."""
return torch.distributed.get_world_size(group=get_tensor_model_parallel_group())
def get_tensor_model_parallel_rank():
"""Return my rank for the tensor model parallel group."""
return torch.distributed.get_rank(group=get_tensor_model_parallel_group())
def get_tensor_model_parallel_src_rank():
"""Calculate the global rank corresponding to the first local rank
in the tensor model parallel group."""
global_rank = torch.distributed.get_rank()
local_world_size = get_tensor_model_parallel_world_size()
return (global_rank // local_world_size) * local_world_size
"""
Micro Data parallel group
"""
def get_micro_data_parallel_group():
assert _MICRO_DATA_PARALLEL_GROUP is not None
return _MICRO_DATA_PARALLEL_GROUP
def get_micro_data_parallel_world_size():
return torch.distributed.get_world_size(group=get_micro_data_parallel_group())
def get_micro_data_parallel_rank():
return torch.distributed.get_rank(group=get_micro_data_parallel_group())

View File

@ -0,0 +1,72 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/transformers_utils/tokenizer_group/tokenizer_group.py
from typing import List, Optional, Tuple, Union
from transformers import (AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast)
from vllm.lora.request import LoRARequest
from vllm.utils import make_async, LRUCache
from vllm.transformers_utils.tokenizers import *
class TokenizerGroup:
"""A group of tokenizers that can be used for LoRA adapters."""
def __init__(self, tokenizer: PreTrainedTokenizer, enable_lora: bool, max_num_seqs: int,
max_input_length: Optional[int]):
self.enable_lora = enable_lora
self.max_input_length = max_input_length
self.tokenizer = tokenizer
if enable_lora:
self.lora_tokenizers = LRUCache(capacity=max_num_seqs)
else:
self.lora_tokenizers = None
def encode(self,
prompt: str,
request_id: Optional[str] = None,
lora_request: Optional[LoRARequest] = None) -> List[int]:
tokenizer = self.get_lora_tokenizer(lora_request)
return tokenizer.encode(prompt)
async def encode_async(self,
prompt: str,
request_id: Optional[str] = None,
lora_request: Optional[LoRARequest] = None) -> List[int]:
tokenizer = await self.get_lora_tokenizer_async(lora_request)
return tokenizer.encode(prompt)
def get_lora_tokenizer(self, lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer":
if not lora_request or not self.enable_lora:
return self.tokenizer
if lora_request.lora_int_id not in self.lora_tokenizers:
# TODO(sgm): the lora tokenizer is also passed, but may be different
tokenizer = self.tokenizer
# tokenizer = (get_lora_tokenizer(
# lora_request, **self.tokenizer_config) or self.tokenizer)
self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
return tokenizer
else:
return self.lora_tokenizers.get(lora_request.lora_int_id)
# FIXME(sgm): for simplicity, we assign the special token here
@property
def pad_token_id(self):
return self.tokenizer.pad_token_id
@property
def eos_token_id(self):
return self.tokenizer.eos_token_id

View File

@ -0,0 +1,95 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models
from typing import Dict
import torch
import torch.nn as nn
# NOTE(shengguangming): replace the origin weight loader function in the class
def parallel_weight_loader(self, param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
"""Parallel Linear weight loader."""
assert param.size() == loaded_weight.size(
), 'the parameter size is not align with the loaded weight size, param size: {}, loaded_weight size: {}'.format(
param.size(), loaded_weight.size())
assert param.data.dtype == loaded_weight.data.dtype, "if we want to shared weights, the data type should also be the same"
param.data = loaded_weight.data
def default_weight_loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
"""Default weight loader."""
assert param.size() == loaded_weight.size()
assert param.data.dtype == loaded_weight.data.dtype, "if we want to shared weights, the data type should also be the same"
param.data = loaded_weight.data
def gpt2_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
for name, loaded_weight in actor_weights.items():
if "lm_head.weight" in name:
# GPT-2 ties the weights of the embedding layer and the final
# linear layer.
continue
if ".attn.bias" in name or ".attn.masked_bias" in name:
# Skip attention mask.
# NOTE: "c_attn.bias" should not be skipped.
continue
if not name.startswith("transformer."):
name = "transformer." + name
param = params_dict[name]
# The HF's GPT-2 implementation uses Conv1D instead of Linear.
# Because of this, we need to transpose the weights.
# Note(zhuohan): the logic below might break quantized models.
for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]:
if conv1d_weight_name not in name:
continue
if not name.endswith(".weight"):
continue
# TODO: check megatron
loaded_weight = loaded_weight.t()
weight_loader = getattr(param, "weight_loader", default_weight_loader)
weight_loader(param, loaded_weight)
def llama_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
# NOTE(shengguangming): the megatron llama may have this prefix
prefix = '0.module.module.'
params_dict = dict(vllm_model.named_parameters())
for name, loaded_weight in actor_weights.items():
if name[:len(prefix)] == prefix:
name = name[len(prefix):]
if "rotary_emb.inv_freq" in name:
continue
else:
param = params_dict[name]
weight_loader = getattr(param, "weight_loader", default_weight_loader)
weight_loader(param, loaded_weight)
def mistral_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
# TODO: need to implement a general way to deal with prefix
prefix = '0.module.module.'
params_dict = dict(vllm_model.named_parameters())
for name, loaded_weight in actor_weights.items():
if name[:len(prefix)] == prefix:
name = name[len(prefix):]
if "rotary_emb.inv_freq" in name:
continue
else:
param = params_dict[name]
weight_loader = getattr(param, "weight_loader", default_weight_loader)
weight_loader(param, loaded_weight)

View File

@ -0,0 +1,314 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/worker/worker.py
"""A GPU worker class."""
import os
import gc
from typing import Dict, List, Tuple, Optional, Union, Set
import torch
import torch.distributed
import torch.nn as nn
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, ParallelConfig, SchedulerConfig, LoRAConfig)
from vllm.model_executor import InputMetadata, set_random_seed
from vllm.model_executor.parallel_utils.parallel_state import (initialize_model_parallel)
from vllm.sampling_params import SamplingParams, SamplingType
from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
from vllm.worker.cache_engine import CacheEngine
from vllm.model_executor.parallel_utils.custom_all_reduce import init_custom_ar
from vllm.model_executor.parallel_utils.parallel_state import get_tensor_model_parallel_group
from .model_runner import ModelRunner
from .model_loader import load_weights
from .parallel_state import initialize_model_parallel_from_megatron
from vllm.lora.request import LoRARequest
class Worker:
"""A worker class that executes (a partition of) the model on a GPU.
Each worker is associated with a single GPU. The worker is responsible for
maintaining the KV cache and executing the model on the GPU. In case of
distributed inference, each worker is assigned a partition of the model.
"""
def __init__(
self,
model: Union[nn.Module, Dict], # model itself or its parameter dict
model_config: ModelConfig,
parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
device_config: DeviceConfig,
rank: Optional[int] = None,
distributed_init_method: Optional[str] = None,
lora_config: Optional[LoRAConfig] = None,
kv_cache_dtype: Optional[str] = "auto",
) -> None:
# self.model = model # will be replaced in the init_model
self.model_config = model_config
self.parallel_config = parallel_config
self.scheduler_config = scheduler_config
self.rank = rank
self.distributed_init_method = distributed_init_method
self.lora_config = lora_config
self.model_runner = ModelRunner(
model,
model_config,
parallel_config,
scheduler_config,
device_config,
lora_config=self.lora_config,
kv_cache_dtype=kv_cache_dtype,
)
# Uninitialized cache engine. Will be initialized by
# self.init_cache_engine().
self.cache_config = None
self.block_size = None
self.sliding_window = None
self.cache_engine = None
self.cache_events = None
self.gpu_cache = None
# For offloading inference engine params
self.cpu_model = None
def init_model(self, cupy_port: Optional[int] = None):
# torch.distributed.all_reduce does not free the input tensor until
# the synchronization point. This causes the memory usage to grow
# as the number of all_reduce calls increases. This env var disables
# this behavior.
# Related issue:
# https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
# Env vars will be set by TORCHRUN.
self.rank = self.rank if self.rank is not None else int(os.getenv("RANK", "-1"))
local_rank = int(os.getenv("LOCAL_RANK", "0"))
self.device = torch.device(f"cuda:{local_rank}")
if self.rank < 0:
raise ValueError("Invalid or unspecified rank.")
torch.cuda.set_device(self.device)
_check_if_gpu_supports_dtype(self.model_config.dtype)
# Initialize the distributed environment.
# TODO: do not use cupy
_init_distributed_environment(self.parallel_config, self.rank, self.distributed_init_method)
if not self.parallel_config.disable_custom_all_reduce:
init_custom_ar()
# Initialize the model.
set_random_seed(self.model_config.seed)
# self.model = get_model(actor_model=self.model, model_config=self.model_config)
def load_model(self):
self.model_runner.load_model()
@torch.inference_mode()
def profile_num_available_blocks(
self,
block_size: int,
gpu_memory_utilization: float,
cpu_swap_space: int,
cache_dtype: str,
) -> Tuple[int, int]:
# Profile the memory usage of the model and get the maximum number of
# cache blocks that can be allocated with the remaining free memory.
torch.cuda.empty_cache()
# torch.cuda.reset_peak_memory_stats()
# Execute a forward pass with dummy inputs to profile the memory usage
# of the model.
self.model_runner.profile_run()
# Calculate the number of blocks that can be allocated with the
# profiled peak memory.
torch.cuda.synchronize()
free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
peak_memory = total_gpu_memory - free_gpu_memory
cache_block_size = CacheEngine.get_cache_block_size(block_size, cache_dtype, self.model_config,
self.parallel_config)
# NOTE(sgm) use the remaining memory
num_gpu_blocks = int((free_gpu_memory * gpu_memory_utilization) // cache_block_size)
# num_gpu_blocks = int((total_gpu_memory * gpu_memory_utilization - peak_memory) // cache_block_size)
num_cpu_blocks = int(cpu_swap_space // cache_block_size)
num_gpu_blocks = max(num_gpu_blocks, 0)
num_cpu_blocks = max(num_cpu_blocks, 0)
if self.model_runner.lora_manager:
self.model_runner.remove_all_loras()
gc.collect()
torch.cuda.empty_cache()
# Synchronize number of blocks with all the rank
num_gpu_blocks = torch.tensor([num_gpu_blocks], device='cuda')
num_cpu_blocks = torch.tensor([num_cpu_blocks], device='cuda')
torch.distributed.all_reduce(num_gpu_blocks,
op=torch.distributed.ReduceOp.MIN,
group=get_tensor_model_parallel_group())
torch.distributed.all_reduce(num_cpu_blocks,
op=torch.distributed.ReduceOp.MIN,
group=get_tensor_model_parallel_group())
num_gpu_blocks = num_gpu_blocks.item()
num_cpu_blocks = num_cpu_blocks.item()
return num_gpu_blocks, num_cpu_blocks
def init_cache_engine(self, cache_config: CacheConfig) -> None:
if self.cache_engine is None and self.gpu_cache is None:
self.cache_config = cache_config
self.cache_engine = CacheEngine(self.cache_config, self.model_config, self.parallel_config)
self.cache_events = self.cache_engine.events
self.gpu_cache = self.cache_engine.gpu_cache
self.model_runner.set_block_size(self.cache_engine.block_size)
def free_cache_engine(self):
# ensure `enforce_eager=True`
self.cache_engine = None
self.gpu_cache = None
def warm_up_model(self) -> None:
if not self.model_config.enforce_eager:
self.model_runner.capture_model(self.gpu_cache)
# Reset the seed to ensure that the random state is not affected by
# the model initialization and profiling.
set_random_seed(self.model_config.seed)
def cache_swap(
self,
blocks_to_swap_in: Dict[int, int],
blocks_to_swap_out: Dict[int, int],
blocks_to_copy: Dict[int, List[int]],
) -> None:
# Issue cache operations.
issued_cache_op = False
if blocks_to_swap_in:
self.cache_engine.swap_in(blocks_to_swap_in)
issued_cache_op = True
if blocks_to_swap_out:
self.cache_engine.swap_out(blocks_to_swap_out)
issued_cache_op = True
if blocks_to_copy:
self.cache_engine.copy(blocks_to_copy)
issued_cache_op = True
cache_events = self.cache_events if issued_cache_op else None
# Wait for cache operations to finish.
# TODO(woosuk): Profile swapping overhead and optimize if needed.
if cache_events is not None:
for event in cache_events:
event.wait()
@torch.inference_mode()
def execute_model(
self,
seq_group_metadata_list: List[SequenceGroupMetadata],
blocks_to_swap_in: Dict[int, int],
blocks_to_swap_out: Dict[int, int],
blocks_to_copy: Dict[int, List[int]],
) -> SamplerOutput:
num_seq_groups = len(seq_group_metadata_list)
self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy)
# If there is no input, we don't need to execute the model.
if num_seq_groups == 0:
return {}
output = self.model_runner.execute_model(seq_group_metadata_list, self.gpu_cache)
return output
# # Prepare input tensors.
# # NOTE(shengguangming): currently we pad in our dataloader and unpad it in pre_process_input, j
# # we can just input un-padded sequence for better performance
# input_tokens, input_positions, input_metadata = self._prepare_inputs(seq_group_metadata_list)
# # Execute the model.
# output = self.model(
# input_ids=input_tokens,
# positions=input_positions,
# kv_caches=self.gpu_cache,
# input_metadata=input_metadata,
# cache_events=cache_events,
# )
# return output
# assume the input is .state_dict()
def sync_model_weights(self, actor_weights: Dict):
load_weights(actor_weights, self.model_runner.model)
def offload_model_weights(self) -> None:
if self.cpu_model == None:
self.cpu_model = {}
for name, params in self.model_runner.model.named_parameters():
self.cpu_model[name] = torch.empty_like(params, device='cpu')
params.data = self.cpu_model[name]
else:
for name, params in self.model_runner.model.named_parameters():
params.data = self.cpu_model[name]
def add_lora(self, lora_request: LoRARequest) -> bool:
return self.model_runner.add_lora(lora_request)
def remove_lora(self, lora_id: int) -> bool:
return self.model_runner.remove_lora(lora_id)
def list_loras(self) -> Set[int]:
return self.model_runner.list_loras()
def _init_distributed_environment(
parallel_config: ParallelConfig,
rank: int,
distributed_init_method: Optional[str] = None,
) -> None:
"""Initialize the distributed environment."""
if torch.distributed.is_initialized():
print('The distributed environment has been initialized before vLLM')
elif not distributed_init_method:
raise ValueError("distributed_init_method must be set if torch.distributed "
"is not already initialized")
else:
torch.distributed.init_process_group(
backend="nccl",
world_size=parallel_config.world_size,
rank=rank,
# init_method=distributed_init_method,
)
# A small all_reduce for warmup.
torch.distributed.all_reduce(torch.zeros(1).cuda())
# TODO (shengguangming): maybe we should also flag the megatron is initialized
if torch.distributed.get_world_size() > 1:
initialize_model_parallel_from_megatron(tensor_model_parallel_size=parallel_config.tensor_parallel_size)
else:
initialize_model_parallel()
def _pad_to_alignment(x: List[int], multiple_of: int, pad: int) -> List[int]:
return x + [pad] * ((-len(x)) % multiple_of)
def _pad_to_max(x: List[int], max_len: int, pad: int) -> List[int]:
return x + [pad] * (max_len - len(x))
def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
# Check if the GPU supports the dtype.
if torch_dtype == torch.bfloat16:
compute_capability = torch.cuda.get_device_capability()
if compute_capability[0] < 8:
gpu_name = torch.cuda.get_device_name()
raise ValueError("Bfloat16 is only supported on GPUs with compute capability "
f"of at least 8.0. Your {gpu_name} GPU has compute capability "
f"{compute_capability[0]}.{compute_capability[1]}.")

View File

@ -0,0 +1,13 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -0,0 +1,320 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/engine/arg_utils.py
import os
import argparse
import dataclasses
from dataclasses import dataclass
from typing import List, Optional, Union
import torch.nn as nn
from transformers import PretrainedConfig
from .config import ModelConfig, LoadConfig
from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, EngineConfig, LoRAConfig, ParallelConfig,
SchedulerConfig, SpeculativeConfig, TokenizerPoolConfig, VisionLanguageConfig)
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.utils import str_to_int_tuple
def nullable_str(val: str):
if not val or val == "None":
return None
return val
@dataclass
class EngineArgs:
"""Arguments for vLLM engine."""
model_hf_config: PretrainedConfig = None
skip_tokenizer_init: bool = False
served_model_name: Optional[Union[str, List[str]]] = None # TODO
download_dir: Optional[str] = None
load_format: str = 'auto'
dtype: str = 'auto'
kv_cache_dtype: str = 'auto'
quantization_param_path: Optional[str] = None
seed: int = 0
max_model_len: Optional[int] = None
worker_use_ray: bool = False
pipeline_parallel_size: int = 1
tensor_parallel_size: int = 1
max_parallel_loading_workers: Optional[int] = None
block_size: int = 16
enable_prefix_caching: bool = False
use_v2_block_manager: bool = False
swap_space: int = 4 # GiB
gpu_memory_utilization: float = 0.90
max_num_batched_tokens: Optional[int] = None
max_num_seqs: int = 256
max_logprobs: int = 5 # OpenAI default value
disable_log_stats: bool = False
revision: Optional[str] = None
code_revision: Optional[str] = None
tokenizer_revision: Optional[str] = None
quantization: Optional[str] = None
enforce_eager: bool = False
max_context_len_to_capture: Optional[int] = None
max_seq_len_to_capture: int = 8192
disable_custom_all_reduce: bool = False
tokenizer_pool_size: int = 0
tokenizer_pool_type: str = "ray"
tokenizer_pool_extra_config: Optional[dict] = None
enable_lora: bool = False
max_loras: int = 1
max_lora_rank: int = 16
fully_sharded_loras: bool = False
lora_extra_vocab_size: int = 256
lora_dtype = 'auto'
max_cpu_loras: Optional[int] = None
device: str = 'auto'
ray_workers_use_nsight: bool = False
num_gpu_blocks_override: Optional[int] = None
num_lookahead_slots: int = 0
model_loader_extra_config: Optional[dict] = None
# Related to Vision-language models such as llava
image_input_type: Optional[str] = None
image_token_id: Optional[int] = None
image_input_shape: Optional[str] = None
image_feature_size: Optional[int] = None
scheduler_delay_factor: float = 0.0
enable_chunked_prefill: bool = False
guided_decoding_backend: str = 'outlines'
# Speculative decoding configuration.
speculative_model: Optional[str] = None
num_speculative_tokens: Optional[int] = None
speculative_max_model_len: Optional[int] = None
ngram_prompt_lookup_max: Optional[int] = None
ngram_prompt_lookup_min: Optional[int] = None
@staticmethod
def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
"""Shared CLI arguments for vLLM engine."""
# Model arguments
# TODO(shengguangming): delete the unused args
parser.add_argument('--model',
type=str,
default='facebook/opt-125m',
help='name or path of the huggingface model to use')
parser.add_argument('--tokenizer',
type=str,
default=EngineArgs.tokenizer,
help='name or path of the huggingface tokenizer to use')
parser.add_argument('--revision',
type=str,
default=None,
help='the specific model version to use. It can be a branch '
'name, a tag name, or a commit id. If unspecified, will use '
'the default version.')
parser.add_argument('--tokenizer-revision',
type=str,
default=None,
help='the specific tokenizer version to use. It can be a branch '
'name, a tag name, or a commit id. If unspecified, will use '
'the default version.')
parser.add_argument('--tokenizer-mode',
type=str,
default=EngineArgs.tokenizer_mode,
choices=['auto', 'slow'],
help='tokenizer mode. "auto" will use the fast '
'tokenizer if available, and "slow" will '
'always use the slow tokenizer.')
parser.add_argument('--trust-remote-code', action='store_true', help='trust remote code from huggingface')
parser.add_argument('--download-dir',
type=str,
default=EngineArgs.download_dir,
help='directory to download and load the weights, '
'default to the default cache dir of '
'huggingface')
parser.add_argument('--load-format',
type=str,
default=EngineArgs.load_format,
choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'],
help='The format of the model weights to load. '
'"auto" will try to load the weights in the safetensors format '
'and fall back to the pytorch bin format if safetensors format '
'is not available. '
'"pt" will load the weights in the pytorch bin format. '
'"safetensors" will load the weights in the safetensors format. '
'"npcache" will load the weights in pytorch format and store '
'a numpy cache to speed up the loading. '
'"dummy" will initialize the weights with random values, '
'which is mainly for profiling.')
parser.add_argument('--dtype',
type=str,
default=EngineArgs.dtype,
choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
help='data type for model weights and activations. '
'The "auto" option will use FP16 precision '
'for FP32 and FP16 models, and BF16 precision '
'for BF16 models.')
parser.add_argument('--max-model-len',
type=int,
default=None,
help='model context length. If unspecified, '
'will be automatically derived from the model.')
# Parallel arguments
parser.add_argument('--worker-use-ray',
action='store_true',
help='use Ray for distributed serving, will be '
'automatically set when using more than 1 GPU')
parser.add_argument('--pipeline-parallel-size',
'-pp',
type=int,
default=EngineArgs.pipeline_parallel_size,
help='number of pipeline stages')
parser.add_argument('--tensor-parallel-size',
'-tp',
type=int,
default=EngineArgs.tensor_parallel_size,
help='number of tensor parallel replicas')
# KV cache arguments
parser.add_argument('--block-size',
type=int,
default=EngineArgs.block_size,
choices=[8, 16, 32],
help='token block size')
# TODO(woosuk): Support fine-grained seeds (e.g., seed per request).
parser.add_argument('--seed', type=int, default=EngineArgs.seed, help='random seed')
parser.add_argument('--swap-space',
type=int,
default=EngineArgs.swap_space,
help='CPU swap space size (GiB) per GPU')
parser.add_argument('--gpu-memory-utilization',
type=float,
default=EngineArgs.gpu_memory_utilization,
help='the percentage of GPU memory to be used for'
'the model executor')
parser.add_argument('--max-num-batched-tokens',
type=int,
default=EngineArgs.max_num_batched_tokens,
help='maximum number of batched tokens per '
'iteration')
parser.add_argument('--max-num-seqs',
type=int,
default=EngineArgs.max_num_seqs,
help='maximum number of sequences per iteration')
parser.add_argument('--disable-log-stats', action='store_true', help='disable logging statistics')
# Quantization settings.
parser.add_argument('--quantization',
'-q',
type=str,
choices=['awq', None],
default=None,
help='Method used to quantize the weights')
return parser
@classmethod
def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
# Get the list of attributes of this dataclass.
attrs = [attr.name for attr in dataclasses.fields(cls)]
# Set the attributes from the parsed arguments.
engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
return engine_args
def create_engine_config(
self,
) -> EngineConfig:
device_config = DeviceConfig(self.device)
# NOTE(sgm): we only modify ModelConfig, other configs are import from vllm
model_config = ModelConfig(self.model_hf_config, self.dtype, self.seed, self.revision, self.code_revision,
self.tokenizer_revision, self.max_model_len, self.quantization,
self.quantization_param_path, self.enforce_eager, self.max_context_len_to_capture,
self.max_seq_len_to_capture, self.max_logprobs, self.skip_tokenizer_init,
self.served_model_name)
cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization,
self.swap_space, self.kv_cache_dtype, self.num_gpu_blocks_override,
model_config.get_sliding_window(), self.enable_prefix_caching)
parallel_config = ParallelConfig(
self.pipeline_parallel_size, self.tensor_parallel_size, self.worker_use_ray,
self.max_parallel_loading_workers, self.disable_custom_all_reduce,
TokenizerPoolConfig.create_config(
self.tokenizer_pool_size,
self.tokenizer_pool_type,
self.tokenizer_pool_extra_config,
), self.ray_workers_use_nsight)
# Use the world_size set by TORCHRUN
world_size = int(os.getenv("WORLD_SIZE", "-1"))
assert world_size != -1, "The world_size is set to -1, not initialized by TORCHRUN"
parallel_config.world_size = world_size
# TODO: spec config
speculative_config = SpeculativeConfig.maybe_create_spec_config(
target_model_config=model_config,
target_parallel_config=parallel_config,
target_dtype=self.dtype,
speculative_model=self.speculative_model,
num_speculative_tokens=self.num_speculative_tokens,
speculative_max_model_len=self.speculative_max_model_len,
enable_chunked_prefill=self.enable_chunked_prefill,
use_v2_block_manager=self.use_v2_block_manager,
ngram_prompt_lookup_max=self.ngram_prompt_lookup_max,
ngram_prompt_lookup_min=self.ngram_prompt_lookup_min,
)
scheduler_config = SchedulerConfig(
self.max_num_batched_tokens,
self.max_num_seqs,
model_config.max_model_len,
self.use_v2_block_manager,
num_lookahead_slots=(self.num_lookahead_slots
if speculative_config is None else speculative_config.num_lookahead_slots),
delay_factor=self.scheduler_delay_factor,
enable_chunked_prefill=self.enable_chunked_prefill,
)
lora_config = LoRAConfig(max_lora_rank=self.max_lora_rank,
max_loras=self.max_loras,
fully_sharded_loras=self.fully_sharded_loras,
lora_extra_vocab_size=self.lora_extra_vocab_size,
lora_dtype=self.lora_dtype,
max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras and self.max_cpu_loras > 0 else
None) if self.enable_lora else None
load_config = LoadConfig(
load_format=self.load_format,
download_dir=self.download_dir,
model_loader_extra_config=self.model_loader_extra_config,
)
if self.image_input_type:
if (not self.image_token_id or not self.image_input_shape or not self.image_feature_size):
raise ValueError('Specify `image_token_id`, `image_input_shape` and '
'`image_feature_size` together with `image_input_type`.')
vision_language_config = VisionLanguageConfig(
image_input_type=VisionLanguageConfig.get_image_input_enum_type(self.image_input_type),
image_token_id=self.image_token_id,
image_input_shape=str_to_int_tuple(self.image_input_shape),
image_feature_size=self.image_feature_size,
)
else:
vision_language_config = None
decoding_config = DecodingConfig(guided_decoding_backend=self.guided_decoding_backend)
return EngineConfig(model_config=model_config,
cache_config=cache_config,
parallel_config=parallel_config,
scheduler_config=scheduler_config,
device_config=device_config,
lora_config=lora_config,
vision_language_config=vision_language_config,
speculative_config=speculative_config,
load_config=load_config,
decoding_config=decoding_config)

View File

@ -0,0 +1,201 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py
import enum
import json
from typing import List, Optional, Union
from dataclasses import dataclass, field, fields
from transformers import PretrainedConfig
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import get_quantization_config
from vllm.transformers_utils.config import get_hf_text_config
from vllm.utils import is_hip
# Add for verl
from vllm.config import ModelConfig, _get_and_verify_dtype, _get_and_verify_max_len
GPTQMarlinConfig = get_quantization_config("gptq_marlin")
logger = init_logger(__name__)
_GB = 1 << 30
class ModelConfig(ModelConfig):
"""Configuration for the model.
Args:
model: Name or path of the huggingface model to use.
tokenizer: Name or path of the huggingface tokenizer to use.
tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if
available, and "slow" will always use the slow tokenizer.
trust_remote_code: Trust remote code (e.g., from HuggingFace) when
downloading the model and tokenizer.
download_dir: Directory to download and load the weights, default to the
default cache directory of huggingface.
load_format: The format of the model weights to load:
"auto" will try to load the weights in the safetensors format and
fall back to the pytorch bin format if safetensors format is
not available.
"pt" will load the weights in the pytorch bin format.
"safetensors" will load the weights in the safetensors format.
"npcache" will load the weights in pytorch format and store
a numpy cache to speed up the loading.
"dummy" will initialize the weights with random values, which is
mainly for profiling.
dtype: Data type for model weights and activations. The "auto" option
will use FP16 precision for FP32 and FP16 models, and BF16 precision
for BF16 models.
seed: Random seed for reproducibility.
revision: The specific model version to use. It can be a branch name,
a tag name, or a commit id. If unspecified, will use the default
version.
code_revision: The specific revision to use for the model code on
Hugging Face Hub. It can be a branch name, a tag name, or a
commit id. If unspecified, will use the default version.
tokenizer_revision: The specific tokenizer version to use. It can be a
branch name, a tag name, or a commit id. If unspecified, will use
the default version.
max_model_len: Maximum length of a sequence (including prompt and
output). If None, will be derived from the model.
quantization: Quantization method that was used to quantize the model
weights. If None, we assume the model weights are not quantized.
quantization_param_path: Path to JSON file containing scaling factors.
Used to load KV cache scaling factors into the model when KV cache
type is FP8_E4M3 on ROCm (AMD GPU). In the future these will also
be used to load activation and weight scaling factors when the
model dtype is FP8_E4M3 on ROCm.
enforce_eager: Whether to enforce eager execution. If True, we will
disable CUDA graph and always execute the model in eager mode.
If False, we will use CUDA graph and eager execution in hybrid.
max_context_len_to_capture: Maximum context len covered by CUDA graphs.
When a sequence has context length larger than this, we fall back
to eager mode (DEPRECATED. Use max_seq_len_to_capture instead).
max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs.
When a sequence has context length larger than this, we fall back
to eager mode
skip_tokenizer_init: If true, skip initialization of tokenizer and
detokenizer.
served_model_name: The model name used in metrics tag `model_name`,
matches the model name exposed via the APIs. If multiple model
names provided, the first name will be used. If not specified,
the model name will be the same as `model`.
"""
def __init__(
self,
hf_config: PretrainedConfig,
dtype: str,
seed: int,
revision: Optional[str] = None,
code_revision: Optional[str] = None,
tokenizer_revision: Optional[str] = None,
max_model_len: Optional[int] = None,
quantization: Optional[str] = None,
quantization_param_path: Optional[str] = None,
enforce_eager: bool = False,
max_context_len_to_capture: Optional[int] = None,
max_seq_len_to_capture: Optional[int] = None,
max_logprobs: int = 5,
skip_tokenizer_init: bool = False,
served_model_name: Optional[Union[str, List[str]]] = None,
) -> None:
self.model = hf_config._name_or_path
self.tokenizer = hf_config._name_or_path
self.seed = seed
self.revision = revision
self.code_revision = code_revision
self.tokenizer_revision = tokenizer_revision
self.quantization = quantization
self.quantization_param_path = quantization_param_path
self.enforce_eager = enforce_eager
self.max_context_len_to_capture = max_context_len_to_capture
if self.max_context_len_to_capture is not None:
raise ValueError("`max_context_len_to_capture` is deprecated. "
"Use `max_seq_len_to_capture` instead.")
self.max_seq_len_to_capture = (max_seq_len_to_capture or max_context_len_to_capture)
self.max_logprobs = max_logprobs
self.skip_tokenizer_init = skip_tokenizer_init
# self.hf_config = get_config(model, trust_remote_code, revision)
self.hf_config = hf_config
self.hf_text_config = get_hf_text_config(hf_config)
# TODO: for multimodal model
self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
self.max_model_len = _get_and_verify_max_len(self.hf_config, max_model_len)
# self.served_model_name = get_served_model_name(model,
# served_model_name)
# self._verify_load_format()
# self._verify_tokenizer_mode()
self._verify_quantization()
self._verify_cuda_graph()
class LoadFormat(str, enum.Enum):
AUTO = 'auto'
MEGATRON = "megatron"
HF = "hf"
DTENSOR = 'dtensor'
DUMMY_HF = 'dummy_hf'
DUMMY_MEGATRON = 'dummy_megatron'
DUMMY_DTENSOR = 'dummy_dtensor'
@dataclass
class LoadConfig:
"""
download_dir: Directory to download and load the weights, default to the
default cache directory of huggingface.
load_format: The format of the model weights to load:
"auto" will try to load the weights in the safetensors format and
fall back to the pytorch bin format if safetensors format is
not available.
"pt" will load the weights in the pytorch bin format.
"safetensors" will load the weights in the safetensors format.
"npcache" will load the weights in pytorch format and store
a numpy cache to speed up the loading.
"dummy" will initialize the weights with random values, which is
mainly for profiling.
"tensorizer" will use CoreWeave's tensorizer library for
fast weight loading.
"""
load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO
download_dir: Optional[str] = None
model_loader_extra_config: Optional[Union[str, dict]] = field(default_factory=dict)
def __post_init__(self):
model_loader_extra_config = self.model_loader_extra_config or {}
if isinstance(model_loader_extra_config, str):
self.model_loader_extra_config = json.loads(model_loader_extra_config)
self._verify_load_format()
def _verify_load_format(self) -> None:
if not isinstance(self.load_format, str):
return
load_format = self.load_format.lower()
self.load_format = LoadFormat(load_format)
rocm_not_supported_load_format: List[str] = []
if is_hip() and load_format in rocm_not_supported_load_format:
rocm_supported_load_format = [
f for f in LoadFormat.__members__ if (f not in rocm_not_supported_load_format)
]
raise ValueError(f"load format '{load_format}' is not supported in ROCm. "
f"Supported load formats are "
f"{rocm_supported_load_format}")

View File

@ -0,0 +1,269 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models
from typing import Dict, Iterable, Tuple
import torch
import torch.nn as nn
from torch.distributed._tensor import DTensor, Shard, Replicate
from vllm.model_executor.layers.linear import *
from vllm.model_executor.models import ModelRegistry
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
def gemma_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("qkv_proj", "q_proj", "q"),
("qkv_proj", "k_proj", "k"),
("qkv_proj", "v_proj", "v"),
("gate_up_proj", "gate_proj", 0),
("gate_up_proj", "up_proj", 1),
]
params_dict = dict(vllm_model.named_parameters())
for name, loaded_weight in actor_weights.items():
for (param_name, shard_name, shard_id) in stacked_params_mapping:
if shard_name not in name:
continue
stacked_name = name.replace(shard_name, param_name)
# Skip loading extra bias for GPTQ models.
if stacked_name.endswith(".bias") and stacked_name not in params_dict:
continue
local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
param = params_dict[stacked_name]
weight_loader = getattr(param, "weight_loader", default_weight_loader)
weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
break
else:
# lm_head is not used in vllm as it is tied with embed_token.
# To prevent errors, skip loading lm_head.weight.
if "lm_head.weight" in name:
continue
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
# GemmaRMSNorm is different from Llama's in that it multiplies
# (1 + weight) to the output, instead of just weight.
if "norm.weight" in name:
local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
norm_weight = local_loaded_weight + 1.0
param = params_dict[name]
weight_loader = getattr(param, "weight_loader", default_weight_loader)
weight_loader(param, norm_weight.to(dtype=param.dtype))
else:
local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
param = params_dict[name]
weight_loader = getattr(param, "weight_loader", default_weight_loader)
weight_loader(param, local_loaded_weight.to(dtype=param.dtype))
def gptbigcode_dtensor_load_weights(actor_weights: Dict, vllm_model: nn.Module):
params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
for name, loaded_weight in actor_weights.items():
if "lm_head.weight" in name:
continue
if ".attn.bias" in name:
# Skip attention mask.
# NOTE: "c_attn.bias" should not be skipped.
continue
local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
param = params_dict[name]
weight_loader = getattr(param, "weight_loader", default_weight_loader)
weight_loader(param, local_loaded_weight.to(dtype=param.dtype))
def starcoder2_dtensor_load_weights(actor_weights: Dict, vllm_model: nn.Module):
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("qkv_proj", "q_proj", "q"),
("qkv_proj", "k_proj", "k"),
("qkv_proj", "v_proj", "v"),
]
params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
for name, loaded_weight in actor_weights.items():
if "rotary_emb.inv_freq" in name:
continue
for (param_name, weight_name, shard_id) in stacked_params_mapping:
if weight_name not in name:
continue
name = name.replace(weight_name, param_name)
local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
param = params_dict[name]
weight_loader = param.weight_loader
weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
break
else:
if vllm_model.config.tie_word_embeddings and "lm_head.weight" in name:
continue
param = params_dict[name]
local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
weight_loader = getattr(param, "weight_loader", default_weight_loader)
weight_loader(param, local_loaded_weight.to(dtype=param.dtype))
def llama_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
(".qkv_proj", ".q_proj", "q"),
(".qkv_proj", ".k_proj", "k"),
(".qkv_proj", ".v_proj", "v"),
(".gate_up_proj", ".gate_proj", 0),
(".gate_up_proj", ".up_proj", 1),
]
params_dict = dict(vllm_model.named_parameters())
for name, loaded_weight in actor_weights.items():
if "rotary_emb.inv_freq" in name:
continue
if ("rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name):
# Models trained using ColossalAI may include these tensors in
# the checkpoint. Skip them.
continue
# With tie_word_embeddings, we can skip lm_head.weight
# The weight might appear unnecessarily in the files if the model is
# processed with quantization, LoRA, fine-tuning, etc.
if vllm_model.config.tie_word_embeddings and "lm_head.weight" in name:
continue
for (param_name, weight_name, shard_id) in stacked_params_mapping:
if weight_name not in name:
continue
name = name.replace(weight_name, param_name)
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
param = params_dict[name]
weight_loader = param.weight_loader
weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
break
else:
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
param = params_dict[name]
weight_loader = getattr(param, "weight_loader", default_weight_loader)
weight_loader(param, local_loaded_weight)
def qwen2_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("qkv_proj", "q_proj", "q"),
("qkv_proj", "k_proj", "k"),
("qkv_proj", "v_proj", "v"),
("gate_up_proj", "gate_proj", 0),
("gate_up_proj", "up_proj", 1),
]
params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
for name, loaded_weight in actor_weights.items():
if "rotary_emb.inv_freq" in name:
continue
if vllm_model.config.tie_word_embeddings and "lm_head.weight" in name:
continue
for (param_name, weight_name, shard_id) in stacked_params_mapping:
if weight_name not in name:
continue
name = name.replace(weight_name, param_name)
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
param = params_dict[name]
weight_loader = param.weight_loader
weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
break
else:
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
param = params_dict[name]
local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
weight_loader = getattr(param, "weight_loader", default_weight_loader)
weight_loader(param, local_loaded_weight.to(dtype=param.dtype))
def gpt2_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
pass
def redistribute_dtensor(param_name: str, loaded_weights: DTensor, parallelize_plan: Dict = None):
param_name = _process_parameter_names(name=param_name)
if parallelize_plan is not None:
assert param_name in parallelize_plan.keys(), \
f"param name: {param_name} not in parallelize_plan :{parallelize_plan.keys()}"
placement = parallelize_plan[param_name]
local_loaded_weights = loaded_weights.redistribute(device_mesh=loaded_weights.device_mesh,
placements=placement).to_local()
else:
local_loaded_weights = loaded_weights.full_tensor()
return local_loaded_weights
def _process_parameter_names(name):
# Remove '.weight' if it exists at the end of the string
if name.endswith(".weight"):
name = name[:-7]
# Remove 'model.layers.x.' or 'model.' prefix
if "model.layers" in name:
parts = name.split('.')
# Reconstruct the string without 'model.layers.x.'
name = '.'.join(parts[3:]) # parts[0] is 'model', parts[1] is 'layers', parts[2] is 'x'
elif name.startswith("model."):
name = name[6:] # Remove 'model.'
return name
__MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__ = {
'GPT2LMHeadModel': gpt2_dtensor_weight_loader,
'LlamaForCausalLM': llama_dtensor_weight_loader,
'LLaMAForCausalLM': llama_dtensor_weight_loader,
'MistralForCausalLM': llama_dtensor_weight_loader, # mistral is the same as llama in vLLM
'InternLMForCausalLM': llama_dtensor_weight_loader,
'AquilaModel': llama_dtensor_weight_loader,
'AquilaForCausalLM': llama_dtensor_weight_loader,
'Phi3ForCausalLM': llama_dtensor_weight_loader,
'GemmaForCausalLM': gemma_dtensor_weight_loader,
'GPTBigCodeForCausalLM': gptbigcode_dtensor_load_weights,
'Starcoder2ForCausalLM': starcoder2_dtensor_load_weights,
'Qwen2ForCausalLM': qwen2_dtensor_weight_loader
}
# the actor model is .state_dict()
# Load dtensor weights
def load_dtensor_weights(actor_weights: Dict, vllm_model: nn.Module):
weight_loader = _get_model_weight_loader(vllm_model.__class__.__name__)
weight_loader(actor_weights, vllm_model)
# NOTE(sgm) to reduce peak memory usage, we offload vllm model to cpu
# after init, and we need this after sync model weights for in first iter.
vllm_model = vllm_model.cuda()
def _get_model_weight_loader(arch: str):
if arch in __MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__:
return __MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__[arch]
raise ValueError(f"Model architectures {arch} are not supported for now. "
f"Supported architectures: {__MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__.keys()}")
# NOTE(sgm): we use per-parameter weight loader in each vllm sub
def update_dtensor_weight_loader():
pass

View File

@ -0,0 +1,91 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models
from typing import Dict, Union, Optional, Iterable, Tuple
import torch
import torch.nn as nn
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
def update_hf_weight_loader():
from vllm.model_executor.models.gemma import GemmaForCausalLM
GemmaForCausalLM.load_weights = gemma_load_weights
def gemma_load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("qkv_proj", "q_proj", "q"),
("qkv_proj", "k_proj", "k"),
("qkv_proj", "v_proj", "v"),
("gate_up_proj", "gate_proj", 0),
("gate_up_proj", "up_proj", 1),
]
params_dict = dict(self.named_parameters())
loaded_params = set()
for name, loaded_weight in weights:
for (param_name, shard_name, shard_id) in stacked_params_mapping:
if shard_name not in name:
continue
name = name.replace(shard_name, param_name)
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
param = params_dict[name]
weight_loader = param.weight_loader
weight_loader(param, loaded_weight, shard_id)
break
else:
# lm_head is not used in vllm as it is tied with embed_token.
# To prevent errors, skip loading lm_head.weight.
if "lm_head.weight" in name:
continue
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
# GemmaRMSNorm is different from Llama's in that it multiplies
# (1 + weight) to the output, instead of just weight.
if "norm.weight" in name:
norm_weight = loaded_weight + 1.0 # prevent inplace modify actor weights
param = params_dict[name]
weight_loader = getattr(param, "weight_loader", default_weight_loader)
weight_loader(param, norm_weight)
else:
param = params_dict[name]
weight_loader = getattr(param, "weight_loader", default_weight_loader)
weight_loader(param, loaded_weight)
loaded_params.add(name)
unloaded_params = params_dict.keys() - loaded_params
if unloaded_params:
raise RuntimeError("Some weights are not initialized from checkpoints: "
f"{unloaded_params}")
def load_hf_weights(actor_weights: Dict, vllm_model: nn.Module):
assert isinstance(actor_weights, Dict)
with set_default_torch_dtype(next(vllm_model.parameters()).dtype): # TODO
vllm_model.load_weights(actor_weights.items())
for _, module in vllm_model.named_modules():
quant_method = getattr(module, "quant_method", None)
if quant_method is not None:
quant_method.process_weights_after_loading(module)
# FIXME: Remove this after Mixtral is updated
# to use quant_method.
if hasattr(module, "process_weights_after_loading"):
module.process_weights_after_loading()
vllm_model = vllm_model.cuda()

View File

@ -0,0 +1,306 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py
from typing import Dict, List, Optional, Tuple, Union
from tqdm import tqdm
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
from transformers import PretrainedConfig
import torch.nn as nn
from .arg_utils import EngineArgs
from .llm_engine_sp import LLMEngine
from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
from vllm.sequence import MultiModalData
from vllm.usage.usage_lib import UsageContext
from vllm.utils import Counter
import torch
from torch.nn.utils.rnn import pad_sequence
from verl.trainer.ppo.rollout.tokenizer import HybridEngineBaseTokenizer
class LLM:
"""An LLM for generating texts from given prompts and sampling parameters.
This class includes a tokenizer, a language model (possibly distributed
across multiple GPUs), and GPU memory space allocated for intermediate
states (aka KV cache). Given a batch of prompts and sampling parameters,
this class generates texts from the model, using an intelligent batching
mechanism and efficient memory management.
NOTE: This class is intended to be used for offline inference. For online
serving, use the `AsyncLLMEngine` class instead.
NOTE: For the comprehensive list of arguments, see `EngineArgs`.
Args:
model: A HuggingFace Transformers model instance.
tokenizer: A HuggingFace Transformers tokenizer instance.
tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer
if available, and "slow" will always use the slow tokenizer.
trust_remote_code: Trust remote code (e.g., from HuggingFace) when
downloading the model and tokenizer.
tensor_parallel_size: The number of GPUs to use for distributed
execution with tensor parallelism.
dtype: The data type for the model weights and activations. Currently,
we support `float32`, `float16`, and `bfloat16`. If `auto`, we use
the `torch_dtype` attribute specified in the model config file.
However, if the `torch_dtype` in the config is `float32`, we will
use `float16` instead.
quantization: The method used to quantize the model weights. Currently,
we support "awq". If None, we assume the model weights are not
quantized and use `dtype` to determine the data type of the weights.
revision: The specific model version to use. It can be a branch name,
a tag name, or a commit id.
tokenizer_revision: The specific tokenizer version to use. It can be a
branch name, a tag name, or a commit id.
seed: The seed to initialize the random number generator for sampling.
gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to
reserve for the model weights, activations, and KV cache. Higher
values will increase the KV cache size and thus improve the model's
throughput. However, if the value is too high, it may cause out-of-
memory (OOM) errors.
swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
This can be used for temporarily storing the states of the requests
when their `best_of` sampling parameters are larger than 1. If all
requests will have `best_of=1`, you can safely set this to 0.
Otherwise, too small values may cause out-of-memory (OOM) errors.
enforce_eager: Whether to enforce eager execution. If True, we will
disable CUDA graph and always execute the model in eager mode.
If False, we will use CUDA graph and eager execution in hybrid.
max_context_len_to_capture: Maximum context len covered by CUDA graphs.
When a sequence has context length larger than this, we fall back
to eager mode.
disable_custom_all_reduce: See ParallelConfig
"""
def __init__(
self,
model: Union[nn.Module, Dict], # model itself or its parameter dict
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast, HybridEngineBaseTokenizer],
model_hf_config: PretrainedConfig,
tokenizer_mode: str = "auto",
trust_remote_code: bool = False,
tensor_parallel_size: int = 1,
dtype: str = "auto",
quantization: Optional[str] = None,
revision: Optional[str] = None,
tokenizer_revision: Optional[str] = None,
seed: int = 0,
gpu_memory_utilization: float = 0.9,
swap_space: int = 4,
enforce_eager: bool = False,
max_context_len_to_capture: int = None,
disable_custom_all_reduce: bool = False,
load_format = 'auto',
**kwargs,
) -> None:
if "disable_log_stats" not in kwargs:
kwargs["disable_log_stats"] = True
engine_args = EngineArgs(
model_hf_config=model_hf_config,
tensor_parallel_size=tensor_parallel_size,
dtype=dtype,
quantization=quantization,
revision=revision,
tokenizer_revision=tokenizer_revision,
seed=seed,
gpu_memory_utilization=gpu_memory_utilization,
swap_space=swap_space,
enforce_eager=enforce_eager,
max_context_len_to_capture=max_context_len_to_capture,
disable_custom_all_reduce=disable_custom_all_reduce,
load_format=load_format,
**kwargs,
)
tokenizer_cls = (PreTrainedTokenizer, PreTrainedTokenizerFast, HybridEngineBaseTokenizer)
if not isinstance(tokenizer, tokenizer_cls):
raise ValueError(
f"Unexpected tokenizer type: {type(tokenizer)}. Must be"
"one of the following: PreTrainedTokenizer, PreTrainedTokenizerFast, verl.trainer.ppo.rollout.HybridEngineBaseTokenizer"
)
self.llm_engine = LLMEngine.from_engine_args(model, tokenizer, engine_args)
self.request_counter = Counter()
def init_cache_engine(self):
self.llm_engine.init_cache_engine()
def free_cache_engine(self):
self.llm_engine.free_cache_engine()
def get_tokenizer(self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
return self.llm_engine.tokenizer
def set_tokenizer(
self,
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
) -> None:
self.llm_engine.tokenizer = tokenizer
def generate(
self,
prompts: Optional[Union[str, List[str]]] = None,
sampling_params: Optional[Union[SamplingParams, List[SamplingParams]]] = None,
prompt_token_ids: Optional[List[List[int]]] = None,
use_tqdm: bool = True,
lora_request: Optional[LoRARequest] = None,
multi_modal_data: Optional[MultiModalData] = None,
) -> List[RequestOutput]:
"""Generates the completions for the input prompts.
NOTE: This class automatically batches the given prompts, considering
the memory constraint. For the best performance, put all of your prompts
into a single list and pass it to this method.
Args:
prompts: A list of prompts to generate completions for.
sampling_params: The sampling parameters for text generation. If
None, we use the default sampling parameters.
When it is a single value, it is applied to every prompt.
When it is a list, the list must have the same length as the
prompts and it is paired one by one with the prompt.
prompt_token_ids: A list of token IDs for the prompts. If None, we
use the tokenizer to convert the prompts to token IDs.
use_tqdm: Whether to use tqdm to display the progress bar.
lora_request: LoRA request to use for generation, if any.
multi_modal_data: Multi modal data.
Returns:
A list of `RequestOutput` objects containing the generated
completions in the same order as the input prompts.
"""
if prompts is None and prompt_token_ids is None:
raise ValueError("Either prompts or prompt_token_ids must be "
"provided.")
if self.llm_engine.model_config.skip_tokenizer_init \
and prompts is not None:
raise ValueError("prompts must be None if skip_tokenizer_init "
"is True")
if isinstance(prompts, str):
# Convert a single prompt to a list.
prompts = [prompts]
if (prompts is not None and prompt_token_ids is not None and len(prompts) != len(prompt_token_ids)):
raise ValueError("The lengths of prompts and prompt_token_ids "
"must be the same.")
if prompts is not None:
num_requests = len(prompts)
else:
assert prompt_token_ids is not None
num_requests = len(prompt_token_ids)
if sampling_params is None:
# Use default sampling params.
sampling_params = SamplingParams()
elif isinstance(sampling_params, list) and len(sampling_params) != num_requests:
raise ValueError("The lengths of prompts and sampling_params "
"must be the same.")
if multi_modal_data:
multi_modal_data.data = multi_modal_data.data.to(torch.float16)
# Add requests to the engine.
for i in range(num_requests):
prompt = prompts[i] if prompts is not None else None
token_ids = None if prompt_token_ids is None else prompt_token_ids[i]
if not isinstance(token_ids, list):
# NOTE(shengguangming): convert the rollout input into List[str]
token_ids = self._pre_process_inputs(token_ids)
self._add_request(
prompt,
sampling_params[i] if isinstance(sampling_params, list) else sampling_params,
token_ids,
lora_request=lora_request,
# Get ith image while maintaining the batch dim.
multi_modal_data=MultiModalData(type=multi_modal_data.type, data=multi_modal_data.data[i].unsqueeze(0))
if multi_modal_data else None,
)
return self._run_engine(use_tqdm)
def _add_request(
self,
prompt: Optional[str],
sampling_params: SamplingParams,
prompt_token_ids: Optional[List[int]],
lora_request: Optional[LoRARequest] = None,
multi_modal_data: Optional[MultiModalData] = None,
) -> None:
request_id = str(next(self.request_counter))
self.llm_engine.add_request(request_id,
prompt,
sampling_params,
prompt_token_ids,
lora_request=lora_request,
multi_modal_data=multi_modal_data)
def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]:
# Initialize tqdm.
if use_tqdm:
num_requests = self.llm_engine.get_num_unfinished_requests()
pbar = tqdm(total=num_requests, desc="Processed prompts", dynamic_ncols=True)
# Run the engine.
outputs: List[RequestOutput] = []
while self.llm_engine.has_unfinished_requests():
step_outputs = self.llm_engine.step()
for output in step_outputs:
if output.finished:
outputs.append(output)
if use_tqdm:
pbar.update(1)
if use_tqdm:
pbar.close()
# Sort the outputs by request ID.
# This is necessary because some requests may be finished earlier than
# its previous requests.
outputs = sorted(outputs, key=lambda x: int(x.request_id))
# TODO(shengguangming): maybe we can hack the autoregressive logics without only apply post process for better performance
return self._post_process_outputs(outputs)
# NOTE(shengguangming): add for verl
# TODO(sgm): we can optimize it by making the dataloader yield List[int] without padding.
def _pre_process_inputs(self, prompt_token_ids: torch.Tensor) -> List[int]:
# remove the left padding in the prompt token_id
pad_token_id = self.llm_engine.tokenizer.pad_token_id if self.llm_engine.tokenizer.pad_token_id is not None else self.llm_engine.tokenizer.eos_token_id
non_pad_index = torch.nonzero(prompt_token_ids != pad_token_id, as_tuple=False)[0][0]
token_ids = prompt_token_ids[non_pad_index:].tolist()
return token_ids
# NOTE(shengguangming): add for verl
def _post_process_outputs(self, request_outputs: List[RequestOutput]) -> Tuple[torch.Tensor, torch.Tensor]:
output_token_ids = []
logprobs = []
for request_output in request_outputs: # List[RequestOutput]
outputs = request_output.outputs
for output in outputs: # List[CompletionOutput], usually len == 1
output_token_ids.append(torch.tensor(output.token_ids))
# TODO(shengguangming): can be optimzied by rewrite the Sampler._get_logprobs() logits
logprobs_dicts = output.logprobs
if logprobs_dicts is not None:
logprob = []
for logprobs_dict, id in zip(logprobs_dicts, output.token_ids):
logprob.append(logprobs_dict[id].logprob)
logprobs.append(torch.tensor(logprob))
pad_token_id = self.llm_engine.tokenizer.pad_token_id if self.llm_engine.tokenizer.pad_token_id is not None else self.llm_engine.tokenizer.eos_token_id
output_token_ids = pad_sequence(output_token_ids, batch_first=True, padding_value=pad_token_id)
if len(logprobs) > 0:
logprobs = pad_sequence(logprobs, batch_first=True, padding_value=pad_token_id)
return output_token_ids, logprobs
def sync_model_weights(self, actor_weights: Dict[str, torch.Tensor], load_format: str) -> None:
self.llm_engine.sync_model_weights(actor_weights=actor_weights, load_format=load_format)
def offload_model_weights(self) -> None:
self.llm_engine.offload_model_weights()

View File

@ -0,0 +1,285 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/engine/llm_engine.py
import torch
from typing import Dict, Optional, Union, Type
import vllm
from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, LoRAConfig, ParallelConfig, SchedulerConfig,
SpeculativeConfig, VisionLanguageConfig)
from vllm.core.scheduler import Scheduler
from vllm.engine.output_processor.interfaces import (SequenceGroupOutputProcessor)
from vllm.engine.output_processor.stop_checker import StopChecker
from vllm.executor.executor_base import ExecutorBase
from vllm.logger import init_logger
from vllm.transformers_utils.detokenizer import Detokenizer
from vllm.engine.metrics import StatLogger
from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, usage_message)
from vllm.utils import Counter
from vllm.engine.llm_engine import _load_generation_config_dict
from vllm.engine.llm_engine import LLMEngine
import torch.nn as nn
from .arg_utils import EngineArgs
from .tokenizer import TokenizerGroup
from .config import ModelConfig, LoadConfig
logger = init_logger(__name__)
_LOCAL_LOGGING_INTERVAL_SEC = 5
class LLMEngine(LLMEngine):
"""An LLM engine that receives requests and generates texts.
This is the main class for the vLLM engine. It receives requests
from clients and generates texts from the LLM. It includes a tokenizer, a
language model (possibly distributed across multiple GPUs), and GPU memory
space allocated for intermediate states (aka KV cache). This class utilizes
iteration-level scheduling and efficient memory management to maximize the
serving throughput.
The `LLM` class wraps this class for offline batched inference and the
`AsyncLLMEngine` class wraps this class for online serving.
NOTE: The config arguments are derived from the `EngineArgs` class. For the
comprehensive list of arguments, see `EngineArgs`.
Args:
model: the actor model initialize outside vllm (add for verl)
tokenizer: the initialized tokenizer (add for verl)
model_config: The configuration related to the LLM model.
cache_config: The configuration related to the KV cache memory
management.
parallel_config: The configuration related to distributed execution.
scheduler_config: The configuration related to the request scheduler.
distributed_init_method: The initialization method for distributed
execution. See `torch.distributed.init_process_group` for details.
placement_group: Ray placement group for distributed execution.
Required for distributed execution.
log_stats: Whether to log statistics.
"""
def __init__(
self,
# NOTE(sgm): first two arguments are added for verl
model: Union[nn.Module, Dict], # model itself or its parameter dict
tokenizer: nn.Module,
# NOTE(sgm): vllm original arguments
model_config: ModelConfig,
cache_config: CacheConfig,
parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
device_config: DeviceConfig,
load_config: LoadConfig,
lora_config: Optional[LoRAConfig],
vision_language_config: Optional[VisionLanguageConfig],
speculative_config: Optional[SpeculativeConfig],
decoding_config: Optional[DecodingConfig],
executor_class: Type[ExecutorBase],
log_stats: bool,
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
) -> None:
logger.info(
"Initializing an LLM engine (v%s) with config: "
"model=%r, speculative_config=%r, tokenizer=%r, "
"skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, "
"tokenizer_revision=%s, trust_remote_code=%s, dtype=%s, "
"max_seq_len=%d, download_dir=%r, load_format=%s, "
"tensor_parallel_size=%d, disable_custom_all_reduce=%s, "
"quantization=%s, enforce_eager=%s, kv_cache_dtype=%s, "
"quantization_param_path=%s, device_config=%s, "
"decoding_config=%r, seed=%d, served_model_name=%s)",
vllm.__version__,
model_config.model,
speculative_config,
model_config.tokenizer,
model_config.skip_tokenizer_init,
# model_config.tokenizer_mode,
model_config.revision,
model_config.tokenizer_revision,
# model_config.trust_remote_code,
model_config.dtype,
model_config.max_model_len,
load_config.download_dir,
load_config.load_format,
parallel_config.tensor_parallel_size,
parallel_config.disable_custom_all_reduce,
model_config.quantization,
model_config.enforce_eager,
cache_config.cache_dtype,
model_config.quantization_param_path,
device_config.device,
decoding_config,
model_config.seed,
# model_config.served_model_name,
)
# TODO(woosuk): Print more configs in debug mode.
self.model_config = model_config # TODO: currently is hfconfig
self.cache_config = cache_config
self.lora_config = lora_config
self.vision_language_config = vision_language_config
self.parallel_config = parallel_config
self.scheduler_config = scheduler_config
self.device_config = device_config
self.speculative_config = speculative_config
self.load_config = load_config
self.decoding_config = decoding_config or DecodingConfig()
self.log_stats = log_stats
# self.model = model # should not store the model, it should be deleted
# TODO(shengguangming): maybe we can choose init here or from arguments
if not self.model_config.skip_tokenizer_init:
# TODO: check tokenizer class
self._init_tokenizer(tokenizer)
self.detokenizer = Detokenizer(self.tokenizer)
else:
self.detokenizer = None
self.tokenizer = None
self.seq_counter = Counter()
# TODO: don't know what's the usage
self.generation_config_fields = _load_generation_config_dict(model_config)
self.model_executor = executor_class(
model=model, # add for spmd_gpu_executor
model_config=model_config,
cache_config=cache_config,
parallel_config=parallel_config,
scheduler_config=scheduler_config,
device_config=device_config,
lora_config=lora_config,
vision_language_config=vision_language_config,
speculative_config=speculative_config,
load_config=load_config,
)
# Profile the memory usage and initialize the cache.
self._initialize_kv_caches()
# If usage stat is enabled, collect relevant info.
if is_usage_stats_enabled():
from vllm.model_executor.model_loader import (get_architecture_class_name)
usage_message.report_usage(
get_architecture_class_name(model_config),
usage_context,
extra_kvs={
# Common configuration
"dtype": str(model_config.dtype),
"tensor_parallel_size": parallel_config.tensor_parallel_size,
"block_size": cache_config.block_size,
"gpu_memory_utilization": cache_config.gpu_memory_utilization,
# Quantization
"quantization": model_config.quantization,
"kv_cache_dtype": cache_config.cache_dtype,
# Feature flags
"enable_lora": bool(lora_config),
"enable_prefix_caching": cache_config.enable_prefix_caching,
"enforce_eager": model_config.enforce_eager,
"disable_custom_all_reduce": parallel_config.disable_custom_all_reduce,
})
if self.tokenizer:
# Ping the tokenizer to ensure liveness if it runs in a
# different process.
self.tokenizer.ping()
# Create the scheduler.
# NOTE: the cache_config here have been updated with the numbers of
# GPU and CPU blocks, which are profiled in the distributed executor.
# NOTE(shengguangming): each process will have independent scheduler
self.scheduler = Scheduler(scheduler_config, cache_config, lora_config)
# Metric Logging.
if self.log_stats:
self.stat_logger = StatLogger(local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
labels=dict(model_name=model_config.served_model_name),
max_model_len=self.model_config.max_model_len)
self.stat_logger.info("cache_config", self.cache_config)
# Create sequence output processor, e.g. for beam search or
# speculative decoding.
self.output_processor = (SequenceGroupOutputProcessor.create_output_processor(
self.scheduler_config,
self.detokenizer,
self.scheduler,
self.seq_counter,
self.get_tokenizer_for_seq,
stop_checker=StopChecker(
self.scheduler_config.max_model_len,
self.get_tokenizer_for_seq,
),
))
# TODO(sgm): add for verl but we may not tokenizer in Rollout
def _init_tokenizer(self, tokenizer, **tokenizer_init_kwargs):
init_kwargs = dict(enable_lora=bool(self.lora_config),
max_num_seqs=self.scheduler_config.max_num_seqs,
max_input_length=None)
init_kwargs.update(tokenizer_init_kwargs)
self.tokenizer: TokenizerGroup = TokenizerGroup(tokenizer, **init_kwargs)
def init_cache_engine(self):
# TODO: check whether we should rebuild the CUDAGraph every iter when offload/load KVCache
# Re-capture CUDAGraph would be time-consuming
self.model_executor.init_cache_engine()
def free_cache_engine(self):
self.model_executor.free_cache_engine()
# NOTE(sgm): currently, we only support GPU executor
# The GPUExecutor remove the Ray dependency
@classmethod
def from_engine_args(
cls,
model,
tokenizer,
engine_args: EngineArgs,
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
) -> "LLMEngine":
"""Creates an LLM engine from the engine arguments."""
# Create the engine configs.
engine_config = engine_args.create_engine_config()
# Initialize the cluster and specify the executor class.
assert engine_config.device_config.device_type == "cuda", \
"Currently, the vllm in verl only support running on GPU"
if engine_config.parallel_config.world_size == 1:
# TODO: may also need to init process group
from vllm.executor.gpu_executor import GPUExecutor
executor_class = GPUExecutor
else:
from .spmd_gpu_executor import SPMDGPUExecutor
executor_class = SPMDGPUExecutor
# Create the LLM engine.
engine = cls(
model,
tokenizer,
**engine_config.to_dict(),
executor_class=executor_class,
log_stats=not engine_args.disable_log_stats,
usage_context=usage_context,
)
return engine
def sync_model_weights(self, actor_weights: Dict[str, torch.Tensor], load_format: str) -> None:
self.model_executor.sync_model_weights(actor_weights=actor_weights, load_format=load_format)
def offload_model_weights(self) -> None:
self.model_executor.offload_model_weights()

View File

@ -0,0 +1,348 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models
from typing import Dict
import torch
import torch.nn as nn
from vllm.model_executor.layers.linear import *
from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead
from vllm.model_executor.layers.activation import ScaledActivation
from vllm.model_executor.models import ModelRegistry
# NOTE(shengguangming): replace the origin weight loader function in the class
def parallel_weight_loader(self, param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
"""Parallel Linear weight loader."""
assert param.size() == loaded_weight.size(
), 'the parameter size is not align with the loaded weight size, param size: {}, loaded_weight size: {}'.format(
param.size(), loaded_weight.size())
assert param.data.dtype == loaded_weight.data.dtype, "if we want to shared weights, the data type should also be the same"
param.data = loaded_weight.data
def default_weight_loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
"""Default weight loader."""
assert param.size() == loaded_weight.size()
assert param.data.dtype == loaded_weight.data.dtype, "if we want to shared weights, the data type should also be the same"
param.data = loaded_weight.data
def gpt2_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
for name, loaded_weight in actor_weights.items():
if "lm_head.weight" in name:
# GPT-2 ties the weights of the embedding layer and the final
# linear layer.
continue
if ".attn.bias" in name or ".attn.masked_bias" in name:
# Skip attention mask.
# NOTE: "c_attn.bias" should not be skipped.
continue
if not name.startswith("transformer."):
name = "transformer." + name
param = params_dict[name]
# The HF's GPT-2 implementation uses Conv1D instead of Linear.
# Because of this, we need to transpose the weights.
# Note(zhuohan): the logic below might break quantized models.
for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]:
if conv1d_weight_name not in name:
continue
if not name.endswith(".weight"):
continue
# TODO: check megatron
loaded_weight = loaded_weight.t()
weight_loader = getattr(param, "weight_loader", default_weight_loader)
weight_loader(param, loaded_weight)
def llama_megatron_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
# NOTE(shengguangming): the megatron llama may have this prefix
params_dict = dict(vllm_model.named_parameters())
for name, loaded_weight in actor_weights.items():
if "rotary_emb.inv_freq" in name:
continue
else:
param = params_dict[name]
weight_loader = getattr(param, "weight_loader", default_weight_loader)
weight_loader(param, loaded_weight)
def llama_megatron_core_te_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
params_mapping = [
# (megatron core gpt model name, vllm model name)
("embedding.word_embeddings", "model.embed_tokens"),
("self_attention.linear_qkv.layer_norm_weight", "input_layernorm.weight"),
("self_attention.linear_qkv.layer_norm_bias", "input_layernorm.bias"),
("self_attention.linear_qkv", "self_attn.qkv_proj"),
("self_attention.linear_qkv", "self_attn.qkv_proj"),
("self_attention.linear_proj", 'self_attn.o_proj'),
('pre_mlp_layernorm', 'post_attention_layernorm'),
('mlp.linear_fc1.layer_norm_weight', 'post_attention_layernorm.weight'),
('mlp.linear_fc1.layer_norm_bias', 'post_attention_layernorm.bias'),
('mlp.linear_fc1', 'mlp.gate_up_proj'),
('mlp.linear_fc2', 'mlp.down_proj'),
('decoder.final_layernorm', 'model.norm'),
('output_layer', 'lm_head'),
]
# NOTE(shengguangming): the megatron llama may have this prefix
params_dict = dict(vllm_model.named_parameters())
for name, loaded_weight in actor_weights.items():
name = _replace_name(name, params_mapping)
if name.endswith('.bias') and name not in params_dict:
continue
if "rotary_emb.inv_freq" in name:
continue
else:
param = params_dict[name]
weight_loader = getattr(param, "weight_loader", default_weight_loader)
weight_loader(param, loaded_weight)
def llama_megatron_core_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
params_mapping = [
# (megatron core gpt model name, vllm model name)
("embedding.word_embeddings", "model.embed_tokens"),
("self_attention.linear_qkv", "self_attn.qkv_proj"),
("self_attention.linear_proj", 'self_attn.o_proj'),
(
'input_layernorm',
'input_layernorm',
),
('pre_mlp_layernorm', 'post_attention_layernorm'),
('mlp.linear_fc1', 'mlp.gate_up_proj'),
('mlp.linear_fc2', 'mlp.down_proj'),
('decoder.final_layernorm', 'model.norm'),
('output_layer', 'lm_head'),
]
# NOTE(shengguangming): the megatron llama may have this prefix
params_dict = dict(vllm_model.named_parameters())
for name, loaded_weight in actor_weights.items():
name = _replace_name(name, params_mapping)
if name.endswith('.bias') and name not in params_dict:
continue
if "rotary_emb.inv_freq" in name:
continue
else:
param = params_dict[name]
weight_loader = getattr(param, "weight_loader", default_weight_loader)
weight_loader(param, loaded_weight)
def _replace_name(megatron_name, name_mapping):
for m_name, v_name in name_mapping:
if m_name not in megatron_name:
continue
if 'layers' in megatron_name: # deal with decoder layers
megatron_name = megatron_name.replace('decoder', 'model')
megatron_name_list = megatron_name.split('.')
if 'layer_norm_weight' in megatron_name_list or 'layer_norm_bias' in megatron_name_list:
param_name_list = megatron_name_list[:3]
param_name_list.append(v_name)
param_name = '.'.join(param_name_list)
else:
param_name_list = megatron_name_list[:3]
weight_or_bias = megatron_name_list[-1]
param_name_list.append(v_name)
param_name_list.append(weight_or_bias)
param_name = '.'.join(param_name_list)
return param_name
else:
param_name = megatron_name.replace(m_name, v_name)
return param_name
def llama_megatron_core_te_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
params_mapping = [
# (megatron core gpt model name, vllm model name)
("embedding.word_embeddings", "model.embed_tokens"),
("self_attention.linear_qkv.layer_norm_weight", "input_layernorm.weight"),
("self_attention.linear_qkv.layer_norm_bias", "input_layernorm.bias"),
("self_attention.linear_qkv", "self_attn.qkv_proj"),
("self_attention.linear_qkv", "self_attn.qkv_proj"),
("self_attention.linear_proj", 'self_attn.o_proj'),
('pre_mlp_layernorm', 'post_attention_layernorm'),
('mlp.linear_fc1.layer_norm_weight', 'post_attention_layernorm.weight'),
('mlp.linear_fc1.layer_norm_bias', 'post_attention_layernorm.bias'),
('mlp.linear_fc1', 'mlp.gate_up_proj'),
('mlp.linear_fc2', 'mlp.down_proj'),
('decoder.final_layernorm', 'model.norm'),
('output_layer', 'lm_head'),
]
# NOTE(shengguangming): the megatron llama may have this prefix
params_dict = dict(vllm_model.named_parameters())
for name, loaded_weight in actor_weights.items():
name = _replace_name(name, params_mapping)
if name.endswith('.bias') and name not in params_dict:
continue
if "rotary_emb.inv_freq" in name:
continue
else:
param = params_dict[name]
weight_loader = getattr(param, "weight_loader", default_weight_loader)
weight_loader(param, loaded_weight)
def llama_megatron_core_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
params_mapping = [
# (megatron core gpt model name, vllm model name)
("embedding.word_embeddings", "model.embed_tokens"),
("self_attention.linear_qkv", "self_attn.qkv_proj"),
("self_attention.linear_proj", 'self_attn.o_proj'),
(
'input_layernorm',
'input_layernorm',
),
('pre_mlp_layernorm', 'post_attention_layernorm'),
('mlp.linear_fc1', 'mlp.gate_up_proj'),
('mlp.linear_fc2', 'mlp.down_proj'),
('decoder.final_layernorm', 'model.norm'),
('output_layer', 'lm_head'),
]
# NOTE(shengguangming): the megatron llama may have this prefix
params_dict = dict(vllm_model.named_parameters())
for name, loaded_weight in actor_weights.items():
name = _replace_name(name, params_mapping)
if name.endswith('.bias') and name not in params_dict:
continue
if "rotary_emb.inv_freq" in name:
continue
else:
param = params_dict[name]
weight_loader = getattr(param, "weight_loader", default_weight_loader)
weight_loader(param, loaded_weight)
def _replace_name(megatron_name, name_mapping):
for m_name, v_name in name_mapping:
if m_name not in megatron_name:
continue
if 'layers' in megatron_name: # deal with decoder layers
megatron_name = megatron_name.replace('decoder', 'model')
megatron_name_list = megatron_name.split('.')
if 'layer_norm_weight' in megatron_name_list or 'layer_norm_bias' in megatron_name_list:
param_name_list = megatron_name_list[:3]
param_name_list.append(v_name)
param_name = '.'.join(param_name_list)
else:
param_name_list = megatron_name_list[:3]
weight_or_bias = megatron_name_list[-1]
param_name_list.append(v_name)
param_name_list.append(weight_or_bias)
param_name = '.'.join(param_name_list)
return param_name
else:
param_name = megatron_name.replace(m_name, v_name)
return param_name
def mistral_megatron_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
# TODO: need to implement a general way to deal with prefix
params_dict = dict(vllm_model.named_parameters())
for name, loaded_weight in actor_weights.items():
if "rotary_emb.inv_freq" in name:
continue
else:
param = params_dict[name]
weight_loader = getattr(param, "weight_loader", default_weight_loader)
weight_loader(param, loaded_weight)
__LAYER_WEIGHT_MEGATRON_LOADER_REGISTRY__ = {
ColumnParallelLinear: parallel_weight_loader,
MergedColumnParallelLinear: parallel_weight_loader,
QKVParallelLinear: parallel_weight_loader,
RowParallelLinear: parallel_weight_loader,
VocabParallelEmbedding: parallel_weight_loader,
ParallelLMHead: parallel_weight_loader
# "ScaledActivation.weight_loader": ScaledActivation, # TODO(shengguangming): latest commit in vllm fix awq for this function and add load_weights
# "default_weight_loader": default_weight_loader
}
# for layer_class, weight_loader in __LAYER_WEIGHT_MEGATRON_LOADER_REGISTRY__.items():
# # setattr(layer_class, 'megatron_weight_loader', weight_loader)
# layer_class.weight_loader = weight_loader
__MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY__ = {
'GPT2LMHeadModel': gpt2_weight_loader,
'LlamaForCausalLM': llama_megatron_core_te_weight_loader, # use te backend for open-source megatron
'LLaMAForCausalLM': llama_megatron_core_te_weight_loader,
'MistralForCausalLM': mistral_megatron_weight_loader,
}
# the actor model is .state_dict()
# Load megatron weights
def load_megatron_weights(actor_weights: Dict, vllm_model: nn.Module):
weight_loader = _get_model_weight_loader(vllm_model.__class__.__name__)
weight_loader(actor_weights, vllm_model)
# NOTE(sgm) to reduce peak memory usage, we offload vllm model to cpu
# after init, and we need this after sync model weights for in first iter.
vllm_model = vllm_model.cuda()
def _get_model_weight_loader(arch: str):
if arch in __MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY__:
return __MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY__[arch]
raise ValueError(f"Model architectures {arch} are not supported for now. "
f"Supported architectures: {ModelRegistry.get_supported_archs()}")
def update_megatron_weight_loader():
for layer_class, weight_loader in __LAYER_WEIGHT_MEGATRON_LOADER_REGISTRY__.items():
layer_class.weight_loader = weight_loader
VocabParallelEmbedding.__init__ = vocab_init
# FIXME(shengguangming): the vLLM vocab will pad to 64, which may incur out of bounds
# so we need to rewrite the init function of vocab
DEFAULT_VOCAB_PADDING_SIZE = 64
def vocab_init(self,
num_embeddings: int,
embedding_dim: int,
params_dtype: Optional[torch.dtype] = None,
org_num_embeddings: Optional[int] = None,
padding_size: int = DEFAULT_VOCAB_PADDING_SIZE):
super(VocabParallelEmbedding, self).__init__()
# Keep the input dimensions.
# TODO (pad to be divided by 4)
self.num_embeddings = num_embeddings
self.org_vocab_size = org_num_embeddings or num_embeddings
# self.num_embeddings_padded = pad_vocab_size(num_embeddings,
# padding_size)
self.embedding_dim = embedding_dim
if params_dtype is None:
params_dtype = torch.get_default_dtype()
self.tp_size = get_tensor_model_parallel_world_size()
# Divide the weight matrix along the vocaburaly dimension.
# TODO: remove dependencies from megatron
from megatron.core.tensor_parallel.utils import VocabUtility
self.vocab_start_index, self.vocab_end_index = (VocabUtility.vocab_range_from_global_vocab_size(
self.num_embeddings, get_tensor_model_parallel_rank(), self.tp_size))
self.num_embeddings_per_partition = (self.vocab_end_index - self.vocab_start_index)
self.weight = Parameter(
torch.empty(
self.num_embeddings_per_partition,
self.embedding_dim,
# device=torch.cuda.current_device(),
dtype=params_dtype))
set_weight_attrs(self.weight, {"parallel_dim": 0, "weight_loader": self.weight_loader})

Some files were not shown because too many files have changed in this diff Show More