diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 2df37965f..001686854 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -6,11 +6,14 @@ version: 2
 build:
   os: ubuntu-22.04
   tools:
-    python: "3.8"
+    python: "3.11"
+    rust: "1.70"
 
 sphinx:
   configuration: docs/conf.py
 
 python:
   install:
-    - requirements: docs/requirements-docs.txt
\ No newline at end of file
+    - requirements: docs/requirements-docs.txt
+    - method: pip
+      path: .
diff --git a/README.md b/README.md
index bcf747248..b1d9ba080 100644
--- a/README.md
+++ b/README.md
@@ -118,18 +118,20 @@ If you find the project helpful, please cite:
 verl is inspired by the design of Nemo-Aligner, Deepspeed-chat and OpenRLHF. The project is adopted and supported by Anyscale, Bytedance, LMSys.org, Shanghai AI Lab, Tsinghua University, UC Berkeley, UCLA, UIUC, and University of Hong Kong.
 
 ## Awesome work using verl
-- [Enhancing Multi-Step Reasoning Abilities of Language Models through Direct Q-Function Optimization](https://arxiv.org/abs/2410.09302)
-- [Flaming-hot Initiation with Regular Execution Sampling for Large Language Models](https://arxiv.org/abs/2410.21236)
-- [Process Reinforcement Through Implicit Rewards](https://github.com/PRIME-RL/PRIME/)
-- [TinyZero](https://github.com/Jiayi-Pan/TinyZero): a reproduction of DeepSeek R1 Zero recipe for reasoning tasks
-- [RAGEN](https://github.com/ZihanWang314/ragen): a general-purpose reasoning agent training framework
-- [Logic R1](https://github.com/Unakar/Logic-RL): a reproduced DeepSeek R1 Zero on 2K Tiny Logic Puzzle Dataset.
+- [TinyZero](https://github.com/Jiayi-Pan/TinyZero): a reproduction of **DeepSeek R1 Zero** recipe for reasoning tasks
+- [PRIME](https://github.com/PRIME-RL/PRIME): Process reinforcement through implicit rewards
+- [RAGEN](https://github.com/ZihanWang314/ragen): a general-purpose reasoning **agent** training framework
+- [Logic-RL](https://github.com/Unakar/Logic-RL): a reproduction of DeepSeek R1 Zero on 2K Tiny Logic Puzzle Dataset.
 - [deepscaler](https://github.com/agentica-project/deepscaler): iterative context scaling with GRPO
-- [critic-rl](https://github.com/HKUNLP/critic-rl): Teaching Language Models to Critique via Reinforcement Learning
-- [Easy-R1](https://github.com/hiyouga/EasyR1): Multi-Modality RL
+- [critic-rl](https://github.com/HKUNLP/critic-rl): LLM critics for code generation
+- [Easy-R1](https://github.com/hiyouga/EasyR1): **Multi-modal** RL training framework
+- [self-rewarding-reasoning-LLM](https://arxiv.org/pdf/2502.19613): self-rewarding and correction with **generative reward models**
+- [Search-R1](https://github.com/PeterGriffinJin/Search-R1): RL with reasoning and **searching (tool-call)** interleaved LLMs
+- [DQO](https://arxiv.org/abs/2410.09302): Enhancing multi-Step reasoning abilities of language models through direct Q-function optimization
+- [FIRE](https://arxiv.org/abs/2410.21236): Flaming-hot initiation with regular execution sampling for large language models
 
 ## Contribution Guide
-Contributions from the community are welcome!
+Contributions from the community are welcome! Please checkout our [roadmap](https://github.com/volcengine/verl/issues/22) and [release plan](https://github.com/volcengine/verl/issues/354).
 
 ### Code formatting
 We use yapf (Google style) to enforce strict code formatting when reviewing PRs. To reformat you code locally, make sure you installed **latest** `yapf`
diff --git a/docs/faq/faq.rst b/docs/faq/faq.rst
index d27dc9f6f..dc0bbd2f1 100644
--- a/docs/faq/faq.rst
+++ b/docs/faq/faq.rst
@@ -55,3 +55,8 @@ Please set the following environment variable. The env var must be set before th
     export VLLM_ATTENTION_BACKEND=XFORMERS
 
 If in doubt, print this env var in each rank to make sure it is properly set.
+
+Checkpoints
+------------------------
+
+If you want to convert the model checkpoint into huggingface safetensor format, please refer to ``scripts/model_merger.py``.
diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt
index 439f85ef8..49ecc0af4 100644
--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
@@ -6,4 +6,7 @@ sphinx-markdown-tables
 # theme default rtd
 
 # crate-docs-theme
-sphinx-rtd-theme
\ No newline at end of file
+sphinx-rtd-theme
+
+# pin tokenizers version to avoid env_logger version req
+tokenizers==0.19.1
diff --git a/verl/protocol.py b/verl/protocol.py
index 737ec1403..a272f53a6 100644
--- a/verl/protocol.py
+++ b/verl/protocol.py
@@ -84,7 +84,7 @@ def union_tensor_dict(tensor_dict1: TensorDict, tensor_dict2: TensorDict) -> Ten
     return tensor_dict1
 
 
-def union_numpy_dict(tensor_dict1: dict[np.ndarray], tensor_dict2: dict[np.ndarray]) -> dict[np.ndarray]:
+def union_numpy_dict(tensor_dict1: dict[str, np.ndarray], tensor_dict2: dict[str, np.ndarray]) -> dict[str, np.ndarray]:
     for key, val in tensor_dict2.items():
         if key in tensor_dict1:
             assert isinstance(tensor_dict2[key], np.ndarray)
@@ -448,19 +448,17 @@ class DataProto:
         return self
 
     def make_iterator(self, mini_batch_size, epochs, seed=None, dataloader_kwargs=None):
-        """Make an iterator from the DataProto. This is built upon that TensorDict can be used as a normal Pytorch
+        r"""Make an iterator from the DataProto. This is built upon that TensorDict can be used as a normal Pytorch
         dataset. See https://pytorch.org/tensordict/tutorials/data_fashion for more details.
+
         Args:
-            mini_batch_size (int): mini-batch size when iterating the dataset. We require that
-                ``batch.batch_size[0] % mini_batch_size == 0``
+            mini_batch_size (int): mini-batch size when iterating the dataset. We require that ``batch.batch_size[0] % mini_batch_size == 0``.
             epochs (int): number of epochs when iterating the dataset.
-            dataloader_kwargs: internally, it returns a DataLoader over the batch.
-                The dataloader_kwargs is the kwargs passed to the DataLoader
+            dataloader_kwargs (Any): internally, it returns a DataLoader over the batch. The dataloader_kwargs is the kwargs passed to the DataLoader.
 
         Returns:
-            Iterator: an iterator that yields a mini-batch data at a time. The total number of iteration steps is
-                ``self.batch.batch_size * epochs // mini_batch_size``
+            Iterator: an iterator that yields a mini-batch data at a time. The total number of iteration steps is ``self.batch.batch_size * epochs // mini_batch_size``
 
         """
         assert self.batch.batch_size[0] % mini_batch_size == 0, f"{self.batch.batch_size[0]} % {mini_batch_size} != 0"
         # we can directly create a dataloader from TensorDict
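
Note on the `verl/protocol.py` annotation fix: the new signature documents that `union_numpy_dict` maps string keys to numpy arrays. Below is a minimal, self-contained sketch of that contract, not verl's exact implementation; it completes the function body implied by the diff context with a simple `np.array_equal` consistency check, which is an assumption of this sketch.

```python
import numpy as np


# Sketch only: merge tensor_dict2 into tensor_dict1 in place, requiring any
# overlapping key to hold an identical array. The string-keyed annotations
# mirror the corrected signature in the diff; the equality check here is a
# simplification chosen for this example.
def union_numpy_dict(tensor_dict1: dict[str, np.ndarray],
                     tensor_dict2: dict[str, np.ndarray]) -> dict[str, np.ndarray]:
    for key, val in tensor_dict2.items():
        if key in tensor_dict1:
            assert isinstance(tensor_dict2[key], np.ndarray)
            assert isinstance(tensor_dict1[key], np.ndarray)
            assert np.array_equal(tensor_dict1[key], val), f"{key} differs between the two dicts"
        tensor_dict1[key] = val
    return tensor_dict1


merged = union_numpy_dict({"input_ids": np.arange(4)},
                          {"input_ids": np.arange(4), "labels": np.ones(4)})
assert set(merged) == {"input_ids", "labels"}
```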
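Note on the reflowed `make_iterator` docstring: it pins down two parts of the contract, that `batch.batch_size[0]` must be divisible by `mini_batch_size`, and that the iterator yields `batch_size * epochs // mini_batch_size` mini-batches in total. The short usage sketch below exercises that contract; the `DataProto.from_dict` constructor and the `"obs"` tensor name are illustrative assumptions and not part of this diff.

```python
import torch
from verl.protocol import DataProto

# Build a small DataProto with a leading batch dimension of 16 (assumed constructor).
data = DataProto.from_dict(tensors={"obs": torch.randn(16, 8)})

mini_batch_size, epochs = 4, 2

# Count the yielded mini-batches; per the docstring this should equal
# batch_size[0] * epochs // mini_batch_size = 16 * 2 // 4 = 8.
steps = sum(1 for _ in data.make_iterator(mini_batch_size=mini_batch_size,
                                          epochs=epochs,
                                          seed=1))
assert steps == data.batch.batch_size[0] * epochs // mini_batch_size
```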