Mirror of https://github.com/volcengine/verl.git, synced 2025-10-20 13:43:50 +08:00
[docker] feat: upgrade to torch 2.7, sglang 0.4.8 (#2617)
### What does this PR do?

[docker] feat: upgrade to torch 2.7, sglang 0.4.8

Stage 2: vllm 0.9.1
Stage 3: mcore 0.13.0

### Checklist Before Starting

- [ ] Search for similar PRs. Paste at least one query link here: ...
- [ ] Format the PR title as `[{modules}] {type}: {description}` (this will be checked by the CI)
  - `{modules}` include `fsdp`, `megatron`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data`
  - If this PR involves multiple modules, separate them with `,`, like `[megatron, fsdp, doc]`
  - `{type}` is one of `feat`, `fix`, `refactor`, `chore`, `test`
  - If this PR breaks any API (CLI arguments, config, function signature, etc.), add `[BREAKING]` to the beginning of the title.
  - Example: `[BREAKING][fsdp, megatron] feat: dynamic batching`

### Test

> For changes that cannot be tested by CI (e.g., algorithm implementation, new model support), validate by experiment(s) and show results such as training curve plots, evaluation results, etc.

### API and Usage Example

> Demonstrate how the API changes, if any, and provide usage example(s) if possible.

```python
# Add code snippet or script demonstrating how to use this
```

### Design & Code Changes

> Demonstrate the high-level design if this PR is complex, and list the specific changes.

### Checklist Before Submitting

> [!IMPORTANT]
> Please check all the following items before requesting a review; otherwise the reviewer might deprioritize this PR.

- [ ] Read the [Contribute Guide](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md).
- [ ] Apply [pre-commit checks](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md#code-linting-and-formatting): `pre-commit install && pre-commit run --all-files --show-diff-on-failure --color=always`
- [ ] Add / update [the documentation](https://github.com/volcengine/verl/tree/main/docs).
- [ ] Add unit or end-to-end test(s) to [the CI workflows](https://github.com/volcengine/verl/tree/main/.github/workflows) to cover all the code. If not feasible, explain why: ...
- [ ] Once your PR is ready for CI, send a message in [the `ci-request` channel](https://verl-project.slack.com/archives/C091TCESWB1) of [the `verl` Slack workspace](https://join.slack.com/t/verl-project/shared_invite/zt-3855yhg8g-CTkqXu~hKojPCmo7k_yXTQ).

---------

Co-authored-by: hebiao064 <hebiaobuaa@gmail.com>
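The upgrade lands in stages: this PR pins torch 2.7 and sglang 0.4.8, with vllm 0.9.1 (stage 2) and mcore 0.13.0 (stage 3) to follow. Below is a minimal sketch of a startup sanity check an image built on these pins could run; the version targets come from this PR, but the check itself is illustrative, not part of the change:

```python
# Illustrative startup check (not part of this PR): confirm the container's
# stack matches the stage-1 pins before training starts.
from importlib.metadata import version

from packaging.version import Version

EXPECTED = {"torch": "2.7", "sglang": "0.4.8"}  # stage-1 pins from this PR

for pkg, minimum in EXPECTED.items():
    installed = Version(version(pkg))
    assert installed >= Version(minimum), f"{pkg}=={installed}, need >={minimum}"
```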
```diff
@@ -17,6 +17,7 @@ import argparse
 import os
 import warnings
 from contextlib import contextmanager
+from importlib.metadata import version
 from typing import Any, Callable, ContextManager, Optional
 
 import numpy as np
```
```diff
@@ -29,6 +30,7 @@ from megatron.core.dist_checkpointing.mapping import ShardedTensor
 from megatron.core.dist_checkpointing.serialization import StrictHandling
 from megatron.core.models.gpt.gpt_model import ModelType
 from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
+from packaging.version import Version
 from transformers import AutoConfig
 
 from verl.model_merger.megatron_model_merger import get_dynamic_pipeline_shards
```
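The two imports added above work as a pair: `version("transformers")` (from `importlib.metadata`) returns the installed distribution's version string, and `packaging.version.Version` parses it so the comparison in the hunks below is semantic rather than lexicographic. A quick sketch of why the parse step matters:

```python
# String comparison mis-orders multi-digit version components; Version does not.
from importlib.metadata import version

from packaging.version import Version

assert "4.9.0" > "4.52.0"                    # lexicographic: wrong order
assert Version("4.9.0") < Version("4.52.0")  # semantic: correct

# The gate the following hunks rely on:
uses_new_layout = Version(version("transformers")) >= Version("4.52.0")
```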
```diff
@@ -204,7 +206,11 @@ def convert_checkpoint_from_transformers_to_megatron_qwen2_5_vl(hfmodel, mgmodel
     head_dim = hidden_size // num_attention_heads
 
     # 1. vision model
-    hfvision = hfmodel.visual
+    if Version(version("transformers")) < Version("4.52.0"):
+        print("Using transformers < 4.52 API to load vision model")
+        hfvision = hfmodel.visual
+    else:
+        hfvision = hfmodel.model.visual
     mgvision = mgmodel.vision_model
     vision_hidden_size = mgvision.config.hidden_size
     vision_num_query_groups = mgvision.config.num_query_groups
```
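transformers 4.52 relocated the Qwen2.5-VL vision tower from `hfmodel.visual` to `hfmodel.model.visual`, which is what this branch accounts for; the next hunk applies the same gate to the language model. A hypothetical helper (not in the PR, which inlines the check at each use site) concentrating both layout differences:

```python
# Hypothetical consolidation of the two version branches in this diff; the PR
# itself repeats the check where each submodule is resolved.
from importlib.metadata import version

from packaging.version import Version


def resolve_hf_submodules(hfmodel):
    """Return (vision_tower, language_model) for either transformers layout."""
    if Version(version("transformers")) < Version("4.52.0"):
        return hfmodel.visual, hfmodel.model  # pre-4.52 layout
    return hfmodel.model.visual, hfmodel.model.language_model  # 4.52+ layout
```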
```diff
@@ -255,13 +261,18 @@ def convert_checkpoint_from_transformers_to_megatron_qwen2_5_vl(hfmodel, mgmodel
     copied_numel += safe_copy(hfprojector.mlp[2].weight, mgprojector.encoder.linear_fc2.weight)
     copied_numel += safe_copy(hfprojector.mlp[2].bias, mgprojector.encoder.linear_fc2.bias)
     n_params = sum([t.numel() for t in hfvision.state_dict().values()])
-    assert n_params == copied_numel
+    assert n_params == copied_numel, f"n_params={n_params} != copied_numel={copied_numel}"
     # 3. llm [just Qwen2]
-    hfllm = hfmodel.model
+    if Version(version("transformers")) < Version("4.52.0"):
+        print("Using transformers < 4.52 API to load llm")
+        hfllm = hfmodel.model
+    else:
+        hfllm = hfmodel.model.language_model
     mgllm = mgmodel.language_model
     copied_numel = 0
     copied_numel += safe_copy(hfllm.embed_tokens.weight, mgllm.embedding.word_embeddings.weight)
-    for mglayer, hflayer in zip(mgllm.decoder.layers, hfllm.layers, strict=True):
+    layermaps = zip(mgllm.decoder.layers, hfllm.layers, strict=True)
+    for mglayer, hflayer in layermaps:
         copied_numel += safe_copy(hflayer.input_layernorm.weight, mglayer.self_attention.linear_qkv.layer_norm_weight)
 
         q_proj_weight = hflayer.self_attn.q_proj.weight.view(num_query_groups, -1, head_dim, hidden_size)
```
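The `copied_numel` accounting gives the conversion an end-to-end integrity check: each tensor copy reports how many elements it moved, and the final assert (now with a diagnostic message) compares the running total against the source `state_dict`. `safe_copy` is defined elsewhere in this script; the minimal sketch below shows the contract it implies, as an assumption rather than the PR's actual code:

```python
import torch


def safe_copy(src: torch.Tensor, dst: torch.Tensor) -> int:
    """Copy src into dst after dtype/shape checks; return the element count."""
    assert src.dtype == dst.dtype, f"dtype mismatch: {src.dtype} vs {dst.dtype}"
    assert src.shape == dst.shape, f"shape mismatch: {src.shape} vs {dst.shape}"
    dst.data.copy_(src.data)
    return src.numel()
```

`zip(..., strict=True)` (Python 3.10+) serves the same goal from the other side: if the Megatron and HF layer lists ever differ in length, the loop raises `ValueError` instead of silently truncating, and hoisting the zip into `layermaps` keeps the loop header short.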
```diff
@@ -289,7 +300,7 @@ def convert_checkpoint_from_transformers_to_megatron_qwen2_5_vl(hfmodel, mgmodel
 
     n_params = sum([t.numel() for t in hfllm.state_dict().values()])
 
-    assert n_params == copied_numel
+    assert n_params == copied_numel, f"n_params={n_params} != copied_numel={copied_numel}"
 
 
 @torch.inference_mode()
```
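The `@torch.inference_mode()` visible at the tail of this hunk decorates the next converter function: under inference mode, autograd recording is disabled, so the in-place parameter copies above build no graph. A minimal illustration with an invented function name:

```python
import torch


@torch.inference_mode()
def copy_param(src: torch.Tensor, dst: torch.nn.Parameter) -> None:
    # Grad mode is off inside inference_mode, so an in-place copy onto a
    # parameter is legal here and records no autograd history.
    dst.copy_(src)
```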