[docker] feat: upgrade to torch 2.7, sglang 0.4.8 (#2617)

### What does this PR do?

This is stage 1 of a staged dependency upgrade: bump the Docker images to torch 2.7 and sglang 0.4.8.

Stage 2: vllm 0.9.1
Stage 3: mcore 0.13.0

### Checklist Before Starting

- [ ] Search for similar PRs. Paste at least one query link here: ...
- [ ] Format the PR title as `[{modules}] {type}: {description}` (this will be checked by the CI).
  - `{modules}` include `fsdp`, `megatron`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data`.
  - If this PR involves multiple modules, separate them with `,`, e.g. `[megatron, fsdp, doc]`.
  - `{type}` is one of `feat`, `fix`, `refactor`, `chore`, `test`.
  - If this PR breaks any API (CLI arguments, config, function signature, etc.), add `[BREAKING]` to the beginning of the title.
    - Example: `[BREAKING][fsdp, megatron] feat: dynamic batching`

### Test

> For changes that cannot be tested by CI (e.g., algorithm implementation, new model support), validate by experiment(s) and show results such as training curve plots or evaluation results.

### API and Usage Example

> Demonstrate how the API changes, if any, and provide usage example(s) if possible.

```python
# Add code snippet or script demonstrating how to use this
```
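
As a hedged illustration only (this snippet is not part of the PR's change set), one way to sanity-check the upgraded stack inside the new image:

```python
# Sketch, not from this PR: confirm the upgraded stack inside the new image.
# Assumes torch and sglang are installed; exact patch versions may vary by tag.
from importlib.metadata import version

import torch

print(torch.__version__)   # expected to start with "2.7"
print(version("sglang"))   # expected "0.4.8"
```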

### Design & Code Changes

> Demonstrate the high-level design if this PR is complex, and list the specific changes.

### Checklist Before Submitting

> [!IMPORTANT]
> Please check all the following items before requesting a review; otherwise the reviewer might deprioritize this PR.

- [ ] Read the [Contribute
Guide](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md).
- [ ] Apply [pre-commit
checks](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md#code-linting-and-formatting):
`pre-commit install && pre-commit run --all-files --show-diff-on-failure
--color=always`
- [ ] Add / Update [the
documentation](https://github.com/volcengine/verl/tree/main/docs).
- [ ] Add unit or end-to-end test(s) to [the CI
workflow](https://github.com/volcengine/verl/tree/main/.github/workflows)
to cover all the code. If not feasible, explain why: ...
- [ ] Once your PR is ready for CI, send a message in [the `ci-request`
channel](https://verl-project.slack.com/archives/C091TCESWB1) in [the
`verl` Slack
workspace](https://join.slack.com/t/verl-project/shared_invite/zt-3855yhg8g-CTkqXu~hKojPCmo7k_yXTQ).

---------

Co-authored-by: hebiao064 <hebiaobuaa@gmail.com>
Authored by Blue Space on 2025-07-25 05:53:24 +08:00; committed by GitHub.
Commit 4879d619fc (parent bcd336fd46): 20 changed files with 165 additions and 41 deletions.

Diff excerpt from the Transformers-to-Megatron checkpoint converter for Qwen2.5-VL:

```diff
@@ -17,6 +17,7 @@ import argparse
 import os
 import warnings
 from contextlib import contextmanager
+from importlib.metadata import version
 from typing import Any, Callable, ContextManager, Optional
 
 import numpy as np
@@ -29,6 +30,7 @@ from megatron.core.dist_checkpointing.mapping import ShardedTensor
 from megatron.core.dist_checkpointing.serialization import StrictHandling
 from megatron.core.models.gpt.gpt_model import ModelType
 from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
+from packaging.version import Version
 from transformers import AutoConfig
 
 from verl.model_merger.megatron_model_merger import get_dynamic_pipeline_shards
```
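
The two new imports work as a pair: `importlib.metadata.version()` returns the installed distribution's version as a plain string, and `packaging.version.Version` parses it into an object that compares numerically. An illustrative snippet (not from the diff) showing why plain string comparison would be wrong:

```python
from importlib.metadata import version
from packaging.version import Version

# Lexicographic string comparison mishandles multi-digit components:
print("4.52.0" < "4.9.0")                    # True  -- wrong
print(Version("4.52.0") < Version("4.9.0"))  # False -- correct

# The gating pattern used in this file:
if Version(version("transformers")) < Version("4.52.0"):
    print("using the pre-4.52 transformers layout")
```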
```diff
@@ -204,7 +206,11 @@ def convert_checkpoint_from_transformers_to_megatron_qwen2_5_vl(hfmodel, mgmodel
     head_dim = hidden_size // num_attention_heads
 
     # 1. vision model
-    hfvision = hfmodel.visual
+    if Version(version("transformers")) < Version("4.52.0"):
+        print("Using transformers < 4.52 API to load vision model")
+        hfvision = hfmodel.visual
+    else:
+        hfvision = hfmodel.model.visual
     mgvision = mgmodel.vision_model
     vision_hidden_size = mgvision.config.hidden_size
     vision_num_query_groups = mgvision.config.num_query_groups
```
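
This hunk handles the transformers 4.52 restructuring of Qwen2.5-VL, which moved the vision tower from `hfmodel.visual` to `hfmodel.model.visual`. A version-agnostic attribute probe would be a possible alternative; the sketch below is hypothetical (`get_vision_tower` is not a helper in this file), and the explicit version gate the PR uses has the benefit of logging which layout was taken:

```python
# Hypothetical alternative (not what this PR does): probe the attribute
# instead of gating on the installed transformers version.
def get_vision_tower(hfmodel):
    # transformers < 4.52 exposes the tower at hfmodel.visual;
    # 4.52+ nests it under hfmodel.model.visual.
    if hasattr(hfmodel, "visual"):
        return hfmodel.visual
    return hfmodel.model.visual
```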
```diff
@@ -255,13 +261,18 @@ def convert_checkpoint_from_transformers_to_megatron_qwen2_5_vl(hfmodel, mgmodel
     copied_numel += safe_copy(hfprojector.mlp[2].weight, mgprojector.encoder.linear_fc2.weight)
     copied_numel += safe_copy(hfprojector.mlp[2].bias, mgprojector.encoder.linear_fc2.bias)
     n_params = sum([t.numel() for t in hfvision.state_dict().values()])
-    assert n_params == copied_numel
+    assert n_params == copied_numel, f"n_params={n_params} != copied_numel={copied_numel}"
 
     # 3. llm [just Qwen2]
-    hfllm = hfmodel.model
+    if Version(version("transformers")) < Version("4.52.0"):
+        print("Using transformers < 4.52 API to load llm")
+        hfllm = hfmodel.model
+    else:
+        hfllm = hfmodel.model.language_model
     mgllm = mgmodel.language_model
     copied_numel = 0
     copied_numel += safe_copy(hfllm.embed_tokens.weight, mgllm.embedding.word_embeddings.weight)
-    for mglayer, hflayer in zip(mgllm.decoder.layers, hfllm.layers, strict=True):
+    layermaps = zip(mgllm.decoder.layers, hfllm.layers, strict=True)
+    for mglayer, hflayer in layermaps:
         copied_numel += safe_copy(hflayer.input_layernorm.weight, mglayer.self_attention.linear_qkv.layer_norm_weight)
         q_proj_weight = hflayer.self_attn.q_proj.weight.view(num_query_groups, -1, head_dim, hidden_size)
```
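
The same 4.52 gate is applied to the language model (`hfmodel.model` vs `hfmodel.model.language_model`), and the strengthened asserts now report both totals on failure. The accounting relies on `safe_copy`, defined elsewhere in this file; a minimal sketch of its assumed contract (not the actual verl implementation):

```python
import torch

def safe_copy(src: torch.Tensor, dst: torch.Tensor) -> int:
    # Assumed contract: copy src into dst after a shape check and return the
    # element count so the caller can reconcile n_params with copied_numel.
    assert src.shape == dst.shape, f"shape mismatch: {src.shape} vs {dst.shape}"
    with torch.no_grad():
        dst.copy_(src)
    return src.numel()
```

Note that `zip(..., strict=True)` raises `ValueError` if the Megatron and HF layer lists differ in length; it requires Python 3.10+.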
```diff
@@ -289,7 +300,7 @@ def convert_checkpoint_from_transformers_to_megatron_qwen2_5_vl(hfmodel, mgmodel
     n_params = sum([t.numel() for t in hfllm.state_dict().values()])
-    assert n_params == copied_numel
+    assert n_params == copied_numel, f"n_params={n_params} != copied_numel={copied_numel}"
 
 
 @torch.inference_mode()
```