[docker] feat: upgrade to torch 2.7, sglang 0.4.8 (#2617)

### What does this PR do?

This is stage 1 of a staged dependency upgrade: bump the Docker images to torch 2.7 and sglang 0.4.8.

Stage 2: vllm 0.9.1
Stage 3: mcore 0.13.0

### Checklist Before Starting

- [ ] Search for similar PRs. Paste at least one query link here: ...
- [ ] Format the PR title as `[{modules}] {type}: {description}` (this will be checked by the CI).
  - `{modules}` include `fsdp`, `megatron`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data`.
  - If this PR involves multiple modules, separate them with `,`, e.g. `[megatron, fsdp, doc]`.
  - `{type}` is one of `feat`, `fix`, `refactor`, `chore`, `test`.
  - If this PR breaks any API (CLI arguments, config, function signature, etc.), add `[BREAKING]` to the beginning of the title.
    - Example: `[BREAKING][fsdp, megatron] feat: dynamic batching`

### Test

> For changes that cannot be tested by CI (e.g., algorithm implementation, new model support), validate by experiment(s) and show results such as training curve plots or evaluation results.

### API and Usage Example

> Demonstrate how the API changes, if any, and provide usage example(s) if possible.

```python
# Add code snippet or script demonstrating how to use this
```
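
As a hedged illustration only (this snippet is not part of the PR's change set), one way to sanity-check the upgraded stack inside the new image:

```python
# Sketch, not from this PR: confirm the upgraded stack inside the new image.
# Assumes torch and sglang are installed; exact patch versions may vary by tag.
from importlib.metadata import version

import torch

print(torch.__version__)   # expected to start with "2.7"
print(version("sglang"))   # expected "0.4.8"
```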

### Design & Code Changes

> Demonstrate the high-level design if this PR is complex, and list the specific changes.

### Checklist Before Submitting

> [!IMPORTANT]
> Please check all the following items before requesting a review; otherwise the reviewer might deprioritize this PR.

- [ ] Read the [Contribute
Guide](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md).
- [ ] Apply [pre-commit
checks](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md#code-linting-and-formatting):
`pre-commit install && pre-commit run --all-files --show-diff-on-failure
--color=always`
- [ ] Add / Update [the
documentation](https://github.com/volcengine/verl/tree/main/docs).
- [ ] Add unit or end-to-end test(s) to [the CI
workflow](https://github.com/volcengine/verl/tree/main/.github/workflows)
to cover all the code. If not feasible, explain why: ...
- [ ] Once your PR is ready for CI, send a message in [the `ci-request`
channel](https://verl-project.slack.com/archives/C091TCESWB1) in [the
`verl` Slack
workspace](https://join.slack.com/t/verl-project/shared_invite/zt-3855yhg8g-CTkqXu~hKojPCmo7k_yXTQ).

---------

Co-authored-by: hebiao064 <hebiaobuaa@gmail.com>
Authored by Blue Space on 2025-07-25 05:53:24 +08:00; committed by GitHub.
Commit 4879d619fc (parent bcd336fd46): 20 changed files with 165 additions and 41 deletions.

Diff excerpt from the Transformers-to-Megatron checkpoint converter for Qwen2.5-VL:

```diff
@@ -17,6 +17,7 @@ import argparse
 import os
 import warnings
 from contextlib import contextmanager
+from importlib.metadata import version
 from typing import Any, Callable, ContextManager, Optional
 
 import numpy as np
@@ -29,6 +30,7 @@ from megatron.core.dist_checkpointing.mapping import ShardedTensor
 from megatron.core.dist_checkpointing.serialization import StrictHandling
 from megatron.core.models.gpt.gpt_model import ModelType
 from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
+from packaging.version import Version
 from transformers import AutoConfig
 
 from verl.model_merger.megatron_model_merger import get_dynamic_pipeline_shards
```
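
The two new imports work as a pair: `importlib.metadata.version()` returns the installed distribution's version as a plain string, and `packaging.version.Version` parses it into an object that compares numerically. An illustrative snippet (not from the diff) showing why plain string comparison would be wrong:

```python
from importlib.metadata import version
from packaging.version import Version

# Lexicographic string comparison mishandles multi-digit components:
print("4.52.0" < "4.9.0")                    # True  -- wrong
print(Version("4.52.0") < Version("4.9.0"))  # False -- correct

# The gating pattern used in this file:
if Version(version("transformers")) < Version("4.52.0"):
    print("using the pre-4.52 transformers layout")
```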
```diff
@@ -204,7 +206,11 @@ def convert_checkpoint_from_transformers_to_megatron_qwen2_5_vl(hfmodel, mgmodel
     head_dim = hidden_size // num_attention_heads
 
     # 1. vision model
-    hfvision = hfmodel.visual
+    if Version(version("transformers")) < Version("4.52.0"):
+        print("Using transformers < 4.52 API to load vision model")
+        hfvision = hfmodel.visual
+    else:
+        hfvision = hfmodel.model.visual
     mgvision = mgmodel.vision_model
     vision_hidden_size = mgvision.config.hidden_size
     vision_num_query_groups = mgvision.config.num_query_groups
```
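
This hunk handles the transformers 4.52 restructuring of Qwen2.5-VL, which moved the vision tower from `hfmodel.visual` to `hfmodel.model.visual`. A version-agnostic attribute probe would be a possible alternative; the sketch below is hypothetical (`get_vision_tower` is not a helper in this file), and the explicit version gate the PR uses has the benefit of logging which layout was taken:

```python
# Hypothetical alternative (not what this PR does): probe the attribute
# instead of gating on the installed transformers version.
def get_vision_tower(hfmodel):
    # transformers < 4.52 exposes the tower at hfmodel.visual;
    # 4.52+ nests it under hfmodel.model.visual.
    if hasattr(hfmodel, "visual"):
        return hfmodel.visual
    return hfmodel.model.visual
```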
```diff
@@ -255,13 +261,18 @@ def convert_checkpoint_from_transformers_to_megatron_qwen2_5_vl(hfmodel, mgmodel
     copied_numel += safe_copy(hfprojector.mlp[2].weight, mgprojector.encoder.linear_fc2.weight)
     copied_numel += safe_copy(hfprojector.mlp[2].bias, mgprojector.encoder.linear_fc2.bias)
     n_params = sum([t.numel() for t in hfvision.state_dict().values()])
-    assert n_params == copied_numel
+    assert n_params == copied_numel, f"n_params={n_params} != copied_numel={copied_numel}"
 
     # 3. llm [just Qwen2]
-    hfllm = hfmodel.model
+    if Version(version("transformers")) < Version("4.52.0"):
+        print("Using transformers < 4.52 API to load llm")
+        hfllm = hfmodel.model
+    else:
+        hfllm = hfmodel.model.language_model
     mgllm = mgmodel.language_model
     copied_numel = 0
     copied_numel += safe_copy(hfllm.embed_tokens.weight, mgllm.embedding.word_embeddings.weight)
-    for mglayer, hflayer in zip(mgllm.decoder.layers, hfllm.layers, strict=True):
+    layermaps = zip(mgllm.decoder.layers, hfllm.layers, strict=True)
+    for mglayer, hflayer in layermaps:
         copied_numel += safe_copy(hflayer.input_layernorm.weight, mglayer.self_attention.linear_qkv.layer_norm_weight)
         q_proj_weight = hflayer.self_attn.q_proj.weight.view(num_query_groups, -1, head_dim, hidden_size)
```
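
The same 4.52 gate is applied to the language model (`hfmodel.model` vs `hfmodel.model.language_model`), and the strengthened asserts now report both totals on failure. The accounting relies on `safe_copy`, defined elsewhere in this file; a minimal sketch of its assumed contract (not the actual verl implementation):

```python
import torch

def safe_copy(src: torch.Tensor, dst: torch.Tensor) -> int:
    # Assumed contract: copy src into dst after a shape check and return the
    # element count so the caller can reconcile n_params with copied_numel.
    assert src.shape == dst.shape, f"shape mismatch: {src.shape} vs {dst.shape}"
    with torch.no_grad():
        dst.copy_(src)
    return src.numel()
```

Note that `zip(..., strict=True)` raises `ValueError` if the Megatron and HF layer lists differ in length; it requires Python 3.10+.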
```diff
@@ -289,7 +300,7 @@ def convert_checkpoint_from_transformers_to_megatron_qwen2_5_vl(hfmodel, mgmodel
     n_params = sum([t.numel() for t in hfllm.state_dict().values()])
-    assert n_params == copied_numel
+    assert n_params == copied_numel, f"n_params={n_params} != copied_numel={copied_numel}"
 
 
 @torch.inference_mode()
```