[Misc] Modify the organization of GLM series (#22171)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Author: Jee Jee Li
Date: 2025-08-04 14:51:20 +08:00
Committed by: GitHub
Parent: 8ecb3e9e93
Commit: a7b8788d2c
16 changed files with 31 additions and 31 deletions
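The commit only swaps the Hugging Face organization prefix in model identifiers; nothing else about how these checkpoints are loaded changes. As a minimal sketch (not part of this commit, and assuming the `zai-org` repositories are available on the Hugging Face Hub), the same offline-inference call simply uses the new prefix:

# Minimal sketch: loading a GLM checkpoint under the new organization name.
# Assumes Hub access and that the zai-org repo exists; ChatGLM3 still needs
# trust_remote_code=True.
from vllm import LLM, SamplingParams

llm = LLM(model="zai-org/chatglm3-6b",  # previously "THUDM/chatglm3-6b"
          trust_remote_code=True)
outputs = llm.generate(["What is GLM-4?"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)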


@@ -328,7 +328,7 @@ th {
| `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ | ✅︎ |
| `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ | |
| `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | | |
-| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | ✅︎ |
| `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ | ✅︎ |
@@ -348,8 +348,8 @@ th {
| `Gemma2ForCausalLM` | Gemma 2 | `google/gemma-2-9b`, `google/gemma-2-27b`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Gemma3nForConditionalGeneration` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ |
-| `GlmForCausalLM` | GLM-4 | `THUDM/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Glm4ForCausalLM` | GLM-4-0414 | `THUDM/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `GlmForCausalLM` | GLM-4 | `zai-org/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `Glm4ForCausalLM` | GLM-4-0414 | `zai-org/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | ✅︎ | ✅︎ |
| `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ | ✅︎ |
@@ -589,8 +589,8 @@ See [this page](generative_models.md) for more information on how to use generat
| `Florence2ForConditionalGeneration` | Florence-2 | T + I | `microsoft/Florence-2-base`, `microsoft/Florence-2-large`, etc. | | | |
| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ | ✅︎ |
| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ |
-| `GLM4VForCausalLM`<sup>^</sup> | GLM-4V | T + I | `THUDM/glm-4v-9b`, `THUDM/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + I<sup>E+</sup> + V<sup>E+</sup> | `THUDM/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `GLM4VForCausalLM`<sup>^</sup> | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Glm4MoeForCausalLM` | GLM-4.5 | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Glm4v_moeForConditionalGeneration` | GLM-4.5V | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.5V-Air`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |


@@ -221,7 +221,7 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
# GLM-4v
def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "THUDM/glm-4v-9b"
model_name = "zai-org/glm-4v-9b"
engine_args = EngineArgs(
model=model_name,
@@ -250,7 +250,7 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
# GLM-4.1V
def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData:
model_name = "THUDM/GLM-4.1V-9B-Thinking"
model_name = "zai-org/GLM-4.1V-9B-Thinking"
engine_args = EngineArgs(
model=model_name,


@@ -154,7 +154,7 @@ TEXT_GENERATION_MODELS = {
"baichuan-inc/Baichuan-7B": PPTestSettings.fast(),
"baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(),
"bigscience/bloomz-1b1": PPTestSettings.fast(),
"THUDM/chatglm3-6b": PPTestSettings.fast(),
"zai-org/chatglm3-6b": PPTestSettings.fast(),
"CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(load_format="dummy"),
"databricks/dbrx-instruct": PPTestSettings.fast(load_format="dummy"),
"Deci/DeciLM-7B-instruct": PPTestSettings.fast(),
@@ -224,7 +224,7 @@ MULTIMODAL_MODELS = {
"Salesforce/blip2-opt-6.7b": PPTestSettings.fast(),
"facebook/chameleon-7b": PPTestSettings.fast(),
"adept/fuyu-8b": PPTestSettings.fast(),
"THUDM/glm-4v-9b": PPTestSettings.fast(),
"zai-org/glm-4v-9b": PPTestSettings.fast(),
"OpenGVLab/InternVL2-1B": PPTestSettings.fast(),
"llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(),
"llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(),


@@ -14,7 +14,7 @@ from vllm.lora.request import LoRARequest
from vllm.sampling_params import SamplingParams
from vllm.utils import merge_async_iterators
MODEL_PATH = "THUDM/chatglm3-6b"
MODEL_PATH = "zai-org/chatglm3-6b"
LORA_RANK = 64
DEFAULT_MAX_LORAS = 4 * 3


@@ -6,7 +6,7 @@ from vllm.lora.request import LoRARequest
from ..utils import create_new_process_for_each_test, multi_gpu_test
MODEL_PATH = "THUDM/chatglm3-6b"
MODEL_PATH = "zai-org/chatglm3-6b"
PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501


@@ -53,7 +53,7 @@ AITER_MODEL_LIST = [
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
pytest.param(
"THUDM/chatglm3-6b", # chatglm (text-only)
"zai-org/chatglm3-6b", # chatglm (text-only)
),
pytest.param(
"meta-llama/Llama-3.2-1B-Instruct", # llama


@@ -355,7 +355,7 @@ VLM_TEST_SETTINGS = {
num_logprobs=10,
),
"glm4v": VLMTestInfo(
models=["THUDM/glm-4v-9b"],
models=["zai-org/glm-4v-9b"],
test_type=VLMTestType.IMAGE,
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts({
@@ -374,7 +374,7 @@ VLM_TEST_SETTINGS = {
marks=[large_gpu_mark(min_gb=32)],
),
"glm4_1v": VLMTestInfo(
models=["THUDM/GLM-4.1V-9B-Thinking"],
models=["zai-org/GLM-4.1V-9B-Thinking"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501
img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>", # noqa: E501
@@ -388,7 +388,7 @@ VLM_TEST_SETTINGS = {
marks=[large_gpu_mark(min_gb=32)],
),
"glm4_1v-video": VLMTestInfo(
models=["THUDM/GLM-4.1V-9B-Thinking"],
models=["zai-org/GLM-4.1V-9B-Thinking"],
# GLM-4.1V requires video metadata to be included in the input
test_type=VLMTestType.CUSTOM_INPUTS,
max_model_len=4096,


@@ -271,8 +271,8 @@ def _test_processing_correctness_one(
"microsoft/Florence-2-base",
"adept/fuyu-8b",
"google/gemma-3-4b-it",
"THUDM/glm-4v-9b",
"THUDM/GLM-4.1V-9B-Thinking",
"zai-org/glm-4v-9b",
"zai-org/GLM-4.1V-9B-Thinking",
"ibm-granite/granite-speech-3.3-2b",
"h2oai/h2ovl-mississippi-800m",
"internlm/Intern-S1",


@@ -9,7 +9,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["THUDM/GLM-4.1V-9B-Thinking"])
@pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"])
@pytest.mark.parametrize("expected_toks_per_frame", [299])
@pytest.mark.parametrize("num_frames", [32, 128])
@pytest.mark.parametrize("fps, expected_grid_t", [(1, 5), (2, 10)])


@@ -153,7 +153,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
extras={"tiny": "hmellor/tiny-random-BambaForCausalLM"}), # noqa: E501
"BloomForCausalLM": _HfExamplesInfo("bigscience/bloom-560m",
{"1b": "bigscience/bloomz-1b1"}),
"ChatGLMModel": _HfExamplesInfo("THUDM/chatglm3-6b",
"ChatGLMModel": _HfExamplesInfo("zai-org/chatglm3-6b",
trust_remote_code=True,
max_transformers_version="4.48"),
"ChatGLMForConditionalGeneration": _HfExamplesInfo("thu-coai/ShieldLM-6B-chatglm3", # noqa: E501
@@ -187,8 +187,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Gemma3ForCausalLM": _HfExamplesInfo("google/gemma-3-1b-it"),
"Gemma3nForConditionalGeneration": _HfExamplesInfo("google/gemma-3n-E2B-it", # noqa: E501
min_transformers_version="4.53"),
"GlmForCausalLM": _HfExamplesInfo("THUDM/glm-4-9b-chat-hf"),
"Glm4ForCausalLM": _HfExamplesInfo("THUDM/GLM-4-9B-0414"),
"GlmForCausalLM": _HfExamplesInfo("zai-org/glm-4-9b-chat-hf"),
"Glm4ForCausalLM": _HfExamplesInfo("zai-org/GLM-4-9B-0414"),
"Glm4MoeForCausalLM": _HfExamplesInfo("zai-org/GLM-4.5",
min_transformers_version="4.54"), # noqa: E501
"GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2",
@@ -380,10 +380,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"),
"Gemma3ForConditionalGeneration": _HfExamplesInfo("google/gemma-3-4b-it"),
"GraniteSpeechForConditionalGeneration": _HfExamplesInfo("ibm-granite/granite-speech-3.3-2b"), # noqa: E501
"GLM4VForCausalLM": _HfExamplesInfo("THUDM/glm-4v-9b",
"GLM4VForCausalLM": _HfExamplesInfo("zai-org/glm-4v-9b",
trust_remote_code=True,
hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501
"Glm4vForConditionalGeneration": _HfExamplesInfo("THUDM/GLM-4.1V-9B-Thinking"), # noqa: E501
"Glm4vForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.1V-9B-Thinking"), # noqa: E501
"Glm4v_moeForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.5V-Air",
is_available_online=False), # noqa: E501
"H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m",


@@ -10,7 +10,7 @@ from vllm.transformers_utils.tokenizer import (AnyTokenizer,
get_cached_tokenizer)
@pytest.mark.parametrize("model_id", ["gpt2", "THUDM/chatglm3-6b"])
@pytest.mark.parametrize("model_id", ["gpt2", "zai-org/chatglm3-6b"])
def test_cached_tokenizer(model_id: str):
reference_tokenizer = AutoTokenizer.from_pretrained(model_id,
trust_remote_code=True)
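The test above compares a cached wrapper against the reference tokenizer; roughly, the pattern is as follows (a sketch under the assumption that `get_cached_tokenizer` wraps a tokenizer and memoizes attributes that are slow to recompute, such as the special-token lists):

# Sketch: the cached tokenizer should behave identically to the original.
from transformers import AutoTokenizer
from vllm.transformers_utils.tokenizer import get_cached_tokenizer

reference = AutoTokenizer.from_pretrained("zai-org/chatglm3-6b",
                                          trust_remote_code=True)
cached = get_cached_tokenizer(AutoTokenizer.from_pretrained(
    "zai-org/chatglm3-6b", trust_remote_code=True))

assert cached.encode("Hello GLM") == reference.encode("Hello GLM")
assert cached.all_special_tokens == reference.all_special_tokens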


@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from
-# https://github.com/THUDM/ChatGLM2-6B
+# https://github.com/zai-org/ChatGLM2-6B
"""Inference-only ChatGLM model compatible with THUDM weights."""
import json
from collections.abc import Iterable
@@ -86,10 +86,10 @@ class GLMAttention(nn.Module):
prefix=f"{prefix}.dense",
)
-# https://huggingface.co/THUDM/chatglm3-6b-32k/blob/e210410255278dd9d74463cf396ba559c0ef801c/modeling_chatglm.py#L141
+# https://huggingface.co/zai-org/chatglm3-6b-32k/blob/e210410255278dd9d74463cf396ba559c0ef801c/modeling_chatglm.py#L141
rope_ratio = getattr(config, "rope_ratio", 1.0)
max_positions = getattr(config, "seq_length", 8192)
-# NOTE: THUDM/cogagent-9b-20241220 uses original_rope=False,
+# NOTE: zai-org/cogagent-9b-20241220 uses original_rope=False,
# which is equivalent to is_neox_style=True
is_neox_style = not config.original_rope
self.rotary_emb = get_rope(


@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from
-# https://github.com/THUDM/CogAgent
+# https://github.com/zai-org/CogAgent
"""Inference-only CogAgent model compatible with THUDM weights."""
from argparse import Namespace
from collections.abc import Mapping, Sequence


@@ -118,7 +118,7 @@ MODELS_ON_S3 = [
"stabilityai/stablelm-zephyr-3b",
"state-spaces/mamba-130m-hf",
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
"THUDM/glm-4v-9b",
"zai-org/glm-4v-9b",
"TIGER-Lab/Mantis-8B-siglip-llama3",
"TIGER-Lab/VLM2Vec-Full",
"tiiuae/falcon-40b",


@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from
-# https://github.com/THUDM/ChatGLM2-6B
+# https://github.com/zai-org/ChatGLM2-6B
from transformers import PretrainedConfig


@@ -271,7 +271,7 @@ def get_tokenizer(
}
tokenizer.add_special_tokens(special_tokens_map)
-# NOTE: We can remove this after https://github.com/THUDM/ChatGLM3/issues/1324
+# NOTE: We can remove this after https://github.com/zai-org/ChatGLM3/issues/1324
if type(tokenizer).__name__ in ("ChatGLMTokenizer",
"ChatGLM4Tokenizer"):
assert isinstance(tokenizer, PreTrainedTokenizer)