Mirror of https://github.com/vllm-project/vllm.git, synced 2025-10-20 23:03:52 +08:00
Update PyTorch to 2.8.0 (#20358)
Signed-off-by: Huy Do <huydhn@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
@@ -462,8 +462,8 @@ steps:
   - tests/quantization
   commands:
     # temporary install here since we need nightly, will move to requirements/test.in
-    # after torchao 0.12 release
-    - pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
+    # after torchao 0.12 release, and pin a working version of torchao nightly here
+    - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
     - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization

 - label: LM Eval Small Models # 53min
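Pinning a dated torchao nightly (rather than whatever `--pre` resolves to on a given day) keeps this CI step reproducible against PyTorch 2.8.0. A minimal sketch of the kind of guard such a step implies, assuming only that the pinned build is what got installed (the assertion is illustrative, not part of the pipeline):

```python
# Illustrative check, not part of the pipeline: fail fast if the installed
# torchao build is not the pinned nightly.
from importlib.metadata import version

installed = version("torchao")
assert installed == "0.13.0.dev20250814", (
    f"expected the pinned torchao nightly, got {installed}")
```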
@@ -45,8 +45,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 # requirements.txt files and should be kept consistent. The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.7.1")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")

 #
 # Try to find python package with an executable that exactly matches
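`TORCH_SUPPORTED_VERSION_CUDA` and `TORCH_SUPPORTED_VERSION_ROCM` now both track 2.8.0, so the CMake build and the Python requirements below have to move in lockstep. A rough sketch of the comparison this implies, assuming a `packaging`-style version check; the constant here mirrors the CMake value and is not an actual vLLM symbol:

```python
# Sketch: compare the installed torch against the supported version,
# dropping the local build tag (e.g. "+cu128") before comparing.
from importlib.metadata import version
from packaging.version import Version

TORCH_SUPPORTED_VERSION_CUDA = Version("2.8.0")  # mirrors CMakeLists.txt

installed = Version(version("torch").split("+")[0])
if installed != TORCH_SUPPORTED_VERSION_CUDA:
    raise RuntimeError(
        f"torch {installed} != supported {TORCH_SUPPORTED_VERSION_CUDA}")
```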
@@ -6,7 +6,7 @@ requires = [
     "packaging>=24.2",
     "setuptools>=77.0.3,<80.0.0",
     "setuptools-scm>=8.0",
-    "torch == 2.7.1",
+    "torch == 2.8.0",
     "wheel",
     "jinja2",
 ]
@@ -4,7 +4,8 @@ ninja
 packaging>=24.2
 setuptools>=77.0.3,<80.0.0
 setuptools-scm>=8
-torch==2.7.1
+torch==2.8.0
 wheel
 jinja2>=3.1.6
 regex
+build
@@ -9,17 +9,16 @@ packaging>=24.2
 setuptools>=77.0.3,<80.0.0
 --extra-index-url https://download.pytorch.org/whl/cpu
 torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
-torch==2.7.0; platform_system == "Darwin"
-torch==2.7.0; platform_machine == "ppc64le"
-torch==2.6.0; platform_machine == "aarch64" # for arm64 CPUs, torch 2.7.0 has a issue: https://github.com/vllm-project/vllm/issues/17960
+torch==2.8.0; platform_system == "Darwin"
+torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64"

 # required for the image processor of minicpm-o-2_6, this must be updated alongside torch
 torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
-torchaudio==2.7.0; platform_machine == "ppc64le"
+torchaudio==2.8.0; platform_machine == "ppc64le"

 # required for the image processor of phi3v, this must be updated alongside torch
 torchvision; platform_machine != "ppc64le" and platform_machine != "s390x"
-torchvision==0.22.0; platform_machine == "ppc64le"
+torchvision==0.23.0; platform_machine == "ppc64le"
 datasets # for benchmark scripts

 # Intel Extension for PyTorch, only for x86_64 CPUs
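The `; platform_machine == ...` suffixes are PEP 508 environment markers, which is how a single 2.8.0 pin can now cover Darwin, ppc64le, and aarch64 (the separate aarch64 line, previously held at 2.6.0 for a torch 2.7.0 bug, is folded in). A small sketch of how such a marker evaluates, using the `packaging` library that is already a build dependency; the explicit environment dicts are only for illustration, since pip normally evaluates against the running interpreter:

```python
from packaging.markers import Marker

marker = Marker(
    'platform_machine == "ppc64le" or platform_machine == "aarch64"')

print(marker.evaluate({"platform_machine": "aarch64"}))  # True
print(marker.evaluate({"platform_machine": "x86_64"}))   # False
```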
@@ -6,9 +6,9 @@ numba == 0.61.2; python_version > '3.9'

 # Dependencies for NVIDIA GPUs
 ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
-torch==2.7.1
-torchaudio==2.7.1
+torch==2.8.0
+torchaudio==2.8.0
 # These must be updated alongside torch
-torchvision==0.22.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
-# https://github.com/facebookresearch/xformers/releases/tag/v0.0.31
-xformers==0.0.31; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7
+torchvision==0.23.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+# https://github.com/facebookresearch/xformers/releases/tag/v0.0.32.post1
+xformers==0.0.32.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.8
@@ -1,10 +1,10 @@
 # Common dependencies
 -r common.txt

---extra-index-url https://download.pytorch.org/whl/rocm6.2.4
-torch==2.7.0
-torchvision==0.22.0
-torchaudio==2.7.0
+--extra-index-url https://download.pytorch.org/whl/rocm6.3
+torch==2.8.0
+torchvision==0.23.0
+torchaudio==2.8.0

 triton==3.3.0
 cmake>=3.26.1,<4
@@ -22,9 +22,9 @@ sentence-transformers # required for embedding tests
 soundfile # required for audio tests
 jiwer # required for audio tests
 timm >=1.0.17 # required for internvl and gemma3n-mm test
-torch==2.7.1
-torchaudio==2.7.1
-torchvision==0.22.1
+torch==2.8.0
+torchaudio==2.8.0
+torchvision==0.23.0
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
 mistral_common[image,audio] >= 1.8.2 # required for voxtral test
@@ -541,42 +541,42 @@ numpy==1.26.4
     #   tritonclient
     #   vocos
     #   xarray
-nvidia-cublas-cu12==12.8.3.14
+nvidia-cublas-cu12==12.8.4.1
     # via
     #   nvidia-cudnn-cu12
     #   nvidia-cusolver-cu12
     #   torch
-nvidia-cuda-cupti-cu12==12.8.57
+nvidia-cuda-cupti-cu12==12.8.90
     # via torch
-nvidia-cuda-nvrtc-cu12==12.8.61
+nvidia-cuda-nvrtc-cu12==12.8.93
     # via torch
-nvidia-cuda-runtime-cu12==12.8.57
+nvidia-cuda-runtime-cu12==12.8.90
     # via torch
-nvidia-cudnn-cu12==9.7.1.26
+nvidia-cudnn-cu12==9.10.2.21
     # via torch
-nvidia-cufft-cu12==11.3.3.41
+nvidia-cufft-cu12==11.3.3.83
     # via torch
-nvidia-cufile-cu12==1.13.0.11
+nvidia-cufile-cu12==1.13.1.3
     # via torch
-nvidia-curand-cu12==10.3.9.55
+nvidia-curand-cu12==10.3.9.90
     # via torch
-nvidia-cusolver-cu12==11.7.2.55
+nvidia-cusolver-cu12==11.7.3.90
     # via torch
-nvidia-cusparse-cu12==12.5.7.53
+nvidia-cusparse-cu12==12.5.8.93
     # via
     #   nvidia-cusolver-cu12
     #   torch
-nvidia-cusparselt-cu12==0.6.3
+nvidia-cusparselt-cu12==0.7.1
     # via torch
-nvidia-nccl-cu12==2.26.2
+nvidia-nccl-cu12==2.27.3
     # via torch
-nvidia-nvjitlink-cu12==12.8.61
+nvidia-nvjitlink-cu12==12.8.93
     # via
     #   nvidia-cufft-cu12
     #   nvidia-cusolver-cu12
     #   nvidia-cusparse-cu12
     #   torch
-nvidia-nvtx-cu12==12.8.55
+nvidia-nvtx-cu12==12.8.90
     # via torch
 omegaconf==2.3.0
     # via
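These `nvidia-*-cu12` bumps follow mechanically from the torch 2.8.0+cu128 wheel: the lock file resolves them from torch's own dependency metadata rather than from hand-picked pins. A quick sketch of how to inspect that metadata for whichever torch wheel is installed (the exact pins printed depend on that build):

```python
# Sketch: list the CUDA runtime wheels the installed torch wheel declares.
from importlib.metadata import requires

for req in sorted(r for r in (requires("torch") or [])
                  if r.startswith("nvidia-")):
    print(req)
```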
@@ -1069,7 +1069,7 @@ tomli==2.2.1
     # via schemathesis
 tomli-w==1.2.0
     # via schemathesis
-torch==2.7.1+cu128
+torch==2.8.0+cu128
     # via
     #   -r requirements/test.in
     #   accelerate
@@ -1098,7 +1098,7 @@ torch==2.7.1+cu128
     #   torchvision
     #   vector-quantize-pytorch
     #   vocos
-torchaudio==2.7.1+cu128
+torchaudio==2.8.0+cu128
     # via
     #   -r requirements/test.in
     #   encodec
@@ -1111,7 +1111,7 @@ torchmetrics==1.7.4
     #   pytorch-lightning
     #   terratorch
     #   torchgeo
-torchvision==0.22.1+cu128
+torchvision==0.23.0+cu128
     # via
     #   -r requirements/test.in
     #   lightly
@@ -1152,7 +1152,7 @@ transformers==4.55.2
     #   transformers-stream-generator
 transformers-stream-generator==0.0.5
     # via -r requirements/test.in
-triton==3.3.1
+triton==3.4.0
     # via torch
 tritonclient==2.51.0
     # via
@@ -292,7 +292,7 @@ SP_TEST_MODELS = [
     # TODO support other models
     # [LANGUAGE GENERATION]
     "meta-llama/Llama-3.2-1B-Instruct",
-    "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"
+    "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
 ]


@@ -87,6 +87,9 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files):
 @multi_gpu_test(num_gpus=4)
 @create_new_process_for_each_test()
 def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
+    # https://github.com/NVIDIA/nccl/issues/1790, set a lower value for
+    # gpu_memory_utilization here because NCCL >= 2.26.3 seems to use
+    # more GPU memory causing vLLM to OOM
     llm = vllm.LLM(MODEL_PATH,
                    max_model_len=1024,
                    enable_lora=True,
@@ -95,7 +98,8 @@ def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
                    tensor_parallel_size=4,
                    trust_remote_code=True,
                    fully_sharded_loras=True,
-                   enable_chunked_prefill=True)
+                   enable_chunked_prefill=True,
+                   gpu_memory_utilization=0.85)
     output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
     for i in range(len(EXPECTED_LORA_OUTPUT)):
         assert output1[i] == EXPECTED_LORA_OUTPUT[i]
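`gpu_memory_utilization` is the fraction of total device memory vLLM budgets for itself, so dropping it to 0.85 leaves roughly 15% of the card for allocations vLLM does not control, such as the larger buffers NCCL >= 2.26.3 appears to take. A rough sketch of the arithmetic, assuming a CUDA device is present (vLLM's real accounting is more involved than this):

```python
import torch

free_bytes, total_bytes = torch.cuda.mem_get_info()
budget = int(total_bytes * 0.85)    # what a 0.85 setting targets
headroom = total_bytes - budget     # left for NCCL, driver, etc.
print(f"budget {budget / 2**30:.1f} GiB, "
      f"headroom {headroom / 2**30:.1f} GiB")
```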
|
@ -789,6 +789,7 @@ def get_kernel_options(query, block_m, block_n,
|
|||||||
device_props = torch.cuda.get_device_properties()
|
device_props = torch.cuda.get_device_properties()
|
||||||
max_shared_memory = device_props.shared_memory_per_block_optin
|
max_shared_memory = device_props.shared_memory_per_block_optin
|
||||||
if max_shared_memory < 144 * 1024:
|
if max_shared_memory < 144 * 1024:
|
||||||
kernel_options["BLOCK_M"] = 32
|
kernel_options["BLOCK_M"] = kernel_options["BLOCK_M"] // 2
|
||||||
kernel_options["BLOCK_N"] = 32
|
kernel_options["BLOCK_N"] = kernel_options["BLOCK_N"] // 2
|
||||||
|
|
||||||
return kernel_options
|
return kernel_options
|
||||||
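Halving `BLOCK_M`/`BLOCK_N` scales the fallback with whatever tile size was chosen upstream, instead of clamping every low-shared-memory GPU to a fixed 32x32 tile. Since an attention block's shared-memory footprint grows with tile area, halving both sides cuts it to roughly a quarter. A back-of-the-envelope sketch; the footprint model below is a simplification, not the kernel's actual accounting:

```python
# Rough shared-memory estimate for an attention tile: Q (M x D), K and V
# (N x D each), plus an M x N score tile, all in fp16 (2 bytes/element).
def approx_smem_bytes(block_m: int, block_n: int,
                      head_dim: int = 128, elem_bytes: int = 2) -> int:
    return elem_bytes * (block_m * head_dim
                         + 2 * block_n * head_dim
                         + block_m * block_n)

for m, n in [(128, 128), (64, 64)]:
    print(f"BLOCK_M={m}, BLOCK_N={n}: "
          f"{approx_smem_bytes(m, n) / 1024:.0f} KiB")
```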