[CPU] Fix torch version in x86 CPU backend (#19258)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
@@ -131,16 +131,19 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   // Quantization
 #ifdef __AVX512F__
+  at::Tag stride_tag = at::Tag::needs_fixed_stride_order;
   // Compute int8 quantized tensor for given scaling factor.
   ops.def(
       "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale,"
-      "Tensor? azp) -> ()");
+      "Tensor? azp) -> ()",
+      {stride_tag});
   ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant);
 
   // Compute int8 quantized tensor and scaling factor
   ops.def(
       "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, "
-      "Tensor!? azp) -> ()");
+      "Tensor!? azp) -> ()",
+      {stride_tag});
   ops.impl("dynamic_scaled_int8_quant", torch::kCPU,
            &dynamic_scaled_int8_quant);
 
   // W8A8 GEMM, supporting symmetric per-tensor or per-row/column
@@ -148,7 +151,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.def(
       "cutlass_scaled_mm(Tensor! out, Tensor a,"
       "                  Tensor b, Tensor a_scales,"
-      "                  Tensor b_scales, Tensor? bias) -> ()");
+      "                  Tensor b_scales, Tensor? bias) -> ()",
+      {stride_tag});
   ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm);
   // w8a8 GEMM, supporting asymmetric per-tensor or per-row/column
   // quantization.
@@ -156,7 +160,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "cutlass_scaled_mm_azp(Tensor! out, Tensor a,"
       "                  Tensor b, Tensor a_scales,"
       "                  Tensor b_scales, Tensor azp_adj,"
-      "                  Tensor? azp, Tensor? bias) -> ()");
+      "                  Tensor? azp, Tensor? bias) -> ()",
+      {stride_tag});
   ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp);
 #elif defined(__powerpc64__)
   // Compute int8 quantized tensor for given scaling factor.
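The `needs_fixed_stride_order` tag added throughout these hunks tells torch.compile that the op must receive its inputs in the stride order the schema was registered with, so the compiler cannot silently permute memory layouts around it. A minimal Python-side sketch of the same registration pattern, under a hypothetical `demo` namespace (vLLM registers the real ops in C++ as shown above):

import torch

# The schema carries the same tag the C++ hunks attach via at::Tag.
torch.library.define(
    "demo::static_scaled_int8_quant",
    "(Tensor! out, Tensor input, Tensor scale, Tensor? azp) -> ()",
    tags=(torch.Tag.needs_fixed_stride_order,),
)

@torch.library.impl("demo::static_scaled_int8_quant", "cpu")
def _static_scaled_int8_quant_cpu(out, input, scale, azp):
    # Reference semantics only: scale, add the optional zero point, clamp to int8.
    q = torch.round(input / scale)
    if azp is not None:
        q = q + azp
    out.copy_(q.clamp_(-128, 127).to(torch.int8))

x = torch.randn(4, 8)
out = torch.empty_like(x, dtype=torch.int8)
torch.ops.demo.static_scaled_int8_quant(out, x, torch.tensor(0.5), None)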
@@ -66,7 +66,7 @@ ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
 WORKDIR /workspace/vllm
 
 RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
+    --mount=type=bind,src=requirements/cpu-build.txt,target=requirements/build.txt \
     uv pip install -r requirements/build.txt
 
 COPY . .
@@ -79,6 +79,22 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,source=.git,target=.git \
     VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel
 
+######################### TEST DEPS #########################
+FROM base AS vllm-test-deps
+
+WORKDIR /workspace/vllm
+
+RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
+    cp requirements/test.in requirements/cpu-test.in && \
+    sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
+    sed -i 's/torch==.*/torch==2.6.0/g' requirements/cpu-test.in && \
+    sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
+    sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
+    uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install -r requirements/cpu-test.txt
+
 ######################### DEV IMAGE #########################
 FROM vllm-build AS vllm-dev
 
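The sed pipeline in the new vllm-test-deps stage derives a CPU-compatible lock input from the shared requirements/test.in: it drops mamba_ssm, re-pins torch to 2.6.0, and unpins torchaudio/torchvision so uv can resolve versions matching the pinned torch. A rough Python equivalent of that rewrite, for illustration only (the image itself uses sed):

import re
from pathlib import Path

lines = Path("requirements/test.in").read_text().splitlines()
rewritten = []
for line in lines:
    if "mamba_ssm" in line:                                 # sed '/mamba_ssm/d'
        continue
    line = re.sub(r"torch==.*", "torch==2.6.0", line)       # re-pin torch
    line = re.sub(r"torchaudio.*", "torchaudio", line)      # unpin torchaudio
    line = re.sub(r"torchvision.*", "torchvision", line)    # unpin torchvision
    rewritten.append(line)
Path("requirements/cpu-test.in").write_text("\n".join(rewritten) + "\n")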
@@ -97,28 +113,19 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,source=.git,target=.git \
     VLLM_TARGET_DEVICE=cpu python3 setup.py develop
 
+COPY --from=vllm-test-deps /workspace/vllm/requirements/cpu-test.txt requirements/test.txt
+
 RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
-    cp requirements/test.in requirements/test-cpu.in && \
-    sed -i '/mamba_ssm/d' requirements/test-cpu.in && \
-    uv pip compile requirements/test-cpu.in -o requirements/test.txt && \
     uv pip install -r requirements/dev.txt && \
     pre-commit install --hook-type pre-commit --hook-type commit-msg
 
 ENTRYPOINT ["bash"]
 
 ######################### TEST IMAGE #########################
-FROM base AS vllm-test
+FROM vllm-test-deps AS vllm-test
 
 WORKDIR /workspace/
 
-RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
-    cp requirements/test.in requirements/test-cpu.in && \
-    sed -i '/mamba_ssm/d' requirements/test-cpu.in && \
-    uv pip compile requirements/test-cpu.in -o requirements/cpu-test.txt && \
-    uv pip install -r requirements/cpu-test.txt
-
 RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
     uv pip install dist/*.whl
requirements/cpu-build.txt — new file, 12 lines
@@ -0,0 +1,12 @@
+# Temporarily used for x86 CPU backend to avoid performance regression of torch>2.6.0+cpu,
+# see https://github.com/pytorch/pytorch/pull/151218
+cmake>=3.26.1
+ninja
+packaging>=24.2
+setuptools>=77.0.3,<80.0.0
+setuptools-scm>=8
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch==2.6.0+cpu
+wheel
+jinja2>=3.1.6
+regex
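A quick sanity check that the pin took effect inside a built image might look like the following (this check is an illustration, not part of the change):

import torch

# The x86 build should resolve to the 2.6.0 CPU wheel pinned above; anything
# newer reintroduces the regression tracked in pytorch/pytorch#151218.
assert torch.__version__.startswith("2.6.0"), torch.__version__
print("torch", torch.__version__)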
@@ -8,7 +8,7 @@ numba == 0.61.2; python_version > '3.9'
 packaging>=24.2
 setuptools>=77.0.3,<80.0.0
 --extra-index-url https://download.pytorch.org/whl/cpu
-torch==2.7.0+cpu; platform_machine == "x86_64"
+torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
 torch==2.7.0; platform_system == "Darwin"
 torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
 
@@ -23,6 +23,7 @@ datasets # for benchmark scripts
 
 # Intel Extension for PyTorch, only for x86_64 CPUs
 intel-openmp==2024.2.1; platform_machine == "x86_64"
-intel_extension_for_pytorch==2.7.0; platform_machine == "x86_64"
+intel_extension_for_pytorch==2.6.0; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
 py-libnuma; platform_system != "Darwin"
 psutil; platform_system != "Darwin"
+triton==3.2.0; platform_machine == "x86_64" # Triton is required for torch 2.6+cpu, as it is imported in torch.compile.
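The new triton pin follows from the comment above: on torch 2.6+cpu, torch.compile imports triton, so the wheel must be present even for CPU-only runs. A minimal smoke test of that path (assumed to run inside the CPU image; not part of the diff):

import torch

def scaled_add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    return 2.0 * x + y

# With torch 2.6+cpu, compilation pulls in triton; a missing triton wheel
# would surface here as an ImportError rather than a wrong result.
compiled = torch.compile(scaled_add)
print(compiled(torch.randn(8), torch.randn(8)))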
@@ -107,6 +107,8 @@ VLM_TEST_SETTINGS = {
             ),
             limit_mm_per_prompt={"image": 4},
         )],
+        # TODO: Revert to "auto" when CPU backend can use torch > 2.6
+        dtype="bfloat16" if current_platform.is_cpu() else "auto",
         marks=[pytest.mark.core_model, pytest.mark.cpu_model],
     ),
     "paligemma": VLMTestInfo(
@@ -203,6 +203,9 @@ def build_embedding_inputs_from_test_info(
 
     images = [asset.pil_image for asset in image_assets]
     embeds = test_info.convert_assets_to_embeddings(image_assets)
+    if test_info.dtype != "auto":
+        dtype = getattr(torch, test_info.dtype)  # type: ignore
+        embeds = [e.to(dtype=dtype) for e in embeds]
     assert len(images) == len(model_prompts)
 
     inputs = build_single_image_inputs(images, model_prompts, size_wrapper)
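The added branch resolves the dtype string carried by the test settings ("bfloat16" on CPU, per the previous hunk) into a torch.dtype and casts the precomputed image embeddings so they match the model's weights. The lookup in isolation (values illustrative):

import torch

dtype_name = "bfloat16"                 # what the CPU test settings select
dtype = getattr(torch, dtype_name)      # -> torch.bfloat16
embeds = [torch.randn(3, 16), torch.randn(5, 16)]  # stand-in embeddings
embeds = [e.to(dtype=dtype) for e in embeds]
print({e.dtype for e in embeds})        # {torch.bfloat16}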
@@ -54,6 +54,8 @@ else:
 if is_rocm_aiter_moe_enabled():
     from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa: E501
         rocm_aiter_grouped_topk as grouped_topk)
+elif current_platform.is_cpu():
+    pass
 else:
     from vllm.model_executor.layers.fused_moe.fused_moe import grouped_topk
 if current_platform.is_tpu():
@@ -15,7 +15,7 @@ from vllm.model_executor.layers.quantization.base_config import (
 from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
 from vllm.platforms import current_platform
 
-MIN_IPEX_VERSION = "2.7.0"
+MIN_IPEX_VERSION = "2.6.0"
 
 
 class IPEXConfig(QuantizationConfig):
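MIN_IPEX_VERSION drops to 2.6.0 in step with the torch pin, since intel_extension_for_pytorch must track torch's minor version. A hedged sketch of the kind of guard such a floor typically feeds (the helper is illustrative, not vLLM's exact code):

from packaging.version import Version

MIN_IPEX_VERSION = "2.6.0"

def check_ipex_version(installed: str) -> None:
    # Reject IPEX builds older than the floor matching the pinned torch.
    if Version(installed) < Version(MIN_IPEX_VERSION):
        raise ValueError(
            f"intel_extension_for_pytorch {installed} is too old; "
            f"need >= {MIN_IPEX_VERSION}")

check_ipex_version("2.6.0")  # passes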