vllm/examples/online_serving/openai_embedding_long_text/service.sh

#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# vLLM Embedding Server with Enhanced Chunked Processing
# This script starts a vLLM server with chunked processing enabled for long text embedding.
# Now supports proper pooling type validation and model-specific configurations.
set -euo pipefail
# Configuration
MODEL_NAME=${MODEL_NAME:-"intfloat/multilingual-e5-large"}
MODEL_CODE=${MODEL_CODE:-"multilingual-e5-large"}
PORT=${PORT:-31090}
GPU_COUNT=${GPU_COUNT:-1}
MAX_EMBED_LEN=${MAX_EMBED_LEN:-3072000}
API_KEY=${API_KEY:-"your-api-key"}
# Enhanced pooling configuration with model-specific defaults
POOLING_TYPE=${POOLING_TYPE:-"auto"} # auto, MEAN, CLS, LAST
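# Illustrative invocation (values below are placeholders, not defaults): every
# variable above can be overridden from the environment when launching, e.g.
#   MODEL_NAME="BAAI/bge-large-en-v1.5" MODEL_CODE="bge-large-en" PORT=8080 \
#     POOLING_TYPE=CLS bash service.sh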
export VLLM_ENABLE_CHUNKED_PROCESSING=true
# NOTE: this pins the server to specific GPU devices; adjust (or remove) to
# match the GPUs available on your machine.
export CUDA_VISIBLE_DEVICES=2,3,4,5
# export VLLM_ATTENTION_BACKEND=XFORMERS
echo "🚀 Starting vLLM Embedding Server with Enhanced Chunked Processing"
echo "=================================================================="
# Environment variables for optimization
export VLLM_WORKER_MULTIPROC_METHOD=spawn
# Function to determine optimal pooling type for known models
get_optimal_pooling_type() {
    local model="$1"
    case "$model" in
        *"e5-"* | *"multilingual-e5"*)
            echo "MEAN" # E5 series native pooling
            ;;
        *"bge-"*)
            echo "CLS" # BGE series native pooling
            ;;
        *"gte-"*)
            echo "LAST" # GTE series native pooling
            ;;
        *"sentence-t5"* | *"st5"*)
            echo "MEAN" # Sentence-T5 native pooling
            ;;
        *"jina-embeddings"*)
            echo "MEAN" # Jina embeddings native pooling
            ;;
        *"Qwen"*"Embedding"*)
            echo "LAST" # Qwen embeddings native pooling
            ;;
        *)
            echo "MEAN" # Default native pooling for unknown models
            ;;
    esac
}
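# For example, with this mapping a model path containing "bge-" (e.g. an
# illustrative "BAAI/bge-large-en-v1.5") resolves to CLS, while an unrecognized
# model name falls back to MEAN.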
# Auto-detect pooling type if not explicitly set
if [ "$POOLING_TYPE" = "auto" ]; then
POOLING_TYPE=$(get_optimal_pooling_type "$MODEL_NAME")
echo "🔍 Auto-detected pooling type: $POOLING_TYPE for model $MODEL_NAME"
fi
# Display configuration
echo "📋 Configuration:"
echo " - Model: $MODEL_NAME"
echo " - Port: $PORT"
echo " - GPU Count: $GPU_COUNT"
echo " - Enhanced Chunked Processing: ${VLLM_ENABLE_CHUNKED_PROCESSING}"
echo " - Max Embed Length: ${MAX_EMBED_LEN} tokens"
echo " - Native Pooling Type: $POOLING_TYPE + Normalization"
echo " - Cross-chunk Aggregation: MEAN (automatic)"
echo ""
# Validate GPU availability
if command -v nvidia-smi &> /dev/null; then
    gpu_count=$(nvidia-smi --list-gpus | wc -l)
    echo "🖥️ Available GPUs: $gpu_count"
    if [ "$GPU_COUNT" -gt "$gpu_count" ]; then
        echo "⚠️ Warning: Requested $GPU_COUNT GPUs but only $gpu_count available"
        echo " Adjusting to use $gpu_count GPUs"
        GPU_COUNT=$gpu_count
    fi
else
    echo "⚠️ Warning: nvidia-smi not found. GPU detection skipped."
fi
# Chunked processing uses unified MEAN aggregation
echo " Chunked Processing: Using $POOLING_TYPE pooling within chunks, MEAN aggregation across chunks"
echo " - All chunks processed for complete semantic coverage"
echo " - Weighted averaging based on chunk token counts"
echo ""
echo "🔧 Starting server with enhanced chunked processing configuration..."
# Build pooler config JSON
POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"normalize\": true, \"enable_chunked_processing\": ${VLLM_ENABLE_CHUNKED_PROCESSING}, \"max_embed_len\": ${MAX_EMBED_LEN}}"
# Start vLLM server with enhanced chunked processing
vllm serve "$MODEL_NAME" \
--tensor-parallel-size "$GPU_COUNT" \
--enforce-eager \
--pooler-config "$POOLER_CONFIG" \
--served-model-name ${MODEL_CODE} \
--api-key "$API_KEY" \
--trust-remote-code \
--port "$PORT" \
--host 0.0.0.0
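# Note: `vllm serve` runs in the foreground, so the status messages below are
# printed only after the server process exits.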
echo ""
echo "✅ vLLM Embedding Server started successfully!"
echo ""
echo "📡 Server Information:"
echo " - Base URL: http://localhost:$PORT"
echo " - Model Code: ${MODEL_CODE}"
echo " - API Key: $API_KEY"
echo " - Native Pooling: $POOLING_TYPE | Cross-chunk: MEAN"
echo ""
echo "🧪 Test the server with:"
echo " python examples/online_serving/openai_embedding_long_text/client.py"
echo ""
echo "📚 Enhanced features enabled:"
echo " ✅ Intelligent native pooling type detection"
echo " ✅ Unified MEAN aggregation for chunked processing"
echo " ✅ Model-specific native pooling optimization"
echo " ✅ Enhanced max embedding length (${MAX_EMBED_LEN} tokens)"
echo " ✅ Complete semantic coverage for all pooling types"
echo " ✅ OpenAI-compatible API"
echo " ✅ GPU acceleration"
echo ""
echo "🔧 Advanced usage:"
echo " - Set POOLING_TYPE=MEAN|CLS|LAST to override auto-detection"
echo " - Set MAX_EMBED_LEN to adjust maximum input length"
echo " - All pooling types use MEAN aggregation across chunks"