Files
vllm/examples/online_serving/elastic_ep/scale.py
2025-07-18 17:46:09 -07:00

54 lines
1.5 KiB
Python

#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import json
import sys
import requests
def scale(host, port, new_dp_size):
url = f"http://{host}:{port}/scale_elastic_ep"
payload = {"new_data_parallel_size": new_dp_size}
headers = {"Content-Type": "application/json"}
print(f"Sending scale request to {url}")
print(f"Payload: {json.dumps(payload, indent=2)}")
try:
response = requests.post(url, json=payload, headers=headers, timeout=300)
print(f"Status Code: {response.status_code}")
print(f"Response: {response.text}")
if response.status_code == 200:
print("Scale up/down request successful!")
return True
else:
print("Scale up/down request failed!")
return False
except requests.exceptions.RequestException as e:
print(f"Request failed: {e}")
return False
def main():
parser = argparse.ArgumentParser(description="Test scale up/down functionality")
parser.add_argument("--host", default="localhost", help="API server host")
parser.add_argument("--port", type=int, default=8006, help="API server port")
parser.add_argument(
"--new-dp-size", type=int, default=2, help="New data parallel size"
)
args = parser.parse_args()
success = scale(args.host, args.port, args.new_dp_size)
sys.exit(0 if success else 1)
if __name__ == "__main__":
main()