[BugFix] Pop instead of del CUDA_VISIBLE_DEVICES (#12962)

Signed-off-by: Hollow Man <hollowman@opensuse.org>
2025-10-20 14:53:52 +08:00 · 2025-02-11 18:21:50 +02:00
parent 21f5d50fa5
commit 6c4dbe23eb
4 changed files with 9 additions and 9 deletions
--- a/examples/offline_inference/rlhf.py
+++ b/examples/offline_inference/rlhf.py
@ -92,7 +92,7 @@ class MyLLM(LLM):
        # a hack to make the script work.
        # stop ray from manipulating CUDA_VISIBLE_DEVICES
        # at the top-level
-        del os.environ["CUDA_VISIBLE_DEVICES"]
+        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
        super().__init__(*args, **kwargs)


--- a/examples/offline_inference/rlhf_colocate.py
+++ b/examples/offline_inference/rlhf_colocate.py
@ -59,7 +59,7 @@ class MyLLM(LLM):
        # a hack to make the script work.
        # stop ray from manipulating CUDA_VISIBLE_DEVICES
        # at the top-level
-        del os.environ["CUDA_VISIBLE_DEVICES"]
+        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
        # every worker will use 0.4 GPU, so that we can schedule
        # 2 instances on the same GPUs.
        os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4"
--- a/tests/distributed/test_comm_ops.py
+++ b/tests/distributed/test_comm_ops.py
@ -22,7 +22,7 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
    # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
    # so that each worker can see all the GPUs
    # they will be able to set the device to the correct GPU
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,
@ -44,7 +44,7 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
    # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
    # so that each worker can see all the GPUs
    # they will be able to set the device to the correct GPU
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,
@ -72,7 +72,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
    # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
    # so that each worker can see all the GPUs
    # they will be able to set the device to the correct GPU
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,
@ -108,7 +108,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
@ray.remote(num_gpus=1, max_calls=1)
 def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
                                      distributed_init_port: str):
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,
@ -148,7 +148,7 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
@ray.remote(num_gpus=1, max_calls=1)
 def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
                          distributed_init_port: str):
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,
--- a/tests/distributed/test_custom_all_reduce.py
+++ b/tests/distributed/test_custom_all_reduce.py
@ -24,7 +24,7 @@ for i, v in enumerate(test_sizes):

@ray.remote(num_gpus=1, max_calls=1)
 def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,
@ -80,7 +80,7 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):

@ray.remote(num_gpus=1, max_calls=1)
 def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,