[BE][6/6] fix typos in test/ (test/distributed/) (#157640)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157640
Approved by: https://github.com/yewentao256, https://github.com/malfet
Author: Xuehai Pan
Date: 2025-07-11 15:04:47 +08:00
Committed by: PyTorch MergeBot
Parent: 4283d96bcd
Commit: 0d17029fea
28 changed files with 63 additions and 54 deletions

View File

@ -1169,7 +1169,6 @@ exclude_patterns = [
'aten/src/ATen/[a-mA-M]*/**',
'test/**',
'test/[a-hA-h]*/**',
'test/distributed/**',
'torch/_*/**',
'torch/distributed/tensor/**',
]

View File

@ -69,7 +69,7 @@ class ReplicateStateDictTest(MultiProcessTestCase):
def test_replicate_non_root_multiple_save_load(self):
"""
Tests tha replicate() on multiple submodules matches
Tests that replicate() on multiple submodules matches
local module state_dict.
"""
self._init_pg()

View File

@ -1733,7 +1733,7 @@ class TestShardedTensorEnumerable(ShardedTensorTestBase):
self.assertEqual(remote_device_before.rank(), remote_device_after.rank())
self.assertEqual(str(remote_device_after.device()), "cpu")
# ensure metdata also get changed to CPU
# ensure metadata also gets changed to CPU
metas = new_st.metadata().shards_metadata
for meta in metas:
self.assertEqual(str(meta.placement.device()), "cpu")
@ -1764,7 +1764,7 @@ class TestShardedTensorEnumerable(ShardedTensorTestBase):
self.assertEqual(remote_device_before.rank(), remote_device_after.rank())
self.assertEqual(str(remote_device_after.device()), "cpu")
# ensure metdata also get changed to CPU
# ensure metadata also gets changed to CPU
metas = new_st.metadata().shards_metadata
for meta in metas:
self.assertEqual(str(meta.placement.device()), "cpu")
@ -1820,7 +1820,7 @@ class TestShardedTensorEnumerable(ShardedTensorTestBase):
self.assertEqual(str(remote_device_before.device().type), "cpu")
self.assertEqual(str(remote_device_after.device().type), "cuda")
# ensure metdata also get changed to GPU
# ensure metadata also gets changed to GPU
metas = new_st_gpu.metadata().shards_metadata
for meta in metas:
self.assertEqual(str(meta.placement.device().type), "cuda")

View File

@ -129,7 +129,7 @@ class TestTrackerFullyShard1DTrainingCore(FSDPTest):
@skip_if_lt_x_gpu(2)
def test_tracker_non_root_forward_backward(self):
"""
Tests tracker accracy when running forward/backward through a non-root.
Tests tracker accuracy when running forward/backward through a non-root.
"""
debug = False
dev = torch.device(torch.cuda.current_device())

View File

@ -211,7 +211,7 @@ class TestSACILP(TestCase):
class TestOptimalCheckpointingPolicy(TestCase):
# tests are adpated from tests in xformers
# tests are adapted from tests in xformers
# https://github.com/facebookresearch/xformers/blob/c6c0ac31f1b08542a0bc27278c6ed10f825f6963/tests/test_checkpoint.py#L222
def setUp(self):
super().setUp()

View File

@ -72,7 +72,7 @@ class TestFSDPWithEP(DTensorTestBase, VerifyStateDictMixin):
mesh_fsdp_tp = init_device_mesh(
self.device_type, (2, 4), mesh_dim_names=("dp", "tp")
)
# TODO: we are using an internal API atm. Change to a publich API once it is ready.
# TODO: we are using an internal API atm. Change to a public API once it is ready.
mesh_fsdp_ep = _mesh_resources.create_child_mesh(mesh_fsdp_tp, ("dp",))
del _mesh_resources.child_to_parent_mapping[mesh_fsdp_ep]
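
Aside: the TODO above asks for a public API. One public route that newer PyTorch releases provide is name-based sub-mesh slicing on DeviceMesh; the test itself does not use it, so the following is only a hedged sketch, assuming 8 ranks launched the usual torchrun way.

    from torch.distributed.device_mesh import init_device_mesh

    # Build a 2x4 mesh with named dims, then take the "dp" sub-mesh by name
    # instead of reaching into the private _mesh_resources helpers.
    mesh_fsdp_tp = init_device_mesh("cuda", (2, 4), mesh_dim_names=("dp", "tp"))
    mesh_dp = mesh_fsdp_tp["dp"]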

View File

@ -109,7 +109,7 @@ class TestStateDict(DTensorTestBase, VerifyStateDictMixin):
for d_optim in _dist_optim:
d_optim.step()
# We need to ensure gradients don't exist, this the invarient of using DSD.
# We need to ensure gradients don't exist; this is the invariant of using DSD.
optim.zero_grad()
# Get the state_dict, and compare the result
@ -135,7 +135,7 @@ class TestStateDict(DTensorTestBase, VerifyStateDictMixin):
# We won't be able to load the partial state_dict back.
return
# Since we already have the state_dict saved before, no need to call DCP.
# We can directly load them back. This asser is to ensure that optimizer
# We can directly load them back. This assert is to ensure that optimizer
# state storages are initialized.
# self.assertEqual(len(curr_dist_osd[STATE]), len(dist_osd[STATE]))
set_model_state_dict(

View File

@ -140,7 +140,17 @@ class RendezvousParametersTest(TestCase):
self.assertFalse(params.get_as_bool("dummy_param"))
def test_get_as_bool_raises_error_if_value_is_invalid(self) -> None:
for value in ["01", "Flse", "Ture", "g", "4", "_", "truefalse", 2, -1]:
for value in [
"01",
"Flse", # codespell:ignore
"Ture", # codespell:ignore
"g",
"4",
"_",
"truefalse",
2,
-1,
]:
with self.subTest(value=value):
self._kwargs["dummy_param"] = value
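
The misspelled strings above are deliberate invalid inputs for the boolean parser; the inline codespell directives keep the spell checker, which now covers test/distributed/ after the lintrunner exclusion was dropped in the first hunk, from flagging them. The same pattern in isolation, with a hypothetical variable name:

    # Deliberate misspellings kept as negative test fixtures; the inline
    # comments tell codespell to skip only these tokens, not the whole file.
    invalid_bool_strings = [
        "Flse",  # codespell:ignore
        "Ture",  # codespell:ignore
        "truefalse",
    ]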

View File

@ -71,9 +71,9 @@ class WorkerServerTest(TestCase):
self.assertEqual(resp.status, 200)
self.assertIn("ping", json.loads(resp.data))
resp = pool.request("POST", "/handler/nonexistant")
resp = pool.request("POST", "/handler/nonexistent")
self.assertEqual(resp.status, 404)
self.assertIn(b"Handler nonexistant not found:", resp.data)
self.assertIn(b"Handler nonexistent not found:", resp.data)
@requires_cuda
def test_dump_nccl_trace_pickle(self) -> None:
@ -207,8 +207,8 @@ class WorkerServerTest(TestCase):
def test_get_handler_nonexistant(self) -> None:
from torch._C._distributed_c10d import _get_handler
with self.assertRaisesRegex(ValueError, "Failed to find handler nonexistant"):
_get_handler("nonexistant")
with self.assertRaisesRegex(ValueError, "Failed to find handler nonexistent"):
_get_handler("nonexistent")
def test_get_handler_names(self) -> None:
from torch._C._distributed_c10d import _get_handler_names

View File

@ -158,7 +158,7 @@ class TestFSDPMemory(FSDPTest):
output = cmp(results, expected)
self.assertEqual(output, "")
@unittest.skipIf(TEST_HPU, "Memory will be differnt for CUDA and HPU, skipping")
@unittest.skipIf(TEST_HPU, "Memory will be different for CUDA and HPU, skipping")
@skip_if_lt_x_gpu(2)
@parametrize("ckpt", ["no_ckpt", "ckpt"])
def test_fsdp_memory(self, ckpt):

View File

@ -45,7 +45,7 @@ class ScheduleVShaped(PipelineScheduleMulti):
)
# Go through one microbatch
# Note(whc) - it might be easier to work with thes schedules by writing them as a list of
# Note(whc) - it might be easier to work with these schedules by writing them as a list of
# ["0F0", ...] and then parsing them in the test infra to turn them into actions.
self.pipeline_order = {
0: [

View File

@ -22,7 +22,7 @@ from torch.testing._internal.distributed.rpc_utils import (
# On CircleCI these tests are already run on CPU jobs, thus to save resources do
# not run them on GPU jobs, since thet wouldn't provide additional test signal.
# not run them on GPU jobs, since they wouldn't provide additional test signal.
if not (IS_CI and torch.cuda.is_available()):
globals().update(
generate_tests(

View File

@ -23,7 +23,7 @@ from torch.testing._internal.distributed.rpc_utils import (
# On CircleCI these tests are already run on CPU jobs, thus to save resources do
# not run them on GPU jobs, since thet wouldn't provide additional test signal.
# not run them on GPU jobs, since they wouldn't provide additional test signal.
if not (IS_CI and torch.cuda.is_available()):
globals().update(
generate_tests(

View File

@ -90,7 +90,7 @@ class TestLocalMap(DTensorTestBase):
) # row-wisely sharded W tensor
# Test 1: use the function returned from calling local_map
# get the function wrapped with DTensor/Tensor convertion
# get the function wrapped with DTensor/Tensor conversion
# mm_allreduce_forward is a function that applies to Tensors with manual collective
# local_mm_allreduce_forward is the function that does the same but applies to
# DTensors' `_local_tensor`.

View File

@ -85,7 +85,7 @@ class TensorParallelTest(DTensorTestBase):
with torch.no_grad():
tp_res = tp_model(*inputs)
self.assertEqual(res, tp_res)
# Expect all_gather to be inserted to distributed sharded fc resutls
# Expect all_gather to be inserted to distributed sharded fc results
self.assert_has_c10d_ops(
tp_exported_program.graph_module,
{

View File

@ -438,7 +438,7 @@ class DTensorTest(DTensorTestBase):
self.assertEqual(type(out_view), AsyncCollectiveTensor)
self.assertFalse(out.completed)
# Use the daa, requiring a sync
# Use the data, requiring a sync
ref = torch.ones((4, 2), device=self.device_type) + 1
ref = ref.view(-1)
out_data = out_view + 1

View File

@ -220,7 +220,7 @@ def forward(self, b_parametrizations_buffer_original0, x):
group1 = x.get_group(mesh_dim=1)
return size, coord, group0, group1
# Cant be fullgraph=True because ProcessGroup is not reconstructible in dynamo
# Can't be fullgraph=True because ProcessGroup is not reconstructible in dynamo
compiled_fn = torch.compile(backend="aot_eager")(fn)
mesh = DeviceMesh(self.device_type, torch.arange(self.world_size).unsqueeze(1))

View File

@ -193,7 +193,7 @@ class TestEmbeddingOp(DTensorTestBase):
from torch.distributed.tensor._ops._embedding_ops import _MaskPartial
# case 1: two embeddings with the same shape, thus sharing the underying _MaskPartial
# case 1: two embeddings with the same shape, thus sharing the underlying _MaskPartial
# and MaskBuffer, because of cache hit from sharding propagation
emb1 = torch.nn.Embedding(10, 23, device=self.device_type)

View File

@ -65,7 +65,7 @@ class TestEinsumDims(TestCase):
self.assertEqual(edims.lhs_out_only_dims, ["c"])
self.assertEqual(edims.rhs_out_only_dims, [])
equation = "abd,bf->abfd"
equation = "abd,bf->abfd" # codespell:ignore
input_dims, output_dim = EinsumDims.parse_equation(equation)
edims = EinsumDims.parse_dims(input_dims, output_dim)
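
For context, the equation kept verbatim above (hence the codespell ignore) has a b dimension that appears in both operands and in the output, so nothing is summed away. A quick sanity check with plain torch.einsum, independent of the EinsumDims helper:

    import torch

    a = torch.randn(2, 3, 5)  # dims: a, b, d
    b = torch.randn(3, 4)     # dims: b, f
    out = torch.einsum("abd,bf->abfd", a, b)
    # "b" is shared by both inputs and kept in the output, so it behaves as a
    # batch dimension rather than a contraction.
    print(out.shape)  # torch.Size([2, 3, 4, 5])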

View File

@ -635,7 +635,7 @@ class MultiDimRedistributeTest(DTensorTestBase):
dt = distribute_tensor(full_tensor, device_mesh, repl_inputs)
if repl_inputs != inputs:
# create a new DTensor reinterpreting some of the replicated entires as "Partial"
# create a new DTensor reinterpreting some of the replicated entries as "Partial"
dt = DTensor.from_local(
dt.to_local(), device_mesh, inputs, run_check=False
)

View File

@ -150,7 +150,7 @@ class DTensorXLAIntegrationTest(TestCase):
def shard_params(mod_name, mod, mesh):
shard_spec = [Shard(0)]
# annoate fc1 and fc2
# annotate fc1 and fc2
if isinstance(mod, nn.Linear):
for _, param in mod.named_parameters():
# annotate the parameter tensors directly

View File

@ -601,7 +601,7 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase):
def _helper_test_extra_cuda_context_by_nvml(self):
"""
A helper for `test_extra_cuda_context`, if pynvml is avaiable.
A helper for `test_extra_cuda_context`, if pynvml is available.
pynvml provides python bindings for NVIDIA NVML functionalities.
Here we are interested in: nvmlDeviceGetComputeRunningProcesses
"""
@ -634,7 +634,7 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase):
def _helper_test_extra_cuda_context_by_memory(self):
"""
A helper for `test_extra_cuda_context`, if pynvml is NOT avaiable.
A helper for `test_extra_cuda_context`, if pynvml is NOT available.
If extra context is created, it would manifest into device 0's memory usage.
"""
device = torch.device(f"cuda:{self.rank:d}")
@ -1112,7 +1112,7 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase):
os.environ["TORCH_NCCL_NONBLOCKING_TIMEOUT"] = "100"
store = c10d.FileStore(self.file_name, self.world_size)
device = torch.device(f"cuda:{self.rank}")
# bound device to triger eager init mode
# bound device to trigger eager init mode
pg = self._create_process_group_nccl(store, self.opts(), device_id=device)
backend = pg._get_backend(torch.device(device))
self.assertEqual(backend.comm_split_count(), 0)
@ -2995,7 +2995,7 @@ class NcclErrorHandlingTest(MultiProcessTestCase):
time.sleep(4)
self.assertEqual(process_group.get_error(), ErrorType.REMOTE_ERROR)
# Mimicing all ranks sensing the timeout, abort
# Mimicking all ranks sensing the timeout, abort
process_group.abort()
if prev_nccl_async_error_handling is not None:
@ -4291,7 +4291,7 @@ class NCCLTraceTestBase(MultiProcessTestCase):
def _join_processes(self, fn):
# We need to patch sys.exit() as skip_if will use sys.exit() and
# the exit code from the this process will not be catched.
# the exit code from this process will not be caught.
with mock.patch("sys.exit"):
fn()
super()._join_processes(fn)
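
The "eager init" comment refers to a general mechanism: binding the process group to a specific device at creation time lets the NCCL communicator be built immediately instead of lazily on the first collective. The test goes through an internal helper; a hedged sketch with the public API, assuming torchrun-style environment variables and a PyTorch recent enough to accept device_id:

    import os

    import torch
    import torch.distributed as dist

    # Passing device_id binds the group to one device up front, enabling
    # eager NCCL communicator initialization.
    device = torch.device(f"cuda:{int(os.environ.get('LOCAL_RANK', 0))}")
    dist.init_process_group("nccl", device_id=device)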

View File

@ -1814,7 +1814,7 @@ class TestSingleProc(DynamoDistributedSingleProcTestCase):
Note: comptime prints the guards before the time they get installed or not installed, so in both cases
(skip or no skip) the same guards get printed. The difference is that in the skip case, they show up
with a special 'guard source' which will cuase them to not be installed. So all we check for is the expected
with a special 'guard source' which will cause them to not be installed. So all we check for is the expected
guard source 'local_fsdp_module'.
"""
global GUARDS_FILE
@ -1871,7 +1871,7 @@ class TestSingleProc(DynamoDistributedSingleProcTestCase):
def test_fsdp_skip_register_attr_or_module(self):
"""
ensure FSDP module is not registered as attrbutes
ensure the FSDP module is not registered as an attribute
in the fx graph
see `not source.guard_source().is_fsdp_module()`
before calling `register_attr_or_module`

View File

@ -824,7 +824,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
compiled = torch.compile(func)
out = compiled(inputs, **self.get_world_trs())
code = run_and_get_triton_code(compiled, inputs, **self.get_world_trs())
# NOTE: Make sure we are not unneccessarily copying the outputs of
# NOTE: Make sure we are not unnecessarily copying the outputs of
# wait_tensors before they are returned from the graph.
(
FileCheck()
@ -891,7 +891,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, inputs, **self.get_world_trs())
# NOTE: Make sure we are not unneccessarily copying the outputs of
# NOTE: Make sure we are not unnecessarily copying the outputs of
# wait_tensors before they are returned from the graph.
(
FileCheck()
@ -1356,7 +1356,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, inputs, **self.get_world_trs())
# NOTE: Make sure we are not unneccessarily copying the outputs of
# NOTE: Make sure we are not unnecessarily copying the outputs of
# wait_tensors before they are returned from the graph.
(
FileCheck()
@ -1403,7 +1403,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, inputs, **self.get_world_trs())
# NOTE: The first return value should be the output of the first wait_tensor.
# We want to make sure no unneccessary copy is made.
# We want to make sure no unnecessary copy is made.
(
FileCheck()
.check("buf0 = empty_strided")
@ -1474,7 +1474,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, inputs, **self.get_world_trs())
# NOTE: The first return value should be the output of the first wait_tensor.
# We want to make sure no unneccessary copy is made.
# We want to make sure no unnecessary copy is made.
(
FileCheck()
.check("buf0 = empty_strided")
@ -1548,7 +1548,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, *inputs, **self.get_world_trs())
# NOTE: The first return value should be the output of the first wait_tensor.
# We want to make sure no unneccessary copy is made.
# We want to make sure no unnecessary copy is made.
(FileCheck().check("all_gather_into_tensor_out").run(code))
out = compiled(*inputs, **self.get_world_trs())
correct = func(*inputs, **self.get_world_trs())
@ -1598,7 +1598,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, *inputs, **self.get_world_trs())
# NOTE: The first return value should be the output of the first wait_tensor.
# We want to make sure no unneccessary copy is made.
# We want to make sure no unnecessary copy is made.
(
FileCheck()
.check_count(
@ -1689,7 +1689,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, *inputs, **self.get_world_trs())
# NOTE: The first return value should be the output of the first wait_tensor.
# We want to make sure no unneccessary copy is made.
# We want to make sure no unnecessary copy is made.
(
FileCheck()
.check_count(
@ -1785,7 +1785,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, inputs, **self.get_world_trs())
# NOTE: The first return value should be the output of the first wait_tensor.
# We want to make sure no unneccessary copy is made.
# We want to make sure no unnecessary copy is made.
(
FileCheck()
.check("all_gather")
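
All of these hunks guard the same property with FileCheck. A self-contained illustration of the pattern, run against a hand-written stand-in rather than real Inductor output:

    from torch.testing import FileCheck

    generated = """
    buf0 = empty_strided((8, 8), (8, 1))
    torch.ops._c10d_functional.wait_tensor.default(buf0)
    return (buf0, )
    """

    # Assert that the wait_tensor output is returned directly, with no copy in
    # between -- the "no unnecessary copy" property the tests above check for.
    (
        FileCheck()
        .check("buf0 = empty_strided")
        .check("wait_tensor")
        .check_not("copy_")
        .check("return (buf0")
        .run(generated)
    )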

View File

@ -232,7 +232,7 @@ class NVSHMEMSymmetricMemoryTest(MultiProcContinousTest):
)
out = symm_mem.empty(max_out_numel, dtype=dtype, device=self.device).fill_(-1)
# 3 rows: input splits, output splits, output offsets
# Initiallizing all values to -1 to check if they are updated
# Initializing all values to -1 to check if they are updated
in_out_splits = symm_mem.empty(
(3, nsplits), dtype=torch.int64, device=self.device
).fill_(-1)

View File

@ -376,7 +376,7 @@ if not TEST_WITH_DEV_DBG_ASAN:
):
self._create_wrapper_pg(with_new_group=True)
# nothing to assert, isinstance(pg, _ProcessGroupWrapper)
# should never be invoked since it is preceeded by
# should never be invoked since it is preceded by
# _GLOO_AVAILABLE check, this test will fail on
# an unexpected NameError if not.

View File

@ -837,9 +837,9 @@ class RendezvousTCPTest(TestCase):
# not respected, it will take much longer to timeout.
start = time.time()
with self.assertRaisesRegex(
DistStoreError, "wait timeout after 100ms, keys: /nonexistant key"
DistStoreError, "wait timeout after 100ms, keys: /nonexistent key"
):
store0.get("nonexistant key")
store0.get("nonexistent key")
end = time.time()
time_diff = end - start
@ -1066,7 +1066,7 @@ class TimeoutTest(TestCase):
wait_for_workers=False,
)
ths = []
threads = []
for i in range(2):
t = threading.Thread(
target=run,
@ -1076,16 +1076,16 @@ class TimeoutTest(TestCase):
),
)
t.start()
ths.append(t)
threads.append(t)
def handler(a, b):
pass
signal.signal(signal.SIGUSR1, handler)
time.sleep(1)
signal.pthread_kill(ths[1].ident, signal.SIGUSR1)
signal.pthread_kill(threads[1].ident, signal.SIGUSR1)
for t in ths:
for t in threads:
t.join()
self.assertTrue(rank_res[0], "rank0")
self.assertTrue(rank_res[1], "rank1")
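
The first hunk in this file relies on the store-level timeout propagating to get(). A minimal single-process sketch of that behaviour; the host and port are placeholders, and older releases raise RuntimeError rather than DistStoreError:

    from datetime import timedelta

    import torch.distributed as dist

    store = dist.TCPStore(
        "127.0.0.1", 29500, world_size=1, is_master=True,
        timeout=timedelta(milliseconds=100),
    )
    try:
        store.get("nonexistent key")  # nothing ever sets this key
    except Exception as exc:  # DistStoreError on recent PyTorch
        print(type(exc).__name__, exc)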

View File

@ -664,7 +664,7 @@ class SymmMemNegativeTest(MultiProcessTestCase):
# These timeout tests are skipped on ROCm because timeout calls trap(), which
# is handled differently inside hip runtime. It collects gpu coredump and causes
# the linux kernel to create a core dump of the host application. The funcitonality
# the linux kernel to create a core dump of the host application. The functionality
# is there, meaning timeout is happening correctly. However, there isn't a nice way
# to test it as the current executing thread will coredump and exit.
@skipIfRocm
@ -690,7 +690,7 @@ class SymmMemNegativeTest(MultiProcessTestCase):
# These timeout tests are skipped on ROCm because timeout calls trap(), which
# is handled differently inside hip runtime. It collects gpu coredump and causes
# the linux kernel to create a core dump of the host application. The funcitonality
# the linux kernel to create a core dump of the host application. The functionality
# is there, meaning timeout is happening correctly. However, there isn't a nice way
# to test it as the current executing thread will coredump and exit.
@skipIfRocm
@ -719,7 +719,7 @@ class SymmMemNegativeTest(MultiProcessTestCase):
# These timeout tests are skipped on ROCm because timeout calls trap(), which
# is handled differently inside hip runtime. It collects gpu coredump and causes
# the linux kernel to create a core dump of the host application. The funcitonality
# the linux kernel to create a core dump of the host application. The functionality
# is there, meaning timeout is happening correctly. However, there isn't a nice way
# to test it as the current executing thread will coredump and exit.
@skipIfRocm