From 0d17029fea3d96bb88b19912946648b47f8e003d Mon Sep 17 00:00:00 2001
From: Xuehai Pan
Date: Fri, 11 Jul 2025 15:04:47 +0800
Subject: [PATCH] [BE][6/6] fix typos in test/ (test/distributed/) (#157640)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157640
Approved by: https://github.com/yewentao256, https://github.com/malfet
---
 .lintrunner.toml | 1 -
 test/distributed/_composable/test_replicate.py | 2 +-
 .../sharded_tensor/test_sharded_tensor.py | 6 +++---
 .../_tools/test_fsdp2_mem_tracker.py | 2 +-
 test/distributed/_tools/test_sac_ilp.py | 2 +-
 .../distributed/checkpoint/e2e/test_fsdp_ep.py | 2 +-
 test/distributed/checkpoint/test_state_dict.py | 4 ++--
 .../distributed/elastic/rendezvous/api_test.py | 12 +++++++++++-
 test/distributed/elastic/test_control_plane.py | 8 ++++----
 test/distributed/fsdp/test_fsdp_memory.py | 2 +-
 .../pipelining/schedule_registry.py | 2 +-
 test/distributed/rpc/test_faulty_agent.py | 2 +-
 test/distributed/rpc/test_tensorpipe_agent.py | 2 +-
 .../tensor/experimental/test_local_map.py | 2 +-
 .../tensor/experimental/test_tp_transform.py | 2 +-
 test/distributed/tensor/test_dtensor.py | 2 +-
 .../distributed/tensor/test_dtensor_compile.py | 2 +-
 test/distributed/tensor/test_embedding_ops.py | 2 +-
 test/distributed/tensor/test_op_strategy.py | 2 +-
 test/distributed/tensor/test_redistribute.py | 2 +-
 .../distributed/tensor/test_xla_integration.py | 2 +-
 test/distributed/test_c10d_nccl.py | 10 +++++-----
 test/distributed/test_dynamo_distributed.py | 4 ++--
 test/distributed/test_inductor_collectives.py | 18 +++++++++---------
 test/distributed/test_nvshmem.py | 2 +-
 test/distributed/test_pg_wrapper.py | 2 +-
 test/distributed/test_store.py | 12 ++++++------
 test/distributed/test_symmetric_memory.py | 6 +++---
 28 files changed, 63 insertions(+), 54 deletions(-)

diff --git a/.lintrunner.toml b/.lintrunner.toml
index 426bbe42a35b..6b24f3a9d65f 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -1169,7 +1169,6 @@ exclude_patterns = [
     'aten/src/ATen/[a-mA-M]*/**',
     'test/**',
     'test/[a-hA-h]*/**',
-    'test/distributed/**',
     'torch/_*/**',
     'torch/distributed/tensor/**',
 ]
diff --git a/test/distributed/_composable/test_replicate.py b/test/distributed/_composable/test_replicate.py
index 49c60da4fb38..a793fe2fed4c 100644
--- a/test/distributed/_composable/test_replicate.py
+++ b/test/distributed/_composable/test_replicate.py
@@ -69,7 +69,7 @@ class ReplicateStateDictTest(MultiProcessTestCase):

     def test_replicate_non_root_multiple_save_load(self):
         """
-        Tests tha replicate() on multiple submodules matches
+        Tests that replicate() on multiple submodules matches
         local module state_dict.
""" self._init_pg() diff --git a/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py b/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py index 3902d294c7d5..f62e4d29617d 100644 --- a/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py +++ b/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py @@ -1733,7 +1733,7 @@ class TestShardedTensorEnumerable(ShardedTensorTestBase): self.assertEqual(remote_device_before.rank(), remote_device_after.rank()) self.assertEqual(str(remote_device_after.device()), "cpu") - # ensure metdata also get changed to CPU + # ensure metadata also get changed to CPU metas = new_st.metadata().shards_metadata for meta in metas: self.assertEqual(str(meta.placement.device()), "cpu") @@ -1764,7 +1764,7 @@ class TestShardedTensorEnumerable(ShardedTensorTestBase): self.assertEqual(remote_device_before.rank(), remote_device_after.rank()) self.assertEqual(str(remote_device_after.device()), "cpu") - # ensure metdata also get changed to CPU + # ensure metadata also get changed to CPU metas = new_st.metadata().shards_metadata for meta in metas: self.assertEqual(str(meta.placement.device()), "cpu") @@ -1820,7 +1820,7 @@ class TestShardedTensorEnumerable(ShardedTensorTestBase): self.assertEqual(str(remote_device_before.device().type), "cpu") self.assertEqual(str(remote_device_after.device().type), "cuda") - # ensure metdata also get changed to GPU + # ensure metadata also get changed to GPU metas = new_st_gpu.metadata().shards_metadata for meta in metas: self.assertEqual(str(meta.placement.device().type), "cuda") diff --git a/test/distributed/_tools/test_fsdp2_mem_tracker.py b/test/distributed/_tools/test_fsdp2_mem_tracker.py index 33c941816f39..0af73cd3867a 100644 --- a/test/distributed/_tools/test_fsdp2_mem_tracker.py +++ b/test/distributed/_tools/test_fsdp2_mem_tracker.py @@ -129,7 +129,7 @@ class TestTrackerFullyShard1DTrainingCore(FSDPTest): @skip_if_lt_x_gpu(2) def test_tracker_non_root_forward_backward(self): """ - Tests tracker accracy when running forward/backward through a non-root. + Tests tracker accuracy when running forward/backward through a non-root. """ debug = False dev = torch.device(torch.cuda.current_device()) diff --git a/test/distributed/_tools/test_sac_ilp.py b/test/distributed/_tools/test_sac_ilp.py index 10d4d7a030f1..bd9c8d3a8136 100644 --- a/test/distributed/_tools/test_sac_ilp.py +++ b/test/distributed/_tools/test_sac_ilp.py @@ -211,7 +211,7 @@ class TestSACILP(TestCase): class TestOptimalCheckpointingPolicy(TestCase): - # tests are adpated from tests in xformers + # tests are adapted from tests in xformers # https://github.com/facebookresearch/xformers/blob/c6c0ac31f1b08542a0bc27278c6ed10f825f6963/tests/test_checkpoint.py#L222 def setUp(self): super().setUp() diff --git a/test/distributed/checkpoint/e2e/test_fsdp_ep.py b/test/distributed/checkpoint/e2e/test_fsdp_ep.py index 9d1e78d9d9fe..7489317035b9 100644 --- a/test/distributed/checkpoint/e2e/test_fsdp_ep.py +++ b/test/distributed/checkpoint/e2e/test_fsdp_ep.py @@ -72,7 +72,7 @@ class TestFSDPWithEP(DTensorTestBase, VerifyStateDictMixin): mesh_fsdp_tp = init_device_mesh( self.device_type, (2, 4), mesh_dim_names=("dp", "tp") ) - # TODO: we are using an internal API atm. Change to a publich API once it is ready. + # TODO: we are using an internal API atm. Change to a public API once it is ready. 
mesh_fsdp_ep = _mesh_resources.create_child_mesh(mesh_fsdp_tp, ("dp",)) del _mesh_resources.child_to_parent_mapping[mesh_fsdp_ep] diff --git a/test/distributed/checkpoint/test_state_dict.py b/test/distributed/checkpoint/test_state_dict.py index 9c4f6fb005a3..37bb6def9a94 100644 --- a/test/distributed/checkpoint/test_state_dict.py +++ b/test/distributed/checkpoint/test_state_dict.py @@ -109,7 +109,7 @@ class TestStateDict(DTensorTestBase, VerifyStateDictMixin): for d_optim in _dist_optim: d_optim.step() - # We need to ensure gradients don't exist, this the invarient of using DSD. + # We need to ensure gradients don't exist, this the invariant of using DSD. optim.zero_grad() # Get the state_dict, and compare the result @@ -135,7 +135,7 @@ class TestStateDict(DTensorTestBase, VerifyStateDictMixin): # We won't be able to load the partial state_dict back. return # Since we already have the state_dict saved before, no need to call DCP. - # We can directly load them back. This asser is to ensure that optimizer + # We can directly load them back. This assert is to ensure that optimizer # state storage are initialized. # self.assertEqual(len(curr_dist_osd[STATE]), len(dist_osd[STATE])) set_model_state_dict( diff --git a/test/distributed/elastic/rendezvous/api_test.py b/test/distributed/elastic/rendezvous/api_test.py index c72656e4cc14..938353a9ffa0 100644 --- a/test/distributed/elastic/rendezvous/api_test.py +++ b/test/distributed/elastic/rendezvous/api_test.py @@ -140,7 +140,17 @@ class RendezvousParametersTest(TestCase): self.assertFalse(params.get_as_bool("dummy_param")) def test_get_as_bool_raises_error_if_value_is_invalid(self) -> None: - for value in ["01", "Flse", "Ture", "g", "4", "_", "truefalse", 2, -1]: + for value in [ + "01", + "Flse", # codespell:ignore + "Ture", # codespell:ignore + "g", + "4", + "_", + "truefalse", + 2, + -1, + ]: with self.subTest(value=value): self._kwargs["dummy_param"] = value diff --git a/test/distributed/elastic/test_control_plane.py b/test/distributed/elastic/test_control_plane.py index 8fc51b5bf7e0..9b31cf3b1755 100644 --- a/test/distributed/elastic/test_control_plane.py +++ b/test/distributed/elastic/test_control_plane.py @@ -71,9 +71,9 @@ class WorkerServerTest(TestCase): self.assertEqual(resp.status, 200) self.assertIn("ping", json.loads(resp.data)) - resp = pool.request("POST", "/handler/nonexistant") + resp = pool.request("POST", "/handler/nonexistent") self.assertEqual(resp.status, 404) - self.assertIn(b"Handler nonexistant not found:", resp.data) + self.assertIn(b"Handler nonexistent not found:", resp.data) @requires_cuda def test_dump_nccl_trace_pickle(self) -> None: @@ -207,8 +207,8 @@ class WorkerServerTest(TestCase): def test_get_handler_nonexistant(self) -> None: from torch._C._distributed_c10d import _get_handler - with self.assertRaisesRegex(ValueError, "Failed to find handler nonexistant"): - _get_handler("nonexistant") + with self.assertRaisesRegex(ValueError, "Failed to find handler nonexistent"): + _get_handler("nonexistent") def test_get_handler_names(self) -> None: from torch._C._distributed_c10d import _get_handler_names diff --git a/test/distributed/fsdp/test_fsdp_memory.py b/test/distributed/fsdp/test_fsdp_memory.py index 2adaf6c27701..d10f78e3b3c7 100644 --- a/test/distributed/fsdp/test_fsdp_memory.py +++ b/test/distributed/fsdp/test_fsdp_memory.py @@ -158,7 +158,7 @@ class TestFSDPMemory(FSDPTest): output = cmp(results, expected) self.assertEqual(output, "") - @unittest.skipIf(TEST_HPU, "Memory will be differnt for CUDA and HPU, 
skipping") + @unittest.skipIf(TEST_HPU, "Memory will be different for CUDA and HPU, skipping") @skip_if_lt_x_gpu(2) @parametrize("ckpt", ["no_ckpt", "ckpt"]) def test_fsdp_memory(self, ckpt): diff --git a/test/distributed/pipelining/schedule_registry.py b/test/distributed/pipelining/schedule_registry.py index 1c5fcc9bf4a6..9b401193a172 100644 --- a/test/distributed/pipelining/schedule_registry.py +++ b/test/distributed/pipelining/schedule_registry.py @@ -45,7 +45,7 @@ class ScheduleVShaped(PipelineScheduleMulti): ) # Go through one microbatch - # Note(whc) - it might be easier to work with thes schedules by writing them as a list of + # Note(whc) - it might be easier to work with this schedules by writing them as a list of # ["0F0", ...] and then parsing them in the test infra to turn them into actions. self.pipeline_order = { 0: [ diff --git a/test/distributed/rpc/test_faulty_agent.py b/test/distributed/rpc/test_faulty_agent.py index a060a2e25349..f9e9db18cce5 100644 --- a/test/distributed/rpc/test_faulty_agent.py +++ b/test/distributed/rpc/test_faulty_agent.py @@ -22,7 +22,7 @@ from torch.testing._internal.distributed.rpc_utils import ( # On CircleCI these tests are already run on CPU jobs, thus to save resources do -# not run them on GPU jobs, since thet wouldn't provide additional test signal. +# not run them on GPU jobs, since they wouldn't provide additional test signal. if not (IS_CI and torch.cuda.is_available()): globals().update( generate_tests( diff --git a/test/distributed/rpc/test_tensorpipe_agent.py b/test/distributed/rpc/test_tensorpipe_agent.py index fe24d1e9f816..e21460ba04c8 100644 --- a/test/distributed/rpc/test_tensorpipe_agent.py +++ b/test/distributed/rpc/test_tensorpipe_agent.py @@ -23,7 +23,7 @@ from torch.testing._internal.distributed.rpc_utils import ( # On CircleCI these tests are already run on CPU jobs, thus to save resources do -# not run them on GPU jobs, since thet wouldn't provide additional test signal. +# not run them on GPU jobs, since they wouldn't provide additional test signal. if not (IS_CI and torch.cuda.is_available()): globals().update( generate_tests( diff --git a/test/distributed/tensor/experimental/test_local_map.py b/test/distributed/tensor/experimental/test_local_map.py index fbbec59293ba..1e1b4fa8f27d 100644 --- a/test/distributed/tensor/experimental/test_local_map.py +++ b/test/distributed/tensor/experimental/test_local_map.py @@ -90,7 +90,7 @@ class TestLocalMap(DTensorTestBase): ) # row-wisely sharded W tensor # Test 1: use the function returned from calling local_map - # get the function wrapped with DTensor/Tensor convertion + # get the function wrapped with DTensor/Tensor conversion # mm_allreduce_forward is a function that applies to Tensors with manual collective # local_mm_allreduce_forward is the function that does the same but applies to # DTensors' `_local_tensor`. 
diff --git a/test/distributed/tensor/experimental/test_tp_transform.py b/test/distributed/tensor/experimental/test_tp_transform.py index b225f987fd4a..2f52d9c18b2b 100644 --- a/test/distributed/tensor/experimental/test_tp_transform.py +++ b/test/distributed/tensor/experimental/test_tp_transform.py @@ -85,7 +85,7 @@ class TensorParallelTest(DTensorTestBase): with torch.no_grad(): tp_res = tp_model(*inputs) self.assertEqual(res, tp_res) - # Expect all_gather to be inserted to distributed sharded fc resutls + # Expect all_gather to be inserted to distributed sharded fc results self.assert_has_c10d_ops( tp_exported_program.graph_module, { diff --git a/test/distributed/tensor/test_dtensor.py b/test/distributed/tensor/test_dtensor.py index b41d87c7e44b..b82661454bfc 100644 --- a/test/distributed/tensor/test_dtensor.py +++ b/test/distributed/tensor/test_dtensor.py @@ -438,7 +438,7 @@ class DTensorTest(DTensorTestBase): self.assertEqual(type(out_view), AsyncCollectiveTensor) self.assertFalse(out.completed) - # Use the daa, requiring a sync + # Use the data, requiring a sync ref = torch.ones((4, 2), device=self.device_type) + 1 ref = ref.view(-1) out_data = out_view + 1 diff --git a/test/distributed/tensor/test_dtensor_compile.py b/test/distributed/tensor/test_dtensor_compile.py index 23114f87f46a..a26cf5da144f 100644 --- a/test/distributed/tensor/test_dtensor_compile.py +++ b/test/distributed/tensor/test_dtensor_compile.py @@ -220,7 +220,7 @@ def forward(self, b_parametrizations_buffer_original0, x): group1 = x.get_group(mesh_dim=1) return size, coord, group0, group1 - # Cant be fullgraph=True because ProcessGroup is not reconstructible in dynamo + # Can't be fullgraph=True because ProcessGroup is not reconstructible in dynamo compiled_fn = torch.compile(backend="aot_eager")(fn) mesh = DeviceMesh(self.device_type, torch.arange(self.world_size).unsqueeze(1)) diff --git a/test/distributed/tensor/test_embedding_ops.py b/test/distributed/tensor/test_embedding_ops.py index 9b90f5f1dffb..eabd4a55470e 100644 --- a/test/distributed/tensor/test_embedding_ops.py +++ b/test/distributed/tensor/test_embedding_ops.py @@ -193,7 +193,7 @@ class TestEmbeddingOp(DTensorTestBase): from torch.distributed.tensor._ops._embedding_ops import _MaskPartial - # case 1: two embeddings with the same shape, thus sharing the underying _MaskPartial + # case 1: two embeddings with the same shape, thus sharing the underlying _MaskPartial # and MaskBuffer, because of cache hit from sharding propagation emb1 = torch.nn.Embedding(10, 23, device=self.device_type) diff --git a/test/distributed/tensor/test_op_strategy.py b/test/distributed/tensor/test_op_strategy.py index 939fe0e47ee5..8bb96bf441b9 100644 --- a/test/distributed/tensor/test_op_strategy.py +++ b/test/distributed/tensor/test_op_strategy.py @@ -65,7 +65,7 @@ class TestEinsumDims(TestCase): self.assertEqual(edims.lhs_out_only_dims, ["c"]) self.assertEqual(edims.rhs_out_only_dims, []) - equation = "abd,bf->abfd" + equation = "abd,bf->abfd" # codespell:ignore input_dims, output_dim = EinsumDims.parse_equation(equation) edims = EinsumDims.parse_dims(input_dims, output_dim) diff --git a/test/distributed/tensor/test_redistribute.py b/test/distributed/tensor/test_redistribute.py index 8087b0144f36..b56f32dbcaea 100644 --- a/test/distributed/tensor/test_redistribute.py +++ b/test/distributed/tensor/test_redistribute.py @@ -635,7 +635,7 @@ class MultiDimRedistributeTest(DTensorTestBase): dt = distribute_tensor(full_tensor, device_mesh, repl_inputs) if repl_inputs != inputs: - # 
create a new DTensor reinterpreting some of the replicated entires as "Partial"
+                # create a new DTensor reinterpreting some of the replicated entries as "Partial"
                 dt = DTensor.from_local(
                     dt.to_local(), device_mesh, inputs, run_check=False
                 )
diff --git a/test/distributed/tensor/test_xla_integration.py b/test/distributed/tensor/test_xla_integration.py
index 3fbfcffbd76c..e39931e1f183 100644
--- a/test/distributed/tensor/test_xla_integration.py
+++ b/test/distributed/tensor/test_xla_integration.py
@@ -150,7 +150,7 @@ class DTensorXLAIntegrationTest(TestCase):

         def shard_params(mod_name, mod, mesh):
             shard_spec = [Shard(0)]
-            # annoate fc1 and fc2
+            # annotate fc1 and fc2
             if isinstance(mod, nn.Linear):
                 for _, param in mod.named_parameters():
                     # annotate the parameter tensors directly
diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
index abdb2ff2aee0..fd9e7594828d 100644
--- a/test/distributed/test_c10d_nccl.py
+++ b/test/distributed/test_c10d_nccl.py
@@ -601,7 +601,7 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase):

     def _helper_test_extra_cuda_context_by_nvml(self):
         """
-        A helper for `test_extra_cuda_context`, if pynvml is avaiable.
+        A helper for `test_extra_cuda_context`, if pynvml is available.
         pynvml provides python bindings for NVIDIA NVML functionalities.
         Here we are interested in: nvmlDeviceGetComputeRunningProcesses
         """
@@ -634,7 +634,7 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase):

     def _helper_test_extra_cuda_context_by_memory(self):
         """
-        A helper for `test_extra_cuda_context`, if pynvml is NOT avaiable.
+        A helper for `test_extra_cuda_context`, if pynvml is NOT available.
         If extra context is created, it would manifest into device 0's memory usage.
         """
         device = torch.device(f"cuda:{self.rank:d}")
@@ -1112,7 +1112,7 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase):
         os.environ["TORCH_NCCL_NONBLOCKING_TIMEOUT"] = "100"
         store = c10d.FileStore(self.file_name, self.world_size)
         device = torch.device(f"cuda:{self.rank}")
-        # bound device to triger eager init mode
+        # bound device to trigger eager init mode
         pg = self._create_process_group_nccl(store, self.opts(), device_id=device)
         backend = pg._get_backend(torch.device(device))
         self.assertEqual(backend.comm_split_count(), 0)
@@ -2995,7 +2995,7 @@ class NcclErrorHandlingTest(MultiProcessTestCase):
             time.sleep(4)
             self.assertEqual(process_group.get_error(), ErrorType.REMOTE_ERROR)

-        # Mimicing all ranks sensing the timeout, abort
+        # Mimicking all ranks sensing the timeout, abort
         process_group.abort()

         if prev_nccl_async_error_handling is not None:
@@ -4291,7 +4291,7 @@ class NCCLTraceTestBase(MultiProcessTestCase):

     def _join_processes(self, fn):
         # We need to patch sys.exit() as skip_if will use sys.exit() and
-        # the exit code from the this process will not be catched.
+        # the exit code from this process will not be caught.
         with mock.patch("sys.exit"):
             fn()
         super()._join_processes(fn)
diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py
index 73ac6eb0da7b..d3436bbe4754 100644
--- a/test/distributed/test_dynamo_distributed.py
+++ b/test/distributed/test_dynamo_distributed.py
@@ -1814,7 +1814,7 @@ class TestSingleProc(DynamoDistributedSingleProcTestCase):
         Note: comptime prints the guards before the time they get installed or not installed,
         so in both cases (skip or no skip) the same guards get printed. The difference is
         that in the skip case, they show up
-        with a special 'guard source' which will cuase them to not be installed.
So all we check for is the expected + with a special 'guard source' which will cause them to not be installed. So all we check for is the expected guard source 'local_fsdp_module'. """ global GUARDS_FILE @@ -1871,7 +1871,7 @@ class TestSingleProc(DynamoDistributedSingleProcTestCase): def test_fsdp_skip_register_attr_or_module(self): """ - ensure FSDP module is not registered as attrbutes + ensure FSDP module is not registered as attributes in the fx graph see `not source.guard_source().is_fsdp_module()` before calling `register_attr_or_module` diff --git a/test/distributed/test_inductor_collectives.py b/test/distributed/test_inductor_collectives.py index fcebcb772c9f..fad2f8195600 100644 --- a/test/distributed/test_inductor_collectives.py +++ b/test/distributed/test_inductor_collectives.py @@ -824,7 +824,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase): compiled = torch.compile(func) out = compiled(inputs, **self.get_world_trs()) code = run_and_get_triton_code(compiled, inputs, **self.get_world_trs()) - # NOTE: Make sure we are not unneccessarily copying the outputs of + # NOTE: Make sure we are not unnecessarily copying the outputs of # wait_tensors before they are returned from the graph. ( FileCheck() @@ -891,7 +891,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase): compiled = torch.compile(func) code = run_and_get_triton_code(compiled, inputs, **self.get_world_trs()) - # NOTE: Make sure we are not unneccessarily copying the outputs of + # NOTE: Make sure we are not unnecessarily copying the outputs of # wait_tensors before they are returned from the graph. ( FileCheck() @@ -1356,7 +1356,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase): compiled = torch.compile(func) code = run_and_get_triton_code(compiled, inputs, **self.get_world_trs()) - # NOTE: Make sure we are not unneccessarily copying the outputs of + # NOTE: Make sure we are not unnecessarily copying the outputs of # wait_tensors before they are returned from the graph. ( FileCheck() @@ -1403,7 +1403,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase): compiled = torch.compile(func) code = run_and_get_triton_code(compiled, inputs, **self.get_world_trs()) # NOTE: The first return value should be the output of the first wait_tensor. - # We want to make sure no unneccessary copy is made. + # We want to make sure no unnecessary copy is made. ( FileCheck() .check("buf0 = empty_strided") @@ -1474,7 +1474,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase): compiled = torch.compile(func) code = run_and_get_triton_code(compiled, inputs, **self.get_world_trs()) # NOTE: The first return value should be the output of the first wait_tensor. - # We want to make sure no unneccessary copy is made. + # We want to make sure no unnecessary copy is made. ( FileCheck() .check("buf0 = empty_strided") @@ -1548,7 +1548,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase): compiled = torch.compile(func) code = run_and_get_triton_code(compiled, *inputs, **self.get_world_trs()) # NOTE: The first return value should be the output of the first wait_tensor. - # We want to make sure no unneccessary copy is made. + # We want to make sure no unnecessary copy is made. 
         (FileCheck().check("all_gather_into_tensor_out").run(code))
         out = compiled(*inputs, **self.get_world_trs())
         correct = func(*inputs, **self.get_world_trs())
@@ -1598,7 +1598,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
         compiled = torch.compile(func)
         code = run_and_get_triton_code(compiled, *inputs, **self.get_world_trs())
         # NOTE: The first return value should be the output of the first wait_tensor.
-        # We want to make sure no unneccessary copy is made.
+        # We want to make sure no unnecessary copy is made.
         (
             FileCheck()
             .check_count(
@@ -1689,7 +1689,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
         compiled = torch.compile(func)
         code = run_and_get_triton_code(compiled, *inputs, **self.get_world_trs())
         # NOTE: The first return value should be the output of the first wait_tensor.
-        # We want to make sure no unneccessary copy is made.
+        # We want to make sure no unnecessary copy is made.
         (
             FileCheck()
             .check_count(
@@ -1785,7 +1785,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
         compiled = torch.compile(func)
         code = run_and_get_triton_code(compiled, inputs, **self.get_world_trs())
         # NOTE: The first return value should be the output of the first wait_tensor.
-        # We want to make sure no unneccessary copy is made.
+        # We want to make sure no unnecessary copy is made.
         (
             FileCheck()
             .check("all_gather")
diff --git a/test/distributed/test_nvshmem.py b/test/distributed/test_nvshmem.py
index 26460e7469a9..66ecdcd7e9c9 100644
--- a/test/distributed/test_nvshmem.py
+++ b/test/distributed/test_nvshmem.py
@@ -232,7 +232,7 @@ class NVSHMEMSymmetricMemoryTest(MultiProcContinousTest):
         )
         out = symm_mem.empty(max_out_numel, dtype=dtype, device=self.device).fill_(-1)
         # 3 rows: input splits, output splits, output offsets
-        # Initiallizing all values to -1 to check if they are updated
+        # Initializing all values to -1 to check if they are updated
         in_out_splits = symm_mem.empty(
             (3, nsplits), dtype=torch.int64, device=self.device
         ).fill_(-1)
diff --git a/test/distributed/test_pg_wrapper.py b/test/distributed/test_pg_wrapper.py
index 4c96d4b564d6..c1fbf05e60a1 100644
--- a/test/distributed/test_pg_wrapper.py
+++ b/test/distributed/test_pg_wrapper.py
@@ -376,7 +376,7 @@ if not TEST_WITH_DEV_DBG_ASAN:
             ):
                 self._create_wrapper_pg(with_new_group=True)
             # nothing to assert, isinstance(pg, _ProcessGroupWrapper)
-            # should never be invoked since it is preceeded by
+            # should never be invoked since it is preceded by
             # _GLOO_AVAILABLE check, this test will fail on
             # an unexpected NameError if not.

diff --git a/test/distributed/test_store.py b/test/distributed/test_store.py
index e9abb1d90717..870805eec75e 100644
--- a/test/distributed/test_store.py
+++ b/test/distributed/test_store.py
@@ -837,9 +837,9 @@ class RendezvousTCPTest(TestCase):
         # not respected, it will take much longer to timeout.
start = time.time() with self.assertRaisesRegex( - DistStoreError, "wait timeout after 100ms, keys: /nonexistant key" + DistStoreError, "wait timeout after 100ms, keys: /nonexistent key" ): - store0.get("nonexistant key") + store0.get("nonexistent key") end = time.time() time_diff = end - start @@ -1066,7 +1066,7 @@ class TimeoutTest(TestCase): wait_for_workers=False, ) - ths = [] + threads = [] for i in range(2): t = threading.Thread( target=run, @@ -1076,16 +1076,16 @@ class TimeoutTest(TestCase): ), ) t.start() - ths.append(t) + threads.append(t) def handler(a, b): pass signal.signal(signal.SIGUSR1, handler) time.sleep(1) - signal.pthread_kill(ths[1].ident, signal.SIGUSR1) + signal.pthread_kill(threads[1].ident, signal.SIGUSR1) - for t in ths: + for t in threads: t.join() self.assertTrue(rank_res[0], "rank0") self.assertTrue(rank_res[1], "rank1") diff --git a/test/distributed/test_symmetric_memory.py b/test/distributed/test_symmetric_memory.py index 7ea41a86f22f..ed39107a0676 100644 --- a/test/distributed/test_symmetric_memory.py +++ b/test/distributed/test_symmetric_memory.py @@ -664,7 +664,7 @@ class SymmMemNegativeTest(MultiProcessTestCase): # These timeout tests are skipped on ROCm because timeout calls trap(), which # is handled differently inside hip runtime. It collects gpu coredump and causes - # the linux kernel to create a core dump of the host application. The funcitonality + # the linux kernel to create a core dump of the host application. The functionality # is there, meaning timeout is happening correctly. However, there isn't a nice way # to test it as the current executing thread will coredump and exit. @skipIfRocm @@ -690,7 +690,7 @@ class SymmMemNegativeTest(MultiProcessTestCase): # These timeout tests are skipped on ROCm because timeout calls trap(), which # is handled differently inside hip runtime. It collects gpu coredump and causes - # the linux kernel to create a core dump of the host application. The funcitonality + # the linux kernel to create a core dump of the host application. The functionality # is there, meaning timeout is happening correctly. However, there isn't a nice way # to test it as the current executing thread will coredump and exit. @skipIfRocm @@ -719,7 +719,7 @@ class SymmMemNegativeTest(MultiProcessTestCase): # These timeout tests are skipped on ROCm because timeout calls trap(), which # is handled differently inside hip runtime. It collects gpu coredump and causes - # the linux kernel to create a core dump of the host application. The funcitonality + # the linux kernel to create a core dump of the host application. The functionality # is there, meaning timeout is happening correctly. However, there isn't a nice way # to test it as the current executing thread will coredump and exit. @skipIfRocm