diff --git a/torch/testing/_comparison.py b/torch/testing/_comparison.py index 6c4506f1a8a9..1d4a050b8047 100644 --- a/torch/testing/_comparison.py +++ b/torch/testing/_comparison.py @@ -92,7 +92,9 @@ def default_tolerances( f"Expected a torch.Tensor or a torch.dtype, but got {type(input)} instead." ) dtype_precisions = dtype_precisions or _DTYPE_PRECISIONS - rtols, atols = zip(*[dtype_precisions.get(dtype, (0.0, 0.0)) for dtype in dtypes]) + rtols, atols = zip( + *[dtype_precisions.get(dtype, (0.0, 0.0)) for dtype in dtypes], strict=True + ) return max(rtols), max(atols) diff --git a/torch/testing/_internal/autocast_test_lists.py b/torch/testing/_internal/autocast_test_lists.py index 11cfb179a97e..b3616fede6ce 100644 --- a/torch/testing/_internal/autocast_test_lists.py +++ b/torch/testing/_internal/autocast_test_lists.py @@ -437,7 +437,7 @@ class TestAutocast(TestCase): if isinstance(first, torch.Tensor): return torch.equal(first, second) elif isinstance(first, collections.abc.Iterable): - return all(compare(f, s) for f, s in zip(first, second)) + return all(compare(f, s) for f, s in zip(first, second, strict=False)) else: return first == second diff --git a/torch/testing/_internal/common_cuda.py b/torch/testing/_internal/common_cuda.py index 916221d33651..8202a32ae8ad 100644 --- a/torch/testing/_internal/common_cuda.py +++ b/torch/testing/_internal/common_cuda.py @@ -252,7 +252,7 @@ def tf32_on_and_off(tf32_precision=1e-5, *, only_if=True): @functools.wraps(f) def wrapped(*args, **kwargs): - kwargs.update(zip(arg_names, args)) + kwargs.update(zip(arg_names, args, strict=False)) cond = torch.cuda.is_tf32_supported() and only_if if 'device' in kwargs: cond = cond and (torch.device(kwargs['device']).type == 'cuda') @@ -325,7 +325,7 @@ def _create_scaling_models_optimizers(device="cuda", optimizer_ctor=torch.optim. mod_control = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 8)).to(device=device) mod_scaling = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 8)).to(device=device) with torch.no_grad(): - for c, s in zip(mod_control.parameters(), mod_scaling.parameters()): + for c, s in zip(mod_control.parameters(), mod_scaling.parameters(), strict=True): s.copy_(c) kwargs = {"lr": 1.0} diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index 64ea87852a86..719713e7c9f6 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -1153,7 +1153,7 @@ def run_subtests( subtest_config_values: list[list[Any]] = [item[1] for item in subtest_config_items] for values in itertools.product(*subtest_config_values): # Map keyword to chosen value - subtest_kwargs = dict(zip(subtest_config_keys, values)) + subtest_kwargs = dict(zip(subtest_config_keys, values, strict=True)) with cls_inst.subTest(**subtest_kwargs): torch._dynamo.reset() test_fn(*test_args, **test_kwargs, **subtest_kwargs) diff --git a/torch/testing/_internal/common_fsdp.py b/torch/testing/_internal/common_fsdp.py index c18fbccb795d..dd211599cf14 100644 --- a/torch/testing/_internal/common_fsdp.py +++ b/torch/testing/_internal/common_fsdp.py @@ -157,7 +157,7 @@ def _assert_module_states( assert rank0_states is not None # mypy for state in olist[1:]: assert state is not None # mypy - for (_, p1), (_, p2) in zip(rank0_states, state): + for (_, p1), (_, p2) in zip(rank0_states, state, strict=True): assert_fn(p1, p2) @@ -1135,7 +1135,9 @@ def check_sharded_parity( prefixes_to_ignore: tuple[str, ...] 
= (), ): for (replicated_name, replicated_param), (sharded_name, sharded_param) in zip( - replicated_module.named_parameters(), sharded_module.named_parameters() + replicated_module.named_parameters(), + sharded_module.named_parameters(), + strict=True, ): clean_sharded_name = sharded_name for prefix in prefixes_to_ignore: diff --git a/torch/testing/_internal/common_jit.py b/torch/testing/_internal/common_jit.py index 6ca05c51189b..ac6e851d7e28 100644 --- a/torch/testing/_internal/common_jit.py +++ b/torch/testing/_internal/common_jit.py @@ -135,7 +135,7 @@ def check_against_reference(self, func, reference_func, output_func, args, kwarg self.assertEqual(outputs, outputs_test) self.assertEqual(grads, grads_test) - for g2, g2_test in zip(grads2, grads2_test): + for g2, g2_test in zip(grads2, grads2_test, strict=True): if g2 is None and g2_test is None: continue self.assertEqual(g2, g2_test, atol=5e-4, rtol=1e-4) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index bafe4b241d3c..82e630519eb8 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -449,7 +449,7 @@ def sample_inputs_batch_norm(op_info, device, dtype, requires_grad, **kwargs): biases = [None, channels, None] is_training = [True, False, False] - for weight, bias, training in zip(weights, biases, is_training): + for weight, bias, training in zip(weights, biases, is_training, strict=True): yield SampleInput( make_arg(input_shape), args=( @@ -3631,7 +3631,7 @@ class _TestParamsMaxPoolBase: def _gen_kwargs(self): keys = self.kwargs.keys() for values in product(*self.kwargs.values()): - yield dict(zip(keys, values)) + yield dict(zip(keys, values, strict=True)) def gen_input_params(self): yield from product(self._gen_shape(), self._gen_kwargs()) @@ -4400,7 +4400,7 @@ def sample_inputs_instance_norm(opinfo, device, dtype, requires_grad, **kwargs): weights = [channels, None] biases = [None, None] - for weight_channels, bias_channels in zip(weights, biases): + for weight_channels, bias_channels in zip(weights, biases, strict=True): running_mean = make_arg_without_requires_grad(channels, low=0) running_var = make_arg_without_requires_grad(channels, low=0) yield SampleInput( @@ -11625,7 +11625,7 @@ def reference_searchsorted(sorted_sequence, boundary, out_int32=False, right=Fal split_sorter = [sorter[i] if (sorter is not None) else None for i in splits] split_ret = [np.searchsorted(s_seq, b, side=side, sorter=s_sort) - for (s_seq, b, s_sort) in zip(split_sequence, split_boundary, split_sorter)] + for (s_seq, b, s_sort) in zip(split_sequence, split_boundary, split_sorter, strict=True)] split_ret = [i.astype(np.int32) for i in split_ret] if out_int32 else split_ret return np.stack(split_ret).reshape(orig_shape) diff --git a/torch/testing/_internal/common_mkldnn.py b/torch/testing/_internal/common_mkldnn.py index 44da60a5ad1f..70ab98137bd7 100644 --- a/torch/testing/_internal/common_mkldnn.py +++ b/torch/testing/_internal/common_mkldnn.py @@ -91,7 +91,7 @@ def reduced_f32_on_and_off(bf32_precision=1e-2, tf32_precision=1e-5): @functools.wraps(f) def wrapped(*args, **kwargs): - kwargs.update(zip(arg_names, args)) + kwargs.update(zip(arg_names, args, strict=False)) cond = True if "device" in kwargs: cond = cond and (torch.device(kwargs["device"]).type == "cpu") diff --git a/torch/testing/_internal/common_modules.py b/torch/testing/_internal/common_modules.py index 2cd6a89a0452..120a76eb5ef3 100644 
--- a/torch/testing/_internal/common_modules.py +++ b/torch/testing/_internal/common_modules.py @@ -1413,7 +1413,7 @@ def module_inputs_torch_nn_L1Loss(module_info, device, dtype, requires_grad, tra forward_input=FunctionInput(make_input((2, 3, 4)), make_input((2, 3, 4))), reference_fn=lambda m, p, i, t: 1. / i.numel() * sum((a - b).abs().sum() - for a, b in zip(i, t))), + for a, b in zip(i, t, strict=True))), ModuleInput(constructor_input=FunctionInput(), forward_input=FunctionInput(make_input(()), make_input(())), reference_fn=lambda m, p, i, t: 1. / i.numel() * (i - t).abs().sum(), diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py index aaca0efe1eb4..68a35e8c40a1 100644 --- a/torch/testing/_internal/common_nn.py +++ b/torch/testing/_internal/common_nn.py @@ -2633,7 +2633,7 @@ def get_new_module_tests(): # add conv padding mode tests: for padding_mode, cpp_padding_mode in zip( ['reflect', 'circular', 'replicate', 'zeros'], - ['torch::kReflect', 'torch::kCircular', 'torch::kReplicate', 'torch::kZeros']): + ['torch::kReflect', 'torch::kCircular', 'torch::kReplicate', 'torch::kZeros'], strict=True): # conv signature: # in_channels, out_channels, kernel_size, stride=1, # padding=0, dilation=1, groups=1, @@ -2848,8 +2848,8 @@ def nllloss_reference(input, target, weight=None, ignore_index=-100, return (result, norm) losses_and_weights = [nll_loss_helper(i, t, weight, ignore_index) - for i, t in zip(input, target)] - losses, weights = zip(*losses_and_weights) + for i, t in zip(input, target, strict=True)] + losses, weights = zip(*losses_and_weights, strict=True) losses_tensor = input.new_tensor(losses) if reduction == 'mean': return sum(losses_tensor) / sum(weights) @@ -3268,7 +3268,7 @@ class NNTestCase(TestCase): for i in range(output_size): param, d_param = self._get_parameters(module) # make non grad zeros - d_param = [torch.zeros_like(p) if d is None else d for (p, d) in zip(param, d_param)] + d_param = [torch.zeros_like(p) if d is None else d for (p, d) in zip(param, d_param, strict=True)] d_out = torch.zeros_like(output) flat_d_out = d_out.view(-1) @@ -3282,7 +3282,7 @@ class NNTestCase(TestCase): d_input = self._backward(module, input, output, d_out) if jacobian_input: - for jacobian_x, d_x in zip(flat_jacobian_input, _iter_tensors(d_input)): + for jacobian_x, d_x in zip(flat_jacobian_input, _iter_tensors(d_input), strict=True): jacobian_x[:, i] = d_x.contiguous().view(-1) if jacobian_parameters: jacobian_param[:, i] = torch.cat(self._flatten_tensors(d_param), 0) @@ -3320,7 +3320,7 @@ class NNTestCase(TestCase): numerical_t = list(_iter_tensors(numerical)) differences = [] - for a, n in zip(analytical_t, numerical_t): + for a, n in zip(analytical_t, numerical_t, strict=True): if a.numel() != 0: differences.append(a.add(n, alpha=-1).abs().max()) # TODO: compare structure (ensure analytic jacobian has correct shape) @@ -3528,7 +3528,7 @@ class ModuleTest(TestBase): gpu_module = self.constructor(*self.constructor_args).float().cuda() cpu_param = test_case._get_parameters(cpu_module) gpu_param = test_case._get_parameters(gpu_module) - for cpu_p, gpu_p in zip(cpu_param[0], gpu_param[0]): + for cpu_p, gpu_p in zip(cpu_param[0], gpu_param[0], strict=True): gpu_p.data.copy_(cpu_p) test_case._zero_grad_input(cpu_input_tuple) @@ -3549,7 +3549,7 @@ class ModuleTest(TestBase): cpu_gradInput = test_case._backward(cpu_module, cpu_input_tuple, cpu_output, cpu_gradOutput) gpu_gradInput = test_case._backward(gpu_module, gpu_input_tuple, gpu_output, gpu_gradOutput) 
test_case.assertEqual(cpu_gradInput, gpu_gradInput, atol=self.precision, rtol=0, exact_dtype=False) - for cpu_d_p, gpu_d_p in zip(cpu_param[1], gpu_param[1]): + for cpu_d_p, gpu_d_p in zip(cpu_param[1], gpu_param[1], strict=True): test_case.assertEqual(cpu_d_p, gpu_d_p, atol=self.precision, rtol=0) # Run double-backwards on CPU and GPU and compare results @@ -3575,7 +3575,7 @@ class ModuleTest(TestBase): gpu_gradOutput, create_graph=True) - for cpu_d_i, gpu_d_i in zip(cpu_gradInputs, gpu_gradInputs): + for cpu_d_i, gpu_d_i in zip(cpu_gradInputs, gpu_gradInputs, strict=True): test_case.assertEqual(cpu_d_i, gpu_d_i, atol=self.precision, rtol=0, exact_dtype=False) # We mix output into the second backwards computation so that @@ -3598,7 +3598,7 @@ class ModuleTest(TestBase): gpu_input_tuple + (gpu_gradOutput,) + tuple(gpu_module.parameters()), retain_graph=True) test_case.assertEqual(cpu_gradInput, gpu_gradInput, atol=self.precision, rtol=0, exact_dtype=False) - for cpu_d_p, gpu_d_p in zip(cpu_gg, gpu_gg): + for cpu_d_p, gpu_d_p in zip(cpu_gg, gpu_gg, strict=True): test_case.assertEqual(cpu_d_p, gpu_d_p, atol=self.precision, rtol=0, exact_dtype=False) self.test_noncontig(test_case, gpu_module, gpu_input_tuple) diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 0146f37e4baf..284a3bdcfbd7 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -692,7 +692,7 @@ class parametrize(_TestParametrizer): return f"{name}{idx}" def _default_subtest_name(self, idx, values): - return '_'.join([self._formatted_str_repr(idx, a, v) for a, v in zip(self.arg_names, values)]) + return '_'.join([self._formatted_str_repr(idx, a, v) for a, v in zip(self.arg_names, values, strict=True)]) def _get_subtest_name(self, idx, values, explicit_name=None): if explicit_name: @@ -736,7 +736,7 @@ class parametrize(_TestParametrizer): raise RuntimeError(f'Expected # values == # arg names, but got: {len(values)} ' f'values and {len(self.arg_names)} names for test "{test.__name__}"') - param_kwargs = dict(zip(self.arg_names, values)) + param_kwargs = dict(zip(self.arg_names, values, strict=True)) test_name = self._get_subtest_name(idx, values, explicit_name=maybe_name) @@ -3696,7 +3696,7 @@ class TestCase(expecttest.TestCase): n_compressed_dims, n_plain_dims = size[-1 - dense_dims] // blocksize1, size[-2 - dense_dims] // blocksize0 blocknnz = nnz // (blocksize0 * blocksize1) sparse_tensors = [random_sparse_compressed(n_compressed_dims, n_plain_dims, blocknnz) for _ in range(n_batch)] - sparse_tensors_it = map(list, zip(*sparse_tensors)) + sparse_tensors_it = map(list, zip(*sparse_tensors, strict=True)) values = torch.stack(next(sparse_tensors_it)).reshape(*batch_shape, blocknnz, *blocksize, *dense_size) compressed_indices = torch.stack(next(sparse_tensors_it)).reshape(*batch_shape, -1) diff --git a/torch/testing/_internal/composite_compliance.py b/torch/testing/_internal/composite_compliance.py index c44c0f50ff5d..527fc8a5826e 100644 --- a/torch/testing/_internal/composite_compliance.py +++ b/torch/testing/_internal/composite_compliance.py @@ -234,7 +234,7 @@ def generate_cct_and_mode(autograd_view_consistency=True): # tensor results to be that of the tensors that alias the input result = func(*args, **kwargs) if isinstance(result, (tuple, list)): - for a, b in zip(rs, result): + for a, b in zip(rs, result, strict=True): a.set_(b) else: rs.set_(result) @@ -303,7 +303,7 @@ def generate_subclass_choices(flat_args, CCT, cct_mode): for 
which_args_are_wrapped in itertools.product(*subclass_options): result = [maybe_map(partial(wrap, CCT=CCT, cct_mode=cct_mode), should_wrap_arg, arg) - for should_wrap_arg, arg in zip(which_args_are_wrapped, flat_args)] + for should_wrap_arg, arg in zip(which_args_are_wrapped, flat_args, strict=True)] yield result, which_args_are_wrapped @@ -539,11 +539,11 @@ def check_forward_ad_formula(op: Callable, args, kwargs, gradcheck_wrapper=None, return fwAD.make_dual(primal.detach(), tangent) elif is_tensorlist(primal): return tuple(fwAD.make_dual(pri.detach(), tang) if tang is not None else pri - for pri, tang in zip(primal, tangent)) + for pri, tang in zip(primal, tangent, strict=True)) return primal def compute_expected_grad(args, tangent_args, kwargs, tangent_kwargs): - op_args = tuple(map(maybe_make_dual, zip(args, tangent_args))) + op_args = tuple(map(maybe_make_dual, zip(args, tangent_args, strict=True))) op_kwargs = {k: maybe_make_dual((v, tangent_kwargs[k])) for k, v in kwargs.items()} if gradcheck_wrapper is None: @@ -572,7 +572,7 @@ def check_forward_ad_formula(op: Callable, args, kwargs, gradcheck_wrapper=None, new_tang_args, new_tang_kwargs, \ which_tang_args_are_wrapped, which_tang_kwargs_are_wrapped = tang_choice - op_args = tuple(map(maybe_make_dual, zip(new_args, new_tang_args))) + op_args = tuple(map(maybe_make_dual, zip(new_args, new_tang_args, strict=True))) op_kwargs = {k: maybe_make_dual((v, new_tang_kwargs[k])) for k, v in new_kwargs.items()} try: diff --git a/torch/testing/_internal/custom_tensor.py b/torch/testing/_internal/custom_tensor.py index 9fa6f79ec68a..de1b44ba8dac 100644 --- a/torch/testing/_internal/custom_tensor.py +++ b/torch/testing/_internal/custom_tensor.py @@ -144,7 +144,9 @@ class CustomTensorPlainOut(torch.Tensor): new_out = pytree.tree_unflatten( ( CustomTensorPlainOut(tensor1, tensor2) - for tensor1, tensor2 in zip(out_inner_flat_1, out_inner_flat_2) + for tensor1, tensor2 in zip( + out_inner_flat_1, out_inner_flat_2, strict=True + ) ), spec, ) diff --git a/torch/testing/_internal/distributed/common_state_dict.py b/torch/testing/_internal/distributed/common_state_dict.py index 76b7800a8d2a..a78e312306ba 100644 --- a/torch/testing/_internal/distributed/common_state_dict.py +++ b/torch/testing/_internal/distributed/common_state_dict.py @@ -60,7 +60,7 @@ class VerifyStateDictMixin: dist_osd: dict[str, Any], ) -> None: params = list(chain.from_iterable(g["params"] for g in optim.param_groups)) - param_pid_mapping = dict(zip(params, range(len(params)))) + param_pid_mapping = dict(zip(params, range(len(params)), strict=True)) fqn_pid_mapping = {} for fqn, param in model.named_parameters(): pid = param_pid_mapping[param] @@ -90,7 +90,7 @@ class VerifyStateDictMixin: dist_osd[_PG] = [new_pg] self.assertEqual(len(osd[_PG]), len(dist_osd[_PG])) - for group, dist_group in zip(osd[_PG], dist_osd[_PG]): + for group, dist_group in zip(osd[_PG], dist_osd[_PG], strict=True): self.assertEqual(len(group), len(dist_group)) for key, value in group.items(): # Below doesn't work because param_groups can have None diff --git a/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py b/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py index 428224022a45..ca9bc297010a 100644 --- a/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py +++ b/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py @@ -238,7 +238,9 @@ class Trainer: sparse_microbatch = torch.split(sparse_features, 2) values_microbatch = torch.split(values, 2) 
batches = [] - for d, s, v in zip(dense_microbatch, sparse_microbatch, values_microbatch): + for d, s, v in zip( + dense_microbatch, sparse_microbatch, values_microbatch, strict=True + ): feature_set = FeatureSet(dense_features=d, sparse_features=s, values=v) batches.append(feature_set) diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index 62ef8d4a5eca..c41602d43994 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -678,7 +678,7 @@ class DistributedTest: # Verify buffers across ranks. m1_buffers = list(m1.buffers()) m2_buffers = list(m2.buffers()) - for buf1, buf2 in zip(m1_buffers, m2_buffers): + for buf1, buf2 in zip(m1_buffers, m2_buffers, strict=True): gathered_bufs = [ torch.empty_like(buf1) for _ in range(dist.get_world_size()) ] @@ -3045,7 +3045,7 @@ class DistributedTest: curr_values = master_values if rank == src else worker_values tensors = [ _build_tensor(src + 1, val, dtype=dtype) - for dtype, val in zip(dtypes, curr_values) + for dtype, val in zip(dtypes, curr_values, strict=True) ] if cuda: tensors = [t.cuda(rank_to_GPU[rank][0]) for t in tensors] @@ -3066,7 +3066,9 @@ class DistributedTest: ) expected_tensors = [ _build_tensor(src + 1, expected_value, dtype=dtype) - for dtype, expected_value in zip(dtypes, expected_values) + for dtype, expected_value in zip( + dtypes, expected_values, strict=True + ) ] self.assertEqual(tensors, expected_tensors) @@ -3338,7 +3340,7 @@ class DistributedTest: ) if rank == dest: expected_tensors = [_build_tensor(dest + 1, i) for i in group] - for t1, t2 in zip(tensors, expected_tensors): + for t1, t2 in zip(tensors, expected_tensors, strict=True): self.assertEqual(t1, t2) self._barrier() @@ -3440,7 +3442,7 @@ class DistributedTest: expected_tensors = [ _build_tensor(dest + 1, i, dtype=dtype) for i in group ] - for t1, t2 in zip(tensors, expected_tensors): + for t1, t2 in zip(tensors, expected_tensors, strict=True): self.assertEqual(t1, t2) self._barrier() @@ -3624,8 +3626,8 @@ class DistributedTest: tensor_shapes=tensor_shapes, ) - for l1, l2 in zip(output_tensor_lists, expected_tensors): - for t1, t2 in zip(l1, l2): + for l1, l2 in zip(output_tensor_lists, expected_tensors, strict=True): + for t1, t2 in zip(l1, l2, strict=True): if not torch.equal(t1, t2): return False return True @@ -3824,7 +3826,7 @@ class DistributedTest: ] out_tensors = [t.cuda(rank_to_GPU[rank][0]) for t in out_tensors] dist.all_to_all(out_tensors, in_tensors, group=group_id) - for t1, t2 in zip(out_tensors, expected_tensors): + for t1, t2 in zip(out_tensors, expected_tensors, strict=True): self.assertEqual(t1, t2) self._barrier() @@ -4203,7 +4205,7 @@ class DistributedTest: def _assert_equal_param(self, param_gpu, param_DDP): self.assertEqual(len(param_gpu), len(param_DDP)) - for p_gpu, p_DDP in zip(param_gpu, param_DDP): + for p_gpu, p_DDP in zip(param_gpu, param_DDP, strict=True): self.assertEqual(p_gpu, p_DDP) def _test_DDP_niter( @@ -4618,6 +4620,7 @@ class DistributedTest: for hook_param, allreduce_param in zip( ddp_model_with_optimizer_hook.parameters(), ddp_model_with_no_hook.parameters(), + strict=True, ): self.assertEqual(hook_param, allreduce_param) @@ -4649,6 +4652,7 @@ class DistributedTest: for hook_param, allreduce_param in zip( ddp_model_with_optimizer_hook.parameters(), ddp_model_with_no_hook.parameters(), + strict=True, ): self.assertEqual(hook_param, allreduce_param) @@ -4825,7 +4829,9 @@ class 
DistributedTest: optimizer_kwargs=optim_kwargs, ) - for p1, p2 in zip(model.parameters(), model_optim_in_bwd.parameters()): + for p1, p2 in zip( + model.parameters(), model_optim_in_bwd.parameters(), strict=True + ): self.assertEqual(p1, p2, "Parameters not initially equal!") # Enable determinism in cudnn operators with torch.backends.cudnn.flags( @@ -4843,7 +4849,9 @@ class DistributedTest: inp ).sum().backward() # runs optimizer as well for p1, p2 in zip( - model.parameters(), model_optim_in_bwd.parameters() + model.parameters(), + model_optim_in_bwd.parameters(), + strict=True, ): self.assertEqual( p1, p2, f"Params not equal at iteration {i}" @@ -5323,7 +5331,9 @@ class DistributedTest: # sync grads step_model(ddp_model, ddp_input, ddp_target) - for i, j in zip(model.parameters(), ddp_model.parameters()): + for i, j in zip( + model.parameters(), ddp_model.parameters(), strict=True + ): if not i.requires_grad: continue if iteration % 2 == 0: @@ -5562,6 +5572,7 @@ class DistributedTest: for i, j in zip( ddp_model_grad_not_view.parameters(), ddp_model_grad_is_view.parameters(), + strict=True, ): self.assertEqual(i, j) @@ -5667,7 +5678,9 @@ class DistributedTest: target, ) for p1, p2 in zip( - net.parameters(), net_using_post_localSGD_opt.parameters() + net.parameters(), + net_using_post_localSGD_opt.parameters(), + strict=True, ): self.assertEqual(p1.data, p2.data) @@ -6817,7 +6830,7 @@ class DistributedTest: # they are the same as new_model on rank_to_broadcast. if rank == rank_to_broadcast: expected_states = new_model.state_dict().values() - for t, expected in zip(net_module_states, expected_states): + for t, expected in zip(net_module_states, expected_states, strict=True): self.assertEqual(t, expected) @skip_if_lt_x_gpu(2) @@ -7134,7 +7147,9 @@ class DistributedTest: # Validate model state dicts are equal for (_, local_tensor), (_, dist_tensor) in zip( - local_model.state_dict().items(), net.module.state_dict().items() + local_model.state_dict().items(), + net.module.state_dict().items(), + strict=True, ): self.assertEqual(local_tensor, dist_tensor) @@ -7722,13 +7737,17 @@ class DistributedTest: # materialized param grad is not touched by DDP, so its grad should # be the same as if running locally. for materialized_param, local_param in zip( - ddp.module.fc2.parameters(), local_model.fc2.parameters() + ddp.module.fc2.parameters(), + local_model.fc2.parameters(), + strict=True, ): self.assertEqual(materialized_param.grad, local_param.grad) # fc1 parameter grad should still be different, due to allreduce. 
for synced_param, local_param in zip( - ddp.module.fc1.parameters(), local_model.fc1.parameters() + ddp.module.fc1.parameters(), + local_model.fc1.parameters(), + strict=True, ): self.assertFalse(synced_param.grad == local_param.grad) @@ -8581,7 +8600,7 @@ class DistributedTest: # Verify grads are the same for local_param, dist_param in zip( - local_net.parameters(), net.parameters() + local_net.parameters(), net.parameters(), strict=True ): local_grad = local_param.grad dist_grad = dist_param.grad @@ -8631,7 +8650,7 @@ class DistributedTest: torch._C._functions.UndefinedGrad()(out).backward() torch._C._functions.UndefinedGrad()(local_out).backward() for (dist_param_name, dist_param), (local_param_name, local_param) in zip( - net.named_parameters(), local_net.named_parameters() + net.named_parameters(), local_net.named_parameters(), strict=True ): dist_grad = dist_param.grad local_grad = local_param.grad @@ -8689,7 +8708,9 @@ class DistributedTest: self.assertTrue( static_model._get_ddp_logging_data().get("has_rebuilt_buckets", 0) ) - for i, j in zip(base_model.parameters(), static_model.parameters()): + for i, j in zip( + base_model.parameters(), static_model.parameters(), strict=True + ): self.assertEqual(i, j) @require_backend_is_available({"gloo"}) @@ -9297,7 +9318,7 @@ class DistributedTest: loss_static.backward() self._model_step(model_static_graph) for p, p_static in zip( - model.parameters(), model_static_graph.parameters() + model.parameters(), model_static_graph.parameters(), strict=True ): self.assertEqual(p, p_static) @@ -9974,7 +9995,7 @@ class DistributedTest: p.grad.data = p.grad / iters for p_ddp, p_local in zip( - model.parameters(), local_model.parameters() + model.parameters(), local_model.parameters(), strict=True ): self.assertTrue( torch.allclose(p_ddp.grad, p_local.grad), @@ -10191,7 +10212,9 @@ class DistributedTest: # (refer to https://github.com/numpy/numpy/blob/266aad7478bc7fbcc55eea7f942a0d373b838396/numpy/random/mtrand.pyi) # To make sure random state was restored properly, all entries should equal the original for entry1, entry2 in zip( - hook_state.rng.get_state(), dummy_hook_state.rng.get_state() + hook_state.rng.get_state(), + dummy_hook_state.rng.get_state(), + strict=True, ): np.testing.assert_array_equal(entry1, entry2) @@ -10212,7 +10235,7 @@ class DistributedTest: # Check that gradients after 10 epochs are the same for orig_param, dummy_param in zip( - ddp_model.parameters(), dummy_ddp_model.parameters() + ddp_model.parameters(), dummy_ddp_model.parameters(), strict=True ): self.assertEqual(orig_param.grad, dummy_param.grad) @@ -10299,7 +10322,9 @@ class DistributedTest: self.assertEqual(out_ddp, out_ddp_static) out_ddp.backward() out_ddp_static.backward() - for p1, p2 in zip(ddp.parameters(), ddp_static.parameters()): + for p1, p2 in zip( + ddp.parameters(), ddp_static.parameters(), strict=True + ): self.assertEqual(p1.grad, p2.grad) @skip_if_lt_x_gpu(2) @@ -10392,7 +10417,9 @@ class DistributedTest: test_model_1._get_ddp_logging_data().get("num_buckets_reduced"), 1 ) - for i, j in zip(base_model.parameters(), test_model_1.parameters()): + for i, j in zip( + base_model.parameters(), test_model_1.parameters(), strict=True + ): self.assertEqual(i, j) diff --git a/torch/testing/_internal/distributed/multi_threaded_pg.py b/torch/testing/_internal/distributed/multi_threaded_pg.py index 1f5d1ef1bdbd..2cc22cb7c23a 100644 --- a/torch/testing/_internal/distributed/multi_threaded_pg.py +++ b/torch/testing/_internal/distributed/multi_threaded_pg.py @@ -457,7 
+457,9 @@ class ProcessLocalGroup(dist.ProcessGroup): ): works = [ self._reduce_scatter_base(output_tensor, input_tensor, opts) - for output_tensor, input_tensor in zip(output_tensors, input_tensors) + for output_tensor, input_tensor in zip( + output_tensors, input_tensors, strict=True + ) ] for work in works[:-1]: work.wait() @@ -467,7 +469,7 @@ class ProcessLocalGroup(dist.ProcessGroup): self, output_tensor_list, input_tensor_list, opts=AllgatherOptions() ): res = None - for o_t, i_t in zip(output_tensor_list, input_tensor_list): + for o_t, i_t in zip(output_tensor_list, input_tensor_list, strict=True): res = self._allgather_base(o_t, i_t) return res diff --git a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py index f7cb2075e373..1d6c7500c5ad 100644 --- a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py +++ b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py @@ -2749,7 +2749,7 @@ class TensorPipeCudaDistAutogradTest(RpcAgentTestFixture): for i in range(len(futs)): local_gradients = [p.grad for p in local_layers[i].parameters()] - for g1, g2 in zip(futs[i].wait(), local_gradients): + for g1, g2 in zip(futs[i].wait(), local_gradients, strict=True): self.assertEqual(g1, g2) rpc.shutdown() diff --git a/torch/testing/_internal/distributed/rpc/examples/parameter_server_test.py b/torch/testing/_internal/distributed/rpc/examples/parameter_server_test.py index f84ba5225c6e..ad0b7fbe2207 100644 --- a/torch/testing/_internal/distributed/rpc/examples/parameter_server_test.py +++ b/torch/testing/_internal/distributed/rpc/examples/parameter_server_test.py @@ -46,7 +46,7 @@ class BatchUpdateParameterServer: @rpc.functions.async_execution def update_and_fetch_model(ps_rref, grads): self = ps_rref.local_value() - for p, g in zip(self.model.parameters(), grads): + for p, g in zip(self.model.parameters(), grads, strict=True): if p.grad is None: p.grad = g else: diff --git a/torch/testing/_internal/distributed/rpc/examples/reinforcement_learning_rpc_test.py b/torch/testing/_internal/distributed/rpc/examples/reinforcement_learning_rpc_test.py index beb08a25484d..57008aed17db 100644 --- a/torch/testing/_internal/distributed/rpc/examples/reinforcement_learning_rpc_test.py +++ b/torch/testing/_internal/distributed/rpc/examples/reinforcement_learning_rpc_test.py @@ -216,7 +216,7 @@ class Agent: returns.insert(0, R) returns = torch.tensor(returns) returns = (returns - returns.mean()) / (returns.std() + self.eps) - for log_prob, R in zip(probs, returns): + for log_prob, R in zip(probs, returns, strict=True): policy_loss.append(-log_prob * R) self.optimizer.zero_grad() policy_loss = torch.cat(policy_loss).sum() diff --git a/torch/testing/_internal/jit_utils.py b/torch/testing/_internal/jit_utils.py index 4bc0738ec2f3..e98d0e482683 100644 --- a/torch/testing/_internal/jit_utils.py +++ b/torch/testing/_internal/jit_utils.py @@ -249,7 +249,7 @@ class JitTestCase(JitCommonTestCase): saved_module_buffer_2.seek(0) code_files_2, _debug_files_2 = extract_files(saved_module_buffer_2) - for a, b in zip(code_files, code_files_2): + for a, b in zip(code_files, code_files_2, strict=True): self.assertMultiLineEqual(a, b) if isinstance(m, torch._C.ScriptModule): @@ -617,7 +617,7 @@ class JitTestCase(JitCommonTestCase): self.assertEqual(outputs, outputs_ge) if inputs_require_grads: self.assertEqual(grads, grads_ge, atol=grad_atol, rtol=grad_rtol) - for g2, g2_ge in zip(grads2, grads2_ge): + for g2, g2_ge in zip(grads2, grads2_ge, 
strict=True): if g2 is None and g2_ge is None: continue self.assertEqual(g2, g2_ge, atol=8e-4, rtol=8e-4) diff --git a/torch/testing/_internal/logging_utils.py b/torch/testing/_internal/logging_utils.py index 1632149c6584..1e1ecf8f4f70 100644 --- a/torch/testing/_internal/logging_utils.py +++ b/torch/testing/_internal/logging_utils.py @@ -228,11 +228,11 @@ def multiple_logs_to_string(module: str, *log_options: str) -> tuple[list[io.Str def tmp_redirect_logs(): loggers = [torch._logging.getArtifactLogger(module, option) for option in log_options] try: - for logger, handler in zip(loggers, handlers): + for logger, handler in zip(loggers, handlers, strict=True): logger.addHandler(handler) yield finally: - for logger, handler in zip(loggers, handlers): + for logger, handler in zip(loggers, handlers, strict=True): logger.removeHandler(handler) def ctx_manager() -> AbstractContextManager[None]: diff --git a/torch/testing/_internal/opinfo/definitions/_masked.py b/torch/testing/_internal/opinfo/definitions/_masked.py index 4ff16b343715..d65fbef658a4 100644 --- a/torch/testing/_internal/opinfo/definitions/_masked.py +++ b/torch/testing/_internal/opinfo/definitions/_masked.py @@ -402,9 +402,9 @@ def sample_inputs_masked_logaddexp(op_info, device, dtype, requires_grad, **kwar make_tensor, dtype=dtype, device=device, requires_grad=requires_grad ) for shape, input_masks, other_masks in zip( - shapes, input_mask_lists, other_mask_lists + shapes, input_mask_lists, other_mask_lists, strict=True ): - for input_mask, other_mask in zip(input_masks, other_masks): + for input_mask, other_mask in zip(input_masks, other_masks, strict=True): yield SampleInput( make_arg(shape), make_arg(shape), diff --git a/torch/testing/_internal/two_tensor.py b/torch/testing/_internal/two_tensor.py index 3a503c741e88..8197829ac7f4 100644 --- a/torch/testing/_internal/two_tensor.py +++ b/torch/testing/_internal/two_tensor.py @@ -78,7 +78,7 @@ class TwoTensor(torch.Tensor): # our two inner tensors return the same value out_flat = [ cls(o_a, o_b) if isinstance(o_a, torch.Tensor) else o_a - for o_a, o_b in zip(out_a_flat, out_b_flat) + for o_a, o_b in zip(out_a_flat, out_b_flat, strict=True) ] out = pytree.tree_unflatten(out_flat, spec) from torch._higher_order_ops.cond import cond_op
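
Note: every updated call site above relies on the `strict=` keyword argument to `zip()`, available since Python 3.10 (PEP 618). The snippet below is a minimal sketch (not part of the patch; the variable names are illustrative only) of the behavior these changes opt into, and of why a few sites deliberately pass `strict=False`:

    # Plain zip() silently truncates to the shortest iterable, which can hide
    # a length mismatch between e.g. parameters and their gradients.
    params = [1, 2, 3]
    grads = [10, 20]

    list(zip(params, grads))                # [(1, 10), (2, 20)] -- mismatch goes unnoticed

    try:
        list(zip(params, grads, strict=True))
    except ValueError as exc:
        # strict=True raises ValueError because the second iterable is shorter,
        # turning the silent truncation into a loud test failure.
        print(exc)

    # strict=False keeps the old truncating behavior but makes the intent explicit,
    # e.g. kwargs.update(zip(arg_names, args, strict=False)) where positional args
    # are expected to cover only a prefix of the argument names.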