Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-21 05:34:18 +08:00
[1/N] Add strict parameter to Python zip calls (#165531)
Add `strict=True/False` to `zip()` calls in the test utilities. `strict=True` is passed wherever the zipped iterables are known to have equal length; the remaining call sites use `strict=False` to keep the previous truncating behavior.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165531
Approved by: https://github.com/Skylion007
Committed by: PyTorch MergeBot
Parent: 0f0b4bf029
Commit: aaac8cb0f5
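For context (not part of the diff): `zip(..., strict=...)` is the Python 3.10+ mechanism this change relies on. With `strict=True`, a length mismatch between the zipped iterables raises `ValueError` instead of being silently truncated; `strict=False` keeps the historical truncating behavior, presumably why the decorator-style call sites below (e.g. `kwargs.update(zip(arg_names, args))`, where callers may pass fewer positional arguments than there are parameter names) stay non-strict. A minimal sketch with made-up lists:

```python
# Minimal sketch of the zip(strict=...) behavior (Python 3.10+).
# The names below are illustrative only; they do not come from the diff.
arg_names = ["device", "dtype"]
args = ("cuda",)  # caller supplied fewer positional args than there are names

# strict=False (same as the pre-3.10 default): the unmatched name is dropped.
assert dict(zip(arg_names, args, strict=False)) == {"device": "cuda"}

# strict=True: the same mismatch becomes a loud error.
try:
    dict(zip(arg_names, args, strict=True))
except ValueError as exc:
    print(f"caught length mismatch: {exc}")
```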
@@ -92,7 +92,9 @@ def default_tolerances(
                 f"Expected a torch.Tensor or a torch.dtype, but got {type(input)} instead."
             )
     dtype_precisions = dtype_precisions or _DTYPE_PRECISIONS
-    rtols, atols = zip(*[dtype_precisions.get(dtype, (0.0, 0.0)) for dtype in dtypes])
+    rtols, atols = zip(
+        *[dtype_precisions.get(dtype, (0.0, 0.0)) for dtype in dtypes], strict=True
+    )
     return max(rtols), max(atols)
@@ -437,7 +437,7 @@ class TestAutocast(TestCase):
             if isinstance(first, torch.Tensor):
                 return torch.equal(first, second)
             elif isinstance(first, collections.abc.Iterable):
-                return all(compare(f, s) for f, s in zip(first, second))
+                return all(compare(f, s) for f, s in zip(first, second, strict=False))
             else:
                 return first == second
@@ -252,7 +252,7 @@ def tf32_on_and_off(tf32_precision=1e-5, *, only_if=True):
 
         @functools.wraps(f)
         def wrapped(*args, **kwargs):
-            kwargs.update(zip(arg_names, args))
+            kwargs.update(zip(arg_names, args, strict=False))
             cond = torch.cuda.is_tf32_supported() and only_if
             if 'device' in kwargs:
                 cond = cond and (torch.device(kwargs['device']).type == 'cuda')
@@ -325,7 +325,7 @@ def _create_scaling_models_optimizers(device="cuda", optimizer_ctor=torch.optim.
     mod_control = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 8)).to(device=device)
     mod_scaling = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 8)).to(device=device)
     with torch.no_grad():
-        for c, s in zip(mod_control.parameters(), mod_scaling.parameters()):
+        for c, s in zip(mod_control.parameters(), mod_scaling.parameters(), strict=True):
            s.copy_(c)
 
     kwargs = {"lr": 1.0}
@@ -1153,7 +1153,7 @@ def run_subtests(
     subtest_config_values: list[list[Any]] = [item[1] for item in subtest_config_items]
     for values in itertools.product(*subtest_config_values):
         # Map keyword to chosen value
-        subtest_kwargs = dict(zip(subtest_config_keys, values))
+        subtest_kwargs = dict(zip(subtest_config_keys, values, strict=True))
         with cls_inst.subTest(**subtest_kwargs):
             torch._dynamo.reset()
             test_fn(*test_args, **test_kwargs, **subtest_kwargs)
@@ -157,7 +157,7 @@ def _assert_module_states(
     assert rank0_states is not None  # mypy
     for state in olist[1:]:
         assert state is not None  # mypy
-        for (_, p1), (_, p2) in zip(rank0_states, state):
+        for (_, p1), (_, p2) in zip(rank0_states, state, strict=True):
             assert_fn(p1, p2)
@@ -1135,7 +1135,9 @@ def check_sharded_parity(
     prefixes_to_ignore: tuple[str, ...] = (),
 ):
     for (replicated_name, replicated_param), (sharded_name, sharded_param) in zip(
-        replicated_module.named_parameters(), sharded_module.named_parameters()
+        replicated_module.named_parameters(),
+        sharded_module.named_parameters(),
+        strict=True,
     ):
         clean_sharded_name = sharded_name
         for prefix in prefixes_to_ignore:
@@ -135,7 +135,7 @@ def check_against_reference(self, func, reference_func, output_func, args, kwarg
 
     self.assertEqual(outputs, outputs_test)
     self.assertEqual(grads, grads_test)
-    for g2, g2_test in zip(grads2, grads2_test):
+    for g2, g2_test in zip(grads2, grads2_test, strict=True):
         if g2 is None and g2_test is None:
             continue
         self.assertEqual(g2, g2_test, atol=5e-4, rtol=1e-4)
@@ -449,7 +449,7 @@ def sample_inputs_batch_norm(op_info, device, dtype, requires_grad, **kwargs):
     biases = [None, channels, None]
     is_training = [True, False, False]
 
-    for weight, bias, training in zip(weights, biases, is_training):
+    for weight, bias, training in zip(weights, biases, is_training, strict=True):
         yield SampleInput(
             make_arg(input_shape),
             args=(
@@ -3631,7 +3631,7 @@ class _TestParamsMaxPoolBase:
     def _gen_kwargs(self):
         keys = self.kwargs.keys()
         for values in product(*self.kwargs.values()):
-            yield dict(zip(keys, values))
+            yield dict(zip(keys, values, strict=True))
 
     def gen_input_params(self):
         yield from product(self._gen_shape(), self._gen_kwargs())
@@ -4400,7 +4400,7 @@ def sample_inputs_instance_norm(opinfo, device, dtype, requires_grad, **kwargs):
     weights = [channels, None]
     biases = [None, None]
 
-    for weight_channels, bias_channels in zip(weights, biases):
+    for weight_channels, bias_channels in zip(weights, biases, strict=True):
         running_mean = make_arg_without_requires_grad(channels, low=0)
         running_var = make_arg_without_requires_grad(channels, low=0)
         yield SampleInput(
@@ -11625,7 +11625,7 @@ def reference_searchsorted(sorted_sequence, boundary, out_int32=False, right=Fal
        split_sorter = [sorter[i] if (sorter is not None) else None for i in splits]
 
        split_ret = [np.searchsorted(s_seq, b, side=side, sorter=s_sort)
-                    for (s_seq, b, s_sort) in zip(split_sequence, split_boundary, split_sorter)]
+                    for (s_seq, b, s_sort) in zip(split_sequence, split_boundary, split_sorter, strict=True)]
        split_ret = [i.astype(np.int32) for i in split_ret] if out_int32 else split_ret
        return np.stack(split_ret).reshape(orig_shape)
@@ -91,7 +91,7 @@ def reduced_f32_on_and_off(bf32_precision=1e-2, tf32_precision=1e-5):
 
         @functools.wraps(f)
         def wrapped(*args, **kwargs):
-            kwargs.update(zip(arg_names, args))
+            kwargs.update(zip(arg_names, args, strict=False))
             cond = True
             if "device" in kwargs:
                 cond = cond and (torch.device(kwargs["device"]).type == "cpu")
@@ -1413,7 +1413,7 @@ def module_inputs_torch_nn_L1Loss(module_info, device, dtype, requires_grad, tra
                     forward_input=FunctionInput(make_input((2, 3, 4)),
                                                 make_input((2, 3, 4))),
                     reference_fn=lambda m, p, i, t: 1. / i.numel() * sum((a - b).abs().sum()
-                                                                         for a, b in zip(i, t))),
+                                                                         for a, b in zip(i, t, strict=True))),
         ModuleInput(constructor_input=FunctionInput(),
                     forward_input=FunctionInput(make_input(()), make_input(())),
                     reference_fn=lambda m, p, i, t: 1. / i.numel() * (i - t).abs().sum(),
@@ -2633,7 +2633,7 @@ def get_new_module_tests():
     # add conv padding mode tests:
     for padding_mode, cpp_padding_mode in zip(
             ['reflect', 'circular', 'replicate', 'zeros'],
-            ['torch::kReflect', 'torch::kCircular', 'torch::kReplicate', 'torch::kZeros']):
+            ['torch::kReflect', 'torch::kCircular', 'torch::kReplicate', 'torch::kZeros'], strict=True):
         # conv signature:
         # in_channels, out_channels, kernel_size, stride=1,
         # padding=0, dilation=1, groups=1,
@@ -2848,8 +2848,8 @@ def nllloss_reference(input, target, weight=None, ignore_index=-100,
         return (result, norm)
 
     losses_and_weights = [nll_loss_helper(i, t, weight, ignore_index)
-                          for i, t in zip(input, target)]
-    losses, weights = zip(*losses_and_weights)
+                          for i, t in zip(input, target, strict=True)]
+    losses, weights = zip(*losses_and_weights, strict=True)
     losses_tensor = input.new_tensor(losses)
     if reduction == 'mean':
         return sum(losses_tensor) / sum(weights)
@@ -3268,7 +3268,7 @@ class NNTestCase(TestCase):
         for i in range(output_size):
             param, d_param = self._get_parameters(module)
             # make non grad zeros
-            d_param = [torch.zeros_like(p) if d is None else d for (p, d) in zip(param, d_param)]
+            d_param = [torch.zeros_like(p) if d is None else d for (p, d) in zip(param, d_param, strict=True)]
 
             d_out = torch.zeros_like(output)
             flat_d_out = d_out.view(-1)
@@ -3282,7 +3282,7 @@ class NNTestCase(TestCase):
             d_input = self._backward(module, input, output, d_out)
 
             if jacobian_input:
-                for jacobian_x, d_x in zip(flat_jacobian_input, _iter_tensors(d_input)):
+                for jacobian_x, d_x in zip(flat_jacobian_input, _iter_tensors(d_input), strict=True):
                     jacobian_x[:, i] = d_x.contiguous().view(-1)
             if jacobian_parameters:
                 jacobian_param[:, i] = torch.cat(self._flatten_tensors(d_param), 0)
@@ -3320,7 +3320,7 @@ class NNTestCase(TestCase):
         numerical_t = list(_iter_tensors(numerical))
 
         differences = []
-        for a, n in zip(analytical_t, numerical_t):
+        for a, n in zip(analytical_t, numerical_t, strict=True):
             if a.numel() != 0:
                 differences.append(a.add(n, alpha=-1).abs().max())
         # TODO: compare structure (ensure analytic jacobian has correct shape)
@@ -3528,7 +3528,7 @@ class ModuleTest(TestBase):
         gpu_module = self.constructor(*self.constructor_args).float().cuda()
         cpu_param = test_case._get_parameters(cpu_module)
         gpu_param = test_case._get_parameters(gpu_module)
-        for cpu_p, gpu_p in zip(cpu_param[0], gpu_param[0]):
+        for cpu_p, gpu_p in zip(cpu_param[0], gpu_param[0], strict=True):
             gpu_p.data.copy_(cpu_p)
 
         test_case._zero_grad_input(cpu_input_tuple)
@@ -3549,7 +3549,7 @@ class ModuleTest(TestBase):
         cpu_gradInput = test_case._backward(cpu_module, cpu_input_tuple, cpu_output, cpu_gradOutput)
         gpu_gradInput = test_case._backward(gpu_module, gpu_input_tuple, gpu_output, gpu_gradOutput)
         test_case.assertEqual(cpu_gradInput, gpu_gradInput, atol=self.precision, rtol=0, exact_dtype=False)
-        for cpu_d_p, gpu_d_p in zip(cpu_param[1], gpu_param[1]):
+        for cpu_d_p, gpu_d_p in zip(cpu_param[1], gpu_param[1], strict=True):
             test_case.assertEqual(cpu_d_p, gpu_d_p, atol=self.precision, rtol=0)
 
         # Run double-backwards on CPU and GPU and compare results
@@ -3575,7 +3575,7 @@ class ModuleTest(TestBase):
                                               gpu_gradOutput,
                                               create_graph=True)
 
-        for cpu_d_i, gpu_d_i in zip(cpu_gradInputs, gpu_gradInputs):
+        for cpu_d_i, gpu_d_i in zip(cpu_gradInputs, gpu_gradInputs, strict=True):
             test_case.assertEqual(cpu_d_i, gpu_d_i, atol=self.precision, rtol=0, exact_dtype=False)
 
         # We mix output into the second backwards computation so that
@@ -3598,7 +3598,7 @@ class ModuleTest(TestBase):
             gpu_input_tuple + (gpu_gradOutput,) + tuple(gpu_module.parameters()),
             retain_graph=True)
         test_case.assertEqual(cpu_gradInput, gpu_gradInput, atol=self.precision, rtol=0, exact_dtype=False)
-        for cpu_d_p, gpu_d_p in zip(cpu_gg, gpu_gg):
+        for cpu_d_p, gpu_d_p in zip(cpu_gg, gpu_gg, strict=True):
            test_case.assertEqual(cpu_d_p, gpu_d_p, atol=self.precision, rtol=0, exact_dtype=False)
 
         self.test_noncontig(test_case, gpu_module, gpu_input_tuple)
@@ -692,7 +692,7 @@ class parametrize(_TestParametrizer):
             return f"{name}{idx}"
 
     def _default_subtest_name(self, idx, values):
-        return '_'.join([self._formatted_str_repr(idx, a, v) for a, v in zip(self.arg_names, values)])
+        return '_'.join([self._formatted_str_repr(idx, a, v) for a, v in zip(self.arg_names, values, strict=True)])
 
     def _get_subtest_name(self, idx, values, explicit_name=None):
         if explicit_name:
@@ -736,7 +736,7 @@ class parametrize(_TestParametrizer):
                 raise RuntimeError(f'Expected # values == # arg names, but got: {len(values)} '
                                    f'values and {len(self.arg_names)} names for test "{test.__name__}"')
 
-            param_kwargs = dict(zip(self.arg_names, values))
+            param_kwargs = dict(zip(self.arg_names, values, strict=True))
 
             test_name = self._get_subtest_name(idx, values, explicit_name=maybe_name)
 
@@ -3696,7 +3696,7 @@ class TestCase(expecttest.TestCase):
             n_compressed_dims, n_plain_dims = size[-1 - dense_dims] // blocksize1, size[-2 - dense_dims] // blocksize0
             blocknnz = nnz // (blocksize0 * blocksize1)
             sparse_tensors = [random_sparse_compressed(n_compressed_dims, n_plain_dims, blocknnz) for _ in range(n_batch)]
-            sparse_tensors_it = map(list, zip(*sparse_tensors))
+            sparse_tensors_it = map(list, zip(*sparse_tensors, strict=True))
 
             values = torch.stack(next(sparse_tensors_it)).reshape(*batch_shape, blocknnz, *blocksize, *dense_size)
             compressed_indices = torch.stack(next(sparse_tensors_it)).reshape(*batch_shape, -1)
@@ -234,7 +234,7 @@ def generate_cct_and_mode(autograd_view_consistency=True):
                 # tensor results to be that of the tensors that alias the input
                 result = func(*args, **kwargs)
                 if isinstance(result, (tuple, list)):
-                    for a, b in zip(rs, result):
+                    for a, b in zip(rs, result, strict=True):
                         a.set_(b)
                 else:
                     rs.set_(result)
@@ -303,7 +303,7 @@ def generate_subclass_choices(flat_args, CCT, cct_mode):
     for which_args_are_wrapped in itertools.product(*subclass_options):
 
         result = [maybe_map(partial(wrap, CCT=CCT, cct_mode=cct_mode), should_wrap_arg, arg)
-                  for should_wrap_arg, arg in zip(which_args_are_wrapped, flat_args)]
+                  for should_wrap_arg, arg in zip(which_args_are_wrapped, flat_args, strict=True)]
         yield result, which_args_are_wrapped
 
 
@@ -539,11 +539,11 @@ def check_forward_ad_formula(op: Callable, args, kwargs, gradcheck_wrapper=None,
             return fwAD.make_dual(primal.detach(), tangent)
         elif is_tensorlist(primal):
             return tuple(fwAD.make_dual(pri.detach(), tang) if tang is not None else pri
-                         for pri, tang in zip(primal, tangent))
+                         for pri, tang in zip(primal, tangent, strict=True))
         return primal
 
     def compute_expected_grad(args, tangent_args, kwargs, tangent_kwargs):
-        op_args = tuple(map(maybe_make_dual, zip(args, tangent_args)))
+        op_args = tuple(map(maybe_make_dual, zip(args, tangent_args, strict=True)))
         op_kwargs = {k: maybe_make_dual((v, tangent_kwargs[k])) for k, v in kwargs.items()}
 
         if gradcheck_wrapper is None:
@@ -572,7 +572,7 @@ def check_forward_ad_formula(op: Callable, args, kwargs, gradcheck_wrapper=None,
         new_tang_args, new_tang_kwargs, \
             which_tang_args_are_wrapped, which_tang_kwargs_are_wrapped = tang_choice
 
-        op_args = tuple(map(maybe_make_dual, zip(new_args, new_tang_args)))
+        op_args = tuple(map(maybe_make_dual, zip(new_args, new_tang_args, strict=True)))
         op_kwargs = {k: maybe_make_dual((v, new_tang_kwargs[k])) for k, v in new_kwargs.items()}
 
         try:
@@ -144,7 +144,9 @@ class CustomTensorPlainOut(torch.Tensor):
         new_out = pytree.tree_unflatten(
             (
                 CustomTensorPlainOut(tensor1, tensor2)
-                for tensor1, tensor2 in zip(out_inner_flat_1, out_inner_flat_2)
+                for tensor1, tensor2 in zip(
+                    out_inner_flat_1, out_inner_flat_2, strict=True
+                )
             ),
             spec,
         )
@@ -60,7 +60,7 @@ class VerifyStateDictMixin:
         dist_osd: dict[str, Any],
     ) -> None:
         params = list(chain.from_iterable(g["params"] for g in optim.param_groups))
-        param_pid_mapping = dict(zip(params, range(len(params))))
+        param_pid_mapping = dict(zip(params, range(len(params)), strict=True))
         fqn_pid_mapping = {}
         for fqn, param in model.named_parameters():
             pid = param_pid_mapping[param]
@@ -90,7 +90,7 @@ class VerifyStateDictMixin:
             dist_osd[_PG] = [new_pg]
 
         self.assertEqual(len(osd[_PG]), len(dist_osd[_PG]))
-        for group, dist_group in zip(osd[_PG], dist_osd[_PG]):
+        for group, dist_group in zip(osd[_PG], dist_osd[_PG], strict=True):
             self.assertEqual(len(group), len(dist_group))
             for key, value in group.items():
                 # Below doesn't work because param_groups can have None
@@ -238,7 +238,9 @@ class Trainer:
         sparse_microbatch = torch.split(sparse_features, 2)
         values_microbatch = torch.split(values, 2)
         batches = []
-        for d, s, v in zip(dense_microbatch, sparse_microbatch, values_microbatch):
+        for d, s, v in zip(
+            dense_microbatch, sparse_microbatch, values_microbatch, strict=True
+        ):
             feature_set = FeatureSet(dense_features=d, sparse_features=s, values=v)
             batches.append(feature_set)
 
@@ -678,7 +678,7 @@ class DistributedTest:
             # Verify buffers across ranks.
             m1_buffers = list(m1.buffers())
             m2_buffers = list(m2.buffers())
-            for buf1, buf2 in zip(m1_buffers, m2_buffers):
+            for buf1, buf2 in zip(m1_buffers, m2_buffers, strict=True):
                 gathered_bufs = [
                     torch.empty_like(buf1) for _ in range(dist.get_world_size())
                 ]
@@ -3045,7 +3045,7 @@ class DistributedTest:
             curr_values = master_values if rank == src else worker_values
             tensors = [
                 _build_tensor(src + 1, val, dtype=dtype)
-                for dtype, val in zip(dtypes, curr_values)
+                for dtype, val in zip(dtypes, curr_values, strict=True)
             ]
             if cuda:
                 tensors = [t.cuda(rank_to_GPU[rank][0]) for t in tensors]
@@ -3066,7 +3066,9 @@ class DistributedTest:
             )
             expected_tensors = [
                 _build_tensor(src + 1, expected_value, dtype=dtype)
-                for dtype, expected_value in zip(dtypes, expected_values)
+                for dtype, expected_value in zip(
+                    dtypes, expected_values, strict=True
+                )
             ]
             self.assertEqual(tensors, expected_tensors)
 
@@ -3338,7 +3340,7 @@ class DistributedTest:
             )
             if rank == dest:
                 expected_tensors = [_build_tensor(dest + 1, i) for i in group]
-                for t1, t2 in zip(tensors, expected_tensors):
+                for t1, t2 in zip(tensors, expected_tensors, strict=True):
                     self.assertEqual(t1, t2)
 
             self._barrier()
@@ -3440,7 +3442,7 @@ class DistributedTest:
                 expected_tensors = [
                     _build_tensor(dest + 1, i, dtype=dtype) for i in group
                 ]
-                for t1, t2 in zip(tensors, expected_tensors):
+                for t1, t2 in zip(tensors, expected_tensors, strict=True):
                     self.assertEqual(t1, t2)
 
             self._barrier()
@@ -3624,8 +3626,8 @@ class DistributedTest:
                 tensor_shapes=tensor_shapes,
             )
 
-            for l1, l2 in zip(output_tensor_lists, expected_tensors):
-                for t1, t2 in zip(l1, l2):
+            for l1, l2 in zip(output_tensor_lists, expected_tensors, strict=True):
+                for t1, t2 in zip(l1, l2, strict=True):
                     if not torch.equal(t1, t2):
                         return False
             return True
@@ -3824,7 +3826,7 @@ class DistributedTest:
             ]
             out_tensors = [t.cuda(rank_to_GPU[rank][0]) for t in out_tensors]
             dist.all_to_all(out_tensors, in_tensors, group=group_id)
-            for t1, t2 in zip(out_tensors, expected_tensors):
+            for t1, t2 in zip(out_tensors, expected_tensors, strict=True):
                 self.assertEqual(t1, t2)
             self._barrier()
 
@@ -4203,7 +4205,7 @@ class DistributedTest:
 
         def _assert_equal_param(self, param_gpu, param_DDP):
            self.assertEqual(len(param_gpu), len(param_DDP))
-            for p_gpu, p_DDP in zip(param_gpu, param_DDP):
+            for p_gpu, p_DDP in zip(param_gpu, param_DDP, strict=True):
                 self.assertEqual(p_gpu, p_DDP)
 
         def _test_DDP_niter(
@@ -4618,6 +4620,7 @@ class DistributedTest:
             for hook_param, allreduce_param in zip(
                 ddp_model_with_optimizer_hook.parameters(),
                 ddp_model_with_no_hook.parameters(),
+                strict=True,
             ):
                 self.assertEqual(hook_param, allreduce_param)
 
@@ -4649,6 +4652,7 @@ class DistributedTest:
             for hook_param, allreduce_param in zip(
                 ddp_model_with_optimizer_hook.parameters(),
                 ddp_model_with_no_hook.parameters(),
+                strict=True,
             ):
                 self.assertEqual(hook_param, allreduce_param)
 
@@ -4825,7 +4829,9 @@ class DistributedTest:
                 optimizer_kwargs=optim_kwargs,
             )
 
-            for p1, p2 in zip(model.parameters(), model_optim_in_bwd.parameters()):
+            for p1, p2 in zip(
+                model.parameters(), model_optim_in_bwd.parameters(), strict=True
+            ):
                 self.assertEqual(p1, p2, "Parameters not initially equal!")
             # Enable determinism in cudnn operators
             with torch.backends.cudnn.flags(
@@ -4843,7 +4849,9 @@ class DistributedTest:
                     inp
                 ).sum().backward()  # runs optimizer as well
                 for p1, p2 in zip(
-                    model.parameters(), model_optim_in_bwd.parameters()
+                    model.parameters(),
+                    model_optim_in_bwd.parameters(),
+                    strict=True,
                 ):
                     self.assertEqual(
                         p1, p2, f"Params not equal at iteration {i}"
@@ -5323,7 +5331,9 @@ class DistributedTest:
                 # sync grads
                 step_model(ddp_model, ddp_input, ddp_target)
 
-                for i, j in zip(model.parameters(), ddp_model.parameters()):
+                for i, j in zip(
+                    model.parameters(), ddp_model.parameters(), strict=True
+                ):
                     if not i.requires_grad:
                         continue
                     if iteration % 2 == 0:
@@ -5562,6 +5572,7 @@ class DistributedTest:
             for i, j in zip(
                 ddp_model_grad_not_view.parameters(),
                 ddp_model_grad_is_view.parameters(),
+                strict=True,
             ):
                 self.assertEqual(i, j)
 
@@ -5667,7 +5678,9 @@ class DistributedTest:
                     target,
                 )
                 for p1, p2 in zip(
-                    net.parameters(), net_using_post_localSGD_opt.parameters()
+                    net.parameters(),
+                    net_using_post_localSGD_opt.parameters(),
+                    strict=True,
                 ):
                     self.assertEqual(p1.data, p2.data)
 
@@ -6817,7 +6830,7 @@ class DistributedTest:
             # they are the same as new_model on rank_to_broadcast.
             if rank == rank_to_broadcast:
                 expected_states = new_model.state_dict().values()
-                for t, expected in zip(net_module_states, expected_states):
+                for t, expected in zip(net_module_states, expected_states, strict=True):
                     self.assertEqual(t, expected)
 
         @skip_if_lt_x_gpu(2)
@@ -7134,7 +7147,9 @@ class DistributedTest:
 
             # Validate model state dicts are equal
             for (_, local_tensor), (_, dist_tensor) in zip(
-                local_model.state_dict().items(), net.module.state_dict().items()
+                local_model.state_dict().items(),
+                net.module.state_dict().items(),
+                strict=True,
             ):
                 self.assertEqual(local_tensor, dist_tensor)
 
@@ -7722,13 +7737,17 @@ class DistributedTest:
             # materialized param grad is not touched by DDP, so its grad should
             # be the same as if running locally.
             for materialized_param, local_param in zip(
-                ddp.module.fc2.parameters(), local_model.fc2.parameters()
+                ddp.module.fc2.parameters(),
+                local_model.fc2.parameters(),
+                strict=True,
             ):
                 self.assertEqual(materialized_param.grad, local_param.grad)
 
             # fc1 parameter grad should still be different, due to allreduce.
             for synced_param, local_param in zip(
-                ddp.module.fc1.parameters(), local_model.fc1.parameters()
+                ddp.module.fc1.parameters(),
+                local_model.fc1.parameters(),
+                strict=True,
             ):
                 self.assertFalse(synced_param.grad == local_param.grad)
 
@@ -8581,7 +8600,7 @@ class DistributedTest:
 
             # Verify grads are the same
             for local_param, dist_param in zip(
-                local_net.parameters(), net.parameters()
+                local_net.parameters(), net.parameters(), strict=True
             ):
                 local_grad = local_param.grad
                 dist_grad = dist_param.grad
@@ -8631,7 +8650,7 @@ class DistributedTest:
             torch._C._functions.UndefinedGrad()(out).backward()
             torch._C._functions.UndefinedGrad()(local_out).backward()
             for (dist_param_name, dist_param), (local_param_name, local_param) in zip(
-                net.named_parameters(), local_net.named_parameters()
+                net.named_parameters(), local_net.named_parameters(), strict=True
             ):
                 dist_grad = dist_param.grad
                 local_grad = local_param.grad
@@ -8689,7 +8708,9 @@ class DistributedTest:
             self.assertTrue(
                 static_model._get_ddp_logging_data().get("has_rebuilt_buckets", 0)
             )
-            for i, j in zip(base_model.parameters(), static_model.parameters()):
+            for i, j in zip(
+                base_model.parameters(), static_model.parameters(), strict=True
+            ):
                 self.assertEqual(i, j)
 
         @require_backend_is_available({"gloo"})
@@ -9297,7 +9318,7 @@ class DistributedTest:
                 loss_static.backward()
                 self._model_step(model_static_graph)
                 for p, p_static in zip(
-                    model.parameters(), model_static_graph.parameters()
+                    model.parameters(), model_static_graph.parameters(), strict=True
                 ):
                     self.assertEqual(p, p_static)
 
@@ -9974,7 +9995,7 @@ class DistributedTest:
                     p.grad.data = p.grad / iters
 
             for p_ddp, p_local in zip(
-                model.parameters(), local_model.parameters()
+                model.parameters(), local_model.parameters(), strict=True
             ):
                 self.assertTrue(
                     torch.allclose(p_ddp.grad, p_local.grad),
@@ -10191,7 +10212,9 @@ class DistributedTest:
             # (refer to https://github.com/numpy/numpy/blob/266aad7478bc7fbcc55eea7f942a0d373b838396/numpy/random/mtrand.pyi)
             # To make sure random state was restored properly, all entries should equal the original
             for entry1, entry2 in zip(
-                hook_state.rng.get_state(), dummy_hook_state.rng.get_state()
+                hook_state.rng.get_state(),
+                dummy_hook_state.rng.get_state(),
+                strict=True,
             ):
                 np.testing.assert_array_equal(entry1, entry2)
 
@@ -10212,7 +10235,7 @@ class DistributedTest:
 
            # Check that gradients after 10 epochs are the same
            for orig_param, dummy_param in zip(
-                ddp_model.parameters(), dummy_ddp_model.parameters()
+                ddp_model.parameters(), dummy_ddp_model.parameters(), strict=True
            ):
                self.assertEqual(orig_param.grad, dummy_param.grad)
 
@@ -10299,7 +10322,9 @@ class DistributedTest:
             self.assertEqual(out_ddp, out_ddp_static)
             out_ddp.backward()
             out_ddp_static.backward()
-            for p1, p2 in zip(ddp.parameters(), ddp_static.parameters()):
+            for p1, p2 in zip(
+                ddp.parameters(), ddp_static.parameters(), strict=True
+            ):
                 self.assertEqual(p1.grad, p2.grad)
 
         @skip_if_lt_x_gpu(2)
@@ -10392,7 +10417,9 @@ class DistributedTest:
                 test_model_1._get_ddp_logging_data().get("num_buckets_reduced"), 1
             )
 
-            for i, j in zip(base_model.parameters(), test_model_1.parameters()):
+            for i, j in zip(
+                base_model.parameters(), test_model_1.parameters(), strict=True
+            ):
                 self.assertEqual(i, j)
 
@@ -457,7 +457,9 @@ class ProcessLocalGroup(dist.ProcessGroup):
     ):
         works = [
             self._reduce_scatter_base(output_tensor, input_tensor, opts)
-            for output_tensor, input_tensor in zip(output_tensors, input_tensors)
+            for output_tensor, input_tensor in zip(
+                output_tensors, input_tensors, strict=True
+            )
         ]
         for work in works[:-1]:
             work.wait()
@@ -467,7 +469,7 @@ class ProcessLocalGroup(dist.ProcessGroup):
         self, output_tensor_list, input_tensor_list, opts=AllgatherOptions()
     ):
         res = None
-        for o_t, i_t in zip(output_tensor_list, input_tensor_list):
+        for o_t, i_t in zip(output_tensor_list, input_tensor_list, strict=True):
             res = self._allgather_base(o_t, i_t)
         return res
 
@@ -2749,7 +2749,7 @@ class TensorPipeCudaDistAutogradTest(RpcAgentTestFixture):
 
         for i in range(len(futs)):
             local_gradients = [p.grad for p in local_layers[i].parameters()]
-            for g1, g2 in zip(futs[i].wait(), local_gradients):
+            for g1, g2 in zip(futs[i].wait(), local_gradients, strict=True):
                 self.assertEqual(g1, g2)
 
         rpc.shutdown()
@@ -46,7 +46,7 @@ class BatchUpdateParameterServer:
     @rpc.functions.async_execution
     def update_and_fetch_model(ps_rref, grads):
         self = ps_rref.local_value()
-        for p, g in zip(self.model.parameters(), grads):
+        for p, g in zip(self.model.parameters(), grads, strict=True):
             if p.grad is None:
                 p.grad = g
             else:
@@ -216,7 +216,7 @@ class Agent:
             returns.insert(0, R)
         returns = torch.tensor(returns)
         returns = (returns - returns.mean()) / (returns.std() + self.eps)
-        for log_prob, R in zip(probs, returns):
+        for log_prob, R in zip(probs, returns, strict=True):
             policy_loss.append(-log_prob * R)
         self.optimizer.zero_grad()
         policy_loss = torch.cat(policy_loss).sum()
@@ -249,7 +249,7 @@ class JitTestCase(JitCommonTestCase):
             saved_module_buffer_2.seek(0)
             code_files_2, _debug_files_2 = extract_files(saved_module_buffer_2)
 
-            for a, b in zip(code_files, code_files_2):
+            for a, b in zip(code_files, code_files_2, strict=True):
                 self.assertMultiLineEqual(a, b)
 
             if isinstance(m, torch._C.ScriptModule):
@@ -617,7 +617,7 @@ class JitTestCase(JitCommonTestCase):
             self.assertEqual(outputs, outputs_ge)
             if inputs_require_grads:
                 self.assertEqual(grads, grads_ge, atol=grad_atol, rtol=grad_rtol)
-                for g2, g2_ge in zip(grads2, grads2_ge):
+                for g2, g2_ge in zip(grads2, grads2_ge, strict=True):
                     if g2 is None and g2_ge is None:
                         continue
                     self.assertEqual(g2, g2_ge, atol=8e-4, rtol=8e-4)
@@ -228,11 +228,11 @@ def multiple_logs_to_string(module: str, *log_options: str) -> tuple[list[io.Str
     def tmp_redirect_logs():
         loggers = [torch._logging.getArtifactLogger(module, option) for option in log_options]
         try:
-            for logger, handler in zip(loggers, handlers):
+            for logger, handler in zip(loggers, handlers, strict=True):
                 logger.addHandler(handler)
             yield
         finally:
-            for logger, handler in zip(loggers, handlers):
+            for logger, handler in zip(loggers, handlers, strict=True):
                 logger.removeHandler(handler)
 
     def ctx_manager() -> AbstractContextManager[None]:
@@ -402,9 +402,9 @@ def sample_inputs_masked_logaddexp(op_info, device, dtype, requires_grad, **kwar
         make_tensor, dtype=dtype, device=device, requires_grad=requires_grad
     )
     for shape, input_masks, other_masks in zip(
-        shapes, input_mask_lists, other_mask_lists
+        shapes, input_mask_lists, other_mask_lists, strict=True
     ):
-        for input_mask, other_mask in zip(input_masks, other_masks):
+        for input_mask, other_mask in zip(input_masks, other_masks, strict=True):
             yield SampleInput(
                 make_arg(shape),
                 make_arg(shape),
@@ -78,7 +78,7 @@ class TwoTensor(torch.Tensor):
         # our two inner tensors return the same value
         out_flat = [
             cls(o_a, o_b) if isinstance(o_a, torch.Tensor) else o_a
-            for o_a, o_b in zip(out_a_flat, out_b_flat)
+            for o_a, o_b in zip(out_a_flat, out_b_flat, strict=True)
         ]
         out = pytree.tree_unflatten(out_flat, spec)
         from torch._higher_order_ops.cond import cond_op