From 7f14b42adf70196d82340c59a9981ffcadf0c53c Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Sat, 12 Jul 2025 13:12:13 +0800 Subject: [PATCH] [BE][2/16] fix typos in torch/ (torch/_*/) (#156312) Pull Request resolved: https://github.com/pytorch/pytorch/pull/156312 Approved by: https://github.com/albanD --- .lintrunner.toml | 1 - test/cpp/jit/test_custom_class_registrations.cpp | 2 +- test/export/test_export.py | 6 +++--- test/test_fx.py | 2 +- test/test_unary_ufuncs.py | 2 +- tools/linter/dictionary.txt | 1 + torch/_decomp/decompositions.py | 6 +++--- torch/_dynamo/convert_frame.py | 2 +- torch/_dynamo/output_graph.py | 2 +- torch/_dynamo/variables/builder.py | 4 ++-- torch/_export/converter.py | 12 ++++++------ torch/_export/passes/lift_constants_pass.py | 2 +- .../_export/passes/replace_autocast_with_hop_pass.py | 4 ++-- .../replace_quantized_ops_with_standard_ops_pass.py | 4 ++-- torch/_export/passes/replace_with_hop_pass_util.py | 4 ++-- torch/_export/serde/schema.py | 4 ++-- torch/_export/serde/schema_check.py | 2 +- torch/_export/serde/serialize.py | 4 ++-- torch/_export/serde/union.py | 2 +- torch/_export/utils.py | 2 +- torch/_functorch/_aot_autograd/autograd_cache.py | 2 +- .../_aot_autograd/collect_metadata_analysis.py | 8 ++++---- torch/_functorch/_aot_autograd/functional_utils.py | 2 +- .../_aot_autograd/jit_compile_runtime_wrappers.py | 4 ++-- torch/_functorch/_aot_autograd/runtime_wrappers.py | 6 +++--- torch/_functorch/_aot_autograd/schemas.py | 4 ++-- torch/_functorch/_aot_autograd/subclass_utils.py | 4 ++-- torch/_functorch/aot_autograd.py | 10 +++++----- torch/_functorch/compile_utils.py | 2 +- torch/_functorch/compilers.py | 4 ++-- torch/_functorch/config.py | 2 +- torch/_functorch/partitioners.py | 10 +++++----- torch/_higher_order_ops/auto_functionalize.py | 4 ++-- torch/_higher_order_ops/base_hop.py | 2 +- torch/_higher_order_ops/cond.py | 6 +++--- torch/_higher_order_ops/flat_apply.py | 2 +- torch/_higher_order_ops/invoke_subgraph.py | 6 +++--- torch/_higher_order_ops/map.py | 2 +- torch/_higher_order_ops/scan.py | 8 ++++---- torch/_higher_order_ops/schema.py | 2 +- torch/_higher_order_ops/torchbind.py | 2 +- torch/_higher_order_ops/triton_kernel_wrap.py | 2 +- torch/_higher_order_ops/utils.py | 2 +- torch/_higher_order_ops/while_loop.py | 8 ++++---- torch/_inductor/analysis/README.md | 4 ++-- torch/_inductor/analysis/profile_analysis.py | 4 ++-- torch/_inductor/codecache.py | 2 +- torch/_inductor/codegen/triton.py | 2 +- torch/_inductor/comms.py | 2 +- torch/_inductor/config.py | 2 +- torch/_inductor/runtime/triton_heuristics.py | 2 +- torch/_lazy/extract_compiled_graph.py | 8 ++++---- torch/_lazy/metrics.py | 2 +- torch/_library/fake_class_registry.py | 8 ++++---- torch/_library/fake_profile.py | 2 +- torch/_library/utils.py | 4 ++-- torch/_numpy/_dtypes_impl.py | 2 +- torch/_numpy/_funcs_impl.py | 4 ++-- torch/_numpy/_util.py | 2 +- torch/_prims/__init__.py | 2 +- torch/_prims_common/wrappers.py | 2 +- torch/_refs/__init__.py | 2 +- torch/_refs/nn/functional/__init__.py | 2 +- torch/_strobelight/cli_function_profiler.py | 2 +- torch/_strobelight/compile_time_profiler.py | 2 +- torch/_subclasses/fake_impls.py | 2 +- torch/_subclasses/fake_tensor.py | 2 +- torch/_subclasses/functional_tensor.py | 4 ++-- torch/_subclasses/meta_utils.py | 2 +- torch/csrc/distributed/rpc/rref_context.cpp | 2 +- 70 files changed, 123 insertions(+), 123 deletions(-) diff --git a/.lintrunner.toml b/.lintrunner.toml index ef141907068c..7e9b7ebd5d2c 100644 --- a/.lintrunner.toml +++ 
b/.lintrunner.toml @@ -1169,7 +1169,6 @@ exclude_patterns = [ 'aten/src/ATen/[a-mA-M]*/**', 'test/**', 'test/[a-hA-h]*/**', - 'torch/_*/**', 'torch/distributed/tensor/**', ] init_command = [ diff --git a/test/cpp/jit/test_custom_class_registrations.cpp b/test/cpp/jit/test_custom_class_registrations.cpp index 9aa46459c869..698c967df29f 100644 --- a/test/cpp/jit/test_custom_class_registrations.cpp +++ b/test/cpp/jit/test_custom_class_registrations.cpp @@ -376,7 +376,7 @@ struct ElementwiseInterpreter : torch::CustomClassHolder { // for more info. // This is the type we will use to marshall information on disk during - // ser/de. It is a simple tuple composed of primitive types and simple + // Ser/De. It is a simple tuple composed of primitive types and simple // collection types like vector, optional, and dict. using SerializationType = std::tuple< std::vector /*input_names_*/, diff --git a/test/export/test_export.py b/test/export/test_export.py index c2083d6c02f3..2ded21ec87e0 100755 --- a/test/export/test_export.py +++ b/test/export/test_export.py @@ -487,7 +487,7 @@ class TestExport(TestCase): eps = [ep] if test_serdes: # test dynamic shapes serialization - # test that behavior remains the same when exporting with ser/des specs: + # test that behavior remains the same when exporting with Ser/Des specs: # serialize + deserialize original specs, and export. ep_serdes = export( model, @@ -927,7 +927,7 @@ graph(): ep = export(f, args, strict=False) self.assertEqual(ep.module()(*args), f(*args)) - @testing.expectedFailureCppSerDes # Cpp serder seems to fail parsing complicated guards + @testing.expectedFailureCppSerDes # Cpp Ser/Der seems to fail parsing complicated guards def test_export_statically_known_true(self): class Foo(torch.nn.Module): def forward(self, x, y): @@ -5011,7 +5011,7 @@ def forward(self, p_linear_weight, p_linear_bias, b_buffer, x): # There should be nonzero view nodes in the graph self.assertTrue(view_count > 0) - @testing.expectedFailureCppSerDes # cpp ser/der not handling complicated symbols + @testing.expectedFailureCppSerDes # cpp Ser/Der not handling complicated symbols def test_solver_unsupported_sympy_function(self): # repro of https://github.com/pytorch/pytorch/issues/131897 diff --git a/test/test_fx.py b/test/test_fx.py index 19836147495f..55e98df70248 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -954,7 +954,7 @@ class TestFX(JitTestCase): script_out = scripted_lowered(x) torch.testing.assert_close(script_out, ref_out) - # Test TorchScript ser/de + # Test TorchScript Ser/De import_copy = self.getExportImportCopy(scripted_lowered) imported_out = import_copy(x) torch.testing.assert_close(imported_out, ref_out) diff --git a/test/test_unary_ufuncs.py b/test/test_unary_ufuncs.py index 855bbfd7f251..d7d9a2b1aab6 100644 --- a/test/test_unary_ufuncs.py +++ b/test/test_unary_ufuncs.py @@ -1104,7 +1104,7 @@ class TestUnaryUfuncs(TestCase): self.assertEqual(res.real, out.real, atol=atol, rtol=rtol) self.assertEqual(res.imag, out.imag, atol=atol, rtol=rtol) - # It is not obvious how to merge this into OpInfo becuase these inputs + # It is not obvious how to merge this into OpInfo because these inputs # succeed for gradcheck but are expected to fail for gradgradcheck @dtypes(torch.double) def test_sinc(self, device, dtype): diff --git a/tools/linter/dictionary.txt b/tools/linter/dictionary.txt index c9c5d7d707d3..49ae353c7d02 100644 --- a/tools/linter/dictionary.txt +++ b/tools/linter/dictionary.txt @@ -36,6 +36,7 @@ rebuilt reenable reenabled requestor +ser serde 
serder serdes diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py index 0ff7e46f839b..f93a0bf84fb4 100644 --- a/torch/_decomp/decompositions.py +++ b/torch/_decomp/decompositions.py @@ -52,7 +52,7 @@ class Reduction(Enum): # This wraps a decomposition and performs various type promotion logic within it, depending on the strategy provided -# We're currently re-using ELEMENTWISE_TYPE_PROMOTION_KIND, although some of the usages are on non-elementwise ops +# We're currently reusing ELEMENTWISE_TYPE_PROMOTION_KIND, although some of the usages are on non-elementwise ops # Will need to validate the non-elementwise uses def type_casts( f: Callable, @@ -947,7 +947,7 @@ def im2col( ) torch._check( all(c > 0 for c in output_size), - lambda: f"Given an input with spacial size {tuple(shape[-2:])}, " + lambda: f"Given an input with spatial size {tuple(shape[-2:])}, " f"kernel_size={kernel_size}, dilation={dilation}, " f"padding={padding}, stride={stride}, " "the calculated shape of the array of sliding blocks " @@ -4046,7 +4046,7 @@ def nll_loss2d_forward( return _nll_loss_forward(self, target, weight, reduction, ignore_index) -# These are adapted from aten/src/ATen/native/UpSample.h, wich is based on +# These are adapted from aten/src/ATen/native/UpSample.h, which is based on # https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm def _upsample_cubic_convolution1(x: Tensor, A: float) -> Tensor: return ((A + 2) * x - (A + 3)) * x * x + 1 diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py index bcc334277269..fe547691add6 100644 --- a/torch/_dynamo/convert_frame.py +++ b/torch/_dynamo/convert_frame.py @@ -1063,7 +1063,7 @@ def _compile( return f"'{code.co_name}' ({code.co_filename}:{code.co_firstlineno})" # NS: Don't add period at the end of string, as it'll be added to URL - # renderring it incorrect + # rendering it incorrect log.warning( "torch._dynamo hit config.%s (%s)\n" " function: %s\n" diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py index 61a1447fdd8d..bcbb9dd37c54 100644 --- a/torch/_dynamo/output_graph.py +++ b/torch/_dynamo/output_graph.py @@ -347,7 +347,7 @@ class StackLocalsMetadata: def get_builtins_dict(global_scope): # f_globals["__builtins__"] can be a dict or a module. This is an - # implemenation detail - + # implementation detail - # https://docs.python.org/3/library/builtins.html. # This makes guarding on any builtin messy because the guard check_fn diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py index 5c0116c9c40a..8ae5a4bd6cee 100644 --- a/torch/_dynamo/variables/builder.py +++ b/torch/_dynamo/variables/builder.py @@ -1662,13 +1662,13 @@ class VariableBuilder: # <==> variable tracker" 1-to-1 mapping, which is mainly handled via # `side_effects`. Note that constructing `tensor_variable` above # already adds it to graph arg, but we never registered it with - # `side_effects`. The pre-emptive `realize` calls here basically + # `side_effects`. The preemptive `realize` calls here basically # does that registration (at the end of `self.__call__`). 
# # A slightly cleaner alternative is to register the # `tensor_variable`s above with `side_effects` directly, and just # return the `list_variable`, but that breaks some tensor-subclass - # releated tests like `test_inputs_aliasing_bytecode_stack_restore`, + # related tests like `test_inputs_aliasing_bytecode_stack_restore`, # because `tensor_variable` is constructed via # `handle_traced_output`, which doesn't really expect/handle tensor # subclass. diff --git a/torch/_export/converter.py b/torch/_export/converter.py index bf0cad5a310a..bba7c2d16aa6 100644 --- a/torch/_export/converter.py +++ b/torch/_export/converter.py @@ -134,7 +134,7 @@ def execute_subgraph_from_prim_loop( ): """ subgraph: GraphModule from sub-block. - iter_idx: The index of interation. + iter_idx: The index of iteration. len_loop_local_arguments: The number of loop local arguments in args. """ @@ -810,7 +810,7 @@ class TS2FXGraphConverter: fx_node = self.fx_graph.call_function(target, args, kwargs) - # TODO: covnert sourceRange() into stack_trace + # TODO: convert sourceRange() into stack_trace # fx_node.meta["stack_trace"] = node.sourceRange() if node.outputsSize() == 1: @@ -883,7 +883,7 @@ class TS2FXGraphConverter: torch.ops.aten._local_scalar_dense.default, (to_copy_node,) ) - # TODO: covnert sourceRange() into stack_trace + # TODO: convert sourceRange() into stack_trace # fx_node.meta["stack_trace"] = node.sourceRange() output_name = node.output().debugName() @@ -942,7 +942,7 @@ class TS2FXGraphConverter: kwargs, ) - # TODO: covnert sourceRange() into stack_trace + # TODO: convert sourceRange() into stack_trace # fx_node.meta["stack_trace"] = node.sourceRange() output_name = node.output().debugName() @@ -1006,7 +1006,7 @@ class TS2FXGraphConverter: ): target = torch.ops.aten.add.t else: - raise RuntimeError(f"unable to determind the target for {node}") + raise RuntimeError(f"unable to determine the target for {node}") else: target = get_op_overload(node) @@ -1565,7 +1565,7 @@ DEBUG: (TORCH_LOGS="+export" ), additionally # # This function should happen in TS2EPConverter instead of # TS2FXGraphConverter since it gets attributes from self.ts_model - # which is not accessable in TS2FXGraphConverter. It is similar to where + # which is not accessible in TS2FXGraphConverter. It is similar to where # we collect self.name_to_param and self.name_to_buffer. name_to_attribute_fqn: dict[str, str] = {} diff --git a/torch/_export/passes/lift_constants_pass.py b/torch/_export/passes/lift_constants_pass.py index 4f235c88e2a1..20253a91c258 100644 --- a/torch/_export/passes/lift_constants_pass.py +++ b/torch/_export/passes/lift_constants_pass.py @@ -165,7 +165,7 @@ def lift_constants_pass( constant_attrs: ConstantAttrMap, ) -> dict[str, _ConstantAttributeType]: """ - Takes a graph module, graph signature, and modifies them implace to lift any + Takes a graph module, graph signature, and modifies them inplace to lift any constants (tensors or custom classes) as inputs to the graph. Returns a dictionary of names to constants.
diff --git a/torch/_export/passes/replace_autocast_with_hop_pass.py b/torch/_export/passes/replace_autocast_with_hop_pass.py index 9d415c4a0891..71b90a3ff1bf 100644 --- a/torch/_export/passes/replace_autocast_with_hop_pass.py +++ b/torch/_export/passes/replace_autocast_with_hop_pass.py @@ -100,8 +100,8 @@ def _split_autocast(gm: torch.fx.GraphModule) -> torch.fx.GraphModule: split_autocast creates a new graph module that splits the input graph module into multiple submodules based on the `_enter_autocast` and `_exit_autocast` nodes. It doesn't mutate the input graph module. - Nodes between the **outer-most** `_enter_autocast` and `_exit_autocast(_enter_autocast)` are splitted - into a submodule. Nested autocast regions are not splitted. + Nodes between the **outer-most** `_enter_autocast` and `_exit_autocast(_enter_autocast)` are split + into a submodule. Nested autocast regions are not split. `_enter_autocast` and `_exit_autocast(_enter_autocast)` nodes are in the submodule as well. Below is an example of splitting. A, B, C, D, E are blocks of non-autocast nodes in the original graph diff --git a/torch/_export/passes/replace_quantized_ops_with_standard_ops_pass.py b/torch/_export/passes/replace_quantized_ops_with_standard_ops_pass.py index afa40d200620..4d9187680101 100644 --- a/torch/_export/passes/replace_quantized_ops_with_standard_ops_pass.py +++ b/torch/_export/passes/replace_quantized_ops_with_standard_ops_pass.py @@ -292,7 +292,7 @@ def _conv1d_op_with_squeeze( def _transform_conv_with_packedparam(gm: torch.fx.GraphModule, node: torch.fx.Node): - """Conv specfic transformation function.""" + """Conv specific transformation function.""" assert isinstance(node.target, torch._ops.OpOverload) opname = node.target._opname scale_node, zero_point_node = node.args[2], node.args[3] @@ -347,7 +347,7 @@ def _transform_conv_with_packedparam(gm: torch.fx.GraphModule, node: torch.fx.No def _transform_linear_with_packedparam(gm: torch.fx.GraphModule, node: torch.fx.Node): - """Linear specfic transformation function.""" + """Linear specific transformation function.""" scale_node, zero_point_node = node.args[2], node.args[3] inp_node, param_node = node.args[0], node.args[1] diff --git a/torch/_export/passes/replace_with_hop_pass_util.py b/torch/_export/passes/replace_with_hop_pass_util.py index bab39863ddab..974058092448 100644 --- a/torch/_export/passes/replace_with_hop_pass_util.py +++ b/torch/_export/passes/replace_with_hop_pass_util.py @@ -46,7 +46,7 @@ def _replace_with_hop_helper( enter_block_node.meta.get("nn_module_stack", {}) ) output_node = next(iter(reversed(sub_gm.graph.nodes)), None) - # Split_module pass intentially doesn't add output node + # Split_module pass intentionally doesn't add output node # if the graph doesn't return anything. 
# TODO (tmanlaibaatar) Figure out if this is right behaviour # for split_module @@ -97,7 +97,7 @@ def _replace_with_hop_helper( node_replace_(node, get_item_node) else: raise NotImplementedError( - f"repalce_with_hop_pass doesnt' support output type {type(output_args)}" + f"replace_with_hop_pass doesn't support output type {type(output_args)}" ) else: # TODO (shangdiy): remove this line, since the export graph can be non-functional diff --git a/torch/_export/serde/schema.py b/torch/_export/serde/schema.py index ef1ec3ee4981..32c69140807b 100644 --- a/torch/_export/serde/schema.py +++ b/torch/_export/serde/schema.py @@ -382,7 +382,7 @@ class ModuleCallSignature: out_spec: Annotated[str, 40] # This field is used to prettify the graph placeholders - # after we ser/der and retrace + # after we Ser/Der and retrace forward_arg_names: Annotated[Optional[list[str]], 50] = None @@ -413,7 +413,7 @@ class GraphModule: # Invariant: Every time a change is made to the schema, one of the versions -# should be upadted. +# should be updated. @dataclass class SchemaVersion: major: Annotated[ diff --git a/torch/_export/serde/schema_check.py b/torch/_export/serde/schema_check.py index 0c6c57c648bd..ccc963397530 100644 --- a/torch/_export/serde/schema_check.py +++ b/torch/_export/serde/schema_check.py @@ -689,7 +689,7 @@ def check(commit: _Commit, force_unsafe: bool = False): for f, d in fields.items(): if kind == "struct" and "default" not in d: reason += ( - f"Field {k}.{f} is added to schema.py without a default value as an incomparible change " + f"Field {k}.{f} is added to schema.py without a default value as an incompatible change " + "which requires major version bump.\n" ) next_version = [commit.base["SCHEMA_VERSION"][0] + 1, 1] diff --git a/torch/_export/serde/serialize.py b/torch/_export/serde/serialize.py index f5970c904351..5c688b2a14d2 100644 --- a/torch/_export/serde/serialize.py +++ b/torch/_export/serde/serialize.py @@ -1408,7 +1408,7 @@ class GraphModuleSerializer(metaclass=Final): assert isinstance( return_schema.real_type, (torch.OptionalType, torch.TensorType) ) - # When the return type is annoated as Tensor type, the op can also return an + # When the return type is annotated as Tensor type, the op can also return an # undefined Tensor which will be implicitly converted to None in Python. output_arguments.append(Argument.create(as_none=True)) elif isinstance(meta, FakeTensor): @@ -2057,7 +2057,7 @@ class GraphModuleDeserializer(metaclass=Final): _additional_msg = ( ( f"We failed to resolve {target} to an operator. " - + "If it's a custom op/custom triton op, this is usally because the custom op is not registered" + + "If it's a custom op/custom triton op, this is usually because the custom op is not registered" + " when deserializing. Please import the custom op to register it before deserializing." + " Otherwise, please file an issue on github." ) diff --git a/torch/_export/serde/union.py b/torch/_export/serde/union.py index e0ca90dbad1a..c65ad38d337f 100644 --- a/torch/_export/serde/union.py +++ b/torch/_export/serde/union.py @@ -41,7 +41,7 @@ def _get_field_names(cls) -> set[str]: # this decorator to configure it. It's safe, faster and allows code sharing. # # For example, _union_dataclass customizes the __eq__ method to only check the type -# and value property instead of default implmentation of dataclass which goes +# and value property instead of default implementation of dataclass which goes # through every field in the dataclass. 
@dataclass_transform(eq_default=False) def _union_dataclass(cls: type[T]) -> type[T]: diff --git a/torch/_export/utils.py b/torch/_export/utils.py index 3117d7322340..1e2f84e5a3bd 100644 --- a/torch/_export/utils.py +++ b/torch/_export/utils.py @@ -1269,7 +1269,7 @@ def _collect_all_valid_cia_ops() -> set["OperatorBase"]: def _get_decomp_for_cia(op: "OperatorBase"): - # [NOTE] Seperating out func.decompose + # [NOTE] Separating out func.decompose # Ideally we should be able to just register func.decompose but # we can't as this decomp is gonna be registered to the py_impl. # As a result it will infinitely recurse. So we first check if the op diff --git a/torch/_functorch/_aot_autograd/autograd_cache.py b/torch/_functorch/_aot_autograd/autograd_cache.py index 6c7cebbff898..954dc399f96b 100644 --- a/torch/_functorch/_aot_autograd/autograd_cache.py +++ b/torch/_functorch/_aot_autograd/autograd_cache.py @@ -279,7 +279,7 @@ def check_cacheable(gm: torch.fx.GraphModule): # Subgraphs are only used for caching logic. if hasattr(gm, "saved_tensors_hooks_pack_0"): check_cacheable(gm.saved_tensors_hooks_pack_0) # type: ignore[arg-type] - # We have guarantee of unpack sugraph existance if pack subgraph exists + # We have guarantee of unpack subgraph existence if pack subgraph exists check_cacheable(gm.saved_tensors_hooks_unpack_0) # type: ignore[arg-type] diff --git a/torch/_functorch/_aot_autograd/collect_metadata_analysis.py b/torch/_functorch/_aot_autograd/collect_metadata_analysis.py index db5075c144b6..cc13a0a725f5 100644 --- a/torch/_functorch/_aot_autograd/collect_metadata_analysis.py +++ b/torch/_functorch/_aot_autograd/collect_metadata_analysis.py @@ -61,7 +61,7 @@ static_input_logger = getArtifactLogger("torch._dynamo", "cudagraph_static_input # We assume tangents memory format to be similar to corresponding output's memory_format. # The idea is that we are technically making a guess about the strides of our tangents, # while we trace out the joint. -# If runtime specfied tangents will not have the same memory format as predicted traced tangents, +# If runtime specified tangents will not have the same memory format as predicted traced tangents, # we coerce them at runtime to traced tangents memory format. @@ -83,7 +83,7 @@ def coerce_tangent_and_suggest_memory_format(x: Tensor): out = out.contiguous(memory_format=memory_format.memory_format) updated = was is not out - # For subclass we keep memory format of outer strides at the beggining of the list + # For subclass we keep memory format of outer strides at the beginning of the list out_memory_format = [memory_format] if is_subclass else memory_format # Note [Tangents memory format, Part 2] @@ -583,7 +583,7 @@ from a multi-output view call" and not o.requires_grad ): # In theory we could use any of these tensors to regenerate the aliased outputs from, - # since they all alias each other and have identical metatadata + # since they all alias each other and have identical metadata out_alias = outs_with_identical_metadata_that_require_grad[0] existing_out_idx = out_tensor_ids[id(out_alias)] output_type = OutputType.alias_of_intermediate_base_is_user_output @@ -702,7 +702,7 @@ from a multi-output view call" # (a * b).sum().backward() # # We can not deduce it easily now, so introducing a debug config to be able to turn off this for specific cases.
- # NJT gurantees to have its tangent as NJT, because it has dedicated integration in Autograd + # NJT guarantees to have its tangent as NJT, because it has dedicated integration in Autograd # See torch/csrc/autograd/python_function.cpp, use_zeros_like. ( _plain_fake_tensor_like_subclass(inp) diff --git a/torch/_functorch/_aot_autograd/functional_utils.py b/torch/_functorch/_aot_autograd/functional_utils.py index e208fa4f6a44..4e74ed6341b9 100644 --- a/torch/_functorch/_aot_autograd/functional_utils.py +++ b/torch/_functorch/_aot_autograd/functional_utils.py @@ -371,7 +371,7 @@ class FunctionalTensorMetadataEq: if other is None: return True - # Comparison agains any other type is not implemented. + # Comparison against any other type is not implemented. if not isinstance(other, FunctionalTensorMetadataEq): return NotImplemented diff --git a/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py b/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py index 2e512cb3c9ce..53bfa1e3c51e 100644 --- a/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py +++ b/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py @@ -1048,7 +1048,7 @@ def maybe_inline_graph_saved_tensors_hooks( fw_outs_bw_ins_node_names.append(new_node_name) else: # We can not specify desired name in node_copy. - # Copying node manually to set specifc name, + # Copying node manually to set specific name, # to have matching fw_outs, bw_inputs names. new_node_name = _gen_unused_name(f"{saved.name}_hook_{out_idx}") with fw_g.inserting_before(_n): @@ -1458,7 +1458,7 @@ def aot_dispatch_autograd( # It's possible to construct a case where eager may or may not have have tried to autograd through y, # depending on the actual grad_outputs that were passed in during the backward. # There is no easy fix for this: the simplest fix would be to run with `retain_graph=True`, - # allowing autograd to re-use the graph. + # allowing autograd to reuse the graph. # # An example of this case is: # def f(x): diff --git a/torch/_functorch/_aot_autograd/runtime_wrappers.py b/torch/_functorch/_aot_autograd/runtime_wrappers.py index 03bc4fae429d..77eebd5e6248 100644 --- a/torch/_functorch/_aot_autograd/runtime_wrappers.py +++ b/torch/_functorch/_aot_autograd/runtime_wrappers.py @@ -1440,7 +1440,7 @@ def merge_view_inputs( # to have incorrect sizes. example_idx = aliased_input_indices[0] example_alias = fwd_inputs[example_idx] - # Note that this function is re-used at both trace time and runtime. + # Note that this function is reused at both trace time and runtime. # At trace time, we're under a FakeMode so synthetic_base becomes a FakeTensor. synthetic_base = torch.empty( (0,), dtype=example_alias.dtype, device=example_alias.device @@ -1519,7 +1519,7 @@ def merge_view_inputs( # unless we suspect that inductor might specialize and insert additional guards. When we do lazy # lowering, we stash the AOT backward graph (bw_module) in this class. # -# Lowering passes are performed on a deepcopy of this bw_module due to compatbility +# Lowering passes are performed on a deepcopy of this bw_module due to compatibility # with compiled autograd. See: https://github.com/pytorch/pytorch/pull/149229#discussion_r2002122645. @dataclass class AutogradLazyBackwardCompileInfo: @@ -1842,7 +1842,7 @@ def coerce_to_expected_memory_format(x: torch.Tensor, memory_format: MemoryForma return x # Empty_strided creates a raw Tensor. - # We are guranteed that only raw Tensors has expected size and stride. 
+ # We are guaranteed that only raw Tensors has expected size and stride. # Subclasses have only expected memory_format. restrided = torch.empty_strided( size=expected_size, diff --git a/torch/_functorch/_aot_autograd/schemas.py b/torch/_functorch/_aot_autograd/schemas.py index cfcbaa8cc097..9b3239823303 100644 --- a/torch/_functorch/_aot_autograd/schemas.py +++ b/torch/_functorch/_aot_autograd/schemas.py @@ -224,7 +224,7 @@ class SubclassCreationMeta: # arg_count is inclusive of the arg_counts of any # inner tensor subclasses: If I have a TwoTensor and # both of its inner elements are TwoTensors, then the - # arg_count of the outer-most sublass will be 4 + # arg_count of the outer-most subclass will be 4 arg_count: int # Mark where or not symints were included. This flag is only used in one assertion # in "wrap_tensor_subclasses" @@ -384,7 +384,7 @@ class ViewAndMutationMeta: # metadata pass of the user's forward function. # Their only use today is to pass them as a best-guess for tangents when tracing the joint. # Stashing them as part of our "metadata" makes it simpler if we want to run our analysis - # pass once, and re-use the output throughout AOTAutograd + # pass once, and reuse the output throughout AOTAutograd traced_tangents: list[Any] # Each of these is a list telling us about subclasses for the inputs/outputs/grad_outs diff --git a/torch/_functorch/_aot_autograd/subclass_utils.py b/torch/_functorch/_aot_autograd/subclass_utils.py index 986e569dfc3d..789495d9fb97 100644 --- a/torch/_functorch/_aot_autograd/subclass_utils.py +++ b/torch/_functorch/_aot_autograd/subclass_utils.py @@ -370,7 +370,7 @@ def wrap_tensor_subclasses( # we computed subclass metadata on every forward output, but this did **not** include activations # created by the partitioner. # as a result, `unwrapped_args` here will correspond to (*unwrapped_user_fw_outs, *activations), - # but `subclass_metas` will only correspond to subclass metatadata on `user_fw_outs`. + # but `subclass_metas` will only correspond to subclass metadata on `user_fw_outs`. # We then need to make sure that we return (*wrapped_user_fw_outs, *activations). if num_fw_outs_saved_for_bw is not None: assert len(unwrapped_args) == num_args_tallied + num_fw_outs_saved_for_bw, ( @@ -396,7 +396,7 @@ def wrap_tensor_subclasses( def wrap_tensor_subclasses_maybe_joint( unwrapped_args, *, is_joint_structure: bool, meta: ViewAndMutationMeta ) -> Union[tuple[Any, ...], list[Any]]: - # Since this function is re-used for both inference and joint graphs, + # Since this function is reused for both inference and joint graphs, if is_joint_structure: assert isinstance(unwrapped_args, tuple) and len(unwrapped_args) == 2 assert isinstance(unwrapped_args[0], (tuple, list)) and isinstance( diff --git a/torch/_functorch/aot_autograd.py b/torch/_functorch/aot_autograd.py index 7fe748e20896..56367c0c4676 100644 --- a/torch/_functorch/aot_autograd.py +++ b/torch/_functorch/aot_autograd.py @@ -365,7 +365,7 @@ AOT_COUNTER = itertools.count() # # We view every forward output when creating out tangent tensors to handle the problematic # case in which a subclass does extra aliasing between graph outputs/inputs in a way that -# is not visible above the sublass. +# is not visible above the subclass. # # Ordinarily, when constructing the joint function that we want to trace in AOTAutograd, # we're guaranteed that the tangent tensors that we pass @@ -872,7 +872,7 @@ def aot_function( This API is experimental and likely to change. 
Args: - fn (Callable): A Python function that takes one ore more arguments. Must + fn (Callable): A Python function that takes one or more arguments. Must return one or more Tensors. fw_compiler (Callable): A Python function that accepts an Fx graph with Aten ops and input args, and returns a Callable that semantically is @@ -1260,7 +1260,7 @@ def aot_export_module( # Your module can return multiple outputs, so you must specify which output the loss is. output_loss_index: Optional[int] = None, pre_dispatch: bool = False, - # If None, will be infered from inputs and mod.graph.nodes if mod is a graph module, but the inferred result might be wrong. + # If None, will be inferred from inputs and mod.graph.nodes if mod is a graph module, but the inferred result might be wrong. dynamic_shapes: Optional[bool] = None, kwargs=None, ) -> tuple[torch.fx.GraphModule, GraphSignature]: @@ -1459,7 +1459,7 @@ def aot_export_joint_simple( *, trace_joint: bool, # It looks like the main consequence of this API is that for dynamic shapes, - # it will assume that parms/buffers are static. + # it will assume that params/buffers are static. # With the new inferred dynamic shapes API, maybe this doesn't matter? num_params_buffers: int = 0, decompositions: Optional[dict] = None, @@ -1570,7 +1570,7 @@ def _aot_export_function( # We don't know this info at trace time though, so we need to make it an explicit config. no_tangents: bool = False, pre_dispatch: bool = False, - # If None, `dynamic_shapes` will be infered from inputs, but the inferred result might be wrong. + # If None, `dynamic_shapes` will be inferred from inputs, but the inferred result might be wrong. dynamic_shapes: Optional[bool] = None, keep_input_mutations: bool = False, kwargs=None, diff --git a/torch/_functorch/compile_utils.py b/torch/_functorch/compile_utils.py index 39eadaae7ef6..929b58540f41 100644 --- a/torch/_functorch/compile_utils.py +++ b/torch/_functorch/compile_utils.py @@ -179,7 +179,7 @@ def raise_getitems(gm: fx.GraphModule) -> fx.GraphModule: ) # loop through getitem nodes in the graph and raise them to the parent node - # in reverse order to perserve their original relative order + # in reverse order to preserve their original relative order for node in reversed(getitem_nodes): assert len(node.all_input_nodes) == 1 parent = node.all_input_nodes[0] diff --git a/torch/_functorch/compilers.py b/torch/_functorch/compilers.py index 65cb80211213..5295a526e25c 100644 --- a/torch/_functorch/compilers.py +++ b/torch/_functorch/compilers.py @@ -31,7 +31,7 @@ from .partitioners import ( log = logging.getLogger(__name__) -# These canonicalizations are needed here (and not decompositions), as the ops +# These canonicalizations are needed here (and not decompositions), as the ops we're trying to canonicalize to CompositeImplicitAutograd. def _canonicalize(fx_g): for node in fx_g.graph.find_nodes( @@ -249,7 +249,7 @@ def memory_efficient_fusion( Args: fn (Union[Callable, nn.Module]): A Python function or a ``nn.Module`` - that takes one ore more arguments. Must return one or more Tensors.
**kwargs: Any other overrides you want to make to the settings Returns: diff --git a/torch/_functorch/config.py b/torch/_functorch/config.py index 66f1fe88c612..e8778f31889d 100644 --- a/torch/_functorch/config.py +++ b/torch/_functorch/config.py @@ -292,7 +292,7 @@ strict_autograd_cache = False # which can reorder or ,delete duplicate nodes in the graph # - If any of these passes reorder/delete/duplicate a collective # in a setting where the compiler is being run independently on multiple -# ranks, we run the risk that the compiler will make a different decison on +# ranks, we run the risk that the compiler will make a different decision on # different ranks, resulting in a NCCL hang when using torch.compile # To handle this, we will (by default) ensure that collectives are not modified # by the compiler. diff --git a/torch/_functorch/partitioners.py b/torch/_functorch/partitioners.py index 21218e606853..7b36092e09eb 100644 --- a/torch/_functorch/partitioners.py +++ b/torch/_functorch/partitioners.py @@ -513,7 +513,7 @@ def should_quantize(node: torch.fx.Node) -> bool: ].get("skip_dynamo_guards", False): return size_in_mb >= size_threshold else: - # case 1: we alway quantize tensors with dynamic shapes + # case 1: we always quantize tensors with dynamic shapes if torch._inductor.config.post_grad_fusion_options[ "activation_quantization_aten_pass" ].get("quantize_dynamic_shape", False): @@ -521,7 +521,7 @@ def should_quantize(node: torch.fx.Node) -> bool: size_in_mb >= size_threshold ) or not statically_known_false(size_in_mb >= size_threshold) else: - # case 2: we alway not quantize tensors with dynamic shapes + # case 2: we always not quantize tensors with dynamic shapes return statically_known_true(size_in_mb >= size_threshold) @@ -592,7 +592,7 @@ def quantize_activation_fw(graph: torch.fx.Graph) -> None: output_updated_args = [ node_to_quant[node] if node in node_to_quant else node for node in fwd_outputs ] - # add the scale nodes to the ouput find the first sym_node in the output + # add the scale nodes to the output find the first sym_node in the output idx = find_first_sym_node(output_updated_args) scale_nodes = tensor_scale_nodes + sym_scale_nodes if scale_nodes: @@ -1094,7 +1094,7 @@ def reordering_to_mimic_autograd_engine(gm: fx.GraphModule) -> fx.GraphModule: """ This pass finds the first bwd node in the graph (by looking at users of tangents) and then reorders the graph by walking from this node to all the - way to the end of the graph. At each op in this traveral, we insert this op + way to the end of the graph. At each op in this traversal, we insert this op in a new graph and try to bring only the relevant subgraph from the other non-bwd edges relevant for this op. This closely mimics the behavior of autograd engine. @@ -1364,7 +1364,7 @@ def functionalize_rng_ops( get_device(node_pair["fwd"]) for node_pair in recomputable_rng_ops_map.values() ) devices.discard(torch.device("cpu")) - # multiple cuda devices wont work with cudagraphs anyway, + # multiple cuda devices won't work with cudagraphs anyway, # fallback to non graphsafe rng checkpointing multi_cuda_devices = len(devices) > 1 diff --git a/torch/_higher_order_ops/auto_functionalize.py b/torch/_higher_order_ops/auto_functionalize.py index ef8cddbae7c1..d5aa0d09c8b1 100644 --- a/torch/_higher_order_ops/auto_functionalize.py +++ b/torch/_higher_order_ops/auto_functionalize.py @@ -586,7 +586,7 @@ class FunctionalCallableWithEpilogue: def __call__(self, *args, **kwargs): # We call torch.func.functionalize. 
This allows us to inline the epilogue graph. # Inlining has the benefit of allowing easiser fusion inside subgraph. - # Though the epilogue graph contains copy_, it is OK becuase inductor can handle it + # Though the epilogue graph contains copy_, it is OK because inductor can handle it # and this is also how we have been supporting top-level graph input mutation. return tuple(torch.func.functionalize(self.orig_callable)(*args, **kwargs)) @@ -944,7 +944,7 @@ def auto_functionalized_v2_proxy( # Below code materializes the callable inputs to the hop as graph modules. # kwargs may contain general callables, that are not proxable e.g. FunctionWithNoFreeVars # this could happen when we auto_functionalize the backward of the hop, - # where backward fn is a callablle that wrapps forward graph module. + # where backward fn is a callable that wraps forward graph module. # This function materialize the callable args according to the schema of the hop. # We cannot materialize the callables in kwargs directly because the inputs to callable diff --git a/torch/_higher_order_ops/base_hop.py b/torch/_higher_order_ops/base_hop.py index 8898c56ab227..11826c3f6369 100644 --- a/torch/_higher_order_ops/base_hop.py +++ b/torch/_higher_order_ops/base_hop.py @@ -198,7 +198,7 @@ class BaseHOP(HigherOrderOperator, abc.ABC): import warnings warnings.warn( - "Aliasing is not suppported for HOP subgraph.\n" + "Aliasing is not supported for HOP subgraph.\n" f"{subgraph.print_readable(print_output=False)}\n" f"Alias info: inp-inp alias: {inp_inp_alias}, inp-out alias: {inp_out_alias}, out-out alias{out_out_alias}" f"This may lead to silent incorrectness." ) diff --git a/torch/_higher_order_ops/cond.py b/torch/_higher_order_ops/cond.py index 518c0624cbab..648d41b0b95a 100644 --- a/torch/_higher_order_ops/cond.py +++ b/torch/_higher_order_ops/cond.py @@ -348,7 +348,7 @@ class CondAutogradOp(torch.autograd.Function): operands = saved_tensors_and_symints(ctx) args = operands + flat_grads # TODO: we need to materialize the bw graphs because dynamo is unable to - # trace through the joint funcion when torch.compile torch.autograd.grad. + # trace through the joint function when torch.compile torch.autograd.grad. true_bw_gm = materialize_as_graph( ctx._true_bw_fn, args, @@ -552,7 +552,7 @@ def _merge_output( ... Case 2: At least one dimension has size 1, which can produce duplicates in strides. - In this case, theorectically, we cannot uniquely determine the expr of strides because + In this case, theoretically, we cannot uniquely determine the expr of strides because the accessing stride_expr with same key in different order causes the final stride expression to be different. @@ -562,7 +562,7 @@ def _merge_output( merged_size: (u0, u1) The stride expr could either be (u1, 1) or (1, u0) depending on whether we start with u1 or u0. - For this reason, we try to break tie by sorting via decending index so we always get (u1, 1). + For this reason, we try to break tie by sorting via descending index so we always get (u1, 1). Note that backend might optimize the strides anyway so this is usually not a problem as long as two branches matches. See relevant discussions in https://github.com/pytorch/pytorch/issues/142024.
diff --git a/torch/_higher_order_ops/flat_apply.py b/torch/_higher_order_ops/flat_apply.py index 7b496d895129..654e2ea38384 100644 --- a/torch/_higher_order_ops/flat_apply.py +++ b/torch/_higher_order_ops/flat_apply.py @@ -108,7 +108,7 @@ def impl(func, in_spec, *flat_args): # # TODO: The following can be updated to support non-graphable outputs and pytrees. # For non-graphable constant outputs: the assumption would be that they are constant - # (everytime the function runs those MUST be the same) + # (every time the function runs those MUST be the same) # For pytree outputs: # I'm not sure if we need to return (flat_output, spec) or just (flat_output,): # in the latter case the tracers need to carry out the output specs diff --git a/torch/_higher_order_ops/invoke_subgraph.py b/torch/_higher_order_ops/invoke_subgraph.py index 0b21b6153100..e4ebdb68fd34 100644 --- a/torch/_higher_order_ops/invoke_subgraph.py +++ b/torch/_higher_order_ops/invoke_subgraph.py @@ -560,9 +560,9 @@ def _(ctx, subgraph, identifier, *operands): # We call auto_functionalized_v2 to support input mutation of invoke_subgraph. # See NOTE [Support input mutation of hops] for the overall design. # - # invoke_subgraph is special because of its identifier based caching machanism. + # invoke_subgraph is special because of its identifier based caching mechanism. # In invoke_subgraph's functionalization key implementation, we create a new - # identifer because the subgraph is replaced by FunctionWithNoFreeVars in a + # identifier because the subgraph is replaced by FunctionWithNoFreeVars in a # functional + epilogue form. assert isinstance(identifier, str), identifier return do_auto_functionalize_v2( @@ -635,7 +635,7 @@ def _(proxy_mode: ProxyTorchDispatchMode, subgraph, identifier, *operands): # with a previously cached identifier, the corresponding graph module might not # exist as a submodule in the new tracer's root. Therefore, we register it as a submodule below. # - # The alternative is to give a new identifer when we re-trace the invoke_subgraph but this will increase + # The alternative is to give a new identifier when we re-trace the invoke_subgraph but this will increase # the compilatoin time, which defeats the purpose of caching. registered_before = False for ( diff --git a/torch/_higher_order_ops/map.py b/torch/_higher_order_ops/map.py index ff26c25222db..9f73df7ef478 100644 --- a/torch/_higher_order_ops/map.py +++ b/torch/_higher_order_ops/map.py @@ -117,7 +117,7 @@ def map( *args: TypeVarTuple, ): r""" - Perfoms a map of f with xs. Intuitively, you can think of the semantic being: + Performs a map of f with xs. Intuitively, you can think of the semantic being: out = [] for idx in len(xs.size(0)): diff --git a/torch/_higher_order_ops/scan.py b/torch/_higher_order_ops/scan.py index fb94bda71d2d..7681cf8f070c 100644 --- a/torch/_higher_order_ops/scan.py +++ b/torch/_higher_order_ops/scan.py @@ -135,7 +135,7 @@ def scan( and the second output of ``combine_fn`` represents a slice of the output. This function must be pure, i.e., no lifted arguments are supported at the moment and may not have any side effects. - init (torch.Tensor or pytree with tensor leaves): The inital scan carry, a tensor, or nested pytree of tensors. + init (torch.Tensor or pytree with tensor leaves): The initial scan carry, a tensor, or nested pytree of tensors. The ``init`` is expected to have the same pytree structure as the first output element (i.e. carry) of ``combine_fn``. 
xs (torch.Tensor or pytree with tensor leaves): The input tensor, or nested pytree of tensors. @@ -154,7 +154,7 @@ def scan( - The combine_fn shouldn't have any aliasing between input-input, input-output, and output-output. E.g. return a view or the same tensor as input is not supported. As a workaround, can clone the output to avoid aliasing. - - The combine_fn shoudn't mutate any inputs. We'll remove the mutation restriction for inference soon. Please file an issue + - The combine_fn shouldn't mutate any inputs. We'll remove the mutation restriction for inference soon. Please file an issue if you input mutation support for training is needed. - The combine_fn's init carry should match the next_carry in pytree structure and in tensor metadata. @@ -585,7 +585,7 @@ class ScanAutogradOp(torch.autograd.Function): carry, y = _extract_carry_and_out(combine_fn(*args), num_leaves_init) return [ *carry, - # We additionally checkpoint all the intemediate carry outputs for backward. + # We additionally checkpoint all the intermediate carry outputs for backward. *[ n_c.clone().detach() if isinstance(n_c, torch.Tensor) else n_c for n_c in carry @@ -793,7 +793,7 @@ class ScanAutogradOp(torch.autograd.Function): # Prepare the bwd_init bwd_init = [*initial_g_additional_inputs, *g_c_T] - # 5.) Perform the backwrad scan: + # 5.) Perform the backward scan: # The ``combine_fn_bw_wrapped`` receives the # initial_g_additional_inputs and the last carry as the ``bwd_init`` and the # gradients of the outputs (g_ys), as well as the fw_carries and the fw_xs of the forward as the ``bwd_xs`` diff --git a/torch/_higher_order_ops/schema.py b/torch/_higher_order_ops/schema.py index c7378147a205..15bfac752ab7 100644 --- a/torch/_higher_order_ops/schema.py +++ b/torch/_higher_order_ops/schema.py @@ -18,7 +18,7 @@ class HopArgumentInfo: example_value: Any # Provide an default_value default_value: Any - # Whether this arugment gets mutated in the hop subgraph. + # Whether this argument gets mutated in the hop subgraph. # For output, this should always be False is_mutated: bool kw_only: bool diff --git a/torch/_higher_order_ops/torchbind.py b/torch/_higher_order_ops/torchbind.py index 5496276f1dda..c10e674b7ac0 100644 --- a/torch/_higher_order_ops/torchbind.py +++ b/torch/_higher_order_ops/torchbind.py @@ -136,7 +136,7 @@ def inner(mode, *args, **kwargs): # When tracing with fake script object, the call_torchbind op will return a fake tensor # When tracing with real script object, the call_torchbind op may return a real tensor, -# we need to convert it to fake tensor mannually. Dynamic shape is surpported. +# we need to convert it to fake tensor manually. Dynamic shape is supported. 
@call_torchbind.py_impl(FakeTensorMode) def call_torchbind_fake(mode, *args, **kwargs): with mode: diff --git a/torch/_higher_order_ops/triton_kernel_wrap.py b/torch/_higher_order_ops/triton_kernel_wrap.py index 71fb20441037..34a9c5915254 100644 --- a/torch/_higher_order_ops/triton_kernel_wrap.py +++ b/torch/_higher_order_ops/triton_kernel_wrap.py @@ -1037,7 +1037,7 @@ def triton_kernel_wrapper_mutation_dense( # as we need to launch the kernel here, we "unwrap" the # tma_descriptor_metadata, create the TMA descriptors # from it, and replace the tensors in the kwargs by the - # correspoinding TMA descriptors before launching + # corresponding TMA descriptors before launching kwargs = kwargs.copy() for k, v in tma_descriptor_metadata.items(): tensor = kwargs[k] diff --git a/torch/_higher_order_ops/utils.py b/torch/_higher_order_ops/utils.py index 9b5293fbe64b..2000571f6057 100644 --- a/torch/_higher_order_ops/utils.py +++ b/torch/_higher_order_ops/utils.py @@ -852,7 +852,7 @@ def check_input_alias_and_mutation_return_outputs( # Clone the fake args to avoid mutating the original fake args with ExitStack() as ctx_stack: - # We need to re-use prev_fake_mode's shape env to resolve + # We need to reuse prev_fake_mode's shape env to resolve # the runtime assertions for unbacked symbols. new_fake_mode = torch._subclasses.FakeTensorMode( shape_env=_get_shape_env(fake_args), diff --git a/torch/_higher_order_ops/while_loop.py b/torch/_higher_order_ops/while_loop.py index e0e57dfad3f3..d94ccf16d216 100644 --- a/torch/_higher_order_ops/while_loop.py +++ b/torch/_higher_order_ops/while_loop.py @@ -107,9 +107,9 @@ def while_loop(cond_fn, body_fn, carried_inputs): - body_fn and cond_fn must not in-place mutate the carried_inputs. A clone before the mutation is required. - - body_fn and cond_fn must not mutate python varialbles (e.g. list/dict) created outside of the body_fn. + - body_fn and cond_fn must not mutate python variables (e.g. list/dict) created outside of the body_fn. - - body_fn and cond_fn's output cannot aliase any of the inputs. A clone is required. + - body_fn and cond_fn's output cannot alias any of the inputs. A clone is required. .. warning:: Temporal Limitations: @@ -279,8 +279,8 @@ def while_loop_tracing(mode, cond_fn, body_fn, carried_inputs, additional_inputs # For this reason, we treat int, symint outputs in the same way: # - they can match against any of int, symint carry # - we unspecialize them with new unbacked symints in fake while_loop - # Similarly, we could do some analysis to refine the output ranges but it's eaiser to start with - # fresh unbacked symints. One suprising case can be: an input unbacked symint is constrained by + # Similarly, we could do some analysis to refine the output ranges but it's easier to start with + # fresh unbacked symints. One surprising case can be: an input unbacked symint is constrained by # users to be >= 0 (either before while_loop or inside body_fn) and it increments by 1 in each # iteration. Ideally, we should know that the final output is >= 0 but we didn't constrain the # unbacked symint output of subgraph as of today because this requires a smart range analysis. diff --git a/torch/_inductor/analysis/README.md b/torch/_inductor/analysis/README.md index 761631c6f740..b0e1d154f9b1 100644 --- a/torch/_inductor/analysis/README.md +++ b/torch/_inductor/analysis/README.md @@ -14,7 +14,7 @@ python profile_analysis.py --analysis - `default_dtype`: The default dtype of the model. 
Sometimes the dtypes of the kernel inputs are not available in the profile, so we use the default dtype to infer the dtypes of the inputs. ## Diff -This mode will diff two different profiles and output a table of the differences. It groups by kernel name, which can fail to properly match accross hardware vendors. More intelligent grouping coming soon. +This mode will diff two different profiles and output a table of the differences. It groups by kernel name, which can fail to properly match across hardware vendors. More intelligent grouping coming soon. ### Usage ``` @@ -25,7 +25,7 @@ python profile_analysis.py --diff defaultdict[int, list[dict[str, Any]]]: """ - compute a mapping from exteral ids to non kernels, which contain the information we need to estimate flops etc + compute a mapping from external ids to non kernels, which contain the information we need to estimate flops etc """ extern_mapping: defaultdict[int, list[dict[str, Any]]] = defaultdict(list) for event in data["traceEvents"]: @@ -402,7 +402,7 @@ class JsonProfile: dtype: Optional[Union[torch.dtype, str]] = None, ): """ - Convienence class for running common operations on chrome/perfetto json traces. + Convenience class for running common operations on chrome/perfetto json traces. """ self.path = path with open(path) as f: diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py index 3c32ca21ea80..0b333dc97236 100644 --- a/torch/_inductor/codecache.py +++ b/torch/_inductor/codecache.py @@ -1829,7 +1829,7 @@ class AotCodeCompiler: consts_asm += f"{symbol_prefix}_binary_constants_bin_end:\n" return consts_asm, "S" - # Use c++ to comvert consts to object file can support more compilers, such as msvc and icx. + # Use c++ to convert consts to object file can support more compilers, such as msvc and icx. def format_consts_to_cpp( consts: bytes, align_bytes: int, symbol_prefix: str ) -> tuple[str, str]: diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py index de7e53a69f3c..b8db456a2e68 100644 --- a/torch/_inductor/codegen/triton.py +++ b/torch/_inductor/codegen/triton.py @@ -1772,7 +1772,7 @@ class TMACompatibilityChecker: # For a discontiguous tensor, a 1D block will be split across several # dimensions, e.g. R0_BLOCK: # block_shape=[XBLOCK, ((R0_BLOCK + 31)//32), Min(1, ((R0_BLOCK + 31)//32)), Min(32, R0_BLOCK)] - # The persistent R0_BLOCK will be a power of 2 that is atleast r0_numel So it + # The persistent R0_BLOCK will be a power of 2 that is at least r0_numel So it # should be guaranteed that Min(32, R0_BLOCK) * element_size >= 16 innermost_tree_prefix = prefix_str[innermost_block_symt] tree_numel = None diff --git a/torch/_inductor/comms.py b/torch/_inductor/comms.py index 87a0bca6255e..caaf43dba590 100644 --- a/torch/_inductor/comms.py +++ b/torch/_inductor/comms.py @@ -663,7 +663,7 @@ def _sink_waits_iterative_internal( data_dep = o.get_name() break # 1. If we have data_dep - we can not swap => trying to group - # 2. If swap candidate and current node boths contain collectives => trying to group + # 2. 
If swap candidate and current node both contain collectives => trying to group if data_dep is not None or ( both_contain_comms := ( contains_collective(wait_gsnode) diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py index 923c9745b27a..ba532518c5d8 100644 --- a/torch/_inductor/config.py +++ b/torch/_inductor/config.py @@ -1298,7 +1298,7 @@ class triton: # - For Nvidia GPUs, the compute capability should be >= 9.0 # - The innermost stride of a descriptor should be 1 # - The size of the block shape in the innermost dimension should load / store - # atleast 16 bytes. + # at least 16 bytes. # - Tensors are 16 byte aligned. Enabling this option therefore requires # assume_aligned_inputs to also be enabled # TMA descriptors are only going to be generated if the above conditions diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py index bf779e4039a4..5195770dc24a 100644 --- a/torch/_inductor/runtime/triton_heuristics.py +++ b/torch/_inductor/runtime/triton_heuristics.py @@ -1235,7 +1235,7 @@ class CachingAutotuner(KernelInterface): if launcher.store_cubin and (not benchmark_run or not self.cuda_kernel_saved): self.save_gpu_kernel(stream, launcher) - # PyTorch execution trace replay calls CachingAutotuner::run() instread of calls launcher + # PyTorch execution trace replay calls CachingAutotuner::run() instead of calls launcher # so _RecordFunctionFast need to capture the args into CachingAutotuner::run() # make a copy here to avoid mutating the original args args_without_constexprs = tuple(args) diff --git a/torch/_lazy/extract_compiled_graph.py b/torch/_lazy/extract_compiled_graph.py index d014c272490b..38219a54b30b 100644 --- a/torch/_lazy/extract_compiled_graph.py +++ b/torch/_lazy/extract_compiled_graph.py @@ -56,9 +56,9 @@ class ReturnValueHandler: r""" When ltc_sync_multi is called on multi tensors, the compiled graph will contain output only for unique tensors - if a tensor appears multiple - times in the input to _ltc_sync_multi, only the first occurance matters. + times in the input to _ltc_sync_multi, only the first occurrence matters. - However from python level, we still expect multi tensors returned with duplciation + However from python level, we still expect multi tensors returned with duplication even if the TS graph dedup the output. e.g. for method: def forward(self, a): @@ -123,7 +123,7 @@ def force_lazy_device(model: fx.GraphModule): # To force those tensors on the lazy device, we can not simply override # the device argument since there is no explicit device argument. # What we are doing here is, for the list of covered tensor factory methods - # we add a lazy device argument explicity. + # we add a lazy device argument explicitly. # # TODO: This solution is no ideal since we may miss some factory methods. In future # when we support lazy mode, this method can be replaced by that. 
@@ -170,7 +170,7 @@ def extract_compiled_graph(model: fx.GraphModule, example_inputs) -> Callable: if len(fallback_ops) > 0: raise RuntimeError( - f"Fail to extact the compiled graph because of fallback: {','.join(fallback_ops)}" + f"Fail to extract the compiled graph because of fallback: {','.join(fallback_ops)}" ) if not isinstance(lazy_out, (tuple, list)): diff --git a/torch/_lazy/metrics.py b/torch/_lazy/metrics.py index a77981feb90d..3f676ec1f8ae 100644 --- a/torch/_lazy/metrics.py +++ b/torch/_lazy/metrics.py @@ -13,7 +13,7 @@ def counter_names(): def counter_value(name: str): - """Return the value of the counter with the speficied name""" + """Return the value of the counter with the specified name""" return torch._C._lazy._counter_value(name) diff --git a/torch/_library/fake_class_registry.py b/torch/_library/fake_class_registry.py index c0dfc2fe72da..4cb79ae48725 100644 --- a/torch/_library/fake_class_registry.py +++ b/torch/_library/fake_class_registry.py @@ -137,7 +137,7 @@ def maybe_to_fake_obj( # x.__obj_flatten__() could be calling some tensor operations inside but we don't # want to call these ops in surrounding dispatch modes when executing it. # Otherwise, for example, the fake tensor modes will error out when the tensors inside - # script obeject execute some operations like clone if allow_non_fake_input flag is set. + # script object execute some operations like clone if allow_non_fake_input flag is set. with _disable_current_modes(): flat_x = x.__obj_flatten__() # type: ignore[attr-defined] @@ -238,8 +238,8 @@ def register_fake_class(qualname, fake_class: Optional[HasStaticMethodFromReal] def size(self): return len(self.queue) - In this example, the original TensorQeue need to addd a __obj_flatten__ method - to the class TensorQueue and the flattend result is passed into FakeTensorQueue's + In this example, the original TensorQueue needs to add a __obj_flatten__ method + to the class TensorQueue and the flattened result is passed into FakeTensorQueue's __obj_unflatten__ as inputs to create a fake class. This protocol allows pytorch to look at the contents of the script object and properly handle them in the subsystems like dynamo, aot_aotugrad or more. @@ -248,7 +248,7 @@ def register_fake_class(qualname, fake_class: Optional[HasStaticMethodFromReal] def inner(fake_class: HasStaticMethodFromReal): ns, name = parse_namespace(qualname) - # This also checks whether the refered torch::class_ exists. + # This also checks whether the referred torch::class_ exists. torch._C._get_custom_class_python_wrapper(ns, name) from_method = getattr(fake_class, _CONVERT_FROM_REAL_NAME, None) diff --git a/torch/_library/fake_profile.py b/torch/_library/fake_profile.py index 9b2fc0ae2baf..d480f6662680 100644 --- a/torch/_library/fake_profile.py +++ b/torch/_library/fake_profile.py @@ -102,7 +102,7 @@ def unsafe_generate_fake_kernels(op_profiles: dict[str, set[OpProfile]]) -> Gene an output with the same metadata as in the recorded profile. If a profile doesn't exist then an exception will be thrown. - The fake kernel generation is considerd unsafe because it relies on the + The fake kernel generation is considered unsafe because it relies on the rigid, pre-defined operator profiles that do not account for potential variations in output behavior. Specifically, the generated kernels assume a fixed relationship between input and output ranks.
However, in reality, it's diff --git a/torch/_library/utils.py b/torch/_library/utils.py index b5f87f230e7f..17e128bdbe0f 100644 --- a/torch/_library/utils.py +++ b/torch/_library/utils.py @@ -14,7 +14,7 @@ from torch._ops import OpOverload def warn_deploy(stacklevel=3): warnings.warn( - "Python torch.library APIs do nothing under torch::deploy (multipy). " + "Python torch.library APIs do nothing under torch::deploy (multipy). " # codespell:ignore multipy "Please instead use C++ custom operator registration APIs.", RuntimeWarning, stacklevel=stacklevel, @@ -442,7 +442,7 @@ class MutationChecker: f"{self.op._name}: for argument '{info.name}': the operator's schema " f"{self.op._schema} specified that " f"the operator {'mutates' if info.is_write else 'does not mutate'} " - f"the argument, but this seems to be emperically wrong. " + f"the argument, but this seems to be empirically wrong. " f"Please make the schema and operator behavior consistent. " f"You can specify that an operator mutates a Tensor by " f"e.g. changing its schema type from 'Tensor name' to 'Tensor(a!) name'" diff --git a/torch/_numpy/_dtypes_impl.py b/torch/_numpy/_dtypes_impl.py index d9eb9cc94c27..feed9c460050 100644 --- a/torch/_numpy/_dtypes_impl.py +++ b/torch/_numpy/_dtypes_impl.py @@ -1,6 +1,6 @@ # mypy: ignore-errors -"""Dtypes/scalar type implementaions with torch dtypes. +"""Dtypes/scalar type implementations with torch dtypes. Here `dtype` is always a torch.dtype, this module knows nothing about scalar types, wrapper dtypes or anything like that. PyTorch only. diff --git a/torch/_numpy/_funcs_impl.py b/torch/_numpy/_funcs_impl.py index 4030ba97766b..19748a08b9de 100644 --- a/torch/_numpy/_funcs_impl.py +++ b/torch/_numpy/_funcs_impl.py @@ -96,7 +96,7 @@ def _concat_cast_helper(tensors, out=None, dtype=None, casting="same_kind"): else: out_dtype = _dtypes_impl.result_type_impl(*tensors) - # cast input arrays if necessary; do not broadcast them agains `out` + # cast input arrays if necessary; do not broadcast them against `out` tensors = _util.typecast_tensors(tensors, out_dtype, casting) return tensors @@ -1290,7 +1290,7 @@ def cross(a: ArrayLike, b: ArrayLike, axisa=-1, axisb=-1, axisc=-1, axis=None): def einsum(*operands, out=None, dtype=None, order="K", casting="safe", optimize=False): # Have to manually normalize *operands and **kwargs, following the NumPy signature - # We have a local import to avoid poluting the global space, as it will be then + # We have a local import to avoid polluting the global space, as it will be then # exported in funcs.py from ._ndarray import ndarray from ._normalizations import ( diff --git a/torch/_numpy/_util.py b/torch/_numpy/_util.py index 443623bcc901..fdb1736a1d0f 100644 --- a/torch/_numpy/_util.py +++ b/torch/_numpy/_util.py @@ -204,7 +204,7 @@ def _coerce_to_tensor(obj, dtype=None, copy=False, ndmin=0): Notes ----- - This is almost a "tensor_like" coersion function. Does not handle wrapper + This is almost a "tensor_like" coercive function. Does not handle wrapper ndarrays (those should be handled in the ndarray-aware layer prior to invoking this function). """ diff --git a/torch/_prims/__init__.py b/torch/_prims/__init__.py index 93c9e5ffb101..6739b334c116 100644 --- a/torch/_prims/__init__.py +++ b/torch/_prims/__init__.py @@ -2174,7 +2174,7 @@ def _resize_aten(a: Tensor, shape: ShapeType) -> Tensor: _resize_doc = """ Gives a tensor with no elements a new shape, returning the modified tensor. - The tensor's strides are contiguous and its values are unitialized. 
+ The tensor's strides are contiguous and its values are uninitialized. """ # TODO: review support arbitrary resizes diff --git a/torch/_prims_common/wrappers.py b/torch/_prims_common/wrappers.py index 2ccba3c28c13..e5e5b13f62c7 100644 --- a/torch/_prims_common/wrappers.py +++ b/torch/_prims_common/wrappers.py @@ -94,7 +94,7 @@ class elementwise_type_promotion_wrapper: Takes two kwargs, type_promoting_args and type_promotion_kind. - type_promoting_args must be a string Sequence specifiying the argument names of all + type_promoting_args must be a string Sequence specifying the argument names of all arguments that participate in type promotion (and should be type promoted). If the arg specifies a Sequence-type then every element of the Sequence will participate in type promotion. diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py index 3b2344f44b9d..c82d7aaecb85 100644 --- a/torch/_refs/__init__.py +++ b/torch/_refs/__init__.py @@ -6077,7 +6077,7 @@ def bucketize( if n_boundaries == 0: return torch.zeros_like(a) # We are trying to find the bucket (defined by pairs of consecutive elements of `boundaries`) - # each element of `a` belongs to. We use binary search to achieve logarithimic complexity, + # each element of `a` belongs to. We use binary search to achieve logarithmic complexity, # but each step of the search is done "in parallel" over all elements of `a` # can't use int32 as indexes, so we have to do all computations with int64 and convert at the end start = torch.zeros(a.shape, device=a.device, dtype=torch.int64) diff --git a/torch/_refs/nn/functional/__init__.py b/torch/_refs/nn/functional/__init__.py index 7a54ca2c3deb..89ead281d947 100644 --- a/torch/_refs/nn/functional/__init__.py +++ b/torch/_refs/nn/functional/__init__.py @@ -760,7 +760,7 @@ def _nll_loss_nd( batch_size = input.shape[0] loss = -input[torch.arange(batch_size), target] * current_weight else: - # 3D case (N batch size, C classe, K dimensions) + # 3D case (N batch size, C classes, K dimensions) # input (N batch size, C classes, K) batch_size = input.shape[0] extent = input.shape[2] diff --git a/torch/_strobelight/cli_function_profiler.py b/torch/_strobelight/cli_function_profiler.py index 29150b43f9f4..80108dc99186 100644 --- a/torch/_strobelight/cli_function_profiler.py +++ b/torch/_strobelight/cli_function_profiler.py @@ -59,7 +59,7 @@ class StrobelightCLIFunctionProfiler: StrobelightCLIFunctionProfiler can be used to profile a python function and generate a strobelight link with the results. It works on meta servers but - does not requries an fbcode target. + does not requires an fbcode target. When stop_at_error is false(default), error during profiling does not prevent the work function from running. diff --git a/torch/_strobelight/compile_time_profiler.py b/torch/_strobelight/compile_time_profiler.py index 2677b75cbbe0..436f9a2c8b59 100644 --- a/torch/_strobelight/compile_time_profiler.py +++ b/torch/_strobelight/compile_time_profiler.py @@ -127,7 +127,7 @@ class StrobelightCompileTimeProfiler: if not shutil.which("strobeclient"): logger.info( - "strobeclient not found, cant enable compile time strobelight profiling, seems" + "strobeclient not found, can't enable compile time strobelight profiling, seems" "like you are not on a FB machine." 
) return diff --git a/torch/_subclasses/fake_impls.py b/torch/_subclasses/fake_impls.py index 8cc9cae224ef..e802d9a4389d 100644 --- a/torch/_subclasses/fake_impls.py +++ b/torch/_subclasses/fake_impls.py @@ -231,7 +231,7 @@ def stride_incorrect_op(op): # These operators have meta implementations with incorrect strides @register_op_impl(stride_incorrect_op) def wordaround_stride_incorrect_op(fake_mode, func, *args, **kwargs): - # This is a workaround for meta implmentations with incorrect strides + # This is a workaround for meta implementations with incorrect strides def is_symbolic(x): if isinstance(x, FakeTensor): diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py index bbecee8004be..c17de15f46ea 100644 --- a/torch/_subclasses/fake_tensor.py +++ b/torch/_subclasses/fake_tensor.py @@ -2366,7 +2366,7 @@ class FakeTensorMode(TorchDispatchMode): # (aot autograd, torchdynamo) where each operation is run consecutively. # Because each operation is run in order, we can trace out and support # sequences like: x = torch.tensor(0.); y = x.add_(1) - # Whenver a constant is written to but with inputs that cannot be evaluated + # Whenever a constant is written to but with inputs that cannot be evaluated # statically, such as random_(), we invalidate all constants that alias the input # We will rely on functionalization for use of fake tensors constants as persistent # objects on an FX Graph. diff --git a/torch/_subclasses/functional_tensor.py b/torch/_subclasses/functional_tensor.py index 956f22d1c4b6..28cc3070affc 100644 --- a/torch/_subclasses/functional_tensor.py +++ b/torch/_subclasses/functional_tensor.py @@ -67,7 +67,7 @@ class FunctionalTensor(torch.Tensor): # later, as long as it doesn't break anything). # FunctionalTensorWrapper copies **all** dispatch keys from the inner tensor # to the wrapper, excluding functorch and python dispatch keys. - # Here I'm trying to re-use the keyset the functorch wrapper subclasses copy, + # Here I'm trying to reuse the keyset the functorch wrapper subclasses copy, # except that they don't include ZeroTensor so I'm manually adding it in. _extra_dispatch_keys = torch._C._additional_keys_to_prop_for_wrapper_tensors.add( torch._C.DispatchKey.ZeroTensor @@ -488,7 +488,7 @@ class FunctionalTensorMode(TorchDispatchMode): - FunctionalTensor._extra_dispatch_keys ) - # All we want to do here is re-use the existing C++ functionalization logic. + # All we want to do here is reuse the existing C++ functionalization logic. # This requires swizzling our TLS dispatch keys so that the Functionalize key is active. with torch._C._ForceDispatchKeyGuard(include_to_set, exclude_to_set): try: diff --git a/torch/_subclasses/meta_utils.py b/torch/_subclasses/meta_utils.py index ccc5cc914acb..5d24eb42090d 100644 --- a/torch/_subclasses/meta_utils.py +++ b/torch/_subclasses/meta_utils.py @@ -1643,7 +1643,7 @@ class MetaConverter(Generic[_TensorT]): with torch.enable_grad(): r = view_from_base(base, t) - # NB: We don't actaully faithfully replicate + # NB: We don't actually faithfully replicate # autograd connectivity, but that doesn't matter # today. 
See following for more info: # https://gist.github.com/soulitzer/e03f015b314c3f5fcf80888c69390913 diff --git a/torch/csrc/distributed/rpc/rref_context.cpp b/torch/csrc/distributed/rpc/rref_context.cpp index fa26c1849dde..c36c6386b861 100644 --- a/torch/csrc/distributed/rpc/rref_context.cpp +++ b/torch/csrc/distributed/rpc/rref_context.cpp @@ -348,7 +348,7 @@ c10::intrusive_ptr RRefContext::getOrCreateOwnerRRef( // here is a plain TensorType, they are not equal relationship: // specialized TensorType <: plain TensorType // - // In RPC we don't care the difference as we ser'de with just the + // In RPC we don't care the difference as we Ser/De with just the // plain TensorType. This is not a issue for UserRRef creation either, // since Tensor can only get specialized with a previous run of local // JIT function, and we shouldn't preserve the specialized SubTensorType