diff --git a/.lintrunner.toml b/.lintrunner.toml index 55a09fdcdde9..06ad52ead071 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -1172,7 +1172,6 @@ exclude_patterns = [ 'test/distributed/**', 'torch/**', 'torch/_*/**', - 'torch/_inductor/**', 'torch/_dynamo/**', 'torch/ao/**', 'torch/fx/**', diff --git a/test/export/test_unflatten.py b/test/export/test_unflatten.py index adf74dc62b70..b6d19ada8138 100644 --- a/test/export/test_unflatten.py +++ b/test/export/test_unflatten.py @@ -233,7 +233,7 @@ class TestUnflatten(TestCase): new_inps = *inps, torch.rand(2, 3) with self.assertRaisesRegex( TypeError, - "There is no flat args adapter sepcified. Are you sure you are calling this with the right arguments?", + "There is no flat args adapter specified. Are you sure you are calling this with the right arguments?", ): unflattened(new_inps) diff --git a/tools/linter/dictionary.txt b/tools/linter/dictionary.txt index 7856a58d54ca..cdb8d4571239 100644 --- a/tools/linter/dictionary.txt +++ b/tools/linter/dictionary.txt @@ -2,6 +2,7 @@ coo Din Dout dOut +ElementE fro froms hsa diff --git a/torch/_inductor/autotune_process.py b/torch/_inductor/autotune_process.py index c936fbe92c67..2cd563003295 100644 --- a/torch/_inductor/autotune_process.py +++ b/torch/_inductor/autotune_process.py @@ -585,7 +585,7 @@ class TritonBenchmarkRequest(BenchmarkRequest): num_buffers_warp_spec: int = 0, matrix_instr_nonkdim: int = 0, # only used for hip to choose the shape of mfma instruction. waves_per_eu: int = 0, # only used for hip to schedule waves per execution unit - kpack: int = 0, # ROCm specific gemm paramete + kpack: int = 0, # ROCm specific gemm parameter ) -> None: super().__init__(kernel_name, input_tensor_meta, output_tensor_meta, extra_args) self.module_path = module_path diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py index f259def02e41..aa16b028d037 100644 --- a/torch/_inductor/codecache.py +++ b/torch/_inductor/codecache.py @@ -621,7 +621,7 @@ class FxGraphCachePickler(pickle.Pickler): defined triton kernels Essentially what we are doing here is a huge hack where user defined triton kernel contain a dynamo time side table and the arguments to the - call_function are indicies into this side table. These arguments are not + call_function are indices into this side table. These arguments are not for hashing purposes since we included the source code into the cache key and the numbers are prone to give false negatives due to ordering. """ @@ -1154,7 +1154,7 @@ class FxGraphCache(GuardedCache[CompiledFxGraph]): current context to validate that a cached entry can be served. - A given graph could have multiple compiled versions, corresponding to different sets of guards. Therefore, we store cache entries in the form: - // + // - On lookup, we compute the key from the graph details, iterate over all leaf files in the corresponding subdirectory, deserialize the entry, and evaluate its guards expression. If the evaluation succeeds, we have a @@ -1836,8 +1836,8 @@ class AotCodeCompiler: ) consts_s = Path(consts_s) object_build_options = CppTorchDeviceOptions( - # Intel compiler failed to compile this manully constructed assembly file. - # it is ok to use gcc to compile the .S to a .o and linked with Intel comiler . + # Intel compiler failed to compile this manually constructed assembly file. + # it is ok to use gcc to compile the .S to a .o and linked with Intel compiler .
device_type=device_type if device_type != "xpu" else "cpu", aot_mode=graph.aot_mode, compile_only=True, @@ -2206,7 +2206,7 @@ class AotCodeCompiler: generated_files.append(weight_file) else: - # TODO: unify to alway use mmap_weights + # TODO: unify to always use mmap_weights generated_files.append(consts_o) so_builder.save_src_to_cmake(cmake_path, consts_o) @@ -3164,31 +3164,31 @@ class HalideCodeCache(CppPythonBindingsCodeCache): base = cache_dir() dirpath = Path(base) / f"halide-runtime-{target}-{cls.config_hash()}" os.makedirs(dirpath, exist_ok=True) - donefile = str(dirpath / "done") - lockfile = str(dirpath / "lock") - hookfile = str(dirpath / "hooks.cpp") - afile = str(dirpath / "standalone_halide_runtime.a") - sofile = str(dirpath / libname) - if not os.path.exists(donefile): + done_file = str(dirpath / "done") + lock_file = str(dirpath / "lock") + hook_file = str(dirpath / "hooks.cpp") + a_file = str(dirpath / "standalone_halide_runtime.a") + so_file = str(dirpath / libname) + if not os.path.exists(done_file): import halide as hl # type: ignore[import-untyped,import-not-found] from torch.utils._filelock import FileLock - with FileLock(lockfile, LOCK_TIMEOUT): - if not os.path.exists(donefile): - with open(hookfile, "w") as f: + with FileLock(lock_file, LOCK_TIMEOUT): + if not os.path.exists(done_file): + with open(hook_file, "w") as f: if device_type == "cuda": f.write( cls.standalone_runtime_cuda_init.format( cls.find_header("HalideRuntimeCuda.h") ) ) - hl.compile_standalone_runtime(afile, hl.Target(target)) + hl.compile_standalone_runtime(a_file, hl.Target(target)) - name, output_dir = get_name_and_dir_from_output_file_path(sofile) + name, output_dir = get_name_and_dir_from_output_file_path(so_file) halide_cmd_gen = CppBuilder( name=name, - sources=[hookfile, afile], + sources=[hook_file, a_file], output_dir=output_dir, BuildOption=CppTorchDeviceOptions( device_type=device_type, @@ -3198,10 +3198,10 @@ class HalideCodeCache(CppPythonBindingsCodeCache): subprocess.check_call( shlex.split(halide_cmd_gen.get_command_line()) ) - touch(donefile) - assert os.path.exists(sofile) - cls._standalone_runtime_path = sofile - return sofile + touch(done_file) + assert os.path.exists(so_file) + cls._standalone_runtime_path = so_file + return so_file @classmethod def _get_uncompiled_header(cls, device: str) -> str | None: diff --git a/torch/_inductor/codegen/aoti_hipify_utils.py b/torch/_inductor/codegen/aoti_hipify_utils.py index eb71d4ee7f39..b6ccaab56f82 100644 --- a/torch/_inductor/codegen/aoti_hipify_utils.py +++ b/torch/_inductor/codegen/aoti_hipify_utils.py @@ -8,7 +8,7 @@ from torch.utils.hipify.hipify_python import PYTORCH_MAP, PYTORCH_TRIE # "... # from ..codecache import CudaKernelParamCache # ..."
-# In such cases, we do not need to hipify_torch the orignial class/file name in codegen/codecache +# In such cases, we do not need to hipify_torch the original class/file name in codegen/codecache def maybe_hipify_code_wrapper(source_codes: str, force_hipify: bool = False) -> str: diff --git a/torch/_inductor/codegen/common.py b/torch/_inductor/codegen/common.py index 882be85d2e12..dc4928c8d0fc 100644 --- a/torch/_inductor/codegen/common.py +++ b/torch/_inductor/codegen/common.py @@ -1551,7 +1551,7 @@ class KernelArgs: def size(self, name: sympy.Symbol) -> str: assert isinstance(name, sympy.Symbol), (type(name), name) if name.name == "seed": - self.sizevars[name] = "seed" # dont' mange the name of seeds + self.sizevars[name] = "seed" # don't manage the name of seeds return "seed" return self._lookup("ks", self.sizevars, name) @@ -1884,7 +1884,7 @@ class CSE(Generic[CSEVariableType, AugmentedKeyT]): line = f"{expr}{self.suffix}" buffer.writeline(line) - # cpp backend cannot determin is_vec at this point + # cpp backend cannot determine is_vec at this point if ( assignment and ( @@ -2102,7 +2102,7 @@ class Kernel(CodeGen, Generic[CSEVariableType]): assert upper is None or isinstance(upper, str) if lower and upper: # The conditions need to be in parens because of Python's operator precedence. - # It'd be less error-prone to use and/or/not, which is suported by triton + # It'd be less error-prone to use and/or/not, which is supported by triton cond = f"({lower} <= {var}) & ({var} < {upper})" cond_print = f"{lower} <= {var} < {upper}" elif lower: diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py index d8fe282e9f44..9fc72a4908a4 100644 --- a/torch/_inductor/codegen/cpp.py +++ b/torch/_inductor/codegen/cpp.py @@ -327,7 +327,7 @@ def reduction_prefix_array( Ref: https://stackoverflow.com/questions/56555406/creating-dynamic-sized-array-using-msvc-c-compiler MSVC is the only one compiler without VLA. support. Since MSVC can't get good performance here. We just use unique_ptr make it works on MSVC. - For other compilers, we continue to use VLA to get best performence. + For other compilers, we continue to use VLA to get best performance. """ code_buffer = IndentedBuffer() acc_decl = ( diff --git a/torch/_inductor/codegen/cpp_flex_attention_template.py b/torch/_inductor/codegen/cpp_flex_attention_template.py index 5081e2ad9f61..2542acc6108b 100644 --- a/torch/_inductor/codegen/cpp_flex_attention_template.py +++ b/torch/_inductor/codegen/cpp_flex_attention_template.py @@ -311,7 +311,7 @@ extern "C" } if (need_pack) { // When the number of gemm is greater than the number of pack, - // the pack overhead can be overlaped. + // the pack overhead can be overlapped. int64_t thresh_size = 64; need_pack = kvSize >= thresh_size && qSize >= thresh_size; if (need_pack) { diff --git a/torch/_inductor/codegen/cpp_gemm_template.py b/torch/_inductor/codegen/cpp_gemm_template.py index ce5ec7ed9eaa..8e5caef080d5 100644 --- a/torch/_inductor/codegen/cpp_gemm_template.py +++ b/torch/_inductor/codegen/cpp_gemm_template.py @@ -1092,7 +1092,7 @@ class CppGemmTemplate(CppTemplate): """ NOTE Weight prep consists of 2 separate steps: 1. Blocking the weight tensor into a 3D shape: [n//block_n, k, block_n] - This is always done if the weight tensor is contant, i.e. for all GEMM and some BMM. + This is always done if the weight tensor is constant, i.e. for all GEMM and some BMM. For BMM, we also block non-contiguous weight tensors, since they would be reshaped anyway.
This assumes that blocked, contiguous weights will be more efficient for the GEMM kernel, and is worth the overhead of reshape and blocking. diff --git a/torch/_inductor/codegen/cpp_micro_gemm.py b/torch/_inductor/codegen/cpp_micro_gemm.py index c9c54553756f..4e90b1ba9e17 100644 --- a/torch/_inductor/codegen/cpp_micro_gemm.py +++ b/torch/_inductor/codegen/cpp_micro_gemm.py @@ -684,7 +684,7 @@ inline void {{kernel_name}}_transpose_b_kernel( // Use 2 implementations for the transposed B: // First implementation: // Transpose first and then perform outer product calculation in sub-blocks, - // which introduces an additional tranpose overhead of [K, N] compared to the non-tranpose version. + // which introduces an additional transpose overhead of [K, N] compared to the non-transpose version. // Second implementation: // Directly perform inner product calculation in sub-blocks, // which introduces an additional vector reduction of [M, N] compared to the non-tranpose version. @@ -1001,7 +1001,7 @@ def check_amx_extra(config, m, n, k, alpha, num_threads, **kwargs): ) class CppMicroGemmAMX(CppMicroGemm): """ - This class generates the code for micro gemm using Advanced Matrix eXtention (AMX) + This class generates the code for micro gemm using Advanced Matrix extension (AMX) instructions available in 4th generation Intel Xeon for compute. It supports input types of torch.bfloat16 with fp32 output. """ diff --git a/torch/_inductor/codegen/cpp_wrapper_cpu.py b/torch/_inductor/codegen/cpp_wrapper_cpu.py index bdaf74952ce8..4cae56228725 100644 --- a/torch/_inductor/codegen/cpp_wrapper_cpu.py +++ b/torch/_inductor/codegen/cpp_wrapper_cpu.py @@ -1351,7 +1351,7 @@ class CppWrapperCpu(PythonWrapperCodegen): def generate_index_put_fallback(self, kernel, x, indices, values, accumulate): # TODO: update aoti_torch_index_put_out in ir.py to use autogen out version # See the comment in codegen_reinterpret_view about why having something like - # RAIIAtenTensorHandle(tmp_tensor_handle_2) in a tmp array can cause the correponding + # RAIIAtenTensorHandle(tmp_tensor_handle_2) in a tmp array can cause the corresponding # tensor prematurely deallocated, thus the temporary array trick here. indices_str = self._generate_temporary_array_pointer( "AtenTensorHandle", indices ) @@ -1788,7 +1788,7 @@ class CppWrapperCpu(PythonWrapperCodegen): if not isinstance(conditional.predicate, ir.ShapeAsConstantBuffer): # in ABI-compatible mode, we need to use the ABI shim function - # to extract a C++ bool from the unrelying scalar bool Tensor + # to extract a C++ bool from the underlying scalar bool Tensor predicate = f"{conditional.predicate.get_name()}_scalar" if predicate not in self.used_cond_predicate: self.codegen_tensor_item( @@ -1852,7 +1852,7 @@ class CppWrapperCpu(PythonWrapperCodegen): # in ABI-compatible mode, the carried inputs are codegened # as buffers outside the while loop and set to the initial # values. at the end of each while_loop iteration, they - # will be assined the carried values. + # will be assigned the carried values.
out_name = out.get_name() self.writeline(f"AtenTensorHandle {out_name}_handle;") self.writeline( @@ -1861,7 +1861,7 @@ class CppWrapperCpu(PythonWrapperCodegen): self.writeline(f"RAIIAtenTensorHandle {out_name}({out_name}_handle);") cond_outer_inputs.append(out_name) - # additional inputs will be assinged within the while_loop + # additional inputs will be assigned within the while_loop # iteration directly from the corresponding outer graph buffers cond_outer_inputs.extend(outer_additional_inputs) diff --git a/torch/_inductor/codegen/cpp_wrapper_cpu_array_ref.py b/torch/_inductor/codegen/cpp_wrapper_cpu_array_ref.py index 0d53db7f32c6..7334ff5c64b7 100644 --- a/torch/_inductor/codegen/cpp_wrapper_cpu_array_ref.py +++ b/torch/_inductor/codegen/cpp_wrapper_cpu_array_ref.py @@ -728,7 +728,7 @@ class CppWrapperCpuArrayRef(CppWrapperCpu): self._assert_safe_to_use_borrow_arrayref_tensor_as_tensor() # TODO: update aoti_torch_index_put_out in ir.py to use autogen out version # See the comment in codegen_reinterpret_view about why having something like - # RAIIAtenTensorHandle(tmp_tensor_handle_2) in a tmp array can cause the correponding + # RAIIAtenTensorHandle(tmp_tensor_handle_2) in a tmp array can cause the corresponding # tensor prematurely deallocated, thus the temporary array trick here. indices_str = self._generate_temporary_array_pointer( "AtenTensorHandle", diff --git a/torch/_inductor/codegen/cpp_wrapper_gpu.py b/torch/_inductor/codegen/cpp_wrapper_gpu.py index 29fbe5eeabf8..909f8b7284b5 100644 --- a/torch/_inductor/codegen/cpp_wrapper_gpu.py +++ b/torch/_inductor/codegen/cpp_wrapper_gpu.py @@ -380,7 +380,7 @@ class CppWrapperGpu(CppWrapperCpu): # `source` is in the form of `&var_x`, where `var_x` is the data pointer # (CUdeviceptr); we dereference `source` and cast to `void*` to pass to - # the data pointer of the source tensor ot the helper function + # the data pointer of the source tensor to the helper function # `init{1,2}DTMADescriptor` ptr = f"reinterpret_cast<void*>(*({source}))" dims = ", ".join(self.val_to_arg_str(dim) for dim in desc.dims) diff --git a/torch/_inductor/codegen/cuda/cutlass_lib_extensions/evt_extensions.py b/torch/_inductor/codegen/cuda/cutlass_lib_extensions/evt_extensions.py index becbf1f2c552..2c61e0fdf2f7 100644 --- a/torch/_inductor/codegen/cuda/cutlass_lib_extensions/evt_extensions.py +++ b/torch/_inductor/codegen/cuda/cutlass_lib_extensions/evt_extensions.py @@ -77,7 +77,7 @@ if try_import_cutlass(): if not is_row_major and not is_column_major: raise RuntimeError( f"Cannot create example tensor for {buffer.get_name()} with \ -non-contiguous layout, recieved stride: {stride} and shape: {shape}" +non-contiguous layout, received stride: {stride} and shape: {shape}" ) return CutlassTensor( diff --git a/torch/_inductor/codegen/cuda/gemm_template.py b/torch/_inductor/codegen/cuda/gemm_template.py index 176d1e2f69f0..fec507e6508f 100644 --- a/torch/_inductor/codegen/cuda/gemm_template.py +++ b/torch/_inductor/codegen/cuda/gemm_template.py @@ -293,7 +293,7 @@ GEMM_ARGS_SPARSE_CUTLASS_2X = r""" }; """ -# Additional includes which are neccessary if the standalone test / debug runner is generated as wel +# Additional includes which are necessary if the standalone test / debug runner is generated as well GEMM_STANDALONE_RUNNER_ADDITIONAL_INCLUDES = r""" #ifdef GENERATE_STANDALONE_RUNNER #include "cutlass/util/distribution.h" @@ -375,7 +375,7 @@ extern "C" int run_standalone(uint64_t seed, int repetitions) { std::cout << "Calling once to get workspace size" << std::endl;
{{test_call_statement}}; - // Allocate workspace if neccessary + // Allocate workspace if necessary if (workspace_size > 0) { workspace_data.reset(workspace_size); std::cout << "Allocated workspace size of " << workspace_size << " bytes" << std::endl; @@ -684,13 +684,13 @@ class CUTLASSGemmTemplate(CUTLASSTemplate, ABC): ) -> bool: """ Helper method to determine whether we should do an explicit transpose by switching the order of the - matmul operands. This might be neccessary when we can't otherwise arrive at the right memory + matmul operands. This might be necessary when we can't otherwise arrive at the right memory layout for the given Bias operand. Note: This method is a workaround for CUDA Errors that seemingly non-deterministically occurred in practice in some CUTLASS GEMM Kernels with Linear epilogues that have a bias term. it might make sense to check on newer Cutlass releases whether it makes sense to keep - returning True in certain cases or whether it becomes unneccessary. + returning True in certain cases or whether it becomes unnecessary. """ # If bias is row major, swap all M and N dimensions if ( diff --git a/torch/_inductor/codegen/halide.py b/torch/_inductor/codegen/halide.py index 1749db7576ed..f51ee70b73bc 100644 --- a/torch/_inductor/codegen/halide.py +++ b/torch/_inductor/codegen/halide.py @@ -1447,7 +1447,7 @@ class HalideKernel(SIMDKernel): current_device = V.graph.get_current_device_or_throw() if current_device.type == "cpu": target = [config.halide.cpu_target] - schduler = config.halide.scheduler_cpu + scheduler = config.halide.scheduler_cpu scheduler_flags = { "parallelism": parallel_num_threads(), } @@ -1456,7 +1456,7 @@ else: assert current_device.type == "cuda", "only cpu/cuda supported" assert current_device.index <= 0, "only default device supported" target = [config.halide.gpu_target] - schduler = config.halide.scheduler_cuda + scheduler = config.halide.scheduler_cuda capability = torch.cuda.get_device_properties(current_device) if "cuda_capability" not in target[0]: for major, minor in [(8, 6), (8, 0), (7, 5), (7, 0), (6, 1)]: @@ -1490,7 +1490,7 @@ return HalideMeta( argtypes, target="-".join(target), - scheduler=schduler, + scheduler=scheduler, scheduler_flags=scheduler_flags, # type: ignore[arg-type] cuda_device=cuda_device, ) diff --git a/torch/_inductor/codegen/mps.py b/torch/_inductor/codegen/mps.py index ded6cb093af9..e33c0037e899 100644 --- a/torch/_inductor/codegen/mps.py +++ b/torch/_inductor/codegen/mps.py @@ -478,9 +478,9 @@ class MetalKernel(SIMDKernel): dtype = V.graph.get_dtype(name) line = f"{var}[{self.index_to_str(index)}]" if dtype in [torch.float16, torch.bfloat16]: - # TODO(NS): Figure out the right balance betwene optype casts + # TODO(NS): Figure out the right balance between optype casts # op_math_t for half-precision floats should be float32 - # Otherwise it can lead to a corretness issues with eager + # Otherwise it can lead to a correctness issues with eager line = f"static_cast({line})" dtype = torch.float32 return self.cse.generate(self.loads, line, dtype=dtype) @@ -879,7 +879,7 @@ class MetalKernel(SIMDKernel): else: return f"{kwarg}=[{', '.join(threads)}]" - # For reduction kernels, limit the maximum size over reduction dimentions to + # For reduction kernels, limit the maximum size over reduction dimensions to # a maximum threadgroup size if len(self.active_range_trees()) > 0: threads = [ diff --git a/torch/_inductor/codegen/simd.py b/torch/_inductor/codegen/simd.py
index 9425eaa97ff7..8b07374faf02 100644 --- a/torch/_inductor/codegen/simd.py +++ b/torch/_inductor/codegen/simd.py @@ -151,7 +151,7 @@ class IterationRanges: class IterationRangesRoot(IterationRanges): """ Root of a iteration range tree that represents a single - tiled dimension in the output kernel. It contains muliple + tiled dimension in the output kernel. It contains multiple sets of iteration represented with IterationRangesEntry. """ @@ -1570,7 +1570,7 @@ class SIMDScheduling(BaseScheduling): p_n.can_codegen_without_upcasts() for p_n in prologue_group ) - # TODO - this doesnt work with libdevice calls, potentially other bugs + # TODO - this doesn't work with libdevice calls, potentially other bugs # upcasting to fp32 and downcasting gives large slowdown with config.patch( "triton.codegen_upcast_to_fp32", not can_codegen_without_upcast @@ -1908,7 +1908,7 @@ class SIMDScheduling(BaseScheduling): reduction_numel, ) -> list[dict[str, tuple[sympy.Expr]]]: """ - Creates N-dimensional tiling candidiates, attempting to simplify loads/stores + Creates N-dimensional tiling candidates, attempting to simplify loads/stores by tiling the kernel into higher dimensions. Returns a list of tilings ranked by dimensionality. @@ -2128,7 +2128,7 @@ class SIMDScheduling(BaseScheduling): split_scores.append(prev_var_coalesced_score) # penalize splits that leave small blocks - # where we cant fully utilize full memory transaction + # where we can't fully utilize full memory transaction # TODO: incorporate exact bitwidth, and read/write # coalesced write is 2x more important for i in range(len(splits)): diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py index a404abc136f5..5fedba5e89f1 100644 --- a/torch/_inductor/codegen/triton.py +++ b/torch/_inductor/codegen/triton.py @@ -647,7 +647,7 @@ class TritonPrinter(PythonPrinter): def _print_min_max_helper(self, expr: sympy.Expr, cmp: str) -> str: """ - Helper for max/min code genereration. + Helper for max/min code generation. cmp: > or < """ if len(expr.args) == 1: @@ -939,7 +939,7 @@ class TritonOverrides(OpOverrides): return triton_val # NOTE: We use a tensor here in order to get the expected type. - # Otherwise, e.g. float64 constants would be trunctated to float32. + # Otherwise, e.g. float64 constants would be truncated to float32. if value < 0 and not dtype.is_signed: triton_signed_type = f"tl.{triton_type[4:]}" return f"tl.full({shape}, {triton_val}, {triton_signed_type}).to({triton_type})" @@ -1956,7 +1956,7 @@ class TritonKernel(SIMDKernel[TritonCSEVariable]): # Compute the ND block shape from the linear block size. # Use CielDiv to round leading dimensions up to 1. # Non-leading dimensions are clamped to the size of the iteration range, - # while the leading dimension can exceed this to accomodate a larger + # while the leading dimension can exceed this to accommodate a larger # block size.
linear_block_size = TritonSymbols.get_block_size(range_tree) block_shape: list[sympy.Expr] = [ diff --git a/torch/_inductor/codegen/triton_combo_kernel.py b/torch/_inductor/codegen/triton_combo_kernel.py index dc2392119cc5..d67cb56082f2 100644 --- a/torch/_inductor/codegen/triton_combo_kernel.py +++ b/torch/_inductor/codegen/triton_combo_kernel.py @@ -51,7 +51,7 @@ def _default_custom_combo_kernel_horizontal_partition( node_info_map: dict[BaseSchedulerNode, tuple[Any, Any, Any, Any]], ) -> list[list[BaseSchedulerNode]]: """Horizontally partition the given list of nodes into a list of list of nodes where each sublist - represents a partion. Nodes in different partitions are implemented in different combo kernels. + represents a partition. Nodes in different partitions are implemented in different combo kernels. Nodes in the same partition are likely to be implemented in the same combo kernel, but subject to subsequent restrictions like CUDA limits for number of args. diff --git a/torch/_inductor/codegen/triton_utils.py b/torch/_inductor/codegen/triton_utils.py index fd9019fa6b62..d52b73a8c0fe 100644 --- a/torch/_inductor/codegen/triton_utils.py +++ b/torch/_inductor/codegen/triton_utils.py @@ -36,24 +36,24 @@ def signature_of(arg: KernelArgType, *, size_dtype: Optional[str]) -> str: # TODO: Remove fp8 special handling when Triton supports PyTorch fp8 dtypes. # Related PR: https://github.com/triton-lang/triton/pull/2279/ if arg.dtype == torch.float8_e4m3fn: - tye = "*fp8e4nv" + typ = "*fp8e4nv" elif arg.dtype == torch.float8_e5m2: - tye = "*fp8e5" + typ = "*fp8e5" elif arg.dtype == torch.float8_e4m3fnuz: - tye = "*fp8e4b8" + typ = "*fp8e4b8" elif arg.dtype == torch.float8_e5m2fnuz: - tye = "*fp8e5b16" + typ = "*fp8e5b16" else: - tye = _type_of(arg.dtype) + typ = _type_of(arg.dtype) if should_unwrap_unspec_arg(arg.buffer): # had unwrapped 0d tensor as scalar - new_tye = tye.lstrip("*") - if new_tye in ["fp16", "bf16"]: + new_typ = typ.lstrip("*") + if new_typ in ["fp16", "bf16"]: return "fp32" else: - return new_tye + return new_typ else: - return tye + return typ if isinstance(arg, SizeArg): if arg.expr is None: if triton_version_uses_attrs_dict(): diff --git a/torch/_inductor/comms.py b/torch/_inductor/comms.py index ff5324cf4e47..baf781a6d43c 100644 --- a/torch/_inductor/comms.py +++ b/torch/_inductor/comms.py @@ -326,8 +326,8 @@ def _schedule_for_comm( for snode in snodes: if raise_comms and contains_collective(snode): scores_0[snode.get_name()] = comm_idx - for anc in snode.ancestors: - anc_fused_name = name_to_fused_node[anc].get_name() + for ancestor in snode.ancestors: + anc_fused_name = name_to_fused_node[ancestor].get_name() scores_0[anc_fused_name] = min(scores_0[anc_fused_name], comm_idx) comm_idx += 1 elif sink_waits and contains_wait(snode): @@ -486,7 +486,7 @@ def node_summary(snode): def visualize_overlap(order): - # TODO - this function probably doesn't do a very good job estimating the runtime becuase it doesn't carefully model + # TODO - this function probably doesn't do a very good job estimating the runtime because it doesn't carefully model # streams and overlap. For now its mostly useful as a debug visualization.
total_est_runtime: float = 0.0 diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py index dcfefab265c0..bd6590dceea5 100644 --- a/torch/_inductor/config.py +++ b/torch/_inductor/config.py @@ -497,7 +497,7 @@ coordinate_descent_search_radius = int( ) # AutoHeuristic is a framework that allows one to collect data from autotuning, use the data to learn a heuristic, and -# generate the learned heursitic to code which is shipped with the compiler +# generate the learned heuristic to code which is shipped with the compiler # Specify a list of comma separated optimizations to collect data for autoheuristic_collect = os.environ.get("TORCHINDUCTOR_AUTOHEURISTIC_COLLECT", "") # Specify a list of comma separated optimizations to use learned heuristics for @@ -595,7 +595,7 @@ max_fusion_size = 64 # how many nodes to attempt pairwise fusion with in a buffer group max_fusion_buffer_group_pairwise_attempts = 64 -# max number of inputs to generate cat as a pointwise op with masked laods +# max number of inputs to generate cat as a pointwise op with masked loads max_pointwise_cat_inputs = 8 # force concat to be generated as a pointwise op with masked loads @@ -713,7 +713,7 @@ worker_suppress_logging: bool = Config( default=True, ) -# Flags to turn on all_reduce fusion. These 2 flags should be automaticaly turned +# Flags to turn on all_reduce fusion. These 2 flags should be automatically turned # on by DDP and should not be set by the users. _fuse_ddp_communication = False _fuse_ddp_bucket_size = 25 @@ -858,7 +858,7 @@ padding_alignment_bytes = 128 # Pad too small stride may also cause perf loss. We may result in many tiny data blocks # with gaps in between. That causes less coalesced GPU memory access! # -# Initially we pick 320 as the threshold since for alignement=16, +# Initially we pick 320 as the threshold since for alignment=16, # that results in at most 5% memory cost. # # But later on we raise the threshold to 1024 to avoid interfere with persistent reduction. @@ -958,7 +958,7 @@ enable_linear_binary_folding = ( ) -# Adds NVTX annotations aroung training phases +# Adds NVTX annotations around training phases annotate_training: bool = os.environ.get("TORCHINDUCTOR_ANNOTATE_TRAINING", "0") == "1" # Enable caching codegen of triton templates. @@ -1261,7 +1261,7 @@ class triton: codegen_upcast_to_fp32 = True # Whether persistent matmul kernels should be enabled this flag only has effect when on h100 - # with a verison of triton new enough to support TMA + # with a version of triton new enough to support TMA enable_persistent_tma_matmul = ( os.environ.get("ENABLE_PERSISTENT_TMA_MATMUL", "0") == "1" ) @@ -1321,7 +1321,7 @@ class aot_inductor: # flag to decide whether to create a submodule for constant graph. use_runtime_constant_folding: bool = False - # flag to force weight to be appened to the shared library and mmaped by the runtime + # flag to force weight to be appended to the shared library and mapped by the runtime # rather than embedded into the data section. Needed to support 1B+ parameter models force_mmap_weights: bool = False @@ -1669,7 +1669,7 @@ class trace: # replace records with HTML-like labels" # and thus fail to generate a graph. So, let's give the user an option # to specify the shape attribute for the dot graph. For example, passing - # INDUCTOR_DOT_GRAPH_SHAPE_SVG = "none" would let us generate HTML-like lables + # INDUCTOR_DOT_GRAPH_SHAPE_SVG = "none" would let us generate HTML-like labels # to workaround the above failure.
dot_graph_shape = os.environ.get("INDUCTOR_DOT_GRAPH_SHAPE_SVG", None) diff --git a/torch/_inductor/cpp_builder.py b/torch/_inductor/cpp_builder.py index d0a646c1de8f..a27292d9c6f6 100644 --- a/torch/_inductor/cpp_builder.py +++ b/torch/_inductor/cpp_builder.py @@ -331,7 +331,7 @@ def get_compiler_version_info(compiler: str) -> str: ).decode(*SUBPROCESS_DECODE_ARGS) except Exception: return "" - # Mutiple lines to one line string. + # Multiple lines to one line string. version_string = version_string.replace("\r", "_") version_string = version_string.replace("\n", "_") return version_string @@ -410,7 +410,7 @@ def normalize_path_separator(orig_path: str) -> str: class BuildOptionsBase: """ This is the Base class for store cxx build options, as a template. - Acturally, to build a cxx shared library. We just need to select a compiler + Actually, to build a cxx shared library. We just need to select a compiler and maintains the suitable args. """ @@ -948,7 +948,7 @@ def perload_icx_libomp_win(cpp_compiler: str) -> None: return False """ - Intel Compiler implenmented more math libraries than clang, for performance proposal. + Intel Compiler implemented more math libraries than clang, for performance proposal. We need preload them like openmp library. """ preload_list = [ @@ -1427,7 +1427,7 @@ def get_name_and_dir_from_output_file_path( dir = /tmp/tmpof1n5g7t/5c/ put 'name' and 'dir' to CppBuilder's 'name' and 'output_dir'. - CppBuilder --> get_target_file_path will format output path accoding OS: + CppBuilder --> get_target_file_path will format output path according OS: Linux: /tmp/tmppu87g3mm/zh/czhwiz4z7ca7ep3qkxenxerfjxy42kehw6h5cjk6ven4qu4hql4i.so Windows: [Windows temp path]/tmppu87g3mm/zh/czhwiz4z7ca7ep3qkxenxerfjxy42kehw6h5cjk6ven4qu4hql4i.dll """ @@ -1444,13 +1444,13 @@ class CppBuilder: Args: name: 1. Build target name, the final target file will append extension type automatically. - 2. Due to the CppBuilder is supports mutliple OS, it will maintains ext for OS difference. + 2. Due to the CppBuilder is supports multiple OS, it will maintains ext for OS difference. sources: Source code file list to be built. BuildOption: Build options to the builder. output_dir: - 1. The output_dir the taget file will output to. + 1. The output_dir the target file will output to. 2. The default value is empty string, and then the use current dir as output dir. 3. Final target file: output_dir/name.ext """ @@ -1464,7 +1464,7 @@ class CppBuilder: @staticmethod def __get_object_flags() -> tuple[str, str]: extension = ".obj" if _IS_WINDOWS else ".o" - output_flags = "/c /Fo" if _IS_WINDOWS else "-c -o" + output_flags = "/c /Fo" if _IS_WINDOWS else "-c -o" # codespell:ignore return extension, output_flags @staticmethod @@ -1505,7 +1505,7 @@ class CppBuilder: self._name = name - # Code start here, initial self internal veriables firstly. + # Code start here, initial self internal variables firstly. self._build_option = BuildOption self._compiler = BuildOption.get_compiler() self._use_relative_path = BuildOption.get_use_relative_path() @@ -1702,8 +1702,8 @@ class CppBuilder: def build(self) -> None: """ - It is must need a temperary directory to store object files in Windows. - After build completed, delete the temperary directory to save disk space. + It is must need a temporary directory to store object files in Windows. + After build completed, delete the temporary directory to save disk space.
""" if self._use_relative_path: # remote build uses relative path diff --git a/torch/_inductor/cudagraph_trees.py b/torch/_inductor/cudagraph_trees.py index ef37f87a2515..20bee66549e2 100644 --- a/torch/_inductor/cudagraph_trees.py +++ b/torch/_inductor/cudagraph_trees.py @@ -228,7 +228,7 @@ class TreeManagerContainer: self.graph = None # manager was used again after existing cleanup, - # we shouldn't set it to None + # we shouldnt set it to None if self.live_cudagraphify_fns == 0: self.tree_manager = None @@ -1231,7 +1231,7 @@ class CUDAGraphNode: } if config.triton.slow_path_cudagraph_asserts: - # need to use parent live weakrefs because live_indices isn't set yet + # need to use parent live weakrefs because live_indices isnt set yet memory = ( [] if self.parent is None else list(self.parent.path_live_weakrefs()) ) @@ -1607,7 +1607,7 @@ class CUDAGraphNode: def clear_path_state(self) -> None: "Clear the path state in this current executing node" - # this doesn't actually do anything right now, leaving it as placeholder + # this doesnt actually do anything right now, leaving it as placeholder @staticmethod def _tensor_metadata( diff --git a/torch/_inductor/cudagraph_utils.py b/torch/_inductor/cudagraph_utils.py index 2686d1d2ddde..f6ce7e43ad95 100644 --- a/torch/_inductor/cudagraph_utils.py +++ b/torch/_inductor/cudagraph_utils.py @@ -131,7 +131,7 @@ def check_for_mutation( inputs: list[InputType], is_cuda_graph_recorded_tensor: Callable[[torch.Tensor], bool], ) -> Optional[str]: - # doesn't work for non-trees because the warmup run would apply mutation twice + # doesnt work for non-trees because the warmup run would apply mutation twice if torch._inductor.config.triton.cudagraph_trees: # checking if mutation is only on parameters/static inputs mutation_indices: Sequence[int] = [ @@ -222,7 +222,7 @@ def check_for_mutation_ignore_cuda_graph_managed_tensor( ) -> Optional[str]: default_msg = format_default_skip_message("mutated inputs") - # doesn't work for non-trees because the warmup run would apply mutation twice + # doesnt work for non-trees because the warmup run would apply mutation twice if torch._inductor.config.triton.cudagraph_trees: unique_idxs = OrderedSet(static_input_idxs) # checking if mutation is only on parameters/static inputs diff --git a/torch/_inductor/custom_graph_pass.py b/torch/_inductor/custom_graph_pass.py index c9a8e33a1145..769b9b68ae13 100644 --- a/torch/_inductor/custom_graph_pass.py +++ b/torch/_inductor/custom_graph_pass.py @@ -18,7 +18,7 @@ class CustomGraphPass(ABC): identifies your implementation (and can be pickled). The caching logic includes this identifier in its key calculation, i.e., any new value will effectively invalidate existing entries. We expect custom passes would typically depend purely on the - textual representation of the implementation. In that case, we recommend using the + textual reprensentation of the implementation. In that case, we recommend using the 'get_hash_for_files' helper below to compute a unique hash from the contents of a static list of source files, i.e., the source(s) containing the custom pass implementation. That approach ensures that any change to the implementation will @@ -64,7 +64,7 @@ class CustomGraphModulePass(ABC): identifies your implementation (and can be pickled). The caching logic includes this identifier in its key calculation, i.e., any new value will effectively invalidate existing entries. We expect custom passes would typically depend purely on the - textual representation of the implementation. 
In that case, we recommend using the + textual representation of the implementation. In that case, we recommend using the 'get_hash_for_files' helper below to compute a unique hash from the contents of a static list of source files, i.e., the source(s) containing the custom pass implementation. That approach ensures that any change to the implementation will diff --git a/torch/_inductor/dependencies.py b/torch/_inductor/dependencies.py index ad3b93775d35..b1f75372ee4c 100644 --- a/torch/_inductor/dependencies.py +++ b/torch/_inductor/dependencies.py @@ -125,7 +125,7 @@ class MemoryDep(Dep): ) return None - # May hanppen if self and other are as follows + # May happen if self and other are as follows # MemoryDep('addmm_6', 393216*d0 + 768*d1 + d2, {d0: 16, d1: 512, d2: 768}, None) # MemoryDep('addmm_6', 98304*d0 + d1 + 768*d2, {d0: 64, d1: 768, d2: 128}, None) if OrderedSet(self_strides) != OrderedSet(other_strides): @@ -708,7 +708,7 @@ def extract_input_node_reduction_ranges( # There is one issue: what if there are views / permutations between the input node and its dependent realized nodes? # The current method still uses reduction ranges from the dependent realized node, which is not ideal. - # Is there a way to check whether there are permutations inbetween? + # Is there a way to check whether there are permutations in between? reads = input_node.get_reads() reduction_size: Optional[list[sympy.Expr]] = None size: Optional[list[sympy.Expr]] = None diff --git a/torch/_inductor/fx_passes/ddp_fusion.py b/torch/_inductor/fx_passes/ddp_fusion.py index ccea7d7e70af..2d9409523c15 100644 --- a/torch/_inductor/fx_passes/ddp_fusion.py +++ b/torch/_inductor/fx_passes/ddp_fusion.py @@ -73,7 +73,7 @@ class CommBlock: def get_comm_block(comm_node: fx.Node) -> Optional[CommBlock]: """ Given a collective node (e.g., allreduce), find out all the nodes belong to - this communcation. + this communication. Args: comm_node(fx.Node): The target communication/collective node. @@ -304,7 +304,7 @@ def _scatter_fused_allreduce_waits( """ # Before we mass up the order, we need to get the index of the last wait node - # in orig_comm_blocks. This index will be later used to determinee what users + # in orig_comm_blocks. This index will be later used to determine what users # nodes need to be move to maintain a correct topological sort order. last_wait_node_idx = 0 for node in graph.nodes: diff --git a/torch/_inductor/fx_passes/group_batch_fusion.py b/torch/_inductor/fx_passes/group_batch_fusion.py index 357a9d66cdad..0d6e74817854 100644 --- a/torch/_inductor/fx_passes/group_batch_fusion.py +++ b/torch/_inductor/fx_passes/group_batch_fusion.py @@ -1052,7 +1052,7 @@ class BatchMathOpsPreGradFusion(BatchPointwiseOpsFusionFactory): def match(self, node: torch.fx.Node): input = get_arg_value(node, 0, "input") if CallFunctionVarArgs(self.op).match(node) and is_node_meta_valid(node): - # check the input has the same shape and its uers have the same target + # check the input has the same shape and its users have the same target # check all clamp operators have the same min and max values, and # nan_to_num operators use the same default value.
child = next(iter(node.users.keys())) diff --git a/torch/_inductor/fx_passes/joint_graph.py b/torch/_inductor/fx_passes/joint_graph.py index c9d7187de0d9..5a0958921009 100644 --- a/torch/_inductor/fx_passes/joint_graph.py +++ b/torch/_inductor/fx_passes/joint_graph.py @@ -206,7 +206,7 @@ def remove_redundant_views(gm: torch.fx.GraphModule): class UniformValueConstantFolder(ConstantFolder): """ - Runs constant folding and replaces tensors that have a unifrom value + Runs constant folding and replaces tensors that have a uniform value with a tensor constructor call: aten.full([shape], value, ...) """ diff --git a/torch/_inductor/fx_passes/micro_pipeline_tp.py b/torch/_inductor/fx_passes/micro_pipeline_tp.py index af40d987f7d1..5eb2dce80dfe 100644 --- a/torch/_inductor/fx_passes/micro_pipeline_tp.py +++ b/torch/_inductor/fx_passes/micro_pipeline_tp.py @@ -440,7 +440,7 @@ class _Matmul: A_node=cast("torch.fx.Node", match[0].args[0]), B_node=cast("torch.fx.Node", mm_node.args[1]), # _Matmul handles reshapes via custom graph manipulation logic, see `replace_with()` method. - # TOOO: explore unifying the _Matmul and _ScaledMatmul approaches to handling reshapes. + # TODO: explore unifying the _Matmul and _ScaledMatmul approaches to handling reshapes. pre_mm_reshape=None, post_mm_reshape=None, ) @@ -906,7 +906,7 @@ def fuse_matmul_reduce_scatter(reduce_scatter: _ReduceScatterMatch) -> None: # 1. The scatter dim before the reshape, which was assigned using the original (a,b,c) @ (c,d) = (a,b,d) dims. # 2. The scatter dim after the reshape, to use when we are doing the 2D (a*b,c) @ (c,d) = (a,b,d) scaled mm op. # 3. Store expected potentially 3D+ mm output shape, so we can reshape the 2D mm output to the intended - # 3D+ shape before applying reduce-scatter, and to prevent shape erros with subsequent ops. + # 3D+ shape before applying reduce-scatter, and to prevent shape errors with subsequent ops. # If 'A' was reshaped from 3D+ -> 2D for the mm, we need to determine the new scattter dim after the reshape # for the fused matmul reduce scatter implementation to use. diff --git a/torch/_inductor/fx_passes/mkldnn_fusion.py b/torch/_inductor/fx_passes/mkldnn_fusion.py index 96f454d4f3db..8dae6521d538 100644 --- a/torch/_inductor/fx_passes/mkldnn_fusion.py +++ b/torch/_inductor/fx_passes/mkldnn_fusion.py @@ -187,7 +187,7 @@ if torch._C._has_mkldnn: def grouped_gemm_pass(graph: torch.fx.Graph): """ - Group GEMM has multi output nodes which is compilicated to define a Pattern. + Group GEMM has multi output nodes which is complicated to define a Pattern. Use below way to connect the pattern to the lowering. TODO: Use MultiOutputPattern, current limitation is the pattern requires fixed number of output nodes. Extend to support Group GEMM for pattern matcher.
diff --git a/torch/_inductor/fx_passes/pad_mm.py b/torch/_inductor/fx_passes/pad_mm.py index d2dfc3d9e4d0..10ca1c4dae97 100644 --- a/torch/_inductor/fx_passes/pad_mm.py +++ b/torch/_inductor/fx_passes/pad_mm.py @@ -102,7 +102,7 @@ def should_pad_common( symbolic_cnt += 1 else: return False - # filter out cases where all dimentions are symbolic + # filter out cases where all dimensions are symbolic if symbolic_cnt == len(t.size()): return False return all( @@ -226,7 +226,7 @@ def is_mm_compute_bound(M: int, K: int, N: int, dtype: torch.dtype) -> bool: and K > M and K > N and torch.cuda.get_device_capability() < (9, 0) - ): # doesnt repro on h100s: + ): # doesn't repro on h100s: return True # Fails with AMD @@ -239,7 +239,7 @@ def is_mm_compute_bound(M: int, K: int, N: int, dtype: torch.dtype) -> bool: # dram_gbps might be underestimating bandwidth because of cache. # if we estimate machine balance too low we might miss some speedups, - # if we extimate too high there will be unnecessary compilation time increase. + # if we estimate too high there will be unnecessary compilation time increase. # TODO - finetune coefficient here. As a reference point, Triton mm model assumes # 80% of reads are in cache and cache is 4x faster than dram_gbps machine_balance = machine_balance * 0.5 @@ -382,7 +382,7 @@ def should_pad_mm_bf16(dtype: torch.dtype, M: int, N: int, K: int) -> bool: and N % 2 == 1 and K >= large_k_threshold_to_pad and torch.cuda.get_device_capability() < (9, 0) - ): # doesnt repro on h100s: + ): # doesn't repro on h100s: return True return False @@ -711,7 +711,7 @@ def run_autoheuristic( ah_ori_time = autoheuristic.get_collected_feedback(orig_choice) ah_pad_time = autoheuristic.get_collected_feedback(pad_choice) - # if precondition is not satisifed, autoheuristic does not collect data + # if precondition is not satisfied, autoheuristic does not collect data if ah_ori_time is not None and ah_pad_time is not None: if ori_time is None: set_cached_base_mm_benchmark_time(ori_time_key, ah_ori_time) diff --git a/torch/_inductor/fx_passes/post_grad.py b/torch/_inductor/fx_passes/post_grad.py index 71285de81c15..c00a7ac1ea3a 100644 --- a/torch/_inductor/fx_passes/post_grad.py +++ b/torch/_inductor/fx_passes/post_grad.py @@ -617,7 +617,7 @@ def reorder_for_locality(graph: torch.fx.Graph): # only reorder nodes before the first copy_ in the graph.
# copy_ will appear at the end of functionalized graphs when there is mutation on inputs, - # and this reordering doesnt work well with mutation + # and this reordering doesn't work well with mutation first_copy = next( iter(graph.find_nodes(op="call_function", target=torch.ops.aten.copy_.default)), None, @@ -1436,7 +1436,7 @@ def register_partial_reduction_pattern(): def reuse_partial(match, input, reduced_dims, keepdim): partial_red, full_red = match.output_nodes() - # if theyre small, reuse not worth it + # if they're small, reuse not worth it if not statically_known_true(input.meta["val"].numel() >= 4096): return True diff --git a/torch/_inductor/fx_passes/pre_grad.py b/torch/_inductor/fx_passes/pre_grad.py index 2d1709962e64..b51d7bc21a1e 100644 --- a/torch/_inductor/fx_passes/pre_grad.py +++ b/torch/_inductor/fx_passes/pre_grad.py @@ -394,7 +394,7 @@ def fetch_attr(target: str, mod): for i, atom in enumerate(target_atoms): if not hasattr(attr_itr, atom): raise RuntimeError( - f"Node referenced nonexistant target {'.'.join(target_atoms[:i])}" + f"Node referenced nonexistent target {'.'.join(target_atoms[:i])}" ) attr_itr = getattr(attr_itr, atom) return attr_itr diff --git a/torch/_inductor/fx_passes/split_cat.py b/torch/_inductor/fx_passes/split_cat.py index ea379b0115d6..8f41e7885385 100644 --- a/torch/_inductor/fx_passes/split_cat.py +++ b/torch/_inductor/fx_passes/split_cat.py @@ -247,7 +247,7 @@ def remove_split_with_size_one(match: Match, *args, **kwargs): return # remove the dummy split whose split sections size is one # theoretically nodes with no users should be removed, but we have seen the corner case - # thus we add its uers check to walk around the StopIteration error. + # thus we add its users check to walk around the StopIteration error.
if len(split_sections) == 1 and len(split_node.users.keys()) > 0: # find the grand children of the split_node next_users = find_next_users(split_node) @@ -1525,7 +1525,7 @@ def merge_getitem_cat(match: Match, split_sections: list[int], dim: int): # find the index of getitems to be cated/stacked # type: ignore[union-attr] indices = [arg.args[1] for arg in cat_user.args[0]] # type: ignore[union-attr] - # the gettitems to be merged must be consecutive, otherwise + # the getitems to be merged must be consecutive, otherwise # returned sliced tensor could be wrong if not is_sorted_and_consecutive(indices): # type: ignore[arg-type] continue @@ -1627,7 +1627,7 @@ def mutate_cat_node(match: Match, split_sections: list[int], dim: int): for getitem in cat_user.args[0]: # type: ignore[union-attr] indices.append(getitem.args[1]) # type: ignore[union-attr] idx_to_getitem[getitem.args[1]] = getitem # type: ignore[union-attr] - # the gettitems to be merged must be consecutive, otherwise + # the getitems to be merged must be consecutive, otherwise # returned sliced tensor could be wrong if not is_sorted_and_consecutive(indices): # type: ignore[arg-type] continue @@ -2069,7 +2069,7 @@ def update_args_from_split_getitem( threshold_to_cat: int = 2, ): split_input, split_size, split_dim = _get_split_args_default(parents_seen[-1]) - # case 1: the number of getitems is the same as the split size, elimiate the split + # case 1: the number of getitems is the same as the split size, eliminate the split if len(split_size) == len(getitem_indices) and is_sorted_and_consecutive( getitem_indices ): @@ -2164,7 +2164,7 @@ def update_args_from_unbind_getitem( unbind_input = get_arg_value(parents_seen[-1], 0, "input") # split or unbind input unbind_dim = get_arg_value(parents_seen[-1], 1, "dim") # split or unbind dim cat_dim = get_arg_value(node, 1, "dim") # cat or stack dim - # case 1: the number of getitems is the same as the split size, elimiate the split + # case 1: the number of getitems is the same as the split size, eliminate the split size = list(unbind_input.meta["example_value"].shape)[unbind_dim] if size == len(getitem_indices): cat_shape = torch.cat( diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py index 23554a0f4123..e3ed4f3c506d 100644 --- a/torch/_inductor/graph.py +++ b/torch/_inductor/graph.py @@ -1885,7 +1885,7 @@ class GraphLowering(torch.fx.Interpreter): # [NOTE] Codegen runtime asserts in Inductor # # We need to generate runtime asserts directly in Inductor instead - # of just re-using the asserts from input graphs becase we reuse the + # of just reusing the asserts from input graphs because we reuse the # same ShapeEnv as before. In particular, on subsequent graph passes, # we would immediately turn all of these assertions into noops, # because when we evaluated their expressions, we would see that @@ -1901,8 +1901,8 @@ class GraphLowering(torch.fx.Interpreter): # equals = torch.add(ones, c) # return equals # torch._dynamo.mark_dynamic(c, 0) - # When we re-use the ShapeEnv in Inductor lowering, the check that checks - # a and nonzero have the same shape would be evaluted to True after we resolve + # When we reuse the ShapeEnv in Inductor lowering, the check that checks + # a and nonzero have the same shape would be evaluated to True after we resolve # unbacked bindings using the ShapeEnv. # See test_unbacked_equals_input_size_runtime_assertion in test_aot_inductor. # @@ -2253,7 +2253,7 @@ class GraphLowering(torch.fx.Interpreter): graph.
The parent graph is passed as an argument: the intention is to inline codegening of the subgraph in the parent graph's wrapper code (including the generated - kerenls). The wrapper code is not finalized (via `.generate()` + kernels). The wrapper code is not finalized (via `.generate()` call), as this will be done in the parent graph's `codegen()`. """ with dynamo_timed("GraphLowering.codegen_subgraph", log_pt2_compile_event=True): diff --git a/torch/_inductor/index_propagation.py b/torch/_inductor/index_propagation.py index a43925b8d744..3b15096b0a9c 100644 --- a/torch/_inductor/index_propagation.py +++ b/torch/_inductor/index_propagation.py @@ -311,7 +311,7 @@ class IndexPropagation(DefaultHandler): If this is an issue, just use guards in `self.axioms`. The proper way of handling this would be to have a global shape_env that adds - runtime_asserts as they happen in the code. Then, it shuld be used in SimplifyIndexing + runtime_asserts as they happen in the code. Then, it should be used in SimplifyIndexing to perform wrap_expr and in CSEProxy.check_bounds to elide upper / lower bounds also for indirect_indexing """ diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py index 2a070610da45..c13bcfa20d6c 100644 --- a/torch/_inductor/ir.py +++ b/torch/_inductor/ir.py @@ -200,7 +200,7 @@ def _is_static(x: object) -> bool: @dataclasses.dataclass(frozen=True) class GraphPartitionSignature: - # symbol inputs that are neccessary for codegen + # symbol inputs that are necessary for codegen symbol_inputs: OrderedSet[sympy.Symbol] # mapping from partition input name to IRNode or Expr. Need the name str since @@ -428,7 +428,7 @@ def is_aligned_realized_tensor(x: Union[Buffer, TensorBox], alignment: int) -> b (V.graph.sizevars.size_hint_or_throw(x.get_stride()[i]) % alignment) == 0 for i in range(len(x.get_stride()) - 1) ) - # if the last dim size is <= 1, stride doesnt matter + # if the last dim size is <= 1, stride doesn't matter aligned_last_dim = ( V.graph.sizevars.size_hint_or_throw(x.get_stride()[-1]) == 1 or V.graph.sizevars.size_hint_or_throw(x.get_size()[-1]) <= 1 @@ -2215,7 +2215,7 @@ class Scan(Loops): dtypes: tuple[torch.dtype, ...] inner_fns: tuple[Callable[..., Any], ...] - # HACK we mimick reduction + # HACK we mimic reduction def get_free_symbol_uses(self, unbacked_only: bool = False) -> OrderedSet[Symbol]: # TODO: Can combine_fn/reindex close over unbacked symbols? If so, we @@ -2424,7 +2424,7 @@ class Sort(Loops): stable: bool descending: bool - # HACK we mimick reduction + # HACK we mimic reduction def get_free_symbol_uses(self, unbacked_only: bool = False) -> OrderedSet[Symbol]: return ( @@ -4618,7 +4618,7 @@ class TritonTemplateBuffer(TemplateBuffer): NOTE:[TritonTemplates with multiple outputs] We want the ability for TritonTemplates to output multiple tensors. Triton kernels have no notion of outputs and this is done by creating tensors that - are then mutated by the kernel. Currenlty our STORE_OUTPUT codegen doesn't + are then mutated by the kernel. Currently our STORE_OUTPUT codegen doesn't support creating multinode outputs for triton templates. We work around this by creating an extra input buffer during the lowering and we mark them as mutated inputs.
@@ -4873,7 +4873,7 @@ class InputsKernel(OperationBuffer): if isinstance(input, list): reads.update(StarDep(x.get_name()) for x in input) elif isinstance(input, ShapeAsConstantBuffer): - # Skip creating dependncy for symbolics as they're visible globally + # Skip creating dependency for symbolics as they're visible globally continue else: reads.add(StarDep(input.get_name())) @@ -5190,7 +5190,7 @@ class ExternKernel(InputsKernel): else {} ) # FIXME: self.kwargs does not always match kwargs defined in schema, so sometimes - # ordered_kwargs_for_cpp_kernel is explicilty passed in. + # ordered_kwargs_for_cpp_kernel is explicitly passed in. if isinstance(self.op_overload, torch._ops.OpOverload): if not self.ordered_kwargs_for_cpp_kernel: self.ordered_kwargs_for_cpp_kernel = [ @@ -6835,7 +6835,7 @@ class FallbackKernel(ExternKernelAlloc): """ A class that represents a fallback kernel for handling operators that are not directly support by inductor. It currently supports functional ops, view ops, - implace aten ops, and mutating ops that are auto-functionalizable. + inplace aten ops, and mutating ops that are auto-functionalizable. """ def __init__( # type: ignore[no-untyped-def] @@ -7848,10 +7848,10 @@ class Conditional(ExternKernel): # make sure true and false outputs are structurally equivalent assert len(true_outputs) == len(false_outputs), (true_outputs, false_outputs) - for i, (to, fo) in enumerate(zip(true_outputs, false_outputs)): - assert to.get_device() == fo.get_device(), (i, to, fo) - assert to.get_dtype() == fo.get_dtype(), (i, to, fo) - assert to.get_layout().offset == fo.get_layout().offset, (i, to, fo) + for i, (t_o, f_o) in enumerate(zip(true_outputs, false_outputs)): + assert t_o.get_device() == f_o.get_device(), (i, t_o, f_o) + assert t_o.get_dtype() == f_o.get_dtype(), (i, t_o, f_o) + assert t_o.get_layout().offset == f_o.get_layout().offset, (i, t_o, f_o) device = next( o.get_device() diff --git a/torch/_inductor/kernel/flex_attention.py b/torch/_inductor/kernel/flex_attention.py index 103abe085968..a3204de8b39f 100644 --- a/torch/_inductor/kernel/flex_attention.py +++ b/torch/_inductor/kernel/flex_attention.py @@ -1025,7 +1025,7 @@ def check_cpu_supported(): def contiguous_last_dim(x): - """Ensure that realized IR node has a contigous stride in the last dimension.""" + """Ensure that realized IR node has a contiguous stride in the last dimension.""" strides = x.maybe_get_stride() if strides and strides[-1] != 1: contiguous_stride_order = list(reversed(range(len(x.get_size())))) @@ -1080,7 +1080,7 @@ def lower_cpu( cur_kvSplitSize = V.graph.sizevars.shape_env.create_unbacked_symint().node.expr shape_env = V.graph.sizevars.shape_env - # We don't know the concret value of cur_qSplitSize and cur_kvSplitSize during the compilation. + # We don't know the concrete value of cur_qSplitSize and cur_kvSplitSize during the compilation. # Mark symbols > 1 to ensure broadcasting is always applied. # This avoids treating them as equal when `eq(var, 1)` is evaluated in `broadcast_symbolic_shapes`. shape_env.var_to_range[cur_qSplitSize] = ValueRanges(2, int_oo) @@ -1826,7 +1826,7 @@ flex_attention_backward_template = TritonTemplate( sparse_kv_num_blks_offset = sparse_hz_offset * stride_kv_num_blks_h + off_pid_mask sparse_kv_idx_offset = sparse_hz_offset * stride_kv_idx_h + off_pid_mask * stride_kv_idx_m # noqa: B950 - # Offset Q, DQ, DO, DELTA & LSE. These inputs are offseted by query heads. + # Offset Q, DQ, DO, DELTA & LSE. These inputs are offsetted by query heads.
     q_adj2 = (stride_qh * off_hq2 + stride_qz * off_zq).to(tl.int64)
     do_adj2 = (stride_doh * off_hq2 + stride_doz * off_zq).to(tl.int64)
     dq_adj2 = (stride_dqh * off_hq2 + stride_dqz * off_zq).to(tl.int64)
@@ -1934,7 +1934,7 @@ flex_attention_backward_template = TritonTemplate(
     for off_g in range(0, GQA_SHARED_HEADS):
         off_hq1 = off_hkv * GQA_SHARED_HEADS + off_g

-        # Offset Q, DQ, DO, DELTA & LSE. These inputs are offsetted by query heads.
+        # Offset Q, DQ, DO, DELTA & LSE. These inputs are offseted by query heads.
         q_adj1 = (stride_qh * off_hq1 + stride_qz * off_zq).to(tl.int64)
         do_adj1 = (stride_doh * off_hq1 + stride_doz * off_zq).to(tl.int64)
         dq_adj1 = (stride_dqh * off_hq1 + stride_dqz * off_zq).to(tl.int64)
diff --git a/torch/_inductor/kernel/mm_common.py b/torch/_inductor/kernel/mm_common.py
index 030ba13a4edb..12f849aed7ac 100644
--- a/torch/_inductor/kernel/mm_common.py
+++ b/torch/_inductor/kernel/mm_common.py
@@ -108,7 +108,7 @@ def scaled_mm_options( # type: ignore[no-untyped-def]
     device_tma: bool = False,
 ) -> dict[str, Any]:
     def are_compatible_scales(size_a, size_b) -> bool:
-        # Same sized scales are compatible
+        # Same sized scales are compatable
         if len(size_a) == len(size_b):
             return True
diff --git a/torch/_inductor/kernel/mm_scaled_grouped.py b/torch/_inductor/kernel/mm_scaled_grouped.py
index ad34ea0210b5..9ca2ff39f3aa 100644
--- a/torch/_inductor/kernel/mm_scaled_grouped.py
+++ b/torch/_inductor/kernel/mm_scaled_grouped.py
@@ -601,7 +601,7 @@ def _tuned_grouped_mm_common(
     _, is_nonzero = _is_static_problem(layout)

-    # Checking only for the equality of corresponding dims of
+    # Checking only for the equality of correspoding dims of
     # multiplicands here, relying on meta function checks for
     # everything else.
     if is_nonzero and can_use_triton_kernel(mat_a, mat_b, offs, bias, scale_result):
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index a50d5ccecbb6..fc0ea67b1d50 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -2753,7 +2753,7 @@ make_fallback(torch._prims.rng_prims.run_with_rng_state)
 make_fallback(torch._prims.rng_prims.graphsafe_run_with_rng_state)

-# Implemented / Half implemented
+# Implmented / Half implemented
 # Scans. Implemented for CUDA, missing CPU
 make_fallback(aten.masked_scatter)
 make_fallback(aten.masked_scatter_backward)
@@ -7083,7 +7083,7 @@ def prepare_softmax_online(x, dim):
         # Note: [Split online_softmax_reduce]
         # We don't split reduction for online_softmax_reduce for now.
         # On one hand, supporting split reduction makes things complex since
-        # the split out reuctions requires 2 inputs rather than one.
+        # the splitted out reuctions requires 2 inputs rather than one.
         # On the other hand, during training the online_softmax_reduce should
         # usually don't requires a split due to large batch size
         # (more specifically batch size times sequence length).
diff --git a/torch/_inductor/mkldnn_ir.py b/torch/_inductor/mkldnn_ir.py
index 6eb3e30d87a9..0e93a5fe67aa 100644
--- a/torch/_inductor/mkldnn_ir.py
+++ b/torch/_inductor/mkldnn_ir.py
@@ -179,7 +179,7 @@ def _prepare_convolution_fusion_create(
         # Currently we don't support channel last for the situation that stride of input's batch dim is 0,
         # eg. input_size = (1, 1280, 64, 64), but input_stride=(0, 1, 81920, 1280).
         # So we use NCHW hear instead.
-        # Different with cpu, cpu conv always use channels_last for convolution when weight is prepacked,
+        # Different with cpu, cpu conv alway use channels_last for convolution when weight is prepacked,
         # but xpu does not do the prepack, so the problem exposed here is only for xpu.
         # TODO support channels_last for such zero stride input.
         elif get_device_type(x) == "xpu" and x.get_stride()[0] == 0:
@@ -686,11 +686,11 @@ class QConvPointWiseBinaryPT2E(ExternKernelAlloc):
         if bias is not None
             - inputs = [x, x_scale, x_zp, w, w_scale, w_zp, accum, b]
             - const_args = [stride, padding, dilation, groups, o_scale, o_zp,
-            output_dtype, accum_scale, accum_zp, binary_attr, alpha, unary_attr, unary_scalars, unary_algorithm]
+            output_dtype, accum_scale, accum_zp, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm]
         else
             - inputs = [x, x_scale, x_zp, w, w_scale, w_zp, accum]
             - const_args [b, stride, padding, dilation, groups, o_scale, o_zp,
-            output_dtype, accum_scale, accum_zp, binary_attr, alpha, unary_attr, unary_scalars, unary_algorithm]
+            output_dtype, accum_scale, accum_zp, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm]
         """
         self.has_bias = len(inputs) == 8
         self.idx_for_inplace_sum = 6
@@ -1041,11 +1041,11 @@ class QLinearPointwiseBinaryPT2E(ExternKernelAlloc):
         if bias is not None
             - inputs = [x, w, x_scale, x_zp, weight_scale, weight_zp, x2, bias]
             - const_args is: [o_scale, o_zp,
-            fp32_output, binary_attr, alpha, unary_attr, unary_scalars, unary_algorithm]
+            fp32_output, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm]
         else
             - inputs = [x, w, x_scale, x_zp, weight_scale, weight_zp, x2]
             - const_args is: [bias, o_scale, o_zp,
-            fp32_output, binary_attr, alpha, unary_attr, unary_scalars, unary_algorithm]
+            fp32_output, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm]
         """
         self.has_bias = has_bias
         self.idx_for_inplace_sum = 6
diff --git a/torch/_inductor/mkldnn_lowerings.py b/torch/_inductor/mkldnn_lowerings.py
index 1f4150952a85..df4d79fe55d5 100644
--- a/torch/_inductor/mkldnn_lowerings.py
+++ b/torch/_inductor/mkldnn_lowerings.py
@@ -624,7 +624,7 @@ def register_onednn_fusion_ops():
                 # For int8-mixed-bf16 quantization and inplace add,
                 # there is case when accum dtype is float32 but output dtype is bfloat16.
                 # Since the accum will be inplaced changed with post op sum,
-                # we will do accum dtype conversion here.
+                # we will do accum dtype convertion here.
                 accum = to_dtype(accum, output_dtype)
             return TensorBox.create(
                 mkldnn_ir.QConvPointWiseBinaryPT2E.create(
@@ -1042,7 +1042,7 @@ def register_onednn_fusion_ops():
                     # For int8-mixed-bf16 quantization and inplace add,
                     # there is case when accum dtype is float32 but output dtype is bfloat16.
                     # Since the accum will be inplaced changed with post op sum,
-                    # we will do accum dtype conversion here.
+                    # we will do accum dtype convertion here.
                     x2 = to_dtype(x2, output_dtype)
                 else:
                     assert x2.get_dtype() == output_dtype, (
diff --git a/torch/_inductor/remote_cache.py b/torch/_inductor/remote_cache.py
index aaa266b60e00..2aadc806bf90 100644
--- a/torch/_inductor/remote_cache.py
+++ b/torch/_inductor/remote_cache.py
@@ -136,7 +136,7 @@ class RemoteCachePassthroughSerde(RemoteCacheSerde[_T, _T]):
 # To write (`put`), the RemoteCache takes data, uses the RemoteCacheSerde to
 # convert it for the backend and passes it to the backend.
# -# Conversely when reading (`get`), the RemoteCache takes data from the backend, +# Conversly when reading (`get`), the RemoteCache takes data from the backend, # uses the RemoteCacheSerde to convert it and returns it. # # The RemoteCacheBackend is generic on _U - which is the type of data the diff --git a/torch/_inductor/runtime/benchmarking.py b/torch/_inductor/runtime/benchmarking.py index 5c9cc60bef87..74df6ed671ef 100644 --- a/torch/_inductor/runtime/benchmarking.py +++ b/torch/_inductor/runtime/benchmarking.py @@ -230,7 +230,7 @@ class InductorBenchmarker(TritonBenchmarker): in milliseconds. An estimated duration is calculated based on the values of `memory_warmup_iters` and `benchmark_iters`, along with the estimated runtime of `_callable` and various other factors, and we then shrink - `benchmark_iters` to fit in the allotted maximum duration. + `benchmark_iters` to fit in the alloted maximum duration. - **kwargs: Additional kwargs that may be passed to the fallback. Returns: diff --git a/torch/_inductor/runtime/coordinate_descent_tuner.py b/torch/_inductor/runtime/coordinate_descent_tuner.py index 413dfaf09d06..b41ca81ebdfc 100644 --- a/torch/_inductor/runtime/coordinate_descent_tuner.py +++ b/torch/_inductor/runtime/coordinate_descent_tuner.py @@ -208,7 +208,7 @@ class CoordescTuner: """ Check if candidate_config is better than best_config. - Return a tuple of (compare_result, candidate_timing). + Return a touple of (compare_result, candidate_timing). compare_result is true iff candidate_config is better. """ log.debug("Try config %s", candidate_config) diff --git a/torch/_inductor/runtime/runtime_utils.py b/torch/_inductor/runtime/runtime_utils.py index 21cd5987f8f4..bf5b24a9fe56 100644 --- a/torch/_inductor/runtime/runtime_utils.py +++ b/torch/_inductor/runtime/runtime_utils.py @@ -25,8 +25,8 @@ def conditional_product(*args: int) -> int: return functools.reduce(operator.mul, [x for x in args if x]) -def ceildiv(number: int, denom: int) -> int: - return -(number // -denom) +def ceildiv(numer: int, denom: int) -> int: + return -(numer // -denom) def is_power_of_2(n: int) -> bool: @@ -155,7 +155,7 @@ dynamo_timed = torch._dynamo.utils.dynamo_timed # type: ignore[has-type] def triton_hash_to_path_key(key: str) -> str: # In early versions of Triton, the hash is directly used in the path name. # Later, the hash is converted to base64 before being used in the path name. - # Later, the base64 conversion was replaced to the base32 + # Later, the base64 convertion was replaced to the base32 # # This code tries to import _base64 and falls back to _base32 if _base64 is unavailable. # diff --git a/torch/_inductor/runtime/triton_helpers.py b/torch/_inductor/runtime/triton_helpers.py index cfd708bcf4bf..1a421f5239a8 100644 --- a/torch/_inductor/runtime/triton_helpers.py +++ b/torch/_inductor/runtime/triton_helpers.py @@ -202,7 +202,7 @@ def online_softmax_combine(lhs_max, lhs_sum, rhs_max, use_fast_math: tl.constexp # Should be # out_sum = lhs_sum * lhs_scale + rhs_sum * rhs_scale - # but since rhs_sum is all 1, we can simplify it. + # but since rhs_sum is all 1, we can simpliy it. 
     out_sum = lhs_sum * lhs_scale + rhs_scale
     return out_max, out_sum
@@ -460,7 +460,7 @@ def exclusive_scan_decoupled_lookback_64(scratch_base, block_value, index, combi
         block_value: Scalar value for this block, must be 64-bits wide
         index: Scalar index of this block relative to the current scan
         combine_fn: Function ``(value, value) -> value`` which is scanned over
-        init: Scalar value equal to the identity of combine_fn
+        init: Scalar value equal to the identiy of combine_fn
     """
     # Publish block sum so subsequent blocks don't get stuck waiting for us
     if index > 0:
diff --git a/torch/_inductor/scheduler.py b/torch/_inductor/scheduler.py
index 687ba95e1dd1..7bb476f178ee 100644
--- a/torch/_inductor/scheduler.py
+++ b/torch/_inductor/scheduler.py
@@ -477,7 +477,7 @@ class BaseSchedulerNode:
         buf_name = buf_to_be_inplaced.get_name()
         # Dedup read/writes with equivalent indices
         # TODO - would be nice if we could just cache accesses on ReadWrites,
-        # and enforce variant that this class & members are functional..
+        # and inforce variant that this class & members are functional..
         deps: OrderedSet[Dep] = OrderedSet()
         for user in buf_to_be_inplaced.users:
             user_node = user.node
@@ -1079,7 +1079,7 @@ class SchedulerNode(BaseSchedulerNode):
         # TODO(shunting) if this cause compilation time increase when
         # enabling LOAF by default, try just clearing the specific cache
-        # entry by using a customized cache implementation rather than
+        # entry by using a customized cache implemetation rather than
         # lru_cache.
         SIMDScheduling.candidate_tilings.cache_clear()
@@ -3325,7 +3325,7 @@ class Scheduler:
         Return true if fusing the two nodes can potentially increasing peak memory.

         The implementation is more like a heuristic since we don't really know if we are at peak
-        or not when trying to fuse these two nodes. The order of nodes may change later which makes the
+        or not when trying to fuse these two ndoes. The order of nodes may change later which makes the
         peak memory estimation hard.

         Here is how we decide the LOWER BOUND of extra memory allocation if we fuse these 2 nodes:
@@ -3365,7 +3365,7 @@ class Scheduler:
                 try:
                     memory_overhead += int(key[2])
                 except ValueError:
-                    # not an integer. Fallback is to fuse
+                    # not an interger. Fallback is to fuse
                     return False

         bw_saving = self.score_fusion_memory(node1, node2)
@@ -3470,7 +3470,7 @@ class Scheduler:
         """
         Right now just greedily reorder the loop of node1 to be compatible with node2,
         but ideally we should have some heuristics to reorder the loop for node2
-        to be compatible with node1 if that's more efficient.
+        to be compatibile with node1 if that's more efficient.
         """

         # TODO Don't do loop reordering for CPU for now.
@@ -3569,7 +3569,7 @@ class Scheduler:
         # potential bad cache behavior and shared memory use.
         # we also want to avoid benchmarking reliably unprofitable fusions like downcasts from fp32 -> fp16 inside kernel.
         # allowing gathers by allowing increasing write_bytes by small factor
-        # TODO - make configurable per input, for instance, bias can fuse fp32 -> fp16 profitably
+        # TODO - make configurable per input, for insance, bias can fuse fp32 -> fp16 profitably
         BYTES_THRESHOLD_MULTIPLIER = 1.1

         if read_bytes > (write_bytes * BYTES_THRESHOLD_MULTIPLIER):
@@ -4436,7 +4436,7 @@ class Scheduler:
     ) -> list[BaseSchedulerNode]:
         """
         Reorder nodes to minimize the number of partitions via a bfs
-        topological sort. This is the optimal reordering such that the
+        topological sort. This is the optimal reodering such that the
         number of partitions cannot be reduced further.
         This may be sub-optimal for other metrics such as peak memory. This
         does not change relative orders of two cudagraphable nodes, nor the
diff --git a/torch/_inductor/select_algorithm.py b/torch/_inductor/select_algorithm.py
index d00eca1304f0..30dbf0dbc6b4 100644
--- a/torch/_inductor/select_algorithm.py
+++ b/torch/_inductor/select_algorithm.py
@@ -375,7 +375,7 @@ class TritonTemplateKernel(TritonKernel):
         self.template_out: Optional[str] = None
         self.ops_handler: Optional[V.WrapperHandler] = None # type: ignore[name-defined]

-        # When caching is enabled, the generated code is not dependent on the input nodes names, or
+        # Whe caching is enabled, the generated code is not dependent on the input nodes names, or
         # symbolic sizes names.
         # However, some of the variables returned by generate_and_load that are computed during the
         # triton template expansions (code generation) are dependent on those.
diff --git a/torch/_inductor/sizevars.py b/torch/_inductor/sizevars.py
index a506b915e9a8..f8202dcb6d51 100644
--- a/torch/_inductor/sizevars.py
+++ b/torch/_inductor/sizevars.py
@@ -774,11 +774,11 @@ class SizeVarAllocator:
                 return False

             if is_first:
-                # first ModularIndexing should contains a nested ModularIndex
+                # first ModularIndexing should conatins a nested ModularIndex
                 if not isinstance(x, ModularIndexing):
                     return False
             else:
-                # second ModularIndexing should contains a non-negative
+                # second ModularIndexing should constains a non-negative
                 # symbol
                 if not isinstance(x, sympy.Symbol) or not self.statically_known_geq(
                     x, 0
@@ -809,7 +809,7 @@ class SizeVarAllocator:
     ) -> Union[bool, tuple[sympy.Expr, sympy.Expr]]:
         """
         Expand the FloorDiv to the entire expression so that the expression may
-        be simplified.
+        be simplfied.

         E.g., for a 2D contiguous tensor with shape [a, 2 * b], and index variables
         x1, x2, index expression 'x1 * 2b + x2' can be easily combined.
diff --git a/torch/_inductor/standalone_compile.py b/torch/_inductor/standalone_compile.py
index e49e8774a2c5..93af8cc3209d 100644
--- a/torch/_inductor/standalone_compile.py
+++ b/torch/_inductor/standalone_compile.py
@@ -74,7 +74,7 @@ class CompiledArtifact:
             key = cache_info.aot_autograd_artifacts[0]

             if format == "binary":
-                # can't assert that it is a file since it might not exist yet
+                # cant assert that it is a file since it might not exist yet
                 assert not os.path.isdir(path)

                 from torch.utils._appending_byte_serializer import BytesWriter
@@ -118,7 +118,7 @@ class CompiledArtifact:
     ) -> CompiledArtifact:
         with dynamo_timed("CompiledArtifact.load"):
             if format == "binary":
-                # can't assert that it is a file since it might not exist yet
+                # cant assert that it is a file since it might not exist yet
                 assert not os.path.isdir(path)
                 with open(path, "rb") as file:
                     artifacts = file.read()
diff --git a/torch/_inductor/tiling_utils.py b/torch/_inductor/tiling_utils.py
index 4a1febe08e99..bec7cf8db648 100644
--- a/torch/_inductor/tiling_utils.py
+++ b/torch/_inductor/tiling_utils.py
@@ -300,7 +300,7 @@ class NodeSplitGetter:
        # initially, we are just going to do a single reduction split since
        # reduction tiling is off by default. even if we miss a reduction split,
        # we can recover it in the split var analysis.
-        # TODO: an earlier version of this code tried to iteratively try the maximum number
+        # TODO: an earlier version fo this code tried to iteratively try the maximum number
        # of split vars, by iterating over both pointwise and reduction. but not worth
        # the complexity yet.
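(Editorial aside, not part of the patch: the sizevars.py hunk above leans on the usual flat-index identity, sketched below with made-up concrete sizes. For a contiguous [a, 2*b] tensor, the flat index x1 * (2*b) + x2 with 0 <= x2 < 2*b round-trips through floor-division and modulo, which is what lets the expanded FloorDiv/ModularIndexing terms collapse again after simplification.)

a, b = 4, 3  # hypothetical sizes; the row stride is 2*b = 6
for x1 in range(a):
    for x2 in range(2 * b):
        flat = x1 * (2 * b) + x2  # flattened index into the [a, 2*b] tensor
        assert flat // (2 * b) == x1 and flat % (2 * b) == x2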
@@ -336,7 +336,7 @@ class NodeSplitGetter:
                 )
                 self.pw_split_options[len(new_split)].add(new_split)

-        # if for whatever reason we couldn't split above, return default split
+        # if for whatever reason we couldnt split above, return default split
         return ((self.pointwise_numel,), (self.red_numel,))

     def try_split(self, pw: Split, red: Split) -> Optional[tuple[Split, Split]]:
diff --git a/torch/_inductor/triton_bundler.py b/torch/_inductor/triton_bundler.py
index b5ccb873e33f..6fb142477617 100644
--- a/torch/_inductor/triton_bundler.py
+++ b/torch/_inductor/triton_bundler.py
@@ -109,7 +109,7 @@ class TritonBundler:
     _static_autotuners: Optional[list[StaticallyLaunchedAutotuner]] = None

     # __grp__kernel_name.json contains metadata with source code paths
-    # we use this as sentinel value for search and replace
+    # we use this as sentinal value for search and replace
     _REPLACE_BYTES: bytes = b"[REPLACE]"

     @staticmethod
diff --git a/torch/_inductor/utils.py b/torch/_inductor/utils.py
index 197764dc130e..d8064ecb8158 100644
--- a/torch/_inductor/utils.py
+++ b/torch/_inductor/utils.py
@@ -381,17 +381,17 @@ def unique(it: Iterable[_T]) -> ValuesView[_T]:

 def ceildiv(
-    number: Union[int, sympy.Expr], denom: Union[int, sympy.Expr]
+    numer: Union[int, sympy.Expr], denom: Union[int, sympy.Expr]
 ) -> Union[int, sympy.Expr]:
-    if isinstance(number, sympy.Expr) or isinstance(denom, sympy.Expr):
-        return CeilDiv(sympy.sympify(number), sympy.sympify(denom))
+    if isinstance(numer, sympy.Expr) or isinstance(denom, sympy.Expr):
+        return CeilDiv(sympy.sympify(numer), sympy.sympify(denom))
     # TODO: There is a bug in a call to this function, to repro:
     # python benchmarks/dynamo/huggingface.py --inductor -d cuda --accuracy
     # --amp --only YituTechConvBert --dynamic-shapes
-    assert isinstance(number, int) and isinstance(denom, int), (
-        f"{number}: {type(number)}, {denom}: {type(denom)}"
+    assert isinstance(numer, int) and isinstance(denom, int), (
+        f"{numer}: {type(numer)}, {denom}: {type(denom)}"
     )
-    return runtime_ceildiv(number, denom)
+    return runtime_ceildiv(numer, denom)

 def _type_of(key: Optional[torch.dtype]) -> str:
@@ -980,7 +980,7 @@ def get_first_incompatible_cudagraph_node(
             and torch._C.Tag.cudagraph_unsafe in node.target.tags
         ):
             # skip cudagraph if a cudagraph_unsafe op is detected.
-            # graph_partition helps by splitting on this cudagraph_unsafe
+            # graph_partition helps by spliting on this cudagraph_unsafe
             # op and cudagraphifying the subgraphs.
             return node
diff --git a/torch/export/unflatten.py b/torch/export/unflatten.py
index 210b5755f9e6..54e698822b30 100644
--- a/torch/export/unflatten.py
+++ b/torch/export/unflatten.py
@@ -524,7 +524,7 @@ class UnflattenedModule(torch.nn.Module):

             if self.flat_args_adapter is None:
                 raise TypeError(
-                    "There is no flat args adapter specified. "
+                    "There is no flat args adapter sepcified. "
                     "Are you sure you are calling this with the right arguments? "
                 )
             else:
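(Editorial aside, not part of the patch: both ceildiv hunks above rely on the same integer identity, -(numer // -denom) == ceil(numer / denom) for positive integers, which avoids going through floating point. A minimal check:)

import math

def ceildiv(numer: int, denom: int) -> int:
    # floor-division of the negated numerator rounds toward -inf,
    # so negating the result rounds the original quotient up
    return -(numer // -denom)

assert ceildiv(7, 2) == math.ceil(7 / 2) == 4
assert ceildiv(8, 2) == 4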