diff --git a/.lintrunner.toml b/.lintrunner.toml index 06ad52ead071..55a09fdcdde9 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -1172,7 +1172,6 @@ exclude_patterns = [ 'test/distributed/**', 'torch/**', 'torch/_*/**', - 'torch/_inductor/**', 'torch/_dynamo/**', 'torch/ao/**', 'torch/fx/**', diff --git a/test/export/test_unflatten.py b/test/export/test_unflatten.py index b6d19ada8138..adf74dc62b70 100644 --- a/test/export/test_unflatten.py +++ b/test/export/test_unflatten.py @@ -233,7 +233,7 @@ class TestUnflatten(TestCase): new_inps = *inps, torch.rand(2, 3) with self.assertRaisesRegex( TypeError, - "There is no flat args adapter sepcified. Are you sure you are calling this with the right arguments?", + "There is no flat args adapter specified. Are you sure you are calling this with the right arguments?", ): unflattened(new_inps) diff --git a/tools/linter/dictionary.txt b/tools/linter/dictionary.txt index cdb8d4571239..7856a58d54ca 100644 --- a/tools/linter/dictionary.txt +++ b/tools/linter/dictionary.txt @@ -2,6 +2,7 @@ coo Din Dout dOut +ElementE fro froms hsa diff --git a/torch/_inductor/autotune_process.py b/torch/_inductor/autotune_process.py index 2cd563003295..c936fbe92c67 100644 --- a/torch/_inductor/autotune_process.py +++ b/torch/_inductor/autotune_process.py @@ -585,7 +585,7 @@ class TritonBenchmarkRequest(BenchmarkRequest): num_buffers_warp_spec: int = 0, matrix_instr_nonkdim: int = 0, # only used for hip to choose the shape of mfma instruction. waves_per_eu: int = 0, # only used for hip to schedule waves per execution unit - kpack: int = 0, # ROCm specific gemm paramete + kpack: int = 0, # ROCm specific gemm parameter ) -> None: super().__init__(kernel_name, input_tensor_meta, output_tensor_meta, extra_args) self.module_path = module_path diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py index aa16b028d037..f259def02e41 100644 --- a/torch/_inductor/codecache.py +++ b/torch/_inductor/codecache.py @@ -621,7 +621,7 @@ class FxGraphCachePickler(pickle.Pickler): defined triton kernels Essentially what we are doing here is a huge hack where user defined triton kernel contain a dynamo time side table and the arguments to the - call_function are indicies into this side table. These arguments are not + call_function are indices into this side table. These arguments are not for hashing purposes since we included the source code into the cache key and the numbers are prone to give false negatives due to ordering. """ @@ -1154,7 +1154,7 @@ class FxGraphCache(GuardedCache[CompiledFxGraph]): current context to validate that a cached entry can be served. - A given graph could have multiple compiled versions, corresponding to different sets of guards. Therefore, we store cache entries in the form: - // + // - On lookup, we compute the key from the graph details, iterate over all leaf files in the corresponding subdirectory, deserialize the entry, and evaluate its guards expression. If the evaluation succeeds, we have a @@ -1836,8 +1836,8 @@ class AotCodeCompiler: ) consts_s = Path(consts_s) object_build_options = CppTorchDeviceOptions( - # Intel compiler failed to compile this manully constructed assembly file. - # it is ok to use gcc to compile the .S to a .o and linked with Intel comiler . + # Intel compiler failed to compile this manually constructed assembly file. + # it is ok to use gcc to compile the .S to a .o and linked with Intel compiler . 
device_type=device_type if device_type != "xpu" else "cpu", aot_mode=graph.aot_mode, compile_only=True, @@ -2206,7 +2206,7 @@ class AotCodeCompiler: generated_files.append(weight_file) else: - # TODO: unify to alway use mmap_weights + # TODO: unify to always use mmap_weights generated_files.append(consts_o) so_builder.save_src_to_cmake(cmake_path, consts_o) @@ -3164,31 +3164,31 @@ class HalideCodeCache(CppPythonBindingsCodeCache): base = cache_dir() dirpath = Path(base) / f"halide-runtime-{target}-{cls.config_hash()}" os.makedirs(dirpath, exist_ok=True) - donefile = str(dirpath / "done") - lockfile = str(dirpath / "lock") - hookfile = str(dirpath / "hooks.cpp") - afile = str(dirpath / "standalone_halide_runtime.a") - sofile = str(dirpath / libname) - if not os.path.exists(donefile): + done_file = str(dirpath / "done") + lock_file = str(dirpath / "lock") + hook_file = str(dirpath / "hooks.cpp") + a_file = str(dirpath / "standalone_halide_runtime.a") + so_file = str(dirpath / libname) + if not os.path.exists(done_file): import halide as hl # type: ignore[import-untyped,import-not-found] from torch.utils._filelock import FileLock - with FileLock(lockfile, LOCK_TIMEOUT): - if not os.path.exists(donefile): - with open(hookfile, "w") as f: + with FileLock(lock_file, LOCK_TIMEOUT): + if not os.path.exists(done_file): + with open(hook_file, "w") as f: if device_type == "cuda": f.write( cls.standalone_runtime_cuda_init.format( cls.find_header("HalideRuntimeCuda.h") ) ) - hl.compile_standalone_runtime(afile, hl.Target(target)) + hl.compile_standalone_runtime(a_file, hl.Target(target)) - name, output_dir = get_name_and_dir_from_output_file_path(sofile) + name, output_dir = get_name_and_dir_from_output_file_path(so_file) halide_cmd_gen = CppBuilder( name=name, - sources=[hookfile, afile], + sources=[hook_file, a_file], output_dir=output_dir, BuildOption=CppTorchDeviceOptions( device_type=device_type, @@ -3198,10 +3198,10 @@ class HalideCodeCache(CppPythonBindingsCodeCache): subprocess.check_call( shlex.split(halide_cmd_gen.get_command_line()) ) - touch(donefile) - assert os.path.exists(sofile) - cls._standalone_runtime_path = sofile - return sofile + touch(done_file) + assert os.path.exists(so_file) + cls._standalone_runtime_path = so_file + return so_file @classmethod def _get_uncompiled_header(cls, device: str) -> str | None: diff --git a/torch/_inductor/codegen/aoti_hipify_utils.py b/torch/_inductor/codegen/aoti_hipify_utils.py index b6ccaab56f82..eb71d4ee7f39 100644 --- a/torch/_inductor/codegen/aoti_hipify_utils.py +++ b/torch/_inductor/codegen/aoti_hipify_utils.py @@ -8,7 +8,7 @@ from torch.utils.hipify.hipify_python import PYTORCH_MAP, PYTORCH_TRIE # "... # from ..codecache import CudaKernelParamCache # ..." 
-# In such cases, we do not need to hipify_torch the orignial class/file name in codegen/codecache +# In such cases, we do not need to hipify_torch the original class/file name in codegen/codecache def maybe_hipify_code_wrapper(source_codes: str, force_hipify: bool = False) -> str: diff --git a/torch/_inductor/codegen/common.py b/torch/_inductor/codegen/common.py index dc4928c8d0fc..882be85d2e12 100644 --- a/torch/_inductor/codegen/common.py +++ b/torch/_inductor/codegen/common.py @@ -1551,7 +1551,7 @@ class KernelArgs: def size(self, name: sympy.Symbol) -> str: assert isinstance(name, sympy.Symbol), (type(name), name) if name.name == "seed": - self.sizevars[name] = "seed" # dont' mange the name of seeds + self.sizevars[name] = "seed" # don't manage the name of seeds return "seed" return self._lookup("ks", self.sizevars, name) @@ -1884,7 +1884,7 @@ class CSE(Generic[CSEVariableType, AugmentedKeyT]): line = f"{expr}{self.suffix}" buffer.writeline(line) - # cpp backend cannot determin is_vec at this point + # cpp backend cannot determine is_vec at this point if ( assignment and ( @@ -2102,7 +2102,7 @@ class Kernel(CodeGen, Generic[CSEVariableType]): assert upper is None or isinstance(upper, str) if lower and upper: # The conditions need to be in parens because of Python's operator precedence. - # It'd be less error-prone to use and/or/not, which is suported by triton + # It'd be less error-prone to use and/or/not, which is supported by triton cond = f"({lower} <= {var}) & ({var} < {upper})" cond_print = f"{lower} <= {var} < {upper}" elif lower: diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py index 9fc72a4908a4..d8fe282e9f44 100644 --- a/torch/_inductor/codegen/cpp.py +++ b/torch/_inductor/codegen/cpp.py @@ -327,7 +327,7 @@ def reduction_prefix_array( Ref: https://stackoverflow.com/questions/56555406/creating-dynamic-sized-array-using-msvc-c-compiler MSVC is the only one compiler without VLA. support. Since MSVC can't get good performance here. We just use unique_ptr make it works on MSVC. - For other compilers, we continue to use VLA to get best performence. + For other compilers, we continue to use VLA to get best performance. """ code_buffer = IndentedBuffer() acc_decl = ( diff --git a/torch/_inductor/codegen/cpp_flex_attention_template.py b/torch/_inductor/codegen/cpp_flex_attention_template.py index 2542acc6108b..5081e2ad9f61 100644 --- a/torch/_inductor/codegen/cpp_flex_attention_template.py +++ b/torch/_inductor/codegen/cpp_flex_attention_template.py @@ -311,7 +311,7 @@ extern "C" } if (need_pack) { // When the number of gemm is greater than the number of pack, - // the pack overhead can be overlaped. + // the pack overhead can be overlapped. int64_t thresh_size = 64; need_pack = kvSize >= thresh_size && qSize >= thresh_size; if (need_pack) { diff --git a/torch/_inductor/codegen/cpp_gemm_template.py b/torch/_inductor/codegen/cpp_gemm_template.py index 8e5caef080d5..ce5ec7ed9eaa 100644 --- a/torch/_inductor/codegen/cpp_gemm_template.py +++ b/torch/_inductor/codegen/cpp_gemm_template.py @@ -1092,7 +1092,7 @@ class CppGemmTemplate(CppTemplate): """ NOTE Weight prep consists of 2 separate steps: 1. Blocking the weight tensor into a 3D shape: [n//block_n, k, block_n] - This is always done if the weight tensor is contant, i.e. for all GEMM and some BMM. + This is always done if the weight tensor is constant, i.e. for all GEMM and some BMM. For BMM, we also block non-contiguous weight tensors, since they would be reshaped anyway. 
This assumes that blocked, contiguous weights will be more efficient for the GEMM kernel, and is worth the overhead of reshape and blocking. diff --git a/torch/_inductor/codegen/cpp_micro_gemm.py b/torch/_inductor/codegen/cpp_micro_gemm.py index 4e90b1ba9e17..c9c54553756f 100644 --- a/torch/_inductor/codegen/cpp_micro_gemm.py +++ b/torch/_inductor/codegen/cpp_micro_gemm.py @@ -684,7 +684,7 @@ inline void {{kernel_name}}_transpose_b_kernel( // Use 2 implementations for the transposed B: // First implementation: // Transpose first and then perform outer product calculation in sub-blocks, - // which introduces an additional tranpose overhead of [K, N] compared to the non-tranpose version. + // which introduces an additional transpose overhead of [K, N] compared to the non-transpose version. // Second implementation: // Directly perform inner product calculation in sub-blocks, // which introduces an additional vector reduction of [M, N] compared to the non-tranpose version. @@ -1001,7 +1001,7 @@ def check_amx_extra(config, m, n, k, alpha, num_threads, **kwargs): ) class CppMicroGemmAMX(CppMicroGemm): """ - This class generates the code for micro gemm using Advanced Matrix eXtention (AMX) + This class generates the code for micro gemm using Advanced Matrix extension (AMX) instructions available in 4th generation Intel Xeon for compute. It supports input types of torch.bfloat16 with fp32 output. """ diff --git a/torch/_inductor/codegen/cpp_wrapper_cpu.py b/torch/_inductor/codegen/cpp_wrapper_cpu.py index 4cae56228725..bdaf74952ce8 100644 --- a/torch/_inductor/codegen/cpp_wrapper_cpu.py +++ b/torch/_inductor/codegen/cpp_wrapper_cpu.py @@ -1351,7 +1351,7 @@ class CppWrapperCpu(PythonWrapperCodegen): def generate_index_put_fallback(self, kernel, x, indices, values, accumulate): # TODO: update aoti_torch_index_put_out in ir.py to use autogen out version # See the comment in codegen_reinterpret_view about why having something like - # RAIIAtenTensorHandle(tmp_tensor_handle_2) in a tmp array can cause the correponding + # RAIIAtenTensorHandle(tmp_tensor_handle_2) in a tmp array can cause the corresponding # tensor prematurely deallocated, thus the temporary array trick here. indices_str = self._generate_temporary_array_pointer( "AtenTensorHandle", indices @@ -1788,7 +1788,7 @@ class CppWrapperCpu(PythonWrapperCodegen): if not isinstance(conditional.predicate, ir.ShapeAsConstantBuffer): # in ABI-compatible mode, we need to use the ABI shim function - # to extract a C++ bool from the unrelying scalar bool Tensor + # to extract a C++ bool from the underlying scalar bool Tensor predicate = f"{conditional.predicate.get_name()}_scalar" if predicate not in self.used_cond_predicate: self.codegen_tensor_item( @@ -1852,7 +1852,7 @@ class CppWrapperCpu(PythonWrapperCodegen): # in ABI-compatible mode, the carried inputs are codegened # as buffers outside the while loop and set to the initial # values. at the end of each while_loop iteration, they - # will be assined the carried values. + # will be assigned the carried values. 
out_name = out.get_name() self.writeline(f"AtenTensorHandle {out_name}_handle;") self.writeline( @@ -1861,7 +1861,7 @@ class CppWrapperCpu(PythonWrapperCodegen): self.writeline(f"RAIIAtenTensorHandle {out_name}({out_name}_handle);") cond_outer_inputs.append(out_name) - # additional inputs will be assinged within the while_loop + # additional inputs will be assigned within the while_loop # iteration directly from the corresponding outer graph buffers cond_outer_inputs.extend(outer_additional_inputs) diff --git a/torch/_inductor/codegen/cpp_wrapper_cpu_array_ref.py b/torch/_inductor/codegen/cpp_wrapper_cpu_array_ref.py index 7334ff5c64b7..0d53db7f32c6 100644 --- a/torch/_inductor/codegen/cpp_wrapper_cpu_array_ref.py +++ b/torch/_inductor/codegen/cpp_wrapper_cpu_array_ref.py @@ -728,7 +728,7 @@ class CppWrapperCpuArrayRef(CppWrapperCpu): self._assert_safe_to_use_borrow_arrayref_tensor_as_tensor() # TODO: update aoti_torch_index_put_out in ir.py to use autogen out version # See the comment in codegen_reinterpret_view about why having something like - # RAIIAtenTensorHandle(tmp_tensor_handle_2) in a tmp array can cause the correponding + # RAIIAtenTensorHandle(tmp_tensor_handle_2) in a tmp array can cause the corresponding # tensor prematurely deallocated, thus the temporary array trick here. indices_str = self._generate_temporary_array_pointer( "AtenTensorHandle", diff --git a/torch/_inductor/codegen/cpp_wrapper_gpu.py b/torch/_inductor/codegen/cpp_wrapper_gpu.py index 909f8b7284b5..29fbe5eeabf8 100644 --- a/torch/_inductor/codegen/cpp_wrapper_gpu.py +++ b/torch/_inductor/codegen/cpp_wrapper_gpu.py @@ -380,7 +380,7 @@ class CppWrapperGpu(CppWrapperCpu): # `source` is in the form of `&var_x`, where `var_x` is the data pointer # (CUdeviceptr); we dereference `source` and cast to `void*` to pass to - # the data pointer of the source tensor ot the helper function + # the data pointer of the source tensor to the helper function # `init{1,2}DTMADescriptor` ptr = f"reinterpret_cast(*({source}))" dims = ", ".join(self.val_to_arg_str(dim) for dim in desc.dims) diff --git a/torch/_inductor/codegen/cuda/cutlass_lib_extensions/evt_extensions.py b/torch/_inductor/codegen/cuda/cutlass_lib_extensions/evt_extensions.py index 2c61e0fdf2f7..becbf1f2c552 100644 --- a/torch/_inductor/codegen/cuda/cutlass_lib_extensions/evt_extensions.py +++ b/torch/_inductor/codegen/cuda/cutlass_lib_extensions/evt_extensions.py @@ -77,7 +77,7 @@ if try_import_cutlass(): if not is_row_major and not is_column_major: raise RuntimeError( f"Cannot create example tensor for {buffer.get_name()} with \ -non-contiguous layout, recieved stride: {stride} and shape: {shape}" +non-contiguous layout, received stride: {stride} and shape: {shape}" ) return CutlassTensor( diff --git a/torch/_inductor/codegen/cuda/gemm_template.py b/torch/_inductor/codegen/cuda/gemm_template.py index fec507e6508f..176d1e2f69f0 100644 --- a/torch/_inductor/codegen/cuda/gemm_template.py +++ b/torch/_inductor/codegen/cuda/gemm_template.py @@ -293,7 +293,7 @@ GEMM_ARGS_SPARSE_CUTLASS_2X = r""" }; """ -# Additional includes which are neccessary if the standalone test / debug runner is generated as wel +# Additional includes which are necessary if the standalone test / debug runner is generated as well GEMM_STANDALONE_RUNNER_ADDITIONAL_INCLUDES = r""" #ifdef GENERATE_STANDALONE_RUNNER #include "cutlass/util/distribution.h" @@ -375,7 +375,7 @@ extern "C" int run_standalone(uint64_t seed, int repetitions) { std::cout << "Calling once to get workspace size" << std::endl; 
{{test_call_statement}}; - // Allocate workspace if neccessary + // Allocate workspace if necessary if (workspace_size > 0) { workspace_data.reset(workspace_size); std::cout << "Allocated workspace size of " << workspace_size << " bytes" << std::endl; @@ -684,13 +684,13 @@ class CUTLASSGemmTemplate(CUTLASSTemplate, ABC): ) -> bool: """ Helper method to determine whether we should do an explicit transpose by switching the order of the - matmul operands. This might be neccessary when we can't otherwise arrive at the right memory + matmul operands. This might be necessary when we can't otherwise arrive at the right memory layout for the given Bias operand. Note: This method is a workaround for CUDA Errors that seemingly non-deterministically occurred in practice in some CUTLASS GEMM Kernels with Linear epilogues that have a bias term. it might make sense to check on newer Cutlass releases whether it makes sense to keep - returning True in certain cases or whether it becomes unneccessary. + returning True in certain cases or whether it becomes unnecessary. """ # If bias is row major, swap all M and N dimensions if ( diff --git a/torch/_inductor/codegen/halide.py b/torch/_inductor/codegen/halide.py index f51ee70b73bc..1749db7576ed 100644 --- a/torch/_inductor/codegen/halide.py +++ b/torch/_inductor/codegen/halide.py @@ -1447,7 +1447,7 @@ class HalideKernel(SIMDKernel): current_device = V.graph.get_current_device_or_throw() if current_device.type == "cpu": target = [config.halide.cpu_target] - schduler = config.halide.scheduler_cpu + scheduler = config.halide.scheduler_cpu scheduler_flags = { "parallelism": parallel_num_threads(), } @@ -1456,7 +1456,7 @@ class HalideKernel(SIMDKernel): assert current_device.type == "cuda", "only cpu/cuda supported" assert current_device.index <= 0, "only default device supported" target = [config.halide.gpu_target] - schduler = config.halide.scheduler_cuda + scheduler = config.halide.scheduler_cuda capability = torch.cuda.get_device_properties(current_device) if "cuda_capability" not in target[0]: for major, minor in [(8, 6), (8, 0), (7, 5), (7, 0), (6, 1)]: @@ -1490,7 +1490,7 @@ class HalideKernel(SIMDKernel): return HalideMeta( argtypes, target="-".join(target), - scheduler=schduler, + scheduler=scheduler, scheduler_flags=scheduler_flags, # type: ignore[arg-type] cuda_device=cuda_device, ) diff --git a/torch/_inductor/codegen/mps.py b/torch/_inductor/codegen/mps.py index e33c0037e899..ded6cb093af9 100644 --- a/torch/_inductor/codegen/mps.py +++ b/torch/_inductor/codegen/mps.py @@ -478,9 +478,9 @@ class MetalKernel(SIMDKernel): dtype = V.graph.get_dtype(name) line = f"{var}[{self.index_to_str(index)}]" if dtype in [torch.float16, torch.bfloat16]: - # TODO(NS): Figure out the right balance betwene optype casts + # TODO(NS): Figure out the right balance between optype casts # op_math_t for half-precision floats should be float32 - # Otherwise it can lead to a corretness issues with eager + # Otherwise it can lead to a correctness issues with eager line = f"static_cast({line})" dtype = torch.float32 return self.cse.generate(self.loads, line, dtype=dtype) @@ -879,7 +879,7 @@ class MetalKernel(SIMDKernel): else: return f"{kwarg}=[{', '.join(threads)}]" - # For reduction kernels, limit the maximum size over reduction dimentions to + # For reduction kernels, limit the maximum size over reduction dimensions to # a maximum threadgroup size if len(self.active_range_trees()) > 0: threads = [ diff --git a/torch/_inductor/codegen/simd.py b/torch/_inductor/codegen/simd.py 
index 8b07374faf02..9425eaa97ff7 100644 --- a/torch/_inductor/codegen/simd.py +++ b/torch/_inductor/codegen/simd.py @@ -151,7 +151,7 @@ class IterationRanges: class IterationRangesRoot(IterationRanges): """ Root of a iteration range tree that represents a single - tiled dimension in the output kernel. It contains muliple + tiled dimension in the output kernel. It contains multiple sets of iteration represented with IterationRangesEntry. """ @@ -1570,7 +1570,7 @@ class SIMDScheduling(BaseScheduling): p_n.can_codegen_without_upcasts() for p_n in prologue_group ) - # TODO - this doesnt work with libdevice calls, potentially other bugs + # TODO - this doesn't work with libdevice calls, potentially other bugs # upcasting to fp32 and downcasting gives large slowdown with config.patch( "triton.codegen_upcast_to_fp32", not can_codegen_without_upcast @@ -1908,7 +1908,7 @@ class SIMDScheduling(BaseScheduling): reduction_numel, ) -> list[dict[str, tuple[sympy.Expr]]]: """ - Creates N-dimensional tiling candidiates, attempting to simplify loads/stores + Creates N-dimensional tiling candidates, attempting to simplify loads/stores by tiling the kernel into higher dimensions. Returns a list of tilings ranked by dimensionality. @@ -2128,7 +2128,7 @@ class SIMDScheduling(BaseScheduling): split_scores.append(prev_var_coalesced_score) # penalize splits that leave small blocks - # where we cant fully utilize full memory transaction + # where we can't fully utilize full memory transaction # TODO: incorporate exact bitwidth, and read/write # coalesced write is 2x more important for i in range(len(splits)): diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py index 5fedba5e89f1..a404abc136f5 100644 --- a/torch/_inductor/codegen/triton.py +++ b/torch/_inductor/codegen/triton.py @@ -647,7 +647,7 @@ class TritonPrinter(PythonPrinter): def _print_min_max_helper(self, expr: sympy.Expr, cmp: str) -> str: """ - Helper for max/min code genereration. + Helper for max/min code generation. cmp: > or < """ if len(expr.args) == 1: @@ -939,7 +939,7 @@ class TritonOverrides(OpOverrides): return triton_val # NOTE: We use a tensor here in order to get the expected type. - # Otherwise, e.g. float64 constants would be trunctated to float32. + # Otherwise, e.g. float64 constants would be truncated to float32. if value < 0 and not dtype.is_signed: triton_signed_type = f"tl.{triton_type[4:]}" return f"tl.full({shape}, {triton_val}, {triton_signed_type}).to({triton_type})" @@ -1956,7 +1956,7 @@ class TritonKernel(SIMDKernel[TritonCSEVariable]): # Compute the ND block shape from the linear block size. # Use CielDiv to round leading dimensions up to 1. # Non-leading dimensions are clamped to the size of the iteration range, - # while the leading dimension can exceed this to accomodate a larger + # while the leading dimension can exceed this to accommodate a larger # block size. 
linear_block_size = TritonSymbols.get_block_size(range_tree) block_shape: list[sympy.Expr] = [ diff --git a/torch/_inductor/codegen/triton_combo_kernel.py b/torch/_inductor/codegen/triton_combo_kernel.py index d67cb56082f2..dc2392119cc5 100644 --- a/torch/_inductor/codegen/triton_combo_kernel.py +++ b/torch/_inductor/codegen/triton_combo_kernel.py @@ -51,7 +51,7 @@ def _default_custom_combo_kernel_horizontal_partition( node_info_map: dict[BaseSchedulerNode, tuple[Any, Any, Any, Any]], ) -> list[list[BaseSchedulerNode]]: """Horizontally partition the given list of nodes into a list of list of nodes where each sublist - represents a partion. Nodes in different partitions are implemented in different combo kernels. + represents a partition. Nodes in different partitions are implemented in different combo kernels. Nodes in the same partition are likely to be implemented in the same combo kernel, but subject to subsequent restrictions like CUDA limits for number of args. diff --git a/torch/_inductor/codegen/triton_utils.py b/torch/_inductor/codegen/triton_utils.py index d52b73a8c0fe..fd9019fa6b62 100644 --- a/torch/_inductor/codegen/triton_utils.py +++ b/torch/_inductor/codegen/triton_utils.py @@ -36,24 +36,24 @@ def signature_of(arg: KernelArgType, *, size_dtype: Optional[str]) -> str: # TODO: Remove fp8 special handling when Triton supports PyTorch fp8 dtypes. # Related PR: https://github.com/triton-lang/triton/pull/2279/ if arg.dtype == torch.float8_e4m3fn: - tye = "*fp8e4nv" + typ = "*fp8e4nv" elif arg.dtype == torch.float8_e5m2: - tye = "*fp8e5" + typ = "*fp8e5" elif arg.dtype == torch.float8_e4m3fnuz: - tye = "*fp8e4b8" + typ = "*fp8e4b8" elif arg.dtype == torch.float8_e5m2fnuz: - tye = "*fp8e5b16" + typ = "*fp8e5b16" else: - tye = _type_of(arg.dtype) + typ = _type_of(arg.dtype) if should_unwrap_unspec_arg(arg.buffer): # had unwrapped 0d tensor as scalar - new_tye = tye.lstrip("*") - if new_tye in ["fp16", "bf16"]: + new_typ = typ.lstrip("*") + if new_typ in ["fp16", "bf16"]: return "fp32" else: - return new_tye + return new_typ else: - return tye + return typ if isinstance(arg, SizeArg): if arg.expr is None: if triton_version_uses_attrs_dict(): diff --git a/torch/_inductor/comms.py b/torch/_inductor/comms.py index baf781a6d43c..ff5324cf4e47 100644 --- a/torch/_inductor/comms.py +++ b/torch/_inductor/comms.py @@ -326,8 +326,8 @@ def _schedule_for_comm( for snode in snodes: if raise_comms and contains_collective(snode): scores_0[snode.get_name()] = comm_idx - for anc in snode.ancestors: - anc_fused_name = name_to_fused_node[anc].get_name() + for ancestor in snode.ancestors: + anc_fused_name = name_to_fused_node[ancestor].get_name() scores_0[anc_fused_name] = min(scores_0[anc_fused_name], comm_idx) comm_idx += 1 elif sink_waits and contains_wait(snode): @@ -486,7 +486,7 @@ def node_summary(snode): def visualize_overlap(order): - # TODO - this function probably doesn't do a very good job estimating the runtime becuase it doesn't carefully model + # TODO - this function probably doesn't do a very good job estimating the runtime because it doesn't carefully model # streams and overlap. For now its mostly useful as a debug visualization. 
total_est_runtime: float = 0.0 diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py index bd6590dceea5..dcfefab265c0 100644 --- a/torch/_inductor/config.py +++ b/torch/_inductor/config.py @@ -497,7 +497,7 @@ coordinate_descent_search_radius = int( ) # AutoHeuristic is a framework that allows one to collect data from autotuning, use the data to learn a heuristic, and -# generate the learned heursitic to code which is shipped with the compiler +# generate the learned heuristic to code which is shipped with the compiler # Specify a list of comma separated optimizations to collect data for autoheuristic_collect = os.environ.get("TORCHINDUCTOR_AUTOHEURISTIC_COLLECT", "") # Specify a list of comma separated optimizations to use learned heuristics for @@ -595,7 +595,7 @@ max_fusion_size = 64 # how many nodes to attempt pairwise fusion with in a buffer group max_fusion_buffer_group_pairwise_attempts = 64 -# max number of inputs to generate cat as a pointwise op with masked laods +# max number of inputs to generate cat as a pointwise op with masked loads max_pointwise_cat_inputs = 8 # force concat to be generated as a pointwise op with masked loads @@ -713,7 +713,7 @@ worker_suppress_logging: bool = Config( default=True, ) -# Flags to turn on all_reduce fusion. These 2 flags should be automaticaly turned +# Flags to turn on all_reduce fusion. These 2 flags should be automatically turned # on by DDP and should not be set by the users. _fuse_ddp_communication = False _fuse_ddp_bucket_size = 25 @@ -858,7 +858,7 @@ padding_alignment_bytes = 128 # Pad too small stride may also cause perf loss. We may result in many tiny data blocks # with gaps in between. That causes less coalesced GPU memory access! # -# Initially we pick 320 as the threshold since for alignement=16, +# Initially we pick 320 as the threshold since for alignment=16, # that results in at most 5% memory cost. # # But later on we raise the threshold to 1024 to avoid interfere with persistent reduction. @@ -958,7 +958,7 @@ enable_linear_binary_folding = ( ) -# Adds NVTX annotations aroung training phases +# Adds NVTX annotations around training phases annotate_training: bool = os.environ.get("TORCHINDUCTOR_ANNOTATE_TRAINING", "0") == "1" # Enable caching codegen of triton templates. @@ -1261,7 +1261,7 @@ class triton: codegen_upcast_to_fp32 = True # Whether persistent matmul kernels should be enabled this flag only has effect when on h100 - # with a verison of triton new enough to support TMA + # with a version of triton new enough to support TMA enable_persistent_tma_matmul = ( os.environ.get("ENABLE_PERSISTENT_TMA_MATMUL", "0") == "1" ) @@ -1321,7 +1321,7 @@ class aot_inductor: # flag to decide whether to create a submodule for constant graph. use_runtime_constant_folding: bool = False - # flag to force weight to be appened to the shared library and mmaped by the runtime + # flag to force weight to be appended to the shared library and mapped by the runtime # rather than embedded into the data section. Needed to support 1B+ parameter models force_mmap_weights: bool = False @@ -1669,7 +1669,7 @@ class trace: # replace records with HTML-like labels" # and thus fail to generate a graph. So, let's give the user an option # to specify the shape attribute for the dot graph. For example, passing - # INDUCTOR_DOT_GRAPH_SHAPE_SVG = "none" would let us generate HTML-like lables + # INDUCTOR_DOT_GRAPH_SHAPE_SVG = "none" would let us generate HTML-like labels # to workaround the above failure. 
dot_graph_shape = os.environ.get("INDUCTOR_DOT_GRAPH_SHAPE_SVG", None) diff --git a/torch/_inductor/cpp_builder.py b/torch/_inductor/cpp_builder.py index a27292d9c6f6..d0a646c1de8f 100644 --- a/torch/_inductor/cpp_builder.py +++ b/torch/_inductor/cpp_builder.py @@ -331,7 +331,7 @@ def get_compiler_version_info(compiler: str) -> str: ).decode(*SUBPROCESS_DECODE_ARGS) except Exception: return "" - # Mutiple lines to one line string. + # Multiple lines to one line string. version_string = version_string.replace("\r", "_") version_string = version_string.replace("\n", "_") return version_string @@ -410,7 +410,7 @@ def normalize_path_separator(orig_path: str) -> str: class BuildOptionsBase: """ This is the Base class for store cxx build options, as a template. - Acturally, to build a cxx shared library. We just need to select a compiler + Actually, to build a cxx shared library. We just need to select a compiler and maintains the suitable args. """ @@ -948,7 +948,7 @@ def perload_icx_libomp_win(cpp_compiler: str) -> None: return False """ - Intel Compiler implenmented more math libraries than clang, for performance proposal. + Intel Compiler implemented more math libraries than clang, for performance proposal. We need preload them like openmp library. """ preload_list = [ @@ -1427,7 +1427,7 @@ def get_name_and_dir_from_output_file_path( dir = /tmp/tmpof1n5g7t/5c/ put 'name' and 'dir' to CppBuilder's 'name' and 'output_dir'. - CppBuilder --> get_target_file_path will format output path accoding OS: + CppBuilder --> get_target_file_path will format output path according OS: Linux: /tmp/tmppu87g3mm/zh/czhwiz4z7ca7ep3qkxenxerfjxy42kehw6h5cjk6ven4qu4hql4i.so Windows: [Windows temp path]/tmppu87g3mm/zh/czhwiz4z7ca7ep3qkxenxerfjxy42kehw6h5cjk6ven4qu4hql4i.dll """ @@ -1444,13 +1444,13 @@ class CppBuilder: Args: name: 1. Build target name, the final target file will append extension type automatically. - 2. Due to the CppBuilder is supports mutliple OS, it will maintains ext for OS difference. + 2. Due to the CppBuilder is supports multiple OS, it will maintains ext for OS difference. sources: Source code file list to be built. BuildOption: Build options to the builder. output_dir: - 1. The output_dir the taget file will output to. + 1. The output_dir the target file will output to. 2. The default value is empty string, and then the use current dir as output dir. 3. Final target file: output_dir/name.ext """ @@ -1464,7 +1464,7 @@ class CppBuilder: @staticmethod def __get_object_flags() -> tuple[str, str]: extension = ".obj" if _IS_WINDOWS else ".o" - output_flags = "/c /Fo" if _IS_WINDOWS else "-c -o" + output_flags = "/c /Fo" if _IS_WINDOWS else "-c -o" # codespell:ignore return extension, output_flags @staticmethod @@ -1505,7 +1505,7 @@ class CppBuilder: self._name = name - # Code start here, initial self internal veriables firstly. + # Code start here, initial self internal variables firstly. self._build_option = BuildOption self._compiler = BuildOption.get_compiler() self._use_relative_path = BuildOption.get_use_relative_path() @@ -1702,8 +1702,8 @@ class CppBuilder: def build(self) -> None: """ - It is must need a temperary directory to store object files in Windows. - After build completed, delete the temperary directory to save disk space. + It is must need a temporary directory to store object files in Windows. + After build completed, delete the temporary directory to save disk space. 
""" if self._use_relative_path: # remote build uses relative path diff --git a/torch/_inductor/cudagraph_trees.py b/torch/_inductor/cudagraph_trees.py index 20bee66549e2..ef37f87a2515 100644 --- a/torch/_inductor/cudagraph_trees.py +++ b/torch/_inductor/cudagraph_trees.py @@ -228,7 +228,7 @@ class TreeManagerContainer: self.graph = None # manager was used again after existing cleanup, - # we shouldnt set it to None + # we shouldn't set it to None if self.live_cudagraphify_fns == 0: self.tree_manager = None @@ -1231,7 +1231,7 @@ class CUDAGraphNode: } if config.triton.slow_path_cudagraph_asserts: - # need to use parent live weakrefs because live_indices isnt set yet + # need to use parent live weakrefs because live_indices isn't set yet memory = ( [] if self.parent is None else list(self.parent.path_live_weakrefs()) ) @@ -1607,7 +1607,7 @@ class CUDAGraphNode: def clear_path_state(self) -> None: "Clear the path state in this current executing node" - # this doesnt actually do anything right now, leaving it as placeholder + # this doesn't actually do anything right now, leaving it as placeholder @staticmethod def _tensor_metadata( diff --git a/torch/_inductor/cudagraph_utils.py b/torch/_inductor/cudagraph_utils.py index f6ce7e43ad95..2686d1d2ddde 100644 --- a/torch/_inductor/cudagraph_utils.py +++ b/torch/_inductor/cudagraph_utils.py @@ -131,7 +131,7 @@ def check_for_mutation( inputs: list[InputType], is_cuda_graph_recorded_tensor: Callable[[torch.Tensor], bool], ) -> Optional[str]: - # doesnt work for non-trees because the warmup run would apply mutation twice + # doesn't work for non-trees because the warmup run would apply mutation twice if torch._inductor.config.triton.cudagraph_trees: # checking if mutation is only on parameters/static inputs mutation_indices: Sequence[int] = [ @@ -222,7 +222,7 @@ def check_for_mutation_ignore_cuda_graph_managed_tensor( ) -> Optional[str]: default_msg = format_default_skip_message("mutated inputs") - # doesnt work for non-trees because the warmup run would apply mutation twice + # doesn't work for non-trees because the warmup run would apply mutation twice if torch._inductor.config.triton.cudagraph_trees: unique_idxs = OrderedSet(static_input_idxs) # checking if mutation is only on parameters/static inputs diff --git a/torch/_inductor/custom_graph_pass.py b/torch/_inductor/custom_graph_pass.py index 769b9b68ae13..c9a8e33a1145 100644 --- a/torch/_inductor/custom_graph_pass.py +++ b/torch/_inductor/custom_graph_pass.py @@ -18,7 +18,7 @@ class CustomGraphPass(ABC): identifies your implementation (and can be pickled). The caching logic includes this identifier in its key calculation, i.e., any new value will effectively invalidate existing entries. We expect custom passes would typically depend purely on the - textual reprensentation of the implementation. In that case, we recommend using the + textual representation of the implementation. In that case, we recommend using the 'get_hash_for_files' helper below to compute a unique hash from the contents of a static list of source files, i.e., the source(s) containing the custom pass implementation. That approach ensures that any change to the implementation will @@ -64,7 +64,7 @@ class CustomGraphModulePass(ABC): identifies your implementation (and can be pickled). The caching logic includes this identifier in its key calculation, i.e., any new value will effectively invalidate existing entries. We expect custom passes would typically depend purely on the - textual reprensentation of the implementation. 
In that case, we recommend using the + textual representation of the implementation. In that case, we recommend using the 'get_hash_for_files' helper below to compute a unique hash from the contents of a static list of source files, i.e., the source(s) containing the custom pass implementation. That approach ensures that any change to the implementation will diff --git a/torch/_inductor/dependencies.py b/torch/_inductor/dependencies.py index b1f75372ee4c..ad3b93775d35 100644 --- a/torch/_inductor/dependencies.py +++ b/torch/_inductor/dependencies.py @@ -125,7 +125,7 @@ class MemoryDep(Dep): ) return None - # May hanppen if self and other are as follows + # May happen if self and other are as follows # MemoryDep('addmm_6', 393216*d0 + 768*d1 + d2, {d0: 16, d1: 512, d2: 768}, None) # MemoryDep('addmm_6', 98304*d0 + d1 + 768*d2, {d0: 64, d1: 768, d2: 128}, None) if OrderedSet(self_strides) != OrderedSet(other_strides): @@ -708,7 +708,7 @@ def extract_input_node_reduction_ranges( # There is one issue: what if there are views / permutations between the input node and its dependent realized nodes? # The current method still uses reduction ranges from the dependent realized node, which is not ideal. - # Is there a way to check whether there are permutations inbetween? + # Is there a way to check whether there are permutations in between? reads = input_node.get_reads() reduction_size: Optional[list[sympy.Expr]] = None size: Optional[list[sympy.Expr]] = None diff --git a/torch/_inductor/fx_passes/ddp_fusion.py b/torch/_inductor/fx_passes/ddp_fusion.py index 2d9409523c15..ccea7d7e70af 100644 --- a/torch/_inductor/fx_passes/ddp_fusion.py +++ b/torch/_inductor/fx_passes/ddp_fusion.py @@ -73,7 +73,7 @@ class CommBlock: def get_comm_block(comm_node: fx.Node) -> Optional[CommBlock]: """ Given a collective node (e.g., allreduce), find out all the nodes belong to - this communcation. + this communication. Args: comm_node(fx.Node): The target communication/collective node. @@ -304,7 +304,7 @@ def _scatter_fused_allreduce_waits( """ # Before we mass up the order, we need to get the index of the last wait node - # in orig_comm_blocks. This index will be later used to determinee what users + # in orig_comm_blocks. This index will be later used to determine what users # nodes need to be move to maintain a correct topological sort order. last_wait_node_idx = 0 for node in graph.nodes: diff --git a/torch/_inductor/fx_passes/group_batch_fusion.py b/torch/_inductor/fx_passes/group_batch_fusion.py index 0d6e74817854..357a9d66cdad 100644 --- a/torch/_inductor/fx_passes/group_batch_fusion.py +++ b/torch/_inductor/fx_passes/group_batch_fusion.py @@ -1052,7 +1052,7 @@ class BatchMathOpsPreGradFusion(BatchPointwiseOpsFusionFactory): def match(self, node: torch.fx.Node): input = get_arg_value(node, 0, "input") if CallFunctionVarArgs(self.op).match(node) and is_node_meta_valid(node): - # check the input has the same shape and its uers have the same target + # check the input has the same shape and its users have the same target # check all clamp operators have the same min and max values, and # nan_to_num operators use the same default value. 
child = next(iter(node.users.keys())) diff --git a/torch/_inductor/fx_passes/joint_graph.py b/torch/_inductor/fx_passes/joint_graph.py index 5a0958921009..c9d7187de0d9 100644 --- a/torch/_inductor/fx_passes/joint_graph.py +++ b/torch/_inductor/fx_passes/joint_graph.py @@ -206,7 +206,7 @@ def remove_redundant_views(gm: torch.fx.GraphModule): class UniformValueConstantFolder(ConstantFolder): """ - Runs constant folding and replaces tensors that have a unifrom value + Runs constant folding and replaces tensors that have a uniform value with a tensor constructor call: aten.full([shape], value, ...) """ diff --git a/torch/_inductor/fx_passes/micro_pipeline_tp.py b/torch/_inductor/fx_passes/micro_pipeline_tp.py index 5eb2dce80dfe..af40d987f7d1 100644 --- a/torch/_inductor/fx_passes/micro_pipeline_tp.py +++ b/torch/_inductor/fx_passes/micro_pipeline_tp.py @@ -440,7 +440,7 @@ class _Matmul: A_node=cast("torch.fx.Node", match[0].args[0]), B_node=cast("torch.fx.Node", mm_node.args[1]), # _Matmul handles reshapes via custom graph manipulation logic, see `replace_with()` method. - # TOOO: explore unifying the _Matmul and _ScaledMatmul approaches to handling reshapes. + # TODO: explore unifying the _Matmul and _ScaledMatmul approaches to handling reshapes. pre_mm_reshape=None, post_mm_reshape=None, ) @@ -906,7 +906,7 @@ def fuse_matmul_reduce_scatter(reduce_scatter: _ReduceScatterMatch) -> None: # 1. The scatter dim before the reshape, which was assigned using the original (a,b,c) @ (c,d) = (a,b,d) dims. # 2. The scatter dim after the reshape, to use when we are doing the 2D (a*b,c) @ (c,d) = (a,b,d) scaled mm op. # 3. Store expected potentially 3D+ mm output shape, so we can reshape the 2D mm output to the intended - # 3D+ shape before applying reduce-scatter, and to prevent shape erros with subsequent ops. + # 3D+ shape before applying reduce-scatter, and to prevent shape errors with subsequent ops. # If 'A' was reshaped from 3D+ -> 2D for the mm, we need to determine the new scattter dim after the reshape # for the fused matmul reduce scatter implementation to use. diff --git a/torch/_inductor/fx_passes/mkldnn_fusion.py b/torch/_inductor/fx_passes/mkldnn_fusion.py index 8dae6521d538..96f454d4f3db 100644 --- a/torch/_inductor/fx_passes/mkldnn_fusion.py +++ b/torch/_inductor/fx_passes/mkldnn_fusion.py @@ -187,7 +187,7 @@ if torch._C._has_mkldnn: def grouped_gemm_pass(graph: torch.fx.Graph): """ - Group GEMM has multi output nodes which is compilicated to define a Pattern. + Group GEMM has multi output nodes which is complicated to define a Pattern. Use below way to connect the pattern to the lowering. TODO: Use MultiOutputPattern, current limitation is the pattern requires fixed number of output nodes. Extend to support Group GEMM for pattern matcher. 
diff --git a/torch/_inductor/fx_passes/pad_mm.py b/torch/_inductor/fx_passes/pad_mm.py index 10ca1c4dae97..d2dfc3d9e4d0 100644 --- a/torch/_inductor/fx_passes/pad_mm.py +++ b/torch/_inductor/fx_passes/pad_mm.py @@ -102,7 +102,7 @@ def should_pad_common( symbolic_cnt += 1 else: return False - # filter out cases where all dimentions are symbolic + # filter out cases where all dimensions are symbolic if symbolic_cnt == len(t.size()): return False return all( @@ -226,7 +226,7 @@ def is_mm_compute_bound(M: int, K: int, N: int, dtype: torch.dtype) -> bool: and K > M and K > N and torch.cuda.get_device_capability() < (9, 0) - ): # doesnt repro on h100s: + ): # doesn't repro on h100s: return True # Fails with AMD @@ -239,7 +239,7 @@ def is_mm_compute_bound(M: int, K: int, N: int, dtype: torch.dtype) -> bool: # dram_gbps might be underestimating bandwidth because of cache. # if we estimate machine balance too low we might miss some speedups, - # if we extimate too high there will be unnecessary compilation time increase. + # if we estimate too high there will be unnecessary compilation time increase. # TODO - finetune coefficient here. As a reference point, Triton mm model assumes # 80% of reads are in cache and cache is 4x faster than dram_gbps machine_balance = machine_balance * 0.5 @@ -382,7 +382,7 @@ def should_pad_mm_bf16(dtype: torch.dtype, M: int, N: int, K: int) -> bool: and N % 2 == 1 and K >= large_k_threshold_to_pad and torch.cuda.get_device_capability() < (9, 0) - ): # doesnt repro on h100s: + ): # doesn't repro on h100s: return True return False @@ -711,7 +711,7 @@ def run_autoheuristic( ah_ori_time = autoheuristic.get_collected_feedback(orig_choice) ah_pad_time = autoheuristic.get_collected_feedback(pad_choice) - # if precondition is not satisifed, autoheuristic does not collect data + # if precondition is not satisfied, autoheuristic does not collect data if ah_ori_time is not None and ah_pad_time is not None: if ori_time is None: set_cached_base_mm_benchmark_time(ori_time_key, ah_ori_time) diff --git a/torch/_inductor/fx_passes/post_grad.py b/torch/_inductor/fx_passes/post_grad.py index c00a7ac1ea3a..71285de81c15 100644 --- a/torch/_inductor/fx_passes/post_grad.py +++ b/torch/_inductor/fx_passes/post_grad.py @@ -617,7 +617,7 @@ def reorder_for_locality(graph: torch.fx.Graph): # only reorder nodes before the first copy_ in the graph. 
# copy_ will appear at the end of functionalized graphs when there is mutation on inputs, - # and this reordering doesnt work well with mutation + # and this reordering doesn't work well with mutation first_copy = next( iter(graph.find_nodes(op="call_function", target=torch.ops.aten.copy_.default)), None, @@ -1436,7 +1436,7 @@ def register_partial_reduction_pattern(): def reuse_partial(match, input, reduced_dims, keepdim): partial_red, full_red = match.output_nodes() - # if theyre small, reuse not worth it + # if they're small, reuse not worth it if not statically_known_true(input.meta["val"].numel() >= 4096): return True diff --git a/torch/_inductor/fx_passes/pre_grad.py b/torch/_inductor/fx_passes/pre_grad.py index b51d7bc21a1e..2d1709962e64 100644 --- a/torch/_inductor/fx_passes/pre_grad.py +++ b/torch/_inductor/fx_passes/pre_grad.py @@ -394,7 +394,7 @@ def fetch_attr(target: str, mod): for i, atom in enumerate(target_atoms): if not hasattr(attr_itr, atom): raise RuntimeError( - f"Node referenced nonexistant target {'.'.join(target_atoms[:i])}" + f"Node referenced nonexistent target {'.'.join(target_atoms[:i])}" ) attr_itr = getattr(attr_itr, atom) return attr_itr diff --git a/torch/_inductor/fx_passes/split_cat.py b/torch/_inductor/fx_passes/split_cat.py index 8f41e7885385..ea379b0115d6 100644 --- a/torch/_inductor/fx_passes/split_cat.py +++ b/torch/_inductor/fx_passes/split_cat.py @@ -247,7 +247,7 @@ def remove_split_with_size_one(match: Match, *args, **kwargs): return # remove the dummy split whose split sections size is one # theoretically nodes with no users should be removed, but we have seen the corner case - # thus we add its uers check to walk around the StopIteration error. + # thus we add its users check to walk around the StopIteration error. 
if len(split_sections) == 1 and len(split_node.users.keys()) > 0: # find the grand children of the split_node next_users = find_next_users(split_node) @@ -1525,7 +1525,7 @@ def merge_getitem_cat(match: Match, split_sections: list[int], dim: int): # find the index of getitems to be cated/stacked # type: ignore[union-attr] indices = [arg.args[1] for arg in cat_user.args[0]] # type: ignore[union-attr] - # the gettitems to be merged must be consecutive, otherwise + # the getitems to be merged must be consecutive, otherwise # returned sliced tensor could be wrong if not is_sorted_and_consecutive(indices): # type: ignore[arg-type] continue @@ -1627,7 +1627,7 @@ def mutate_cat_node(match: Match, split_sections: list[int], dim: int): for getitem in cat_user.args[0]: # type: ignore[union-attr] indices.append(getitem.args[1]) # type: ignore[union-attr] idx_to_getitem[getitem.args[1]] = getitem # type: ignore[union-attr] - # the gettitems to be merged must be consecutive, otherwise + # the getitems to be merged must be consecutive, otherwise # returned sliced tensor could be wrong if not is_sorted_and_consecutive(indices): # type: ignore[arg-type] continue @@ -2069,7 +2069,7 @@ def update_args_from_split_getitem( threshold_to_cat: int = 2, ): split_input, split_size, split_dim = _get_split_args_default(parents_seen[-1]) - # case 1: the number of getitems is the same as the split size, elimiate the split + # case 1: the number of getitems is the same as the split size, eliminate the split if len(split_size) == len(getitem_indices) and is_sorted_and_consecutive( getitem_indices ): @@ -2164,7 +2164,7 @@ def update_args_from_unbind_getitem( unbind_input = get_arg_value(parents_seen[-1], 0, "input") # split or unbind input unbind_dim = get_arg_value(parents_seen[-1], 1, "dim") # split or unbind dim cat_dim = get_arg_value(node, 1, "dim") # cat or stack dim - # case 1: the number of getitems is the same as the split size, elimiate the split + # case 1: the number of getitems is the same as the split size, eliminate the split size = list(unbind_input.meta["example_value"].shape)[unbind_dim] if size == len(getitem_indices): cat_shape = torch.cat( diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py index e3ed4f3c506d..23554a0f4123 100644 --- a/torch/_inductor/graph.py +++ b/torch/_inductor/graph.py @@ -1885,7 +1885,7 @@ class GraphLowering(torch.fx.Interpreter): # [NOTE] Codegen runtime asserts in Inductor # # We need to generate runtime asserts directly in Inductor instead - # of just re-using the asserts from input graphs becase we reuse the + # of just reusing the asserts from input graphs because we reuse the # same ShapeEnv as before. In particular, on subsequent graph passes, # we would immediately turn all of these assertions into noops, # because when we evaluated their expressions, we would see that @@ -1901,8 +1901,8 @@ class GraphLowering(torch.fx.Interpreter): # equals = torch.add(ones, c) # return equals # torch._dynamo.mark_dynamic(c, 0) - # When we re-use the ShapeEnv in Inductor lowering, the check that checks - # a and nonzero have the same shape would be evaluted to True after we resolve + # When we reuse the ShapeEnv in Inductor lowering, the check that checks + # a and nonzero have the same shape would be evaluated to True after we resolve # unbacked bindings using the ShapeEnv. # See test_unbacked_equals_input_size_runtime_assertion in test_aot_inductor. # @@ -2253,7 +2253,7 @@ class GraphLowering(torch.fx.Interpreter): graph. 
The parent graph is passed as an argument: the intention is to inline codegening of the subgraph in the parent graph's wrapper code (including the generated - kerenls). The wrapper code is not finalized (via `.generate()` + kernels). The wrapper code is not finalized (via `.generate()` call), as this will be done in the parent graph's `codegen()`. """ with dynamo_timed("GraphLowering.codegen_subgraph", log_pt2_compile_event=True): diff --git a/torch/_inductor/index_propagation.py b/torch/_inductor/index_propagation.py index 3b15096b0a9c..a43925b8d744 100644 --- a/torch/_inductor/index_propagation.py +++ b/torch/_inductor/index_propagation.py @@ -311,7 +311,7 @@ class IndexPropagation(DefaultHandler): If this is an issue, just use guards in `self.axioms`. The proper way of handling this would be to have a global shape_env that adds - runtime_asserts as they happen in the code. Then, it shuld be used in SimplifyIndexing + runtime_asserts as they happen in the code. Then, it should be used in SimplifyIndexing to perform wrap_expr and in CSEProxy.check_bounds to elide upper / lower bounds also for indirect_indexing """ diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py index c13bcfa20d6c..2a070610da45 100644 --- a/torch/_inductor/ir.py +++ b/torch/_inductor/ir.py @@ -200,7 +200,7 @@ def _is_static(x: object) -> bool: @dataclasses.dataclass(frozen=True) class GraphPartitionSignature: - # symbol inputs that are neccessary for codegen + # symbol inputs that are necessary for codegen symbol_inputs: OrderedSet[sympy.Symbol] # mapping from partition input name to IRNode or Expr. Need the name str since @@ -428,7 +428,7 @@ def is_aligned_realized_tensor(x: Union[Buffer, TensorBox], alignment: int) -> b (V.graph.sizevars.size_hint_or_throw(x.get_stride()[i]) % alignment) == 0 for i in range(len(x.get_stride()) - 1) ) - # if the last dim size is <= 1, stride doesnt matter + # if the last dim size is <= 1, stride doesn't matter aligned_last_dim = ( V.graph.sizevars.size_hint_or_throw(x.get_stride()[-1]) == 1 or V.graph.sizevars.size_hint_or_throw(x.get_size()[-1]) <= 1 @@ -2215,7 +2215,7 @@ class Scan(Loops): dtypes: tuple[torch.dtype, ...] inner_fns: tuple[Callable[..., Any], ...] - # HACK we mimick reduction + # HACK we mimic reduction def get_free_symbol_uses(self, unbacked_only: bool = False) -> OrderedSet[Symbol]: # TODO: Can combine_fn/reindex close over unbacked symbols? If so, we @@ -2424,7 +2424,7 @@ class Sort(Loops): stable: bool descending: bool - # HACK we mimick reduction + # HACK we mimic reduction def get_free_symbol_uses(self, unbacked_only: bool = False) -> OrderedSet[Symbol]: return ( @@ -4618,7 +4618,7 @@ class TritonTemplateBuffer(TemplateBuffer): NOTE:[TritonTemplates with multiple outputs] We want the ability for TritonTemplates to output multiple tensors. Triton kernels have no notion of outputs and this is done by creating tensors that - are then mutated by the kernel. Currenlty our STORE_OUTPUT codegen doesn't + are then mutated by the kernel. Currently our STORE_OUTPUT codegen doesn't support creating multinode outputs for triton templates. We work around this by creating an extra input buffer during the lowering and we mark them as mutated inputs. 
@@ -4873,7 +4873,7 @@ class InputsKernel(OperationBuffer): if isinstance(input, list): reads.update(StarDep(x.get_name()) for x in input) elif isinstance(input, ShapeAsConstantBuffer): - # Skip creating dependncy for symbolics as they're visible globally + # Skip creating dependency for symbolics as they're visible globally continue else: reads.add(StarDep(input.get_name())) @@ -5190,7 +5190,7 @@ class ExternKernel(InputsKernel): else {} ) # FIXME: self.kwargs does not always match kwargs defined in schema, so sometimes - # ordered_kwargs_for_cpp_kernel is explicilty passed in. + # ordered_kwargs_for_cpp_kernel is explicitly passed in. if isinstance(self.op_overload, torch._ops.OpOverload): if not self.ordered_kwargs_for_cpp_kernel: self.ordered_kwargs_for_cpp_kernel = [ @@ -6835,7 +6835,7 @@ class FallbackKernel(ExternKernelAlloc): """ A class that represents a fallback kernel for handling operators that are not directly support by inductor. It currently supports functional ops, view ops, - implace aten ops, and mutating ops that are auto-functionalizable. + inplace aten ops, and mutating ops that are auto-functionalizable. """ def __init__( # type: ignore[no-untyped-def] @@ -7848,10 +7848,10 @@ class Conditional(ExternKernel): # make sure true and false outputs are structurally equivalent assert len(true_outputs) == len(false_outputs), (true_outputs, false_outputs) - for i, (to, fo) in enumerate(zip(true_outputs, false_outputs)): - assert to.get_device() == fo.get_device(), (i, to, fo) - assert to.get_dtype() == fo.get_dtype(), (i, to, fo) - assert to.get_layout().offset == fo.get_layout().offset, (i, to, fo) + for i, (t_o, f_o) in enumerate(zip(true_outputs, false_outputs)): + assert t_o.get_device() == f_o.get_device(), (i, t_o, f_o) + assert t_o.get_dtype() == f_o.get_dtype(), (i, t_o, f_o) + assert t_o.get_layout().offset == f_o.get_layout().offset, (i, t_o, f_o) device = next( o.get_device() diff --git a/torch/_inductor/kernel/flex_attention.py b/torch/_inductor/kernel/flex_attention.py index a3204de8b39f..103abe085968 100644 --- a/torch/_inductor/kernel/flex_attention.py +++ b/torch/_inductor/kernel/flex_attention.py @@ -1025,7 +1025,7 @@ def check_cpu_supported(): def contiguous_last_dim(x): - """Ensure that realized IR node has a contigous stride in the last dimension.""" + """Ensure that realized IR node has a contiguous stride in the last dimension.""" strides = x.maybe_get_stride() if strides and strides[-1] != 1: contiguous_stride_order = list(reversed(range(len(x.get_size())))) @@ -1080,7 +1080,7 @@ def lower_cpu( cur_kvSplitSize = V.graph.sizevars.shape_env.create_unbacked_symint().node.expr shape_env = V.graph.sizevars.shape_env - # We don't know the concret value of cur_qSplitSize and cur_kvSplitSize during the compilation. + # We don't know the concrete value of cur_qSplitSize and cur_kvSplitSize during the compilation. # Mark symbols > 1 to ensure broadcasting is always applied. # This avoids treating them as equal when `eq(var, 1)` is evaluated in `broadcast_symbolic_shapes`. shape_env.var_to_range[cur_qSplitSize] = ValueRanges(2, int_oo) @@ -1826,7 +1826,7 @@ flex_attention_backward_template = TritonTemplate( sparse_kv_num_blks_offset = sparse_hz_offset * stride_kv_num_blks_h + off_pid_mask sparse_kv_idx_offset = sparse_hz_offset * stride_kv_idx_h + off_pid_mask * stride_kv_idx_m # noqa: B950 - # Offset Q, DQ, DO, DELTA & LSE. These inputs are offseted by query heads. + # Offset Q, DQ, DO, DELTA & LSE. These inputs are offsetted by query heads. 
q_adj2 = (stride_qh * off_hq2 + stride_qz * off_zq).to(tl.int64) do_adj2 = (stride_doh * off_hq2 + stride_doz * off_zq).to(tl.int64) dq_adj2 = (stride_dqh * off_hq2 + stride_dqz * off_zq).to(tl.int64) @@ -1934,7 +1934,7 @@ flex_attention_backward_template = TritonTemplate( for off_g in range(0, GQA_SHARED_HEADS): off_hq1 = off_hkv * GQA_SHARED_HEADS + off_g - # Offset Q, DQ, DO, DELTA & LSE. These inputs are offseted by query heads. + # Offset Q, DQ, DO, DELTA & LSE. These inputs are offset by query heads. q_adj1 = (stride_qh * off_hq1 + stride_qz * off_zq).to(tl.int64) do_adj1 = (stride_doh * off_hq1 + stride_doz * off_zq).to(tl.int64) dq_adj1 = (stride_dqh * off_hq1 + stride_dqz * off_zq).to(tl.int64) diff --git a/torch/_inductor/kernel/mm_common.py b/torch/_inductor/kernel/mm_common.py index 12f849aed7ac..030ba13a4edb 100644 --- a/torch/_inductor/kernel/mm_common.py +++ b/torch/_inductor/kernel/mm_common.py @@ -108,7 +108,7 @@ def scaled_mm_options( # type: ignore[no-untyped-def] device_tma: bool = False, ) -> dict[str, Any]: def are_compatible_scales(size_a, size_b) -> bool: - # Same sized scales are compatable + # Same sized scales are compatible if len(size_a) == len(size_b): return True diff --git a/torch/_inductor/kernel/mm_scaled_grouped.py b/torch/_inductor/kernel/mm_scaled_grouped.py index 9ca2ff39f3aa..ad34ea0210b5 100644 --- a/torch/_inductor/kernel/mm_scaled_grouped.py +++ b/torch/_inductor/kernel/mm_scaled_grouped.py @@ -601,7 +601,7 @@ def _tuned_grouped_mm_common( _, is_nonzero = _is_static_problem(layout) - # Checking only for the equality of correspoding dims of + # Checking only for the equality of corresponding dims of # multiplicands here, relying on meta function checks for # everything else. if is_nonzero and can_use_triton_kernel(mat_a, mat_b, offs, bias, scale_result): diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py index fc0ea67b1d50..a50d5ccecbb6 100644 --- a/torch/_inductor/lowering.py +++ b/torch/_inductor/lowering.py @@ -2753,7 +2753,7 @@ make_fallback(torch._prims.rng_prims.run_with_rng_state) make_fallback(torch._prims.rng_prims.graphsafe_run_with_rng_state) -# Implmented / Half implemented +# Implemented / Half implemented # Scans. Implemented for CUDA, missing CPU make_fallback(aten.masked_scatter) make_fallback(aten.masked_scatter_backward) @@ -7083,7 +7083,7 @@ def prepare_softmax_online(x, dim): # Note: [Split online_softmax_reduce] # We don't split reduction for online_softmax_reduce for now. # On one hand, supporting split reduction makes things complex since - # the splitted out reuctions requires 2 inputs rather than one. + # the split-out reductions require 2 inputs rather than one. # On the other hand, during training the online_softmax_reduce should # usually don't requires a split due to large batch size # (more specifically batch size times sequence length). diff --git a/torch/_inductor/mkldnn_ir.py b/torch/_inductor/mkldnn_ir.py index 0e93a5fe67aa..6eb3e30d87a9 100644 --- a/torch/_inductor/mkldnn_ir.py +++ b/torch/_inductor/mkldnn_ir.py @@ -179,7 +179,7 @@ def _prepare_convolution_fusion_create( # Currently we don't support channel last for the situation that stride of input's batch dim is 0, # eg. input_size = (1, 1280, 64, 64), but input_stride=(0, 1, 81920, 1280). # So we use NCHW hear instead.
- # Different with cpu, cpu conv alway use channels_last for convolution when weight is prepacked, + # Different with cpu, cpu conv always use channels_last for convolution when weight is prepacked, # but xpu does not do the prepack, so the problem exposed here is only for xpu. # TODO support channels_last for such zero stride input. elif get_device_type(x) == "xpu" and x.get_stride()[0] == 0: @@ -686,11 +686,11 @@ class QConvPointWiseBinaryPT2E(ExternKernelAlloc): if bias is not None - inputs = [x, x_scale, x_zp, w, w_scale, w_zp, accum, b] - const_args = [stride, padding, dilation, groups, o_scale, o_zp, - output_dtype, accum_scale, accum_zp, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm] + output_dtype, accum_scale, accum_zp, binary_attr, alpha, unary_attr, unary_scalars, unary_algorithm] else - inputs = [x, x_scale, x_zp, w, w_scale, w_zp, accum] - const_args [b, stride, padding, dilation, groups, o_scale, o_zp, - output_dtype, accum_scale, accum_zp, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm] + output_dtype, accum_scale, accum_zp, binary_attr, alpha, unary_attr, unary_scalars, unary_algorithm] """ self.has_bias = len(inputs) == 8 self.idx_for_inplace_sum = 6 @@ -1041,11 +1041,11 @@ class QLinearPointwiseBinaryPT2E(ExternKernelAlloc): if bias is not None - inputs = [x, w, x_scale, x_zp, weight_scale, weight_zp, x2, bias] - const_args is: [o_scale, o_zp, - fp32_output, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm] + fp32_output, binary_attr, alpha, unary_attr, unary_scalars, unary_algorithm] else - inputs = [x, w, x_scale, x_zp, weight_scale, weight_zp, x2] - const_args is: [bias, o_scale, o_zp, - fp32_output, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm] + fp32_output, binary_attr, alpha, unary_attr, unary_scalars, unary_algorithm] """ self.has_bias = has_bias self.idx_for_inplace_sum = 6 diff --git a/torch/_inductor/mkldnn_lowerings.py b/torch/_inductor/mkldnn_lowerings.py index df4d79fe55d5..1f4150952a85 100644 --- a/torch/_inductor/mkldnn_lowerings.py +++ b/torch/_inductor/mkldnn_lowerings.py @@ -624,7 +624,7 @@ def register_onednn_fusion_ops(): # For int8-mixed-bf16 quantization and inplace add, # there is case when accum dtype is float32 but output dtype is bfloat16. # Since the accum will be inplaced changed with post op sum, - # we will do accum dtype convertion here. + # we will do accum dtype conversion here. accum = to_dtype(accum, output_dtype) return TensorBox.create( mkldnn_ir.QConvPointWiseBinaryPT2E.create( @@ -1042,7 +1042,7 @@ def register_onednn_fusion_ops(): # For int8-mixed-bf16 quantization and inplace add, # there is case when accum dtype is float32 but output dtype is bfloat16. # Since the accum will be inplaced changed with post op sum, - # we will do accum dtype convertion here. + # we will do accum dtype conversion here. x2 = to_dtype(x2, output_dtype) else: assert x2.get_dtype() == output_dtype, ( diff --git a/torch/_inductor/remote_cache.py b/torch/_inductor/remote_cache.py index 2aadc806bf90..aaa266b60e00 100644 --- a/torch/_inductor/remote_cache.py +++ b/torch/_inductor/remote_cache.py @@ -136,7 +136,7 @@ class RemoteCachePassthroughSerde(RemoteCacheSerde[_T, _T]): # To write (`put`), the RemoteCache takes data, uses the RemoteCacheSerde to # convert it for the backend and passes it to the backend. 
# -# Conversly when reading (`get`), the RemoteCache takes data from the backend, +# Conversely when reading (`get`), the RemoteCache takes data from the backend, # uses the RemoteCacheSerde to convert it and returns it. # # The RemoteCacheBackend is generic on _U - which is the type of data the diff --git a/torch/_inductor/runtime/benchmarking.py b/torch/_inductor/runtime/benchmarking.py index 74df6ed671ef..5c9cc60bef87 100644 --- a/torch/_inductor/runtime/benchmarking.py +++ b/torch/_inductor/runtime/benchmarking.py @@ -230,7 +230,7 @@ class InductorBenchmarker(TritonBenchmarker): in milliseconds. An estimated duration is calculated based on the values of `memory_warmup_iters` and `benchmark_iters`, along with the estimated runtime of `_callable` and various other factors, and we then shrink - `benchmark_iters` to fit in the alloted maximum duration. + `benchmark_iters` to fit in the allotted maximum duration. - **kwargs: Additional kwargs that may be passed to the fallback. Returns: diff --git a/torch/_inductor/runtime/coordinate_descent_tuner.py b/torch/_inductor/runtime/coordinate_descent_tuner.py index b41ca81ebdfc..413dfaf09d06 100644 --- a/torch/_inductor/runtime/coordinate_descent_tuner.py +++ b/torch/_inductor/runtime/coordinate_descent_tuner.py @@ -208,7 +208,7 @@ class CoordescTuner: """ Check if candidate_config is better than best_config. - Return a touple of (compare_result, candidate_timing). + Return a tuple of (compare_result, candidate_timing). compare_result is true iff candidate_config is better. """ log.debug("Try config %s", candidate_config) diff --git a/torch/_inductor/runtime/runtime_utils.py b/torch/_inductor/runtime/runtime_utils.py index bf5b24a9fe56..21cd5987f8f4 100644 --- a/torch/_inductor/runtime/runtime_utils.py +++ b/torch/_inductor/runtime/runtime_utils.py @@ -25,8 +25,8 @@ def conditional_product(*args: int) -> int: return functools.reduce(operator.mul, [x for x in args if x]) -def ceildiv(numer: int, denom: int) -> int: - return -(numer // -denom) +def ceildiv(number: int, denom: int) -> int: + return -(number // -denom) def is_power_of_2(n: int) -> bool: @@ -155,7 +155,7 @@ dynamo_timed = torch._dynamo.utils.dynamo_timed # type: ignore[has-type] def triton_hash_to_path_key(key: str) -> str: # In early versions of Triton, the hash is directly used in the path name. # Later, the hash is converted to base64 before being used in the path name. - # Later, the base64 convertion was replaced to the base32 + # Later, the base64 conversion was replaced to the base32 # # This code tries to import _base64 and falls back to _base32 if _base64 is unavailable. # diff --git a/torch/_inductor/runtime/triton_helpers.py b/torch/_inductor/runtime/triton_helpers.py index 1a421f5239a8..cfd708bcf4bf 100644 --- a/torch/_inductor/runtime/triton_helpers.py +++ b/torch/_inductor/runtime/triton_helpers.py @@ -202,7 +202,7 @@ def online_softmax_combine(lhs_max, lhs_sum, rhs_max, use_fast_math: tl.constexp # Should be # out_sum = lhs_sum * lhs_scale + rhs_sum * rhs_scale - # but since rhs_sum is all 1, we can simpliy it. + # but since rhs_sum is all 1, we can simplify it. 
out_sum = lhs_sum * lhs_scale + rhs_scale return out_max, out_sum @@ -460,7 +460,7 @@ def exclusive_scan_decoupled_lookback_64(scratch_base, block_value, index, combi block_value: Scalar value for this block, must be 64-bits wide index: Scalar index of this block relative to the current scan combine_fn: Function ``(value, value) -> value`` which is scanned over - init: Scalar value equal to the identiy of combine_fn + init: Scalar value equal to the identity of combine_fn """ # Publish block sum so subsequent blocks don't get stuck waiting for us if index > 0: diff --git a/torch/_inductor/scheduler.py b/torch/_inductor/scheduler.py index 7bb476f178ee..687ba95e1dd1 100644 --- a/torch/_inductor/scheduler.py +++ b/torch/_inductor/scheduler.py @@ -477,7 +477,7 @@ class BaseSchedulerNode: buf_name = buf_to_be_inplaced.get_name() # Dedup read/writes with equivalent indices # TODO - would be nice if we could just cache accesses on ReadWrites, - # and inforce variant that this class & members are functional.. + # and enforce the invariant that this class & members are functional. deps: OrderedSet[Dep] = OrderedSet() for user in buf_to_be_inplaced.users: user_node = user.node @@ -1079,7 +1079,7 @@ class SchedulerNode(BaseSchedulerNode): # TODO(shunting) if this cause compilation time increase when # enabling LOAF by default, try just clearing the specific cache - # entry by using a customized cache implemetation rather than + # entry by using a customized cache implementation rather than # lru_cache. SIMDScheduling.candidate_tilings.cache_clear() @@ -3325,7 +3325,7 @@ class Scheduler: Return true if fusing the two nodes can potentially increasing peak memory. The implementation is more like a heuristic since we don't really know if we are at peak - or not when trying to fuse these two ndoes. The order of nodes may change later which makes the + or not when trying to fuse these two nodes. The order of nodes may change later which makes the peak memory estimation hard. Here is how we decide the LOWER BOUND of extra memory allocation if we fuse these 2 nodes: @@ -3365,7 +3365,7 @@ class Scheduler: try: memory_overhead += int(key[2]) except ValueError: - # not an interger. Fallback is to fuse + # not an integer. Fallback is to fuse return False bw_saving = self.score_fusion_memory(node1, node2) @@ -3470,7 +3470,7 @@ class Scheduler: """ Right now just greedily reorder the loop of node1 to be compatible with node2, but ideally we should have some heuristics to reorder the loop for node2 - to be compatibile with node1 if that's more efficient. + to be compatible with node1 if that's more efficient. """ # TODO Don't do loop reordering for CPU for now. @@ -3569,7 +3569,7 @@ class Scheduler: # potential bad cache behavior and shared memory use. # we also want to avoid benchmarking reliably unprofitable fusions like downcasts from fp32 -> fp16 inside kernel. # allowing gathers by allowing increasing write_bytes by small factor - # TODO - make configurable per input, for insance, bias can fuse fp32 -> fp16 profitably + # TODO - make configurable per input, for instance, bias can fuse fp32 -> fp16 profitably BYTES_THRESHOLD_MULTIPLIER = 1.1 if read_bytes > (write_bytes * BYTES_THRESHOLD_MULTIPLIER): @@ -4436,7 +4436,7 @@ class Scheduler: ) -> list[BaseSchedulerNode]: """ Reorder nodes to minimize the number of partitions via a bfs - topological sort. This is the optimal reodering such that the + topological sort. This is the optimal reordering such that the number of partitions cannot be reduced further.
This may be sub-optimal for other metrics such as peak memory. This does not change relative orders of two cudagraphable nodes, nor the diff --git a/torch/_inductor/select_algorithm.py b/torch/_inductor/select_algorithm.py index 30dbf0dbc6b4..d00eca1304f0 100644 --- a/torch/_inductor/select_algorithm.py +++ b/torch/_inductor/select_algorithm.py @@ -375,7 +375,7 @@ class TritonTemplateKernel(TritonKernel): self.template_out: Optional[str] = None self.ops_handler: Optional[V.WrapperHandler] = None # type: ignore[name-defined] - # Whe caching is enabled, the generated code is not dependent on the input nodes names, or + # When caching is enabled, the generated code is not dependent on the input nodes names, or # symbolic sizes names. # However, some of the variables returned by generate_and_load that are computed during the # triton template expansions (code generation) are dependent on those. diff --git a/torch/_inductor/sizevars.py b/torch/_inductor/sizevars.py index f8202dcb6d51..a506b915e9a8 100644 --- a/torch/_inductor/sizevars.py +++ b/torch/_inductor/sizevars.py @@ -774,11 +774,11 @@ class SizeVarAllocator: return False if is_first: - # first ModularIndexing should conatins a nested ModularIndex + # first ModularIndexing should contain a nested ModularIndexing if not isinstance(x, ModularIndexing): return False else: - # second ModularIndexing should constains a non-negative + # second ModularIndexing should contain a non-negative # symbol if not isinstance(x, sympy.Symbol) or not self.statically_known_geq( x, 0 @@ -809,7 +809,7 @@ class SizeVarAllocator: ) -> Union[bool, tuple[sympy.Expr, sympy.Expr]]: """ Expand the FloorDiv to the entire expression so that the expression may - be simplfied. + be simplified. E.g., for a 2D contiguous tensor with shape [a, 2 * b], and index variables x1, x2, index expression 'x1 * 2b + x2' can be easily combined. diff --git a/torch/_inductor/standalone_compile.py b/torch/_inductor/standalone_compile.py index 93af8cc3209d..e49e8774a2c5 100644 --- a/torch/_inductor/standalone_compile.py +++ b/torch/_inductor/standalone_compile.py @@ -74,7 +74,7 @@ class CompiledArtifact: key = cache_info.aot_autograd_artifacts[0] if format == "binary": - # cant assert that it is a file since it might not exist yet + # can't assert that it is a file since it might not exist yet assert not os.path.isdir(path) from torch.utils._appending_byte_serializer import BytesWriter @@ -118,7 +118,7 @@ class CompiledArtifact: ) -> CompiledArtifact: with dynamo_timed("CompiledArtifact.load"): if format == "binary": - # cant assert that it is a file since it might not exist yet + # can't assert that it is a file since it might not exist yet assert not os.path.isdir(path) with open(path, "rb") as file: artifacts = file.read() diff --git a/torch/_inductor/tiling_utils.py b/torch/_inductor/tiling_utils.py index bec7cf8db648..4a1febe08e99 100644 --- a/torch/_inductor/tiling_utils.py +++ b/torch/_inductor/tiling_utils.py @@ -300,7 +300,7 @@ class NodeSplitGetter: # initially, we are just going to do a single reduction split since # reduction tiling is off by default. even if we miss a reduction split, # we can recover it in the split var analysis. - # TODO: an earlier version fo this code tried to iteratively try the maximum number + # TODO: an earlier version of this code tried to iteratively try the maximum number # of split vars, by iterating over both pointwise and reduction. but not worth # the complexity yet.
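Aside: the sizevars.py docstring patched above gives the `x1 * 2b + x2` example without spelling out the identity that lets the expression collapse. The snippet below is a hedged plain-Python sketch of that identity; the sizes and variable names are illustrative and are not inductor code.

# For a contiguous tensor of shape (a, 2*b), a flat index i decomposes into
# x1 = i // (2*b) and x2 = i % (2*b); recombining as x1 * (2*b) + x2 gives
# back i, which is why the two-variable index expression can be simplified
# into a single flat index once the FloorDiv is expanded over it.
a, b = 3, 4            # example sizes, so each row holds 2*b = 8 elements
row = 2 * b
for i in range(a * row):
    x1, x2 = divmod(i, row)
    assert x1 * row + x2 == i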
@@ -336,7 +336,7 @@ class NodeSplitGetter: ) self.pw_split_options[len(new_split)].add(new_split) - # if for whatever reason we couldnt split above, return default split + # if for whatever reason we couldn't split above, return default split return ((self.pointwise_numel,), (self.red_numel,)) def try_split(self, pw: Split, red: Split) -> Optional[tuple[Split, Split]]: diff --git a/torch/_inductor/triton_bundler.py b/torch/_inductor/triton_bundler.py index 6fb142477617..b5ccb873e33f 100644 --- a/torch/_inductor/triton_bundler.py +++ b/torch/_inductor/triton_bundler.py @@ -109,7 +109,7 @@ class TritonBundler: _static_autotuners: Optional[list[StaticallyLaunchedAutotuner]] = None # __grp__kernel_name.json contains metadata with source code paths - # we use this as sentinal value for search and replace + # we use this as sentinel value for search and replace _REPLACE_BYTES: bytes = b"[REPLACE]" @staticmethod diff --git a/torch/_inductor/utils.py b/torch/_inductor/utils.py index d8064ecb8158..197764dc130e 100644 --- a/torch/_inductor/utils.py +++ b/torch/_inductor/utils.py @@ -381,17 +381,17 @@ def unique(it: Iterable[_T]) -> ValuesView[_T]: def ceildiv( - numer: Union[int, sympy.Expr], denom: Union[int, sympy.Expr] + number: Union[int, sympy.Expr], denom: Union[int, sympy.Expr] ) -> Union[int, sympy.Expr]: - if isinstance(numer, sympy.Expr) or isinstance(denom, sympy.Expr): - return CeilDiv(sympy.sympify(numer), sympy.sympify(denom)) + if isinstance(number, sympy.Expr) or isinstance(denom, sympy.Expr): + return CeilDiv(sympy.sympify(number), sympy.sympify(denom)) # TODO: There is a bug in a call to this function, to repro: # python benchmarks/dynamo/huggingface.py --inductor -d cuda --accuracy # --amp --only YituTechConvBert --dynamic-shapes - assert isinstance(numer, int) and isinstance(denom, int), ( - f"{numer}: {type(numer)}, {denom}: {type(denom)}" + assert isinstance(number, int) and isinstance(denom, int), ( + f"{number}: {type(number)}, {denom}: {type(denom)}" ) - return runtime_ceildiv(numer, denom) + return runtime_ceildiv(number, denom) def _type_of(key: Optional[torch.dtype]) -> str: @@ -980,7 +980,7 @@ def get_first_incompatible_cudagraph_node( and torch._C.Tag.cudagraph_unsafe in node.target.tags ): # skip cudagraph if a cudagraph_unsafe op is detected. - # graph_partition helps by spliting on this cudagraph_unsafe + # graph_partition helps by splitting on this cudagraph_unsafe # op and cudagraphifying the subgraphs. return node diff --git a/torch/export/unflatten.py b/torch/export/unflatten.py index 54e698822b30..210b5755f9e6 100644 --- a/torch/export/unflatten.py +++ b/torch/export/unflatten.py @@ -524,7 +524,7 @@ class UnflattenedModule(torch.nn.Module): if self.flat_args_adapter is None: raise TypeError( - "There is no flat args adapter sepcified. " + "There is no flat args adapter specified. " "Are you sure you are calling this with the right arguments? " ) else: