Mirror of https://github.com/pytorch/pytorch.git
Synced 2025-10-23 14:59:34 +08:00

Compare commits: cpp-docs-d ... ciflow/ind
77 Commits
Commits:
f7a13a6dfc, 98aff4e90e, 6cdf27661c, ecd4542830, d0b7578e17, e22a5ecb45, 9c8b449159, 613b0adb13,
12534b44b3, 831ad1f70b, 1eed70b417, ef230fcd2d, c283224b77, 9bd3d28afa, 1c2ad5fe9d, 5c57109190,
e3b463d216, e6bab78b38, b9f7dd6e77, 2b6c6e3d64, e289d12b73, bb2c89fc3b, d530a21122, 27fb875a70,
a5ecc01ef8, f7b574c862, 5d6924f0a4, cf7a543013, 55043b3ada, 7b61d461ab, 0ff1ddecad, 0f9b4aae53,
d6886407ef, c37103c16d, d7ecbf7243, 8080febac5, 408ab373f8, 23b1bbb810, 8268316590, a56e4e5fc7,
dac4caf77b, 1710c27341, 92fde5bdc5, eafc6437d6, 4a90ec1387, d9ab4e3ade, ce372a06e9, 67f0059726,
55473096e2, 2a15541b5d, 16875b228b, 808af988ae, 61c298ed56, 2d38369728, 74bd12415e, a8c0c0263c,
e3f14cdafa, b03cc2d9c8, 99917e659b, def4476b6b, 84bb803719, 8266849bda, f766c8ceea, b3cf7bc86d,
6a4a8b453d, cab14368a7, 0b87f606ca, 25f68b6b5b, 334489bfd0, ada5d90c25, a1e3e2026b, 8dc4932b0c,
1505d7a461, 1c842a9686, 692827ad29, 786d4646ee, 70df0aed59
@@ -1202,12 +1202,6 @@ exclude_patterns = [
'torch/_inductor/fx_passes/serialized_patterns/**',
'torch/_inductor/autoheuristic/artifacts/**',
'torch/utils/model_dump/preact.mjs',
# These files are all grandfathered in, feel free to remove from this list
# as necessary
# NOTE: remove the patterns in the order they are listed
'aten/src/ATen/native/[a-pA-P]*/**',
'aten/src/ATen/[a-mA-M]*/**',
'test/**',
]
init_command = [
'python3',

@@ -94,11 +94,11 @@ struct PinnedReserveSegment {
struct TORCH_API HostStats {
// COUNT: total allocations (active)
Stat active_requests;
// SUM: bytes allocated/reserved by this memory alocator. (active)
// SUM: bytes allocated/reserved by this memory allocator. (active)
Stat active_bytes;
// COUNT: total allocations (active + free)
Stat allocations;
// SUM: bytes allocated/reserved by this memory alocator. This accounts
// SUM: bytes allocated/reserved by this memory allocator. This accounts
// for both free and in-use blocks.
Stat allocated_bytes;

@@ -127,7 +127,7 @@ struct alignas(64) HostStatsStaged {
// COUNT: total allocations (active + free)
// LOCK: access to this stat is protected by the allocator's blocks_mutex_
Stat allocations;
// SUM: bytes allocated/reserved by this memory alocator. This accounts
// SUM: bytes allocated/reserved by this memory allocator. This accounts
// for both free and in-use blocks.
Stat allocated_bytes;
// COUNT: number of allocations per bucket (active)

@@ -455,7 +455,7 @@ struct CachingHostAllocatorImpl {
}

void resetAccumulatedStats() {
// Reseting accumulated memory stats requires concurrently holding both the
// Resetting accumulated memory stats requires concurrently holding both the
// free list mutexes and the blocks mutex. Previously, this was only done in
// empty_cache function.
for (size_t i = 0; i < free_list_.size(); ++i) {

@@ -482,7 +482,7 @@ struct CachingHostAllocatorImpl {
}

void resetPeakStats() {
// Reseting peak memory stats requires concurrently holding both the
// Resetting peak memory stats requires concurrently holding both the
// free list mutexes and the blocks mutex. Previously, this was only done in
// empty_cache function.
for (size_t i = 0; i < free_list_.size(); ++i) {
@@ -3,7 +3,7 @@

namespace at {

// Re-declaring 'DimVector' type and size inside 'at' namespace.
// Redeclaring 'DimVector' type and size inside 'at' namespace.
// This is done to avoid modifying every use into their 'c10'
// equivalent.

@@ -16,7 +16,7 @@ _GeneratorRegister::_GeneratorRegister(const GeneratorFuncType& func) {

TORCH_WARN_DEPRECATION(
"REGISTER_GENERATOR_PRIVATEUSE1 is deprecated. \
Please derive PrivateUse1HooksInterface to implememt getNewGenerator instead.")
Please derive PrivateUse1HooksInterface to implement getNewGenerator instead.")

TORCH_CHECK(
!GetGeneratorPrivate().has_value(),

@@ -149,7 +149,7 @@
* First, keep in mind that we assume that boxed containers will
* have to deal with `IValue` (e.g. `c10::List`). In this context,
* what may be happening is that `IValue` doesn't store internally
* your type `T`. Instead, it constructs a type new `T` everytime
* your type `T`. Instead, it constructs a type new `T` every time
* you try to get `T` for it (see `IListRef<at::OptinalTensorRef>`).
*/

@@ -186,7 +186,7 @@ class IListRef;
* This macro is useful because it allows us to handle different
* types (that correspond to different tags) to be implemented
* only once. We can do it even when the implementation of the
* different tags aren't syntatically the same, by dispatching
* different tags aren't syntactically the same, by dispatching
* it to a function (e.g. `ImplT::<dispatch-function>(this_)`).
*/
#define TORCH_ILISTREF_UNWRAP(TAG, BODY) \

@@ -42,7 +42,7 @@ class IListRefTagImplBase<IListRefTag::Unboxed, T, ListElemT> {
/*
* We have these function (besides the `unwrap`s above) because the
* implementation for both `IListRef::operator[]` and `IListRefIterator::operator*`
* weren't syntatically equal for the existing tags at the time
* weren't syntactically equal for the existing tags at the time
* (`Unboxed` and `Boxed`).
*/
static IListRefConstRef<T> front(const list_type& lst) {

@@ -12,7 +12,7 @@ namespace at {
// in order. This is most commonly used in autogenerated code,
// where it is convenient to have a function that can uniformly
// take arguments of different types. If your arguments
// are homogenous consider using a std::initializer_list instead.
// are homogeneous consider using a std::initializer_list instead.
//
// For examples of this in use, see torch/csrc/utils/variadic.h
template <typename F>

@@ -148,7 +148,7 @@ struct TORCH_API ClassType : public NamedType {

void checkNotExist(const std::string& name, const std::string& what) const;

// Attributes are stored in a specific slot at runtime for effiency.
// Attributes are stored in a specific slot at runtime for efficiency.
// When emitting instructions we specify the slot so that attribute access is
// a constant lookup
std::optional<size_t> findAttributeSlot(const std::string& name) const {

@@ -412,7 +412,7 @@ struct TORCH_API ClassType : public NamedType {
// Holds method attributes
std::weak_ptr<CompilationUnit> compilation_unit_;

// Holds all atrributes, attribute details are found on ClassAttribute
// Holds all attributes, attribute details are found on ClassAttribute
std::vector<ClassAttribute> attributes_;
// Construct mirroring attributes_, only around due to the fact that `containedTypes()` method returns an ArrayRef.
// Never fill this without using the appropriate provideNewClassAttribute method
@@ -111,7 +111,7 @@ void Dispatcher::waitForDef(const FunctionSchema& schema) {
TORCH_INTERNAL_ASSERT(r,
"Expected main interpreter to define ", schema.operator_name(),
", but this didn't happen within timeout. Are you trying to load "
"different models in the same torchdeploy/multipy instance? You "
"different models in the same torchdeploy/multipy instance? You " // codespell:ignore
"must warmup each interpreter identically, e.g., import all "
"the same dependencies.");
}

@@ -129,7 +129,7 @@ void Dispatcher::waitForImpl(const OperatorName& op_name, std::optional<c10::Dis
TORCH_INTERNAL_ASSERT(r,
"Expected main interpreter to implement ", dk, " for ", op_name,
", but this didn't happen within timeout. Are you trying to load "
"different models in the same torchdeploy/multipy instance? You "
"different models in the same torchdeploy/multipy instance? You " // codespell:ignore
"must warmup each interpreter identically, e.g., import all "
"the same dependencies.");
}

@@ -531,7 +531,7 @@ int64_t Dispatcher::sequenceNumberForRunningRecordFunction(DispatchKey dispatchK

// Note: this records a sequence number for both Autograd keys, and for
// non-Autograd keys where the dispatchKeySet still contains an autograd key.
// This means that we might collect the same sequence nubmer two different
// This means that we might collect the same sequence number two different
// events if they all occurred above Autograd and still had the Autograd
// dispatch key in the dispatch key set.
// However, this usually doesn't happen: normally the first call will

@@ -222,7 +222,8 @@ class TORCH_API Dispatcher final {
return backendFallbackKernels_[dispatch_ix].kernel.isValid();
}

// Used by torchdeploy/multipy for multiple interpreters racing.
// Used by torchdeploy/multipy for multiple // codespell:ignore: multipy
// interpreters racing.
void waitForDef(const FunctionSchema& schema);
void waitForImpl(
const OperatorName& op_name,

@@ -414,7 +415,7 @@ class TORCH_API Dispatcher final {
std::unique_ptr<detail::RegistrationListenerList> listeners_;

// This condition variable gets notified whenever we add a new def/impl to the
// dispatch table. This is primarily used by multipy/torchdeploy, when
// dispatch table. This is primarily used by multiply/torchdeploy, when
// we have multiple interpreters trying to register to the dispatch table.
// In this situation, whenever the non-primary interpreter would have tried
// to register to the dispatch table, instead it will check to see if the

@@ -585,7 +586,7 @@ class TORCH_API OperatorHandle {

// We need to store this iterator in order to make
// Dispatcher::cleanup() fast -- it runs a lot on program
// termination (and presuambly library unloading).
// termination (and presumably library unloading).
std::list<Dispatcher::OperatorDef>::iterator operatorIterator_;
};

@@ -261,7 +261,7 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) {
//
// There are 2 cases
// 1. something like 'aten::items.str(Dict(str, t) self) -> ((str, t)[])'.
// without the extra parenthesis, the c++ schem parser can not parse it.
// without the extra parenthesis, the c++ scheme parser can not parse it.
// 2. something like '-> ((str, str))'. Need extra parenthesis so the return
// type is a single tuple rather than two strings.
// PR (https://github.com/pytorch/pytorch/pull/23204) has more context about
@@ -1176,7 +1176,7 @@ struct TORCH_API IValue final {
using HashIdentityIValueMap =
std::unordered_map<IValue, IValue, HashIdentityIValue, CompIdentityIValues>;

// Chechs if this and rhs has a subvalues in common.
// Checks if this and rhs has a subvalues in common.
// [t1,t2] and [t2, t3] returns true.
bool overlaps(const IValue& rhs) const;

@@ -990,7 +990,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
std::unique_lock<std::mutex> lock(mutex_);
if (completed_) {
// This should be rare and shouldn't cause log spew. Its important to
// log errors and thats why we have this log here.
// log errors and that's why we have this log here.
std::string msg = c10::str(
"Skipping setting following error on the Future since "
"it is already marked completed (this is not necessarily "

@@ -1501,7 +1501,7 @@ struct C10_EXPORT ivalue::Object final : c10::intrusive_ptr_target {
// However, the CompilationUnit holds ownership of the type's graphs, so
// inserting a constant object into a Graph would create a reference cycle if
// that constant object held a shared_ptr to its CU. For these objects we
// instatiate them with non-owning references to its CU
// instantiate them with non-owning references to its CU
Object(WeakOrStrongTypePtr type, size_t numSlots) : type_(std::move(type)) {
slots_.resize(numSlots);
}

@@ -374,7 +374,7 @@ struct TORCH_API SymbolicShape {
// Unranked shape constructor.
SymbolicShape() : dims_(std::nullopt) {}

// Known rank but unknown dimentions.
// Known rank but unknown dimensions.
SymbolicShape(std::optional<size_t> rank) : dims_(std::nullopt) {
if(!rank) {
return;

@@ -891,10 +891,10 @@ struct TORCH_API ListType

// global singleton
// Given an inner type T and an identifier,
// this function wil return the global singleton type pointer
// this function will return the global singleton type pointer
// the type List<T>.
// The extra "identifier" argument is needed beccause we have multiple container types
// that all re-use this function (List<T>, array<T, N>, etc.)
// The extra "identifier" argument is needed because we have multiple container types
// that all reuse this function (List<T>, array<T, N>, etc.)
static TypePtr get(const std::string& identifier, TypePtr inner);

// common cast List[Tensor]

@@ -992,7 +992,7 @@ struct TORCH_API DictType : public SharedType {
// this function will return the global singleton type pointer
// the type List<T>.
// The extra "identifier" argument is needed because we have multiple container types
// that all re-use this function (Dict<K, V> and unordered_map<K, V>)
// that all reuse this function (Dict<K, V> and unordered_map<K, V>)
static TypePtr get(const std::string& identifier, TypePtr key, TypePtr val);

private:

@@ -21,7 +21,7 @@ namespace c10 {

namespace detail {
// The first argument of the schema might be of type DispatchKeySet, in which case we remove it.
// We do this because every argument in a function schema is expected to be convertable
// We do this because every argument in a function schema is expected to be convertible
// to an ivalue, but DispatchKeySet is not a type we want the jit to be aware of.
// See Note [Plumbing Keys Through The Dispatcher]
template<class KernelFunctor>
@@ -172,7 +172,7 @@ VaryingShape<Stride> TensorType::computeStrideProps(
// The logic below follows what TensorIterator uses in its logic:
// 1. Fast_set_up is the short-cut to identify a. channels_last and
// b. contiguous format, which is what we have in the below logic.
// 2. In more generla cases, it does best effort to preserve permutatoin.
// 2. In more general cases, it does best effort to preserve permutatoin.
if (is_channels_last_strides_2d(sizes, strides) || is_channels_last_strides_3d(sizes, strides)) {
// case 1.a. short cut channels last
std::iota(stride_indices.rbegin() + 1, stride_indices.rend() - 1, 2);

@@ -679,7 +679,7 @@ TORCH_API bool elementTypeCanBeInferredFromMembers(const TypePtr& elem_type) {
return false;
}
if (elem_type->kind() == AnyType::Kind) {
// List of Any can contains heterogenous types
// List of Any can contains heterogeneous types
return false;
}
return true;

@@ -234,7 +234,7 @@ class Vectorized<c10::Half> : public Vectorized16<
vshlq_u16(vandq_u16(is_zero_vec, vdupq_n_u16(1)), shift);
return vaddvq_u16(bits_vec);
#else // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// use known working implmentation.
// use known working implementation.
__at_align__ value_type tmp[size()];
store(tmp);
int mask = 0;

@@ -1740,7 +1740,7 @@ Vectorized<int16_t> inline shift_256_16(

// Control masks for shuffle operation, treating 256 bits as an
// array of 16-bit elements, and considering pairs of neighboring
// elements. Specifially, a mask named "ctl_M_N" (M,N in [0,1], and
// elements. Specifically, a mask named "ctl_M_N" (M,N in [0,1], and
// M!=N) is set so that shuffle will move element with index M from
// input pair into element with index N in output pair, and element
// with index M in output pair will be set to all 0s.

@@ -1875,7 +1875,7 @@ Vectorized<T> inline shift_256_8(

// Control masks for shuffle operation, treating 256 bits as an
// array of 8-bit elements, and considering quadruples of
// neighboring elements. Specifially, a mask named "ctl_M_N" (M,N
// neighboring elements. Specifically, a mask named "ctl_M_N" (M,N
// in [0,1,2,3], and M!=N) is set so that shuffle will move element
// with index M from input quadruple into element with index N in
// output quadruple, and other elements in output quadruple will be

@@ -143,7 +143,7 @@ class Vectorized<double> {
const Vectorized<double>& a,
const Vectorized<double>& b,
const Vectorized<double>& mask) {
// the mask used here returned by comparision of vec256
// the mask used here returned by comparison of vec256

return {
vec_sel(a._vec0, b._vec0, mask._vecb0),

@@ -142,7 +142,7 @@ class Vectorized<float> {
const Vectorized<float>& a,
const Vectorized<float>& b,
const Vectorized<float>& mask) {
// the mask used here returned by comparision of vec256
// the mask used here returned by comparison of vec256
// assuming this we can use the same mask directly with vec_sel
return {
vec_sel(a._vec0, b._vec0, mask._vecb0),

@@ -202,7 +202,7 @@ class Vectorized<int16_t> {
const Vectorized<int16_t>& a,
const Vectorized<int16_t>& b,
const Vectorized<int16_t>& mask) {
// the mask used here returned by comparision of vec256
// the mask used here returned by comparison of vec256
// assuming this we can use the same mask directly with vec_sel
// warning intel style mask will not work properly
return {

@@ -155,7 +155,7 @@ class Vectorized<int32_t> {
const Vectorized<int32_t>& a,
const Vectorized<int32_t>& b,
const Vectorized<int32_t>& mask) {
// the mask used here returned by comparision of vec256
// the mask used here returned by comparison of vec256
// assuming this we can use the same mask directly with vec_sel
// warning intel style mask will not work properly
return {
@@ -119,7 +119,7 @@ class Vectorized<int64_t> {
const Vectorized<int64_t>& a,
const Vectorized<int64_t>& b,
const Vectorized<int64_t>& mask) {
// the mask used here returned by comparision of vec256
// the mask used here returned by comparison of vec256

return {
vec_sel(a._vec0, b._vec0, mask._vecb0),

@@ -397,7 +397,7 @@ inline Vectorized<bool> operator&&(
const __m512i* other_ = reinterpret_cast<const __m512i*>(other.as_bytes());
__m512i out = _mm512_and_si512(*self_, *other_);
Vectorized<bool> ret;
// We do not have a constructer that takes __m512i, so we need to memcpy
// We do not have a constructor that takes __m512i, so we need to memcpy
std::memcpy(ret, &out, ret.size() * sizeof(bool));
return ret;
}

@@ -498,8 +498,8 @@ static inline Vectorized<T> binary_fp8_op_as_fp32(

// Refer to
// https://github.com/pytorch/pytorch/pull/153364#discussion_r2086509353 FP8 +,
// -, *, /, planed to be deleted in the future and here is just to make compiler
// happy
// -, *, /, planned to be deleted in the future and here is just to make
// compiler happy
Vectorized<Float8_e4m3fn> inline operator+(
const Vectorized<Float8_e4m3fn>& a,
const Vectorized<Float8_e4m3fn>& b) {

@@ -585,8 +585,8 @@ class Vectorized<Float8_e5m2> : public Vectorizedf8<Float8_e5m2> {

// Refer to
// https://github.com/pytorch/pytorch/pull/153364#discussion_r2086509353 FP8 +,
// -, *, /, planed to be deleted in the future and here is just to make compiler
// happy
// -, *, /, planned to be deleted in the future and here is just to make
// compiler happy
Vectorized<Float8_e5m2> inline operator+(
const Vectorized<Float8_e5m2>& a,
const Vectorized<Float8_e5m2>& b) {

@@ -1852,7 +1852,7 @@ Vectorized<T> inline shift_512_8(

// Control masks for shuffle operation, treating 512 bits as an
// array of 8-bit elements, and considering pairs of neighboring
// elements. Specifially, a mask named "ctl_M_N" (M,N in [0,1], and
// elements. Specifically, a mask named "ctl_M_N" (M,N in [0,1], and
// M!=N) is set so that shuffle will move element with index M from
// input pair into element with index N in output pair, and element
// with index M in output pair will be set to all 0s.

@@ -1958,7 +1958,7 @@ void scaled_gemm(
ScalarType result_dtype,
bool use_fast_accum,
const std::optional<Tensor>& alpha) {
// Note: see `cublasCommonArgs` for various non-intuitive manupulations
// Note: see `cublasCommonArgs` for various non-intuitive manipulations
// of input arguments to this function.
const auto computeType = CUBLAS_COMPUTE_32F;
const auto scaleType = CUDA_R_32F;

@@ -311,7 +311,7 @@ CUDAGraph::~CUDAGraph() {
// There are recent HIP changes where hipGraphExecDestroy doesn't immediately free memory.
// They wait for next sync point in order to free the memory, this is to ensure that all
// hipGraphLaunch are finished before we release any memory. This feature was enabled in rocm6.2.
// We need to ensure all async opreations finish before deleting the object.
// We need to ensure all async operations finish before deleting the object.
#if (defined(USE_ROCM) && ROCM_VERSION >= 60200)
if (capture_dev_ != UNDEFINED_DEVICE) // check if capture_dev_ contains the real device id
{

@@ -179,7 +179,7 @@ CuSparseSpMatCsrDescriptor::CuSparseSpMatCsrDescriptor(const Tensor& input, int6
batch_offset * values_batch_stride * values.itemsize(),
index_type, // data type of row offsets index
index_type, // data type of col indices
CUSPARSE_INDEX_BASE_ZERO, // base index of row offset and col indes
CUSPARSE_INDEX_BASE_ZERO, // base index of row offset and col index
value_type // data type of values
));

@@ -137,7 +137,7 @@ struct CUDACachingHostAllocatorImpl
void free_block_slowpath(Block* block) {
auto start = std::chrono::steady_clock::now();
// Users may change the allocator config at will. torch unit tests do this.
// However, allocations using cudaHostRegister should use corresonding
// However, allocations using cudaHostRegister should use corresponding
// cudaHostUnregister and similarly for cudaHostAlloc / cudaFreeHost.
void* ptr = block->ptr_;
bool use_register = false;
@@ -10,7 +10,7 @@ namespace at::cuda {
//
// A caching allocator for CUDA host allocations (pinned memory).
//
// This provides a drop-in replacement for THCudaHostAllocator, which re-uses
// This provides a drop-in replacement for THCudaHostAllocator, which reuses
// freed pinned (page-locked) memory allocations. This avoids device
// synchronizations due to cudaFreeHost calls.
//

@@ -26,7 +26,7 @@ inline TORCH_CUDA_CPP_API at::HostAllocator* getCachingHostAllocator() {
}

// Records an event in the specified stream. The allocation corresponding to the
// input `ptr`/`ctx` will not be re-used until the event has occurred.
// input `ptr`/`ctx` will not be reused until the event has occurred.
C10_DEPRECATED_MESSAGE(
"at::cuda::CachingHostAllocator_recordEvent(...) is deprecated. Please use at::getHostAllocator(at::kCUDA)->record_event(...) instead.")
inline TORCH_CUDA_CPP_API bool CachingHostAllocator_recordEvent(

@@ -4,7 +4,7 @@
#include <ATen/cuda/CUDAConfig.h>

// NOTE: These templates are intentionally not defined in this header,
// which aviods re-compiling them for each translation unit. If you get
// which avoids re-compiling them for each translation unit. If you get
// a link error, you need to add an explicit instantiation for your
// types in cub.cu

@@ -93,7 +93,7 @@ struct IndexToOffset {
}
};

// Uses dynamic (runtime) instead of static (compiletime) dims
// Uses dynamic (runtime) instead of static (compile time) dims
template <typename T, typename IndexType>
struct IndexToOffset<T, IndexType, -1> {
static inline __host__ __device__ IndexType get(

@@ -32,7 +32,7 @@ static inline void launch_jitted_vectorized_kernel_dynamic(

// Different kernels are compiled depending on what we're vectorizing up to (1, 2 or 4 elements)
// fn_ptr is set to the appropriate function based on the vec size and GPU used
// TODO: Memory use can probably be optimized by re-using kernels across GPUs with
// TODO: Memory use can probably be optimized by reusing kernels across GPUs with
// the same compute capability

std::string f_inputs_type_str = at::cuda::jit::typeName(common_dtype);

@@ -38,7 +38,7 @@ GemmTunableOp_float_NT,nt_25088_4096_64,1219,1.262
GemmTunableOp_float_NT,nt_4096_4096_64,1216,0.033
```

Note the "Validator" lines. If you change a library verison, or ROCm version, or PyTorch version, TunableOp will detect
Note the "Validator" lines. If you change a library version, or ROCm version, or PyTorch version, TunableOp will detect
this and reject the tunings file because the prior tunings are likely affected by other software changes.

The remaining lines are the tuned solutions for each TunableOp encountered during your execution. Each line consists of

@@ -235,7 +235,7 @@ class TunableOp {
// numeric check option is controlled by non-static env var, so check it once per tuned operator
bool do_numerics_check = ctx->IsNumericsCheckEnabled();

// calcaulte a reference answer for numerical check
// calculate a reference answer for numerical check
if (do_numerics_check) {
reference_params = params->DeepCopy(false);
TORCH_CHECK(ops_[ResultEntry::Default()]->Call(reference_params) == OK);

@@ -12,7 +12,7 @@ namespace at {

// AcceleratorHooksInterface is a shared interface provided by all
// accelerators to allow generic code.
// This inferface is hook-based as it corresponds to all the functions
// This interface is hook-based as it corresponds to all the functions
// that are going to be called in a generic way from the CPU code.

struct TORCH_API AcceleratorHooksInterface {

@@ -38,7 +38,7 @@ struct TORCH_API PrivateUse1HooksInterface : AcceleratorHooksInterface {

Generator getNewGenerator(
[[maybe_unused]] DeviceIndex device_index = -1) const override {
// TODO(FFFrog): Perserved for BC and will be removed in the future.
// TODO(FFFrog): Preserved for BC and will be removed in the future.
if (at::GetGeneratorPrivate().has_value())
return at::GetGeneratorForPrivateuse1(device_index);
@@ -283,7 +283,7 @@ inline void boxed_existing_bdim_all_batch_rule(
// Use when all tensors arguments accept one (normal) batch dim.
// This batching rule expands the batch dim on all Tensors, reshapes it into
// dim 0, calls the op, and then reshapes the batch dim out of dim 0.
// This is not the most efficient thing; if there are alternatives, plese try
// This is not the most efficient thing; if there are alternatives, please try
// to use them. Use this only as a last resort.
#define EXISTING_BDIM_ALL_BOXED(op) \
m.impl(#op, torch::CppFunction::makeFromBoxedFunction<boxed_existing_bdim_all_batch_rule>());

@@ -384,7 +384,7 @@ fourOutputs solve_ex_batch_rule(

// NOTE [ solve_ex Batch Rule Contiguity ]
// A determines whether or not linalg_solve takes an optimized path. We need the check on A_ to match the one run on
// A as BatchedTensor since it might have been saved by autograd (specifically by the jvp) and the autograd behvaior
// A as BatchedTensor since it might have been saved by autograd (specifically by the jvp) and the autograd behavior
// differs based on whether or not the optimized path was taken
const auto batched_A_was_contiguous = A_bdim.has_value() ? at::select(A, *A_bdim, 0).is_contiguous() : A.is_contiguous();
if (batched_A_was_contiguous && !A.is_complex()) {

@@ -282,7 +282,7 @@ static std::tuple<Tensor, std::optional<int64_t>> _softmax_backward_batch_rule(

dim = getPhysicalDim(output_, /*has_batch_dim*/true, dim);

// Not sure why output_ needs to be marked as .contiguous(). Someting must
// Not sure why output_ needs to be marked as .contiguous(). Something must
// have changed in PyTorch (and output of softmax is probably always contiguous)
return std::make_tuple(at::_softmax_backward_data(grad_output_, output_.contiguous(), dim, input_dtype), 0);
}

@@ -224,7 +224,7 @@ static Tensor safeStack(TensorList tensors) {
// is possible for the backward function to return an undefined grad for some
// grad_input for each example. In that case, we return an undefined grad.
//
// It is theoretically posssible for *some* of the examples to produce an
// It is theoretically possible for *some* of the examples to produce an
// undefined grad (a kernel could peek at the gradient values and return an
// undefined tensor if it determines the gradient is full of zeros). We
// could handle this by treating the undefined grad as a zero-filled tensor

@@ -113,7 +113,7 @@ SymIntArrayRef BatchedTensorImpl::sym_sizes_custom() const {
return sym_sizes_default();
}

// The following are publically exposed as methods of Tensor
// The following are publicly exposed as methods of Tensor

IntArrayRef BatchedTensorImpl::strides_custom() const {
return strides_default();

@@ -37,7 +37,7 @@ namespace at::functorch {
// how to perform the transform.
//
// TODO: we can excise DynamicLayer in favor of Interpreter,
// But I am going to leave it for now as a compatiblity shim to avoid
// But I am going to leave it for now as a compatibility shim to avoid
// needing to refactor a lot of callsites...
struct TORCH_API DynamicLayer {
explicit DynamicLayer(

@@ -88,7 +88,7 @@ std::ostream& operator<<(std::ostream& os, const TransformType& t);
// >>> VmapInterpreterPtr(&interpreter).batchSize()
//
// Finally, Interpreter::process switches on the type of the interpreter
// and calls one of {Transform}Intepreter::processImpl under the hood.
// and calls one of {Transform}Interpreter::processImpl under the hood.
// Same for Interpreter::sendToNextInterpreter :)

struct VmapInterpreterMeta {

@@ -143,7 +143,7 @@ struct TORCH_API VmapPhysicalView {
// mapping a physical tensor to a new logical tensor (BatchedTensor)
VmapPhysicalToLogicalMap getPhysicalToLogicalMap() const;

// Maps a logical shape to a physical shape by pre-pending the batch
// Maps a logical shape to a physical shape by prepending the batch
// sizes to the logical shape.
VmapDimVector getPhysicalShape(IntArrayRef logical_shape) const;
SymDimVector getPhysicalShape(c10::SymIntArrayRef logical_shape) const;

@@ -27,7 +27,7 @@ namespace at::functorch {
//
// There are alternative designs we could have chosen (e.g. each grad transform
// stores a weak map of Tensor -> AutogradMeta); the benefit of the TensorWrapper
// design is that we can re-use existing VariableType kernels (i.e. Autograd kernels)
// design is that we can reuse existing VariableType kernels (i.e. Autograd kernels)
// without much modification. Since a TensorWrapper looks like a regular Tensor,
// the VariableType kernel can pull out the AutogradMeta struct from where it
// expects and extend the autograd graph
@@ -158,7 +158,7 @@ void MPSStream::fill(id<MTLBuffer> buffer, uint8_t value, size_t length, size_t
endKernelCoalescing();
id<MTLBlitCommandEncoder> blitEncoder = [commandBuffer() blitCommandEncoder];

// For some reason fillBufferfor stopped working for lengh > 4Gb on MacOS 26
// For some reason fillBufferfor stopped working for length > 4Gb on MacOS 26
// See https://github.com/pytorch/pytorch/issues/163962
// Workaround by batching copy commands into 4Gb chunks
constexpr size_t max_copy_size = 0x100000000; // 4GB

@@ -128,7 +128,7 @@ at::Tensor PackedLinearWeight::apply_impl(
auto* input_tr_ptr =
reinterpret_cast<uint8_t*>(input_tr.data_ptr<c10::quint8>());
// TODO: Activation transpose before and after the kernel can be removed if we
// keep activation tensor always tranposed.
// keep activation tensor always transposed.
fbgemm::transpose_simd<uint8_t>(
batch_size, K, input_ptr, K, input_tr_ptr, batch_size);

@@ -34,7 +34,7 @@ struct Dist {
// finish : This tells what to do with the aggregated value to compute
// the norm. Generally this is the result of val ^ (1 / p).
// backward : This is the gradient for that norm. Arguments are pretty
// self explanitory.
// self explanatory.
//
// There are a few cases where these aren't used. The 0 norm has no backward,
// because it's always 0, so that's shortcircuited earlier. There's a special

@@ -74,7 +74,7 @@ it to sum up the entire array into a single value.

`ReduceOpsKernel.cpp` uses the `CPU_CAPABILITY_*` macros to "know" under which
compiler flags it is currently compiled. This allows the programmer to write
generic code, which will be compiled under multipled compilation settings.
generic code, which will be compiled under multiplied compilation settings.

`../ReduceOps.cpp` now includes the header `ReduceOpsKernel.h`, which contains
a generic definition of `sumImplAll`. This function allows the user to reduce

@@ -1017,7 +1017,7 @@ struct HelperInterpBase {
while (aligned_interp_size % sizeof(int32_t) != 0) {
aligned_interp_size += 1;
}
// assert that we wont go out of bounds
// assert that we won't go out of bounds
TORCH_INTERNAL_ASSERT(aligned_interp_size * sizeof(int16_t) < interp_size * sizeof(double));
}

@@ -655,7 +655,7 @@ void ImagingResampleHorizontalConvolution8u4x(
// last element
auto mmk = _mm256_set1_epi32(k[i]);
// For num_channels == 3 (3 bytes = one pixel) we tolerate to read 4 bytes
// lines 0, 1 and 2 wont go out of allocated memory bounds
// lines 0, 1 and 2 won't go out of allocated memory bounds
auto pix = _mm256_inserti128_si256(_mm256_castsi128_si256(
mm_cvtepu8_epi32(lineIn0_min + stride * i, i32_aligned)),
mm_cvtepu8_epi32(lineIn1_min + stride * i, i32_aligned), 1);

@@ -889,7 +889,7 @@ void ImagingResampleHorizontalConvolution8u(
_mm_loadu_si128((__m128i *) (lineIn_min + stride * i))),
_mm_loadu_si128((__m128i *) (lineIn_min + stride * (i + 4))), 1);

// Extract lower part of each lane, cast to epi16 and reoder RGBARGBA -> RRGGBBAA
// Extract lower part of each lane, cast to epi16 and reorder RGBARGBA -> RRGGBBAA
// RGBA: pix1 = [
// r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 a0 0 a1 0
// r4 0 r5 0 g4 0 g5 0 b4 0 b5 0 a4 0 a5 0

@@ -1312,7 +1312,7 @@ void ImagingResampleVerticalConvolution8u(

// Here we write 4 bytes to the output even if num_channels < 4, e.g o = {r,g,b,X} for num_channels=3
// It is OK to write 4th byte (e.g. X) as on the next step we will overwrite it with new data.
// We also wont go out of bounds of lineOut memory allocation
// We also won't go out of bounds of lineOut memory allocation
std::memcpy(lineOut + j, (uint8_t *) &o, 4);
}

@@ -240,7 +240,7 @@ _PS256_CONST(coscof_p2, 4.166664568298827E-002);
_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI

/* evaluation of 8 sines at onces using AVX intrinsics
/* evaluation of 8 sines at once using AVX intrinsics

The code is the exact rewriting of the cephes sinf function.
Precision is excellent as long as x < 8192 (I did not bother to

@@ -311,7 +311,7 @@ void GroupNormKernelImplChannelsLastInternal(
const bool gamma_null = (gamma_data == nullptr);
const bool beta_null = beta_data == nullptr;

// NB: About algorithm choosen:
// NB: About algorithm chosen:
//
// On channels last, GroupNorm has a input shape of {N, H, W, GD},
// Mean and rstd are collected per each n and g, which involves reduction
@@ -930,7 +930,7 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel(
}
};

// Dynamically Quantize the float32 input to 8 bit assymetric
// Dynamically Quantize the float32 input to 8 bit asymmetric
input_quant_pack_8bit_channelwise(m, k, lhs_f32, (int8_t*)lhs_qa8dx);

const size_t lhs_stride =

@@ -1163,7 +1163,7 @@ void dyn_quant_matmul_4bit_kernel(
const int64_t weight_packed_size =
kleidiai::kai_pack_rhs_int4_size(N, K, block_size);
if (weight_packed_size == packed_weights.numel()) {
// KleidiAI interface intenally handles the Channelwise and groupwise
// KleidiAI interface internally handles the Channelwise and groupwise
// distinction
kleidiai::kai_quant_pack_lhs_int4_mm(
output, inp, packed_weights, M, N, K, block_size);

@@ -705,7 +705,7 @@ namespace {
);
} while (!done && max_threads);
if (!done) {
TORCH_INTERNAL_ASSERT(false, "Couldn't reduce launch bounds to accomodate sharedMemPerBlock limit");
TORCH_INTERNAL_ASSERT(false, "Couldn't reduce launch bounds to accommodate sharedMemPerBlock limit");
}
break;
}

@@ -154,19 +154,19 @@ struct cublasCommonArgs {
const std::optional<ScalingType>& scaling_choice_b = std::nullopt) {
bool transpose_result = false, transpose_a = false, transpose_b = false;
result = prepare_matrix_for_cublas(c, transpose_result);
mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_a, transpose_result);
matb = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_b, transpose_result);
mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_a, transpose_result); // codespell:ignore
matb = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_b, transpose_result); // codespell:ignore

// Handle scale tensors if provided
if (scale_a && scale_b) {
// By default since we return in row-major we run the gemm
// as B.T @ A.T, check transpose_result to determine if we flip the scales
scale_mata_ptr = transpose_result ? scale_b->data_ptr() : scale_a->data_ptr();
scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type();
scaling_mata_type = transpose_result ? scaling_choice_b : scaling_choice_a;
scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr();
scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type();
scaling_matb_type = transpose_result ? scaling_choice_a : scaling_choice_b;
scale_mata_ptr = transpose_result ? scale_b->data_ptr() : scale_a->data_ptr(); // codespell:ignore
scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type(); // codespell:ignore
scaling_mata_type = transpose_result ? scaling_choice_b : scaling_choice_a; // codespell:ignore
scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr(); // codespell:ignore
scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type(); // codespell:ignore
scaling_matb_type = transpose_result ? scaling_choice_a : scaling_choice_b; // codespell:ignore
}

if (scale_result) {

@@ -180,17 +180,17 @@ struct cublasCommonArgs {
transpose_b = !transpose_b;
}

auto sizes_a = mata->sizes();
auto sizes_b = matb->sizes();
auto sizes_a = mata->sizes(); // codespell:ignore
auto sizes_b = matb->sizes(); // codespell:ignore

m = sizes_a[transpose_result ? 1 : 0];
k = sizes_a[transpose_result ? 0 : 1];
n = sizes_b[transpose_result ? 0 : 1];
lda = mata->stride((transpose_a == transpose_result) ? 1 : 0);
ldb = matb->stride((transpose_b == transpose_result) ? 1 : 0);
lda = mata->stride((transpose_a == transpose_result) ? 1 : 0); // codespell:ignore
ldb = matb->stride((transpose_b == transpose_result) ? 1 : 0); // codespell:ignore
result_ld = result->stride(transpose_result ? 0 : 1);
transa = transpose_a ? mata->is_conj() ? 'c' : 't' : 'n';
transb = transpose_b ? matb->is_conj() ? 'c' : 't' : 'n';
transa = transpose_a ? mata->is_conj() ? 'c' : 't' : 'n'; // codespell:ignore
transb = transpose_b ? matb->is_conj() ? 'c' : 't' : 'n'; // codespell:ignore

// cuBLAS expects unpacked values of `k`, `lda` and `ldb`, adjust for 4x2 packing
// if the gemm operands are in packed float4

@@ -205,16 +205,16 @@ struct cublasCommonArgs {
char transa, transb;
int64_t m, n, k;
int64_t lda, ldb, result_ld;
c10::MaybeOwned<Tensor> mata, matb, result;
c10::MaybeOwned<Tensor> mata, matb, result; // codespell:ignore

// Scale members
void* scale_mata_ptr = nullptr;
void* scale_matb_ptr = nullptr;
void* scale_mata_ptr = nullptr; // codespell:ignore
void* scale_matb_ptr = nullptr; // codespell:ignore
void* scale_result_ptr = nullptr;
std::optional<c10::ScalarType> scale_mata_dtype;
std::optional<ScalingType> scaling_mata_type;
std::optional<c10::ScalarType> scale_matb_dtype;
std::optional<ScalingType> scaling_matb_type;
std::optional<c10::ScalarType> scale_mata_dtype; // codespell:ignore
std::optional<ScalingType> scaling_mata_type; // codespell:ignore
std::optional<c10::ScalarType> scale_matb_dtype; // codespell:ignore
std::optional<ScalingType> scaling_matb_type; // codespell:ignore
std::optional<c10::ScalarType> scale_result_dtype;
};
} // namespace
@@ -362,7 +362,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
static bool disable_addmm_cuda_lt = getDisableAddmmCudaLt();
#endif
// if lt path fails, we recurse back into this function here and force the lt path to off
// we cannot update varible disable_addmm_cuda_lt from above since it is static and would be permanent
// we cannot update variable disable_addmm_cuda_lt from above since it is static and would be permanent
bool disable_addmm_cuda_lt_final = disable_addmm_cuda_lt || disable_addmm_cuda_lt_override;
#if defined(USE_ROCM) && ROCM_VERSION == 60400
// hipblaslt TT fp32 regression on ROCm 6.4, cannot use

@@ -2886,7 +2886,7 @@ _scaled_grouped_mm_cuda_v2(
"Contraction dimensions (", dim_a, ",", dim_b, ") of mat_a and mat_b must match, got: ", mat_a.size(dim_a), " and ",
mat_b.size(dim_b));
// Note: only (-1, -2) is currently supported
TORCH_CHECK_VALUE(dim_a == -1 && dim_b == -2, "Curently contraction dims must be (-1, -2) only");
TORCH_CHECK_VALUE(dim_a == -1 && dim_b == -2, "Currently contraction dims must be (-1, -2) only");
} else {
TORCH_CHECK_VALUE(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match");
}

@@ -298,7 +298,7 @@ static void jitted_gpu_kernel_impl(
at::opmath_type<f_inputs_type> scalar_val,
const std::tuple<ExtraArgs...>& extra_args) {

// TODO: Memory use can probably be optimized by re-using kernels across GPUs with
// TODO: Memory use can probably be optimized by reusing kernels across GPUs with
// the same compute capability
static std::mutex jiterator_mutex;
static std::vector<JittedKernelVariantCache> device_caches(c10::cuda::device_count());

@@ -494,7 +494,7 @@ void uniform_kernel(TensorIteratorBase& iter, double from_, double to_, RNG gen)
auto value = static_cast<scalar_t>(rand * range + from);
// reverse the bounds of curand4 from (0, 1] to [0, 1)
// Note that this method is from legacy THCTensorRandom and is likely to give
// you more 0-s, since, the probability of gettings 1-s is higher than 0-s and
// you more 0-s, since, the probability of getting 1-s is higher than 0-s and
// by reversing the bounds, we are flipping the probabilities of 1-s and 0-s.
// BEFORE TOUCHING THIS CODE READ: https://github.com/pytorch/pytorch/issues/16706
auto reverse_bound_value = value == to ? from : value;

@@ -75,7 +75,7 @@ fused_dropout_kernel_vec(at::cuda::detail::TensorInfo<const scalar_t, IndexType>
// We'll use this to actually cause vectorized loads later
LoadT *value = reinterpret_cast<LoadT*>(&src);

//curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for halfs, so generate float for everything
//curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for Halfs, so generate float for everything
// Note: need a new set of random values per 4 elements -- we'll handle VEC elements in this thread, so need ceil(VEC / 4)
// sets of rand.
if ((VEC >= 4) || (gridxvec_loop_state == 0)) {

@@ -159,7 +159,7 @@ fused_dropout_kernel(cuda::detail::TensorInfo<const scalar_t, IndexType> a,
for (IndexType linearIndex = idx;
linearIndex < rounded_size;
linearIndex += gridDim.x * blockDim.x*UNROLL) {
//curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for halfs, so generate float for everything
//curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for Halfs, so generate float for everything
float4 rand = curand_uniform4(&state);
scalar_t src[UNROLL];
rand.x = rand.x < p;

@@ -24,7 +24,7 @@ namespace at::native {
namespace {

/* This code computes the sum of the weights in two-steps:
1) Each GPU warp sums `NROWS_PER_THREAD` number of row given by `indeces`
1) Each GPU warp sums `NROWS_PER_THREAD` number of row given by `indices`
2) Each partial-sum from 1) are summed and scatter into `grad_weight`

Notice, `NROWS_PER_THREAD` impacts the Achieved Occupancy of the

@@ -204,7 +204,7 @@ Scalar scalar_reciprocal(const Scalar& scalar) {
return Scalar(1. / scalar.toComplexDouble());
}
TORCH_INTERNAL_ASSERT(
false, "divison with ", scalar.type(), " not supported");
false, "division with ", scalar.type(), " not supported");
}

void foreach_tensor_div_scalar_kernel_cuda_(

@@ -57,7 +57,7 @@ namespace {
const index_t n = index / (out_H * out_W);
const index_t grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;

// get the corresponding input x, y co-ordinates from grid
// get the corresponding input x, y coordinates from grid
opmath_t x = grid.data[grid_offset];
opmath_t y = grid.data[grid_offset + grid_sCoor];

@@ -193,7 +193,7 @@ namespace {
const index_t n = index / (out_D * out_H * out_W);
const index_t grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW;

// get the corresponding input x, y, z co-ordinates from grid
// get the corresponding input x, y, z coordinates from grid
opmath_t x = grid.data[grid_offset];
opmath_t y = grid.data[grid_offset + grid_sCoor];
opmath_t z = grid.data[grid_offset + 2 * grid_sCoor];

@@ -358,7 +358,7 @@ namespace {
const index_t n = index / (out_H * out_W);
const auto grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;

// get the corresponding input x, y co-ordinates from grid
// get the corresponding input x, y coordinates from grid
scalar_t x = grid.data[grid_offset];
scalar_t y = grid.data[grid_offset + grid_sCoor];

@@ -572,7 +572,7 @@ namespace {
const index_t n = index / (out_D * out_H * out_W);
const auto grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW;

// get the corresponding input x, y, z co-ordinates from grid
// get the corresponding input x, y, z coordinates from grid
scalar_t ix = grid.data[grid_offset];
scalar_t iy = grid.data[grid_offset + grid_sCoor];
scalar_t iz = grid.data[grid_offset + 2 * grid_sCoor];
@ -8,7 +8,7 @@
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
|
||||
// Three warninngs in Cutlass included header files
|
||||
// Three warnings in Cutlass included header files
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-variable")
|
||||
|
@ -377,7 +377,7 @@ __noinline__ __host__ __device__ scalar_t calc_igammac(scalar_t a, scalar_t x) {
|
||||
* result at the boundary
|
||||
* - if a is large and a ~ x, then using Uniform Asymptotic Expansions for
|
||||
* Large Parameter (see DLMF 8.12.4 [igam1])
|
||||
* - if x > 1.1 and x < a, using the substraction from the regularized lower
|
||||
* - if x > 1.1 and x < a, using the subtraction from the regularized lower
|
||||
* incomplete gamma
|
||||
* - otherwise, calculate the series from [igam2] eq (5)
|
||||
*/
|
||||
@ -460,7 +460,7 @@ __noinline__ __host__ __device__ scalar_t calc_igamma(scalar_t a, scalar_t x) {
|
||||
* result at the boundary
|
||||
* - if a is large and a ~ x, then using Uniform Asymptotic Expansions for
|
||||
* Large Parameter (see DLMF 8.12.3 [igam1])
|
||||
* - if x > 1 and x > a, using the substraction from the regularized upper
|
||||
* - if x > 1 and x > a, using the subtraction from the regularized upper
|
||||
* incomplete gamma
|
||||
* - otherwise, calculate the series from [igam2] eq (4)
|
||||
*/
|
||||
|
@ -332,7 +332,7 @@ void cuda_take_put_kernel(
|
||||
const auto offset_calc = make_offset_calculator<2>(iter);
|
||||
using uindex_t = std::make_unsigned_t<index_t>;
|
||||
|
||||
// OffsetCalculator needs the sizes and strides reveresed
|
||||
// OffsetCalculator needs the sizes and strides reversed
|
||||
const auto indexed_sizes = std::vector<int64_t>(indexed.sizes().rbegin(), indexed.sizes().rend());
|
||||
const auto indexed_strides = std::vector<int64_t>(indexed.strides().rbegin(), indexed.strides().rend());
|
||||
const auto* indexed_strides_data = indexed_strides.data();
|
||||
|
@ -1611,7 +1611,7 @@ void index_select_out_cuda_impl(
|
||||
|
||||
// SmallIndexKernel is more performant when the number of indices is small, and pre-loading
|
||||
// the index reduces memory accesses. When the number of indices is large, we avoid that
|
||||
// and increase parallellism by calling gather_out which is a generalization of index_select
|
||||
// and increase parallelism by calling gather_out which is a generalization of index_select
|
||||
if (cuda::detail::canUse32BitIndexMath(out) &&
|
||||
cuda::detail::canUse32BitIndexMath(self) &&
|
||||
cuda::detail::canUse32BitIndexMath(index) &&
|
||||
|
@ -273,7 +273,7 @@ __device__ __forceinline__ void opportunistic_fastAtomicAdd(
scalar_t* dst = self_ptr + index;
- //pack coalseced bf16 and fp16
+ //pack coalesced bf16 and fp16
if constexpr (std::is_same<scalar_t, c10::BFloat16>::value || std::is_same<scalar_t, c10::Half>::value)
{
typedef unsigned short __attribute__((ext_vector_type(2))) vec_short2;

@ -316,7 +316,7 @@ __device__ __forceinline__ void opportunistic_fastAtomicAdd(
}
}
- // not coalsced, so now let try to capture lane-matches...
+ // not coalesced, so now let try to capture lane-matches...
if (numel > 16 /*<-hueristic threshold*/ * 64 ) {
// well shucks, unlikely to capture same-dest atomics in a wave.

@ -343,7 +343,7 @@ ctc_loss_backward_log_beta_gpu_kernel(scalar_t* __restrict__ log_beta_data,
if (input_length == 0)
return;
- // "first" row, the beta initialization before eq (10) (t=target_length - differes per batch)
+ // "first" row, the beta initialization before eq (10) (t=target_length - differs per batch)
for (int64_t block_s = 2*max_target_length - (2*max_target_length % blockDim.x); block_s >= 0; block_s -= blockDim.x) {
int64_t s = threadIdx.x + block_s;
scalar_t lb;

@ -816,7 +816,7 @@ const auto erfcx_string = jiterator_stringify(
with the usual checks for overflow etcetera.
Performance-wise, it seems to be substantially faster than either
- the SLATEC DERFC function [or an erfcx function derived therefrom]
+ the SLATEC DERFC function [or an erfcx function derived there from]
or Cody's CALERF function (from netlib.org/specfun), while
retaining near machine precision in accuracy.
*/

@ -370,7 +370,7 @@ struct vectorized {
#ifdef USE_ROCM
// This is similar to vectorized policy above, but this one supports
- // heterogenous input tensor types as templated parameters.
+ // heterogeneous input tensor types as templated parameters.
// Its use should be limited to frequently used heterogeneous data types
// as each instantiation will generate a separate kernel, leading to code
// bloating if applied to all combinations supported in PyTorch. Assumption: all

@ -309,7 +309,7 @@ __global__ void sampleMultinomialOnce(
} else {
// This should address a rare bug where we don't select a valid index. This likely occurs when
// due to floating point arithmetic rounding errors, our cumulative sum does not add up to 1, but
- // and our uniform sample is greater than this value. In this case we likely have unitialized memory
+ // and our uniform sample is greater than this value. In this case we likely have uninitialized memory
// in dest[curDist]. So basically we will loop through the distribution and pick the largest index
// where the distribution is non-zero. This is obviously terribly inefficient, but due to the
// rarity in which this occurs, this should not be an issue.
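The comments above describe the fallback for the rare case where rounding makes the cumulative sum fall short of the drawn uniform value. A minimal single-threaded sketch of that fallback, assuming a per-distribution probability array; names are illustrative, not from the kernel:

#include <cstddef>
#include <vector>

// Sketch: scan from the end and return the largest index with non-zero probability,
// mirroring the recovery path the comment describes (the real kernel does this per
// distribution inside the CUDA block).
std::ptrdiff_t largest_nonzero_category(const std::vector<float>& probs) {
  for (std::ptrdiff_t i = static_cast<std::ptrdiff_t>(probs.size()) - 1; i >= 0; --i) {
    if (probs[i] > 0.0f) {
      return i;
    }
  }
  return 0;  // degenerate input; valid distributions never reach this
}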
@ -1623,7 +1623,7 @@ at::Tensor batch_norm_backward_elemt_channels_last_cuda_template(
const auto stride = input.sizes()[1];
const auto reduction_size = input.numel() / stride;
- // Input is guarunteed to be channels-last compatible
+ // Input is guaranteed to be channels-last compatible
at::Tensor grad_input = at::empty_like(input);
dim3 block;

@ -1691,7 +1691,7 @@ at::Tensor batch_norm_backward_elemt_channels_last_cuda_template(
const auto reduction_size = input.numel() / stride;
auto norm_fct = 1.0 / reduction_size;
- // Input is guarunteed to be channels-last compatible
+ // Input is guaranteed to be channels-last compatible
at::Tensor grad_input = at::empty_like(input);
dim3 block;

@ -37,7 +37,7 @@ namespace at::native {
// threshold probability for having non-duplicate keys, then it can be proved that[1]
// the number of bits required is: ceil(log2(n - (6 n^2 + 1) / (12 log(q))))
//
- // Then after sort, we lauch a separate kernel that additionally shuffles any islands
+ // Then after sort, we launch a separate kernel that additionally shuffles any islands
// of values whose keys matched. The algorithm of this kernel is as follows:
// Each thread reads its key and the keys of its neighbors to tell if it's part of an island.
// For each island, the first thread in the island sees a key match at index i+1 but not index i-1.
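The last two context lines above spell out how a thread decides whether it starts an "island" of equal keys. A compact sketch of that test, written as a host-side helper rather than the CUDA kernel; the names are illustrative:

#include <cstdint>

// An island is a maximal run of equal sort keys. Its first element matches the key
// to its right but not the key to its left, exactly as the comment states.
inline bool is_island_start(const int64_t* keys, int64_t i, int64_t n) {
  const bool matches_next = (i + 1 < n) && keys[i] == keys[i + 1];
  const bool matches_prev = (i > 0) && keys[i] == keys[i - 1];
  return matches_next && !matches_prev;
}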
@ -1088,12 +1088,12 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){
// load instructions.
//
// Case 1: "vectorize along input"
- // This case happens when we are reducing along fastest moving dimesion. In such case, threads
+ // This case happens when we are reducing along fastest moving dimension. In such case, threads
// with the same threadIdx.y works on the same reduction cooperatively and will produce results
// for the same output. In such case, values in each loaded vector always correspond to the same output.
//
// Case 2: "vectorize along output"
- // This case happens when the fastest moving dimesion is not the dimension of reduction. In such case,
+ // This case happens when the fastest moving dimension is not the dimension of reduction. In such case,
// threads with different threadIdx.x are independent and will produce results for different outputs.
// In such case, values in each loaded vector always correspond to different outputs.
if (fastest_moving_stride == sizeof(scalar_t)) {
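The two cases above boil down to a single stride test, visible in the `if` on the last context line. A hedged sketch of that decision; the enum and function names are illustrative, not the ones used in setReduceConfig:

#include <cstdint>

// If the fastest-moving stride is exactly one element, the reduction dimension is
// contiguous, so a vector load covers inputs of the SAME output ("vectorize along
// input"). Otherwise adjacent elements belong to DIFFERENT outputs ("vectorize along
// output").
enum class VectorizeMode { AlongInput, AlongOutput };

template <typename scalar_t>
VectorizeMode pick_vectorize_mode(int64_t fastest_moving_stride) {
  return fastest_moving_stride == static_cast<int64_t>(sizeof(scalar_t))
      ? VectorizeMode::AlongInput
      : VectorizeMode::AlongOutput;
}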
@ -241,7 +241,7 @@ __global__ void reflection_pad2d_backward_det_out_kernel(
const int64_t dist_cols = ::abs(inp_col - (input_dim_x - 1));
// we were dist_rows after, now we want to be dist_rows before
- // we were dist_cols before, now we wnat to be dist_cols after
+ // we were dist_cols before, now we want to be dist_cols after
const int64_t reflect_tr_out_row = (corner_tr_out_row - dist_rows);
const int64_t reflect_tr_out_col = (corner_tr_out_col + dist_cols);
const int64_t reflect_tr_out =

@ -5,7 +5,7 @@
#include <ATen/cuda/nvrtc_stub/ATenNVRTC.h>
#include <c10/macros/Macros.h>
- // Two warninngs in Cutlass included header files
+ // Two warnings in Cutlass included header files
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wmissing-field-initializers")

@ -7,7 +7,7 @@
#include <c10/macros/Macros.h>
#include <c10/util/irange.h>
- // Two warninngs in Cutlass included header files
+ // Two warnings in Cutlass included header files
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-variable")

@ -460,7 +460,7 @@ __global__ void GammaBetaBackwardCUDAKernel2(
}
}
- // Do warp reduce for the 2st 16 cols in the tile.
+ // Do warp reduce for the 2nd 16 cols in the tile.
sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y];
sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y];
sum1 = cuda_utils::WarpReduceSum<T_ACC>(sum1);

@ -1532,7 +1532,7 @@ NvrtcFunction jit_pwise_function(
std::string file_path;
if (cache_dir.has_value()) {
- // Attemps to read from the cache.
+ // Attempts to read from the cache.
// Cubin name is <kernel name>_arch<major>.<minor>_nvrtc<major>.<minor>_<ptx or sass>_<program length>_<string hash>
// Note that the SHA1 hash used in the file name is NOT the SHA1 hash of the file's contents,
// because we hash on the CUDA code, but we save the compiled ptx or sass

@ -1556,19 +1556,19 @@ NvrtcFunction jit_pwise_function(
ss << "_" << hash_code;
file_path = ss.str();
- std::ifstream readin{file_path, std::ios::in | std::ifstream::binary};
- if (readin.fail()) {
+ std::ifstream read_stream{file_path, std::ios::in | std::ifstream::binary};
+ if (read_stream.fail()) {
// NOTE: this does not warn because the file might not exist
// TODO: consider if this should explicitly check for the file's existence or not to throw
// an informative warning
- readin.close();
+ read_stream.close();
} else {
// TODO: try passing the "mapped" file directly to cuModuleLoadCall instead of using an intermediate buffer
- std::vector<char> buffer(std::istreambuf_iterator<char>(readin), {});
+ std::vector<char> buffer(std::istreambuf_iterator<char>(read_stream), {});
AT_CUDA_DRIVER_CHECK(nvrtc.cuModuleLoadData(&(compiled_kernel_.module), buffer.data()));
AT_CUDA_DRIVER_CHECK(
nvrtc.cuModuleGetFunction(&(compiled_kernel_.function), compiled_kernel_.module, name.c_str()));
- readin.close();
+ read_stream.close();
return compiled_kernel_;
}
}
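The comment in the first of the two hunks above documents the cache-file naming scheme that the `ss << "_" << hash_code` line finishes off. A small sketch of assembling a name with that layout; all parameter names here are assumptions for illustration:

#include <cstddef>
#include <sstream>
#include <string>

// Builds: <kernel name>_arch<major>.<minor>_nvrtc<major>.<minor>_<ptx or sass>_<program length>_<string hash>
std::string make_cubin_cache_name(const std::string& kernel_name,
                                  int arch_major, int arch_minor,
                                  int nvrtc_major, int nvrtc_minor,
                                  bool compile_to_sass,
                                  std::size_t program_length,
                                  std::size_t code_hash) {
  std::stringstream ss;
  ss << kernel_name
     << "_arch" << arch_major << '.' << arch_minor
     << "_nvrtc" << nvrtc_major << '.' << nvrtc_minor
     << (compile_to_sass ? "_sass" : "_ptx")
     << '_' << program_length
     << '_' << code_hash;
  return ss.str();
}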
@ -1050,7 +1050,7 @@ void launch_vectorized_layer_norm_kernel(
C10_CUDA_KERNEL_LAUNCH_CHECK();
#ifdef USE_ROCM
- // the blocks.x contains the max grid x dimention without invalid configuration error
+ // the blocks.x contains the max grid x dimension without invalid configuration error
// Fix invalid configuration https://github.com/pytorch/pytorch/issues/136291
// Ensure all elements are processed. Prepare for next round
int64_t remaining = M - blocks.x;

@ -1346,7 +1346,7 @@ void cholesky_helper_magma(const Tensor& input, bool upper, const Tensor& info)
});
if (input.dim() > 2) {
- // if upper=true we need to tranpose and conjugate the result tensor
+ // if upper=true we need to transpose and conjugate the result tensor
// because the cholesky decomposition is stored in the lower triangular part
if (upper) {
input.copy_(result.mH());
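The comment above leans on a standard identity: if A = L L^H with L lower triangular, then U = L^H is the upper-triangular factor with A = U^H U, which is why result.mH() is copied back when upper=true. A tiny sketch of that relation; the helper name is illustrative:

#include <ATen/ATen.h>

// Sketch: obtain the upper Cholesky factor from the lower one via the conjugate
// transpose over the last two dimensions (the same mH() used in the hunk above).
at::Tensor upper_from_lower_cholesky(const at::Tensor& lower_factor) {
  return lower_factor.mH();
}
// Usage idea: auto L = at::linalg_cholesky(A); auto U = upper_from_lower_cholesky(L);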
@ -1857,7 +1857,7 @@ void geqrf_kernel(const Tensor& input, const Tensor& tau) {
auto preferred_backend = at::globalContext().linalgPreferredBackend();
switch (preferred_backend) {
- // TODO Investigate whether the following magma bug is still occuring.
+ // TODO Investigate whether the following magma bug is still occurring.
// It may be the case that geqrf followed by orgqr is wrong for the magma backend
// geqrf_magma currently uses geqrf2_gpu
//

@ -82,7 +82,7 @@ void lu_factor_looped_cusolver(const Tensor& self, const Tensor& pivots, const T
#if defined(BUILD_LAZY_CUDA_LINALG)
namespace cuda { namespace detail {
// This is only used for an old-style dispatches
- // Please do not add any new entires to it
+ // Please do not add any new entries to it
struct LinalgDispatch {
Tensor (*cholesky_solve_helper)(const Tensor& self, const Tensor& A, bool upper);
};

@ -177,7 +177,7 @@ bool use_ragged_in_dense(
TORCH_WARN_ONCE(
"TORCH_CUDNN_SDPA_AVOID_RECOMPILE=1 only works with Q, K, V, and output in BSHD memory layout,"
"e.g., Q, K, V must be allocated with torch.randn((B, S, H, D).transpose(1, 2)."
- "Falling back to regualr dense case, which may trigger excessive recompilation.");
+ "Falling back to regular dense case, which may trigger excessive recompilation.");
}
return all_bshd;
}

@ -771,7 +771,7 @@ std::unique_ptr<fe::graph::Graph> build_graph_nestedtensor(
if (attn_bias.has_value()) {
TORCH_CHECK(
false,
- "attn_bias not yet supportd with cuDNN Attention and NestedTensor");
+ "attn_bias not yet supported with cuDNN Attention and NestedTensor");
scaled_dot_product_flash_attention_options.set_bias(
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(BIAS)

@ -1196,7 +1196,7 @@ std::unique_ptr<fe::graph::Graph> build_graph_backward_nestedtensor(
if (attn_bias.has_value()) {
TORCH_CHECK(
false,
- "attn_bias not yet supportd with cuDNN Attention and NestedTensor");
+ "attn_bias not yet supported with cuDNN Attention and NestedTensor");
sdpa_backward_options.set_bias(
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(BIAS)

@ -1864,7 +1864,7 @@ void run_cudnn_SDP_bprop_nestedtensor(
}
TORCH_CHECK(
!attn_bias.has_value(),
- "attn_bias not yet supportd with cuDNN Attention and NestedTensor");
+ "attn_bias not yet supported with cuDNN Attention and NestedTensor");
auto workspace_size = mha_graph.get_workspace_size();
auto workspace_ptr =
@ -30,7 +30,7 @@ static const std::unordered_map<
};
- // This is the heursitic to choose a kernel based on inputs
+ // This is the heuristic to choose a kernel based on inputs
BGEMMKernel_BFloat16 dispatch_bfloat16_bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) {
// Optional/future use: directly lookup shape tuples to map to instances
/*

@ -11,7 +11,7 @@ using S = ck::Sequence<Is...>;
namespace at::native {
void dispatch_bfloat16_gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
- // If any of the shapes cant be tiled, we must use padding.
+ // If any of the shapes can't be tiled, we must use padding.
bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
// Dispatch to best implementation.
// TODO add more configurations. Optimize.

@ -471,7 +471,7 @@ void dispatch_bfloat16_gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
}
void dispatch_bfloat16_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
- // If any of the shapes cant be tiled, we must use padding.
+ // If any of the shapes can't be tiled, we must use padding.
bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
// Dispatch to best implementation.
// TODO add more configurations. Optimize.

@ -11,7 +11,7 @@ using S = ck::Sequence<Is...>;
namespace at::native {
void dispatch_float_gemm(CUDABLAS_GEMM_ARGTYPES(float)) {
- // If any of the shapes cant be tiled, we must use padding.
+ // If any of the shapes can't be tiled, we must use padding.
bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
// Dispatch to best implementation.
// TODO add more configurations. Optimize.

@ -13,7 +13,7 @@ namespace at::native {
void dispatch_half_gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
#if 0
- // If any of the shapes cant be tiled, we must use padding.
+ // If any of the shapes can't be tiled, we must use padding.
bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
// Dispatch to best implementation.
// TODO add more configurations. Optimize.

@ -299,7 +299,7 @@ void dispatch_half_gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
#endif
}
void dispatch_half_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
- // If any of the shapes cant be tiled, we must use padding.
+ // If any of the shapes can't be tiled, we must use padding.
bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
// Dispatch to best implementation.
// TODO add more configurations. Optimize.
@ -545,7 +545,7 @@ kernel void reshape(texture2d_array<half, access::read> in_arr[[texture(0), func
const ushort slices2 = divRoundUp(C2, 4);
const ushort slices1 = divRoundUp(C1, 4);
const ushort n2 = gid.z / slices2; //image index
- const ushort s2 = gid.z - n2 * slices2; // slice offest
+ const ushort s2 = gid.z - n2 * slices2; // slice offset
half4 value;
for (int idx = 0; idx < 4; ++idx){
// we compute the "linear index" of the output element,

@ -86,4 +86,4 @@ TORCH_LIBRARY_IMPL(aten, Metal, m) {
m.impl(TORCH_SELECTIVE_NAME("aten::hardsigmoid_"), TORCH_FN(hardsigmoid_));
}
- } // namepsace at::native::metal
+ } // namespace at::native::metal

@ -147,7 +147,7 @@ static void check_shape_forward(const Tensor& input,
// blocked format will propagate between layers. Input, output will be in blocked format.
//
// For inference case, weight can be prepacked into blocked format by
- // (so as to save weight reoder overhead):
+ // (so as to save weight reorder overhead):
// model = torch.utils.mkldnn.to_mkldnn(model)
//
// For training case, grad_output can be CPU tensor or MKLDNN tensor,

@ -540,7 +540,7 @@ static void _mkldnn_matmul_i8i8i32_with_primitive(
args.insert({DNNL_ARG_WEIGHTS, expected_weight});
args.insert({DNNL_ARG_DST, dst});
args.insert({DNNL_ARG_SCRATCHPAD, scratchpad});
- // Create primitve and execute
+ // Create primitive and execute
auto primitive = dnnl::matmul(prim_desc);
primitive.execute(ideep::stream::default_stream(), args);
}

@ -215,7 +215,7 @@ partition create_sdpa_graph_partition(
// For optional additive mask
std::optional<op> mask_add;
- // For optional implicite causal mask
+ // For optional implicit causal mask
std::optional<op> mask_gen_idx_row;
std::optional<logical_tensor> mask_row_idx;
std::optional<op> mask_gen_idx_col;

@ -556,7 +556,7 @@ partition create_sdpa_backward_graph_partition(
// For optional additive mask
std::optional<op> mask_add;
- // For optional implicite causal mask
+ // For optional implicit causal mask
std::optional<op> mask_gen_idx_row;
std::optional<logical_tensor> mask_row_idx;
std::optional<op> mask_gen_idx_col;
@ -34,7 +34,7 @@ namespace at::native::onednn {
/*
oneDNN postops usage:
- Currently, oneDNN supports 5 kinds of post ops. More details can be refered
+ Currently, oneDNN supports 5 kinds of post ops. More details can be referred
to oneDNN doc.
https://oneapi-src.github.io/oneDNN/dev_guide_attributes_post_ops.html#doxid-dev-guide-attributes-post-ops-1dev-guide-attributes-post-ops-eltwise

@ -345,7 +345,7 @@ class Attr {
dnnl::memory binary_m;
auto binary = ops_params_[i].binary_;
auto md = ops_params_[i].meta_;
- // qeury expected_md to achieve peak performance
+ // query expected_md to achieve peak performance
auto expected_md = pd.query_md(
dnnl::query::exec_arg_md,
DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1);

@ -399,7 +399,7 @@ static inline void construct_attr_for_unary(
} else {
TORCH_CHECK(
unary_post_op == "none",
- "onednn qlinear: unspported unary post op",
+ "onednn qlinear: unsupported unary post op",
unary_post_op);
}
}

@ -301,7 +301,7 @@ bool is_onednn_matmul_strides(const at::Tensor& tensor) {
return false;
}
- // the overlaped cases are not supported
+ // the overlapped cases are not supported
dnnl::memory::dims strides = get_onednn_strides(tensor);
int64_t storage_size = 1;
for (size_t dim = 0; dim < tensor_dim; ++dim)
@ -29,7 +29,7 @@
secondaryTensor:(MPSGraphTensor*)secondaryTensor
name:(NSString*)name {
// As of MacOS-15.1 m..imumWithNanPropagation is only defined for floating types and calling it with integral
- // agruments results in
+ // arguments results in
// /AppleInternal/Library/BuildRoots/c7c74b64-74b4-11ef-aeda-9635a580fe0d/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShaders/MPSCore/Utility/MPSKernelDAG.mm:805:
// failed assertion `Error getting visible function: (null) Function isNaN_u8_i8 was not found in the library'
if (([primaryTensor dataType] & MPSDataTypeFloatBit) == 0) {

@ -42,7 +42,7 @@
secondaryTensor:(MPSGraphTensor*)secondaryTensor
name:(NSString*)name {
// As of MacOS-15.1 m..imumWithNanPropagation is only defined for floating types and calling it with integral
- // agruments results in
+ // arguments results in
// /AppleInternal/Library/BuildRoots/c7c74b64-74b4-11ef-aeda-9635a580fe0d/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShaders/MPSCore/Utility/MPSKernelDAG.mm:805:
// failed assertion `Error getting visible function: (null) Function isNaN_u8_i8 was not found in the library'
if (([primaryTensor dataType] & MPSDataTypeFloatBit) == 0) {

@ -539,7 +539,7 @@ Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor,
static const bool is_macOS_15_0_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS);
// Use gather kernel to solve strides for macOS < 15.0
- // Starting with macOS 15.0, MPS supports native strides direclty in the kernels
+ // Starting with macOS 15.0, MPS supports native strides directly in the kernels
if (!is_macOS_15_0_or_newer || !useMPSStridedAPI) {
if ((!src.is_contiguous() || src.storage_offset()) && gatherTensorData) {
Tensor emptyShell = Tensor();

@ -856,7 +856,7 @@ id<MTLLibrary> MetalShaderLibrary::getLibrary(const std::initializer_list<std::s
break;
}
default:
- TORCH_INTERNAL_ASSERT(false, "Unsupported number of paramaters ", nparams);
+ TORCH_INTERNAL_ASSERT(false, "Unsupported number of parameters ", nparams);
}
return libMap[key] = lib;
}

@ -1184,9 +1184,9 @@ void MetalKernelFunction::dispatch(uint64_t length, std::optional<uint64_t> grou
}
void MetalKernelFunction::dispatch(c10::ArrayRef<uint64_t> length, c10::OptionalArrayRef<uint64_t> group_size) {
- TORCH_CHECK(!length.empty() && length.size() < 4, "Dispatch dimentions must be less than 3 and non-empty");
+ TORCH_CHECK(!length.empty() && length.size() < 4, "Dispatch dimensions must be less than 3 and non-empty");
TORCH_CHECK(!group_size.has_value() || group_size->size() == length.size(),
- "size and group_size must have same number of dimentions");
+ "size and group_size must have same number of dimensions");
const auto max_tg_size = getMaxThreadsPerThreadgroup();
const auto group_size_length = group_size.has_value() ? group_size->size() : 0;
auto tg_size = MTLSizeMake(group_size_length > 0 ? group_size->at(0) : max_tg_size,
@ -59,7 +59,7 @@ static GridSamplerOffsets find_grid_sampler_offsets(
return offsets;
}
- // Mod function which gives postive output when `a` is negative
+ // Mod function which gives positive output when `a` is negative
static int32_t mod(int32_t a, int32_t b) {
auto r = a % b;
return r + (r < 0 ? b : 0);
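A quick note on the helper in the hunk above: C++'s built-in % keeps the sign of the dividend, so the added correction is what makes negative coordinates wrap into a valid index range. A self-contained check, repeating the same definition for illustration:

#include <cassert>
#include <cstdint>

static int32_t mod(int32_t a, int32_t b) {  // same definition as in the hunk above
  auto r = a % b;
  return r + (r < 0 ? b : 0);
}

int main() {
  assert(-1 % 5 == -1);     // built-in remainder keeps the dividend's sign
  assert(mod(-1, 5) == 4);  // shifted into [0, b), usable as an index
  assert(mod(7, 5) == 2);   // non-negative inputs behave like plain %
  return 0;
}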
@ -191,9 +191,9 @@ void grid_sampler_single_element(
int32_t right_indices[3];
opmath_t<T> scales[3];
- // For each dimension, find the pair of indices in the cooresponding dimension
+ // For each dimension, find the pair of indices in the corresponding dimension
// of `input` which surround the grid coordinate in that dimension. We'll do
- // this by mapping different coordiante spaces onto each other. There are
+ // this by mapping different coordinate spaces onto each other. There are
// basically three different coordinate spaces to keep in mind:
//
// * aligned grid space

@ -137,7 +137,7 @@ kernel void index_put_serial(
constant int64_t* index_strides,
constant uint4& ndim_nindices_numel,
uint thread_index [[thread_position_in_grid]]) {
- (void)thread_index; // Suppress unused vairable varning
+ (void)thread_index; // Suppress unused variable warning
for (uint idx = 0; idx < ndim_nindices_numel.z; ++idx) {
index_put_impl(
output,
Some files were not shown because too many files have changed in this diff.