Mirror of https://github.com/pytorch/pytorch.git, synced 2025-11-17 16:46:31 +08:00
Compare commits: ciflow/tru...viable/str (5 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 4322354770 | |
| | 363385ad3e | |
| | e2e10753d7 | |
| | 5d99a795f5 | |
| | 2245d7d3b9 | |
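Every commit in this comparison applies the same mechanical change to the C++ sources: single-character string literals streamed into a `std::ostream` (such as `"\n"` or `")"`) are replaced with character literals (`'\n'`, `')'`). The sketch below is illustrative only and is not code from the diff; the `print_range` helper is hypothetical, and the stated rationale (the `char` overload of `operator<<` writes one character directly, while the `const char*` overload must also determine the string length) is the usual motivation for this kind of cleanup.

```cpp
#include <iostream>
#include <sstream>
#include <string>

// Hypothetical helper (not part of the diff) showing the before/after pattern.
std::string print_range(int begin, int end) {
  std::ostringstream ss;
  // Before: single-character string literals.
  //   ss << "Range[" << begin << ", " << end << "]" << "\n";
  // After: character literals for single characters.
  ss << "Range[" << begin << ", " << end << ']' << '\n';
  return ss.str();
}

int main() {
  std::cout << print_range(0, 10);  // prints "Range[0, 10]" followed by a newline
}
```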
@@ -144,7 +144,7 @@ inline std::bitset<kVmapNumLevels> createVmapLevelsBitset(BatchDimsRef bdims) {
 }
 
 inline std::ostream& operator<<(std::ostream& out, const BatchDim& bdim) {
-  out << "(lvl=" << bdim.level() << ", dim=" << bdim.dim() << ")";
+  out << "(lvl=" << bdim.level() << ", dim=" << bdim.dim() << ')';
   return out;
 }
 
@@ -9,7 +9,7 @@ namespace indexing {
 const EllipsisIndexType Ellipsis = EllipsisIndexType();
 
 std::ostream& operator<<(std::ostream& stream, const Slice& slice) {
-  stream << slice.start() << ":" << slice.stop() << ":" << slice.step();
+  stream << slice.start() << ':' << slice.stop() << ':' << slice.step();
   return stream;
 }
 
@@ -31,12 +31,12 @@ std::ostream& operator<<(std::ostream& stream, const TensorIndex& tensor_index)
 }
 
 std::ostream& operator<<(std::ostream& stream, const std::vector<TensorIndex>& tensor_indices) {
-  stream << "(";
+  stream << '(';
   for (const auto i : c10::irange(tensor_indices.size())) {
     stream << tensor_indices[i];
     if (i < tensor_indices.size() - 1) stream << ", ";
   }
-  stream << ")";
+  stream << ')';
   return stream;
 }
 
@@ -113,7 +113,7 @@ void TensorNames::checkUnique(const char* op_name) const {
 std::ostream& operator<<(std::ostream& out, const TensorName& tensorname) {
   out << tensorname.name_ << " (index ";
   out << tensorname.origin_idx_ << " of ";
-  out << tensorname.origin_ << ")";
+  out << tensorname.origin_ << ')';
   return out;
 }
 
@@ -13,9 +13,9 @@ std::ostream& operator<<(std::ostream & out, const TensorGeometryArg& t) {
   if (t.pos == 0) {
     // 0 is distinguished; it usually indicates 'self' or the return
     // tensor
-    out << "'" << t.name << "'";
+    out << '\'' << t.name << '\'';
   } else {
-    out << "argument #" << t.pos << " '" << t.name << "'";
+    out << "argument #" << t.pos << " '" << t.name << '\'';
   }
   return out;
 }
@@ -154,7 +154,7 @@ void checkSameGPU(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) {
     oss << "Tensor for " << t2 << " is on CPU, ";
   }
   oss << "but expected " << ((!t1->is_cpu() && !t2->is_cpu()) ? "them" : "it")
-      << " to be on GPU (while checking arguments for " << c << ")";
+      << " to be on GPU (while checking arguments for " << c << ')';
   TORCH_CHECK(false, oss.str());
 }
 TORCH_CHECK(
@@ -199,7 +199,7 @@ void checkScalarTypes(CheckedFrom c, const TensorArg& t,
     i++;
   }
   oss << "; but got " << t->toString()
-      << " instead (while checking arguments for " << c << ")";
+      << " instead (while checking arguments for " << c << ')';
   TORCH_CHECK(false, oss.str());
   }
 }
 
@@ -43,8 +43,8 @@ std::string get_mkldnn_version() {
   // https://github.com/intel/ideep/issues/29
   {
     const dnnl_version_t* ver = dnnl_version();
-    ss << "Intel(R) MKL-DNN v" << ver->major << "." << ver->minor << "." << ver->patch
-       << " (Git Hash " << ver->hash << ")";
+    ss << "Intel(R) MKL-DNN v" << ver->major << '.' << ver->minor << '.' << ver->patch
+       << " (Git Hash " << ver->hash << ')';
   }
 #else
   ss << "MKLDNN not found";
@@ -81,7 +81,7 @@ std::string get_openmp_version() {
       break;
     }
     if (ver_str) {
-      ss << " (a.k.a. OpenMP " << ver_str << ")";
+      ss << " (a.k.a. OpenMP " << ver_str << ')';
     }
   }
 #else
@@ -135,38 +135,38 @@ std::string show_config() {
 
 #if defined(__GNUC__)
   {
-    ss << " - GCC " << __GNUC__ << "." << __GNUC_MINOR__ << "\n";
+    ss << " - GCC " << __GNUC__ << '.' << __GNUC_MINOR__ << '\n';
   }
 #endif
 
 #if defined(__cplusplus)
   {
-    ss << " - C++ Version: " << __cplusplus << "\n";
+    ss << " - C++ Version: " << __cplusplus << '\n';
   }
 #endif
 
 #if defined(__clang_major__)
   {
-    ss << " - clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__ << "\n";
+    ss << " - clang " << __clang_major__ << '.' << __clang_minor__ << '.' << __clang_patchlevel__ << '\n';
   }
 #endif
 
 #if defined(_MSC_VER)
   {
-    ss << " - MSVC " << _MSC_FULL_VER << "\n";
+    ss << " - MSVC " << _MSC_FULL_VER << '\n';
   }
 #endif
 
 #if AT_MKL_ENABLED()
-  ss << " - " << get_mkl_version() << "\n";
+  ss << " - " << get_mkl_version() << '\n';
 #endif
 
 #if AT_MKLDNN_ENABLED()
-  ss << " - " << get_mkldnn_version() << "\n";
+  ss << " - " << get_mkldnn_version() << '\n';
 #endif
 
 #ifdef _OPENMP
-  ss << " - " << get_openmp_version() << "\n";
+  ss << " - " << get_openmp_version() << '\n';
 #endif
 
 #if AT_BUILD_WITH_LAPACK()
@@ -183,7 +183,7 @@ std::string show_config() {
   ss << " - Cross compiling on MacOSX\n";
 #endif
 
-  ss << " - "<< used_cpu_capability() << "\n";
+  ss << " - "<< used_cpu_capability() << '\n';
 
   if (hasCUDA()) {
     ss << detail::getCUDAHooks().showConfig();
@@ -200,10 +200,10 @@ std::string show_config() {
   ss << " - Build settings: ";
   for (const auto& pair : caffe2::GetBuildOptions()) {
     if (!pair.second.empty()) {
-      ss << pair.first << "=" << pair.second << ", ";
+      ss << pair.first << '=' << pair.second << ", ";
     }
   }
-  ss << "\n";
+  ss << '\n';
 
   // TODO: do HIP
   // TODO: do XLA
 
@@ -209,7 +209,7 @@ struct CodeTemplate {
   // to indent correctly in the context.
   void emitIndent(std::ostream& out, size_t indent) const {
     for ([[maybe_unused]] const auto i : c10::irange(indent)) {
-      out << " ";
+      out << ' ';
     }
   }
   void emitStringWithIndents(
 
@@ -10,7 +10,7 @@ std::ostream& operator<<(std::ostream& out, const Dimname& dimname) {
   if (dimname.type() == NameType::WILDCARD) {
     out << "None";
   } else {
-    out << "'" << dimname.symbol().toUnqualString() << "'";
+    out << '\'' << dimname.symbol().toUnqualString() << '\'';
   }
   return out;
 }
 
@@ -5,7 +5,7 @@
 namespace at {
 
 std::ostream& operator<<(std::ostream& out, const Range& range) {
-  out << "Range[" << range.begin << ", " << range.end << "]";
+  out << "Range[" << range.begin << ", " << range.end << ']';
   return out;
 }
 
@@ -71,7 +71,7 @@ void TensorBase::enforce_invariants() {
 
 void TensorBase::print() const {
   if (defined()) {
-    std::cerr << "[" << toString() << " " << sizes() << "]" << '\n';
+    std::cerr << '[' << toString() << ' ' << sizes() << ']' << '\n';
   } else {
     std::cerr << "[UndefinedTensor]" << '\n';
   }
 
@@ -9,8 +9,8 @@ APIVitals VitalsAPI;
 
 std::ostream& operator<<(std::ostream& os, TorchVital const& tv) {
   for (const auto& m : tv.attrs) {
-    os << "[TORCH_VITAL] " << tv.name << "." << m.first << "\t\t "
-       << m.second.value << "\n";
+    os << "[TORCH_VITAL] " << tv.name << '.' << m.first << "\t\t "
+       << m.second.value << '\n';
   }
   return os;
 }
 
@@ -100,18 +100,18 @@ inline bool operator==(const AliasInfo& lhs, const AliasInfo& rhs) {
 
 // this does match the way things are represented in the schema
 inline std::ostream& operator<<(std::ostream& out, const AliasInfo& aliasInfo) {
-  out << "(";
+  out << '(';
   bool first = true;
   for (const auto& set : aliasInfo.beforeSets()) {
     if (first) {
       first = false;
     } else {
-      out << "|";
+      out << '|';
     }
     out << set.toUnqualString();
   }
   if (aliasInfo.isWrite()) {
-    out << "!";
+    out << '!';
   }
   if (aliasInfo.beforeSets() != aliasInfo.afterSets()) {
     out << " -> ";
@@ -120,12 +120,12 @@ inline std::ostream& operator<<(std::ostream& out, const AliasInfo& aliasInfo) {
       if (first) {
         first = false;
       } else {
-        out << "|";
+        out << '|';
       }
       out << set.toUnqualString();
     }
   }
-  out << ")";
+  out << ')';
   return out;
 }
 } // namespace c10
 
@@ -198,7 +198,7 @@ inline void swap(Blob& lhs, Blob& rhs) noexcept {
 }
 
 inline std::ostream& operator<<(std::ostream& out, const Blob& v) {
-  return out << "Blob[" << v.TypeName() << "]";
+  return out << "Blob[" << v.TypeName() << ']';
 }
 
 } // namespace caffe2
 
@@ -456,8 +456,8 @@ bool ClassType::isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const {
       *why_not << "Method on class '" << repr_str()
                << "' (1) is not compatible with interface '"
                << rhs.repr_str() << "' (2)\n"
-               << " (1) " << self_method->getSchema() << "\n"
-               << " (2) " << schema << "\n";
+               << " (1) " << self_method->getSchema() << '\n'
+               << " (2) " << schema << '\n';
     }
     return false;
   }
 
@@ -100,7 +100,7 @@ struct TORCH_API ClassType : public NamedType {
   std::string repr_str() const override {
     std::stringstream ss;
     ss << str()
-       << " (of Python compilation unit at: " << compilation_unit().get() << ")";
+       << " (of Python compilation unit at: " << compilation_unit().get() << ')';
     return ss.str();
   }
 
@@ -58,12 +58,12 @@ std::string DispatchKeyExtractor::dumpState() const {
   std::ostringstream oss;
   for (const auto i : c10::irange(c10::utils::bitset::NUM_BITS())) {
     if (dispatch_arg_indices_reverse_.get(i)) {
-      oss << "1";
+      oss << '1';
     } else {
-      oss << "0";
+      oss << '0';
     }
   }
-  oss << " " << nonFallthroughKeys_ << "\n";
+  oss << ' ' << nonFallthroughKeys_ << '\n';
   return oss.str();
 }
 
@@ -69,8 +69,8 @@ private:
 
 void _print_dispatch_trace(const std::string& label, const std::string& op_name, const DispatchKeySet& dispatchKeySet) {
   auto nesting_value = dispatch_trace_nesting_value();
-  for (int64_t i = 0; i < nesting_value; ++i) std::cerr << " ";
-  std::cerr << label << " op=[" << op_name << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << "]" << std::endl;
+  for (int64_t i = 0; i < nesting_value; ++i) std::cerr << ' ';
+  std::cerr << label << " op=[" << op_name << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << ']' << std::endl;
 }
 } // namespace detail
 
@@ -570,7 +570,7 @@ void OperatorEntry::checkInvariants() const {
 
 std::string OperatorEntry::listAllDispatchKeys() const {
   std::ostringstream str;
-  str << "[";
+  str << '[';
 
   bool has_kernels = false;
   for (auto k : allDispatchKeysInFullSet()) {
@@ -584,7 +584,7 @@ std::string OperatorEntry::listAllDispatchKeys() const {
     str << k;
     has_kernels = true;
   }
-  str << "]";
+  str << ']';
   return str.str();
 }
 
@@ -683,12 +683,12 @@ void OperatorEntry::setReportErrorCallback_(std::unique_ptr<c10::SafePyObject> c
 // This WON'T report backend fallbacks.
 std::string OperatorEntry::dumpState() const {
   std::ostringstream oss;
-  oss << "name: " << name_ << "\n";
+  oss << "name: " << name_ << '\n';
   if (schema_) {
-    oss << "schema: " << schema_->schema << "\n";
-    oss << "debug: " << schema_->debug << "\n";
+    oss << "schema: " << schema_->schema << '\n';
+    oss << "debug: " << schema_->debug << '\n';
     oss << "alias analysis kind: " << toString(schema_->schema.aliasAnalysis())
-        << (schema_->schema.isDefaultAliasAnalysisKind() ? " (default)" : "") << "\n";
+        << (schema_->schema.isDefaultAliasAnalysisKind() ? " (default)" : "") << '\n';
   } else {
     oss << "schema: (none)\n";
   }
 
@@ -7,7 +7,7 @@
 namespace c10 {
 
 void FunctionSchema::dump() const {
-  std::cout << *this << "\n";
+  std::cout << *this << '\n';
 }
 
 const std::vector<Argument>& FunctionSchema::getCorrectList(SchemaArgType type) const {
@@ -210,9 +210,9 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) {
 
   out << schema.name();
   if (!schema.overload_name().empty()) {
-    out << "." << schema.overload_name();
+    out << '.' << schema.overload_name();
   }
-  out << "(";
+  out << '(';
 
   bool seen_kwarg_only = false;
   for (const auto i : c10::irange(schema.arguments().size())) {
@@ -273,7 +273,7 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) {
   }
 
   if (need_paren) {
-    out << "(";
+    out << '(';
   }
   for (const auto i : c10::irange(returns.size())) {
     if (i > 0) {
@@ -288,7 +288,7 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) {
     out << "...";
   }
   if (need_paren) {
-    out << ")";
+    out << ')';
   }
   return out;
 }
@@ -471,7 +471,7 @@ bool FunctionSchema::isForwardCompatibleWith(
     if (!arguments().at(i).isForwardCompatibleWith(old.arguments().at(i))) {
       if (why_not) {
         why_not
-            << "'" << arguments().at(i).name() << "'"
+            << '\'' << arguments().at(i).name() << '\''
             << " is not forward compatible with the older version of the schema";
       }
       return false;
@@ -511,7 +511,7 @@ bool FunctionSchema::isForwardCompatibleWith(
             .isForwardCompatibleWith(old.arguments().at(i))) {
       if (why_not) {
         why_not << "Out argument '"
-                << "'" << arguments().at(i).name()
+                << '\'' << arguments().at(i).name()
                 << " is not FC with the older version of the schema";
       }
       return false;
 
@@ -571,7 +571,7 @@ inline std::ostream& operator<<(std::ostream& out, const Argument& arg) {
     if (arg.N()) {
       N = std::to_string(*arg.N());
     }
-    out << "[" << N << "]";
+    out << '[' << N << ']';
   } else {
     out << unopt_type->str();
   }
@@ -582,15 +582,15 @@ inline std::ostream& operator<<(std::ostream& out, const Argument& arg) {
   }
 
   if (is_opt) {
-    out << "?";
+    out << '?';
   }
 
   if (!arg.name().empty()) {
-    out << " " << arg.name();
+    out << ' ' << arg.name();
   }
 
   if (arg.default_value()) {
-    out << "=";
+    out << '=';
     if ((type->kind() == c10::TypeKind::StringType ||
          unopt_type->kind() == c10::TypeKind::StringType) &&
         arg.default_value().value().isString()) {
 
@@ -66,7 +66,7 @@ bool operator==(const ivalue::Tuple& lhs, const ivalue::Tuple& rhs) {
 }
 
 std::ostream& operator<<(std::ostream& out, const ivalue::EnumHolder& v) {
-  out << v.qualifiedClassName() << "." << v.name();
+  out << v.qualifiedClassName() << '.' << v.name();
   return out;
 }
 
@ -526,7 +526,7 @@ std::ostream& printMaybeAnnotatedList(
|
||||
!elementTypeCanBeInferredFromMembers(list_elem_type)) {
|
||||
out << "annotate(" << the_list.type<c10::Type>()->annotation_str() << ", ";
|
||||
printList(out, the_list.toListRef(), "[", "]", formatter);
|
||||
out << ")";
|
||||
out << ')';
|
||||
return out;
|
||||
} else {
|
||||
return printList(out, the_list.toListRef(), "[", "]", formatter);
|
||||
@ -538,7 +538,7 @@ std::ostream& printDict(
|
||||
std::ostream& out,
|
||||
const Dict& v,
|
||||
const IValueFormatter& formatter) {
|
||||
out << "{";
|
||||
out << '{';
|
||||
|
||||
bool first = true;
|
||||
for (const auto& pair : v) {
|
||||
@ -552,7 +552,7 @@ std::ostream& printDict(
|
||||
first = false;
|
||||
}
|
||||
|
||||
out << "}";
|
||||
out << '}';
|
||||
return out;
|
||||
}
|
||||
}
|
||||
@ -565,8 +565,8 @@ static std::ostream& printMaybeAnnotatedDict(
|
||||
auto value_type = the_dict.type()->castRaw<DictType>()->getValueType();
|
||||
if (the_dict.toGenericDict().empty() ||
|
||||
!elementTypeCanBeInferredFromMembers(value_type)) {
|
||||
out << "annotate(" << the_dict.type<c10::Type>()->annotation_str() << ",";
|
||||
printDict(out, the_dict.toGenericDict(), formatter) << ")";
|
||||
out << "annotate(" << the_dict.type<c10::Type>()->annotation_str() << ',';
|
||||
printDict(out, the_dict.toGenericDict(), formatter) << ')';
|
||||
} else {
|
||||
return printDict(out, the_dict.toGenericDict(), formatter);
|
||||
}
|
||||
@ -577,7 +577,7 @@ static std::ostream& printComplex(std::ostream & out, const IValue & v) {
|
||||
c10::complex<double> d = v.toComplexDouble();
|
||||
IValue real(d.real()), imag(std::abs(d.imag()));
|
||||
auto sign = d.imag() >= 0 ? '+' : '-';
|
||||
return out << real << sign << imag << "j";
|
||||
return out << real << sign << imag << 'j';
|
||||
}
|
||||
|
||||
std::ostream& IValue::repr(
|
||||
@ -605,9 +605,9 @@ std::ostream& IValue::repr(
|
||||
if (static_cast<double>(i) == d) {
|
||||
// -0.0 (signed zero) needs to be parsed as -0.
|
||||
if (i == 0 && std::signbit(d)) {
|
||||
return out << "-" << i << ".";
|
||||
return out << '-' << i << '.';
|
||||
}
|
||||
return out << i << ".";
|
||||
return out << i << '.';
|
||||
}
|
||||
}
|
||||
auto orig_prec = out.precision();
|
||||
@ -643,20 +643,20 @@ std::ostream& IValue::repr(
|
||||
device_stream << v.toDevice();
|
||||
out << "torch.device(";
|
||||
c10::printQuotedString(out, device_stream.str());
|
||||
return out << ")";
|
||||
return out << ')';
|
||||
}
|
||||
case IValue::Tag::Generator: {
|
||||
auto generator = v.toGenerator();
|
||||
out << "torch.Generator(device=";
|
||||
c10::printQuotedString(out, generator.device().str());
|
||||
out << ", seed=" << generator.current_seed() << ")";
|
||||
out << ", seed=" << generator.current_seed() << ')';
|
||||
return out;
|
||||
}
|
||||
case IValue::Tag::GenericDict:
|
||||
return printMaybeAnnotatedDict(out, v, formatter);
|
||||
case IValue::Tag::Enum: {
|
||||
auto enum_holder = v.toEnumHolder();
|
||||
return out << enum_holder->qualifiedClassName() << "." <<
|
||||
return out << enum_holder->qualifiedClassName() << '.' <<
|
||||
enum_holder->name();
|
||||
}
|
||||
case IValue::Tag::Object: {
|
||||
@ -801,7 +801,7 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) {
|
||||
if (c == FP_NORMAL || c == FP_ZERO) {
|
||||
int64_t i = static_cast<int64_t>(d);
|
||||
if (static_cast<double>(i) == d) {
|
||||
return out << i << ".";
|
||||
return out << i << '.';
|
||||
}
|
||||
}
|
||||
auto orig_prec = out.precision();
|
||||
@ -852,7 +852,7 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) {
|
||||
return printDict(out, v.toGenericDict(), formatter);
|
||||
case IValue::Tag::PyObject: {
|
||||
auto py_obj = v.toPyObject();
|
||||
return out << "<PyObject at" << py_obj << ">";
|
||||
return out << "<PyObject at" << py_obj << '>';
|
||||
}
|
||||
case IValue::Tag::Generator:
|
||||
return out << "Generator";
|
||||
@ -862,22 +862,22 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) {
|
||||
// TODO we should attempt to call __str__ if the object defines it.
|
||||
auto obj = v.toObject();
|
||||
// print this out the way python would do it
|
||||
return out << "<" << obj->name() << " object at " << obj.get() << ">";
|
||||
return out << '<' << obj->name() << " object at " << obj.get() << '>';
|
||||
}
|
||||
case IValue::Tag::Enum: {
|
||||
auto enum_holder = v.toEnumHolder();
|
||||
return out << "Enum<" << enum_holder->unqualifiedClassName() << "." <<
|
||||
enum_holder->name() << ">";
|
||||
return out << "Enum<" << enum_holder->unqualifiedClassName() << '.' <<
|
||||
enum_holder->name() << '>';
|
||||
}
|
||||
|
||||
}
|
||||
return out << "<Invalid IValue tag=" << std::to_string(static_cast<uint32_t>(v.tag)) << ">";
|
||||
return out << "<Invalid IValue tag=" << std::to_string(static_cast<uint32_t>(v.tag)) << '>';
|
||||
}
|
||||
|
||||
#undef TORCH_FORALL_TAGS
|
||||
|
||||
void IValue::dump() const {
|
||||
std::cout << *this << "\n";
|
||||
std::cout << *this << '\n';
|
||||
}
|
||||
|
||||
std::shared_ptr<ClassType> ivalue::Object::type() const {
|
||||
@ -1050,7 +1050,7 @@ c10::intrusive_ptr<ivalue::Object> ivalue::Object::deepcopy(
|
||||
std::stringstream err;
|
||||
err << "Cannot serialize custom bound C++ class";
|
||||
if (auto qualname = type()->name()) {
|
||||
err << " " << qualname->qualifiedName();
|
||||
err << ' ' << qualname->qualifiedName();
|
||||
}
|
||||
err << ". Please define serialization methods via def_pickle() for "
|
||||
"this class.";
|
||||
|
||||
@ -211,7 +211,7 @@ struct TORCH_API OptionalType : public UnionType {
|
||||
|
||||
std::string str() const override {
|
||||
std::stringstream ss;
|
||||
ss << getElementType()->str() << "?";
|
||||
ss << getElementType()->str() << '?';
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
@ -240,7 +240,7 @@ struct TORCH_API OptionalType : public UnionType {
|
||||
|
||||
std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override {
|
||||
std::stringstream ss;
|
||||
ss << "Optional[" << getElementType()->annotation_str(printer) << "]";
|
||||
ss << "Optional[" << getElementType()->annotation_str(printer) << ']';
|
||||
return ss.str();
|
||||
}
|
||||
};
|
||||
@ -906,7 +906,7 @@ struct TORCH_API ListType
|
||||
|
||||
std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override {
|
||||
std::stringstream ss;
|
||||
ss << "List[" << getElementType()->annotation_str(printer) << "]";
|
||||
ss << "List[" << getElementType()->annotation_str(printer) << ']';
|
||||
return ss.str();
|
||||
}
|
||||
};
|
||||
@ -946,7 +946,7 @@ struct TORCH_API DictType : public SharedType {
|
||||
std::string str() const override {
|
||||
std::stringstream ss;
|
||||
ss << "Dict(" << getKeyType()->str() << ", " << getValueType()->str()
|
||||
<< ")";
|
||||
<< ')';
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
@ -1018,7 +1018,7 @@ struct TORCH_API FutureType
|
||||
|
||||
std::string str() const override {
|
||||
std::stringstream ss;
|
||||
ss << "Future(" << getElementType()->str() << ")";
|
||||
ss << "Future(" << getElementType()->str() << ')';
|
||||
return ss.str();
|
||||
}
|
||||
TypePtr createWithContained(
|
||||
@ -1041,7 +1041,7 @@ struct TORCH_API FutureType
|
||||
|
||||
std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override {
|
||||
std::stringstream ss;
|
||||
ss << "Future[" << getElementType()->annotation_str(printer) << "]";
|
||||
ss << "Future[" << getElementType()->annotation_str(printer) << ']';
|
||||
return ss.str();
|
||||
}
|
||||
};
|
||||
@ -1060,7 +1060,7 @@ struct TORCH_API AwaitType
|
||||
|
||||
std::string str() const override {
|
||||
std::stringstream ss;
|
||||
ss << "Await(" << getElementType()->str() << ")";
|
||||
ss << "Await(" << getElementType()->str() << ')';
|
||||
return ss.str();
|
||||
}
|
||||
TypePtr createWithContained(
|
||||
@ -1083,7 +1083,7 @@ struct TORCH_API AwaitType
|
||||
|
||||
std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override {
|
||||
std::stringstream ss;
|
||||
ss << "Await[" << getElementType()->annotation_str(printer) << "]";
|
||||
ss << "Await[" << getElementType()->annotation_str(printer) << ']';
|
||||
return ss.str();
|
||||
}
|
||||
};
|
||||
@ -1102,7 +1102,7 @@ struct TORCH_API RRefType
|
||||
|
||||
std::string str() const override {
|
||||
std::stringstream ss;
|
||||
ss << "RRef(" << getElementType()->str() << ")";
|
||||
ss << "RRef(" << getElementType()->str() << ')';
|
||||
return ss.str();
|
||||
}
|
||||
TypePtr createWithContained(
|
||||
@ -1115,7 +1115,7 @@ struct TORCH_API RRefType
|
||||
|
||||
std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override {
|
||||
std::stringstream ss;
|
||||
ss << "RRef[" << getElementType()->annotation_str(printer) << "]";
|
||||
ss << "RRef[" << getElementType()->annotation_str(printer) << ']';
|
||||
return ss.str();
|
||||
}
|
||||
};
|
||||
|
||||
@ -11,7 +11,7 @@ std::string toString(const OperatorName& opName) {
|
||||
std::ostream& operator<<(std::ostream& os, const OperatorName& opName) {
|
||||
os << opName.name;
|
||||
if (!opName.overload_name.empty()) {
|
||||
os << "." << opName.overload_name;
|
||||
os << '.' << opName.overload_name;
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
@ -65,7 +65,7 @@ VaryingShape<T> VaryingShape<T>::merge(const VaryingShape<T>& other) const {
|
||||
|
||||
template <typename T>
|
||||
std::ostream& operator<<(std::ostream& out, const VaryingShape<T>& vs) {
|
||||
out << "(";
|
||||
out << '(';
|
||||
if (!vs.size()) {
|
||||
out << "*)";
|
||||
return out;
|
||||
@ -79,10 +79,10 @@ std::ostream& operator<<(std::ostream& out, const VaryingShape<T>& vs) {
|
||||
if (v.has_value()) {
|
||||
out << v.value();
|
||||
} else {
|
||||
out << "*";
|
||||
out << '*';
|
||||
}
|
||||
}
|
||||
out << ")";
|
||||
out << ')';
|
||||
return out;
|
||||
}
|
||||
|
||||
@ -105,7 +105,7 @@ std::ostream& operator<<(
|
||||
}
|
||||
auto sizes_opt = ss.sizes();
|
||||
|
||||
os << "(";
|
||||
os << '(';
|
||||
for (size_t i = 0; i < rank_opt.value(); i++) {
|
||||
if (i > 0) {
|
||||
os << ", ";
|
||||
@ -113,10 +113,10 @@ std::ostream& operator<<(
|
||||
if(sizes_opt.has_value() && sizes_opt.value()[i].is_static()) {
|
||||
os << sizes_opt.value()[i];
|
||||
} else {
|
||||
os << "*";
|
||||
os << '*';
|
||||
}
|
||||
}
|
||||
os << ")";
|
||||
os << ')';
|
||||
|
||||
return os;
|
||||
}
|
||||
@ -131,17 +131,17 @@ std::ostream& operator<<(std::ostream& os, const ShapeSymbol& s) {
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& os, const Stride& s) {
|
||||
os << "{";
|
||||
os << '{';
|
||||
if (s.stride_index_.has_value()) {
|
||||
os << *s.stride_index_;
|
||||
} else {
|
||||
os << "*";
|
||||
os << '*';
|
||||
}
|
||||
os << ":";
|
||||
os << ':';
|
||||
if (s.stride_.has_value()) {
|
||||
os << *s.stride_;
|
||||
} else {
|
||||
os << "*";
|
||||
os << '*';
|
||||
}
|
||||
os << '}';
|
||||
return os;
|
||||
|
||||
@ -67,7 +67,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
|
||||
bool has_valid_strides_info = ndim > 0 &&
|
||||
value->strides().isComplete() && value->strides().size() == ndim;
|
||||
|
||||
out << "(";
|
||||
out << '(';
|
||||
size_t i = 0;
|
||||
bool symbolic = type_verbosity() == TypeVerbosity::Symbolic;
|
||||
for (i = 0; i < *ndim; ++i) {
|
||||
@ -79,7 +79,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
|
||||
} else if (symbolic) {
|
||||
out << value->symbolic_sizes().at(i);
|
||||
} else {
|
||||
out << "*";
|
||||
out << '*';
|
||||
}
|
||||
}
|
||||
if (has_valid_strides_info &&
|
||||
@ -91,7 +91,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
|
||||
}
|
||||
out << value->strides()[i].value();
|
||||
}
|
||||
out << "]";
|
||||
out << ']';
|
||||
}
|
||||
if (type_verbosity() >= TypeVerbosity::Full) {
|
||||
if (value->requiresGrad()) {
|
||||
@ -107,12 +107,12 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
|
||||
out << "device=" << *value->device();
|
||||
}
|
||||
}
|
||||
out << ")";
|
||||
out << ')';
|
||||
} else {
|
||||
if (type_verbosity() >= TypeVerbosity::Full) {
|
||||
size_t i = 0;
|
||||
if (value->requiresGrad()) {
|
||||
out << "("
|
||||
out << '('
|
||||
<< "requires_grad=" << *value->requiresGrad();
|
||||
i++;
|
||||
}
|
||||
@ -120,7 +120,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
|
||||
out << ((i++ > 0) ? ", " : "(") << "device=" << *value->device();
|
||||
}
|
||||
if (i > 0) {
|
||||
out << ")";
|
||||
out << ')';
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -133,18 +133,18 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
|
||||
out << *prim << "[]";
|
||||
} else if (t.kind() == TypeKind::OptionalType) {
|
||||
auto prim = t.castRaw<OptionalType>()->getElementType();
|
||||
out << *prim << "?";
|
||||
out << *prim << '?';
|
||||
} else if(t.kind() == TypeKind::FutureType) {
|
||||
auto elem = t.castRaw<FutureType>()->getElementType();
|
||||
out << "Future[" << *elem << "]";
|
||||
out << "Future[" << *elem << ']';
|
||||
} else if(t.kind() == TypeKind::RRefType) {
|
||||
auto elem = t.castRaw<RRefType>()->getElementType();
|
||||
out << "RRef[" << *elem << "]";
|
||||
out << "RRef[" << *elem << ']';
|
||||
} else if(auto tup = t.cast<TupleType>()) {
|
||||
if (tup->schema()) {
|
||||
out << "NamedTuple";
|
||||
}
|
||||
out << "(";
|
||||
out << '(';
|
||||
for(size_t i = 0; i < tup->elements().size(); ++i) {
|
||||
if(i > 0)
|
||||
out << ", ";
|
||||
@ -160,7 +160,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
|
||||
out << *(tup->elements()[i]);
|
||||
}
|
||||
}
|
||||
out << ")";
|
||||
out << ')';
|
||||
} else if (t.kind() == TypeKind::FunctionType) {
|
||||
out << "Function";
|
||||
} else {
|
||||
@ -475,7 +475,7 @@ std::optional<TypePtr> unifyTypeList(
|
||||
why_not << "Could not unify type list since element " << i << " of type "
|
||||
<< elements.at(i)->repr_str()
|
||||
<< " did not match the types before it ("
|
||||
<< ret_type->repr_str() << ")";
|
||||
<< ret_type->repr_str() << ')';
|
||||
return std::nullopt;
|
||||
}
|
||||
ret_type = *maybe_unified;
|
||||
@ -907,13 +907,13 @@ std::string TupleType::str() const {
|
||||
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
|
||||
ss << name()->qualifiedName();
|
||||
} else {
|
||||
ss << "(";
|
||||
ss << '(';
|
||||
for(size_t i = 0; i < elements().size(); ++i) {
|
||||
if(i > 0)
|
||||
ss << ", ";
|
||||
ss << elements()[i]->str();
|
||||
}
|
||||
ss << ")";
|
||||
ss << ')';
|
||||
}
|
||||
return ss.str();
|
||||
}
|
||||
@ -1003,8 +1003,8 @@ bool InterfaceType::isSubTypeImpl(
|
||||
*why_not << "Method on interface '" << lhs.repr_str()
|
||||
<< "' (1) is not compatible with interface '"
|
||||
<< rhs.repr_str() << "' (2)\n"
|
||||
<< " (1) " << *self_schema << "\n"
|
||||
<< " (2) " << schema << "\n";
|
||||
<< " (1) " << *self_schema << '\n'
|
||||
<< " (2) " << schema << '\n';
|
||||
return false;
|
||||
}
|
||||
return false;
|
||||
@ -1078,7 +1078,7 @@ SymbolicShape SymbolicShape::merge(const SymbolicShape& other) const {
|
||||
}
|
||||
|
||||
void SymbolicShape::dump() const {
|
||||
std::cout << *this << "\n";
|
||||
std::cout << *this << '\n';
|
||||
}
|
||||
|
||||
bool EnumType::isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const {
|
||||
|
||||
@ -205,9 +205,9 @@ UnionType::UnionType(std::vector<TypePtr> reference, TypeKind kind) : SharedType
|
||||
for (const auto i : c10::irange(reference.size())) {
|
||||
msg << reference[i]->repr_str();
|
||||
if (i > 0) {
|
||||
msg << ",";
|
||||
msg << ',';
|
||||
}
|
||||
msg << " ";
|
||||
msg << ' ';
|
||||
}
|
||||
msg << "} has the single type " << types_[0]->repr_str()
|
||||
<< ". Use the common supertype instead of creating a Union"
|
||||
|
||||
@ -80,7 +80,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
|
||||
}
|
||||
stream << buf[i];
|
||||
}
|
||||
stream << "]";
|
||||
stream << ']';
|
||||
return stream;
|
||||
}
|
||||
|
||||
|
||||
@ -55,7 +55,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
|
||||
}
|
||||
stream << buf[i];
|
||||
}
|
||||
stream << "]";
|
||||
stream << ']';
|
||||
return stream;
|
||||
}
|
||||
|
||||
|
||||
@ -411,16 +411,16 @@ std::string CUDAHooks::showConfig() const {
|
||||
// HIP_VERSION value format was changed after ROCm v4.2 to include the patch number
|
||||
if(v < 500) {
|
||||
// If major=xx, minor=yy then format -> xxyy
|
||||
oss << (v / 100) << "." << (v % 10);
|
||||
oss << (v / 100) << '.' << (v % 10);
|
||||
}
|
||||
else {
|
||||
// If major=xx, minor=yy & patch=zzzzz then format -> xxyyzzzzz
|
||||
oss << (v / 10000000) << "." << (v / 100000 % 100) << "." << (v % 100000);
|
||||
oss << (v / 10000000) << '.' << (v / 100000 % 100) << '.' << (v % 100000);
|
||||
}
|
||||
#else
|
||||
oss << (v / 1000) << "." << (v / 10 % 100);
|
||||
oss << (v / 1000) << '.' << (v / 10 % 100);
|
||||
if (v % 10 != 0) {
|
||||
oss << "." << (v % 10);
|
||||
oss << '.' << (v % 10);
|
||||
}
|
||||
#endif
|
||||
};
|
||||
@ -431,16 +431,16 @@ std::string CUDAHooks::showConfig() const {
|
||||
oss << " - HIP Runtime ";
|
||||
#endif
|
||||
printCudaStyleVersion(runtimeVersion);
|
||||
oss << "\n";
|
||||
oss << '\n';
|
||||
|
||||
// TODO: Make HIPIFY understand CUDART_VERSION macro
|
||||
#if !defined(USE_ROCM)
|
||||
if (runtimeVersion != CUDART_VERSION) {
|
||||
oss << " - Built with CUDA Runtime ";
|
||||
printCudaStyleVersion(CUDART_VERSION);
|
||||
oss << "\n";
|
||||
oss << '\n';
|
||||
}
|
||||
oss << " - NVCC architecture flags: " << NVCC_FLAGS_EXTRA << "\n";
|
||||
oss << " - NVCC architecture flags: " << NVCC_FLAGS_EXTRA << '\n';
|
||||
#endif
|
||||
|
||||
#if !defined(USE_ROCM)
|
||||
@ -448,9 +448,9 @@ std::string CUDAHooks::showConfig() const {
|
||||
|
||||
|
||||
auto printCudnnStyleVersion = [&](size_t v) {
|
||||
oss << (v / 1000) << "." << (v / 100 % 10);
|
||||
oss << (v / 1000) << '.' << (v / 100 % 10);
|
||||
if (v % 100 != 0) {
|
||||
oss << "." << (v % 100);
|
||||
oss << '.' << (v % 100);
|
||||
}
|
||||
};
|
||||
|
||||
@ -461,22 +461,22 @@ std::string CUDAHooks::showConfig() const {
|
||||
if (cudnnCudartVersion != CUDART_VERSION) {
|
||||
oss << " (built against CUDA ";
|
||||
printCudaStyleVersion(cudnnCudartVersion);
|
||||
oss << ")";
|
||||
oss << ')';
|
||||
}
|
||||
oss << "\n";
|
||||
oss << '\n';
|
||||
if (cudnnVersion != CUDNN_VERSION) {
|
||||
oss << " - Built with CuDNN ";
|
||||
printCudnnStyleVersion(CUDNN_VERSION);
|
||||
oss << "\n";
|
||||
oss << '\n';
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
// TODO: Check if miopen has the functions above and unify
|
||||
oss << " - MIOpen " << MIOPEN_VERSION_MAJOR << "." << MIOPEN_VERSION_MINOR << "." << MIOPEN_VERSION_PATCH << "\n";
|
||||
oss << " - MIOpen " << MIOPEN_VERSION_MAJOR << '.' << MIOPEN_VERSION_MINOR << '.' << MIOPEN_VERSION_PATCH << '\n';
|
||||
#endif
|
||||
|
||||
#if AT_MAGMA_ENABLED()
|
||||
oss << " - Magma " << MAGMA_VERSION_MAJOR << "." << MAGMA_VERSION_MINOR << "." << MAGMA_VERSION_MICRO << "\n";
|
||||
oss << " - Magma " << MAGMA_VERSION_MAJOR << '.' << MAGMA_VERSION_MINOR << '.' << MAGMA_VERSION_MICRO << '\n';
|
||||
#endif
|
||||
|
||||
return oss.str();
|
||||
|
||||
@ -42,7 +42,7 @@ static inline void launch_jitted_vectorized_kernel_dynamic(
|
||||
|
||||
// The cache key includes all the parameters to generate_code + vec_size + dev_idx
|
||||
std::stringstream ss;
|
||||
ss << nInputs << "_" << nOutputs << f;
|
||||
ss << nInputs << '_' << nOutputs << f;
|
||||
ss << f_inputs_type_str << compute_type_str << result_type_str;
|
||||
ss << static_cast<int>(at::cuda::jit::BinaryFuncVariant::NoScalar);
|
||||
ss << extra_args_types;
|
||||
@ -144,7 +144,7 @@ static inline void launch_jitted_unrolled_kernel_dynamic(
|
||||
|
||||
// The cache key includes all the parameters to generate_code + dev_idx
|
||||
std::stringstream ss;
|
||||
ss << nInputs << "_" << nOutputs << f;
|
||||
ss << nInputs << '_' << nOutputs << f;
|
||||
ss << f_inputs_type_str << compute_type_str << result_type_str;
|
||||
ss << contiguous << dynamic_casting;
|
||||
ss << static_cast<int>(at::cuda::jit::BinaryFuncVariant::NoScalar);
|
||||
|
||||
@ -52,10 +52,10 @@ TuningContext* getTuningContext() {
|
||||
std::ostream& operator<<(std::ostream& stream, const ResultEntry& entry) {
|
||||
static const bool blaslog = c10::utils::get_env("PYTORCH_TUNABLEOP_BLAS_LOG") == "1";
|
||||
if (!blaslog) {
|
||||
return stream << entry.key_ << "," << entry.time_;
|
||||
return stream << entry.key_ << ',' << entry.time_;
|
||||
}
|
||||
else {
|
||||
return stream << entry.key_ << "," << entry.time_ << ",BLAS_PARAMS: " << entry.blas_sig_;
|
||||
return stream << entry.key_ << ',' << entry.time_ << ",BLAS_PARAMS: " << entry.blas_sig_;
|
||||
}
|
||||
}
|
||||
|
||||
@ -156,10 +156,10 @@ void TuningResultsManager::RecordUntuned( std::ofstream& untuned_file, const std
|
||||
if (isNew) {
|
||||
static const bool blaslog = c10::utils::get_env("PYTORCH_TUNABLEOP_BLAS_LOG") == "1";
|
||||
if (!blaslog) {
|
||||
untuned_file << op_signature << "," << params_signature << std::endl;
|
||||
untuned_file << op_signature << ',' << params_signature << std::endl;
|
||||
}
|
||||
else {
|
||||
untuned_file << op_signature << "," << params_signature << ",BLAS_PARAMS: " << blas_signature << std::endl;
|
||||
untuned_file << op_signature << ',' << params_signature << ",BLAS_PARAMS: " << blas_signature << std::endl;
|
||||
}
|
||||
TUNABLE_LOG3("Untuned,", op_signature, ",", params_signature);
|
||||
}
|
||||
@ -201,7 +201,7 @@ void TuningResultsManager::InitRealtimeAppend(const std::string& filename, const
|
||||
|
||||
if(!file_exists || file_empty) {
|
||||
for(const auto& [key, val] : validators) {
|
||||
(*realtime_out_) << "Validator," << key << "," << val << std::endl;
|
||||
(*realtime_out_) << "Validator," << key << ',' << val << std::endl;
|
||||
realtime_out_->flush();
|
||||
}
|
||||
validators_written_ = true;
|
||||
@ -219,7 +219,7 @@ void TuningResultsManager::AppendResultLine(const std::string& op_sig, const std
|
||||
return;
|
||||
}
|
||||
|
||||
(*realtime_out_) << op_sig << "," << param_sig << "," << result << std::endl;
|
||||
(*realtime_out_) << op_sig << ',' << param_sig << ',' << result << std::endl;
|
||||
realtime_out_->flush(); //ensure immediate write to disk
|
||||
|
||||
TUNABLE_LOG3("Realtime append: ", op_sig, "(", param_sig, ") -> ", result);
|
||||
|
||||
@ -93,31 +93,31 @@ std::string cudnnTypeToString(cudnnDataType_t dtype) {
|
||||
return "CUDNN_DATA_UINT8x4";
|
||||
default:
|
||||
std::ostringstream oss;
|
||||
oss << "(unknown data-type " << static_cast<int>(dtype) << ")";
|
||||
oss << "(unknown data-type " << static_cast<int>(dtype) << ')';
|
||||
return oss.str();
|
||||
}
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) {
|
||||
out << "TensorDescriptor " << static_cast<void*>(d.desc()) << "\n";
|
||||
out << "TensorDescriptor " << static_cast<void*>(d.desc()) << '\n';
|
||||
int nbDims = 0;
|
||||
int dimA[CUDNN_DIM_MAX];
|
||||
int strideA[CUDNN_DIM_MAX];
|
||||
cudnnDataType_t dtype{};
|
||||
cudnnGetTensorNdDescriptor(d.desc(), CUDNN_DIM_MAX, &dtype, &nbDims, dimA, strideA);
|
||||
out << " type = " << cudnnTypeToString(dtype) << "\n";
|
||||
out << " nbDims = " << nbDims << "\n";
|
||||
out << " type = " << cudnnTypeToString(dtype) << '\n';
|
||||
out << " nbDims = " << nbDims << '\n';
|
||||
// Read out only nbDims of the arrays!
|
||||
out << " dimA = ";
|
||||
for (auto i : ArrayRef<int>{dimA, static_cast<size_t>(nbDims)}) {
|
||||
out << i << ", ";
|
||||
}
|
||||
out << "\n";
|
||||
out << '\n';
|
||||
out << " strideA = ";
|
||||
for (auto i : ArrayRef<int>{strideA, static_cast<size_t>(nbDims)}) {
|
||||
out << i << ", ";
|
||||
}
|
||||
out << "\n";
|
||||
out << '\n';
|
||||
return out;
|
||||
}
|
||||
|
||||
@ -168,27 +168,27 @@ std::string cudnnMemoryFormatToString(cudnnTensorFormat_t tformat) {
|
||||
return "CUDNN_TENSOR_NHWC";
|
||||
default:
|
||||
std::ostringstream oss;
|
||||
oss << "(unknown cudnn tensor format " << static_cast<int>(tformat) << ")";
|
||||
oss << "(unknown cudnn tensor format " << static_cast<int>(tformat) << ')';
|
||||
return oss.str();
|
||||
}
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream & out, const FilterDescriptor& d) {
|
||||
out << "FilterDescriptor " << static_cast<void*>(d.desc()) << "\n";
|
||||
out << "FilterDescriptor " << static_cast<void*>(d.desc()) << '\n';
|
||||
int nbDims = 0;
|
||||
int dimA[CUDNN_DIM_MAX];
|
||||
cudnnDataType_t dtype{};
|
||||
cudnnTensorFormat_t tformat{};
|
||||
cudnnGetFilterNdDescriptor(d.desc(), CUDNN_DIM_MAX, &dtype, &tformat, &nbDims, dimA);
|
||||
out << " type = " << cudnnTypeToString(dtype) << "\n";
|
||||
out << " tensor_format = " << cudnnMemoryFormatToString(tformat) << "\n";
|
||||
out << " nbDims = " << nbDims << "\n";
|
||||
out << " type = " << cudnnTypeToString(dtype) << '\n';
|
||||
out << " tensor_format = " << cudnnMemoryFormatToString(tformat) << '\n';
|
||||
out << " nbDims = " << nbDims << '\n';
|
||||
// Read out only nbDims of the arrays!
|
||||
out << " dimA = ";
|
||||
for (auto i : ArrayRef<int>{dimA, static_cast<size_t>(nbDims)}) {
|
||||
out << i << ", ";
|
||||
}
|
||||
out << "\n";
|
||||
out << '\n';
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
@ -346,15 +346,15 @@ void foreachTensorInplaceWithFlag(std::vector<IValue>& args, int64_t begin, int6
|
||||
}
|
||||
|
||||
std::ostream& operator<< (std::ostream& os, const DynamicLayer& layer) {
|
||||
os << layer.layerId() << ":" << layer.key();
|
||||
os << layer.layerId() << ':' << layer.key();
|
||||
return os;
|
||||
}
|
||||
std::ostream& operator<< (std::ostream& os, const std::vector<DynamicLayer>& dls) {
|
||||
os << "DynamicLayerStack[ ";
|
||||
for (const auto& layer : dls) {
|
||||
os << layer << " ";
|
||||
os << layer << ' ';
|
||||
}
|
||||
os << "]";
|
||||
os << ']';
|
||||
return os;
|
||||
}
|
||||
|
||||
|
||||
@ -22,7 +22,7 @@ void dumpTensor(std::ostream& ss, const Tensor& tensor) {
|
||||
if (batched) {
|
||||
ss << "Batched[lvl=" << batched->level() << " dim=" << batched->bdim() << ", ";
|
||||
dumpTensor(ss, batched->value());
|
||||
ss << "]";
|
||||
ss << ']';
|
||||
return;
|
||||
}
|
||||
ss << "Tensor" << tensor.sizes();
|
||||
@ -36,7 +36,7 @@ void dumpTensor(std::ostream& ss, const Tensor& tensor) {
|
||||
ss << "dead, ";
|
||||
}
|
||||
dumpTensor(ss, wrapped->value());
|
||||
ss << "]";
|
||||
ss << ']';
|
||||
}
|
||||
|
||||
void TensorWrapper::refreshMetadata() {
|
||||
|
||||
@ -73,32 +73,32 @@ std::string miopenTypeToString(miopenDataType_t dtype) {
|
||||
return "miopenBFloat16";
|
||||
default:
|
||||
std::ostringstream oss;
|
||||
oss << "(unknown data-type " << static_cast<int>(dtype) << ")";
|
||||
oss << "(unknown data-type " << static_cast<int>(dtype) << ')';
|
||||
return oss.str();
|
||||
}
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) {
|
||||
out << "TensorDescriptor " << static_cast<void*>(d.desc()) << "\n";
|
||||
out << "TensorDescriptor " << static_cast<void*>(d.desc()) << '\n';
|
||||
int nbDims = 0;
|
||||
int dimA[MIOPEN_DIM_MAX];
|
||||
int strideA[MIOPEN_DIM_MAX];
|
||||
miopenDataType_t dtype;
|
||||
miopenGetTensorDescriptorSize(d.desc(), &nbDims);
|
||||
miopenGetTensorDescriptor(d.desc(), &dtype, dimA, strideA);
|
||||
out << " type = " << miopenTypeToString(dtype) << "\n";
|
||||
out << " nbDims = " << nbDims << "\n";
|
||||
out << " type = " << miopenTypeToString(dtype) << '\n';
|
||||
out << " nbDims = " << nbDims << '\n';
|
||||
// Read out only nbDims of the arrays!
|
||||
out << " dimA = ";
|
||||
for (auto i : ArrayRef<int>{dimA, static_cast<size_t>(nbDims)}) {
|
||||
out << i << ", ";
|
||||
}
|
||||
out << "\n";
|
||||
out << '\n';
|
||||
out << " strideA = ";
|
||||
for (auto i : ArrayRef<int>{strideA, static_cast<size_t>(nbDims)}) {
|
||||
out << i << ", ";
|
||||
}
|
||||
out << "\n";
|
||||
out << '\n';
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
@ -91,7 +91,7 @@ struct OperationInfo : BaseInfo {
|
||||
std::stringstream kernelStr;
|
||||
kernelStr << kernelName;
|
||||
for (const Tensor& tensor : tensors) {
|
||||
kernelStr << ":" << BaseInfo::buildTensorString(tensor, includeBufferId);
|
||||
kernelStr << ':' << BaseInfo::buildTensorString(tensor, includeBufferId);
|
||||
}
|
||||
return kernelStr.str();
|
||||
}
|
||||
|
||||
@ -39,9 +39,9 @@ std::string BaseInfo::buildTensorString(const Tensor& tensor, bool includeBuffer
|
||||
// see comments for INCLUDE_BUFFER_ID
|
||||
if (includeBufferId && deviceType == at::kMPS) {
|
||||
id<MTLBuffer> buffer = __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data());
|
||||
tensorStr << "(buf#" << (getIMPSAllocator()->getBufferId(buffer)) << ":" << buffer.retainCount << ")";
|
||||
tensorStr << "(buf#" << (getIMPSAllocator()->getBufferId(buffer)) << ':' << buffer.retainCount << ')';
|
||||
}
|
||||
tensorStr << ":" << tensor.scalar_type() << tensor.sizes();
|
||||
tensorStr << ':' << tensor.scalar_type() << tensor.sizes();
|
||||
return tensorStr.str();
|
||||
} else {
|
||||
return "undefined";
|
||||
|
||||
@ -167,7 +167,7 @@ static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, co
|
||||
std::stringstream ss;
|
||||
ss << arg_name << " should be greater than zero but got (";
|
||||
std::copy(args.begin(), args.end() - 1, std::ostream_iterator<int>(ss,", "));
|
||||
ss << args.back() << ")" << " (while checking arguments for " << c << ")";
|
||||
ss << args.back() << ")" << " (while checking arguments for " << c << ')';
|
||||
TORCH_CHECK(false, ss.str());
|
||||
}
|
||||
}
|
||||
|
||||
@ -639,7 +639,7 @@ static std::ostream& operator<<(std::ostream & out, const ConvParams<T>& params)
|
||||
<< " deterministic = " << params.deterministic
|
||||
<< " cudnn_enabled = " << params.cudnn_enabled
|
||||
<< " allow_tf32 = " << params.allow_tf32
|
||||
<< "}";
|
||||
<< '}';
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
@ -847,7 +847,7 @@ Tensor stft(const Tensor& self, const int64_t n_fft, const std::optional<int64_t
|
||||
<< ", hop_length=" << hop_length << ", win_length=" << win_length \
|
||||
<< ", window="; \
|
||||
if (window.defined()) { \
|
||||
SS << window.toString() << "{" << window.sizes() << "}"; \
|
||||
SS << window.toString() << '{' << window.sizes() << '}'; \
|
||||
} else { \
|
||||
SS << "None"; \
|
||||
} \
|
||||
@ -1046,7 +1046,7 @@ Tensor istft(const Tensor& self, const int64_t n_fft, const std::optional<int64_
|
||||
<< ", hop_length=" << hop_length << ", win_length=" << win_length \
|
||||
<< ", window="; \
|
||||
if (window.defined()) { \
|
||||
SS << window.toString() << "{" << window.sizes() << "}"; \
|
||||
SS << window.toString() << '{' << window.sizes() << '}'; \
|
||||
} else { \
|
||||
SS << "None"; \
|
||||
} \
|
||||
|
||||
@ -523,7 +523,7 @@ Tensor _functional_assert_async_msg_cpu(
|
||||
}
|
||||
|
||||
void _print(std::string_view s) {
|
||||
std::cout << s << "\n";
|
||||
std::cout << s << '\n';
|
||||
}
|
||||
|
||||
// Sorting-based algorithm for isin(); used when the number of test elements is
|
||||
|
||||
@ -11,7 +11,7 @@ static inline std::ostream& operator<<(std::ostream& out, dim3 dim) {
|
||||
if (dim.y == 1 && dim.z == 1) {
|
||||
out << dim.x;
|
||||
} else {
|
||||
out << "[" << dim.x << "," << dim.y << "," << dim.z << "]";
|
||||
out << '[' << dim.x << ',' << dim.y << ',' << dim.z << ']';
|
||||
}
|
||||
return out;
|
||||
}
|
||||
@ -27,7 +27,7 @@ std::ostream& operator<<(std::ostream& out, const ReduceConfig& config) {
|
||||
out << "input_mult=[";
|
||||
for (int i = 0; i < 3; i++) {
|
||||
if (i != 0) {
|
||||
out << ",";
|
||||
out << ',';
|
||||
}
|
||||
out << config.input_mult[i];
|
||||
}
|
||||
@ -35,7 +35,7 @@ std::ostream& operator<<(std::ostream& out, const ReduceConfig& config) {
|
||||
out << "output_mult=[";
|
||||
for (int i = 0; i < 2; i++) {
|
||||
if (i != 0) {
|
||||
out << ",";
|
||||
out << ',';
|
||||
}
|
||||
out << config.output_mult[i];
|
||||
}
|
||||
@ -49,7 +49,7 @@ std::ostream& operator<<(std::ostream& out, const ReduceConfig& config) {
|
||||
out << "block=" << config.block() << ", ";
|
||||
out << "grid=" << config.grid() << ", ";
|
||||
out << "global_memory_size=" << config.global_memory_size();
|
||||
out << ")";
|
||||
out << ')';
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
@ -364,9 +364,9 @@ void f8f8bf16_grouped_gemm_impl_sm90(
|
||||
// reinterpret_cast<ProblemShape::UnderlyingProblemShape*>(
|
||||
// stride_output_h + group_count);
|
||||
|
||||
// std::cout << "PTRS " << mat_a.data_ptr() << " " << mat_b.data_ptr() << "
|
||||
// std::cout << "PTRS " << mat_a.data_ptr() << ' ' << mat_b.data_ptr() << "
|
||||
// "
|
||||
// << out.data_ptr() << " " << scale_a.data_ptr() << " "
|
||||
// << out.data_ptr() << ' ' << scale_a.data_ptr() << ' '
|
||||
// << scale_b.data_ptr() << "\n";
|
||||
// for (int i = 0; i < group_count; i++) {
|
||||
// std::cout << "A " << (void*)inputA_ptrs_h[i] << "\n";
|
||||
|
||||
@ -1057,14 +1057,14 @@ std::string generate_code(
|
||||
// TODO these arrays are potentially of the different types, use function
|
||||
// traits to determine the types
|
||||
declare_load_arrays << f_inputs_type << " arg" << std::to_string(i)
|
||||
<< "[" << std::to_string(thread_work_size) << "];\n";
|
||||
<< '[' << std::to_string(thread_work_size) << "];\n";
|
||||
}
|
||||
env.s("declare_load_arrays", declare_load_arrays.str());
|
||||
|
||||
std::stringstream declare_store_arrays;
|
||||
for (int i = 0; i < nOutputs; i++) {
|
||||
declare_store_arrays << result_type << " out" << std::to_string(i)
|
||||
<< "[" << std::to_string(thread_work_size) << "];\n";
|
||||
<< '[' << std::to_string(thread_work_size) << "];\n";
|
||||
}
|
||||
env.s("declare_store_arrays", declare_store_arrays.str());
|
||||
|
||||
@ -1217,7 +1217,7 @@ std::string generate_code(
|
||||
for (const auto i : c10::irange(nInputs)){
|
||||
auto i_string = std::to_string(i);
|
||||
vector_inputs << "auto * input" << i_string <<
|
||||
" = reinterpret_cast<const scalar_t*>(data[" << i_string << "+" << nOutputs << "])" <<
|
||||
" = reinterpret_cast<const scalar_t*>(data[" << i_string << '+' << nOutputs << "])" <<
|
||||
" + block_work_size * idx;\n";
|
||||
}
|
||||
env.s("vector_inputs", vector_inputs.str());
|
||||
@ -1543,17 +1543,17 @@ NvrtcFunction jit_pwise_function(
|
||||
|
||||
// Constructs file path by appending constructed cubin name to cache path
|
||||
std::stringstream ss;
|
||||
ss << *cache_dir << "/";
|
||||
ss << *cache_dir << '/';
|
||||
ss << kernel_name;
|
||||
#ifdef USE_ROCM
|
||||
ss << "_arch" << prop->gcnArchName;
|
||||
#else
|
||||
ss << "_arch" << cuda_major << "." << cuda_minor;
|
||||
ss << "_arch" << cuda_major << '.' << cuda_minor;
|
||||
#endif
|
||||
ss << "_nvrtc" << nvrtc_major << "." << nvrtc_minor;
|
||||
ss << "_nvrtc" << nvrtc_major << '.' << nvrtc_minor;
|
||||
ss << (compile_to_sass ? "_sass" : "_ptx");
|
||||
ss << "_" << code.length();
|
||||
ss << "_" << hash_code;
|
||||
ss << '_' << code.length();
|
||||
ss << '_' << hash_code;
|
||||
file_path = ss.str();
|
||||
|
||||
std::ifstream readin{file_path, std::ios::in | std::ifstream::binary};
|
||||
|
||||
@ -82,15 +82,15 @@ namespace native {
|
||||
|
||||
std::ostream& operator<<(std::ostream& out, const ConvolutionParams& params) {
|
||||
out << "ConvolutionParams \n"
|
||||
<< " memory_format = " << params.memory_format << "\n"
|
||||
<< " data_type = " << cudnnTypeToString(params.dataType) << "\n"
|
||||
<< " padding = " << ArrayRef<int>{params.padding} << "\n"
|
||||
<< " stride = " << ArrayRef<int>{params.stride} << "\n"
|
||||
<< " dilation = " << ArrayRef<int>{params.dilation} << "\n"
|
||||
<< " groups = " << params.groups << "\n"
|
||||
<< " memory_format = " << params.memory_format << '\n'
|
||||
<< " data_type = " << cudnnTypeToString(params.dataType) << '\n'
|
||||
<< " padding = " << ArrayRef<int>{params.padding} << '\n'
|
||||
<< " stride = " << ArrayRef<int>{params.stride} << '\n'
|
||||
<< " dilation = " << ArrayRef<int>{params.dilation} << '\n'
|
||||
<< " groups = " << params.groups << '\n'
|
||||
<< " deterministic = " << (params.deterministic ? "true" : "false")
|
||||
<< "\n"
|
||||
<< " allow_tf32 = " << (params.allow_tf32 ? "true" : "false") << "\n";
|
||||
<< '\n'
|
||||
<< " allow_tf32 = " << (params.allow_tf32 ? "true" : "false") << '\n';
|
||||
|
||||
return out;
|
||||
}
|
||||
@ -173,16 +173,16 @@ std::string repro_from_args(const ConvolutionParams& params) {
|
||||
at::globalContext().float32Precision(
|
||||
at::Float32Backend::CUDA, at::Float32Op::MATMUL) ==
|
||||
at::Float32Precision::TF32)
|
||||
<< "\n";
|
||||
<< '\n';
|
||||
ss << "torch.backends.cudnn.benchmark = "
|
||||
<< pybool(at::globalContext().benchmarkCuDNN()) << "\n";
|
||||
<< pybool(at::globalContext().benchmarkCuDNN()) << '\n';
|
||||
ss << "torch.backends.cudnn.deterministic = " << pybool(params.deterministic)
|
||||
<< "\n";
|
||||
<< '\n';
|
||||
ss << "torch.backends.cudnn.allow_tf32 = " << pybool(params.allow_tf32)
|
||||
<< "\n";
|
||||
<< '\n';
|
||||
ss << "data = torch.randn(" << ArrayRef<int>(params.input_size, dim)
|
||||
<< ", dtype=" << full_dtype << ", ";
|
||||
ss << "device='cuda', requires_grad=True)" << to_channels_last << "\n";
|
||||
ss << "device='cuda', requires_grad=True)" << to_channels_last << '\n';
|
||||
ss << "net = torch.nn.Conv" << dim - 2 << "d(" << in_channels << ", "
|
||||
<< out_channels << ", ";
|
||||
ss << "kernel_size=" << ArrayRef<int>(¶ms.weight_size[2], dim - 2)
|
||||
@ -192,7 +192,7 @@ std::string repro_from_args(const ConvolutionParams& params) {
|
||||
ss << "dilation=" << ArrayRef<int>(params.dilation, dim - 2) << ", ";
|
||||
ss << "groups=" << params.groups << ")\n";
|
||||
ss << "net = net.cuda()." << partial_dtype << "()" << to_channels_last
|
||||
<< "\n";
|
||||
<< '\n';
|
||||
ss << "out = net(data)\n";
|
||||
ss << "out.backward(torch.randn_like(out))\n";
|
||||
ss << "torch.cuda.synchronize()\n\n";
|
||||
|
||||
@ -93,11 +93,10 @@ std::ostream& operator<<(std::ostream& out, const ConvolutionArgs& args) {
|
||||
<< "input: " << args.idesc // already has a trailing newline
|
||||
<< "output: " << args.odesc // already has a trailing newline
|
||||
<< "weight: " << args.wdesc // already has a trailing newline
|
||||
<< "Pointer addresses: "
|
||||
<< "\n"
|
||||
<< " input: " << args.input.const_data_ptr() << "\n"
|
||||
<< " output: " << args.output.const_data_ptr() << "\n"
|
||||
<< " weight: " << args.weight.const_data_ptr() << "\n";
|
||||
<< "Pointer addresses: " << '\n'
|
||||
<< " input: " << args.input.const_data_ptr() << '\n'
|
||||
<< " output: " << args.output.const_data_ptr() << '\n'
|
||||
<< " weight: " << args.weight.const_data_ptr() << '\n';
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
@ -115,7 +115,7 @@ std::ostream& operator<<(
|
||||
std::copy(
|
||||
strides.begin(), strides.end() - 1, std::ostream_iterator<int>(oss, ","));
|
||||
oss << sizes.back();
|
||||
output << oss.str() << "}";
|
||||
output << oss.str() << '}';
|
||||
return output;
|
||||
}
|
||||
|
||||
|
||||
@ -53,7 +53,7 @@ std::ostream& operator<<(std::ostream& out, const ConvParams& params) {
|
||||
<< " transposed = " << params.transposed
|
||||
<< " output_padding = " << IntArrayRef{params.output_padding}
|
||||
<< " groups = " << params.groups << " benchmark = " << params.benchmark
|
||||
<< " deterministic = " << params.deterministic << "}";
|
||||
<< " deterministic = " << params.deterministic << '}';
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
@@ -301,12 +301,12 @@ class AvgPoolMicrokernelTester {
ASSERT_NEAR(
float(int32_t(y[i * yStride() + k])), yFP[i * kc() + k], 0.5001f)
<< "at pixel " << i << ", channel " << k << ", n = " << n()
- << ", ks = " << kh() << "x" << kw() << " (" << ks()
+ << ", ks = " << kh() << 'x' << kw() << " (" << ks()
<< "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k];
ASSERT_EQ(
uint32_t(yRef[i * kc() + k]), uint32_t(y[i * yStride() + k]))
<< "at pixel " << i << ", channel " << k << ", n = " << n()
- << ", ks = " << kh() << "x" << kw() << " (" << ks()
+ << ", ks = " << kh() << 'x' << kw() << " (" << ks()
<< "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k];
}
}
@@ -396,12 +396,12 @@ class AvgPoolMicrokernelTester {
ASSERT_NEAR(
float(int32_t(y[i * yStride() + k])), yFP[i * kc() + k], 0.5001f)
<< "at pixel " << i << ", channel " << k << ", n = " << n()
- << ", ks = " << kh() << "x" << kw() << " (" << ks()
+ << ", ks = " << kh() << 'x' << kw() << " (" << ks()
<< "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k];
ASSERT_EQ(
uint32_t(yRef[i * kc() + k]), uint32_t(y[i * yStride() + k]))
<< "at pixel " << i << ", channel " << k << ", n = " << n()
- << ", ks = " << kh() << "x" << kw() << " (" << ks()
+ << ", ks = " << kh() << 'x' << kw() << " (" << ks()
<< "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k];
}
}
@@ -232,7 +232,7 @@ class MaxPoolMicrokernelTester {
ASSERT_EQ(
uint32_t(yRef[i * kc() + k]), uint32_t(y[i * yStride() + k]))
<< "at pixel " << i << ", channel " << k << ", n = " << n()
- << ", ks = " << kh() << "x" << kw() << " (" << ks()
+ << ", ks = " << kh() << 'x' << kw() << " (" << ks()
<< "), kc = " << kc();
}
}
@@ -17,7 +17,7 @@ inline std::vector<T> _expand_param_if_needed(
std::ostringstream ss;
ss << "expected " << param_name << " to be a single integer value or a "
<< "list of " << expected_dim << " values to match the convolution "
- << "dimensions, but got " << param_name << "=" << list_param;
+ << "dimensions, but got " << param_name << '=' << list_param;
TORCH_CHECK(false, ss.str());
} else {
return list_param.vec();
@@ -358,9 +358,9 @@ std::string Adapter::stringize() const {
std::string device_type = get_device_type_str(properties.deviceType);
VkPhysicalDeviceLimits limits = properties.limits;

- ss << "{" << std::endl;
+ ss << '{' << std::endl;
ss << " Physical Device Info {" << std::endl;
- ss << " apiVersion: " << v_major << "." << v_minor << std::endl;
+ ss << " apiVersion: " << v_major << '.' << v_minor << std::endl;
ss << " driverversion: " << properties.driverVersion << std::endl;
ss << " deviceType: " << device_type << std::endl;
ss << " deviceName: " << properties.deviceName << std::endl;
@@ -371,7 +371,7 @@ std::string Adapter::stringize() const {

#define PRINT_LIMIT_PROP_VEC3(name) \
ss << " " << std::left << std::setw(36) << #name << limits.name[0] \
- << "," << limits.name[1] << "," << limits.name[2] << std::endl;
+ << ',' << limits.name[1] << ',' << limits.name[2] << std::endl;

ss << " Physical Device Limits {" << std::endl;
PRINT_LIMIT_PROP(maxImageDimension1D);
@@ -425,7 +425,7 @@ std::string Adapter::stringize() const {
;
}
ss << " ]" << std::endl;
- ss << "}";
+ ss << '}';

return ss.str();
}
@@ -33,7 +33,7 @@ std::ostream& operator<<(std::ostream& out, const VkResult result) {
VK_RESULT_CASE(VK_ERROR_FORMAT_NOT_SUPPORTED)
VK_RESULT_CASE(VK_ERROR_FRAGMENTED_POOL)
default:
- out << "VK_ERROR_UNKNOWN (VkResult " << result << ")";
+ out << "VK_ERROR_UNKNOWN (VkResult " << result << ')';
break;
}
return out;
@@ -46,7 +46,7 @@ std::ostream& operator<<(std::ostream& out, const VkResult result) {
//

std::ostream& operator<<(std::ostream& out, const SourceLocation& loc) {
- out << loc.function << " at " << loc.file << ":" << loc.line;
+ out << loc.function << " at " << loc.file << ':' << loc.line;
return out;
}
@@ -66,7 +66,7 @@ Error::Error(SourceLocation source_location, const char* cond, std::string msg)
: msg_(std::move(msg)), source_location_{source_location} {
std::ostringstream oss;
oss << "Exception raised from " << source_location_ << ": ";
- oss << "(" << cond << ") is false! ";
+ oss << '(' << cond << ") is false! ";
oss << msg_;
what_ = oss.str();
}
@@ -173,8 +173,8 @@ void QueryPool::extract_results() {

static std::string stringize(const VkExtent3D& extents) {
std::stringstream ss;
- ss << "{" << extents.width << ", " << extents.height << ", " << extents.depth
- << "}";
+ ss << '{' << extents.width << ", " << extents.height << ", " << extents.depth
+ << '}';
return ss.str();
}
@ -149,7 +149,7 @@ VKAPI_ATTR VkBool32 VKAPI_CALL debug_report_callback_fn(
|
||||
(void)flags;
|
||||
|
||||
std::stringstream stream;
|
||||
stream << layer_prefix << " " << message_code << " " << message << std::endl;
|
||||
stream << layer_prefix << ' ' << message_code << ' ' << message << std::endl;
|
||||
const std::string log = stream.str();
|
||||
|
||||
std::cout << log;
|
||||
|
||||
@ -253,7 +253,7 @@ using vec4 = vec<4u>;
|
||||
|
||||
// uvec3 is the type representing tensor extents. Useful for debugging.
|
||||
inline std::ostream& operator<<(std::ostream& os, const uvec3& v) {
|
||||
os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ")";
|
||||
os << '(' << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ')';
|
||||
return os;
|
||||
}
|
||||
|
||||
|
||||
@ -246,7 +246,7 @@ void TestToCFloat() {
|
||||
void TestToString() {
|
||||
Tensor b = ones({3, 7}) * .0000001f;
|
||||
std::stringstream s;
|
||||
s << b << "\n";
|
||||
s << b << '\n';
|
||||
std::string expect = "1e-07 *";
|
||||
ASSERT_EQ_RESOLVED(s.str().substr(0, expect.size()), expect);
|
||||
}
|
||||
|
||||
@ -33,7 +33,7 @@ struct Foo {
|
||||
static void apply(Tensor a, Tensor b) {
|
||||
scalar_type s = 1;
|
||||
std::stringstream ss;
|
||||
ss << "hello, dispatch: " << a.toString() << s << "\n";
|
||||
ss << "hello, dispatch: " << a.toString() << s << '\n';
|
||||
auto data = (scalar_type*)a.data_ptr();
|
||||
(void)data;
|
||||
}
|
||||
@ -73,8 +73,8 @@ TEST(TestScalar, TestScalar) {
|
||||
Scalar bar = 3.0;
|
||||
Half h = bar.toHalf();
|
||||
Scalar h2 = h;
|
||||
cout << "H2: " << h2.toDouble() << " " << what.toFloat() << " "
|
||||
<< bar.toDouble() << " " << what.isIntegral(false) << "\n";
|
||||
cout << "H2: " << h2.toDouble() << ' ' << what.toFloat() << ' '
|
||||
<< bar.toDouble() << ' ' << what.isIntegral(false) << '\n';
|
||||
auto gen = at::detail::getDefaultCPUGenerator();
|
||||
{
|
||||
// See Note [Acquire lock when using random generators]
|
||||
@ -84,7 +84,7 @@ TEST(TestScalar, TestScalar) {
|
||||
}
|
||||
if (at::hasCUDA()) {
|
||||
auto t2 = zeros({4, 4}, at::kCUDA);
|
||||
cout << &t2 << "\n";
|
||||
cout << &t2 << '\n';
|
||||
}
|
||||
auto t = ones({4, 4});
|
||||
|
||||
@ -129,7 +129,7 @@ TEST(TestScalar, TestScalar) {
|
||||
std::stringstream ss;
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
|
||||
ASSERT_NO_THROW(
|
||||
ss << "hello, dispatch" << x.toString() << s << "\n");
|
||||
ss << "hello, dispatch" << x.toString() << s << '\n');
|
||||
auto data = (scalar_t*)x.data_ptr();
|
||||
(void)data;
|
||||
});
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#include <ATen/ATen.h>
|
||||
|
||||
int main() {
|
||||
std::cout << at::ones({3,4}, at::CPU(at::kFloat)) << "\n";
|
||||
std::cout << at::ones({3,4}, at::CPU(at::kFloat)) << '\n';
|
||||
}
|
||||
|
||||
@ -1828,9 +1828,9 @@ namespace {
|
||||
#endif
|
||||
|
||||
EXPECT_EQ(u16, c10::detail::fp16_ieee_from_fp32_value(f32s[i]))
|
||||
<< "Test failed for float to uint16 " << f32s[i] << "\n";
|
||||
<< "Test failed for float to uint16 " << f32s[i] << '\n';
|
||||
EXPECT_EQ(x, c10::detail::fp16_ieee_to_fp32_value(u16))
|
||||
<< "Test failed for uint16 to float " << u16 << "\n";
|
||||
<< "Test failed for uint16 to float " << u16 << '\n';
|
||||
}
|
||||
}
|
||||
TEST(FP8E4M3Test, FP8E4M3ConversionFloat) {
|
||||
@ -1848,10 +1848,10 @@ namespace {
|
||||
EXPECT_TRUE(std::isnan(f32));
|
||||
} else {
|
||||
EXPECT_EQ(f32, c10::detail::fp8e4m3fn_to_fp32_value(input))
|
||||
<< "Test failed for u8 to float " << input << "\n";
|
||||
<< "Test failed for u8 to float " << input << '\n';
|
||||
}
|
||||
EXPECT_EQ(u8, c10::detail::fp8e4m3fn_from_fp32_value(f32))
|
||||
<< "Test failed for float to u8 " << f32 << "\n";
|
||||
<< "Test failed for float to u8 " << f32 << '\n';
|
||||
}
|
||||
}
|
||||
TEST(FP8E4M3Test, FP8E4M3BinaryAdd) {
|
||||
@ -2015,10 +2015,10 @@ namespace {
|
||||
EXPECT_TRUE(std::isnan(f32));
|
||||
} else {
|
||||
EXPECT_EQ(f32, c10::detail::fp8e5m2_to_fp32_value(input))
|
||||
<< "Test failed for u8 to float " << input << "\n";
|
||||
<< "Test failed for u8 to float " << input << '\n';
|
||||
}
|
||||
EXPECT_EQ(u8, c10::detail::fp8e5m2_from_fp32_value(f32))
|
||||
<< "Test failed for float to u8 " << f32 << "\n";
|
||||
<< "Test failed for float to u8 " << f32 << '\n';
|
||||
}
|
||||
}
|
||||
TEST(FP8E5M2Test, FP8E5M2BinaryAdd) {
|
||||
|
||||
@ -19,7 +19,7 @@ TEST(Vitals, Basic) {
|
||||
c10::utils::set_env("TORCH_VITAL", "1");
|
||||
TORCH_VITAL_DEFINE(Testing);
|
||||
TORCH_VITAL(Testing, Attribute0) << 1;
|
||||
TORCH_VITAL(Testing, Attribute1) << "1";
|
||||
TORCH_VITAL(Testing, Attribute1) << '1';
|
||||
TORCH_VITAL(Testing, Attribute2) << 1.0f;
|
||||
TORCH_VITAL(Testing, Attribute3) << 1.0;
|
||||
auto t = at::ones({1, 1});
|
||||
|
||||
@ -129,14 +129,14 @@ void showRtol(const at::Tensor& a, const at::Tensor& b) {
|
||||
std::cout << "Max Diff allowed: " << maxDiff << std::endl;
|
||||
if (diff.sizes().size() == 2) {
|
||||
for (const auto y : c10::irange(diff.sizes()[0])) {
|
||||
std::cout << y << ":";
|
||||
std::cout << y << ':';
|
||||
for (const auto x : c10::irange(diff.sizes()[1])) {
|
||||
float diff_xy = diff[y][x].item<float>();
|
||||
if (diff_xy > maxDiff) {
|
||||
std::cout << std::setw(5) << x;
|
||||
}
|
||||
else {
|
||||
std::cout << std::setw(5) << " ";
|
||||
std::cout << std::setw(5) << ' ';
|
||||
}
|
||||
}
|
||||
std::cout << std::endl;
|
||||
@ -3276,7 +3276,7 @@ TEST_F(VulkanAPITest, masked_fill_invalidinputs_exceptions) {
|
||||
|
||||
void print_shape(const std::vector<int64_t>& shape) {
|
||||
for (const auto& num : shape) {
|
||||
std::cout << num << " ";
|
||||
std::cout << num << ' ';
|
||||
}
|
||||
}
|
||||
|
||||
@ -3367,7 +3367,7 @@ void test_masked_fill_scalar(
|
||||
print_shape(tmp_curr_input_shape);
|
||||
std::cout << "], and mask of shape [";
|
||||
print_shape(tmp_curr_mask_shape);
|
||||
std::cout << "]" << std::endl;
|
||||
std::cout << ']' << std::endl;
|
||||
}
|
||||
|
||||
ASSERT_TRUE(check);
|
||||
@ -4542,9 +4542,9 @@ void test_softmax(const at::IntArrayRef shape, bool log_softmax = false) {
|
||||
if (!check) {
|
||||
std::cout << "Softmax test failed on axis " << dim << "for tensor dims {";
|
||||
for (uint32_t place = 0; place < shape.size() - 1; place++) {
|
||||
std::cout << shape[place] << " ";
|
||||
std::cout << shape[place] << ' ';
|
||||
}
|
||||
std::cout << shape.back() << "}" << std::endl;
|
||||
std::cout << shape.back() << '}' << std::endl;
|
||||
showRtol(out_cpu, out_vulkan.cpu());
|
||||
}
|
||||
ASSERT_TRUE(check);
|
||||
|
||||
@ -95,7 +95,7 @@ void showRtol(
|
||||
std::cout << "Max Diff found is: " << diff.max().item<double>() << std::endl;
|
||||
if (diff.sizes().size() == 2) {
|
||||
for (const auto y : c10::irange(diff.sizes()[0])) {
|
||||
std::cout << y << ":";
|
||||
std::cout << y << ':';
|
||||
for (const auto x : c10::irange(diff.sizes()[1])) {
|
||||
double diff_xy = diff[y][x].item<double>();
|
||||
if (diff_xy > maxDiff) {
|
||||
@ -109,7 +109,7 @@ void showRtol(
|
||||
}
|
||||
}
|
||||
} else {
|
||||
std::cout << std::setw(5) << " ";
|
||||
std::cout << std::setw(5) << ' ';
|
||||
}
|
||||
}
|
||||
std::cout << std::endl;
|
||||
@ -148,19 +148,19 @@ using at::native::vulkan::api::utils::ivec4;
|
||||
using at::native::vulkan::api::utils::vec4;
|
||||
|
||||
std::ostream& operator<<(std::ostream& os, const vec4& v) {
|
||||
os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", "
|
||||
<< v.data[3u] << ")";
|
||||
os << '(' << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", "
|
||||
<< v.data[3u] << ')';
|
||||
return os;
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& os, const ivec3& v) {
|
||||
os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ")";
|
||||
os << '(' << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ')';
|
||||
return os;
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& os, const ivec4& v) {
|
||||
os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", "
|
||||
<< v.data[3u] << ")";
|
||||
os << '(' << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", "
|
||||
<< v.data[3u] << ')';
|
||||
return os;
|
||||
}
|
||||
|
||||
@ -3379,51 +3379,51 @@ bool _test_quantized_linear(
|
||||
showRtol(out_cpu_dequant, out_vk_to_cpu_dequant);
|
||||
}
|
||||
if (xpos != -1 && ypos != -1) {
|
||||
std::cout << "\nFailure caused on row/col: " << ypos << "/" << xpos
|
||||
<< "\n";
|
||||
std::cout << "\nFailure caused on row/col: " << ypos << '/' << xpos
|
||||
<< '\n';
|
||||
std::cout << "Input tensor scale: " << scale << " zerop: " << zero_point
|
||||
<< "\n";
|
||||
std::cout << "Input tensor row " << ypos << "\n";
|
||||
<< '\n';
|
||||
std::cout << "Input tensor row " << ypos << '\n';
|
||||
for (int i = 0; i < input_cpu.sizes()[1]; i++) {
|
||||
std::cout << input_cpu[ypos][i].item<double>() << ", ";
|
||||
}
|
||||
std::cout << "\n";
|
||||
std::cout << '\n';
|
||||
|
||||
std::cout << "Weight tensor scale: " << w_scale
|
||||
<< " zerop: " << w_zero_point << "\n";
|
||||
std::cout << "Weight tensor col " << xpos << "\n";
|
||||
<< " zerop: " << w_zero_point << '\n';
|
||||
std::cout << "Weight tensor col " << xpos << '\n';
|
||||
for (int i = 0; i < weight.sizes()[1]; i++) {
|
||||
std::cout << weight[xpos][i].item<double>() << ", ";
|
||||
}
|
||||
std::cout << "\n";
|
||||
std::cout << '\n';
|
||||
|
||||
std::cout << "Input tensor quantized row " << ypos << " with dtype "
|
||||
<< (input_quant_dtype_int8 ? "QInt8" : "QUInt8") << "\n";
|
||||
<< (input_quant_dtype_int8 ? "QInt8" : "QUInt8") << '\n';
|
||||
for (int i = 0; i < input_cpu.sizes()[1]; i++) {
|
||||
std::cout << input_cpu_quantized[ypos][i].item<double>() << ", ";
|
||||
}
|
||||
std::cout << "\n";
|
||||
std::cout << '\n';
|
||||
|
||||
std::cout << "Weight tensor quantized col " << xpos << " with dtype "
|
||||
<< (weight_quant_dtype_int8 ? "QInt8" : "QUInt8") << "\n";
|
||||
<< (weight_quant_dtype_int8 ? "QInt8" : "QUInt8") << '\n';
|
||||
for (int i = 0; i < weight.sizes()[1]; i++) {
|
||||
std::cout << weight_cpu_quantized[xpos][i].item<double>() << ", ";
|
||||
}
|
||||
std::cout << "\n";
|
||||
std::cout << '\n';
|
||||
|
||||
std::cout << "bias tensor\n";
|
||||
for (int i = 0; i < bias.sizes()[0]; i++) {
|
||||
std::cout << bias[i].item<double>() << ", ";
|
||||
}
|
||||
std::cout << "\n";
|
||||
std::cout << '\n';
|
||||
|
||||
std::cout << "out_scale: " << out_scale
|
||||
<< " out_zero_point: " << out_zero_point << "\n";
|
||||
<< " out_zero_point: " << out_zero_point << '\n';
|
||||
|
||||
std::cout << "cpu unmatched output: "
|
||||
<< out_cpu_dequant[ypos][xpos].item<double>() << "\n";
|
||||
<< out_cpu_dequant[ypos][xpos].item<double>() << '\n';
|
||||
std::cout << "vk unmatched output: "
|
||||
<< out_vk_to_cpu_dequant[ypos][xpos].item<double>() << "\n";
|
||||
<< out_vk_to_cpu_dequant[ypos][xpos].item<double>() << '\n';
|
||||
}
|
||||
}
|
||||
return check;
|
||||
|
||||
@ -176,7 +176,7 @@ std::ostream& operator<<(std::ostream& os, DispatchKeySet ts) {
|
||||
os << k;
|
||||
first = false;
|
||||
}
|
||||
os << ")";
|
||||
os << ')';
|
||||
return os;
|
||||
}
|
||||
|
||||
|
||||
@ -33,7 +33,7 @@ std::ostream& operator<<(std::ostream& stream, const TensorOptions& options) {
|
||||
} else {
|
||||
stream << "(nullopt)";
|
||||
}
|
||||
stream << ")";
|
||||
stream << ')';
|
||||
|
||||
return stream;
|
||||
}
|
||||
|
||||
@ -136,7 +136,7 @@ std::string c10_retrieve_device_side_assertion_info() {
|
||||
// Something failed, let's talk about that
|
||||
oss << failures_found
|
||||
<< " CUDA device-side assertion failures were found on GPU #"
|
||||
<< device_num << "!" << std::endl;
|
||||
<< device_num << '!' << std::endl;
|
||||
if (assertion_data_for_device.assertion_count >
|
||||
C10_CUDA_DSA_ASSERTION_COUNT) {
|
||||
oss << "But at least " << assertion_data_for_device.assertion_count
|
||||
@ -151,17 +151,17 @@ std::string c10_retrieve_device_side_assertion_info() {
|
||||
oss << "Assertion failure " << i << std::endl;
|
||||
oss << " GPU assertion failure message = " << self.assertion_msg
|
||||
<< std::endl;
|
||||
oss << " File containing assertion = " << self.filename << ":"
|
||||
oss << " File containing assertion = " << self.filename << ':'
|
||||
<< self.line_number << std::endl;
|
||||
oss << " Device function containing assertion = " << self.function_name
|
||||
<< std::endl;
|
||||
oss << " Thread ID that failed assertion = [" << self.thread_id[0] << ","
|
||||
<< self.thread_id[1] << "," << self.thread_id[2] << "]" << std::endl;
|
||||
oss << " Block ID that failed assertion = [" << self.block_id[0] << ","
|
||||
<< self.block_id[1] << "," << self.block_id[2] << "]" << std::endl;
|
||||
oss << " Thread ID that failed assertion = [" << self.thread_id[0] << ','
|
||||
<< self.thread_id[1] << ',' << self.thread_id[2] << ']' << std::endl;
|
||||
oss << " Block ID that failed assertion = [" << self.block_id[0] << ','
|
||||
<< self.block_id[1] << ',' << self.block_id[2] << ']' << std::endl;
|
||||
if (launch_info.generation_number == self.caller) {
|
||||
oss << " File containing kernel launch = "
|
||||
<< launch_info.launch_filename << ":" << launch_info.launch_linenum
|
||||
<< launch_info.launch_filename << ':' << launch_info.launch_linenum
|
||||
<< std::endl;
|
||||
oss << " Function containing kernel launch = "
|
||||
<< launch_info.launch_function << std::endl;
|
||||
@ -175,7 +175,7 @@ std::string c10_retrieve_device_side_assertion_info() {
|
||||
if (launch_registry.gather_launch_stacktrace) {
|
||||
oss << "Launch stacktracing disabled." << std::endl;
|
||||
} else {
|
||||
oss << "\n" << launch_info.launch_stacktrace << std::endl;
|
||||
oss << '\n' << launch_info.launch_stacktrace << std::endl;
|
||||
}
|
||||
} else {
|
||||
oss << " CPU launch site info: Unavailable, the circular queue wrapped around. Increase `CUDAKernelLaunchRegistry::max_size`."
|
||||
|
||||
@ -435,7 +435,7 @@ TEST(DispatchKeySet, TestFunctionalityDispatchKeyToString) {
|
||||
if (i > 0) {
|
||||
ASSERT_TRUE(res.find("Unknown") == std::string::npos)
|
||||
<< i << " (before is " << toString(static_cast<DispatchKey>(i - 1))
|
||||
<< ")";
|
||||
<< ')';
|
||||
} else {
|
||||
ASSERT_TRUE(res.find("Unknown") == std::string::npos) << i;
|
||||
}
|
||||
|
||||
@ -96,10 +96,10 @@ TEST(HalfConversionTest, TestPorableConversion) {
|
||||
for (auto x : inputs) {
|
||||
auto target = c10::detail::fp16_ieee_to_fp32_value(x);
|
||||
EXPECT_EQ(halfbits2float(x), target)
|
||||
<< "Test failed for uint16 to float " << x << "\n";
|
||||
<< "Test failed for uint16 to float " << x << '\n';
|
||||
EXPECT_EQ(
|
||||
float2halfbits(target), c10::detail::fp16_ieee_from_fp32_value(target))
|
||||
<< "Test failed for float to uint16" << target << "\n";
|
||||
<< "Test failed for float to uint16" << target << '\n';
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -98,7 +98,7 @@ struct Noncopyable {
|
||||
};
|
||||
|
||||
std::ostream& operator<<(std::ostream& out, const Noncopyable& nc) {
|
||||
out << "Noncopyable(" << nc.x << ")";
|
||||
out << "Noncopyable(" << nc.x << ')';
|
||||
return out;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
@ -204,13 +204,13 @@ ArrayRef(const std::initializer_list<T>&) -> ArrayRef<T>;
|
||||
template <typename T>
|
||||
std::ostream& operator<<(std::ostream& out, ArrayRef<T> list) {
|
||||
int i = 0;
|
||||
out << "[";
|
||||
out << '[';
|
||||
for (const auto& e : list) {
|
||||
if (i++ > 0)
|
||||
out << ", ";
|
||||
out << e;
|
||||
}
|
||||
out << "]";
|
||||
out << ']';
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
@ -106,8 +106,8 @@ class GetBacktraceImpl {
|
||||
/*length*/ &length,
|
||||
/*status*/ &status);
|
||||
|
||||
os << " frame #" << idx++ << "\t"
|
||||
<< ((demangled != NULL && status == 0) ? demangled : symbol) << "["
|
||||
os << " frame #" << idx++ << '\t'
|
||||
<< ((demangled != NULL && status == 0) ? demangled : symbol) << '['
|
||||
<< addr << "]\t" << std::endl;
|
||||
}
|
||||
free(demangled);
|
||||
@ -274,7 +274,7 @@ class GetBacktraceImpl {
|
||||
} else {
|
||||
// In the edge-case where we couldn't parse the frame string, we can
|
||||
// just use it directly (it may have a different format).
|
||||
stream << symbols[frame_number] << "\n";
|
||||
stream << symbols[frame_number] << '\n';
|
||||
}
|
||||
}
|
||||
|
||||
@ -413,8 +413,8 @@ class GetBacktraceImpl {
|
||||
<< back_trace_[i_frame] << std::dec;
|
||||
if (with_symbol) {
|
||||
stream << std::setfill('0') << std::setw(16) << std::uppercase
|
||||
<< std::hex << p_symbol->Address << std::dec << " " << module
|
||||
<< "!" << p_symbol->Name;
|
||||
<< std::hex << p_symbol->Address << std::dec << ' ' << module
|
||||
<< '!' << p_symbol->Name;
|
||||
} else {
|
||||
stream << " <unknown symbol address> " << module << "!<unknown symbol>";
|
||||
}
|
||||
@ -424,7 +424,7 @@ class GetBacktraceImpl {
|
||||
} else {
|
||||
stream << "<unknown file> @ <unknown line number>";
|
||||
}
|
||||
stream << "]" << std::endl;
|
||||
stream << ']' << std::endl;
|
||||
}
|
||||
|
||||
return stream.str();
|
||||
|
||||
@ -45,7 +45,7 @@ std::string Error::compute_what(bool include_backtrace) const {
|
||||
|
||||
if (context_.size() == 1) {
|
||||
// Fold error and context in one line
|
||||
oss << " (" << context_[0] << ")";
|
||||
oss << " (" << context_[0] << ')';
|
||||
} else {
|
||||
for (const auto& c : context_) {
|
||||
oss << "\n " << c;
|
||||
@ -53,7 +53,7 @@ std::string Error::compute_what(bool include_backtrace) const {
|
||||
}
|
||||
|
||||
if (include_backtrace && backtrace_) {
|
||||
oss << "\n" << backtrace_->get();
|
||||
oss << '\n' << backtrace_->get();
|
||||
}
|
||||
|
||||
return oss.str();
|
||||
@ -248,7 +248,7 @@ void WarningHandler::process(const Warning& warning) {
|
||||
LOG_AT_FILE_LINE(
|
||||
WARNING, warning.source_location().file, warning.source_location().line)
|
||||
<< "Warning: " << warning.msg() << " (function "
|
||||
<< warning.source_location().function << ")";
|
||||
<< warning.source_location().function << ')';
|
||||
}
|
||||
|
||||
std::string GetExceptionString(const std::exception& e) {
|
||||
|
||||
@@ -474,12 +474,12 @@ MessageLogger::MessageLogger(
if (GLOBAL_RANK != -1) {
stream_ << "[rank" << GLOBAL_RANK << "]:";
}
- stream_ << "[" << CAFFE2_SEVERITY_PREFIX[std::min(4, GLOG_FATAL - severity_)]
+ stream_ << '[' << CAFFE2_SEVERITY_PREFIX[std::min(4, GLOG_FATAL - severity_)]
<< (timeinfo->tm_mon + 1) * 100 + timeinfo->tm_mday
- << std::setfill('0') << " " << std::setw(2) << timeinfo->tm_hour
- << ":" << std::setw(2) << timeinfo->tm_min << ":" << std::setw(2)
- << timeinfo->tm_sec << "." << std::setw(9) << ns << " "
- << c10::filesystem::path(file).filename() << ":" << line << "] ";
+ << std::setfill('0') << ' ' << std::setw(2) << timeinfo->tm_hour
+ << ':' << std::setw(2) << timeinfo->tm_min << ':' << std::setw(2)
+ << timeinfo->tm_sec << '.' << std::setw(9) << ns << ' '
+ << c10::filesystem::path(file).filename() << ':' << line << "] ";
}

// Output the contents of the stream to the proper channel on destruction.
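As a side note on the hunk above: the timestamp is composed with iomanip manipulators, where setfill('0') is sticky but setw(2)/setw(9) must be restated before every padded field. A small standalone sketch of that recipe (the nanosecond value is a placeholder; this is not the Caffe2 logger itself):

// --- illustrative sketch, not part of the diff ---
#include <ctime>
#include <iomanip>
#include <iostream>
#include <sstream>

int main() {
  std::time_t t = std::time(nullptr);
  std::tm* timeinfo = std::localtime(&t);
  long ns = 123456789; // placeholder for the logger's nanosecond component
  std::ostringstream stream_;
  // MMDD HH:MM:SS.nnnnnnnnn, zero-padded like the logger prefix above
  stream_ << (timeinfo->tm_mon + 1) * 100 + timeinfo->tm_mday
          << std::setfill('0') << ' ' << std::setw(2) << timeinfo->tm_hour
          << ':' << std::setw(2) << timeinfo->tm_min << ':' << std::setw(2)
          << timeinfo->tm_sec << '.' << std::setw(9) << ns;
  std::cout << stream_.str() << '\n'; // e.g. "1107 09:05:03.123456789"
}
// --- end sketch ---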
@ -488,7 +488,7 @@ MessageLogger::~MessageLogger() noexcept(false) {
|
||||
// Nothing needs to be logged.
|
||||
return;
|
||||
}
|
||||
stream_ << "\n";
|
||||
stream_ << '\n';
|
||||
#ifdef ANDROID
|
||||
static const int android_log_levels[] = {
|
||||
ANDROID_LOG_FATAL, // LOG_FATAL
|
||||
|
||||
@ -1412,13 +1412,13 @@ inline size_t capacity_in_bytes(const SmallVector<T, N>& X) {
|
||||
template <typename T, unsigned N>
|
||||
std::ostream& operator<<(std::ostream& out, const SmallVector<T, N>& list) {
|
||||
int i = 0;
|
||||
out << "[";
|
||||
out << '[';
|
||||
for (auto e : list) {
|
||||
if (i++ > 0)
|
||||
out << ", ";
|
||||
out << e;
|
||||
}
|
||||
out << "]";
|
||||
out << ']';
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
@ -79,7 +79,7 @@ std::ostream& _str(std::ostream& ss, const std::wstring& wString) {
|
||||
} // namespace detail
|
||||
|
||||
std::ostream& operator<<(std::ostream& out, const SourceLocation& loc) {
|
||||
out << loc.function << " at " << loc.file << ":" << loc.line;
|
||||
out << loc.function << " at " << loc.file << ':' << loc.line;
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
@ -170,7 +170,7 @@ inline bool isPrint(char s) {
|
||||
}
|
||||
|
||||
inline void printQuotedString(std::ostream& stmt, const std::string_view str) {
|
||||
stmt << "\"";
|
||||
stmt << '"';
|
||||
for (auto s : str) {
|
||||
switch (s) {
|
||||
case '\\':
|
||||
@ -224,7 +224,7 @@ inline void printQuotedString(std::ostream& stmt, const std::string_view str) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
stmt << "\"";
|
||||
stmt << '"';
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
||||
@ -223,7 +223,7 @@ void FatalSignalHandler::fatalSignalHandler(int signum) {
|
||||
// a single thread that wouldn't receive the SIGUSR2
|
||||
if (std::cv_status::timeout == writingCond.wait_for(ul, 2s)) {
|
||||
if (!signalReceived) {
|
||||
std::cerr << "signal lost waiting for stacktrace " << pid << ":"
|
||||
std::cerr << "signal lost waiting for stacktrace " << pid << ':'
|
||||
<< tid << '\n';
|
||||
break;
|
||||
}
|
||||
|
||||
@ -877,7 +877,7 @@ std::ostream& operator<<(
|
||||
std::ostream& stream,
|
||||
const SparseBitVector<ElementSize>& vec) {
|
||||
bool first = true;
|
||||
stream << "{";
|
||||
stream << '{';
|
||||
for (auto el : vec) {
|
||||
if (first) {
|
||||
first = false;
|
||||
@ -886,7 +886,7 @@ std::ostream& operator<<(
|
||||
}
|
||||
stream << el;
|
||||
}
|
||||
stream << "}";
|
||||
stream << '}';
|
||||
return stream;
|
||||
}
|
||||
|
||||
|
||||
@@ -773,8 +773,20 @@ void PyTorchStreamWriter::writeRecord(
bool compress) {
AT_ASSERT(!finalized_);
AT_ASSERT(!archive_name_plus_slash_.empty());
- TORCH_INTERNAL_ASSERT(
-     files_written_.count(name) == 0, "Tried to serialize file twice: ", name);
+ if (files_written_.count(name) > 0) {
+   // Allow multiple writes for triton binaries
+   bool is_triton_extension =
+       c10::ends_with(name, ".so") ||
+       c10::ends_with(name, ".cubin") ||
+       c10::ends_with(name, ".hsaco");
+
+   if (is_triton_extension) {
+     LOG(WARNING) << "File '" << name << "' is being serialized multiple times";
+     return;
+   }
+
+   TORCH_INTERNAL_ASSERT(false, "Tried to serialize file twice: ", name);
+ }
if (name == kSerializationIdRecordName && serialization_id_.empty()) {
// In case of copying records from another file, skip writing a different
// serialization_id than the one computed in this writer.
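The added branch above tolerates duplicate writes only for Triton binary artifacts, selected purely by file suffix. A hedged standalone sketch of that suffix filter (plain C++ with a local ends_with helper standing in for c10::ends_with):

// --- illustrative sketch, not part of the diff ---
#include <array>
#include <iostream>
#include <string>

static bool ends_with(const std::string& s, const std::string& suffix) {
  return s.size() >= suffix.size() &&
      s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0;
}

static bool is_triton_extension(const std::string& name) {
  // Same suffix whitelist as the hunk above.
  const std::array<const char*, 3> suffixes = {".so", ".cubin", ".hsaco"};
  for (const char* suffix : suffixes) {
    if (ends_with(name, suffix)) {
      return true;
    }
  }
  return false;
}

int main() {
  std::cout << is_triton_extension("kernel.cubin") << ' '  // 1
            << is_triton_extension("weights.bin") << '\n'; // 0
}
// --- end sketch ---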
@@ -4449,16 +4449,17 @@ class CPUReproTests(TestCase):
def forward(self, x):
return self.gn(x)

- for dynamic in [True, False]:
- torch._dynamo.reset()
- metrics.reset()
- mod = M().eval()
- x = torch.randn(1, 32, 128, 128, 128)
- with torch.no_grad():
- expected = mod(x)
- compiled_m = torch.compile(mod, dynamic=dynamic)
- actual = compiled_m(x)
- self.assertEqual(expected, actual)
+ for simdlen, dynamic in itertools.product([None, 0], [True, False]):
+ with config.patch({"cpp.simdlen": simdlen}):
+ torch._dynamo.reset()
+ metrics.reset()
+ mod = M().eval()
+ x = torch.randn(1, 32, 128, 128, 128)
+ with torch.no_grad():
+ expected = mod(x)
+ compiled_m = torch.compile(mod, dynamic=dynamic)
+ actual = compiled_m(x)
+ self.assertEqual(expected, actual)

@torch._dynamo.config.patch(
capture_scalar_outputs=True, capture_dynamic_output_shape_ops=True
@@ -53,8 +53,10 @@ from torch.utils.data import DataLoader
# sharding on sandcastle. This line silences flake warnings
load_tests = load_tests # noqa: PLW0127

- HAS_CUDA = torch.cuda.is_available()
+ device_type = (
+     acc.type if (acc := torch.accelerator.current_accelerator(True)) else "cpu"
+ )
+ TEST_GPU = torch.xpu.is_available() or torch.cuda.is_available()

from torch.testing._internal.common_utils import run_tests, TestCase
@ -302,24 +304,24 @@ class TestCheckpoint(TestCase):
|
||||
|
||||
self.assertEqual(grad_with_checkpointing, grad_no_checkpointing)
|
||||
|
||||
@unittest.skipIf(not HAS_CUDA, "No CUDA")
|
||||
def test_checkpoint_rng_cuda(self):
|
||||
@unittest.skipIf(not TEST_GPU, "No accelerator")
|
||||
def test_checkpoint_rng_gpu(self):
|
||||
for _ in range(5):
|
||||
inp = torch.randn(20000, device="cuda").requires_grad_()
|
||||
inp = torch.randn(20000, device=device_type).requires_grad_()
|
||||
phase1 = torch.nn.Dropout()
|
||||
phase2 = torch.nn.Dropout()
|
||||
|
||||
def run_fn(input):
|
||||
return phase2(input)
|
||||
|
||||
state = torch.cuda.get_rng_state()
|
||||
state = torch.get_device_module(device_type).get_rng_state()
|
||||
|
||||
out = phase1(inp)
|
||||
out = checkpoint(run_fn, out, use_reentrant=True)
|
||||
out.sum().backward()
|
||||
grad_with_checkpointing = inp.grad
|
||||
|
||||
torch.cuda.set_rng_state(state)
|
||||
torch.get_device_module(device_type).set_rng_state(state)
|
||||
|
||||
inp.grad = None
|
||||
|
||||
@ -330,9 +332,9 @@ class TestCheckpoint(TestCase):
|
||||
|
||||
self.assertEqual(grad_with_checkpointing, grad_no_checkpointing)
|
||||
|
||||
@unittest.skipIf(not HAS_CUDA, "No CUDA")
|
||||
@unittest.skipIf(not TEST_GPU, "No accelerator")
|
||||
def test_checkpoint_not_preserve_rng_state_and_without_reentrant(self):
|
||||
inp = torch.randn(2, device="cuda").requires_grad_()
|
||||
inp = torch.randn(2, device=device_type).requires_grad_()
|
||||
layer = torch.nn.Dropout()
|
||||
|
||||
def run_fn(input):
|
||||
@ -435,10 +437,10 @@ class TestCheckpoint(TestCase):
|
||||
out = checkpoint(run_fn2, input_var, input_var2, use_reentrant=True)
|
||||
out.sum().backward()
|
||||
|
||||
@unittest.skipIf(not torch.cuda.is_available(), "Test requires CUDA")
|
||||
@unittest.skipIf(not TEST_GPU, "No accelerator")
|
||||
def test_checkpointing_without_reentrant_early_free(self):
|
||||
# I don't know how to check if the temporary saved variable buffer
|
||||
# get de-allocated directly. So using cuda memory usage as a proxy
|
||||
# get de-allocated directly. So using GPU memory usage as a proxy
|
||||
|
||||
def _do_test(fn, should_free):
|
||||
stats: list[int] = []
|
||||
@ -449,8 +451,8 @@ class TestCheckpoint(TestCase):
|
||||
# emptied at each step)
|
||||
def hook(_unused):
|
||||
self.assertEqual(len(stats), idx)
|
||||
torch.cuda.synchronize()
|
||||
stats.append(torch.cuda.memory_allocated())
|
||||
torch.accelerator.synchronize()
|
||||
stats.append(torch.accelerator.memory_allocated())
|
||||
if idx > 0:
|
||||
if should_free:
|
||||
self.assertLess(stats[idx], stats[idx - 1])
|
||||
@ -475,7 +477,7 @@ class TestCheckpoint(TestCase):
|
||||
|
||||
return stats
|
||||
|
||||
x = torch.zeros(10, device="cuda", requires_grad=True)
|
||||
x = torch.zeros(10, device=device_type, requires_grad=True)
|
||||
x.grad = torch.zeros_like(x)
|
||||
|
||||
# In a regular backward, buffers get eagerly freed
|
||||
@ -505,8 +507,8 @@ class TestCheckpoint(TestCase):
|
||||
@unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported")
|
||||
def test_get_device_states_recursive(self):
|
||||
inp = {
|
||||
"foo": torch.rand(10, device="cuda:0"),
|
||||
"bar": [torch.rand(10, device="cuda:1")],
|
||||
"foo": torch.rand(10, device=f"{device_type}:0"),
|
||||
"bar": [torch.rand(10, device=f"{device_type}:1")],
|
||||
}
|
||||
device_ids, device_states = get_device_states(inp)
|
||||
self.assertEqual(2, len(device_ids))
|
||||
@ -522,42 +524,42 @@ class TestCheckpoint(TestCase):
|
||||
self.assertEqual("meta", device_type)
|
||||
|
||||
@unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported")
|
||||
def test_infer_device_state_recursive_multi_cuda(self):
|
||||
# Check that no warning is issued for either cuda:0, cuda:1 or
|
||||
# cuda:0, cuda:0 cases since they are both the same device type
|
||||
def test_infer_device_state_recursive_multi_gpu(self):
|
||||
# Check that no warning is issued for either gpu:0, gpu:1 or
|
||||
# gpu:0, gpu:0 cases since they are both the same device type
|
||||
inp = {
|
||||
"foo": torch.rand(10, device="cuda:0"),
|
||||
"bar": [torch.rand(10, device="cuda:1")],
|
||||
"foo": torch.rand(10, device=f"{device_type}:0"),
|
||||
"bar": [torch.rand(10, device=f"{device_type}:1")],
|
||||
}
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("error")
|
||||
device_type = _infer_device_type(inp)
|
||||
self.assertEqual("cuda", device_type)
|
||||
_device_type = _infer_device_type(inp)
|
||||
self.assertEqual(device_type, _device_type)
|
||||
inp = {
|
||||
"foo": torch.rand(10, device="cuda:0"),
|
||||
"bar": [torch.rand(10, device="cuda:0")],
|
||||
"foo": torch.rand(10, device=f"{device_type}:0"),
|
||||
"bar": [torch.rand(10, device=f"{device_type}:0")],
|
||||
}
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("error")
|
||||
device_type = _infer_device_type(inp)
|
||||
self.assertEqual("cuda", device_type)
|
||||
# Check that a warning is issued for cuda:0, meta and that it includes
|
||||
_device_type = _infer_device_type(inp)
|
||||
self.assertEqual(device_type, _device_type)
|
||||
# Check that a warning is issued for gpu:0, meta and that it includes
|
||||
# device type information
|
||||
inp = {
|
||||
"foo": torch.rand(10, device="cuda:0"),
|
||||
"foo": torch.rand(10, device=f"{device_type}:0"),
|
||||
"bar": [torch.rand(10, device="meta")],
|
||||
}
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
device_type = _infer_device_type(inp)
|
||||
self.assertEqual("cuda", device_type)
|
||||
_device_type = _infer_device_type(inp)
|
||||
self.assertEqual(device_type, _device_type)
|
||||
self.assertEqual(len(w), 1)
|
||||
warning_msg = str(w[-1].message)
|
||||
self.assertTrue(
|
||||
"Tensor arguments, excluding CPU tensors, are detected on at least two types of devices"
|
||||
in warning_msg
|
||||
)
|
||||
self.assertTrue("Device types: ['cuda', 'meta']" in warning_msg)
|
||||
self.assertTrue("first device type: cuda" in warning_msg)
|
||||
self.assertTrue(f"Device types: ['{device_type}', 'meta']" in warning_msg)
|
||||
self.assertTrue(f"first device type: {device_type}" in warning_msg)
|
||||
|
||||
|
||||
class TestDataLoaderUtils(TestCase):
|
||||
@ -604,7 +606,7 @@ class TestDataLoaderUtils(TestCase):
|
||||
self.assertEqual(len(list(dataiter)), 1)
|
||||
|
||||
@unittest.skip(
|
||||
"FIXME: Intermittent CUDA out-of-memory error on Windows and time-out under ASAN"
|
||||
"FIXME: Intermittent GPU out-of-memory error on Windows and time-out under ASAN"
|
||||
)
|
||||
def test_multi_keep(self):
|
||||
dataloader: DataLoader = DataLoader(
|
||||
@ -861,27 +863,33 @@ class TestDeviceUtils(TestCase):
|
||||
@unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported")
|
||||
def test_get_default_device_more(self):
|
||||
try:
|
||||
torch.set_default_device("cuda")
|
||||
torch.set_default_device(device_type)
|
||||
self.assertEqual(torch.get_default_device(), torch.tensor([]).device)
|
||||
torch.set_default_device(None)
|
||||
|
||||
torch.set_default_device("cuda")
|
||||
torch.cuda.set_device("cuda:1")
|
||||
torch.set_default_device(device_type)
|
||||
torch.get_device_module(device_type).set_device(f"{device_type}:1")
|
||||
self.assertEqual(torch.get_default_device(), torch.tensor([]).device)
|
||||
torch.accelerator.set_device_index(1)
|
||||
self.assertEqual(torch.get_default_device(), torch.tensor([]).device)
|
||||
torch.set_default_device(None)
|
||||
|
||||
torch.set_default_device("cuda:1")
|
||||
torch.set_default_device(f"{device_type}:1")
|
||||
self.assertEqual(torch.get_default_device(), torch.tensor([]).device)
|
||||
torch.set_default_device(None)
|
||||
|
||||
torch.set_default_device("cuda:1")
|
||||
with torch.device("cuda:0"):
|
||||
self.assertEqual(torch.get_default_device(), torch.device("cuda", 0))
|
||||
torch.set_default_device(f"{device_type}:1")
|
||||
with torch.device(f"{device_type}:0"):
|
||||
self.assertEqual(
|
||||
torch.get_default_device(), torch.device(f"{device_type}", 0)
|
||||
)
|
||||
|
||||
torch.set_default_device("cpu")
|
||||
self.assertEqual(torch.get_default_device(), torch.device("cpu"))
|
||||
with torch.device("cuda:0"):
|
||||
self.assertEqual(torch.get_default_device(), torch.device("cuda", 0))
|
||||
with torch.device(f"{device_type}:0"):
|
||||
self.assertEqual(
|
||||
torch.get_default_device(), torch.device(f"{device_type}", 0)
|
||||
)
|
||||
|
||||
self.assertEqual(torch.get_default_device(), torch.device("cpu"))
|
||||
finally:
|
||||
|
||||
@@ -239,7 +239,10 @@ def reduction_combine(
if reduction_type in ("min", "max"):
    return f"{reduction_type}_propagate_nan({var}, {next_value})"
if reduction_type == "welford_reduce":
-     return f"welford_combine({var}, {next_value})"
+     if helper_val:
+         return f"welford_combine({var}, {next_value}, &{helper_val})"
+     else:
+         return f"welford_combine({var}, {next_value})"
if reduction_type == "welford_combine":
    if isinstance(next_value, tuple):
        mean, m2, weight = next_value
@ -2194,10 +2197,8 @@ class CppKernel(Kernel):
|
||||
# sum and welford
|
||||
# Note: using helper has non-negligible impact on performance
|
||||
|
||||
# keep the original behavior for welford_reduce
|
||||
# acc helper is not used for scalar welford_reduce
|
||||
if reduction_type == "welford_reduce":
|
||||
return not use_scalar
|
||||
return True
|
||||
|
||||
# TODO add supports for more data types when needed
|
||||
if reduction_type == "sum" and dtype == torch.float:
|
||||
@ -2323,9 +2324,15 @@ class CppKernel(Kernel):
|
||||
reduction_size = functools.reduce(
|
||||
operator.mul, self.ranges[self.reduction_depth :]
|
||||
)
|
||||
helper_val = self.cascade_helper_cse.generate(
|
||||
self.compute, f"reduction {reduction_key}", write=False
|
||||
)
|
||||
# use welford_helper/cascade_helper for vec kernel
|
||||
if reduction_type == "welford_reduce":
|
||||
helper_val = self.welford_helper_cse.generate(
|
||||
self.compute, f"reduction {reduction_key}", write=False
|
||||
)
|
||||
else:
|
||||
helper_val = self.cascade_helper_cse.generate(
|
||||
self.compute, f"reduction {reduction_key}", write=False
|
||||
)
|
||||
# rename the helper variable to distinguish it from vectorized version
|
||||
scalar_helper_val = f"scalar_{helper_val}"
|
||||
self._use_acc_helper(
|
||||
@ -3092,19 +3099,16 @@ class CppVecKernel(CppKernel):
|
||||
if self.ranges[self.tiling_idx] % self.tiling_factor
|
||||
else sympy.Integer(0)
|
||||
)
|
||||
# scalar helper for scalar sum is also needed when vec kernel is included
|
||||
# Note: is it different from welford reduction as welford reduction of scalar version
|
||||
# does not need helper, and the helper needs the information of reduction size to initialize
|
||||
if reduction_type == "sum":
|
||||
scalar_helper_val = f"scalar_{helper_val}"
|
||||
self._use_acc_helper(
|
||||
reduction_type,
|
||||
acc,
|
||||
scalar_helper_val,
|
||||
reduction_size,
|
||||
dtype,
|
||||
use_scalar=True,
|
||||
)
|
||||
# scalar helper for scalar welford_reduce/sum is also needed when vec kernel is included
|
||||
scalar_helper_val = f"scalar_{helper_val}"
|
||||
self._use_acc_helper(
|
||||
reduction_type,
|
||||
acc,
|
||||
scalar_helper_val,
|
||||
reduction_size,
|
||||
dtype,
|
||||
use_scalar=True,
|
||||
)
|
||||
self._use_acc_helper(
|
||||
reduction_type, acc, helper_val, helper_vec_range, dtype
|
||||
)
|
||||
|
||||
@ -61,7 +61,7 @@ static void setSignalHandler(
|
||||
sigaction(signal, &sa, old_sa_ptr) != 0) {
|
||||
std::ostringstream oss;
|
||||
oss << "An error occurred while setting handler for " << strsignal(signal)
|
||||
<< ".";
|
||||
<< '.';
|
||||
TORCH_CHECK(false, oss.str());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -29,14 +29,14 @@ PyObject* THPDevice_New(const at::Device& device) {

static PyObject* THPDevice_repr(THPDevice* self) {
std::ostringstream oss;
- oss << "device(type=\'" << self->device.type() << "\'";
+ oss << "device(type=\'" << self->device.type() << '\'';
if (self->device.has_index()) {
// `self->device.index()` returns uint8_t which is treated as ascii while
// printing, hence casting it to uint16_t.
// https://stackoverflow.com/questions/19562103/uint8-t-cant-be-printed-with-cout
oss << ", index=" << static_cast<uint16_t>(self->device.index());
}
- oss << ")";
+ oss << ')';
return THPUtils_packString(oss.str().c_str());
}
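The comment in the hunk above is the reason for the static_cast: uint8_t is a character type, so streaming it prints a glyph rather than a number. A tiny standalone illustration (the index value is an arbitrary example):

// --- illustrative sketch, not part of the diff ---
#include <cstdint>
#include <iostream>

int main() {
  std::uint8_t index = 65; // arbitrary example value
  std::cout << index << '\n';                             // prints "A"
  std::cout << static_cast<std::uint16_t>(index) << '\n'; // prints "65"
}
// --- end sketch ---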
@ -212,8 +212,8 @@ static PyObject* THPModule_initExtension(
|
||||
}
|
||||
auto frame_id = s_tb[idx];
|
||||
const auto& frame = s_tbs.all_frames.at(frame_id);
|
||||
oss << "#" << idx << " " << frame.funcname << " from " << frame.filename
|
||||
<< ":" << frame.lineno << '\n';
|
||||
oss << '#' << idx << ' ' << frame.funcname << " from " << frame.filename
|
||||
<< ':' << frame.lineno << '\n';
|
||||
}
|
||||
return oss.str();
|
||||
});
|
||||
@ -2781,8 +2781,8 @@ Call this whenever a new thread is created in order to propagate values from
|
||||
|
||||
py_module.def("_dump_local_tls_set", []() {
|
||||
auto local_keyset = c10::impl::tls_local_dispatch_key_set();
|
||||
std::cout << "Included: " << toString(local_keyset.included_) << "\n";
|
||||
std::cout << "Excluded: " << toString(local_keyset.excluded_) << "\n";
|
||||
std::cout << "Included: " << toString(local_keyset.included_) << '\n';
|
||||
std::cout << "Excluded: " << toString(local_keyset.excluded_) << '\n';
|
||||
});
|
||||
|
||||
py_module.def(
|
||||
|
||||
@ -254,7 +254,7 @@ static PyObject* THPFInfo_str(THPFInfo* self) {
|
||||
<< PyFloat_AsDouble(THPFInfo_smallest_normal(self, nullptr));
|
||||
oss << ", tiny=" << PyFloat_AsDouble(THPFInfo_tiny(self, nullptr));
|
||||
if (dtypeStr != nullptr) {
|
||||
oss << ", dtype=" << PyUnicode_AsUTF8(dtypeStr) << ")";
|
||||
oss << ", dtype=" << PyUnicode_AsUTF8(dtypeStr) << ')';
|
||||
}
|
||||
return !PyErr_Occurred() ? THPUtils_packString(oss.str().c_str()) : nullptr;
|
||||
}
|
||||
@ -266,7 +266,7 @@ static PyObject* THPIInfo_str(THPIInfo* self) {
|
||||
oss << "iinfo(min=" << PyLong_AsDouble(THPIInfo_min(self, nullptr));
|
||||
oss << ", max=" << PyLong_AsDouble(THPIInfo_max(self, nullptr));
|
||||
if (dtypeStr) {
|
||||
oss << ", dtype=" << PyUnicode_AsUTF8(dtypeStr) << ")";
|
||||
oss << ", dtype=" << PyUnicode_AsUTF8(dtypeStr) << ')';
|
||||
}
|
||||
|
||||
return !PyErr_Occurred() ? THPUtils_packString(oss.str().c_str()) : nullptr;
|
||||
|
||||
@ -271,7 +271,7 @@ struct TensorDataContainer {
|
||||
"TensorDataContainer_pretty_print_scalar",
|
||||
[&] { stream << scalar_.to<scalar_t>(); });
|
||||
} else if (is_init_list()) {
|
||||
stream << "{";
|
||||
stream << '{';
|
||||
for (const TensorDataContainer* it = init_list_.begin();
|
||||
it != init_list_.end();
|
||||
it++) {
|
||||
@ -279,9 +279,9 @@ struct TensorDataContainer {
|
||||
if (std::next(it) != init_list_.end())
|
||||
stream << ", ";
|
||||
}
|
||||
stream << "}";
|
||||
stream << '}';
|
||||
} else if (is_tensor()) {
|
||||
stream << "{";
|
||||
stream << '{';
|
||||
for (const auto i : c10::irange(tensor_.sizes()[0])) {
|
||||
AT_DISPATCH_ALL_TYPES_AND3(
|
||||
at::kBool,
|
||||
@ -293,7 +293,7 @@ struct TensorDataContainer {
|
||||
if (i != tensor_.sizes()[0] - 1)
|
||||
stream << ", ";
|
||||
}
|
||||
stream << "}";
|
||||
stream << '}';
|
||||
} else {
|
||||
TORCH_INTERNAL_ASSERT(false, "Invalid TensorDataContainer type");
|
||||
}
|
||||
|
||||
@ -145,7 +145,7 @@ class BatchNormImplBase : public NormImplBase<D, Derived, BatchNormOptions> {
|
||||
stream << ", "
|
||||
<< "affine=" << this->options.affine() << ", "
|
||||
<< "track_running_stats=" << this->options.track_running_stats()
|
||||
<< ")";
|
||||
<< ')';
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@ -28,13 +28,13 @@ class ParameterDictImpl : public Cloneable<ParameterDictImpl> {
|
||||
void pretty_print(std::ostream& stream) const override {
|
||||
stream << "torch::nn::ParameterDict(" << '\n';
|
||||
for (const auto& pair : parameters_) {
|
||||
stream << "(" << pair.key() << ")"
|
||||
<< ": Parameter containing: [" << pair.value().scalar_type()
|
||||
<< " of size " << pair.value().sizes() << "]";
|
||||
stream << '(' << pair.key() << ')' << ": Parameter containing: ["
|
||||
<< pair.value().scalar_type() << " of size "
|
||||
<< pair.value().sizes() << ']';
|
||||
;
|
||||
stream << '\n';
|
||||
}
|
||||
stream << ")";
|
||||
stream << ')';
|
||||
}
|
||||
|
||||
/// Insert the parameter along with the key into ParameterDict
|
||||
|
||||
@ -36,13 +36,13 @@ class ParameterListImpl : public Cloneable<ParameterListImpl> {
|
||||
void pretty_print(std::ostream& stream) const override {
|
||||
stream << "torch::nn::ParameterList(" << '\n';
|
||||
for (const auto& pair : parameters_) {
|
||||
stream << "(" << pair.key() << ")"
|
||||
<< ": Parameter containing: [" << pair.value().scalar_type()
|
||||
<< " of size " << pair.value().sizes() << "]";
|
||||
stream << '(' << pair.key() << ')' << ": Parameter containing: ["
|
||||
<< pair.value().scalar_type() << " of size "
|
||||
<< pair.value().sizes() << ']';
|
||||
;
|
||||
stream << '\n';
|
||||
}
|
||||
stream << ")";
|
||||
stream << ')';
|
||||
}
|
||||
|
||||
/// push the a given parameter at the end of the list
|
||||
|
||||
@ -113,8 +113,8 @@ class ConvNdImpl : public torch::nn::Cloneable<Derived> {
|
||||
|
||||
/// Pretty prints the `Conv{1,2,3}d` module into the given `stream`.
|
||||
void pretty_print(std::ostream& stream) const override {
|
||||
stream << "torch::nn::Conv" << D << "d"
|
||||
<< "(" << options.in_channels() << ", " << options.out_channels()
|
||||
stream << "torch::nn::Conv" << D << 'd' << '(' << options.in_channels()
|
||||
<< ", " << options.out_channels()
|
||||
<< ", kernel_size=" << options.kernel_size()
|
||||
<< ", stride=" << options.stride();
|
||||
std::visit(
|
||||
@ -143,7 +143,7 @@ class ConvNdImpl : public torch::nn::Cloneable<Derived> {
|
||||
stream << ", padding_mode="
|
||||
<< enumtype::get_enum_name(options.padding_mode());
|
||||
}
|
||||
stream << ")";
|
||||
stream << ')';
|
||||
}
|
||||
|
||||
/// The options with which this `Module` was constructed.
|
||||
@ -278,8 +278,8 @@ class ConvTransposeNdImpl : public ConvNdImpl<D, Derived> {
|
||||
|
||||
/// Pretty prints the `ConvTranspose{1,2,3}d` module into the given `stream`.
|
||||
void pretty_print(std::ostream& stream) const override {
|
||||
stream << "torch::nn::ConvTranspose" << D << "d"
|
||||
<< "(" << this->options.in_channels() << ", "
|
||||
stream << "torch::nn::ConvTranspose" << D << 'd' << '('
|
||||
<< this->options.in_channels() << ", "
|
||||
<< this->options.out_channels()
|
||||
<< ", kernel_size=" << this->options.kernel_size()
|
||||
<< ", stride=" << this->options.stride();
|
||||
@ -303,7 +303,7 @@ class ConvTransposeNdImpl : public ConvNdImpl<D, Derived> {
|
||||
stream << ", padding_mode="
|
||||
<< enumtype::get_enum_name(this->options.padding_mode());
|
||||
}
|
||||
stream << ")";
|
||||
stream << ')';
|
||||
}
|
||||
|
||||
protected:
|
||||
|
||||
@ -53,7 +53,7 @@ class InstanceNormImpl
|
||||
<< "momentum=" << this->options.momentum() << ", "
|
||||
<< "affine=" << this->options.affine() << ", "
|
||||
<< "track_running_stats=" << this->options.track_running_stats()
|
||||
<< ")";
|
||||
<< ')';
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@ -232,8 +232,8 @@ class TORCH_API AdaptiveMaxPoolImpl : public torch::nn::Cloneable<Derived> {
|
||||
/// Pretty prints the `AdaptiveMaxPool{1,2,3}d` module into the given
|
||||
/// `stream`.
|
||||
void pretty_print(std::ostream& stream) const override {
|
||||
stream << "torch::nn::AdaptiveMaxPool" << D << "d"
|
||||
<< "(output_size=" << options.output_size() << ")";
|
||||
stream << "torch::nn::AdaptiveMaxPool" << D << 'd'
|
||||
<< "(output_size=" << options.output_size() << ')';
|
||||
}
|
||||
|
||||
/// The options with which this `Module` was constructed.
|
||||
@ -365,8 +365,8 @@ class TORCH_API AdaptiveAvgPoolImpl : public torch::nn::Cloneable<Derived> {
|
||||
/// Pretty prints the `AdaptiveAvgPool{1,2,3}d` module into the given
|
||||
/// `stream`.
|
||||
void pretty_print(std::ostream& stream) const override {
|
||||
stream << "torch::nn::AdaptiveAvgPool" << D << "d"
|
||||
<< "(output_size=" << options.output_size() << ")";
|
||||
stream << "torch::nn::AdaptiveAvgPool" << D << 'd'
|
||||
<< "(output_size=" << options.output_size() << ')';
|
||||
}
|
||||
|
||||
/// The options with which this `Module` was constructed.
|
||||
|
||||
@ -355,11 +355,11 @@ void Module::pretty_print_recursive(
|
||||
stream << "(\n";
|
||||
const std::string next_indentation = indentation + " ";
|
||||
for (const auto& child : children_) {
|
||||
stream << next_indentation << "(" << child.key() << "): ";
|
||||
stream << next_indentation << '(' << child.key() << "): ";
|
||||
child.value()->pretty_print_recursive(stream, next_indentation);
|
||||
stream << '\n';
|
||||
}
|
||||
stream << indentation << ")";
|
||||
stream << indentation << ')';
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -21,7 +21,7 @@ void ELUImpl::pretty_print(std::ostream& stream) const {
|
||||
if (options.inplace()) {
|
||||
stream << std::boolalpha << ", inplace=" << options.inplace();
|
||||
}
|
||||
stream << ")";
|
||||
stream << ')';
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
@ -39,7 +39,7 @@ void SELUImpl::pretty_print(std::ostream& stream) const {
|
||||
if (options.inplace()) {
|
||||
stream << std::boolalpha << "inplace=" << options.inplace();
|
||||
}
|
||||
stream << ")";
|
||||
stream << ')';
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
@ -55,7 +55,7 @@ void HardshrinkImpl::reset() {}
|
||||
|
||||
void HardshrinkImpl::pretty_print(std::ostream& stream) const {
|
||||
stream << std::boolalpha << "torch::nn::Hardshrink(" << options.lambda()
|
||||
<< ")";
|
||||
<< ')';
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
@ -86,7 +86,7 @@ void HardtanhImpl::pretty_print(std::ostream& stream) const {
|
||||
if (options.inplace()) {
|
||||
stream << std::boolalpha << ", inplace=" << options.inplace();
|
||||
}
|
||||
stream << ")";
|
||||
stream << ')';
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
@ -107,7 +107,7 @@ void LeakyReLUImpl::pretty_print(std::ostream& stream) const {
|
||||
if (options.inplace()) {
|
||||
stream << std::boolalpha << ", inplace=" << options.inplace();
|
||||
}
|
||||
stream << ")";
|
||||
stream << ')';
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
@ -129,7 +129,7 @@ SoftmaxImpl::SoftmaxImpl(const SoftmaxOptions& options_) : options(options_) {}
|
||||
void SoftmaxImpl::reset() {}
|
||||
|
||||
void SoftmaxImpl::pretty_print(std::ostream& stream) const {
|
||||
stream << "torch::nn::Softmax(dim=" << options.dim() << ")";
|
||||
stream << "torch::nn::Softmax(dim=" << options.dim() << ')';
|
||||
}
|
||||
|
||||
Tensor SoftmaxImpl::forward(const Tensor& input) {
|
||||
@ -143,7 +143,7 @@ SoftminImpl::SoftminImpl(const SoftminOptions& options_) : options(options_) {}
|
||||
void SoftminImpl::reset() {}
|
||||
|
||||
void SoftminImpl::pretty_print(std::ostream& stream) const {
|
||||
stream << "torch::nn::Softmin(dim=" << options.dim() << ")";
|
||||
stream << "torch::nn::Softmin(dim=" << options.dim() << ')';
|
||||
}
|
||||
|
||||
Tensor SoftminImpl::forward(const Tensor& input) {
|
||||
@ -158,7 +158,7 @@ LogSoftmaxImpl::LogSoftmaxImpl(const LogSoftmaxOptions& options_)
|
||||
void LogSoftmaxImpl::reset() {}
|
||||
|
||||
void LogSoftmaxImpl::pretty_print(std::ostream& stream) const {
|
||||
stream << "torch::nn::LogSoftmax(dim=" << options.dim() << ")";
|
||||
stream << "torch::nn::LogSoftmax(dim=" << options.dim() << ')';
|
||||
}
|
||||
|
||||
Tensor LogSoftmaxImpl::forward(const Tensor& input) {
|
||||
@ -197,7 +197,7 @@ void PReLUImpl::reset() {
|
||||
|
||||
void PReLUImpl::pretty_print(std::ostream& stream) const {
|
||||
stream << "torch::nn::PReLU(num_parameters=" << options.num_parameters()
|
||||
<< ")";
|
||||
<< ')';
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
@ -215,7 +215,7 @@ void ReLUImpl::pretty_print(std::ostream& stream) const {
|
||||
if (options.inplace()) {
|
||||
stream << std::boolalpha << "inplace=" << options.inplace();
|
||||
}
|
||||
stream << ")";
|
||||
stream << ')';
|
||||
}
|
||||
|
||||
// ============================================================================
@ -233,7 +233,7 @@ void ReLU6Impl::pretty_print(std::ostream& stream) const {
if (options.inplace()) {
stream << std::boolalpha << "inplace=" << options.inplace();
}
stream << ")";
stream << ')';
}

// ============================================================================
@ -257,7 +257,7 @@ void RReLUImpl::pretty_print(std::ostream& stream) const {
if (options.inplace()) {
stream << std::boolalpha << ", inplace=" << options.inplace();
}
stream << ")";
stream << ')';
}

// ============================================================================
@ -275,7 +275,7 @@ void CELUImpl::pretty_print(std::ostream& stream) const {
if (options.inplace()) {
stream << std::boolalpha << ", inplace=" << options.inplace();
}
stream << ")";
stream << ')';
}

// ============================================================================
@ -289,7 +289,7 @@ Tensor GLUImpl::forward(const Tensor& input) {
void GLUImpl::reset() {}

void GLUImpl::pretty_print(std::ostream& stream) const {
stream << "torch::nn::GLU(dim=" << options.dim() << ")";
stream << "torch::nn::GLU(dim=" << options.dim() << ')';
}

// ============================================================================
@ -355,7 +355,7 @@ void SoftplusImpl::reset() {}

void SoftplusImpl::pretty_print(std::ostream& stream) const {
stream << "torch::nn::Softplus(beta=" << options.beta()
<< ", threshold=" << options.threshold() << ")";
<< ", threshold=" << options.threshold() << ')';
}

// ============================================================================
@ -370,7 +370,7 @@ Tensor SoftshrinkImpl::forward(const Tensor& input) {
void SoftshrinkImpl::reset() {}

void SoftshrinkImpl::pretty_print(std::ostream& stream) const {
stream << "torch::nn::Softshrink(" << options.lambda() << ")";
stream << "torch::nn::Softshrink(" << options.lambda() << ')';
}

// ============================================================================
@ -430,7 +430,7 @@ void ThresholdImpl::pretty_print(std::ostream& stream) const {
if (options.inplace()) {
stream << std::boolalpha << ", inplace=" << options.inplace();
}
stream << ")";
stream << ')';
}

// ============================================================================

@ -12,7 +12,7 @@ void CosineSimilarityImpl::reset() {}

void CosineSimilarityImpl::pretty_print(std::ostream& stream) const {
stream << std::boolalpha << "torch::nn::CosineSimilarity"
<< "(dim=" << options.dim() << ", eps=" << options.eps() << ")";
<< "(dim=" << options.dim() << ", eps=" << options.eps() << ')';
}

Tensor CosineSimilarityImpl::forward(const Tensor& x1, const Tensor& x2) {
@ -30,7 +30,7 @@ void PairwiseDistanceImpl::reset() {}
void PairwiseDistanceImpl::pretty_print(std::ostream& stream) const {
stream << std::boolalpha << "torch::nn::PairwiseDistance"
<< "(p=" << options.p() << ", eps=" << options.eps()
<< ", keepdim=" << options.keepdim() << ")";
<< ", keepdim=" << options.keepdim() << ')';
}

Tensor PairwiseDistanceImpl::forward(const Tensor& x1, const Tensor& x2) {

@ -19,7 +19,7 @@ Tensor DropoutImpl::forward(Tensor input) {

void DropoutImpl::pretty_print(std::ostream& stream) const {
stream << std::boolalpha << "torch::nn::Dropout(p=" << options.p()
<< ", inplace=" << options.inplace() << ")";
<< ", inplace=" << options.inplace() << ')';
}

// ============================================================================
@ -31,7 +31,7 @@ Tensor Dropout2dImpl::forward(Tensor input) {

void Dropout2dImpl::pretty_print(std::ostream& stream) const {
stream << std::boolalpha << "torch::nn::Dropout2d(p=" << options.p()
<< ", inplace=" << options.inplace() << ")";
<< ", inplace=" << options.inplace() << ')';
}

// ============================================================================
@ -43,7 +43,7 @@ Tensor Dropout3dImpl::forward(Tensor input) {

void Dropout3dImpl::pretty_print(std::ostream& stream) const {
stream << std::boolalpha << "torch::nn::Dropout3d(p=" << options.p()
<< ", inplace=" << options.inplace() << ")";
<< ", inplace=" << options.inplace() << ')';
}

// ============================================================================
@ -55,7 +55,7 @@ Tensor AlphaDropoutImpl::forward(const Tensor& input) {

void AlphaDropoutImpl::pretty_print(std::ostream& stream) const {
stream << std::boolalpha << "torch::nn::AlphaDropout(p=" << options.p()
<< ", inplace=" << options.inplace() << ")";
<< ", inplace=" << options.inplace() << ')';
}

// ============================================================================
@ -67,7 +67,7 @@ Tensor FeatureAlphaDropoutImpl::forward(const Tensor& input) {

void FeatureAlphaDropoutImpl::pretty_print(std::ostream& stream) const {
stream << std::boolalpha << "torch::nn::FeatureAlphaDropout(p=" << options.p()
<< ", inplace=" << options.inplace() << ")";
<< ", inplace=" << options.inplace() << ')';
}

} // namespace torch::nn

@ -76,7 +76,7 @@ void EmbeddingImpl::pretty_print(std::ostream& stream) const {
if (options.sparse()) {
stream << ", sparse=" << std::boolalpha << options.sparse();
}
stream << ")";
stream << ')';
}

torch::Tensor EmbeddingImpl::forward(const Tensor& input) {
@ -181,6 +181,6 @@ void EmbeddingBagImpl::pretty_print(std::ostream& stream) const {
if (padding_idx_opt.has_value()) {
stream << ", padding_idx=" << padding_idx_opt.value();
}
stream << ")";
stream << ')';
}
} // namespace torch::nn

@ -17,7 +17,7 @@ void FoldImpl::pretty_print(std::ostream& stream) const {
<< ", kernel_size=" << options.kernel_size()
<< ", dilation=" << options.dilation()
<< ", padding=" << options.padding() << ", stride=" << options.stride()
<< ")";
<< ')';
}

Tensor FoldImpl::forward(const Tensor& input) {
@ -40,7 +40,7 @@ void UnfoldImpl::pretty_print(std::ostream& stream) const {
stream << "torch::nn::Unfold(kernel_size=" << options.kernel_size()
<< ", dilation=" << options.dilation()
<< ", padding=" << options.padding() << ", stride=" << options.stride()
<< ")";
<< ')';
}

Tensor UnfoldImpl::forward(const Tensor& input) {

@ -55,7 +55,7 @@ void LinearImpl::pretty_print(std::ostream& stream) const {
stream << std::boolalpha
<< "torch::nn::Linear(in_features=" << options.in_features()
<< ", out_features=" << options.out_features()
<< ", bias=" << options.bias() << ")";
<< ", bias=" << options.bias() << ')';
}

Tensor LinearImpl::forward(const Tensor& input) {
@ -70,7 +70,7 @@ void FlattenImpl::reset() {}

void FlattenImpl::pretty_print(std::ostream& stream) const {
stream << "torch::nn::Flatten(start_dim=" << options.start_dim()
<< ", end_dim=" << options.end_dim() << ")";
<< ", end_dim=" << options.end_dim() << ')';
}

Tensor FlattenImpl::forward(const Tensor& input) {
@ -161,7 +161,7 @@ void BilinearImpl::pretty_print(std::ostream& stream) const {
<< "torch::nn::Bilinear(in1_features=" << options.in1_features()
<< ", in2_features=" << options.in2_features()
<< ", out_features=" << options.out_features()
<< ", bias=" << options.bias() << ")";
<< ", bias=" << options.bias() << ')';
}

Tensor BilinearImpl::forward(const Tensor& input1, const Tensor& input2) {

@ -74,7 +74,7 @@ HingeEmbeddingLossImpl::HingeEmbeddingLossImpl(
void HingeEmbeddingLossImpl::reset() {}

void HingeEmbeddingLossImpl::pretty_print(std::ostream& stream) const {
stream << "torch::nn::HingeEmbeddingLoss(margin=" << options.margin() << ")";
stream << "torch::nn::HingeEmbeddingLoss(margin=" << options.margin() << ')';
}

Tensor HingeEmbeddingLossImpl::forward(
@ -104,7 +104,7 @@ void MultiMarginLossImpl::pretty_print(std::ostream& stream) const {
stream << "torch::nn::MultiMarginLoss(p=" << options.p()
<< ", margin=" << options.margin() << ", weight=" << options.weight()
<< ", reduction=" << enumtype::get_enum_name(options.reduction())
<< ")";
<< ')';
}

Tensor MultiMarginLossImpl::forward(const Tensor& input, const Tensor& target) {
@ -126,7 +126,7 @@ CosineEmbeddingLossImpl::CosineEmbeddingLossImpl(
void CosineEmbeddingLossImpl::reset() {}

void CosineEmbeddingLossImpl::pretty_print(std::ostream& stream) const {
stream << "torch::nn::CosineEmbeddingLoss(margin=" << options.margin() << ")";
stream << "torch::nn::CosineEmbeddingLoss(margin=" << options.margin() << ')';
}

Tensor CosineEmbeddingLossImpl::forward(
@ -169,7 +169,7 @@ void TripletMarginLossImpl::reset() {}
void TripletMarginLossImpl::pretty_print(std::ostream& stream) const {
stream << "torch::nn::TripletMarginLoss(margin=" << options.margin()
<< ", p=" << options.p() << ", eps=" << options.eps() << std::boolalpha
<< ", swap=" << options.swap() << ")";
<< ", swap=" << options.swap() << ')';
}

Tensor TripletMarginLossImpl::forward(
@ -199,7 +199,7 @@ void TripletMarginWithDistanceLossImpl::pretty_print(
std::ostream& stream) const {
stream << "torch::nn::TripletMarginWithDistanceLoss(margin="
<< options.margin() << std::boolalpha << ", swap=" << options.swap()
<< ")";
<< ')';
}

Tensor TripletMarginWithDistanceLossImpl::forward(
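Every hunk above applies the same mechanical cleanup: wherever an operator<< chain streams a single character as a C-string literal (")", ":", "'"), the string literal is replaced by the equivalent char literal (')', ':', '\''), while multi-character fragments such as ", inplace=" stay as strings. The sketch below illustrates the before/after shape of the change on a standalone example; the DemoOptions struct and the pretty_print_* helpers are hypothetical stand-ins rather than code from this diff, and the usual motivation — streaming a char skips the null-terminated C-string path of operator<< — is an assumption about intent, not something stated in the commit.

#include <iostream>
#include <ostream>

// Hypothetical stand-in for a module options object; not taken from the diff.
struct DemoOptions {
  double p = 0.5;
  bool inplace = false;
};

// Before: single characters emitted as C-string literals, e.g. ")".
void pretty_print_before(std::ostream& stream, const DemoOptions& options) {
  stream << std::boolalpha << "Demo(p=" << options.p
         << ", inplace=" << options.inplace << ")";
}

// After: the same single characters emitted as char literals, e.g. ')'.
// Multi-character fragments such as ", inplace=" remain string literals.
void pretty_print_after(std::ostream& stream, const DemoOptions& options) {
  stream << std::boolalpha << "Demo(p=" << options.p
         << ", inplace=" << options.inplace << ')';
}

int main() {
  DemoOptions options;
  pretty_print_before(std::cout, options);
  std::cout << '\n';
  pretty_print_after(std::cout, options);  // prints the same text as above
  std::cout << '\n';
}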
Some files were not shown because too many files have changed in this diff.