mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 21:14:14 +08:00)

commit fddabc6e0b (parent 2f6a70bfea), committed by PyTorch MergeBot

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/6357
Pull Request resolved: https://github.com/pytorch/pytorch/pull/138364
Approved by: https://github.com/Skylion007, https://github.com/eqy
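Every hunk below makes the same mechanical change: the compiler-specific C10_UNUSED macro is replaced with the standard C++17 [[maybe_unused]] attribute, and clang-format re-wraps some of the touched declarations. As orientation, here is a minimal, self-contained sketch (not taken from the PyTorch sources; all names are illustrative) of where the attribute goes for the three declaration kinds this diff touches: function parameters, type aliases, and local variables.

#include <cstdio>

// Unused function parameter: the attribute precedes the parameter, as in
// `[[maybe_unused]] DeviceIndex device_index = -1` below.
int scaled([[maybe_unused]] int verbosity, int x) {
  return 2 * x; // verbosity is intentionally ignored in this sketch
}

// Unused type alias: the attribute follows the alias name, as in
// `using HINT [[maybe_unused]] = ...` in the dispatch macros below.
template <typename T>
int use_alias(int x) {
  using hint_t [[maybe_unused]] = T; // silences -Wunused-local-typedef
  return scaled(0, x);
}

int main() {
  // Unused local variable, as in `[[maybe_unused]] int bit_width = bitwidth`.
  [[maybe_unused]] int bit_width = 8;
  std::printf("%d\n", use_alias<double>(21));
  return 0;
}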
@@ -68,7 +68,7 @@ struct strided_tensor_iter_fixed {
   strided_tensor_iter_fixed(strided_tensor_iter_fixed&&) = default;
   strided_tensor_iter_fixed(
       Tensor& tensor,
-      C10_UNUSED bool sort_strides = false)
+      [[maybe_unused]] bool sort_strides = false)
       : data_(tensor.data_ptr<T>()) {
     std::memset(counter_, 0, sizeof(int64_t) * N);
     if (tensor.dim() > 0) {
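Most of the remaining hunks apply the attribute to a loop index that exists only to run a c10::irange loop a fixed number of times. A stand-alone illustration of that pattern (irange here is a trivial hypothetical stand-in, not the c10 helper):

#include <cstdio>
#include <numeric>
#include <vector>

// Hypothetical stand-in for c10::irange: a materialized list of indices.
static std::vector<int> irange(int n) {
  std::vector<int> v(n);
  std::iota(v.begin(), v.end(), 0);
  return v;
}

int main() {
  int emitted = 0;
  // The index is never read; [[maybe_unused]] documents that and keeps
  // -Wunused-variable quiet without a project-specific macro.
  for ([[maybe_unused]] const auto i : irange(4)) {
    ++emitted;
  }
  std::printf("emitted %d\n", emitted);
  return 0;
}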
@@ -63,38 +63,38 @@ TORCH_API void record_kernel_function_dtype(std::string name);
     } \
   } while (0)
 
 #define AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, HINT, ...) \
   case enum_type: { \
     AT_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type); \
-    using HINT C10_UNUSED = c10::impl::ScalarTypeToCPPTypeT<enum_type>; \
+    using HINT [[maybe_unused]] = c10::impl::ScalarTypeToCPPTypeT<enum_type>; \
     return __VA_ARGS__(); \
   }
 
 #define AT_DISPATCH_CASE(enum_type, ...) \
   AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, scalar_t, __VA_ARGS__)
 
 #define AT_DISPATCH_CASE_QINT(enum_type, scalar_type, ...) \
   case enum_type: { \
     AT_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type); \
     using scalar_t = scalar_type; \
-    using underlying_t C10_UNUSED = typename scalar_t::underlying; \
-    C10_UNUSED const auto& SCALAR_TYPE = enum_type; \
-    C10_UNUSED const auto& UNDERLYING_TYPE = toUnderlying(enum_type); \
+    using underlying_t [[maybe_unused]] = typename scalar_t::underlying; \
+    [[maybe_unused]] const auto& SCALAR_TYPE = enum_type; \
+    [[maybe_unused]] const auto& UNDERLYING_TYPE = toUnderlying(enum_type); \
     return __VA_ARGS__(); \
   }
 
 #define AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE( \
     enum_type, scalar_type, bitwidth, qmin, qmax, ...) \
   case enum_type: { \
     AT_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type); \
     using scalar_t = scalar_type; \
-    using underlying_t C10_UNUSED = typename scalar_t::underlying; \
-    C10_UNUSED const auto& SCALAR_TYPE = enum_type; \
-    C10_UNUSED const auto& UNDERLYING_TYPE = toUnderlying(enum_type); \
-    C10_UNUSED int bit_width = bitwidth; \
-    C10_UNUSED int64_t quant_min = qmin; \
-    C10_UNUSED int64_t quant_max = qmax; \
+    using underlying_t [[maybe_unused]] = typename scalar_t::underlying; \
+    [[maybe_unused]] const auto& SCALAR_TYPE = enum_type; \
+    [[maybe_unused]] const auto& UNDERLYING_TYPE = toUnderlying(enum_type); \
+    [[maybe_unused]] int bit_width = bitwidth; \
+    [[maybe_unused]] int64_t quant_min = qmin; \
+    [[maybe_unused]] int64_t quant_max = qmax; \
     return __VA_ARGS__(); \
   }
 
 namespace detail {
@@ -638,7 +638,7 @@ void replace_(const ITensorListRef functional_tensor, ITensorListRef other) {
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(functional_tensor.size() == other.size());
   auto functional_tensor_it = functional_tensor.begin();
   auto other_it = other.begin();
-  for (C10_UNUSED const auto i : c10::irange(functional_tensor.size())) {
+  for ([[maybe_unused]] const auto i : c10::irange(functional_tensor.size())) {
     replace_(*functional_tensor_it++, *other_it++);
   }
 }
@@ -655,7 +655,7 @@ void propagate_xla_data(const ITensorListRef functional_tensor, ITensorListRef o
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(functional_tensor.size() == other.size());
   auto functional_tensor_it = functional_tensor.begin();
   auto other_it = other.begin();
-  for (C10_UNUSED const auto i : c10::irange(functional_tensor.size())) {
+  for ([[maybe_unused]] const auto i : c10::irange(functional_tensor.size())) {
     propagate_xla_data(*functional_tensor_it++, *other_it++);
   }
 }
@@ -670,7 +670,7 @@ void propagate_xla_data_direct(const ITensorListRef tensor,
     ITensorListRef other) {
   auto tensor_it = tensor.begin();
   auto other_it = other.begin();
-  for (C10_UNUSED const auto i : c10::irange(tensor.size())) {
+  for ([[maybe_unused]] const auto i : c10::irange(tensor.size())) {
     propagate_xla_data_direct(*tensor_it++, *other_it++);
   }
 }
@@ -205,7 +205,7 @@ struct CodeTemplate {
   // or trailing newlines. It's the responsibility of the calling function
   // to indent correctly in the context.
   void emitIndent(std::ostream& out, size_t indent) const {
-    for (C10_UNUSED const auto i : c10::irange(indent)) {
+    for ([[maybe_unused]] const auto i : c10::irange(indent)) {
       out << " ";
     }
   }
@@ -153,7 +153,7 @@ static std::tuple<double, int> __printFormat(std::ostream& stream, const Tensor&
 
 static void __printIndent(std::ostream &stream, int64_t indent)
 {
-  for (C10_UNUSED const auto i : c10::irange(indent)) {
+  for ([[maybe_unused]] const auto i : c10::irange(indent)) {
     stream << " ";
   }
 }
@@ -390,7 +390,8 @@ struct TORCH_API ClassType : public NamedType {
       std::string doc_string = "",
       std::vector<std::string> unresolved_class_attributes = {});
 
-  std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override {
+  std::string annotation_str_impl(
+      [[maybe_unused]] const TypePrinter& printer = nullptr) const override {
     const auto& n = name().value();
     return n.qualifiedName();
   }
@@ -376,8 +376,8 @@ DynamicTypePtr ivalue::TupleTypeFactory<c10::DynamicType>::fallback(
   return nullptr;
 }
 
-TORCH_API TupleTypePtr
-ivalue::TupleTypeFactory<TupleType>::fallback(C10_UNUSED const Type& type) {
+TORCH_API TupleTypePtr ivalue::TupleTypeFactory<TupleType>::fallback(
+    [[maybe_unused]] const Type& type) {
 #ifdef C10_MOBILE
   return nullptr;
 #else
@@ -398,5 +398,4 @@ ivalue::TupleTypeFactory<TupleType>::fallback(C10_UNUSED const Type& type) {
 #endif
 }
 
-
 } // namespace c10
@@ -88,7 +88,7 @@ struct TORCH_API EnumType : public NamedType {
         cu_(std::move(cu)) {}
 
   std::string annotation_str_impl(
-      C10_UNUSED const TypePrinter& printer = nullptr) const override {
+      [[maybe_unused]] const TypePrinter& printer = nullptr) const override {
     const auto& n = name().value();
     return n.qualifiedName();
   }
@@ -56,7 +56,7 @@ struct TORCH_API Function {
   virtual c10::intrusive_ptr<c10::ivalue::Future> runAsync(
       Stack& /*stack*/,
       // NOLINTNEXTLINE(performance-unnecessary-value-param)
-      C10_UNUSED TaskLauncher taskLauncher = at::launch) {
+      [[maybe_unused]] TaskLauncher taskLauncher = at::launch) {
     TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
     return {};
   }
@@ -1278,7 +1278,8 @@ struct TORCH_API NumberType : public Type {
  protected:
   NumberType(TypeKind kind = TypeKind::NumberType) : Type(kind) {}
 
-  std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override {
+  std::string annotation_str_impl(
+      [[maybe_unused]] const TypePrinter& printer = nullptr) const override {
     return "number"; // technically not a valid python type, but
                      // we need to use it when parsing back in annotations
                      // for implicit conversions
@@ -1305,7 +1306,8 @@ struct TORCH_API FloatType : public NumberType {
 
  private:
   FloatType() : NumberType(TypeKind::FloatType) {}
-  std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override {
+  std::string annotation_str_impl(
+      [[maybe_unused]] const TypePrinter& printer = nullptr) const override {
     return "float";
   }
 };
@@ -1330,7 +1332,8 @@ struct TORCH_API ComplexType : public NumberType {
 
  private:
   ComplexType() : NumberType(TypeKind::ComplexType) {}
-  std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override {
+  std::string annotation_str_impl(
+      [[maybe_unused]] const TypePrinter& printer = nullptr) const override {
     return "complex";
   }
 };
@@ -1419,7 +1422,8 @@ struct TORCH_API IntType : public NumberType {
 
  private:
   IntType() : NumberType(TypeKind::IntType) {}
-  std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override {
+  std::string annotation_str_impl(
+      [[maybe_unused]] const TypePrinter& printer = nullptr) const override {
     return "int";
   }
 };
@@ -1453,7 +1457,8 @@ struct TORCH_API StringType : public Type {
     // we only use "str" (not "string") in both FunctionSchema and script
     return annotation_str();
   }
-  std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override {
+  std::string annotation_str_impl(
+      [[maybe_unused]] const TypePrinter& printer = nullptr) const override {
     return "str";
   }
   static const TypeKind Kind = TypeKind::StringType;
@@ -1473,7 +1478,8 @@ struct TORCH_API StorageType : public Type {
   std::string str() const override {
     return annotation_str();
   }
-  std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override {
+  std::string annotation_str_impl(
+      [[maybe_unused]] const TypePrinter& printer = nullptr) const override {
     return "Storage";
   }
   static const TypeKind Kind = TypeKind::StorageType;
@@ -1508,7 +1514,8 @@ struct TORCH_API FunctionType : public NamedType {
 
  private:
   FunctionType(torch::jit::Function* function);
-  std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override {
+  std::string annotation_str_impl(
+      [[maybe_unused]] const TypePrinter& printer = nullptr) const override {
     const auto& n = name().value();
     return n.qualifiedName();
   }
@@ -2199,7 +2206,8 @@ struct TORCH_API InterfaceType : public NamedType {
       const InterfaceType& rhs,
       std::ostream* why_not);
 
-  std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override {
+  std::string annotation_str_impl(
+      [[maybe_unused]] const TypePrinter& printer = nullptr) const override {
     return name()->qualifiedName();
   }
 
@@ -1121,7 +1121,7 @@ inline void convert(const src_T *src, dst_T *dst, int64_t n) {
 #ifndef _MSC_VER
 # pragma unroll
 #endif
-  for (C10_UNUSED const auto i : c10::irange(n)) {
+  for ([[maybe_unused]] const auto i : c10::irange(n)) {
     *dst = c10::convert<dst_T>(c10::load(src));
     src++;
     dst++;
@@ -157,18 +157,19 @@ constexpr const char* _cusolver_backend_suggestion = \
 // See NOTE [ USE OF NVRTC AND DRIVER API ].
 #if !defined(USE_ROCM)
 
-#define AT_CUDA_DRIVER_CHECK(EXPR) \
-  do { \
-    CUresult __err = EXPR; \
-    if (__err != CUDA_SUCCESS) { \
-      const char* err_str; \
-      C10_UNUSED CUresult get_error_str_err = at::globalContext().getNVRTC().cuGetErrorString(__err, &err_str); \
-      if (get_error_str_err != CUDA_SUCCESS) { \
-        AT_ERROR("CUDA driver error: unknown error"); \
-      } else { \
-        AT_ERROR("CUDA driver error: ", err_str); \
-      } \
-    } \
+#define AT_CUDA_DRIVER_CHECK(EXPR) \
+  do { \
+    CUresult __err = EXPR; \
+    if (__err != CUDA_SUCCESS) { \
+      const char* err_str; \
+      [[maybe_unused]] CUresult get_error_str_err = \
+          at::globalContext().getNVRTC().cuGetErrorString(__err, &err_str); \
+      if (get_error_str_err != CUDA_SUCCESS) { \
+        AT_ERROR("CUDA driver error: unknown error"); \
+      } else { \
+        AT_ERROR("CUDA driver error: ", err_str); \
+      } \
+    } \
   } while (0)
 
 #else
@@ -69,8 +69,12 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
     TORCH_CHECK(false, "Cannot initialize CUDA without ATen_cuda library. ", CUDA_HELP);
   }
 
-  virtual const Generator& getDefaultCUDAGenerator(C10_UNUSED DeviceIndex device_index = -1) const {
-    TORCH_CHECK(false, "Cannot get default CUDA generator without ATen_cuda library. ", CUDA_HELP);
+  virtual const Generator& getDefaultCUDAGenerator(
+      [[maybe_unused]] DeviceIndex device_index = -1) const {
+    TORCH_CHECK(
+        false,
+        "Cannot get default CUDA generator without ATen_cuda library. ",
+        CUDA_HELP);
   }
 
   Device getDeviceFromPtr(void* /*data*/) const override {
@@ -32,12 +32,15 @@ struct TORCH_API XPUHooksInterface : AcceleratorHooksInterface{
     TORCH_CHECK(false, "Cannot get XPU global device index without ATen_xpu library.");
   }
 
-  virtual Generator getXPUGenerator(C10_UNUSED DeviceIndex device_index = -1) const {
+  virtual Generator getXPUGenerator(
+      [[maybe_unused]] DeviceIndex device_index = -1) const {
     TORCH_CHECK(false, "Cannot get XPU generator without ATen_xpu library.");
   }
 
-  virtual const Generator& getDefaultXPUGenerator(C10_UNUSED DeviceIndex device_index = -1) const {
-    TORCH_CHECK(false, "Cannot get default XPU generator without ATen_xpu library.");
+  virtual const Generator& getDefaultXPUGenerator(
+      [[maybe_unused]] DeviceIndex device_index = -1) const {
+    TORCH_CHECK(
+        false, "Cannot get default XPU generator without ATen_xpu library.");
   }
 
   virtual DeviceIndex getNumGPUs() const {
@@ -135,7 +135,7 @@ static Tensor make_feature_noise(const Tensor& input) {
   sizes.reserve(input.dim());
   sizes.push_back(input_sizes[0]);
   sizes.push_back(input_sizes[1]);
-  for (C10_UNUSED const auto i : c10::irange(2, input.dim())) {
+  for ([[maybe_unused]] const auto i : c10::irange(2, input.dim())) {
    sizes.push_back(1);
   }
   // NB: THIS WAS CHANGED FROM THE ORIGINAL
@@ -1109,7 +1109,7 @@ void unpack_pivots_cpu_kernel(TensorIterator& iter, const int64_t dim_size, cons
   auto* perm_ptr = data[0];
   const auto* pivots_ptr = data[1];
 
-  for (C10_UNUSED const auto elem : c10::irange(nelems)) {
+  for ([[maybe_unused]] const auto elem : c10::irange(nelems)) {
     // WARNING: linalg.lu_factor returns int32 pivots,
     // this behavior could change in the future.
     const auto perm_data = reinterpret_cast<int64_t*>(perm_ptr);
@@ -133,30 +133,50 @@ float bf16_dot_with_fp32_arith(
 #endif
 
 template <typename scalar_t>
-bool scal_use_fast_path(C10_UNUSED int64_t n, C10_UNUSED int64_t incx) {
+bool scal_use_fast_path(
+    [[maybe_unused]] int64_t n,
+    [[maybe_unused]] int64_t incx) {
   return false;
 }
 
 template <typename scalar_t>
-bool gemv_use_fast_path(C10_UNUSED char trans, C10_UNUSED int64_t m,
-                        C10_UNUSED int64_t n, C10_UNUSED scalar_t alpha,
-                        C10_UNUSED int64_t lda,
-                        C10_UNUSED int64_t incx, C10_UNUSED scalar_t beta,
-                        C10_UNUSED int64_t incy) {
+bool gemv_use_fast_path(
+    [[maybe_unused]] char trans,
+    [[maybe_unused]] int64_t m,
+    [[maybe_unused]] int64_t n,
+    [[maybe_unused]] scalar_t alpha,
+    [[maybe_unused]] int64_t lda,
+    [[maybe_unused]] int64_t incx,
+    [[maybe_unused]] scalar_t beta,
+    [[maybe_unused]] int64_t incy) {
   return false;
 }
 
 template <typename scalar_t>
-void scal_fast_path(C10_UNUSED int *n, C10_UNUSED scalar_t *a, C10_UNUSED scalar_t *x, C10_UNUSED int *incx) {
-  TORCH_INTERNAL_ASSERT(false, "scal_fast_path shouldn't be called for this configuration");
+void scal_fast_path(
+    [[maybe_unused]] int* n,
+    [[maybe_unused]] scalar_t* a,
+    [[maybe_unused]] scalar_t* x,
+    [[maybe_unused]] int* incx) {
+  TORCH_INTERNAL_ASSERT(
+      false, "scal_fast_path shouldn't be called for this configuration");
 }
 
 template <typename scalar_t>
-void gemv_fast_path(C10_UNUSED const char *trans, C10_UNUSED const int *m, C10_UNUSED const int *n,
-                    C10_UNUSED const scalar_t *alpha, C10_UNUSED const scalar_t *a, C10_UNUSED const int *lda,
-                    C10_UNUSED const scalar_t *x, C10_UNUSED const int *incx, C10_UNUSED const scalar_t *beta,
-                    C10_UNUSED scalar_t *y, C10_UNUSED const int *incy) {
-  TORCH_INTERNAL_ASSERT(false, "gemv_fast_path shouldn't be called for this configuration");
+void gemv_fast_path(
+    [[maybe_unused]] const char* trans,
+    [[maybe_unused]] const int* m,
+    [[maybe_unused]] const int* n,
+    [[maybe_unused]] const scalar_t* alpha,
+    [[maybe_unused]] const scalar_t* a,
+    [[maybe_unused]] const int* lda,
+    [[maybe_unused]] const scalar_t* x,
+    [[maybe_unused]] const int* incx,
+    [[maybe_unused]] const scalar_t* beta,
+    [[maybe_unused]] scalar_t* y,
+    [[maybe_unused]] const int* incy) {
+  TORCH_INTERNAL_ASSERT(
+      false, "gemv_fast_path shouldn't be called for this configuration");
 }
 
 #define INSTANTIATE(scalar_t) \
@ -188,15 +208,32 @@ void scal_fast_path<float>(int *n, float *a, float *x, int *incx) {
|
||||
}
|
||||
|
||||
template <>
|
||||
bool gemv_use_fast_path<float>(C10_UNUSED char trans, int64_t m, int64_t n, C10_UNUSED float alpha, int64_t lda, int64_t incx, C10_UNUSED float beta, int64_t incy) {
|
||||
bool gemv_use_fast_path<float>(
|
||||
[[maybe_unused]] char trans,
|
||||
int64_t m,
|
||||
int64_t n,
|
||||
[[maybe_unused]] float alpha,
|
||||
int64_t lda,
|
||||
int64_t incx,
|
||||
[[maybe_unused]] float beta,
|
||||
int64_t incy) {
|
||||
auto intmax = std::numeric_limits<int>::max();
|
||||
return (m <= intmax) && (n <= intmax) && (lda <= intmax) &&
|
||||
(incx > 0) && (incx <= intmax) && (incy > 0) && (incy <= intmax);
|
||||
}
|
||||
|
||||
template <>
|
||||
bool gemv_use_fast_path<double>(C10_UNUSED char trans, int64_t m, int64_t n, C10_UNUSED double alpha, int64_t lda, int64_t incx, C10_UNUSED double beta, int64_t incy) {
|
||||
return gemv_use_fast_path<float>(trans, m, n, (float)alpha, lda, incx, (float)beta, incy);
|
||||
bool gemv_use_fast_path<double>(
|
||||
[[maybe_unused]] char trans,
|
||||
int64_t m,
|
||||
int64_t n,
|
||||
[[maybe_unused]] double alpha,
|
||||
int64_t lda,
|
||||
int64_t incx,
|
||||
[[maybe_unused]] double beta,
|
||||
int64_t incy) {
|
||||
return gemv_use_fast_path<float>(
|
||||
trans, m, n, (float)alpha, lda, incx, (float)beta, incy);
|
||||
}
|
||||
|
||||
template <>
|
||||
@ -220,38 +257,40 @@ INSTANTIATE(int);
|
||||
INSTANTIATE(int64_t);
|
||||
#if defined(__aarch64__) && !defined(C10_MOBILE)
|
||||
template <>
|
||||
bool scal_use_fast_path<at::Half>(C10_UNUSED int64_t n, C10_UNUSED int64_t incx) {
|
||||
bool scal_use_fast_path<at::Half>(
|
||||
[[maybe_unused]] int64_t n,
|
||||
[[maybe_unused]] int64_t incx) {
|
||||
return false;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool gemv_use_fast_path<at::Half>(
|
||||
C10_UNUSED char trans,
|
||||
C10_UNUSED int64_t m,
|
||||
C10_UNUSED int64_t n,
|
||||
[[maybe_unused]] char trans,
|
||||
[[maybe_unused]] int64_t m,
|
||||
[[maybe_unused]] int64_t n,
|
||||
at::Half alpha,
|
||||
C10_UNUSED int64_t lda,
|
||||
C10_UNUSED int64_t incx,
|
||||
[[maybe_unused]] int64_t lda,
|
||||
[[maybe_unused]] int64_t incx,
|
||||
at::Half beta,
|
||||
C10_UNUSED int64_t incy) {
|
||||
[[maybe_unused]] int64_t incy) {
|
||||
return incx == 1 && c10::detail::fp16_from_bits(alpha.x) == 1.0f &&
|
||||
c10::detail::fp16_from_bits(beta.x) == 0.0f;
|
||||
c10::detail::fp16_from_bits(beta.x) == 0.0f;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool gemv_use_fast_path<at::BFloat16>(
|
||||
C10_UNUSED char trans,
|
||||
C10_UNUSED int64_t m,
|
||||
C10_UNUSED int64_t n,
|
||||
[[maybe_unused]] char trans,
|
||||
[[maybe_unused]] int64_t m,
|
||||
[[maybe_unused]] int64_t n,
|
||||
at::BFloat16 alpha,
|
||||
C10_UNUSED int64_t lda,
|
||||
C10_UNUSED int64_t incx,
|
||||
[[maybe_unused]] int64_t lda,
|
||||
[[maybe_unused]] int64_t incx,
|
||||
at::BFloat16 beta,
|
||||
C10_UNUSED int64_t incy) {
|
||||
return (trans == 'T' || trans == 't') && incx == 1 && alpha == 1.0 && beta == 0.0;
|
||||
[[maybe_unused]] int64_t incy) {
|
||||
return (trans == 'T' || trans == 't') && incx == 1 && alpha == 1.0 &&
|
||||
beta == 0.0;
|
||||
}
|
||||
|
||||
|
||||
#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
|
||||
static inline float16_t reduce(float16x4_t x) {
|
||||
auto sum = vpadd_f16(x, x);
|
||||
|
@ -34,7 +34,7 @@ Tensor make_feature_noise(const Tensor& input) {
|
||||
sizes.reserve(input.dim());
|
||||
sizes.push_back(input_sizes[0]);
|
||||
sizes.push_back(input_sizes[1]);
|
||||
for (C10_UNUSED const auto i : c10::irange(2, input.dim())) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(2, input.dim())) {
|
||||
sizes.push_back(1);
|
||||
}
|
||||
return input.new_empty_symint(sizes);
|
||||
|
@ -13,9 +13,11 @@ static void invalid_mask(const Tensor & self, int64_t idx, const Tensor & mask,
|
||||
" does not match the shape of the indexed tensor ", self.sizes(), " at index ", idx);
|
||||
}
|
||||
|
||||
|
||||
C10_UNUSED static std::vector<Tensor> expandTensors(const Tensor & self, IOptTensorListRef indices) {
|
||||
// If indices come in as ByteTensor or BoolTensor (masks), expand them into the equivalent indexing by LongTensors
|
||||
[[maybe_unused]] static std::vector<Tensor> expandTensors(
|
||||
const Tensor& self,
|
||||
IOptTensorListRef indices) {
|
||||
// If indices come in as ByteTensor or BoolTensor (masks), expand them into
|
||||
// the equivalent indexing by LongTensors
|
||||
std::vector<Tensor> result;
|
||||
for (const auto& index_opt : indices) {
|
||||
if (!index_opt.has_value()) {
|
||||
@ -48,7 +50,9 @@ C10_UNUSED static std::vector<Tensor> expandTensors(const Tensor & self, IOptTen
|
||||
return result;
|
||||
}
|
||||
|
||||
C10_UNUSED static void checkIndexTensorTypes(IOptTensorListRef indices, bool allow_int=false) {
|
||||
[[maybe_unused]] static void checkIndexTensorTypes(
|
||||
IOptTensorListRef indices,
|
||||
bool allow_int = false) {
|
||||
for (const auto& tensor : indices) {
|
||||
if (tensor.has_value() && tensor->defined()) {
|
||||
auto scalarType = tensor->scalar_type();
|
||||
@ -83,7 +87,7 @@ inline torch::List<std::optional<Tensor>> toListOfOptionalTensors(ArrayRef<IValu
|
||||
return result;
|
||||
}
|
||||
|
||||
C10_UNUSED static bool hasContiguousSubspace(TensorList tl) {
|
||||
[[maybe_unused]] static bool hasContiguousSubspace(TensorList tl) {
|
||||
// true if all the non-null tensors are adjacent
|
||||
auto isDefined = [](const Tensor & tensor){ return tensor.defined(); };
|
||||
auto isNull = [](const Tensor & tensor){ return !tensor.defined(); };
|
||||
@ -93,15 +97,15 @@ C10_UNUSED static bool hasContiguousSubspace(TensorList tl) {
|
||||
return it == stop.base();
|
||||
}
|
||||
|
||||
|
||||
// Transposes the tensor and indices together so that all the non-null indices
|
||||
// index the first k dimensions of the tensor. Returns the transposed tensor
|
||||
// and the reordered indices. For example:
|
||||
// transposeToFront(tensor, {nullptr, a, nullptr, b})
|
||||
// returns
|
||||
// tensor.permute([1, 3, 0, 2]), {a, b, nullptr, nullptr}
|
||||
C10_UNUSED static std::tuple<Tensor, std::vector<Tensor>>
|
||||
transposeToFront(const Tensor& self, TensorList indices) {
|
||||
[[maybe_unused]] static std::tuple<Tensor, std::vector<Tensor>> transposeToFront(
|
||||
const Tensor& self,
|
||||
TensorList indices) {
|
||||
std::vector<int64_t> dims;
|
||||
std::vector<Tensor> transposedIndices;
|
||||
dims.reserve(self.dim());
|
||||
|
@ -241,8 +241,9 @@ void batch_iterator_with_broadcasting(const Tensor& a, const Tensor& b, const fu
|
||||
auto* b_batch_idx_ptr = data[0];
|
||||
auto* a_batch_idx_ptr = data[1];
|
||||
|
||||
for (C10_UNUSED const auto elem : c10::irange(nelems)) {
|
||||
auto b_curr_linear_batch_idx = *reinterpret_cast<int64_t*>(b_batch_idx_ptr);
|
||||
for ([[maybe_unused]] const auto elem : c10::irange(nelems)) {
|
||||
auto b_curr_linear_batch_idx =
|
||||
*reinterpret_cast<int64_t*>(b_batch_idx_ptr);
|
||||
auto a_curr_linear_batch_idx = *reinterpret_cast<int64_t*>(a_batch_idx_ptr);
|
||||
|
||||
check_if_copy_needed_for_a(a_curr_linear_batch_idx);
|
||||
|
@ -76,7 +76,7 @@ static void multilabel_margin_loss_forward_out_frame(
|
||||
|
||||
accscalar_t sum = 0;
|
||||
|
||||
for (C10_UNUSED const auto t : c10::irange(nframe)) {
|
||||
for ([[maybe_unused]] const auto t : c10::irange(nframe)) {
|
||||
sum += multilabel_margin_loss_forward_inner_sum_cpu(
|
||||
input_data, target_data, is_target_data, dim);
|
||||
|
||||
@ -180,7 +180,7 @@ static void multilabel_margin_loss_backward_out_frame(
|
||||
reduction == Reduction::Mean ? 1. / (nframe * dim) : 1. / dim);
|
||||
|
||||
scalar_t* grad_input_row_data = grad_input.mutable_data_ptr<scalar_t>();
|
||||
for (C10_UNUSED const auto t : c10::irange(nframe)) {
|
||||
for ([[maybe_unused]] const auto t : c10::irange(nframe)) {
|
||||
for (const auto dt : c10::irange(dim)) {
|
||||
int64_t target_idx = target_data[dt];
|
||||
if (target_idx < 0) {
|
||||
|
@ -1204,22 +1204,30 @@ scalar_t calc_igamma(scalar_t a, scalar_t x) {
|
||||
}
|
||||
|
||||
template <>
|
||||
C10_UNUSED inline c10::BFloat16 calc_igamma<c10::BFloat16>(c10::BFloat16 a, c10::BFloat16 x) {
|
||||
[[maybe_unused]] inline c10::BFloat16 calc_igamma<c10::BFloat16>(
|
||||
c10::BFloat16 a,
|
||||
c10::BFloat16 x) {
|
||||
return calc_igamma<float>(float(a), float(x));
|
||||
}
|
||||
|
||||
template <>
|
||||
C10_UNUSED inline c10::Half calc_igamma<c10::Half>(c10::Half a, c10::Half x) {
|
||||
[[maybe_unused]] inline c10::Half calc_igamma<c10::Half>(
|
||||
c10::Half a,
|
||||
c10::Half x) {
|
||||
return calc_igamma<float>(float(a), float(x));
|
||||
}
|
||||
|
||||
template <>
|
||||
C10_UNUSED inline c10::BFloat16 calc_igammac<c10::BFloat16>(c10::BFloat16 a, c10::BFloat16 x) {
|
||||
[[maybe_unused]] inline c10::BFloat16 calc_igammac<c10::BFloat16>(
|
||||
c10::BFloat16 a,
|
||||
c10::BFloat16 x) {
|
||||
return calc_igammac<float>(float(a), float(x));
|
||||
}
|
||||
|
||||
template <>
|
||||
C10_UNUSED inline c10::Half calc_igammac<c10::Half>(c10::Half a, c10::Half x) {
|
||||
[[maybe_unused]] inline c10::Half calc_igammac<c10::Half>(
|
||||
c10::Half a,
|
||||
c10::Half x) {
|
||||
return calc_igammac<float>(float(a), float(x));
|
||||
}
|
||||
|
||||
@ -1231,7 +1239,7 @@ inline T abs_impl(T v) {
|
||||
}
|
||||
|
||||
template <>
|
||||
C10_UNUSED inline uint8_t abs_impl(uint8_t v) {
|
||||
[[maybe_unused]] inline uint8_t abs_impl(uint8_t v) {
|
||||
return v;
|
||||
}
|
||||
|
||||
|
@ -188,7 +188,7 @@ std::tuple<Tensor, Tensor> _pad_packed_sequence(const Tensor& data, const Tensor
|
||||
}
|
||||
int64_t dec = prev_batch_size - batch_size;
|
||||
if (dec > 0) {
|
||||
for (C10_UNUSED const auto j : c10::irange(dec)) {
|
||||
for ([[maybe_unused]] const auto j : c10::irange(dec)) {
|
||||
(*lengths--) = i;
|
||||
}
|
||||
}
|
||||
|
@ -1889,7 +1889,8 @@ static DEFINE_QUANTIZED_RNN_CELL_DYNAMIC(quantized_rnn_tanh_cell_dynamic, simple
|
||||
|
||||
namespace {
|
||||
|
||||
C10_UNUSED static auto ensure_linear_params_registered = register_linear_params();
|
||||
[[maybe_unused]] static auto ensure_linear_params_registered =
|
||||
register_linear_params();
|
||||
|
||||
static auto cell_params_base_registry =
|
||||
torch::selective_class_<CellParamsBase>("rnn", TORCH_SELECTIVE_CLASS("CellParamsBase"))
|
||||
|
@ -931,7 +931,7 @@ static inline Tensor diff_helper(const Tensor& self, int64_t n, int64_t dim) {
|
||||
bool is_kBool = (self.dtype() == at::kBool);
|
||||
n = n > self.sym_size(dim) ? self.sym_size(dim).guard_int(__FILE__, __LINE__) : n;
|
||||
|
||||
for (C10_UNUSED const auto i : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(n)) {
|
||||
if (is_kBool) {
|
||||
result = at::logical_xor(
|
||||
at::narrow_symint(result, dim, 1, out_len),
|
||||
@ -2255,7 +2255,7 @@ bool cpu_equal(const Tensor& self, const Tensor& other) {
|
||||
return;
|
||||
}
|
||||
char* self_data = data[0];
|
||||
for (C10_UNUSED const auto i : c10::irange(dim_size)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(dim_size)) {
|
||||
if (isnan_(c10::load<scalar_t>(self_data))) {
|
||||
result = false;
|
||||
return;
|
||||
@ -2282,7 +2282,7 @@ bool cpu_equal(const Tensor& self, const Tensor& other) {
|
||||
}
|
||||
char* self_data = data[0];
|
||||
char* other_data = data[1];
|
||||
for (C10_UNUSED const auto i : c10::irange(dim_size)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(dim_size)) {
|
||||
if (c10::load<scalar_t>(self_data) != c10::load<scalar_t>(other_data)) {
|
||||
result = false;
|
||||
return;
|
||||
|
@ -207,9 +207,13 @@ inline TensorIterator make_reduction(
|
||||
return TensorIterator::reduce_op(viewed_result, self.to(in_dtype));
|
||||
}
|
||||
|
||||
C10_UNUSED inline TensorIterator make_reduction(
|
||||
const char* name, Tensor& result, const Tensor& self,
|
||||
at::OptionalIntArrayRef dim, bool keepdim, ScalarType out_dtype) {
|
||||
[[maybe_unused]] inline TensorIterator make_reduction(
|
||||
const char* name,
|
||||
Tensor& result,
|
||||
const Tensor& self,
|
||||
at::OptionalIntArrayRef dim,
|
||||
bool keepdim,
|
||||
ScalarType out_dtype) {
|
||||
// special case for type promotion in mixed precision, improves computational
|
||||
// efficiency.
|
||||
// not generalize this to common mismatched input/output types to avoid cross
|
||||
@ -259,9 +263,14 @@ inline TensorIterator make_reduction(
|
||||
return TensorIterator::reduce_op(viewed_result1, viewed_result2, self.to(dtype1));
|
||||
}
|
||||
|
||||
C10_UNUSED inline TensorIterator make_reduction(
|
||||
const char* name, Tensor& result1, Tensor& result2, const Tensor& self,
|
||||
at::OptionalIntArrayRef dim, bool keepdim, ScalarType dtype) {
|
||||
[[maybe_unused]] inline TensorIterator make_reduction(
|
||||
const char* name,
|
||||
Tensor& result1,
|
||||
Tensor& result2,
|
||||
const Tensor& self,
|
||||
at::OptionalIntArrayRef dim,
|
||||
bool keepdim,
|
||||
ScalarType dtype) {
|
||||
return make_reduction(name, result1, result2, self, dim, keepdim, dtype, dtype);
|
||||
}
|
||||
|
||||
@ -313,9 +322,13 @@ inline std::vector<int64_t> get_zero_numel_tensor_size(
|
||||
// This function should be called when you are reducing a zero-numel tensor and want to
|
||||
// resize the output and return it. This function exists for resizing zero-numel
|
||||
// tensors when the size of the reduction dimension is non-zero.
|
||||
C10_UNUSED inline void zero_numel_tensor_resize(Tensor& result, Tensor& result_indices,
|
||||
const Tensor& self, const int64_t dim,
|
||||
const bool keepdim, const char *fn_name) {
|
||||
[[maybe_unused]] inline void zero_numel_tensor_resize(
|
||||
Tensor& result,
|
||||
Tensor& result_indices,
|
||||
const Tensor& self,
|
||||
const int64_t dim,
|
||||
const bool keepdim,
|
||||
const char* fn_name) {
|
||||
auto sizes = get_zero_numel_tensor_size(self, dim, keepdim, fn_name);
|
||||
at::native::resize_output(result, sizes);
|
||||
at::native::resize_output(result_indices, sizes);
|
||||
@ -349,11 +362,11 @@ inline ScalarType get_dtype_from_result(Tensor& result, std::optional<ScalarType
|
||||
|
||||
namespace at::meta {
|
||||
|
||||
C10_UNUSED inline DimVector get_reduction_shape(
|
||||
[[maybe_unused]] inline DimVector get_reduction_shape(
|
||||
const Tensor& self,
|
||||
IntArrayRef dims,
|
||||
bool keepdim,
|
||||
bool allow_empty_dims=false) {
|
||||
bool allow_empty_dims = false) {
|
||||
auto mask = native::make_dim_mask(dims, self.dim(), allow_empty_dims);
|
||||
return native::shape_from_dim_mask(self, mask, keepdim);
|
||||
}
|
||||
@ -434,7 +447,7 @@ inline TensorIterator make_reduction(
|
||||
return TensorIterator::reduce_op(viewed_result1, viewed_result2, self.to(dtype1));
|
||||
}
|
||||
|
||||
C10_UNUSED inline TensorIterator make_reduction_from_out_ty(
|
||||
[[maybe_unused]] inline TensorIterator make_reduction_from_out_ty(
|
||||
const Tensor& self,
|
||||
const Tensor& result,
|
||||
OptionalIntArrayRef opt_dims,
|
||||
|
@ -2409,7 +2409,7 @@ Tensor& nonzero_out_cpu(const Tensor& self, Tensor& result) {
|
||||
|
||||
for (const auto i : c10::irange(n2)) {
|
||||
const char* ptr = data[0] + i * strides[1];
|
||||
for (C10_UNUSED const auto j : c10::irange(n1)) {
|
||||
for ([[maybe_unused]] const auto j : c10::irange(n1)) {
|
||||
const auto& val = c10::load<scalar_t>(ptr);
|
||||
// If nonzero, write index
|
||||
if (val != scalar_t(0)) {
|
||||
|
@ -50,7 +50,8 @@ const Tensor& value){
|
||||
}
|
||||
}
|
||||
}
|
||||
for (C10_UNUSED const auto i : c10::irange(num_ind, self.ndimension())) {
|
||||
for ([[maybe_unused]] const auto i :
|
||||
c10::irange(num_ind, self.ndimension())) {
|
||||
mask = mask.unsqueeze(-1);
|
||||
}
|
||||
return std::make_tuple(true, mask);
|
||||
|
@ -1945,7 +1945,7 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in
|
||||
at::parallel_for(0, index_len, at::internal::GRAIN_SIZE, [&](int64_t start, int64_t end) {
|
||||
const auto* src = ptr_index + start;
|
||||
auto* dst = ptr_nneg_index + start;
|
||||
for (C10_UNUSED const auto _ : c10::irange(start, end)) {
|
||||
for ([[maybe_unused]] const auto _ : c10::irange(start, end)) {
|
||||
auto idx = *src++;
|
||||
if (idx < -size || idx >= size) {
|
||||
// Mark self and dim as used if code is compiled with STRIP_ERROR_MESSAGES
|
||||
@ -2051,36 +2051,42 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in
|
||||
const auto* ptr_sorted_start = ptr_sorted;
|
||||
const auto* ptr_sorted_end = ptr_sorted + sorted_len;
|
||||
|
||||
at::parallel_for(0, n_threads_src, 1, [&](int64_t tid, C10_UNUSED int64_t _) {
|
||||
const auto start = tid * chunk_size_src;
|
||||
const auto end = std::min(start + chunk_size_src, src_len);
|
||||
auto* ptr_tid_src_int_idx = src_int_idx.select(0, tid).data_ptr<int64_t>();
|
||||
auto* ptr_tid_sorted_int_idx = sorted_int_idx.select(0, tid).data_ptr<int64_t>();
|
||||
auto* ptr_tid_int_counts = int_counts.select(0, tid).data_ptr<int64_t>();
|
||||
const auto* ptr_src = src.const_data_ptr<int64_t>() + start;
|
||||
at::parallel_for(
|
||||
0, n_threads_src, 1, [&](int64_t tid, [[maybe_unused]] int64_t _) {
|
||||
const auto start = tid * chunk_size_src;
|
||||
const auto end = std::min(start + chunk_size_src, src_len);
|
||||
auto* ptr_tid_src_int_idx =
|
||||
src_int_idx.select(0, tid).data_ptr<int64_t>();
|
||||
auto* ptr_tid_sorted_int_idx =
|
||||
sorted_int_idx.select(0, tid).data_ptr<int64_t>();
|
||||
auto* ptr_tid_int_counts =
|
||||
int_counts.select(0, tid).data_ptr<int64_t>();
|
||||
const auto* ptr_src = src.const_data_ptr<int64_t>() + start;
|
||||
|
||||
for (const auto i : c10::irange(start, end)) {
|
||||
const auto src_val = *ptr_src++;
|
||||
const auto src_val_lb = std::lower_bound(ptr_sorted_start, ptr_sorted_end, src_val);
|
||||
// We cannot just use *src_val_lb != src_val because when
|
||||
// src_val_lb == ptr_sorted_end, dereferencing past-the-end value
|
||||
// is not well-defined.
|
||||
if (src_val_lb == ptr_sorted_end || *src_val_lb != src_val) {
|
||||
++ptr_tid_src_int_idx;
|
||||
++ptr_tid_sorted_int_idx;
|
||||
++ptr_tid_int_counts;
|
||||
continue;
|
||||
for (const auto i : c10::irange(start, end)) {
|
||||
const auto src_val = *ptr_src++;
|
||||
const auto src_val_lb =
|
||||
std::lower_bound(ptr_sorted_start, ptr_sorted_end, src_val);
|
||||
// We cannot just use *src_val_lb != src_val because when
|
||||
// src_val_lb == ptr_sorted_end, dereferencing past-the-end
|
||||
// value is not well-defined.
|
||||
if (src_val_lb == ptr_sorted_end || *src_val_lb != src_val) {
|
||||
++ptr_tid_src_int_idx;
|
||||
++ptr_tid_sorted_int_idx;
|
||||
++ptr_tid_int_counts;
|
||||
continue;
|
||||
}
|
||||
const auto src_val_ub =
|
||||
std::upper_bound(ptr_sorted_start, ptr_sorted_end, src_val);
|
||||
|
||||
const int64_t count = src_val_ub - src_val_lb;
|
||||
const int64_t j = src_val_lb - ptr_sorted_start;
|
||||
|
||||
*ptr_tid_src_int_idx++ = i;
|
||||
*ptr_tid_sorted_int_idx++ = j;
|
||||
*ptr_tid_int_counts++ = count;
|
||||
}
|
||||
const auto src_val_ub = std::upper_bound(ptr_sorted_start, ptr_sorted_end, src_val);
|
||||
|
||||
const int64_t count = src_val_ub - src_val_lb;
|
||||
const int64_t j = src_val_lb - ptr_sorted_start;
|
||||
|
||||
*ptr_tid_src_int_idx++ = i;
|
||||
*ptr_tid_sorted_int_idx++ = j;
|
||||
*ptr_tid_int_counts++ = count;
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
const auto compressed_int_counts = int_counts.sum(-1);
|
||||
@ -2111,29 +2117,35 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in
|
||||
|
||||
const auto thread_offsets = compressed_int_counts.cumsum(0).sub_(compressed_int_counts);
|
||||
const auto* ptr_sorted_idx = sorted_idx.const_data_ptr<int64_t>();
|
||||
at::parallel_for(0, n_threads_src, 1, [&](int64_t tid, C10_UNUSED int64_t _) {
|
||||
const auto start = tid * chunk_size_src;
|
||||
const auto end = std::min(start + chunk_size_src, src_len);
|
||||
const auto tid_offset = thread_offsets.const_data_ptr<int64_t>()[tid];
|
||||
const auto* ptr_tid_src_int_idx = src_int_idx.select(0, tid).const_data_ptr<int64_t>();
|
||||
const auto* ptr_tid_sorted_int_idx = sorted_int_idx.select(0, tid).const_data_ptr<int64_t>();
|
||||
const auto* ptr_tid_int_counts = int_counts.select(0, tid).const_data_ptr<int64_t>();
|
||||
auto* ptr_tid_selected_sorted = ptr_selected_sorted + tid_offset;
|
||||
auto* ptr_tid_selected_src = ptr_selected_src + tid_offset;
|
||||
at::parallel_for(
|
||||
0, n_threads_src, 1, [&](int64_t tid, [[maybe_unused]] int64_t _) {
|
||||
const auto start = tid * chunk_size_src;
|
||||
const auto end = std::min(start + chunk_size_src, src_len);
|
||||
const auto tid_offset =
|
||||
thread_offsets.const_data_ptr<int64_t>()[tid];
|
||||
const auto* ptr_tid_src_int_idx =
|
||||
src_int_idx.select(0, tid).const_data_ptr<int64_t>();
|
||||
const auto* ptr_tid_sorted_int_idx =
|
||||
sorted_int_idx.select(0, tid).const_data_ptr<int64_t>();
|
||||
const auto* ptr_tid_int_counts =
|
||||
int_counts.select(0, tid).const_data_ptr<int64_t>();
|
||||
auto* ptr_tid_selected_sorted = ptr_selected_sorted + tid_offset;
|
||||
auto* ptr_tid_selected_src = ptr_selected_src + tid_offset;
|
||||
|
||||
for (C10_UNUSED const auto _ : c10::irange(start, end)) {
|
||||
const auto count = *ptr_tid_int_counts++;
|
||||
const auto i = *ptr_tid_src_int_idx++;
|
||||
const auto j = *ptr_tid_sorted_int_idx++;
|
||||
if (!count) continue;
|
||||
for ([[maybe_unused]] const auto _ : c10::irange(start, end)) {
|
||||
const auto count = *ptr_tid_int_counts++;
|
||||
const auto i = *ptr_tid_src_int_idx++;
|
||||
const auto j = *ptr_tid_sorted_int_idx++;
|
||||
if (!count)
|
||||
continue;
|
||||
|
||||
std::fill_n(ptr_tid_selected_src, count, i);
|
||||
std::copy_n(ptr_sorted_idx + j, count, ptr_tid_selected_sorted);
|
||||
std::fill_n(ptr_tid_selected_src, count, i);
|
||||
std::copy_n(ptr_sorted_idx + j, count, ptr_tid_selected_sorted);
|
||||
|
||||
ptr_tid_selected_sorted += count;
|
||||
ptr_tid_selected_src += count;
|
||||
}
|
||||
});
|
||||
ptr_tid_selected_sorted += count;
|
||||
ptr_tid_selected_src += count;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
return search_in_dim_indices
|
||||
@ -2192,7 +2204,7 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in
|
||||
else {
|
||||
auto* ptr_counts = counts.data_ptr<int64_t>();
|
||||
const auto* ptr_vals = t.const_data_ptr<int64_t>();
|
||||
for (C10_UNUSED const auto _ : c10::irange(t.numel())) {
|
||||
for ([[maybe_unused]] const auto _ : c10::irange(t.numel())) {
|
||||
++ptr_counts[*ptr_vals++];
|
||||
}
|
||||
}
|
||||
@ -2212,14 +2224,19 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in
|
||||
const auto run_in_parallel = (n_threads == 1);
|
||||
|
||||
auto counts_per_thread = at::zeros({n_threads, size}, idx.options());
|
||||
at::parallel_for(0, n_threads, 1, [&](int64_t tid, C10_UNUSED int64_t _) {
|
||||
const auto start = tid * chunk_size;
|
||||
const auto end = std::min(start + chunk_size, idx_len);
|
||||
const auto tid_idx = idx.slice(0, start, end);
|
||||
auto tid_counts = counts_per_thread.select(0, tid);
|
||||
get_counts(tid_counts, tid_idx, /*bins=*/size,
|
||||
/*is_sorted=*/is_sorted, /*run_in_parallel=*/run_in_parallel);
|
||||
});
|
||||
at::parallel_for(
|
||||
0, n_threads, 1, [&](int64_t tid, [[maybe_unused]] int64_t _) {
|
||||
const auto start = tid * chunk_size;
|
||||
const auto end = std::min(start + chunk_size, idx_len);
|
||||
const auto tid_idx = idx.slice(0, start, end);
|
||||
auto tid_counts = counts_per_thread.select(0, tid);
|
||||
get_counts(
|
||||
tid_counts,
|
||||
tid_idx,
|
||||
/*bins=*/size,
|
||||
/*is_sorted=*/is_sorted,
|
||||
/*run_in_parallel=*/run_in_parallel);
|
||||
});
|
||||
|
||||
return counts_per_thread;
|
||||
};
|
||||
@ -2310,32 +2327,38 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in
|
||||
1, std::min<int64_t>((src_len + grain_size - 1) / grain_size, at::get_num_threads())
|
||||
);
|
||||
const auto chunk_size = (src_len + n_threads_src - 1) / n_threads_src;
|
||||
at::parallel_for(0, n_threads_src, 1, [&](int64_t tid, C10_UNUSED int64_t _) {
|
||||
const auto start = tid * chunk_size;
|
||||
const auto end = std::min(start + chunk_size, src_len);
|
||||
auto* ptr_src_tid = ptr_src + start;
|
||||
const auto* ptr_src_counts_per_thread
|
||||
= src_counts_per_thread.select(0, tid).const_data_ptr<int64_t>();
|
||||
const auto* ptr_src_offset_counts_per_thread
|
||||
= src_offset_counts_per_thread.select(0, tid).const_data_ptr<int64_t>();
|
||||
auto tid_counts = at::zeros({size}, src.options());
|
||||
auto* ptr_tid_counts = tid_counts.data_ptr<int64_t>();
|
||||
at::parallel_for(
|
||||
0, n_threads_src, 1, [&](int64_t tid, [[maybe_unused]] int64_t _) {
|
||||
const auto start = tid * chunk_size;
|
||||
const auto end = std::min(start + chunk_size, src_len);
|
||||
auto* ptr_src_tid = ptr_src + start;
|
||||
const auto* ptr_src_counts_per_thread =
|
||||
src_counts_per_thread.select(0, tid)
|
||||
.const_data_ptr<int64_t>();
|
||||
const auto* ptr_src_offset_counts_per_thread =
|
||||
src_offset_counts_per_thread.select(0, tid)
|
||||
.const_data_ptr<int64_t>();
|
||||
auto tid_counts = at::zeros({size}, src.options());
|
||||
auto* ptr_tid_counts = tid_counts.data_ptr<int64_t>();
|
||||
|
||||
for (const auto i : c10::irange(start, end)) {
|
||||
const auto idx_val = *ptr_src_tid++;
|
||||
// skip idx value if not in the intersection
|
||||
if (!ptr_intersection_counts[idx_val]) continue;
|
||||
const auto idx_val_offset
|
||||
= ptr_src_intersection_offsets[idx_val]
|
||||
- ptr_src_intersection_counts[idx_val];
|
||||
const auto idx_val_tid_offset
|
||||
= ptr_src_offset_counts_per_thread[idx_val]
|
||||
- ptr_src_counts_per_thread[idx_val];
|
||||
auto& idx_val_local_tid_count = ptr_tid_counts[idx_val];
|
||||
ptr_src_idx[idx_val_offset + idx_val_tid_offset + idx_val_local_tid_count] = i;
|
||||
++idx_val_local_tid_count;
|
||||
}
|
||||
});
|
||||
for (const auto i : c10::irange(start, end)) {
|
||||
const auto idx_val = *ptr_src_tid++;
|
||||
// skip idx value if not in the intersection
|
||||
if (!ptr_intersection_counts[idx_val])
|
||||
continue;
|
||||
const auto idx_val_offset =
|
||||
ptr_src_intersection_offsets[idx_val] -
|
||||
ptr_src_intersection_counts[idx_val];
|
||||
const auto idx_val_tid_offset =
|
||||
ptr_src_offset_counts_per_thread[idx_val] -
|
||||
ptr_src_counts_per_thread[idx_val];
|
||||
auto& idx_val_local_tid_count = ptr_tid_counts[idx_val];
|
||||
ptr_src_idx
|
||||
[idx_val_offset + idx_val_tid_offset +
|
||||
idx_val_local_tid_count] = i;
|
||||
++idx_val_local_tid_count;
|
||||
}
|
||||
});
|
||||
|
||||
const auto src_idx_offsets = src_intersection_offsets.sub_(src_intersection_counts);
|
||||
|
||||
@ -2369,26 +2392,28 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in
|
||||
1, std::min<int64_t>((idx_len + grain_size - 1) / grain_size, at::get_num_threads())
|
||||
);
|
||||
const auto chunk_size = (idx_len + n_threads_idx - 1) / n_threads_idx;
|
||||
at::parallel_for(0, n_threads_idx, 1, [&](int64_t tid, C10_UNUSED int64_t _) {
|
||||
const auto start = tid * chunk_size;
|
||||
const auto end = std::min(start + chunk_size, idx_len);
|
||||
const auto tid_offset = ptr_thread_offset[tid];
|
||||
const auto* ptr_idx_tid = ptr_idx + start;
|
||||
auto* ptr_idx_selected_tid = ptr_idx_selected + tid_offset;
|
||||
auto* ptr_src_selected_tid = ptr_src_selected + tid_offset;
|
||||
at::parallel_for(
|
||||
0, n_threads_idx, 1, [&](int64_t tid, [[maybe_unused]] int64_t _) {
|
||||
const auto start = tid * chunk_size;
|
||||
const auto end = std::min(start + chunk_size, idx_len);
|
||||
const auto tid_offset = ptr_thread_offset[tid];
|
||||
const auto* ptr_idx_tid = ptr_idx + start;
|
||||
auto* ptr_idx_selected_tid = ptr_idx_selected + tid_offset;
|
||||
auto* ptr_src_selected_tid = ptr_src_selected + tid_offset;
|
||||
|
||||
for (const auto i : c10::irange(start, end)) {
|
||||
const auto idx_val = *ptr_idx_tid++;
|
||||
// skip if idx_val is not in the intersection
|
||||
if (!ptr_intersection_counts[idx_val]) continue;
|
||||
const auto count = ptr_src_counts[idx_val];
|
||||
const auto j = ptr_src_idx_offsets[idx_val];
|
||||
std::fill_n(ptr_idx_selected_tid, count, i);
|
||||
std::copy_n(ptr_src_idx + j, count, ptr_src_selected_tid);
|
||||
ptr_idx_selected_tid += count;
|
||||
ptr_src_selected_tid += count;
|
||||
}
|
||||
});
|
||||
for (const auto i : c10::irange(start, end)) {
|
||||
const auto idx_val = *ptr_idx_tid++;
|
||||
// skip if idx_val is not in the intersection
|
||||
if (!ptr_intersection_counts[idx_val])
|
||||
continue;
|
||||
const auto count = ptr_src_counts[idx_val];
|
||||
const auto j = ptr_src_idx_offsets[idx_val];
|
||||
std::fill_n(ptr_idx_selected_tid, count, i);
|
||||
std::copy_n(ptr_src_idx + j, count, ptr_src_selected_tid);
|
||||
ptr_idx_selected_tid += count;
|
||||
ptr_src_selected_tid += count;
|
||||
}
|
||||
});
|
||||
|
||||
return std::make_tuple(idx_selected, src_selected);
|
||||
}();
|
||||
|
@ -29,13 +29,12 @@ namespace {
|
||||
// grad_in does not mean that it is a gradient wrt to input,
|
||||
// grad_in/grad_out is just an input/output of unfold_backward kernel.
|
||||
|
||||
C10_UNUSED static TensorIterator _make_unfold_backward_iter_over_grad_out(
|
||||
Tensor& grad_out,
|
||||
const Tensor& grad_in,
|
||||
int64_t dim,
|
||||
int64_t size,
|
||||
int64_t step
|
||||
) {
|
||||
[[maybe_unused]] static TensorIterator _make_unfold_backward_iter_over_grad_out(
|
||||
Tensor& grad_out,
|
||||
const Tensor& grad_in,
|
||||
int64_t dim,
|
||||
int64_t size,
|
||||
int64_t step) {
|
||||
dim = maybe_wrap_dim(dim, grad_out.dim());
|
||||
// last dim stores the folds
|
||||
|
||||
@ -106,7 +105,6 @@ C10_UNUSED static TensorIterator _make_unfold_backward_iter_over_grad_out(
|
||||
|
||||
return iter;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} // namespace at::native
|
||||
|
@ -103,7 +103,9 @@ DECLARE_DISPATCH(upsampling_bicubic2d, upsample_bicubic2d_kernel);
|
||||
DECLARE_DISPATCH(_upsampling_bicubic2d_aa, _upsample_bicubic2d_aa_kernel);
|
||||
DECLARE_DISPATCH(_upsampling_bicubic2d_aa, _upsample_bicubic2d_aa_backward_kernel);
|
||||
|
||||
C10_UNUSED inline std::array<int64_t, 3> upsample_1d_common_check(IntArrayRef input_size, IntArrayRef output_size) {
|
||||
[[maybe_unused]] inline std::array<int64_t, 3> upsample_1d_common_check(
|
||||
IntArrayRef input_size,
|
||||
IntArrayRef output_size) {
|
||||
TORCH_CHECK(
|
||||
output_size.size() == 1,
|
||||
"It is expected output_size equals to 1, but got size ",
|
||||
@ -131,7 +133,9 @@ C10_UNUSED inline std::array<int64_t, 3> upsample_1d_common_check(IntArrayRef in
|
||||
return {nbatch, channels, output_width};
|
||||
}
|
||||
|
||||
C10_UNUSED inline std::array<int64_t, 4> upsample_2d_common_check(IntArrayRef input_size, IntArrayRef output_size) {
|
||||
[[maybe_unused]] inline std::array<int64_t, 4> upsample_2d_common_check(
|
||||
IntArrayRef input_size,
|
||||
IntArrayRef output_size) {
|
||||
TORCH_CHECK(
|
||||
output_size.size() == 2,
|
||||
"It is expected output_size equals to 2, but got size ",
|
||||
@ -167,8 +171,9 @@ C10_UNUSED inline std::array<int64_t, 4> upsample_2d_common_check(IntArrayRef in
|
||||
return {nbatch, channels, output_height, output_width};
|
||||
}
|
||||
|
||||
C10_UNUSED inline
|
||||
std::array<int64_t, 5> upsample_3d_common_check(IntArrayRef input_size, IntArrayRef output_size) {
|
||||
[[maybe_unused]] inline std::array<int64_t, 5> upsample_3d_common_check(
|
||||
IntArrayRef input_size,
|
||||
IntArrayRef output_size) {
|
||||
TORCH_CHECK(
|
||||
output_size.size() == 3,
|
||||
"It is expected output_size equals to 3, but got size ",
|
||||
|
@ -40,7 +40,6 @@ int register_linear_params() {
|
||||
}
|
||||
|
||||
namespace {
|
||||
C10_UNUSED static auto linear_params = register_linear_params();
|
||||
} // namespace
|
||||
|
||||
[[maybe_unused]] static auto linear_params = register_linear_params();
|
||||
} // namespace
|
||||
}} // namespace ao::sparse
|
||||
|
@ -82,7 +82,7 @@ static void reduced_float_copy_kernel(TensorIteratorBase &iter, bool requires_ne
|
||||
std::copy_n(base, 2, data.data());
|
||||
const int64_t *outer_strides = &strides[2];
|
||||
|
||||
for (C10_UNUSED const auto it : c10::irange(size1)) {
|
||||
for ([[maybe_unused]] const auto it : c10::irange(size1)) {
|
||||
Vecd dst_s;
|
||||
if (strides_in[0] == 0) {
|
||||
dst_s = Vecd(dest_t(*((scalar_t*)data[1])));
|
||||
@ -151,7 +151,7 @@ static void reduced_float_copy_kernel(TensorIteratorBase &iter, bool requires_ne
|
||||
std::copy_n(base, 2, data.data());
|
||||
const int64_t *outer_strides = &strides[2];
|
||||
|
||||
for (C10_UNUSED const auto it : c10::irange(size1)) {
|
||||
for ([[maybe_unused]] const auto it : c10::irange(size1)) {
|
||||
Vecd dst_s;
|
||||
if (strides_in[0] == 0) {
|
||||
dst_s = Vecd(dest_t(*((source_t*)data[1])));
|
||||
|
@ -395,7 +395,7 @@ struct Dist {
|
||||
const scalar_t * t1_end = t1 + l1_size;
|
||||
const scalar_t * t2_end = t2 + l2_size;
|
||||
|
||||
for (C10_UNUSED const auto l : c10::irange(d)) {
|
||||
for ([[maybe_unused]] const auto l : c10::irange(d)) {
|
||||
for (; t1 != t1_end; t1 += m, res += m) {
|
||||
const Vec vec_t1 = Vec::loadu(t1, count);
|
||||
Vec res_vec = Vec::loadu(res, count);
|
||||
|
@ -473,7 +473,7 @@ void cpu_flash_attention(
|
||||
scalar_t* transpose_buffer_ptr = transpose_buffer.get();
|
||||
std::unique_ptr<scalar_t[]> v_copy_buffer = std::make_unique<scalar_t[]>(ekvSplitSize * packb_size);
|
||||
scalar_t* v_copy_buffer_ptr = v_copy_buffer.get();
|
||||
for (C10_UNUSED auto z : c10::irange(begin, end)) {
|
||||
for ([[maybe_unused]] auto z : c10::irange(begin, end)) {
|
||||
n = l * kvSplitSize;
|
||||
int64_t kvBlockSize = std::min(kvSplitSize, kvSize - n);
|
||||
int64_t ekvBlockSize = kvBlockSize % 2 == 0 ? kvBlockSize : kvBlockSize + 1;
|
||||
@ -566,7 +566,7 @@ void cpu_flash_attention(
|
||||
? query_padding_ptr + ompIdx * qSplitSize * eheadSize
|
||||
: nullptr;
|
||||
|
||||
for (C10_UNUSED auto z : c10::irange(begin, end)) {
|
||||
for ([[maybe_unused]] auto z : c10::irange(begin, end)) {
|
||||
int64_t m = k * qSplitSize;
|
||||
int64_t qBlockSize = std::min(qSplitSize, qSize - m);
|
||||
// Initialize max and sum
|
||||
@ -931,7 +931,7 @@ void cpu_flash_attention_backward(
|
||||
|
||||
at::Tensor dsum = at::empty({qSplitSize}, query.options().dtype(accumulate_dtype));
|
||||
accum_t* dsum_data = dsum.data_ptr<accum_t>();
|
||||
for (C10_UNUSED auto z : c10::irange(begin, end)) {
|
||||
for ([[maybe_unused]] auto z : c10::irange(begin, end)) {
|
||||
// rowsum of grad_out * out
|
||||
for (int64_t m = 0; m < qSize; m += qSplitSize) {
|
||||
int64_t qBlockSize = std::min(qSplitSize, qSize - m);
|
||||
|
@ -30,7 +30,7 @@ void _compute_linear_combination_cpu_kernel(
|
||||
auto* RESTRICT in_ptr = data[1];
|
||||
auto* RESTRICT coeff_ptr = data[2];
|
||||
|
||||
for (C10_UNUSED const auto elem : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto elem : c10::irange(n)) {
|
||||
auto* RESTRICT out_data = reinterpret_cast<scalar_t*>(out_ptr);
|
||||
auto* RESTRICT in_data = reinterpret_cast<scalar_t*>(in_ptr);
|
||||
using primitive_t = typename scalar_value_type<scalar_t>::type;
|
||||
|
@ -78,7 +78,7 @@ void cpu_take_put_kernel(
|
||||
auto loop = [&](char** data, const int64_t* strides, int64_t n) {
|
||||
auto* iterated_data_bytes = data[0];
|
||||
auto* index_data_bytes = data[1];
|
||||
for (C10_UNUSED const auto elem : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto elem : c10::irange(n)) {
|
||||
auto idx = *reinterpret_cast<int64_t*>(index_data_bytes);
|
||||
auto& iterated = *reinterpret_cast<scalar_t*>(iterated_data_bytes);
|
||||
|
||||
@ -203,7 +203,7 @@ void index_fill_kernel(
|
||||
auto handle_nonzero_idx_stride = [&](char** data, const int64_t* strides, int64_t n) {
|
||||
auto* self_data_bytes = data[0];
|
||||
auto* index_data_bytes = data[1];
|
||||
for (C10_UNUSED const auto elem : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto elem : c10::irange(n)) {
|
||||
auto* self_data = reinterpret_cast<scalar_t*>(self_data_bytes);
|
||||
auto idx = *reinterpret_cast<int64_t*>(index_data_bytes);
|
||||
TORCH_CHECK_INDEX(idx >= -self_dim_size && idx < self_dim_size,
|
||||
@ -229,7 +229,7 @@ void index_fill_kernel(
|
||||
if (idx < 0) {
|
||||
idx += self_dim_size;
|
||||
}
|
||||
for (C10_UNUSED const auto elem: c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto elem : c10::irange(n)) {
|
||||
auto* self_data = reinterpret_cast<scalar_t*>(self_data_bytes);
|
||||
|
||||
self_data[idx * self_dim_stride] = fill_val;
|
||||
@ -262,7 +262,7 @@ void index_copy_kernel(
|
||||
auto* self_data_bytes = data[0];
|
||||
auto* index_data_bytes = data[1];
|
||||
auto* source_data_bytes = data[2];
|
||||
for (C10_UNUSED const auto elem : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto elem : c10::irange(n)) {
|
||||
auto* self_data = reinterpret_cast<scalar_t*>(self_data_bytes);
|
||||
auto idx = *reinterpret_cast<int64_t*>(index_data_bytes);
|
||||
auto* source_data = reinterpret_cast<scalar_t*>(source_data_bytes);
|
||||
@ -285,7 +285,7 @@ void index_copy_kernel(
|
||||
TORCH_CHECK_INDEX(idx >= 0 && idx < self_dim_size,
|
||||
"index_copy_(): index ", idx, " is out of bounds for dimension ",
|
||||
dim, " with size ", self_dim_size);
|
||||
for (C10_UNUSED const auto elem : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto elem : c10::irange(n)) {
|
||||
auto* self_data = reinterpret_cast<scalar_t*>(self_data_bytes);
|
||||
auto* source_data = reinterpret_cast<scalar_t*>(source_data_bytes);
|
||||
|
||||
@ -474,8 +474,7 @@ void cpu_hflip_vec(at::TensorIterator& iter) {
constexpr auto stride = sizeof(scalar_t);
TORCH_INTERNAL_ASSERT(stride == -strides[0] && stride == strides[1]);
for (C10_UNUSED const auto j : c10::irange(size1)) {
for ([[maybe_unused]] const auto j : c10::irange(size1)) {
// vectorized loop with negative stride for output
char** C10_RESTRICT data_ = data_arr.data();
int64_t n = size0;

@ -543,8 +542,7 @@ void cpu_vflip_memcpy(at::TensorIterator& iter) {
TORCH_INTERNAL_ASSERT(strides[0] == strides[1]);
const int64_t stride = strides[0];
for (C10_UNUSED const auto j : c10::irange(size1)) {
for ([[maybe_unused]] const auto j : c10::irange(size1)) {
char** C10_RESTRICT data_ = data_arr.data();
int64_t n = size0;
@ -271,7 +271,7 @@ struct VectorizedLoop2d {
const int64_t *outer_strides = &strides[ntensors];
if (is_contiguous<traits>(strides)) {
for (C10_UNUSED const auto i : c10::irange(size1)) {
for ([[maybe_unused]] const auto i : c10::irange(size1)) {
vectorized_loop(data.data(), size0, 0, op, vop);
advance(data, outer_strides);
}

@ -279,12 +279,12 @@ struct VectorizedLoop2d {
using Indices = std::make_index_sequence<traits::arity>;
unroll_contiguous_scalar_checks<traits>(strides, Indices{}, [&](size_t idx) {
if (idx) {
for (C10_UNUSED const auto i : c10::irange(size1)) {
for ([[maybe_unused]] const auto i : c10::irange(size1)) {
vectorized_loop(data.data(), size0, idx, op, vop);
advance(data, outer_strides);
}
} else {
for (C10_UNUSED const auto i : c10::irange(size1)) {
for ([[maybe_unused]] const auto i : c10::irange(size1)) {
basic_loop(data.data(), strides, 0, size0, op);
advance(data, outer_strides);
}
@ -70,7 +70,7 @@ inline void vectorized_reduction(char** data, int64_t n, int64_t stride,
template <typename F>
inline void UNARY_OUTER_LOOP(char* data[2], const int64_t strides[2], int64_t n, F f) {
for (C10_UNUSED const auto j : c10::irange(n)) {
for ([[maybe_unused]] const auto j : c10::irange(n)) {
f();
data[0] += strides[0];
data[1] += strides[1];
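(Editor's note: the hunks above and below all make the same mechanical substitution, so a single illustrative sketch is given here rather than after every file. The example is not part of the commit; it uses a plain std::array in place of PyTorch's c10::irange helper, and the names `iterations` and `calls` are invented for the illustration only.)

// Standalone illustration (assumed names, not PyTorch code): the loop body
// must run once per element, but the element variable itself is never read.
// Without the attribute, -Wunused-variable-style diagnostics may flag `i`;
// [[maybe_unused]] (C++17) documents the intent without a project macro
// such as C10_UNUSED.
#include <array>
#include <cstdio>

int main() {
  std::array<int, 4> iterations{0, 1, 2, 3};
  int calls = 0;
  for ([[maybe_unused]] const auto i : iterations) {
    ++calls;  // run the body once per element; `i` is deliberately ignored
  }
  std::printf("ran %d times\n", calls);
  return 0;
}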
@ -62,11 +62,12 @@ static inline void cpu_cum_base_kernel(const Tensor& result,
|
||||
auto* result_data_bytes = data[0];
|
||||
const auto* self_data_bytes = data[1];
|
||||
|
||||
for (C10_UNUSED const auto i : c10::irange(n)) {
|
||||
f(
|
||||
(scalar_t*)result_data_bytes, result_dim_stride,
|
||||
(scalar_t*)self_data_bytes, self_dim_stride, init_val
|
||||
);
|
||||
for ([[maybe_unused]] const auto i : c10::irange(n)) {
|
||||
f((scalar_t*)result_data_bytes,
|
||||
result_dim_stride,
|
||||
(scalar_t*)self_data_bytes,
|
||||
self_dim_stride,
|
||||
init_val);
|
||||
result_data_bytes += strides[0];
|
||||
self_data_bytes += strides[1];
|
||||
}
|
||||
|
@ -215,7 +215,7 @@ struct cpu_scatter_gather_base_kernel {
|
||||
// vs dim-TensorIterator loop order depending on
|
||||
// whether dim is the last dimension
|
||||
if (dim== buffer.dim() - 1) {
|
||||
for (C10_UNUSED const auto nelem : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto nelem : c10::irange(n)) {
|
||||
// dim loop is a separate code block
|
||||
// for better performance
|
||||
loop_func.template operator()<scalar_t, func_t>(
|
||||
@ -232,7 +232,7 @@ struct cpu_scatter_gather_base_kernel {
|
||||
for (const auto i : c10::irange(index_dim_size)) {
|
||||
auto* self_data = self_data_bytes;
|
||||
auto* index_data = (char*)((int64_t*)index_data_bytes + i * index_dim_stride);
|
||||
for (C10_UNUSED const auto nelem : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto nelem : c10::irange(n)) {
|
||||
int64_t idx_dim = *(int64_t*)index_data;
|
||||
// we are not putting idx_dim in the error message because it disables
|
||||
// loop optimization in clang-7
|
||||
@ -306,7 +306,7 @@ struct cpu_scatter_gather_base_kernel {
|
||||
// vs dim-TensorIterator loop order depending on
|
||||
// whether dim is the last dimension
|
||||
if (dim== buffer.dim() - 1) {
|
||||
for (C10_UNUSED const auto nelem : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto nelem : c10::irange(n)) {
|
||||
// dim loop is a separate code block
|
||||
// for better performance
|
||||
loop_func.template operator()<scalar_t, func_t>(
|
||||
@ -327,7 +327,7 @@ struct cpu_scatter_gather_base_kernel {
|
||||
auto* self_data = self_data_bytes;
|
||||
auto* index_data = (char*)((int64_t*)index_data_bytes + i * index_dim_stride);
|
||||
auto* src_data = src_data_bytes;
|
||||
for (C10_UNUSED const auto nelem : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto nelem : c10::irange(n)) {
|
||||
int64_t idx_dim = *(int64_t*)index_data;
|
||||
// we are not putting idx_dim in the error message because it disables
|
||||
// loop optimization in clang-7
|
||||
@ -402,7 +402,7 @@ struct cpu_scatter_gather_base_kernel {
|
||||
// vs dim-TensorIterator loop order depending on
|
||||
// whether dim is the last dimension
|
||||
if (dim== buffer.dim() - 1) {
|
||||
for (C10_UNUSED const auto nelem : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto nelem : c10::irange(n)) {
|
||||
// dim loop is a separate code block
|
||||
// for better performance
|
||||
loop_func.template operator()<scalar_t, ReduceMean>(
|
||||
@ -423,7 +423,7 @@ struct cpu_scatter_gather_base_kernel {
|
||||
auto* self_data = self_data_bytes;
|
||||
auto* index_data = (char*)((int64_t*)index_data_bytes + i * index_dim_stride);
|
||||
auto* src_data = src_data_bytes;
|
||||
for (C10_UNUSED const auto nelem : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto nelem : c10::irange(n)) {
|
||||
int64_t idx_dim = *(int64_t*)index_data;
|
||||
// we are not putting idx_dim in the error message because it disables
|
||||
// loop optimization in clang-7
|
||||
@ -497,7 +497,7 @@ struct cpu_scatter_gather_base_kernel {
|
||||
// vs dim-TensorIterator loop order depending on
|
||||
// whether dim is the last dimension
|
||||
if (dim== buffer.dim() - 1) {
|
||||
for (C10_UNUSED const auto nelem : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto nelem : c10::irange(n)) {
|
||||
// dim loop is a separate code block
|
||||
// for better performance
|
||||
loop_func.template operator()<scalar_t, ReduceMaximum>(
|
||||
@ -518,7 +518,7 @@ struct cpu_scatter_gather_base_kernel {
|
||||
auto* self_data = self_data_bytes;
|
||||
auto* index_data = (char*)((int64_t*)index_data_bytes + i * index_dim_stride);
|
||||
auto* src_data = src_data_bytes;
|
||||
for (C10_UNUSED const auto nelem : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto nelem : c10::irange(n)) {
|
||||
int64_t idx_dim = *(int64_t*)index_data;
|
||||
// we are not putting idx_dim in the error message because it disables
|
||||
// loop optimization in clang-7
|
||||
@ -593,7 +593,7 @@ struct cpu_scatter_gather_base_kernel {
|
||||
// vs dim-TensorIterator loop order depending on
|
||||
// whether dim is the last dimension
|
||||
if (dim== buffer.dim() - 1) {
|
||||
for (C10_UNUSED const auto nelem : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto nelem : c10::irange(n)) {
|
||||
// dim loop is a separate code block
|
||||
// for better performance
|
||||
loop_func.template operator()<scalar_t, ReduceMinimum>(
|
||||
@ -614,7 +614,7 @@ struct cpu_scatter_gather_base_kernel {
|
||||
auto* self_data = self_data_bytes;
|
||||
auto* index_data = (char*)((int64_t*)index_data_bytes + i * index_dim_stride);
|
||||
auto* src_data = src_data_bytes;
|
||||
for (C10_UNUSED const auto nelem : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto nelem : c10::irange(n)) {
|
||||
int64_t idx_dim = *(int64_t*)index_data;
|
||||
// we are not putting idx_dim in the error message because it disables
|
||||
// loop optimization in clang-7
|
||||
|
@ -53,14 +53,12 @@ void _dim_apply(
|
||||
return;
|
||||
}
|
||||
|
||||
for (C10_UNUSED const auto i : c10::irange(n)) {
|
||||
f(
|
||||
reinterpret_cast<scalar_t*>(values_data_bytes),
|
||||
for ([[maybe_unused]] const auto i : c10::irange(n)) {
|
||||
f(reinterpret_cast<scalar_t*>(values_data_bytes),
|
||||
values_dim_stride,
|
||||
reinterpret_cast<int64_t*>(indices_data_bytes),
|
||||
indices_dim_stride,
|
||||
dim_size
|
||||
);
|
||||
dim_size);
|
||||
|
||||
values_data_bytes += strides[0];
|
||||
indices_data_bytes += strides[1];
|
||||
|
@ -83,7 +83,7 @@ static inline void compare_base_kernel(const Tensor& result1, const Tensor& resu
|
||||
auto* result1_data_bytes = data[0];
|
||||
auto* result2_data_bytes = data[1];
|
||||
const auto* self_data_bytes = data[2];
|
||||
for (C10_UNUSED const auto i : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(n)) {
|
||||
f((scalar_t*)result1_data_bytes,
|
||||
(scalar_t_2*)result2_data_bytes,
|
||||
(scalar_t*)self_data_bytes,
|
||||
@ -253,7 +253,7 @@ static void mode_kernel_impl(
|
||||
|
||||
std::vector<std::pair<scalar_t, int64_t>> elements(self_dim_size);
|
||||
|
||||
for (C10_UNUSED const auto k : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto k : c10::irange(n)) {
|
||||
scalar_t* values_data = (scalar_t*)values_data_bytes;
|
||||
int64_t* indices_data = (int64_t*)indices_data_bytes;
|
||||
const scalar_t* self_data = (scalar_t*)self_data_bytes;
|
||||
|
@ -353,8 +353,9 @@ static void unfolded2d_copy_channels_last(
|
||||
int64_t x = 0;
|
||||
data_index_init(start, y, output_height, x, output_width);
|
||||
|
||||
for (const auto k C10_UNUSED: c10::irange(start, end)) {
|
||||
scalar_t* dst = finput_data + y * output_width * kH * kW * n_input_plane + x * kH * kW * n_input_plane;
|
||||
for (const auto k [[maybe_unused]] : c10::irange(start, end)) {
|
||||
scalar_t* dst = finput_data + y * output_width * kH * kW * n_input_plane +
|
||||
x * kH * kW * n_input_plane;
|
||||
const scalar_t* src = input_data;
|
||||
|
||||
if (padW > 0 || padH > 0) {
|
||||
|
@ -76,7 +76,7 @@ void _unfold_backward_internal_kernel(
|
||||
auto* RESTRICT grad_in_ptr = data[1];
|
||||
auto* RESTRICT idx_dim_ptr = data[2];
|
||||
|
||||
for (C10_UNUSED const auto elem : c10::irange(nelems)) {
|
||||
for ([[maybe_unused]] const auto elem : c10::irange(nelems)) {
|
||||
auto* RESTRICT grad_out_data = reinterpret_cast<scalar_t*>(grad_out_ptr);
|
||||
auto* RESTRICT grad_in_data = reinterpret_cast<scalar_t*>(grad_in_ptr);
|
||||
|
||||
|
@ -733,8 +733,9 @@ struct HelperInterpBase {
|
||||
auto new_shape = std::vector<int64_t>(ndims, 1);
|
||||
new_shape[reshape_dim] = output_size;
|
||||
|
||||
for (C10_UNUSED const auto j : c10::irange(interp_size)) {
|
||||
output.emplace_back(empty(new_shape, CPU(c10::CppTypeToScalarType<int64_t>())));
|
||||
for ([[maybe_unused]] const auto j : c10::irange(interp_size)) {
|
||||
output.emplace_back(
|
||||
empty(new_shape, CPU(c10::CppTypeToScalarType<int64_t>())));
|
||||
output.emplace_back(empty(new_shape, CPU(output_type)));
|
||||
}
|
||||
}
|
||||
@ -1047,8 +1048,9 @@ struct HelperInterpNearest : public HelperInterpBase {
|
||||
auto new_shape = std::vector<int64_t>(ndims, 1);
|
||||
new_shape[reshape_dim] = output_size;
|
||||
|
||||
for (C10_UNUSED const auto j : c10::irange(interp_size)) {
|
||||
output.emplace_back(empty(new_shape, CPU(c10::CppTypeToScalarType<int64_t>())));
|
||||
for ([[maybe_unused]] const auto j : c10::irange(interp_size)) {
|
||||
output.emplace_back(
|
||||
empty(new_shape, CPU(c10::CppTypeToScalarType<int64_t>())));
|
||||
// Defines weights for consistency, but not used
|
||||
output.emplace_back(at::ones(new_shape, CPU(output_type)));
|
||||
}
|
||||
|
@ -102,7 +102,7 @@ void pack_rgb(
|
||||
|
||||
TORCH_INTERNAL_ASSERT(unpacked_increment == 3 || unpacked_increment == 4);
|
||||
|
||||
for (C10_UNUSED const auto i : c10::irange(num_pixels)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(num_pixels)) {
|
||||
for (const auto j : c10::irange(num_channels)) {
|
||||
packed[j * packed_stride] = unpacked[j];
|
||||
}
|
||||
|
@ -723,7 +723,7 @@ void int4pack_mm_kernel_(
|
||||
int mb{0}, nb{0};
|
||||
data_index_init(begin, mb, MB, nb, NB);
|
||||
|
||||
for (C10_UNUSED const auto i : c10::irange(begin, end)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(begin, end)) {
|
||||
int mb_start = mb * BLOCK_M;
|
||||
int mb_size = std::min(BLOCK_M, M - mb_start);
|
||||
int nb_start = nb * BLOCK_N;
|
||||
|
@ -177,7 +177,7 @@ struct KthValueLauncher {
|
||||
cuda::detail::TensorInfo<scalar_t, index_t> values_info,
|
||||
int collapse_values_dim,
|
||||
cuda::detail::TensorInfo<int64_t, index_t> indices_info,
|
||||
C10_UNUSED int collapse_indices_dim,
|
||||
[[maybe_unused]] int collapse_indices_dim,
|
||||
cuda::detail::TensorInfo<const scalar_t, index_t> self_info,
|
||||
int collapse_self_dim,
|
||||
int64_t num_slices,
|
||||
@ -212,9 +212,9 @@ struct MedianLauncher {
|
||||
template <typename scalar_t, typename index_t, int all_dims>
|
||||
inline void launch(
|
||||
cuda::detail::TensorInfo<scalar_t, index_t> values_info,
|
||||
C10_UNUSED int collapse_values_dim,
|
||||
[[maybe_unused]] int collapse_values_dim,
|
||||
cuda::detail::TensorInfo<int64_t, index_t> indices_info,
|
||||
C10_UNUSED int collapse_indices_dim,
|
||||
[[maybe_unused]] int collapse_indices_dim,
|
||||
cuda::detail::TensorInfo<const scalar_t, index_t> self_info,
|
||||
int collapse_self_dim,
|
||||
int64_t num_slices,
|
||||
|
@ -1374,7 +1374,7 @@ std::tuple<Tensor, Tensor, Tensor> layer_norm_cuda(
|
||||
for (const auto idx: c10::irange(axis)) {
|
||||
stat_shape.push_back(input_shape[idx]);
|
||||
}
|
||||
for (C10_UNUSED const auto idx: c10::irange(axis, input.dim())) {
|
||||
for ([[maybe_unused]] const auto idx : c10::irange(axis, input.dim())) {
|
||||
stat_shape.push_back(1);
|
||||
}
|
||||
|
||||
|
@ -74,7 +74,7 @@ cudnn_frontend::Tensor getTensorDescriptorWithTypeVirtual(
|
||||
// Ubuntu-22+ if `libnvrtc.so` is not found on the system, which strictly
|
||||
// speaking is not necessary for usecases below See
|
||||
// https://github.com/pytorch/pytorch/issues/97041
|
||||
C10_UNUSED static auto cudnn_cnn_infer_handler = [] {
|
||||
[[maybe_unused]] static auto cudnn_cnn_infer_handler = [] {
|
||||
void* handle = dlopen("libcudnn_cnn_infer.so.8", RTLD_LAZY);
|
||||
char* err = dlerror();
|
||||
if (!handle) {
|
||||
|
@ -51,7 +51,7 @@ static void layer_norm_with_mean_rstd_out(
|
||||
for (const auto idx : c10::irange(axis)) {
|
||||
stat_shape.emplace_back(input_shape[idx]);
|
||||
}
|
||||
for (C10_UNUSED const auto idx : c10::irange(axis, input.dim())) {
|
||||
for ([[maybe_unused]] const auto idx : c10::irange(axis, input.dim())) {
|
||||
stat_shape.emplace_back(1);
|
||||
}
|
||||
|
||||
@ -256,7 +256,7 @@ std::tuple<Tensor, Tensor, Tensor> math_native_layer_norm(
|
||||
for (const auto idx : c10::irange(axis)) {
|
||||
stat_shape.push_back(input_shape[idx]);
|
||||
}
|
||||
for (C10_UNUSED const auto idx : c10::irange(axis, input.dim())) {
|
||||
for ([[maybe_unused]] const auto idx : c10::irange(axis, input.dim())) {
|
||||
stat_shape.push_back(1);
|
||||
}
|
||||
mean = mean.view(stat_shape);
|
||||
|
@ -163,7 +163,7 @@ static void linalg_lu_factor_out_mps_impl(const Tensor& A, bool pivot, Tensor& L
|
||||
|
||||
status_tensors.reserve(batchSize);
|
||||
pivots_list.reserve(batchSize);
|
||||
for (C10_UNUSED const auto i : c10::irange(batchSize)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(batchSize)) {
|
||||
status_tensors.push_back(at::zeros(1, kInt, std::nullopt, kMPS, std::nullopt));
|
||||
pivots_list.push_back(at::zeros(numPivots, kInt, std::nullopt, kMPS, std::nullopt));
|
||||
}
|
||||
|
@ -922,7 +922,7 @@ std::tuple<Tensor, Tensor, Tensor> layer_norm_mps(const Tensor& input,
|
||||
for (const auto idx : c10::irange(axis)) {
|
||||
stat_shape.push_back(input_shape[idx]);
|
||||
}
|
||||
for (C10_UNUSED auto idx : c10::irange(axis, input.dim())) {
|
||||
for ([[maybe_unused]] auto idx : c10::irange(axis, input.dim())) {
|
||||
stat_shape.push_back(1);
|
||||
}
|
||||
mean = mean.view(stat_shape);
|
||||
|
@ -706,7 +706,7 @@ static ViewCachedGraph* createViewGraph(const Tensor& self,
|
||||
// Self is the input tensor we are creating view of
|
||||
newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, inputType, getMPSShape(base_shape));
|
||||
newCachedGraph->storageOffsetTensor = mpsGraphRankedPlaceHolder(mpsGraph, MPSDataTypeInt32, @[ @1 ]);
|
||||
for (C10_UNUSED const auto i : c10::irange(size.size())) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(size.size())) {
|
||||
newCachedGraph->strideTensors.push_back(mpsGraphRankedPlaceHolder(mpsGraph, MPSDataTypeInt32, @[ @1 ]));
|
||||
}
|
||||
if (needsScatter) {
|
||||
|
@ -444,7 +444,7 @@ Tensor qnnpack_avg_pool2d(
|
||||
} // namespace at
|
||||
|
||||
namespace {
|
||||
C10_UNUSED std::vector<float> generate_requantization_scales(
|
||||
[[maybe_unused]] std::vector<float> generate_requantization_scales(
|
||||
const at::Tensor& weight_scales,
|
||||
const float input_scale,
|
||||
const float output_scale,
|
||||
@ -468,11 +468,11 @@ C10_UNUSED std::vector<float> generate_requantization_scales(
|
||||
return requant_scales;
|
||||
}
|
||||
|
||||
C10_UNUSED std::pair<std::vector<uint8_t>, at::Tensor> make_zero_points_and_scales_tensor(
|
||||
[[maybe_unused]] std::pair<std::vector<uint8_t>, at::Tensor>
|
||||
make_zero_points_and_scales_tensor(
|
||||
const at::Tensor& weight_contig,
|
||||
bool transpose = false,
|
||||
uint32_t groups = 1
|
||||
) {
|
||||
uint32_t groups = 1) {
|
||||
const int out_ch_idx = transpose ? 1 : 0;
|
||||
const auto num_output_channels = weight_contig.size(out_ch_idx) * (transpose ? groups : 1);
|
||||
// Add 8 to account for bufferring needed by QNNPACK.
|
||||
|
@ -186,8 +186,9 @@ inline TensorQuantizationParams ChooseQuantizationParams(
|
||||
|
||||
// This function helps to convert the Conv1D dimensions usable by the Conv2d op.
|
||||
constexpr int64_t kConv1dSqueezeDim = 0;
|
||||
C10_UNUSED static torch::List<int64_t> MakeArgForConv1d(const torch::List<int64_t>& arg,
|
||||
int64_t base_value) {
|
||||
[[maybe_unused]] static torch::List<int64_t> MakeArgForConv1d(
|
||||
const torch::List<int64_t>& arg,
|
||||
int64_t base_value) {
|
||||
TORCH_CHECK(!arg.empty(), "Argument must have elements.");
|
||||
torch::List<int64_t> result({arg.get(0), base_value});
|
||||
if (arg.size() == 1) {
|
||||
|
@ -71,7 +71,7 @@ static void upsample_nearest3d_out_frame(
|
||||
const auto* pos1 = &i_p[d1 * input_height * input_width + h1 * input_width + w1];
|
||||
auto* pos2 = &o_p[d2 * output_height * output_width + h2 * output_width + w2];
|
||||
|
||||
for (C10_UNUSED const auto c : c10::irange(channels)) {
|
||||
for ([[maybe_unused]] const auto c : c10::irange(channels)) {
|
||||
pos2[0] = pos1[0];
|
||||
pos1 += input_depth * input_height * input_width;
|
||||
pos2 += output_depth * output_height * output_width;
|
||||
|
@ -143,7 +143,7 @@ ConvParamsSerializationTypeV3 parse_conv_serialized_state(c10::IValue v) {
|
||||
config_vals.push_back(dilation[0].item<int16_t>());
|
||||
}
|
||||
// output_padding does not exist in v1, so we fill in a default value
|
||||
for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(kSpatialDim)) {
|
||||
config_vals.push_back(0);
|
||||
}
|
||||
config_vals.push_back(groups[0].item<int16_t>());
|
||||
@ -294,21 +294,24 @@ c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> deserialize_conv(
|
||||
torch::List<int64_t> stride, padding, output_padding, dilation;
|
||||
// skip kSpatialDim
|
||||
int idx = 1;
|
||||
for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(kSpatialDim)) {
|
||||
stride.emplace_back(config_vals.at(idx));
|
||||
idx++;
|
||||
}
|
||||
for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(kSpatialDim)) {
|
||||
padding.emplace_back(config_vals.at(idx));
|
||||
idx++;
|
||||
}
|
||||
for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(kSpatialDim)) {
|
||||
dilation.emplace_back(config_vals.at(idx));
|
||||
idx++;
|
||||
}
|
||||
for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) {
|
||||
TORCH_INTERNAL_ASSERT(idx < static_cast<int64_t>(config_vals.size()),
|
||||
"Unexpected index = ", idx, " for config_vals of size ",
|
||||
for ([[maybe_unused]] const auto i : c10::irange(kSpatialDim)) {
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
idx < static_cast<int64_t>(config_vals.size()),
|
||||
"Unexpected index = ",
|
||||
idx,
|
||||
" for config_vals of size ",
|
||||
config_vals.size());
|
||||
output_padding.emplace_back(config_vals.at(idx));
|
||||
idx++;
|
||||
|
@ -554,9 +554,9 @@ int register_embedding_params() {
|
||||
|
||||
namespace {
|
||||
|
||||
C10_UNUSED static auto conv2d_params = register_conv_params<2>();
|
||||
C10_UNUSED static auto conv3d_params = register_conv_params<3>();
|
||||
C10_UNUSED static auto linear_params = register_linear_params();
|
||||
C10_UNUSED static auto embedding_params = register_embedding_params();
|
||||
[[maybe_unused]] static auto conv2d_params = register_conv_params<2>();
|
||||
[[maybe_unused]] static auto conv3d_params = register_conv_params<3>();
|
||||
[[maybe_unused]] static auto linear_params = register_linear_params();
|
||||
[[maybe_unused]] static auto embedding_params = register_embedding_params();
|
||||
|
||||
} // namespace
|
||||
|
@ -2293,7 +2293,7 @@ void qupsample_bilinear2d_nhwc_kernel(
|
||||
int64_t b{0}, h2{0}, w2{0};
|
||||
data_index_init(begin, b, nbatch, h2, output_height, w2, output_width);
|
||||
|
||||
for (C10_UNUSED const auto i : c10::irange(begin, end)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(begin, end)) {
|
||||
auto* i_p = reinterpret_cast<typename scalar_t::underlying*>(
|
||||
idata + b * input_height * input_width * channels);
|
||||
auto* o_p = reinterpret_cast<typename scalar_t::underlying*>(
|
||||
@ -3818,8 +3818,8 @@ void quantize_tensor_per_channel_impl<c10::quint8>(
|
||||
// channels_last contig.
|
||||
// If axis = 0 and channels_last contig, implementation for channels
|
||||
// first (NCHW) works.
|
||||
for (C10_UNUSED const auto b : c10::irange(batches)) {
|
||||
for (C10_UNUSED const auto e : c10::irange(elements_per_channel)) {
|
||||
for ([[maybe_unused]] const auto b : c10::irange(batches)) {
|
||||
for ([[maybe_unused]] const auto e : c10::irange(elements_per_channel)) {
|
||||
uint32_t c = 0;
|
||||
while (c + 8 < channels) {
|
||||
const int32x4_t voffset0123 = vld1q_s32(&zero_points_int32t[c]);
|
||||
@ -3853,7 +3853,7 @@ void quantize_tensor_per_channel_impl<c10::quint8>(
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (C10_UNUSED const auto b : c10::irange(batches)) {
|
||||
for ([[maybe_unused]] const auto b : c10::irange(batches)) {
|
||||
for (const auto c : c10::irange(channels)) {
|
||||
uint32_t e = 0;
|
||||
const int32x4_t voffset = vdupq_n_s32(zero_points_int32t[c]);
|
||||
@ -3900,8 +3900,8 @@ void quantize_tensor_per_channel_impl<c10::quint8>(
|
||||
// channels_last contig.
|
||||
// If axis = 0 and channels_last contig, implementation for channels
|
||||
// first (NCHW) works.
|
||||
for (C10_UNUSED const auto b : c10::irange(batches)) {
|
||||
for (C10_UNUSED const auto e : c10::irange(elements_per_channel)) {
|
||||
for ([[maybe_unused]] const auto b : c10::irange(batches)) {
|
||||
for ([[maybe_unused]] const auto e : c10::irange(elements_per_channel)) {
|
||||
uint32_t c = 0;
|
||||
while (c + 8 < channels) {
|
||||
const int16x8_t vzero_point = vld1q_s16(&zero_points_int16t[c]);
|
||||
@ -3931,8 +3931,8 @@ void quantize_tensor_per_channel_impl<c10::quint8>(
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (C10_UNUSED const auto b : c10::irange(batches)) {
|
||||
for (C10_UNUSED const auto c : c10::irange(channels)) {
|
||||
for ([[maybe_unused]] const auto b : c10::irange(batches)) {
|
||||
for ([[maybe_unused]] const auto c : c10::irange(channels)) {
|
||||
uint32_t e = 0;
|
||||
const int16x8_t vzero_point = vdupq_n_s16(zero_points_int16t[c]);
|
||||
const float32x4_t vinv_scale = vdupq_n_f32(inv_scales[c]);
|
||||
|
@ -634,7 +634,7 @@ class QConvPackWeightInt8 final {
|
||||
int64_t groups) {
|
||||
torch::List<int64_t> output_padding;
|
||||
output_padding.reserve(kSpatialDim);
|
||||
for (C10_UNUSED const auto idx : c10::irange(kSpatialDim)) {
|
||||
for ([[maybe_unused]] const auto idx : c10::irange(kSpatialDim)) {
|
||||
output_padding.push_back((int64_t)0);
|
||||
}
|
||||
return _run(weight, bias, stride, padding, output_padding, dilation, groups,
|
||||
|
@ -139,7 +139,7 @@ class QConvPackWeightInt8Cudnn final {
|
||||
int64_t groups) {
|
||||
torch::List<int64_t> output_padding;
|
||||
output_padding.reserve(kSpatialDim);
|
||||
for (C10_UNUSED const auto idx : c10::irange(kSpatialDim)) {
|
||||
for ([[maybe_unused]] const auto idx : c10::irange(kSpatialDim)) {
|
||||
output_padding.push_back((int64_t)0);
|
||||
}
|
||||
return _run(weight, bias, stride, padding, output_padding, dilation, groups,
|
||||
|
@ -159,7 +159,7 @@ void _csr_matmult(
|
||||
}
|
||||
}
|
||||
|
||||
for (C10_UNUSED const auto jj : c10::irange(length)) {
|
||||
for ([[maybe_unused]] const auto jj : c10::irange(length)) {
|
||||
// NOTE: the linked list that encodes col indices
|
||||
// is not guaranteed to be sorted.
|
||||
Cj[nnz] = head;
|
||||
|
@ -11,7 +11,7 @@
|
||||
|
||||
// Compiler Macros
|
||||
|
||||
// Suppress an unused variable. Copied from C10_UNUSED
|
||||
// Suppress an unused variable. Copied from [[maybe_unused]]
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
#define VK_UNUSED __pragma(warning(suppress : 4100 4101))
|
||||
#else
|
||||
|
@ -31,7 +31,7 @@ bool initialize() {
|
||||
return is_initialized_;
|
||||
}
|
||||
|
||||
C10_UNUSED bool deinitialize() {
|
||||
[[maybe_unused]] bool deinitialize() {
|
||||
using namespace internal;
|
||||
|
||||
// This implementation allows for retries.
|
||||
|
@ -89,7 +89,7 @@ void TestAdd(DeprecatedTypeProperties& type) {
|
||||
void TestZeros(DeprecatedTypeProperties& type) {
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
Tensor a = zeros({1024, 1024}, type);
|
||||
for (C10_UNUSED const auto i : c10::irange(1, 1000)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(1, 1000)) {
|
||||
a = zeros({128, 128}, type);
|
||||
}
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
@ -107,7 +107,7 @@ void TestLoadsOfAdds(DeprecatedTypeProperties& type) {
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
Tensor d = ones({3, 4}, type);
|
||||
Tensor r = zeros({3, 4}, type);
|
||||
for (C10_UNUSED const auto i : c10::irange(1000)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(1000)) {
|
||||
add_out(r, r, d);
|
||||
}
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
@ -124,7 +124,7 @@ void TestLoadOfAddsWithCopy(DeprecatedTypeProperties& type) {
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
Tensor d = ones({3, 4}, type);
|
||||
Tensor r = zeros({3, 4}, type);
|
||||
for (C10_UNUSED const auto i : c10::irange(1000)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(1000)) {
|
||||
r = add(r, d);
|
||||
}
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
|
@ -161,7 +161,7 @@ TEST(CPUGeneratorImpl, TestPhiloxEngineOffset1) {
|
||||
// So if you want to skip 8 values, offset would
|
||||
// be 2, since 2*4=8.
|
||||
at::Philox4_32 engine2(123, 1, 2);
|
||||
for (C10_UNUSED const auto i : c10::irange(8)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(8)) {
|
||||
// Note: instead of using the engine() call 8 times
|
||||
// we could have achieved the same functionality by
|
||||
// calling the incr() function twice.
|
||||
@ -222,14 +222,14 @@ TEST(CPUGeneratorImpl, TestMT19937EngineReproducibility) {
|
||||
// test with zero seed
|
||||
at::mt19937 engine1(0);
|
||||
std::mt19937 engine2(0);
|
||||
for (C10_UNUSED const auto i : c10::irange(10000)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(10000)) {
|
||||
ASSERT_EQ(engine1(), engine2());
|
||||
}
|
||||
|
||||
// test with large seed
|
||||
engine1 = at::mt19937(2147483647);
|
||||
engine2 = std::mt19937(2147483647);
|
||||
for (C10_UNUSED const auto i : c10::irange(10000)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(10000)) {
|
||||
ASSERT_EQ(engine1(), engine2());
|
||||
}
|
||||
|
||||
@ -238,10 +238,9 @@ TEST(CPUGeneratorImpl, TestMT19937EngineReproducibility) {
|
||||
auto seed = rd();
|
||||
engine1 = at::mt19937(seed);
|
||||
engine2 = std::mt19937(seed);
|
||||
for (C10_UNUSED const auto i : c10::irange(10000)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(10000)) {
|
||||
ASSERT_EQ(engine1(), engine2());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
TEST(CPUGeneratorImpl, TestPhiloxEngineReproducibilityRandN) {
|
||||
|
@ -170,7 +170,7 @@ TEST(VmapTest, TestBatchedTensorActualDim) {
|
||||
{
|
||||
// ActualDim on kVmapMaxTensorDims sized underlying tensor
|
||||
auto tensor = ones({});
|
||||
for (C10_UNUSED const auto i : c10::irange(kVmapMaxTensorDims)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(kVmapMaxTensorDims)) {
|
||||
tensor = tensor.unsqueeze(0);
|
||||
}
|
||||
ASSERT_EQ(tensor.dim(), kVmapMaxTensorDims);
|
||||
|
@ -14,7 +14,7 @@ void test(int given_num_threads) {
|
||||
ASSERT_TRUE(given_num_threads >= 0);
|
||||
ASSERT_EQ(at::get_num_threads(), given_num_threads);
|
||||
auto t_sum = t.sum();
|
||||
for (C10_UNUSED const auto i : c10::irange(1000)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(1000)) {
|
||||
t_sum = t_sum + t.sum();
|
||||
}
|
||||
}
|
||||
|
@ -1122,24 +1122,28 @@ namespace {
|
||||
float minv = static_cast<float>(static_cast<double>(min_val) * 2.0);
|
||||
float maxv = static_cast<float>(static_cast<double>(max_val) * 2.0);
|
||||
ValueGen<float> gen(minv, maxv, seed.add(2));
|
||||
for (C10_UNUSED const auto i : c10::irange(trials)) {
|
||||
float scale = generator_sc.get();
|
||||
float inv_scale = 1.0f / static_cast<float>(scale);
|
||||
auto zero_point_val = generator_zp.get();
|
||||
int index = 0;
|
||||
for (int j = 0; j < vec::float_num_vecs(); j++) {
|
||||
//generate vals
|
||||
for (auto& v : unit_float_vec) {
|
||||
v = gen.get();
|
||||
expected_qint_vals[index] = quantize_val<underlying>(scale, zero_point_val, v);
|
||||
index++;
|
||||
}
|
||||
float_ret[j] = vfloat::loadu(unit_float_vec);
|
||||
for ([[maybe_unused]] const auto i : c10::irange(trials)) {
|
||||
float scale = generator_sc.get();
|
||||
float inv_scale = 1.0f / static_cast<float>(scale);
|
||||
auto zero_point_val = generator_zp.get();
|
||||
int index = 0;
|
||||
for (int j = 0; j < vec::float_num_vecs(); j++) {
|
||||
// generate vals
|
||||
for (auto& v : unit_float_vec) {
|
||||
v = gen.get();
|
||||
expected_qint_vals[index] =
|
||||
quantize_val<underlying>(scale, zero_point_val, v);
|
||||
index++;
|
||||
}
|
||||
auto expected = vec::loadu(expected_qint_vals);
|
||||
auto actual = vec::quantize(float_ret, scale, zero_point_val, inv_scale);
|
||||
if (AssertVectorized<vec>(NAME_INFO(Quantize), expected, actual).check()) return;
|
||||
} //trials;
|
||||
float_ret[j] = vfloat::loadu(unit_float_vec);
|
||||
}
|
||||
auto expected = vec::loadu(expected_qint_vals);
|
||||
auto actual =
|
||||
vec::quantize(float_ret, scale, zero_point_val, inv_scale);
|
||||
if (AssertVectorized<vec>(NAME_INFO(Quantize), expected, actual)
|
||||
.check())
|
||||
return;
|
||||
} // trials;
|
||||
}
|
||||
#if (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER)
|
||||
// This test case aims to test at::vec::QuantizeAvx512 and
|
||||
@ -1168,7 +1172,7 @@ namespace {
|
||||
float minv = static_cast<float>(static_cast<double>(min_val) * 2.0);
|
||||
float maxv = static_cast<float>(static_cast<double>(max_val) * 2.0);
|
||||
ValueGen<float> gen(minv, maxv, seed.add(2));
|
||||
for (C10_UNUSED const auto i : c10::irange(trials)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(trials)) {
|
||||
float scale = generator_sc.get();
|
||||
float inv_scale = 1.0f / static_cast<float>(scale);
|
||||
auto zero_point_val = generator_zp.get();
|
||||
@ -1227,35 +1231,36 @@ namespace {
|
||||
ValueGen<int> generator(min_val, max_val, seed.add(1));
|
||||
//scale
|
||||
ValueGen<float> generator_sc(1.f, 15.f, seed.add(2));
|
||||
for (C10_UNUSED const auto i : c10::irange(trials)) {
|
||||
float scale = generator_sc.get();
|
||||
int32_t zero_point_val = generator.get();
|
||||
float scale_zp_premul = -(scale * zero_point_val);
|
||||
vfloat vf_scale = vfloat{ scale };
|
||||
vfloat vf_zp = vfloat{ static_cast<float>(zero_point_val) };
|
||||
vfloat vf_scale_zp = vfloat{ scale_zp_premul };
|
||||
//generate vals
|
||||
for (auto& x : qint_vals) {
|
||||
x = generator.get();
|
||||
for ([[maybe_unused]] const auto i : c10::irange(trials)) {
|
||||
float scale = generator_sc.get();
|
||||
int32_t zero_point_val = generator.get();
|
||||
float scale_zp_premul = -(scale * zero_point_val);
|
||||
vfloat vf_scale = vfloat{scale};
|
||||
vfloat vf_zp = vfloat{static_cast<float>(zero_point_val)};
|
||||
vfloat vf_scale_zp = vfloat{scale_zp_premul};
|
||||
// generate vals
|
||||
for (auto& x : qint_vals) {
|
||||
x = generator.get();
|
||||
}
|
||||
// get expected
|
||||
int index = 0;
|
||||
auto qint_vec = vec::loadu(qint_vals);
|
||||
auto actual_float_ret =
|
||||
qint_vec.dequantize(vf_scale, vf_zp, vf_scale_zp);
|
||||
for (int j = 0; j < vec::float_num_vecs(); j++) {
|
||||
for (auto& v : unit_exp_vals) {
|
||||
v = dequantize_val(scale, zero_point_val, qint_vals[index]);
|
||||
index++;
|
||||
}
|
||||
//get expected
|
||||
int index = 0;
|
||||
auto qint_vec = vec::loadu(qint_vals);
|
||||
auto actual_float_ret = qint_vec.dequantize(vf_scale, vf_zp, vf_scale_zp);
|
||||
for (int j = 0; j < vec::float_num_vecs(); j++) {
|
||||
for (auto& v : unit_exp_vals) {
|
||||
v = dequantize_val(scale, zero_point_val, qint_vals[index]);
|
||||
index++;
|
||||
}
|
||||
vfloat expected = vfloat::loadu(unit_exp_vals);
|
||||
const auto& actual = actual_float_ret[j];
|
||||
vfloat expected = vfloat::loadu(unit_exp_vals);
|
||||
const auto& actual = actual_float_ret[j];
|
||||
#if defined(CHECK_DEQUANT_WITH_LOW_PRECISION)
|
||||
if (AssertVectorized<vfloat>(NAME_INFO(DeQuantize), seed, expected, actual).check(false, true, 1.e-3f)) return;
|
||||
#else
|
||||
if (AssertVectorized<vfloat>(NAME_INFO(DeQuantize), seed, expected, actual).check()) return;
|
||||
#endif
|
||||
}
|
||||
} //trials;
|
||||
} // trials;
|
||||
}
|
||||
TYPED_TEST(QuantizationTests, ReQuantizeFromInt) {
|
||||
using vec = TypeParam;
|
||||
@ -1274,25 +1279,29 @@ namespace {
|
||||
ValueGen<int32_t> generator(min_val, max_val, seed);
|
||||
//scale
|
||||
ValueGen<float> generator_sc(1.f, 15.f, seed.add(1));
|
||||
for (C10_UNUSED const auto i : c10::irange(trials)) {
|
||||
float multiplier = 1.f / (generator_sc.get());
|
||||
auto zero_point_val = generator.get();
|
||||
int index = 0;
|
||||
for (int j = 0; j < vec::float_num_vecs(); j++) {
|
||||
//generate vals
|
||||
for (auto& v : unit_int_vec) {
|
||||
v = c10::qint32(generator.get());
|
||||
expected_qint_vals[index] = requantize_from_int<underlying>(multiplier, zero_point_val, v.val_);
|
||||
index++;
|
||||
}
|
||||
int_ret[j] = vqint::loadu(unit_int_vec);
|
||||
for ([[maybe_unused]] const auto i : c10::irange(trials)) {
|
||||
float multiplier = 1.f / (generator_sc.get());
|
||||
auto zero_point_val = generator.get();
|
||||
int index = 0;
|
||||
for (int j = 0; j < vec::float_num_vecs(); j++) {
|
||||
// generate vals
|
||||
for (auto& v : unit_int_vec) {
|
||||
v = c10::qint32(generator.get());
|
||||
expected_qint_vals[index] = requantize_from_int<underlying>(
|
||||
multiplier, zero_point_val, v.val_);
|
||||
index++;
|
||||
}
|
||||
auto expected = vec::loadu(expected_qint_vals);
|
||||
auto actual = vec::requantize_from_int(int_ret, multiplier, zero_point_val);
|
||||
if (AssertVectorized<vec>(NAME_INFO(ReQuantizeFromInt), seed, expected, actual).check()) {
|
||||
return;
|
||||
}
|
||||
} //trials;
|
||||
int_ret[j] = vqint::loadu(unit_int_vec);
|
||||
}
|
||||
auto expected = vec::loadu(expected_qint_vals);
|
||||
auto actual =
|
||||
vec::requantize_from_int(int_ret, multiplier, zero_point_val);
|
||||
if (AssertVectorized<vec>(
|
||||
NAME_INFO(ReQuantizeFromInt), seed, expected, actual)
|
||||
.check()) {
|
||||
return;
|
||||
}
|
||||
} // trials;
|
||||
}
|
||||
TYPED_TEST(QuantizationTests, WideningSubtract) {
|
||||
using vec = TypeParam;
|
||||
@ -1311,30 +1320,33 @@ namespace {
|
||||
typename vec::int_vec_return_type expected_int_ret;
|
||||
auto seed = TestSeed();
|
||||
ValueGen<underlying> generator(min_val, max_val, seed);
|
||||
for (C10_UNUSED const auto i : c10::irange(trials)) {
|
||||
//generate vals
|
||||
for (int j = 0; j < vec::size(); j++) {
|
||||
qint_vals[j] = generator.get();
|
||||
qint_b[j] = generator.get();
|
||||
if constexpr (std::is_same_v<underlying, int>) {
|
||||
//filter overflow cases
|
||||
filter_sub_overflow(qint_vals[j], qint_b[j]);
|
||||
}
|
||||
for ([[maybe_unused]] const auto i : c10::irange(trials)) {
|
||||
// generate vals
|
||||
for (int j = 0; j < vec::size(); j++) {
|
||||
qint_vals[j] = generator.get();
|
||||
qint_b[j] = generator.get();
|
||||
if constexpr (std::is_same_v<underlying, int>) {
|
||||
// filter overflow cases
|
||||
filter_sub_overflow(qint_vals[j], qint_b[j]);
|
||||
}
|
||||
int index = 0;
|
||||
auto qint_vec = vec::loadu(qint_vals);
|
||||
auto qint_vec_b = vec::loadu(qint_b);
|
||||
auto actual_int_ret = qint_vec.widening_subtract(qint_vec_b);
|
||||
for (int j = 0; j < vec::float_num_vecs(); j++) {
|
||||
for (auto& v : unit_exp_vals) {
|
||||
v = widening_subtract(qint_vals[index], qint_b[index]);
|
||||
index++;
|
||||
}
|
||||
auto expected = vqint::loadu(unit_exp_vals);
|
||||
const auto& actual = actual_int_ret[j];
|
||||
if (AssertVectorized<vqint>(NAME_INFO(WideningSubtract), seed, expected, actual).check()) return;
|
||||
}
|
||||
int index = 0;
|
||||
auto qint_vec = vec::loadu(qint_vals);
|
||||
auto qint_vec_b = vec::loadu(qint_b);
|
||||
auto actual_int_ret = qint_vec.widening_subtract(qint_vec_b);
|
||||
for (int j = 0; j < vec::float_num_vecs(); j++) {
|
||||
for (auto& v : unit_exp_vals) {
|
||||
v = widening_subtract(qint_vals[index], qint_b[index]);
|
||||
index++;
|
||||
}
|
||||
} //trials;
|
||||
auto expected = vqint::loadu(unit_exp_vals);
|
||||
const auto& actual = actual_int_ret[j];
|
||||
if (AssertVectorized<vqint>(
|
||||
NAME_INFO(WideningSubtract), seed, expected, actual)
|
||||
.check())
|
||||
return;
|
||||
}
|
||||
} // trials;
|
||||
}
|
||||
TYPED_TEST(QuantizationTests, Relu) {
|
||||
using vec = TypeParam;
|
||||
|
@ -943,22 +943,25 @@ void test_unary(
|
||||
UVT start = dmn_argc > 0 ? dmn.ArgsDomain[0].start : default_start;
|
||||
UVT end = dmn_argc > 0 ? dmn.ArgsDomain[0].end : default_end;
|
||||
ValueGen<VT> generator(start, end, seed.add(changeSeedBy));
|
||||
for (C10_UNUSED const auto trial : c10::irange(trialCount)) {
|
||||
for (const auto k : c10::irange(el_count)) {
|
||||
vals[k] = generator.get();
|
||||
call_filter(filter, vals[k]);
|
||||
//map operator
|
||||
expected[k] = expectedFunction(vals[k]);
|
||||
}
|
||||
// test
|
||||
auto input = vec_type::loadu(vals);
|
||||
auto actual = actualFunction(input);
|
||||
auto vec_expected = vec_type::loadu(expected);
|
||||
AssertVectorized<vec_type> vecAssert(testNameInfo, seed, vec_expected, actual, input);
|
||||
if (vecAssert.check(bitwise, dmn.CheckWithTolerance, dmn.ToleranceError)) return;
|
||||
for ([[maybe_unused]] const auto trial : c10::irange(trialCount)) {
|
||||
for (const auto k : c10::irange(el_count)) {
|
||||
vals[k] = generator.get();
|
||||
call_filter(filter, vals[k]);
|
||||
// map operator
|
||||
expected[k] = expectedFunction(vals[k]);
|
||||
}
|
||||
// test
|
||||
auto input = vec_type::loadu(vals);
|
||||
auto actual = actualFunction(input);
|
||||
auto vec_expected = vec_type::loadu(expected);
|
||||
AssertVectorized<vec_type> vecAssert(
|
||||
testNameInfo, seed, vec_expected, actual, input);
|
||||
if (vecAssert.check(
|
||||
bitwise, dmn.CheckWithTolerance, dmn.ToleranceError))
|
||||
return;
|
||||
|
||||
}// trial
|
||||
//inrease Seed
|
||||
} // trial
|
||||
// inrease Seed
|
||||
changeSeedBy += 1;
|
||||
}
|
||||
for (auto& custom : testCase.getCustomChecks()) {
|
||||
@ -1002,22 +1005,25 @@ void test_binary(
|
||||
UVT end1 = dmn_argc > 1 ? dmn.ArgsDomain[1].end : default_end;
|
||||
ValueGen<VT> generator0(start0, end0, seed.add(changeSeedBy));
|
||||
ValueGen<VT> generator1(start1, end1, seed.add(changeSeedBy + 1));
|
||||
for (C10_UNUSED const auto trial : c10::irange(trialCount)) {
|
||||
for (const auto k : c10::irange(el_count)) {
|
||||
vals0[k] = generator0.get();
|
||||
vals1[k] = generator1.get();
|
||||
call_filter(filter, vals0[k], vals1[k]);
|
||||
//map operator
|
||||
expected[k] = expectedFunction(vals0[k], vals1[k]);
|
||||
}
|
||||
// test
|
||||
auto input0 = vec_type::loadu(vals0);
|
||||
auto input1 = vec_type::loadu(vals1);
|
||||
auto actual = actualFunction(input0, input1);
|
||||
auto vec_expected = vec_type::loadu(expected);
|
||||
AssertVectorized<vec_type> vecAssert(testNameInfo, seed, vec_expected, actual, input0, input1);
|
||||
if (vecAssert.check(bitwise, dmn.CheckWithTolerance, dmn.ToleranceError))return;
|
||||
}// trial
|
||||
for ([[maybe_unused]] const auto trial : c10::irange(trialCount)) {
|
||||
for (const auto k : c10::irange(el_count)) {
|
||||
vals0[k] = generator0.get();
|
||||
vals1[k] = generator1.get();
|
||||
call_filter(filter, vals0[k], vals1[k]);
|
||||
// map operator
|
||||
expected[k] = expectedFunction(vals0[k], vals1[k]);
|
||||
}
|
||||
// test
|
||||
auto input0 = vec_type::loadu(vals0);
|
||||
auto input1 = vec_type::loadu(vals1);
|
||||
auto actual = actualFunction(input0, input1);
|
||||
auto vec_expected = vec_type::loadu(expected);
|
||||
AssertVectorized<vec_type> vecAssert(
|
||||
testNameInfo, seed, vec_expected, actual, input0, input1);
|
||||
if (vecAssert.check(
|
||||
bitwise, dmn.CheckWithTolerance, dmn.ToleranceError))
|
||||
return;
|
||||
} // trial
|
||||
changeSeedBy += 1;
|
||||
}
|
||||
for (auto& custom : testCase.getCustomChecks()) {
|
||||
@ -1067,24 +1073,27 @@ void test_ternary(
|
||||
ValueGen<VT> generator1(start1, end1, seed.add(changeSeedBy + 1));
|
||||
ValueGen<VT> generator2(start2, end2, seed.add(changeSeedBy + 2));
|
||||
|
||||
for (C10_UNUSED const auto trial : c10::irange(trialCount)) {
|
||||
for (const auto k : c10::irange(el_count)) {
|
||||
vals0[k] = generator0.get();
|
||||
vals1[k] = generator1.get();
|
||||
vals2[k] = generator2.get();
|
||||
call_filter(filter, vals0[k], vals1[k], vals2[k]);
|
||||
//map operator
|
||||
expected[k] = expectedFunction(vals0[k], vals1[k], vals2[k]);
|
||||
}
|
||||
// test
|
||||
auto input0 = vec_type::loadu(vals0);
|
||||
auto input1 = vec_type::loadu(vals1);
|
||||
auto input2 = vec_type::loadu(vals2);
|
||||
auto actual = actualFunction(input0, input1, input2);
|
||||
auto vec_expected = vec_type::loadu(expected);
|
||||
AssertVectorized<vec_type> vecAssert(testNameInfo, seed, vec_expected, actual, input0, input1, input2);
|
||||
if (vecAssert.check(bitwise, dmn.CheckWithTolerance, dmn.ToleranceError)) return;
|
||||
}// trial
|
||||
for ([[maybe_unused]] const auto trial : c10::irange(trialCount)) {
|
||||
for (const auto k : c10::irange(el_count)) {
|
||||
vals0[k] = generator0.get();
|
||||
vals1[k] = generator1.get();
|
||||
vals2[k] = generator2.get();
|
||||
call_filter(filter, vals0[k], vals1[k], vals2[k]);
|
||||
// map operator
|
||||
expected[k] = expectedFunction(vals0[k], vals1[k], vals2[k]);
|
||||
}
|
||||
// test
|
||||
auto input0 = vec_type::loadu(vals0);
|
||||
auto input1 = vec_type::loadu(vals1);
|
||||
auto input2 = vec_type::loadu(vals2);
|
||||
auto actual = actualFunction(input0, input1, input2);
|
||||
auto vec_expected = vec_type::loadu(expected);
|
||||
AssertVectorized<vec_type> vecAssert(
|
||||
testNameInfo, seed, vec_expected, actual, input0, input1, input2);
|
||||
if (vecAssert.check(
|
||||
bitwise, dmn.CheckWithTolerance, dmn.ToleranceError))
|
||||
return;
|
||||
} // trial
|
||||
changeSeedBy += 1;
|
||||
}
|
||||
}
|
||||
|
@ -72,11 +72,11 @@ inline bool is_thp_alloc(size_t nbytes) {
|
||||
return (is_thp_alloc_enabled() && (nbytes >= gAlloc_threshold_thp));
|
||||
}
|
||||
#elif !defined(__ANDROID__) && !defined(_MSC_VER)
|
||||
constexpr size_t c10_compute_alignment(C10_UNUSED size_t nbytes) {
|
||||
constexpr size_t c10_compute_alignment([[maybe_unused]] size_t nbytes) {
|
||||
return gAlignment;
|
||||
}
|
||||
|
||||
constexpr bool is_thp_alloc(C10_UNUSED size_t nbytes) {
|
||||
constexpr bool is_thp_alloc([[maybe_unused]] size_t nbytes) {
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
@ -196,7 +196,7 @@ CUDAKernelLaunchRegistry::CUDAKernelLaunchRegistry()
|
||||
dsa_check_if_all_devices_support_managed_memory()),
|
||||
gather_launch_stacktrace(check_env_for_enable_launch_stacktracing()),
|
||||
enabled_at_runtime(check_env_for_dsa_enabled()) {
|
||||
for (C10_UNUSED const auto _ : c10::irange(dsa_get_device_count())) {
|
||||
for ([[maybe_unused]] const auto _ : c10::irange(dsa_get_device_count())) {
|
||||
uvm_assertions.emplace_back(nullptr, uvm_deleter);
|
||||
}
|
||||
|
||||
|
@ -23,7 +23,7 @@ void c10_cuda_check_implementation(
|
||||
return;
|
||||
}
|
||||
|
||||
C10_UNUSED auto error_unused = cudaGetLastError();
|
||||
[[maybe_unused]] auto error_unused = cudaGetLastError();
|
||||
(void)error_unused;
|
||||
|
||||
std::string check_message;
|
||||
|
@ -40,7 +40,7 @@ class C10_CUDA_API CUDAError : public c10::Error {
|
||||
do { \
|
||||
const cudaError_t __err = EXPR; \
|
||||
if (C10_UNLIKELY(__err != cudaSuccess)) { \
|
||||
C10_UNUSED auto error_unused = cudaGetLastError(); \
|
||||
[[maybe_unused]] auto error_unused = cudaGetLastError(); \
|
||||
TORCH_WARN("CUDA warning: ", cudaGetErrorString(__err)); \
|
||||
} \
|
||||
} while (0)
|
||||
@ -49,18 +49,18 @@ class C10_CUDA_API CUDAError : public c10::Error {
|
||||
#define C10_CUDA_ERROR_HANDLED(EXPR) EXPR
|
||||
|
||||
// Intentionally ignore a CUDA error
|
||||
#define C10_CUDA_IGNORE_ERROR(EXPR) \
|
||||
do { \
|
||||
const cudaError_t __err = EXPR; \
|
||||
if (C10_UNLIKELY(__err != cudaSuccess)) { \
|
||||
C10_UNUSED cudaError_t error_unused = cudaGetLastError(); \
|
||||
} \
|
||||
#define C10_CUDA_IGNORE_ERROR(EXPR) \
|
||||
do { \
|
||||
const cudaError_t __err = EXPR; \
|
||||
if (C10_UNLIKELY(__err != cudaSuccess)) { \
|
||||
[[maybe_unused]] cudaError_t error_unused = cudaGetLastError(); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
// Clear the last CUDA error
|
||||
#define C10_CUDA_CLEAR_ERROR() \
|
||||
do { \
|
||||
C10_UNUSED cudaError_t error_unused = cudaGetLastError(); \
|
||||
#define C10_CUDA_CLEAR_ERROR() \
|
||||
do { \
|
||||
[[maybe_unused]] cudaError_t error_unused = cudaGetLastError(); \
|
||||
} while (0)
|
||||
|
||||
// This should be used directly after every kernel launch to ensure
|
||||
|
@ -22,7 +22,7 @@ int device_count_impl(bool fail_if_no_driver) {
|
||||
// Clear out the error state, so we don't spuriously trigger someone else.
|
||||
// (This shouldn't really matter, since we won't be running very much CUDA
|
||||
// code in this regime.)
|
||||
C10_UNUSED cudaError_t last_err = cudaGetLastError();
|
||||
[[maybe_unused]] cudaError_t last_err = cudaGetLastError();
|
||||
switch (err) {
|
||||
case cudaErrorNoDevice:
|
||||
// Zero devices is ok here
|
||||
@ -170,7 +170,7 @@ std::optional<DeviceIndex> getDeviceIndexWithPrimaryContext() {
|
||||
}
|
||||
|
||||
namespace _internal {
|
||||
bool dummyHasPrimaryContext(C10_UNUSED DeviceIndex device_index) {
|
||||
bool dummyHasPrimaryContext([[maybe_unused]] DeviceIndex device_index) {
|
||||
TORCH_CHECK(false, "Should never been called");
|
||||
}
|
||||
bool (*hasPrimaryContext)(DeviceIndex) = dummyHasPrimaryContext;
|
||||
|
@ -8,7 +8,7 @@
|
||||
CUresult __err = EXPR; \
|
||||
if (__err != CUDA_SUCCESS) { \
|
||||
const char* err_str; \
|
||||
CUresult get_error_str_err C10_UNUSED = \
|
||||
CUresult get_error_str_err [[maybe_unused]] = \
|
||||
c10::cuda::DriverAPI::get()->cuGetErrorString_(__err, &err_str); \
|
||||
if (get_error_str_err != CUDA_SUCCESS) { \
|
||||
AT_ERROR("CUDA driver error: unknown error"); \
|
||||
|
@ -118,9 +118,6 @@
|
||||
#define C10_HAS_CPP_ATTRIBUTE(x) (0)
|
||||
#endif
|
||||
|
||||
// suppress an unused variable.
|
||||
#define C10_UNUSED [[maybe_unused]]
|
||||
|
||||
#if !defined(__has_attribute)
|
||||
#define __has_attribute(x) 0
|
||||
#endif
|
||||
|
@ -35,12 +35,12 @@ dict_int_int test_dict(dict_int_int& dict) {
|
||||
|
||||
// erase via iterators
|
||||
auto begin = dict.begin();
|
||||
for (C10_UNUSED const auto i : c10::irange(20)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(20)) {
|
||||
begin++;
|
||||
}
|
||||
|
||||
auto end = begin;
|
||||
for (C10_UNUSED const auto i : c10::irange(20)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(20)) {
|
||||
erase_set.insert(end->first);
|
||||
end++;
|
||||
}
|
||||
@ -134,11 +134,11 @@ TEST(OrderedPreservingDictTest, DictCollisions) {
|
||||
|
||||
// erase a few entries via iterator
|
||||
auto begin = dict.begin();
|
||||
for (C10_UNUSED const auto j : c10::irange(10)) {
|
||||
for ([[maybe_unused]] const auto j : c10::irange(10)) {
|
||||
begin++;
|
||||
}
|
||||
auto end = begin;
|
||||
for (C10_UNUSED const auto j : c10::irange(7)) {
|
||||
for ([[maybe_unused]] const auto j : c10::irange(7)) {
|
||||
erase_set.insert(end->first);
|
||||
end++;
|
||||
}
|
||||
|
@ -26,7 +26,7 @@ ApproximateClockToUnixTimeConverter::measurePair() {
|
||||
ApproximateClockToUnixTimeConverter::time_pairs
|
||||
ApproximateClockToUnixTimeConverter::measurePairs() {
|
||||
static constexpr auto n_warmup = 5;
|
||||
for (C10_UNUSED const auto _ : c10::irange(n_warmup)) {
|
||||
for ([[maybe_unused]] const auto _ : c10::irange(n_warmup)) {
|
||||
getApproximateTime();
|
||||
static_cast<void>(steady_clock_t::now());
|
||||
}
|
||||
|
@ -658,12 +658,12 @@ namespace c10::detail {
|
||||
// Report a warning to the user only once. Accepts an arbitrary number of extra
|
||||
// arguments which are concatenated into the warning message using operator<<
|
||||
//
|
||||
#define _TORCH_WARN_ONCE(...) \
|
||||
C10_UNUSED static const auto C10_ANONYMOUS_VARIABLE(torch_warn_once_) = \
|
||||
[&] { \
|
||||
TORCH_WARN(__VA_ARGS__); \
|
||||
return true; \
|
||||
}()
|
||||
#define _TORCH_WARN_ONCE(...) \
|
||||
[[maybe_unused]] static const auto C10_ANONYMOUS_VARIABLE( \
|
||||
torch_warn_once_) = [&] { \
|
||||
TORCH_WARN(__VA_ARGS__); \
|
||||
return true; \
|
||||
}()
|
||||
|
||||
#ifdef DISABLE_WARN
|
||||
#define TORCH_WARN_ONCE(...) ((void)0);
|
||||
|
@ -322,8 +322,8 @@ C10_API const std::unique_ptr<EventSampledHandler>& GetEventSampledHandler(
|
||||
* // Logs caller info with an arbitrary text event, if there is a usage.
|
||||
* C10_LOG_API_USAGE_ONCE("my_api");
|
||||
*/
|
||||
#define C10_LOG_API_USAGE_ONCE(...) \
|
||||
C10_UNUSED static bool C10_ANONYMOUS_VARIABLE(logFlag) = \
|
||||
#define C10_LOG_API_USAGE_ONCE(...) \
|
||||
[[maybe_unused]] static bool C10_ANONYMOUS_VARIABLE(logFlag) = \
|
||||
::c10::detail::LogAPIUsageFakeReturn(__VA_ARGS__);
|
||||
|
||||
// API usage logging capabilities
|
||||
|
@ -115,7 +115,7 @@ TEST(XPUStreamTest, StreamPoolRoundRobinTest) {
|
||||
}
|
||||
|
||||
std::vector<c10::xpu::XPUStream> streams{};
|
||||
for (C10_UNUSED const auto _ : c10::irange(200)) {
|
||||
for ([[maybe_unused]] const auto _ : c10::irange(200)) {
|
||||
streams.emplace_back(c10::xpu::getStreamFromPool());
|
||||
}
|
||||
|
||||
|
@ -2220,7 +2220,7 @@ TEST(DataLoaderTest, ChunkDatasetCrossChunkShuffle) {
|
||||
for (const auto i : c10::irange(
|
||||
(chunk_count + cross_chunk_shuffle_count - 1) /
|
||||
cross_chunk_shuffle_count)) {
|
||||
for (C10_UNUSED const auto j : c10::irange(chunk_size)) {
|
||||
for ([[maybe_unused]] const auto j : c10::irange(chunk_size)) {
|
||||
for (const auto k : c10::irange(cross_chunk_shuffle_count)) {
|
||||
if (i * cross_chunk_shuffle_count + k < chunk_count) {
|
||||
expected_result.push_back(i * cross_chunk_shuffle_count + k);
|
||||
|
@ -1343,7 +1343,7 @@ TEST_F(FunctionalTest, GumbelSoftmax) {
|
||||
|
||||
auto counts = torch::zeros_like(logits);
|
||||
torch::Tensor y_draw;
|
||||
for (C10_UNUSED const auto i : c10::irange(num_draws)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(num_draws)) {
|
||||
y_draw =
|
||||
F::gumbel_softmax(logits, F::GumbelSoftmaxFuncOptions().hard(true));
|
||||
counts += y_draw;
|
||||
|
@ -123,7 +123,7 @@ bool test_mnist(
|
||||
torch::Device device(with_cuda ? torch::kCUDA : torch::kCPU);
|
||||
model->to(device);
|
||||
|
||||
for (C10_UNUSED const auto epoch : c10::irange(number_of_epochs)) {
|
||||
for ([[maybe_unused]] const auto epoch : c10::irange(number_of_epochs)) {
|
||||
// NOLINTNEXTLINE(performance-for-range-copy)
|
||||
for (torch::data::Example<> batch : *data_loader) {
|
||||
auto data = batch.data.to(device);
|
||||
|
@ -3511,7 +3511,7 @@ void _multihead_attn_test_helper(
|
||||
std::uniform_int_distribution<int> d_2_10(2, 10);
|
||||
std::uniform_int_distribution<int> d_3_10(3, 10);
|
||||
bool registration_checked = false;
|
||||
for (C10_UNUSED const auto i : c10::irange(100)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(100)) {
|
||||
const auto batch_sz = d_2_10(generator);
|
||||
const auto seq_len = d_2_10(generator);
|
||||
const auto d_head = d_3_10(generator);
|
||||
|
@ -398,7 +398,8 @@ std::vector<torch::Tensor> PackedSequenceTest_ordered_sequence(
|
||||
torch::ScalarType tensor_type) {
|
||||
std::vector<torch::Tensor> seqs;
|
||||
seqs.reserve(PackedSequenceTest_batch_size);
|
||||
for (C10_UNUSED const auto i : c10::irange(PackedSequenceTest_batch_size)) {
|
||||
for ([[maybe_unused]] const auto i :
|
||||
c10::irange(PackedSequenceTest_batch_size)) {
|
||||
seqs.emplace_back(torch::empty(
|
||||
{torch::randint(1, PackedSequenceTest_max_length, {1}).item<int64_t>()},
|
||||
tensor_type));
|
||||
|
@ -12,7 +12,7 @@ struct OperationTest : torch::test::SeedingFixture {
|
||||
};
|
||||
|
||||
TEST_F(OperationTest, Lerp) {
|
||||
for (C10_UNUSED const auto i : c10::irange(TEST_AMOUNT)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(TEST_AMOUNT)) {
|
||||
// test lerp_kernel_scalar
|
||||
auto start = torch::rand({3, 5});
|
||||
auto end = torch::rand({3, 5});
|
||||
@ -36,7 +36,7 @@ TEST_F(OperationTest, Lerp) {
|
||||
}
|
||||
|
||||
TEST_F(OperationTest, Cross) {
|
||||
for (C10_UNUSED const auto i : c10::irange(TEST_AMOUNT)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(TEST_AMOUNT)) {
|
||||
// input
|
||||
auto a = torch::rand({10, 3});
|
||||
auto b = torch::rand({10, 3});
|
||||
|
@ -157,7 +157,7 @@ void check_exact_values(
|
||||
TEST(OptimTest, OptimizerAccessors) {
|
||||
auto options = AdagradOptions(1.0);
|
||||
std::vector<torch::Tensor> params;
|
||||
for (C10_UNUSED const auto i : c10::irange(3)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(3)) {
|
||||
params.push_back(torch::randn(10));
|
||||
}
|
||||
auto optimizer = Adagrad(params, options);
|
||||
|
@ -99,14 +99,14 @@ void stressTestStore(std::string path, std::string prefix = "") {
|
||||
std::vector<std::thread> threads;
|
||||
c10d::test::Semaphore sem1, sem2;
|
||||
|
||||
for (C10_UNUSED const auto i : c10::irange(numThreads)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(numThreads)) {
|
||||
threads.emplace_back([&] {
|
||||
auto fileStore =
|
||||
c10::make_intrusive<c10d::FileStore>(path, numThreads + 1);
|
||||
c10d::PrefixStore store(prefix, fileStore);
|
||||
sem1.post();
|
||||
sem2.wait();
|
||||
for (C10_UNUSED const auto j : c10::irange(numIterations)) {
|
||||
for ([[maybe_unused]] const auto j : c10::irange(numIterations)) {
|
||||
store.add("counter", 1);
|
||||
}
|
||||
});
|
||||
|
@ -62,11 +62,11 @@ void stressTestStore(std::string prefix = "") {
|
||||
auto hashStore = c10::make_intrusive<c10d::HashStore>();
|
||||
c10d::PrefixStore store(std::move(prefix), hashStore);
|
||||
|
||||
for (C10_UNUSED const auto i : c10::irange(numThreads)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(numThreads)) {
|
||||
threads.emplace_back([&] {
|
||||
sem1.post();
|
||||
sem2.wait();
|
||||
for (C10_UNUSED const auto j : c10::irange(numIterations)) {
|
||||
for ([[maybe_unused]] const auto j : c10::irange(numIterations)) {
|
||||
store.add("counter", 1);
|
||||
}
|
||||
});
|
||||
|
@ -15,12 +15,12 @@ using at::cuda::CUDAStream;
template <typename T, typename... Args>
std::vector<T> initialize(const std::string& path, size_t N, Args&&... args) {
std::vector<T> tests;
for (C10_UNUSED const auto i : c10::irange(N)) {
for ([[maybe_unused]] const auto i : c10::irange(N)) {
tests.push_back(std::move(T(path, std::forward<Args>(args)...)));
}

std::vector<std::thread> threads;
for (C10_UNUSED const auto i : c10::irange(N)) {
for ([[maybe_unused]] const auto i : c10::irange(N)) {
threads.push_back(std::thread([i, N, &tests] { tests[i].start(i, N); }));
}

@ -123,7 +123,7 @@ class CollectiveTest {
int num,
bool delayed = false) {
std::vector<CollectiveTest> tests;
for (C10_UNUSED const auto i : c10::irange(num)) {
for ([[maybe_unused]] const auto i : c10::irange(num)) {
tests.emplace_back(path);
}

@ -102,7 +102,7 @@ void testHelper(bool useLibUV, const std::string& prefix = "") {

for (const auto i : c10::irange(numThreads)) {
threads.emplace_back([=, &sem1, &sem2, &clientStores, &expectedCounterRes] {
for (C10_UNUSED const auto j : c10::irange(numIterations)) {
for ([[maybe_unused]] const auto j : c10::irange(numIterations)) {
clientStores[i]->add("counter", 1);
}
// Let each thread set and get key on its client store
@ -1043,7 +1043,7 @@ TEST(Reductions, ReduceSplitRfactor) {
SimpleIREvaluator cg(s, {b, c});

cg.call({in, out});
for (C10_UNUSED const auto i : c10::irange(M)) {
for ([[maybe_unused]] const auto i : c10::irange(M)) {
ASSERT_EQ(out[0], 4950);
}
}
@ -3884,7 +3884,7 @@ TEST(Simplify, SimplifyEliminateEmptyFor) {
{
// Flatten many layers around an empty block to an empty block.
StmtPtr last = alloc<Block>(std::vector<StmtPtr>({}));
for (C10_UNUSED const auto i : c10::irange(11)) {
for ([[maybe_unused]] const auto i : c10::irange(11)) {
VarHandle loopVar("loopVar", kInt);
last = For::make(loopVar, 0, 10, last);
}
@ -3968,7 +3968,7 @@ TEST(Simplify, SimplifyFlattenBlock) {
{
// Flatten many layers around an empty block to an empty block.
StmtPtr last = alloc<Block>(std::vector<StmtPtr>({}));
for (C10_UNUSED const auto i : c10::irange(11)) {
for ([[maybe_unused]] const auto i : c10::irange(11)) {
last = alloc<Block>(std::vector<StmtPtr>({last}));
}

@ -12,7 +12,7 @@ torch::List<torch::Tensor> custom_op(
int64_t repeat) {
torch::List<torch::Tensor> output;
output.reserve(repeat);
for (C10_UNUSED const auto i : c10::irange(repeat)) {
for ([[maybe_unused]] const auto i : c10::irange(repeat)) {
output.push_back(tensor * scalar);
}
return output;
@ -41,13 +41,13 @@ namespace torch::autograd {

namespace VariableType {
namespace{
C10_UNUSED void reset_grad_accumulator(Variable & self) {
AutogradMeta* meta = torch::autograd::impl::get_autograd_meta(self);
if (meta != nullptr) {
meta->grad_accumulator_.reset();
}
[[maybe_unused]] void reset_grad_accumulator(Variable& self) {
AutogradMeta* meta = torch::autograd::impl::get_autograd_meta(self);
if (meta != nullptr) {
meta->grad_accumulator_.reset();
}
}
}

namespace {
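The hunk above applies the same attribute to a function rather than a variable: marking a helper in an anonymous namespace [[maybe_unused]] keeps -Wunused-function quiet in translation units that never call it. A minimal sketch under the same assumptions (standard C++17 only; the helper name is made up for illustration):

// sketch_function.cpp (illustrative only, not part of this diff); build with -std=c++17
#include <iostream>

namespace {

// The attribute tells the compiler this helper may legitimately go uncalled.
[[maybe_unused]] void log_debug(const char* message) {
  std::cerr << "[debug] " << message << '\n';
}

} // namespace

int main() {
  log_debug("attribute applied at function scope");
  return 0;
}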
Some files were not shown because too many files have changed in this diff.