Re-enable some C++ warnings (#142332)

This change re-enables several C++ warnings (-Wtype-limits, -Warray-bounds and -Wunused-but-set-variable) now that the code base is fairly clean. Meanwhile, -Wextra-semi is disabled for CUDA-generated code, since those warnings cannot be fixed without the cooperation of the CUDA team. Pull Request resolved: https://github.com/pytorch/pytorch/pull/142332 Approved by: https://github.com/albanD, https://github.com/eqy
2025-10-20 12:54:11 +08:00 · 2024-12-12 04:02:12 +00:00
parent f892f9862a
commit 2903cf0ad8
11 changed files with 62 additions and 65 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -997,8 +997,6 @@ if(NOT MSVC)
  append_cxx_flag_if_supported("-Wnarrowing" CMAKE_CXX_FLAGS)
  append_cxx_flag_if_supported("-Wno-missing-field-initializers"
                               CMAKE_CXX_FLAGS)
-  append_cxx_flag_if_supported("-Wno-type-limits" CMAKE_CXX_FLAGS)
-  append_cxx_flag_if_supported("-Wno-array-bounds" CMAKE_CXX_FLAGS)
  append_cxx_flag_if_supported("-Wno-unknown-pragmas" CMAKE_CXX_FLAGS)
  append_cxx_flag_if_supported("-Wno-unused-parameter" CMAKE_CXX_FLAGS)
  append_cxx_flag_if_supported("-Wno-strict-overflow" CMAKE_CXX_FLAGS)
@ -1076,7 +1074,6 @@ if(NOT MSVC)
      set(WERROR FALSE)
    endif()
  endif()
-  append_cxx_flag_if_supported("-Wno-unused-but-set-variable" CMAKE_CXX_FLAGS)
  append_cxx_flag_if_supported("-Wno-maybe-uninitialized" CMAKE_CXX_FLAGS)
  append_cxx_flag_if_supported("-fstandalone-debug" CMAKE_CXX_FLAGS_DEBUG)
  if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64" AND CMAKE_CXX_COMPILER_ID MATCHES "GNU")
@ -1093,6 +1090,7 @@ if(NOT MSVC)
  append_cxx_flag_if_supported("-fno-trapping-math" CMAKE_CXX_FLAGS)
  append_cxx_flag_if_supported("-Werror=format" CMAKE_CXX_FLAGS)
  if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13)
+    append_cxx_flag_if_supported("-Wno-dangling-reference" CMAKE_CXX_FLAGS)
    append_cxx_flag_if_supported("-Wno-error=dangling-reference" CMAKE_CXX_FLAGS)
    append_cxx_flag_if_supported("-Wno-error=redundant-move" CMAKE_CXX_FLAGS)
  endif()
--- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp
+++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp
@ -466,6 +466,6 @@ void CUDAHooks::deviceSynchronize(DeviceIndex device_index) const {
 using at::CUDAHooksRegistry;
 using at::RegistererCUDAHooksRegistry;

-REGISTER_CUDA_HOOKS(CUDAHooks);
+REGISTER_CUDA_HOOKS(CUDAHooks)

 } // namespace at::cuda::detail
--- a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
+++ b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
@ -127,8 +127,8 @@ RETTYPE NAME(ARG1 a1, ARG2 a2, ARG3 a3, ARG4 a4) {
 #define NVRTC_STUB2(NAME, A1, A2) _STUB_2(NVRTC, NAME, nvrtcResult, A1, A2)
 #define NVRTC_STUB3(NAME, A1, A2, A3) _STUB_3(NVRTC, NAME, nvrtcResult, A1, A2, A3)

-NVRTC_STUB2(nvrtcVersion, int*, int*);
-NVRTC_STUB2(nvrtcAddNameExpression, nvrtcProgram, const char * const);
+NVRTC_STUB2(nvrtcVersion, int*, int*)
+NVRTC_STUB2(nvrtcAddNameExpression, nvrtcProgram, const char * const)

 nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
                               const char *src,
@ -143,32 +143,32 @@ nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
  return fn(prog, src, name, numHeaders, headers, includeNames);
 }

-NVRTC_STUB1(nvrtcDestroyProgram, nvrtcProgram *);
-NVRTC_STUB2(nvrtcGetPTXSize, nvrtcProgram, size_t *);
-NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *);
+NVRTC_STUB1(nvrtcDestroyProgram, nvrtcProgram *)
+NVRTC_STUB2(nvrtcGetPTXSize, nvrtcProgram, size_t *)
+NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *)
 #if defined(CUDA_VERSION) && CUDA_VERSION >= 11010
-NVRTC_STUB2(nvrtcGetCUBINSize, nvrtcProgram, size_t *);
-NVRTC_STUB2(nvrtcGetCUBIN, nvrtcProgram, char *);
+NVRTC_STUB2(nvrtcGetCUBINSize, nvrtcProgram, size_t *)
+NVRTC_STUB2(nvrtcGetCUBIN, nvrtcProgram, char *)
 #endif
-NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char * const *);
-_STUB_1(NVRTC, nvrtcGetErrorString, const char *, nvrtcResult);
-NVRTC_STUB2(nvrtcGetProgramLogSize,nvrtcProgram, size_t*);
-NVRTC_STUB2(nvrtcGetProgramLog, nvrtcProgram, char *);
-NVRTC_STUB3(nvrtcGetLoweredName, nvrtcProgram, const char *, const char **);
+NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char * const *)
+_STUB_1(NVRTC, nvrtcGetErrorString, const char *, nvrtcResult)
+NVRTC_STUB2(nvrtcGetProgramLogSize,nvrtcProgram, size_t*)
+NVRTC_STUB2(nvrtcGetProgramLog, nvrtcProgram, char *)
+NVRTC_STUB3(nvrtcGetLoweredName, nvrtcProgram, const char *, const char **)

-CUDA_STUB2(cuModuleLoadData, CUmodule *, const void *);
-CUDA_STUB3(cuModuleGetFunction, CUfunction *, CUmodule, const char *);
-CUDA_STUB4(cuOccupancyMaxActiveBlocksPerMultiprocessor, int *, CUfunction, int, size_t);
-CUDA_STUB2(cuGetErrorString, CUresult, const char **);
-CUDA_STUB1(cuCtxGetCurrent, CUcontext *);
-CUDA_STUB1(cuCtxSetCurrent, CUcontext);
-CUDA_STUB1(cuModuleUnload, CUmodule);
-CUDA_STUB3(cuDevicePrimaryCtxGetState, CUdevice, unsigned int *, int *);
-CUDA_STUB2(cuDevicePrimaryCtxRetain, CUcontext *, CUdevice);
-CUDA_STUB4(cuLinkCreate, unsigned int, CUjit_option *, void **, CUlinkState *);
-CUDA_STUB3(cuLinkComplete, CUlinkState, void **, size_t *);
-CUDA_STUB3(cuFuncSetAttribute, CUfunction, CUfunction_attribute, int);
-CUDA_STUB3(cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction);
+CUDA_STUB2(cuModuleLoadData, CUmodule *, const void *)
+CUDA_STUB3(cuModuleGetFunction, CUfunction *, CUmodule, const char *)
+CUDA_STUB4(cuOccupancyMaxActiveBlocksPerMultiprocessor, int *, CUfunction, int, size_t)
+CUDA_STUB2(cuGetErrorString, CUresult, const char **)
+CUDA_STUB1(cuCtxGetCurrent, CUcontext *)
+CUDA_STUB1(cuCtxSetCurrent, CUcontext)
+CUDA_STUB1(cuModuleUnload, CUmodule)
+CUDA_STUB3(cuDevicePrimaryCtxGetState, CUdevice, unsigned int *, int *)
+CUDA_STUB2(cuDevicePrimaryCtxRetain, CUcontext *, CUdevice)
+CUDA_STUB4(cuLinkCreate, unsigned int, CUjit_option *, void **, CUlinkState *)
+CUDA_STUB3(cuLinkComplete, CUlinkState, void **, size_t *)
+CUDA_STUB3(cuFuncSetAttribute, CUfunction, CUfunction_attribute, int)
+CUDA_STUB3(cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction)

 #if defined(CUDA_VERSION) && CUDA_VERSION >= 12000
 CUresult CUDAAPI
--- a/aten/src/ATen/test/vec_test_all_types.cpp
+++ b/aten/src/ATen/test/vec_test_all_types.cpp
@ -561,8 +561,8 @@ namespace {
        bool expected = std::isnan(val);
        CACHE_ALIGN c10::Half actual_vals[vHalf::size()];
        vHalf(val).isnan().store(actual_vals);
-        for (int jj = 0; jj < vHalf::size(); ++jj) {
-          EXPECT_EQ(expected, c10::bit_cast<uint16_t>(actual_vals[jj]) != 0) << "fp16 isnan failure for bit pattern " << std::hex << ii << std::dec;
+        for (auto actual_val : actual_vals) {
+          EXPECT_EQ(expected, c10::bit_cast<uint16_t>(actual_val) != 0) << "fp16 isnan failure for bit pattern " << std::hex << ii << std::dec;
        }
      }
    }
@ -1046,7 +1046,7 @@ namespace {
          mask[idx] = (VT)0;
        }
        else {
-          int64_t hex_mask = 0xFFFFFFFFFFFFFFFF;
+          uint64_t hex_mask = 0xFFFFFFFFFFFFFFFF;
          std::memcpy(&mask[idx], &hex_mask, sizeof(VT));
        }
        if (!test_blendv<vec, VT, idx+1, N>(expected_val, a, b, mask)) return false;
@ -1315,8 +1315,8 @@ namespace {
        ValueGen<float> generator_sc(1.f, 15.f, seed.add(2));
        for ([[maybe_unused]] const auto i : c10::irange(trials)) {
          float scale = generator_sc.get();
-          int32_t zero_point_val = generator.get();
-          float scale_zp_premul = -(scale * zero_point_val);
+          auto zero_point_val = generator.get();
+          float scale_zp_premul = -(scale * static_cast<float>(zero_point_val));
          vfloat vf_scale = vfloat{scale};
          vfloat vf_zp = vfloat{static_cast<float>(zero_point_val)};
          vfloat vf_scale_zp = vfloat{scale_zp_premul};
@ -1657,18 +1657,16 @@ namespace {
    TEST(HalfConversionTest, HalfFloat) {
      float f32s[100];
      for (const auto i : c10::irange(100)) {
-        f32s[i] = i + 0.3;
+        f32s[i] = static_cast<float>(i + 0.3);
      }
-      uint16_t u16;
-      float x;
      for (const auto i : c10::irange(100)) {
      #if (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && \
          !defined(__APPLE__)
-        u16 = at::vec::float2half_scalar(f32s[i]);
-        x = at::vec::half2float_scalar(u16);
+        uint16_t u16 = at::vec::float2half_scalar(f32s[i]);
+        float x = at::vec::half2float_scalar(u16);
      #else
-        u16 = c10::detail::fp16_ieee_from_fp32_value(f32s[i]);
-        x = c10::detail::fp16_ieee_to_fp32_value(u16);
+        uint16_t u16 = c10::detail::fp16_ieee_from_fp32_value(f32s[i]);
+        float x = c10::detail::fp16_ieee_to_fp32_value(u16);
      #endif

        EXPECT_EQ(u16, c10::detail::fp16_ieee_from_fp32_value(f32s[i]))
@ -1697,7 +1695,7 @@ namespace {
      VT v_pinf = static_cast<VT>(*(float *)&infBits);
      values[index] = v_pinf;
      auto vec_pinf = vec::loadu(values);
-      int negInfBits = 0xFF800000;
+      unsigned int negInfBits = 0xFF800000;
      VT v_ninf  = static_cast<VT>(*(float *)&negInfBits);
      values[index] = v_ninf;
      auto vec_ninf = vec::loadu(values);
@ -1779,8 +1777,8 @@ namespace {
        const auto expected = static_cast<float>(val);
        CACHE_ALIGN float actual_vals[vfloat::size()];
        at::vec::convert<float>(vBFloat16(val)).store(actual_vals);
-        for (int jj = 0; jj < vfloat::size(); ++jj) {
-          EXPECT_EQ(c10::bit_cast<uint32_t>(expected), c10::bit_cast<uint32_t>(actual_vals[jj]))
+        for (auto actual_val : actual_vals) {
+          EXPECT_EQ(c10::bit_cast<uint32_t>(expected), c10::bit_cast<uint32_t>(actual_val))
            << "convert-to-float failure for bf16 bit pattern "
            << std::hex << ii << std::dec;
        }
@ -1794,20 +1792,20 @@ namespace {

    #define TEST_MASK_LOAD(dst_t, mask_t, mask_n)                           \
      do {                                                                  \
-        CACHE_ALIGN dst_t x[mask_n * size];                                 \
-        CACHE_ALIGN dst_t y[mask_n * size];                                 \
-        CACHE_ALIGN dst_t ref[mask_n * size];                               \
-        auto seed = TestSeed();                                             \
-        dst_t generator_min = std::numeric_limits<dst_t>::is_signed ? dst_t(-100) : dst_t(0); \
-        ValueGen<dst_t> generator(generator_min, dst_t(100), seed);     \
-        for (const auto i : c10::irange(mask_n * size)) {                   \
-          x[i] = generator.get();                                           \
-        }                                                                   \
-        auto vec_mask = generate_vec_mask<mask_t, mask_n>(seed);            \
        constexpr int dst_size = at::vec::Vectorized<dst_t>::size();        \
        constexpr int dst_n = mask_n * size / dst_size;                     \
-        constexpr int rnd_n = (mask_n * size + dst_size - 1) / dst_size;    \
        if constexpr(dst_n * dst_size >= mask_n * size) {                   \
+            CACHE_ALIGN dst_t x[mask_n * size];                             \
+            CACHE_ALIGN dst_t y[mask_n * size];                             \
+            CACHE_ALIGN dst_t ref[mask_n * size];                           \
+            auto seed = TestSeed();                                         \
+            dst_t generator_min = std::numeric_limits<dst_t>::is_signed ? dst_t(-100) : dst_t(0); \
+            ValueGen<dst_t> generator(generator_min, dst_t(100), seed);     \
+            for (const auto i : c10::irange(mask_n * size)) {               \
+              x[i] = generator.get();                                       \
+            }                                                               \
+            auto vec_mask = generate_vec_mask<mask_t, mask_n>(seed);        \
+            constexpr int rnd_n = (mask_n * size + dst_size - 1) / dst_size;\
            auto x_vec = vec_mask.template loadu<dst_t, rnd_n>(x);          \
            x_vec.store(y);                                                 \
            for (const auto i : c10::irange(mask_n * size)) {               \
--- a/benchmarks/static_runtime/test_utils.cc
+++ b/benchmarks/static_runtime/test_utils.cc
@ -353,7 +353,7 @@ void testStaticRuntime(

          size_t new_managed_bytes =
              memory_planner ? memory_planner->total_managed() : 0;
-          if (check_resize && new_managed_bytes >= 0) {
+          if (check_resize) {
            EXPECT_GE(new_managed_bytes, managed_bytes);
          }

--- a/caffe2/utils/threadpool/WorkersPool.h
+++ b/caffe2/utils/threadpool/WorkersPool.h
@ -188,7 +188,6 @@ class BlockingCounter {
  // returns false.
  bool DecrementCount() {
    const auto count_value = count_.fetch_sub(1, std::memory_order_relaxed) - 1;
-    TORCH_DCHECK_GE(count_value, 0);
    if (count_value == 0) {
      std::lock_guard<std::mutex> g(mutex_);
      cond_.notify_one();
--- a/cmake/public/utils.cmake
+++ b/cmake/public/utils.cmake
@ -414,6 +414,9 @@ function(torch_compile_options libname)
      $<$<COMPILE_LANGUAGE:CXX>:${private_compile_options}>)
  if(USE_CUDA)
    foreach(option IN LISTS private_compile_options)
+      if("${option}" STREQUAL "-Wextra-semi")
+        continue()
+      endif()
      target_compile_options(${libname} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler ${option}>)
    endforeach()
  endif()
--- a/functorch/csrc/dim/dim.cpp
+++ b/functorch/csrc/dim/dim.cpp
@ -1888,7 +1888,6 @@ static PyObject* order(PyObject *_,
        }
    }

-    int ndim = 0;
    int insert_point = -1;
    Slice<DimEntry> new_levels;
    for (auto l : levels) {
@ -1896,7 +1895,6 @@ static PyObject* order(PyObject *_,
            continue;
        }
        if (l.is_positional()) {
-            ndim++;
            if (insert_point == -1) {
                insert_point = new_levels.size();
                new_levels.extend(A, flat_positional_dims);
--- a/torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp
+++ b/torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp
@ -69,7 +69,7 @@ std::string jsonStrEscape(const std::string& str) {
      ostream << "\\r";
    } else if (ch == '\t') {
      ostream << "\\t";
-    } else if ('\x00' <= ch && ch <= '\x1f') {
+    } else if (static_cast<unsigned char>(ch) <= 0x1f) { // keep bytes >= 0x80 raw
      ostream << "\\u" << std::hex << std::setw(4) << std::setfill('0')
              << static_cast<int>(ch);
    } else {
--- a/torch/csrc/lazy/core/ir_dump_util.cpp
+++ b/torch/csrc/lazy/core/ir_dump_util.cpp
@ -43,13 +43,13 @@ std::optional<AttrTag> ParseAttrTag(
  }

  std::string::size_type vpos = match[1].second - node_string.begin() + 1;
-  char nested_open = -1;
-  char nested_close = -1;
+  std::optional<char> nested_open;
+  std::optional<char> nested_close;
  size_t nest_count = 1;
  AttrTag tag;
  tag.name = match[1].str();
  for (pos = vpos; pos < node_string.size(); ++pos) {
-    if (nested_open < 0) {
+    if (!nested_open.has_value()) {
      if (SkipTagSeparator(node_string, pos) != pos) {
        break;
      }
@ -72,7 +72,8 @@ std::optional<AttrTag> ParseAttrTag(
      --nest_count;
      if (nest_count == 0) {
        nest_count = 1;
-        nested_open = nested_close = -1;
+        nested_open.reset();
+        nested_close.reset();
      }
    } else if (node_string[pos] == nested_open) {
      ++nest_count;
--- a/torch/csrc/profiler/standalone/execution_trace_observer.cpp
+++ b/torch/csrc/profiler/standalone/execution_trace_observer.cpp
@ -669,7 +669,7 @@ static std::string json_str_escape(const std::string& str) {
      ostream << "\\r";
    } else if (ch == '\t') {
      ostream << "\\t";
-    } else if ('\x00' <= ch && ch <= '\x1f') {
+    } else if (static_cast<unsigned char>(ch) <= 0x1f) { // keep bytes >= 0x80 raw
      ostream << "\\u" << std::hex << std::setw(4) << std::setfill('0')
              << static_cast<int>(ch);
    } else {