From 2903cf0ad82bbd05583291d950ef76e5c08afb76 Mon Sep 17 00:00:00 2001
From: cyy
Date: Thu, 12 Dec 2024 04:02:12 +0000
Subject: [PATCH] Re-enable some C++ warnings (#142332)

It re-enables some C++ warnings, since the code base is fairly clean.
Meanwhile, -Wextra-semi is disabled on CUDA-generated code, since there
is no way to fix those warnings without the cooperation of the CUDA team.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142332
Approved by: https://github.com/albanD, https://github.com/eqy
---
 CMakeLists.txt                                |  4 +-
 aten/src/ATen/cuda/detail/CUDAHooks.cpp       |  2 +-
 aten/src/ATen/cuda/detail/LazyNVRTC.cpp       | 50 +++++++++----------
 aten/src/ATen/test/vec_test_all_types.cpp     | 50 +++++++++----------
 benchmarks/static_runtime/test_utils.cc       |  2 +-
 caffe2/utils/threadpool/WorkersPool.h         |  1 -
 cmake/public/utils.cmake                      |  3 ++
 functorch/csrc/dim/dim.cpp                    |  2 -
 .../c10d/control_plane/WorkerServer.cpp       |  2 +-
 torch/csrc/lazy/core/ir_dump_util.cpp         |  9 ++--
 .../standalone/execution_trace_observer.cpp   |  2 +-
 11 files changed, 62 insertions(+), 65 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c8af5f00b5c1..c6c2d3366432 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -997,8 +997,6 @@ if(NOT MSVC)
   append_cxx_flag_if_supported("-Wnarrowing" CMAKE_CXX_FLAGS)
   append_cxx_flag_if_supported("-Wno-missing-field-initializers"
                                CMAKE_CXX_FLAGS)
-  append_cxx_flag_if_supported("-Wno-type-limits" CMAKE_CXX_FLAGS)
-  append_cxx_flag_if_supported("-Wno-array-bounds" CMAKE_CXX_FLAGS)
   append_cxx_flag_if_supported("-Wno-unknown-pragmas" CMAKE_CXX_FLAGS)
   append_cxx_flag_if_supported("-Wno-unused-parameter" CMAKE_CXX_FLAGS)
   append_cxx_flag_if_supported("-Wno-strict-overflow" CMAKE_CXX_FLAGS)
@@ -1076,7 +1074,6 @@ if(NOT MSVC)
       set(WERROR FALSE)
     endif()
   endif()
-  append_cxx_flag_if_supported("-Wno-unused-but-set-variable" CMAKE_CXX_FLAGS)
   append_cxx_flag_if_supported("-Wno-maybe-uninitialized" CMAKE_CXX_FLAGS)
   append_cxx_flag_if_supported("-fstandalone-debug" CMAKE_CXX_FLAGS_DEBUG)
   if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64" AND CMAKE_CXX_COMPILER_ID MATCHES "GNU")
@@ -1093,6 +1090,7 @@ if(NOT MSVC)
   append_cxx_flag_if_supported("-fno-trapping-math" CMAKE_CXX_FLAGS)
   append_cxx_flag_if_supported("-Werror=format" CMAKE_CXX_FLAGS)
   if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13)
+    append_cxx_flag_if_supported("-Wno-dangling-reference" CMAKE_CXX_FLAGS)
     append_cxx_flag_if_supported("-Wno-error=dangling-reference" CMAKE_CXX_FLAGS)
     append_cxx_flag_if_supported("-Wno-error=redundant-move" CMAKE_CXX_FLAGS)
   endif()
diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp
index a6439f7c3e57..d14ea9d505a2 100644
--- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp
+++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp
@@ -466,6 +466,6 @@ void CUDAHooks::deviceSynchronize(DeviceIndex device_index) const {
 using at::CUDAHooksRegistry;
 using at::RegistererCUDAHooksRegistry;

-REGISTER_CUDA_HOOKS(CUDAHooks);
+REGISTER_CUDA_HOOKS(CUDAHooks)

 } // namespace at::cuda::detail
diff --git a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
index 75c503d48d51..50833167a8f1 100644
--- a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
+++ b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
@@ -127,8 +127,8 @@ RETTYPE NAME(ARG1 a1, ARG2 a2, ARG3 a3, ARG4 a4) {
 #define NVRTC_STUB2(NAME, A1, A2) _STUB_2(NVRTC, NAME, nvrtcResult, A1, A2)
 #define NVRTC_STUB3(NAME, A1, A2, A3) _STUB_3(NVRTC, NAME, nvrtcResult, A1, A2, A3)

-NVRTC_STUB2(nvrtcVersion, int*, int*);
-NVRTC_STUB2(nvrtcAddNameExpression, nvrtcProgram, const char * const);
+NVRTC_STUB2(nvrtcVersion, int*, int*)
+NVRTC_STUB2(nvrtcAddNameExpression, nvrtcProgram, const char * const)

 nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
                                const char *src,
@@ -143,32 +143,32 @@ nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
   return fn(prog, src, name, numHeaders, headers, includeNames);
 }

-NVRTC_STUB1(nvrtcDestroyProgram, nvrtcProgram *);
-NVRTC_STUB2(nvrtcGetPTXSize, nvrtcProgram, size_t *);
-NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *);
+NVRTC_STUB1(nvrtcDestroyProgram, nvrtcProgram *)
+NVRTC_STUB2(nvrtcGetPTXSize, nvrtcProgram, size_t *)
+NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *)
 #if defined(CUDA_VERSION) && CUDA_VERSION >= 11010
-NVRTC_STUB2(nvrtcGetCUBINSize, nvrtcProgram, size_t *);
-NVRTC_STUB2(nvrtcGetCUBIN, nvrtcProgram, char *);
+NVRTC_STUB2(nvrtcGetCUBINSize, nvrtcProgram, size_t *)
+NVRTC_STUB2(nvrtcGetCUBIN, nvrtcProgram, char *)
 #endif
-NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char * const *);
-_STUB_1(NVRTC, nvrtcGetErrorString, const char *, nvrtcResult);
-NVRTC_STUB2(nvrtcGetProgramLogSize,nvrtcProgram, size_t*);
-NVRTC_STUB2(nvrtcGetProgramLog, nvrtcProgram, char *);
-NVRTC_STUB3(nvrtcGetLoweredName, nvrtcProgram, const char *, const char **);
+NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char * const *)
+_STUB_1(NVRTC, nvrtcGetErrorString, const char *, nvrtcResult)
+NVRTC_STUB2(nvrtcGetProgramLogSize,nvrtcProgram, size_t*)
+NVRTC_STUB2(nvrtcGetProgramLog, nvrtcProgram, char *)
+NVRTC_STUB3(nvrtcGetLoweredName, nvrtcProgram, const char *, const char **)

-CUDA_STUB2(cuModuleLoadData, CUmodule *, const void *);
-CUDA_STUB3(cuModuleGetFunction, CUfunction *, CUmodule, const char *);
-CUDA_STUB4(cuOccupancyMaxActiveBlocksPerMultiprocessor, int *, CUfunction, int, size_t);
-CUDA_STUB2(cuGetErrorString, CUresult, const char **);
-CUDA_STUB1(cuCtxGetCurrent, CUcontext *);
-CUDA_STUB1(cuCtxSetCurrent, CUcontext);
-CUDA_STUB1(cuModuleUnload, CUmodule);
-CUDA_STUB3(cuDevicePrimaryCtxGetState, CUdevice, unsigned int *, int *);
-CUDA_STUB2(cuDevicePrimaryCtxRetain, CUcontext *, CUdevice);
-CUDA_STUB4(cuLinkCreate, unsigned int, CUjit_option *, void **, CUlinkState *);
-CUDA_STUB3(cuLinkComplete, CUlinkState, void **, size_t *);
-CUDA_STUB3(cuFuncSetAttribute, CUfunction, CUfunction_attribute, int);
-CUDA_STUB3(cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction);
+CUDA_STUB2(cuModuleLoadData, CUmodule *, const void *)
+CUDA_STUB3(cuModuleGetFunction, CUfunction *, CUmodule, const char *)
+CUDA_STUB4(cuOccupancyMaxActiveBlocksPerMultiprocessor, int *, CUfunction, int, size_t)
+CUDA_STUB2(cuGetErrorString, CUresult, const char **)
+CUDA_STUB1(cuCtxGetCurrent, CUcontext *)
+CUDA_STUB1(cuCtxSetCurrent, CUcontext)
+CUDA_STUB1(cuModuleUnload, CUmodule)
+CUDA_STUB3(cuDevicePrimaryCtxGetState, CUdevice, unsigned int *, int *)
+CUDA_STUB2(cuDevicePrimaryCtxRetain, CUcontext *, CUdevice)
+CUDA_STUB4(cuLinkCreate, unsigned int, CUjit_option *, void **, CUlinkState *)
+CUDA_STUB3(cuLinkComplete, CUlinkState, void **, size_t *)
+CUDA_STUB3(cuFuncSetAttribute, CUfunction, CUfunction_attribute, int)
+CUDA_STUB3(cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction)

 #if defined(CUDA_VERSION) && CUDA_VERSION >= 12000
 CUresult CUDAAPI
diff --git a/aten/src/ATen/test/vec_test_all_types.cpp b/aten/src/ATen/test/vec_test_all_types.cpp
index b08bbded001c..9ef0b1293025 100644
--- a/aten/src/ATen/test/vec_test_all_types.cpp
+++ b/aten/src/ATen/test/vec_test_all_types.cpp
@@ -561,8 +561,8 @@ namespace {
         bool expected = std::isnan(val);
         CACHE_ALIGN c10::Half actual_vals[vHalf::size()];
         vHalf(val).isnan().store(actual_vals);
-        for (int jj = 0; jj < vHalf::size(); ++jj) {
-          EXPECT_EQ(expected, c10::bit_cast<uint16_t>(actual_vals[jj]) != 0) << "fp16 isnan failure for bit pattern " << std::hex << ii << std::dec;
+        for (auto actual_val : actual_vals) {
+          EXPECT_EQ(expected, c10::bit_cast<uint16_t>(actual_val) != 0) << "fp16 isnan failure for bit pattern " << std::hex << ii << std::dec;
         }
       }
     }
@@ -1046,7 +1046,7 @@ namespace {
                 mask[idx] = (VT)0;
             }
             else {
-                int64_t hex_mask = 0xFFFFFFFFFFFFFFFF;
+                uint64_t hex_mask = 0xFFFFFFFFFFFFFFFF;
                 std::memcpy(&mask[idx], &hex_mask, sizeof(VT));
             }
             if (!test_blendv(expected_val, a, b, mask)) return false;
@@ -1315,8 +1315,8 @@ namespace {
         ValueGen<float> generator_sc(1.f, 15.f, seed.add(2));
         for ([[maybe_unused]] const auto i : c10::irange(trials)) {
             float scale = generator_sc.get();
-            int32_t zero_point_val = generator.get();
-            float scale_zp_premul = -(scale * zero_point_val);
+            auto zero_point_val = generator.get();
+            float scale_zp_premul = -(scale * static_cast<float>(zero_point_val));
             vfloat vf_scale = vfloat{scale};
             vfloat vf_zp = vfloat{static_cast<float>(zero_point_val)};
             vfloat vf_scale_zp = vfloat{scale_zp_premul};
@@ -1657,18 +1657,16 @@ namespace {
     TEST(HalfConversionTest, HalfFloat) {
       float f32s[100];
      for (const auto i : c10::irange(100)) {
-        f32s[i] = i + 0.3;
+        f32s[i] = static_cast<float>(i + 0.3);
       }
-      uint16_t u16;
-      float x;
       for (const auto i : c10::irange(100)) {
 #if (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && \
     !defined(__APPLE__)
-        u16 = at::vec::float2half_scalar(f32s[i]);
-        x = at::vec::half2float_scalar(u16);
+        uint16_t u16 = at::vec::float2half_scalar(f32s[i]);
+        float x = at::vec::half2float_scalar(u16);
 #else
-        u16 = c10::detail::fp16_ieee_from_fp32_value(f32s[i]);
-        x = c10::detail::fp16_ieee_to_fp32_value(u16);
+        uint16_t u16 = c10::detail::fp16_ieee_from_fp32_value(f32s[i]);
+        float x = c10::detail::fp16_ieee_to_fp32_value(u16);
 #endif

       EXPECT_EQ(u16, c10::detail::fp16_ieee_from_fp32_value(f32s[i]))
@@ -1697,7 +1695,7 @@ namespace {
     VT v_pinf = static_cast<VT>(*(float *)&infBits);
     values[index] = v_pinf;
     auto vec_pinf = vec::loadu(values);
-    int negInfBits = 0xFF800000;
+    unsigned int negInfBits = 0xFF800000;
     VT v_ninf = static_cast<VT>(*(float *)&negInfBits);
     values[index] = v_ninf;
     auto vec_ninf = vec::loadu(values);
@@ -1779,8 +1777,8 @@ namespace {
       const auto expected = static_cast<float>(val);
       CACHE_ALIGN float actual_vals[vfloat::size()];
       at::vec::convert<float>(vBFloat16(val)).store(actual_vals);
-      for (int jj = 0; jj < vfloat::size(); ++jj) {
-        EXPECT_EQ(c10::bit_cast<uint32_t>(expected), c10::bit_cast<uint32_t>(actual_vals[jj]))
+      for (auto actual_val : actual_vals) {
+        EXPECT_EQ(c10::bit_cast<uint32_t>(expected), c10::bit_cast<uint32_t>(actual_val))
            << "convert-to-float failure for bf16 bit pattern " << std::hex << ii << std::dec;
       }
     }
@@ -1792,20 +1790,20 @@

 #define TEST_MASK_LOAD(dst_t, mask_t, mask_n) \
   do { \
-    CACHE_ALIGN dst_t x[mask_n * size]; \
-    CACHE_ALIGN dst_t y[mask_n * size]; \
-    CACHE_ALIGN dst_t ref[mask_n * size]; \
-    auto seed = TestSeed(); \
-    dst_t generator_min = std::numeric_limits<dst_t>::is_signed ? dst_t(-100) : dst_t(0); \
-    ValueGen<dst_t> generator(generator_min, dst_t(100), seed); \
-    for (const auto i : c10::irange(mask_n * size)) { \
-      x[i] = generator.get(); \
-    } \
-    auto vec_mask = generate_vec_mask<mask_t, mask_n>(seed); \
     constexpr int dst_size = at::vec::Vectorized<dst_t>::size(); \
     constexpr int dst_n = mask_n * size / dst_size; \
-    constexpr int rnd_n = (mask_n * size + dst_size - 1) / dst_size; \
     if constexpr(dst_n * dst_size >= mask_n * size) { \
+      CACHE_ALIGN dst_t x[mask_n * size]; \
+      CACHE_ALIGN dst_t y[mask_n * size]; \
+      CACHE_ALIGN dst_t ref[mask_n * size]; \
+      auto seed = TestSeed(); \
+      dst_t generator_min = std::numeric_limits<dst_t>::is_signed ? dst_t(-100) : dst_t(0); \
+      ValueGen<dst_t> generator(generator_min, dst_t(100), seed); \
+      for (const auto i : c10::irange(mask_n * size)) { \
+        x[i] = generator.get(); \
+      } \
+      auto vec_mask = generate_vec_mask<mask_t, mask_n>(seed); \
+      constexpr int rnd_n = (mask_n * size + dst_size - 1) / dst_size; \
       auto x_vec = vec_mask.template loadu<dst_t, rnd_n>(x); \
       x_vec.store(y); \
       for (const auto i : c10::irange(mask_n * size)) { \
diff --git a/benchmarks/static_runtime/test_utils.cc b/benchmarks/static_runtime/test_utils.cc
index d7f49c7171cb..cf6f199125d7 100644
--- a/benchmarks/static_runtime/test_utils.cc
+++ b/benchmarks/static_runtime/test_utils.cc
@@ -353,7 +353,7 @@ void testStaticRuntime(

       size_t new_managed_bytes =
           memory_planner ? memory_planner->total_managed() : 0;
-      if (check_resize && new_managed_bytes >= 0) {
+      if (check_resize) {
         EXPECT_GE(new_managed_bytes, managed_bytes);
       }

diff --git a/caffe2/utils/threadpool/WorkersPool.h b/caffe2/utils/threadpool/WorkersPool.h
index 2f964712b158..5de6b1213e84 100644
--- a/caffe2/utils/threadpool/WorkersPool.h
+++ b/caffe2/utils/threadpool/WorkersPool.h
@@ -188,7 +188,6 @@ class BlockingCounter {
   // returns false.
   bool DecrementCount() {
     const auto count_value = count_.fetch_sub(1, std::memory_order_relaxed) - 1;
-    TORCH_DCHECK_GE(count_value, 0);
     if (count_value == 0) {
       std::lock_guard<std::mutex> g(mutex_);
       cond_.notify_one();
diff --git a/cmake/public/utils.cmake b/cmake/public/utils.cmake
index 0c8e91ab7cfb..49349fb5fa3d 100644
--- a/cmake/public/utils.cmake
+++ b/cmake/public/utils.cmake
@@ -414,6 +414,9 @@ function(torch_compile_options libname)
     $<$<COMPILE_LANGUAGE:CXX>:${private_compile_options}>)
   if(USE_CUDA)
     foreach(option IN LISTS private_compile_options)
+      if("${option}" STREQUAL "-Wextra-semi")
+        continue()
+      endif()
       target_compile_options(${libname} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler ${option}>)
     endforeach()
   endif()
diff --git a/functorch/csrc/dim/dim.cpp b/functorch/csrc/dim/dim.cpp
index e41ef5f8d68c..33e1c080dabd 100644
--- a/functorch/csrc/dim/dim.cpp
+++ b/functorch/csrc/dim/dim.cpp
@@ -1888,7 +1888,6 @@ static PyObject* order(PyObject *_,
         }
     }

-    int ndim = 0;
     int insert_point = -1;
     Slice<DimEntry> new_levels;
     for (auto l : levels) {
@@ -1896,7 +1895,6 @@ static PyObject* order(PyObject *_,
             continue;
         }
         if (l.is_positional()) {
-            ndim++;
             if (insert_point == -1) {
                 insert_point = new_levels.size();
                 new_levels.extend(A, flat_positional_dims);
diff --git a/torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp b/torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp
index e4a2d301a566..8f1abcdeab44 100644
--- a/torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp
+++ b/torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp
@@ -69,7 +69,7 @@ std::string jsonStrEscape(const std::string& str) {
       ostream << "\\r";
     } else if (ch == '\t') {
       ostream << "\\t";
-    } else if ('\x00' <= ch && ch <= '\x1f') {
+    } else if (ch <= '\x1f') {
       ostream << "\\u" << std::hex << std::setw(4) << std::setfill('0')
               << static_cast<int>(ch);
     } else {
diff --git a/torch/csrc/lazy/core/ir_dump_util.cpp b/torch/csrc/lazy/core/ir_dump_util.cpp
index 706bc2fd05ff..3f33c4fce224 100644
--- a/torch/csrc/lazy/core/ir_dump_util.cpp
+++ b/torch/csrc/lazy/core/ir_dump_util.cpp
@@ -43,13 +43,13 @@ std::optional<AttrTag> ParseAttrTag(
   }

   std::string::size_type vpos = match[1].second - node_string.begin() + 1;
-  char nested_open = -1;
-  char nested_close = -1;
+  std::optional<char> nested_open;
+  std::optional<char> nested_close;
   size_t nest_count = 1;
   AttrTag tag;
   tag.name = match[1].str();
   for (pos = vpos; pos < node_string.size(); ++pos) {
-    if (nested_open < 0) {
+    if (!nested_open.has_value()) {
       if (SkipTagSeparator(node_string, pos) != pos) {
         break;
       }
@@ -72,7 +72,8 @@ std::optional<AttrTag> ParseAttrTag(
       --nest_count;
       if (nest_count == 0) {
         nest_count = 1;
-        nested_open = nested_close = -1;
+        nested_open.reset();
+        nested_close.reset();
       }
     } else if (node_string[pos] == nested_open) {
       ++nest_count;
diff --git a/torch/csrc/profiler/standalone/execution_trace_observer.cpp b/torch/csrc/profiler/standalone/execution_trace_observer.cpp
index 8602d9f513b9..4d1fc6285706 100644
--- a/torch/csrc/profiler/standalone/execution_trace_observer.cpp
+++ b/torch/csrc/profiler/standalone/execution_trace_observer.cpp
@@ -669,7 +669,7 @@ static std::string json_str_escape(const std::string& str) {
       ostream << "\\r";
     } else if (ch == '\t') {
       ostream << "\\t";
-    } else if ('\x00' <= ch && ch <= '\x1f') {
+    } else if (ch <= '\x1f') {
       ostream << "\\u" << std::hex << std::setw(4) << std::setfill('0')
               << static_cast<int>(ch);
     } else {