[Clang-tidy header][24/N] Fix clang-tidy warnings on c10/cuda/*.{cpp,h} (#120781)

This PR begins cleaning up the clang-tidy warnings in the code under c10/cuda/.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/120781
Approved by: https://github.com/ezyang
Author: cyy
Date: 2024-03-15 05:03:22 +00:00
Committed by: PyTorch MergeBot
Parent: e4fda049c2
Commit: fb10e13000

6 changed files with 54 additions and 39 deletions

c10/cuda/CUDAAllocatorConfig.cpp

@@ -67,7 +67,7 @@ void CUDAAllocatorConfig::consumeToken(
     size_t i,
     const char c) {
   TORCH_CHECK(
-      i < config.size() && config[i].compare(std::string(1, c)) == 0,
+      i < config.size() && config[i] == std::string(1, c),
       "Error parsing CachingAllocator settings, expected ",
       c,
       "");
@@ -77,15 +77,16 @@ size_t CUDAAllocatorConfig::parseMaxSplitSize(
     const std::vector<std::string>& config,
     size_t i) {
   consumeToken(config, ++i, ':');
+  constexpr int mb = 1024 * 1024;
   if (++i < config.size()) {
     size_t val1 = stoi(config[i]);
     TORCH_CHECK(
-        val1 > kLargeBuffer / (1024 * 1024),
+        val1 > kLargeBuffer / mb,
         "CachingAllocator option max_split_size_mb too small, must be > ",
-        kLargeBuffer / (1024 * 1024),
+        kLargeBuffer / mb,
         "");
-    val1 = std::max(val1, kLargeBuffer / (1024 * 1024));
-    val1 = std::min(val1, (std::numeric_limits<size_t>::max() / (1024 * 1024)));
+    val1 = std::max(val1, kLargeBuffer / mb);
+    val1 = std::min(val1, (std::numeric_limits<size_t>::max() / mb));
     m_max_split_size = val1 * 1024 * 1024;
   } else {
     TORCH_CHECK(false, "Error, expecting max_split_size_mb value", "");
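
The new `mb` constant also makes the two clamps easier to read: the lower clamp is a defensive restatement of the TORCH_CHECK above it, and the upper clamp guarantees the later `val1 * mb` multiplication cannot overflow size_t. A standalone sketch of the same arithmetic; the 20 MiB value for `kLargeBuffer` is our assumption about c10's constant, not something stated in this diff:

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <limits>

    int main() {
      constexpr size_t kLargeBuffer = 20971520; // 20 MiB, assumed value
      constexpr size_t mb = 1024 * 1024;
      size_t val1 = 5000; // user passed max_split_size_mb:5000
      val1 = std::max(val1, kLargeBuffer / mb); // lower bound: 20 (MB)
      // Upper bound keeps the final multiplication from overflowing.
      val1 = std::min(val1, std::numeric_limits<size_t>::max() / mb);
      std::cout << val1 * mb << " bytes\n"; // 5242880000
      return 0;
    }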
@@ -118,9 +119,9 @@ size_t CUDAAllocatorConfig::parseRoundUpPower2Divisions(
   bool first_value = true;
   if (++i < config.size()) {
-    if (config[i].compare("[") == 0) {
+    if (std::string_view(config[i]) == "[") {
       size_t last_index = 0;
-      while (++i < config.size() && config[i].compare("]") != 0) {
+      while (++i < config.size() && std::string_view(config[i]) != "]") {
         const std::string& val1 = config[i];
         size_t val2 = 0;
@@ -136,7 +137,7 @@ size_t CUDAAllocatorConfig::parseRoundUpPower2Divisions(
             "For roundups, the divisons has to be power of 2 ",
             "");
-        if (val1.compare(">") == 0) {
+        if (std::string_view(val1) == ">") {
           std::fill(
               std::next(
                   m_roundup_power2_divisions.begin(),
@@ -171,7 +172,7 @@ size_t CUDAAllocatorConfig::parseRoundUpPower2Divisions(
         last_index = index;
       }
-      if (config[i + 1].compare("]") != 0) {
+      if (std::string_view(config[i + 1]) != "]") {
         consumeToken(config, ++i, ',');
       }
     }
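
The `std::string_view` spelling silences the same readability-string-compare diagnostic while keeping the comparison allocation-free: a view is a non-owning window over the string's existing buffer. A minimal sketch (ours, not from the patch):

    #include <cassert>
    #include <string>
    #include <string_view>

    int main() {
      std::string tok = "[";
      std::string_view view(tok); // non-owning, no copy
      assert(view == "[");        // compares length first, then characters
      assert(view != "]");
      return 0;
    }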
@@ -253,51 +254,61 @@ void CUDAAllocatorConfig::parseArgs(const char* env) {
   lexArgs(env, config);
   for (size_t i = 0; i < config.size(); i++) {
-    if (config[i].compare("max_split_size_mb") == 0) {
+    std::string_view config_item_view(config[i]);
+    if (config_item_view == "max_split_size_mb") {
       i = parseMaxSplitSize(config, i);
       used_native_specific_option = true;
-    } else if (config[i].compare("garbage_collection_threshold") == 0) {
+    } else if (config_item_view == "garbage_collection_threshold") {
       i = parseGarbageCollectionThreshold(config, i);
       used_native_specific_option = true;
-    } else if (config[i].compare("roundup_power2_divisions") == 0) {
+    } else if (config_item_view == "roundup_power2_divisions") {
       i = parseRoundUpPower2Divisions(config, i);
       used_native_specific_option = true;
-    } else if (config[i].compare("backend") == 0) {
+    } else if (config_item_view == "backend") {
       i = parseAllocatorConfig(config, i, used_cudaMallocAsync);
-    } else if (config[i] == "expandable_segments") {
+    } else if (config_item_view == "expandable_segments") {
       used_native_specific_option = true;
       consumeToken(config, ++i, ':');
       ++i;
       TORCH_CHECK(
-          i < config.size() && (config[i] == "True" || config[i] == "False"),
+          i < config.size() &&
+              (std::string_view(config[i]) == "True" ||
+               std::string_view(config[i]) == "False"),
           "Expected a single True/False argument for expandable_segments");
-      m_expandable_segments = (config[i] == "True");
+      config_item_view = config[i];
+      m_expandable_segments = (config_item_view == "True");
     } else if (
         // ROCm build's hipify step will change "cuda" to "hip", but for ease of
         // use, accept both. We must break up the string to prevent hipify here.
-        config[i].compare("release_lock_on_hipmalloc") == 0 ||
-        config[i].compare("release_lock_on_c"
-                          "udamalloc") == 0) {
+        config_item_view == "release_lock_on_hipmalloc" ||
+        config_item_view ==
+            "release_lock_on_c"
+            "udamalloc") {
       used_native_specific_option = true;
       consumeToken(config, ++i, ':');
       ++i;
       TORCH_CHECK(
-          i < config.size() && (config[i] == "True" || config[i] == "False"),
+          i < config.size() &&
+              (std::string_view(config[i]) == "True" ||
+               std::string_view(config[i]) == "False"),
           "Expected a single True/False argument for release_lock_on_cudamalloc");
-      m_release_lock_on_cudamalloc = (config[i] == "True");
+      config_item_view = config[i];
+      m_release_lock_on_cudamalloc = (config_item_view == "True");
     } else if (
         // ROCm build's hipify step will change "cuda" to "hip", but for ease of
         // use, accept both. We must break up the string to prevent hipify here.
-        config[i].compare("pinned_use_hip_host_register") == 0 ||
-        config[i].compare("pinned_use_c"
-                          "uda_host_register") == 0) {
+        config_item_view == "pinned_use_hip_host_register" ||
+        config_item_view ==
+            "pinned_use_c"
+            "uda_host_register") {
       i = parsePinnedUseCudaHostRegister(config, i);
       used_native_specific_option = true;
-    } else if (config[i].compare("pinned_num_register_threads") == 0) {
+    } else if (config_item_view == "pinned_num_register_threads") {
      i = parsePinnedNumRegisterThreads(config, i);
       used_native_specific_option = true;
     } else {
-      TORCH_CHECK(false, "Unrecognized CachingAllocator option: ", config[i]);
+      TORCH_CHECK(
+          false, "Unrecognized CachingAllocator option: ", config_item_view);
     }
     if (i + 1 < config.size()) {
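
The split string literals above are deliberate: ROCm's hipify pass textually rewrites "cuda" to "hip", and adjacent string literals are pasted together during translation, so the full option name never appears as a single "cuda..." token in the source. Our illustration of the trick (safe to write whole here, outside the hipified tree):

    #include <cassert>
    #include <string_view>

    int main() {
      // The compiler concatenates adjacent literals, so the spliced
      // spelling compares equal to the full option name.
      constexpr std::string_view spliced =
          "release_lock_on_c"
          "udamalloc";
      assert(spliced == "release_lock_on_cudamalloc");
      return 0;
    }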

c10/cuda/CUDAAllocatorConfig.h

@@ -2,13 +2,13 @@
 #include <c10/cuda/CUDAMacros.h>
 #include <c10/util/Exception.h>
-#include <cuda_runtime_api.h>
 
 #include <atomic>
 #include <cstddef>
+#include <cstdlib>
 #include <mutex>
 #include <string>
 #include <vector>
 
 namespace c10::cuda::CUDACachingAllocator {

c10/cuda/CUDAGuard.h

@@ -6,8 +6,6 @@
 #include <c10/cuda/CUDAMacros.h>
 #include <c10/cuda/impl/CUDAGuardImpl.h>
 
-#include <cstddef>
-
 namespace c10::cuda {
 
 // This code is kind of boilerplatey. See Note [Whither the DeviceGuard

c10/cuda/CUDAMiscFunctions.cpp

@@ -1,5 +1,5 @@
 #include <c10/cuda/CUDAMiscFunctions.h>
-#include <stdlib.h>
+#include <cstdlib>
 
 namespace c10::cuda {
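
This is the modernize-deprecated-headers fix: the C++ header `<cstdlib>` guarantees the declarations in namespace std (the global-namespace aliases from `<stdlib.h>` are only optional), so std-qualified calls become portable. A minimal sketch (ours):

    #include <cstdlib>
    #include <iostream>

    int main() {
      // std::getenv is guaranteed by <cstdlib>.
      if (const char* home = std::getenv("HOME")) {
        std::cout << home << '\n';
      }
      return 0;
    }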

c10/cuda/CUDAStream.cpp

@@ -6,6 +6,7 @@
 #include <c10/util/Exception.h>
 #include <c10/util/irange.h>
 
+#include <array>
 #include <atomic>
 #include <cstdint>
@@ -38,14 +39,19 @@ static int max_stream_priorities;
 // the destruction.
 #if !defined(USE_ROCM)
 // CUDA-only: used to initializes the stream pools (once)
-static c10::once_flag device_flags[C10_COMPILE_TIME_MAX_GPUS];
+static std::array<c10::once_flag, C10_COMPILE_TIME_MAX_GPUS> device_flags;
 #endif
-static std::atomic<uint32_t>
-    priority_counters[c10::cuda::max_compile_time_stream_priorities]
-                     [C10_COMPILE_TIME_MAX_GPUS];
+static std::array<
+    std::array<std::atomic<uint32_t>, C10_COMPILE_TIME_MAX_GPUS>,
+    c10::cuda::max_compile_time_stream_priorities>
+    priority_counters;
-static cudaStream_t streams[c10::cuda::max_compile_time_stream_priorities]
-                           [C10_COMPILE_TIME_MAX_GPUS][kStreamsPerPool];
+static std::array<
+    std::array<
+        std::array<cudaStream_t, kStreamsPerPool>,
+        C10_COMPILE_TIME_MAX_GPUS>,
+    c10::cuda::max_compile_time_stream_priorities>
+    streams;
 #ifdef USE_ROCM
 static c10::once_flag
     stream_flags[c10::cuda::max_compile_time_stream_priorities]
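
The std::array rewrite addresses the *-avoid-c-arrays checks. `std::array<std::array<T, inner>, outer>` mirrors `T[outer][inner]`: same layout, same indexing, zero overhead, but a real type that can be copied, compared, and bounds-checked. Our side-by-side with made-up dimensions:

    #include <array>
    #include <cstdint>

    constexpr int kPriorities = 4;
    constexpr int kMaxGpus = 16;

    // Before: a builtin 2-D array.
    static uint32_t counters_c[kPriorities][kMaxGpus];

    // After: nested std::array with the same layout.
    static std::array<std::array<uint32_t, kMaxGpus>, kPriorities> counters;

    int main() {
      counters_c[1][2] = 7;
      counters[1][2] = 7; // identical indexing syntax
      return counters[1][2] == counters_c[1][2] ? 0 : 1;
    }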
@@ -159,6 +165,7 @@ StreamId makeStreamId(StreamIdType st, size_t si) {
 }
 
 // Thread-local current streams
+// NOLINTNEXTLINE(*-arrays)
 static thread_local std::unique_ptr<StreamId[]> current_streams = nullptr;
 
 // Populates global values.
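
Where the array cannot become a std::array (here the element count is only known at runtime), the warning is suppressed instead of rewritten: `*-arrays` is a clang-tidy glob that matches both modernize-avoid-c-arrays and cppcoreguidelines-avoid-c-arrays. Our illustration of the pattern:

    #include <memory>

    std::unique_ptr<int[]> make_buffer(int n) {
      // Size is a runtime value, so std::array does not apply here.
      // NOLINTNEXTLINE(*-arrays)
      return std::make_unique<int[]>(n);
    }

    int main() {
      auto buf = make_buffer(8);
      buf[0] = 1;
      return buf[0] - 1;
    }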
@@ -227,6 +234,7 @@ static void initCUDAStreamsOnce() {
   }
 
   // Inits current streams (thread local) to default streams
+  // NOLINTNEXTLINE(*-arrays)
   current_streams = std::make_unique<StreamId[]>(num_gpus);
   for (const auto i : c10::irange(num_gpus)) {
     current_streams[i] = makeStreamId(StreamIdType::DEFAULT, 0);
@@ -274,6 +282,7 @@ cudaStream_t CUDAStream::stream() const {
         " official API like c10::cuda::getStreamFromPool() to get a new stream.");
     return nullptr;
   } else if (st.isExt()) {
+    // NOLINTNEXTLINE(performance-no-int-to-ptr)
     return reinterpret_cast<cudaStream_t>(stream_id);
   } else {
     auto streamType = st.getStreamType();
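
performance-no-int-to-ptr flags integer-to-pointer casts because they defeat alias analysis; here the cast is the whole point, since an externally supplied stream handle was stashed in the integer StreamId, so the check is suppressed rather than the code restructured. Our sketch of the round-trip pattern:

    #include <cassert>
    #include <cstdint>

    int main() {
      int object = 42;
      // Pointer stored as an opaque integer id.
      auto id = reinterpret_cast<std::uintptr_t>(&object);
      // Cast back is intentional, so the diagnostic is silenced.
      // NOLINTNEXTLINE(performance-no-int-to-ptr)
      auto* back = reinterpret_cast<int*>(id);
      assert(back == &object);
      return 0;
    }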

c10/cuda/CUDAStream.h

@@ -1,8 +1,5 @@
 #pragma once
 
-#include <cstdint>
-#include <utility>
-
 #include <cuda_runtime_api.h>
 
 #include <c10/core/DeviceGuard.h>