From b72ddbab60d3de21c1c83dd444ff540b7220044e Mon Sep 17 00:00:00 2001
From: cyy
Date: Thu, 18 Jan 2024 08:15:50 +0000
Subject: [PATCH] [Clang-tidy header][15/N] Enable clang-tidy on headers in c10/cuda and c10/mobile (#116602)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/116602
Approved by: https://github.com/ezyang
---
 .lintrunner.toml                   |  6 +++---
 c10/cuda/CUDAAllocatorConfig.cpp   |  2 ++
 c10/cuda/CUDAAllocatorConfig.h     | 11 +++++------
 c10/cuda/CUDACachingAllocator.cpp  |  3 ---
 c10/cuda/CUDACachingAllocator.h    | 29 +++++++++++++++++------------
 c10/cuda/CUDADeviceAssertionHost.h | 26 +++++++++++++++++---------
 c10/cuda/CUDAGraphsC10Utils.h      |  6 +++---
 c10/cuda/impl/CUDAGuardImpl.h      | 19 ++++++++++++-------
 c10/mobile/CPUCachingAllocator.h   |  2 ++
 c10/mobile/CPUProfilingAllocator.h |  3 +++
 10 files changed, 64 insertions(+), 43 deletions(-)

diff --git a/.lintrunner.toml b/.lintrunner.toml
index fa3711916c86..9d558a0140ee 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -246,8 +246,7 @@ code = 'CLANGTIDY'
 include_patterns = [
     'aten/src/ATen/core/*.cpp',
     'c10/**/*.cpp',
-    'c10/core/**/*.h',
-    'c10/util/**/*.h',
+    'c10/**/*.h',
     # Enable coverage of headers in torch/csrc and excluding sub-directories for now.
     'torch/csrc/*.h',
     'torch/csrc/**/*.cpp',
@@ -258,10 +257,10 @@ exclude_patterns = [
     # CUDA files are also excluded.
     '**/fb/**',
     '**/*pb.h',
-    '**/*CUDA*',
     '**/cuda/*pp',
     '**/*XPU*',
     '**/xpu/*pp',
+    'c10/cuda/CUDAAlgorithm.h',
     'c10/util/complex_math.h',
     'c10/util/complex_utils.h',
     'c10/util/flat_hash_map.h',
@@ -272,6 +271,7 @@ exclude_patterns = [
     'c10/util/SmallVector.h',
     'c10/util/win32-headers.h',
     'c10/util/*inl.h',
+    'c10/test/**/*.h',
     'aten/src/ATen/core/TensorImpl_test.cpp',
     'third_party/**/*',
     'torch/csrc/api/**',
diff --git a/c10/cuda/CUDAAllocatorConfig.cpp b/c10/cuda/CUDAAllocatorConfig.cpp
index 45a1233beb3e..5a709f981b0e 100644
--- a/c10/cuda/CUDAAllocatorConfig.cpp
+++ b/c10/cuda/CUDAAllocatorConfig.cpp
@@ -1,4 +1,6 @@
 #include
+#include
+#include

 #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
 #include
diff --git a/c10/cuda/CUDAAllocatorConfig.h b/c10/cuda/CUDAAllocatorConfig.h
index 68fe160a8a96..3a10926c555f 100644
--- a/c10/cuda/CUDAAllocatorConfig.h
+++ b/c10/cuda/CUDAAllocatorConfig.h
@@ -1,14 +1,13 @@
 #pragma once

-#include
-#include
 #include
 #include
-#include
 #include
 #include
-#include
+#include
+#include
+#include

 namespace c10::cuda::CUDACachingAllocator {
@@ -74,8 +73,8 @@ class C10_CUDA_API CUDAAllocatorConfig {

 private:
  CUDAAllocatorConfig();
-  void lexArgs(const char* env, std::vector& config);
-  void consumeToken(
+  static void lexArgs(const char* env, std::vector& config);
+  static void consumeToken(
      const std::vector& config,
      size_t i,
      const char c);
diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp
index 0b62f61f5e3f..d3e0ebbaf593 100644
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@@ -22,13 +22,10 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
 #include
-#include
-#include
 #include
 #include
 #include
diff --git a/c10/cuda/CUDACachingAllocator.h b/c10/cuda/CUDACachingAllocator.h
index 0706cbaf64ff..c4a84bef304b 100644
--- a/c10/cuda/CUDACachingAllocator.h
+++ b/c10/cuda/CUDACachingAllocator.h
@@ -1,17 +1,22 @@
 #pragma once

 #include
-#include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
-#include
-#include
+#include
+#include
+#include
+#include
+#include
+#include
 #include
+#include

 namespace c10 {
@@ -101,7 +106,7 @@ struct DeviceStats {
   int64_t max_split_size = 0;
 };

-typedef std::shared_ptr (*CreateContextFn)(void);
+typedef std::shared_ptr (*CreateContextFn)();

 // Struct containing info of an allocation block (i.e. a fractional part of a
 // cudaMalloc)..
@@ -123,7 +128,7 @@ struct SegmentInfo {
   int64_t requested_size = 0; // unrounded, actually requested size
   int64_t allocated_size = 0;
   int64_t active_size = 0;
-  cudaStream_t stream = 0;
+  cudaStream_t stream = nullptr;
   bool is_large = false;
   bool is_expandable = false;
   MempoolId_t owner_private_pool_id = {0, 0};
@@ -170,16 +175,16 @@ struct TraceEntry {
         addr_(addr),
         context_(std::move(context)),
         stream_(stream),
-        size_(size) {
+        size_(static_cast(size)) {
     time_.approx_t_ = time;
   }
   Action action_;
   int device_;
   int64_t addr_; // for OOM, this is the amount of free bytes reported by cuda
   std::shared_ptr context_;
-  cudaStream_t stream_;
+  cudaStream_t stream_{};
   int64_t size_;
-  trace_time_ time_;
+  trace_time_ time_{};
 };

 struct SnapshotInfo {
@@ -372,7 +377,7 @@ inline std::shared_ptr getCheckpointState(
 inline CheckpointDelta setCheckpointPoolState(
     int device,
     std::shared_ptr pps) {
-  return get()->setCheckpointPoolState(device, pps);
+  return get()->setCheckpointPoolState(device, std::move(pps));
 }

 // CUDAGraph interactions
@@ -409,11 +414,11 @@ inline bool checkPoolLiveAllocations(
 }

 inline void attachOutOfMemoryObserver(OutOfMemoryObserver observer) {
-  return get()->attachOutOfMemoryObserver(observer);
+  return get()->attachOutOfMemoryObserver(std::move(observer));
 }

 inline void attachAllocatorTraceTracker(AllocatorTraceTracker tracker) {
-  return get()->attachAllocatorTraceTracker(tracker);
+  return get()->attachAllocatorTraceTracker(std::move(tracker));
 }

 inline void releasePool(int device, MempoolId_t mempool_id) {
@@ -421,7 +426,7 @@ inline void releasePool(int device, MempoolId_t mempool_id) {
 }

 // Not part of CUDA_ALLOCATOR_BACKEND_INTERFACE
 inline std::shared_ptr getIpcDevPtr(std::string handle) {
-  return get()->getIpcDevPtr(handle);
+  return get()->getIpcDevPtr(std::move(handle));
 }

 inline std::string name() {
diff --git a/c10/cuda/CUDADeviceAssertionHost.h b/c10/cuda/CUDADeviceAssertionHost.h
index 7a9b070c85e0..a945915c2878 100644
--- a/c10/cuda/CUDADeviceAssertionHost.h
+++ b/c10/cuda/CUDADeviceAssertionHost.h
@@ -2,9 +2,11 @@

 #include

+#include
 #include
 #include
 #include
+#include
 #include

 #ifdef USE_CUDA
@@ -22,19 +24,24 @@ namespace c10::cuda {
 /// Held in managed memory and access by both the CPU and the GPU.
 struct DeviceAssertionData {
   /// Stringification of the assertion
-  char assertion_msg[C10_CUDA_DSA_MAX_STR_LEN];
+  // NOLINTNEXTLINE(*-c-arrays)
+  char assertion_msg[C10_CUDA_DSA_MAX_STR_LEN]{};
   /// File the assertion was in
-  char filename[C10_CUDA_DSA_MAX_STR_LEN];
+  // NOLINTNEXTLINE(*-c-arrays)
+  char filename[C10_CUDA_DSA_MAX_STR_LEN]{};
   /// Name of the function the assertion was in
-  char function_name[C10_CUDA_DSA_MAX_STR_LEN];
+  // NOLINTNEXTLINE(*-c-arrays)
+  char function_name[C10_CUDA_DSA_MAX_STR_LEN]{};
   /// Line number the assertion was at
-  int line_number;
+  int line_number{};
   /// Number uniquely identifying the kernel launch that triggered the assertion
-  uint32_t caller;
+  uint32_t caller{};
   /// block_id of the thread that failed the assertion
-  int32_t block_id[3];
+  // NOLINTNEXTLINE(*-c-arrays)
+  int32_t block_id[3]{};
   /// third_id of the thread that failed the assertion
-  int32_t thread_id[3];
+  // NOLINTNEXTLINE(*-c-arrays)
+  int32_t thread_id[3]{};
 };

 /// Used to hold assertions generated by the device
@@ -42,9 +49,10 @@ struct DeviceAssertionData {
 struct DeviceAssertionsData {
   /// Total number of assertions found; a subset of thse will be recorded
   /// in `assertions`
-  int32_t assertion_count;
+  int32_t assertion_count{};
   /// An array of assertions that will be written to in a race-free manner
-  DeviceAssertionData assertions[C10_CUDA_DSA_ASSERTION_COUNT];
+  // NOLINTNEXTLINE(*-c-arrays)
+  DeviceAssertionData assertions[C10_CUDA_DSA_ASSERTION_COUNT]{};
 };

 /// Use to hold info about kernel launches so that we can run kernels
diff --git a/c10/cuda/CUDAGraphsC10Utils.h b/c10/cuda/CUDAGraphsC10Utils.h
index 7aca8852fe89..79a36d80cead 100644
--- a/c10/cuda/CUDAGraphsC10Utils.h
+++ b/c10/cuda/CUDAGraphsC10Utils.h
@@ -19,8 +19,8 @@ using MempoolId_t = std::pair;
 // that controls the error-checking strictness of a capture.
 #if !defined(USE_ROCM) || ROCM_VERSION >= 50300
 struct C10_CUDA_API CUDAStreamCaptureModeGuard {
-  CUDAStreamCaptureModeGuard(cudaStreamCaptureMode desired) {
-    strictness_ = desired;
+  CUDAStreamCaptureModeGuard(cudaStreamCaptureMode desired)
+      : strictness_(desired) {
     C10_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&strictness_));
   }
   ~CUDAStreamCaptureModeGuard() {
@@ -79,7 +79,7 @@ inline std::ostream& operator<<(std::ostream& os, CaptureStatus status) {

 // Use this version where you're sure a CUDA context exists already.
 inline CaptureStatus currentStreamCaptureStatusMayInitCtx() {
 #if !defined(USE_ROCM) || ROCM_VERSION >= 50300
-  cudaStreamCaptureStatus is_capturing;
+  cudaStreamCaptureStatus is_capturing{cudaStreamCaptureStatusNone};
   C10_CUDA_CHECK(
       cudaStreamIsCapturing(c10::cuda::getCurrentCUDAStream(), &is_capturing));
   return CaptureStatus(is_capturing);
diff --git a/c10/cuda/impl/CUDAGuardImpl.h b/c10/cuda/impl/CUDAGuardImpl.h
index 1d580ef10a88..f98f6a98a3fa 100644
--- a/c10/cuda/impl/CUDAGuardImpl.h
+++ b/c10/cuda/impl/CUDAGuardImpl.h
@@ -1,6 +1,5 @@
 #pragma once

-#include
 #include
 #include
 #include
@@ -11,7 +10,13 @@
 #include
 #include

+#include
+#include
+#include
+#include
+#include
 #include
+#include

 namespace c10 {
 namespace cuda {
@@ -30,21 +35,21 @@ struct CUDAGuardImpl final : public c10::impl::DeviceGuardImplInterface {
   Device exchangeDevice(Device d) const override {
     TORCH_INTERNAL_ASSERT(d.is_cuda());
     int old_device_index = c10::cuda::ExchangeDevice(d.index());
-    return Device(DeviceType::CUDA, old_device_index);
+    return Device(DeviceType::CUDA, static_cast(old_device_index));
   }
   Device getDevice() const override {
-    int device;
+    int device = 0;
     C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
-    return Device(DeviceType::CUDA, device);
+    return Device(DeviceType::CUDA, static_cast(device));
   }
   c10::optional uncheckedGetDevice() const noexcept {
-    int device;
+    int device = 0;
     const auto err = C10_CUDA_ERROR_HANDLED(c10::cuda::GetDevice(&device));
     C10_CUDA_CHECK_WARN(err);
     if (err != cudaSuccess) {
       return c10::nullopt;
     }
-    return Device(DeviceType::CUDA, device);
+    return Device(DeviceType::CUDA, static_cast(device));
   }
   void setDevice(Device d) const override {
     TORCH_INTERNAL_ASSERT(d.is_cuda());
@@ -104,7 +109,7 @@ struct CUDAGuardImpl final : public c10::impl::DeviceGuardImplInterface {
     if (!event)
       return;
     auto cuda_event = static_cast(event);
-    int orig_device;
+    int orig_device = 0;
     C10_CUDA_CHECK_WARN(c10::cuda::GetDevice(&orig_device));
     C10_CUDA_CHECK_WARN(c10::cuda::SetDevice(device_index));
     const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
diff --git a/c10/mobile/CPUCachingAllocator.h b/c10/mobile/CPUCachingAllocator.h
index 94e1305d80c7..6e695af121fe 100644
--- a/c10/mobile/CPUCachingAllocator.h
+++ b/c10/mobile/CPUCachingAllocator.h
@@ -1,7 +1,9 @@
 #pragma once

+#include
 #include
+#include
 #include
 #include
diff --git a/c10/mobile/CPUProfilingAllocator.h b/c10/mobile/CPUProfilingAllocator.h
index f26c5d25e609..02e8ea4414a2 100644
--- a/c10/mobile/CPUProfilingAllocator.h
+++ b/c10/mobile/CPUProfilingAllocator.h
@@ -1,6 +1,9 @@
 #pragma once

+#include
 #include
+#include
+#include
 #include
 #include
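
Two patterns recur throughout the hunks above: value-initializing members with `{}` so a default-constructed struct holds zeros and null pointers instead of indeterminate values (TraceEntry, DeviceAssertionData, local `int device = 0`), and `std::move`-ing a by-value "sink" parameter into storage so that shared_ptr and string arguments are transferred rather than copied (setCheckpointPoolState, attachOutOfMemoryObserver, getIpcDevPtr). The standalone C++ sketch below illustrates both; it is not part of the patch, and Entry, Registry, and attach are hypothetical names, not PyTorch APIs.

// Standalone sketch of the two patterns applied by the patch.
// Entry, Registry, and attach are illustrative names only.
#include <cassert>
#include <cstdint>
#include <memory>
#include <string>
#include <utility>

// Pattern 1: {} value-initialization. Without the braces, `Entry e;`
// would leave every member indeterminate, which is what the patch
// fixes in TraceEntry and DeviceAssertionData.
struct Entry {
  int line_number{};           // zero, not garbage
  void* stream{};              // null, mirroring `cudaStream_t stream_{};`
  std::int32_t block_id[3]{};  // all elements zero
};

// Pattern 2: move a by-value sink parameter into the member it feeds.
// Copying a shared_ptr costs an atomic refcount increment/decrement
// pair; moving it transfers ownership for free.
class Registry {
 public:
  void attach(std::shared_ptr<std::string> observer) {
    observer_ = std::move(observer);  // transfer, no refcount bump
  }

 private:
  std::shared_ptr<std::string> observer_;
};

int main() {
  Entry e;
  assert(e.line_number == 0 && e.stream == nullptr && e.block_id[2] == 0);

  Registry r;
  r.attach(std::make_shared<std::string>("observer"));  // prvalue: moved in
  return 0;
}

The unmoved copies are the kind of thing clang-tidy's performance-unnecessary-value-param check flags, and the uninitialized members the kind cppcoreguidelines-pro-type-member-init flags; the NOLINTNEXTLINE(*-c-arrays) suppressions, meanwhile, mark the fixed-size C arrays as intentional in structs shared between host and device code.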