From b72ddbab60d3de21c1c83dd444ff540b7220044e Mon Sep 17 00:00:00 2001
From: cyy
Date: Thu, 18 Jan 2024 08:15:50 +0000
Subject: [PATCH] [Clang-tidy header][15/N] Enable clang-tidy on headers in c10/cuda and c10/mobile (#116602)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/116602
Approved by: https://github.com/ezyang
---
 .lintrunner.toml                   |  6 +++---
 c10/cuda/CUDAAllocatorConfig.cpp   |  2 ++
 c10/cuda/CUDAAllocatorConfig.h     | 11 +++++------
 c10/cuda/CUDACachingAllocator.cpp  |  3 ---
 c10/cuda/CUDACachingAllocator.h    | 29 +++++++++++++++++------------
 c10/cuda/CUDADeviceAssertionHost.h | 26 +++++++++++++++++---------
 c10/cuda/CUDAGraphsC10Utils.h      |  6 +++---
 c10/cuda/impl/CUDAGuardImpl.h      | 19 ++++++++++++-------
 c10/mobile/CPUCachingAllocator.h   |  2 ++
 c10/mobile/CPUProfilingAllocator.h |  3 +++
 10 files changed, 64 insertions(+), 43 deletions(-)

diff --git a/.lintrunner.toml b/.lintrunner.toml
index fa3711916c86..9d558a0140ee 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -246,8 +246,7 @@ code = 'CLANGTIDY'
 include_patterns = [
     'aten/src/ATen/core/*.cpp',
     'c10/**/*.cpp',
-    'c10/core/**/*.h',
-    'c10/util/**/*.h',
+    'c10/**/*.h',
     # Enable coverage of headers in torch/csrc and excluding sub-directories for now.
     'torch/csrc/*.h',
     'torch/csrc/**/*.cpp',
@@ -258,10 +257,10 @@ exclude_patterns = [
     # CUDA files are also excluded.
     '**/fb/**',
     '**/*pb.h',
-    '**/*CUDA*',
     '**/cuda/*pp',
     '**/*XPU*',
     '**/xpu/*pp',
+    'c10/cuda/CUDAAlgorithm.h',
     'c10/util/complex_math.h',
     'c10/util/complex_utils.h',
     'c10/util/flat_hash_map.h',
@@ -272,6 +271,7 @@ exclude_patterns = [
     'c10/util/SmallVector.h',
     'c10/util/win32-headers.h',
     'c10/util/*inl.h',
+    'c10/test/**/*.h',
     'aten/src/ATen/core/TensorImpl_test.cpp',
     'third_party/**/*',
     'torch/csrc/api/**',
diff --git a/c10/cuda/CUDAAllocatorConfig.cpp b/c10/cuda/CUDAAllocatorConfig.cpp
index 45a1233beb3e..5a709f981b0e 100644
--- a/c10/cuda/CUDAAllocatorConfig.cpp
+++ b/c10/cuda/CUDAAllocatorConfig.cpp
@@ -1,4 +1,6 @@
 #include
+#include
+#include

 #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
 #include
diff --git a/c10/cuda/CUDAAllocatorConfig.h b/c10/cuda/CUDAAllocatorConfig.h
index 68fe160a8a96..3a10926c555f 100644
--- a/c10/cuda/CUDAAllocatorConfig.h
+++ b/c10/cuda/CUDAAllocatorConfig.h
@@ -1,14 +1,13 @@
 #pragma once

-#include
-#include
 #include
 #include
-#include
 #include
 #include
-#include
+#include
+#include
+#include

 namespace c10::cuda::CUDACachingAllocator {
@@ -74,8 +73,8 @@ class C10_CUDA_API CUDAAllocatorConfig {

 private:
  CUDAAllocatorConfig();
-  void lexArgs(const char* env, std::vector& config);
-  void consumeToken(
+  static void lexArgs(const char* env, std::vector& config);
+  static void consumeToken(
      const std::vector& config,
      size_t i,
      const char c);
diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp
index 0b62f61f5e3f..d3e0ebbaf593 100644
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@@ -22,13 +22,10 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
 #include
-#include
-#include
 #include
 #include
 #include
diff --git a/c10/cuda/CUDACachingAllocator.h b/c10/cuda/CUDACachingAllocator.h
index 0706cbaf64ff..c4a84bef304b 100644
--- a/c10/cuda/CUDACachingAllocator.h
+++ b/c10/cuda/CUDACachingAllocator.h
@@ -1,17 +1,22 @@
 #pragma once

 #include
-#include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
-#include
-#include
+#include
+#include
+#include
+#include
+#include
+#include
 #include
+#include

 namespace c10 {
@@ -101,7 +106,7 @@ struct DeviceStats {
   int64_t max_split_size = 0;
 };

-typedef std::shared_ptr (*CreateContextFn)(void);
+typedef std::shared_ptr (*CreateContextFn)();

 // Struct containing info of an allocation block (i.e. a fractional part of a
 // cudaMalloc)..
@@ -123,7 +128,7 @@ struct SegmentInfo {
   int64_t requested_size = 0; // unrounded, actually requested size
   int64_t allocated_size = 0;
   int64_t active_size = 0;
-  cudaStream_t stream = 0;
+  cudaStream_t stream = nullptr;
   bool is_large = false;
   bool is_expandable = false;
   MempoolId_t owner_private_pool_id = {0, 0};
@@ -170,16 +175,16 @@ struct TraceEntry {
         addr_(addr),
         context_(std::move(context)),
         stream_(stream),
-        size_(size) {
+        size_(static_cast(size)) {
     time_.approx_t_ = time;
   }
   Action action_;
   int device_;
   int64_t addr_; // for OOM, this is the amount of free bytes reported by cuda
   std::shared_ptr context_;
-  cudaStream_t stream_;
+  cudaStream_t stream_{};
   int64_t size_;
-  trace_time_ time_;
+  trace_time_ time_{};
 };

 struct SnapshotInfo {
@@ -372,7 +377,7 @@ inline std::shared_ptr getCheckpointState(
 inline CheckpointDelta setCheckpointPoolState(
     int device,
     std::shared_ptr pps) {
-  return get()->setCheckpointPoolState(device, pps);
+  return get()->setCheckpointPoolState(device, std::move(pps));
 }

 // CUDAGraph interactions
@@ -409,11 +414,11 @@ inline bool checkPoolLiveAllocations(
 }

 inline void attachOutOfMemoryObserver(OutOfMemoryObserver observer) {
-  return get()->attachOutOfMemoryObserver(observer);
+  return get()->attachOutOfMemoryObserver(std::move(observer));
 }

 inline void attachAllocatorTraceTracker(AllocatorTraceTracker tracker) {
-  return get()->attachAllocatorTraceTracker(tracker);
+  return get()->attachAllocatorTraceTracker(std::move(tracker));
 }

 inline void releasePool(int device, MempoolId_t mempool_id) {
@@ -421,7 +426,7 @@ inline void releasePool(int device, MempoolId_t mempool_id) {
 }

 // Not part of CUDA_ALLOCATOR_BACKEND_INTERFACE
 inline std::shared_ptr getIpcDevPtr(std::string handle) {
-  return get()->getIpcDevPtr(handle);
+  return get()->getIpcDevPtr(std::move(handle));
 }

 inline std::string name() {
diff --git a/c10/cuda/CUDADeviceAssertionHost.h b/c10/cuda/CUDADeviceAssertionHost.h
index 7a9b070c85e0..a945915c2878 100644
--- a/c10/cuda/CUDADeviceAssertionHost.h
+++ b/c10/cuda/CUDADeviceAssertionHost.h
@@ -2,9 +2,11 @@

 #include

+#include
 #include
 #include
 #include
+#include
 #include

 #ifdef USE_CUDA
@@ -22,19 +24,24 @@ namespace c10::cuda {
 /// Held in managed memory and access by both the CPU and the GPU.
 struct DeviceAssertionData {
   /// Stringification of the assertion
-  char assertion_msg[C10_CUDA_DSA_MAX_STR_LEN];
+  // NOLINTNEXTLINE(*-c-arrays)
+  char assertion_msg[C10_CUDA_DSA_MAX_STR_LEN]{};
   /// File the assertion was in
-  char filename[C10_CUDA_DSA_MAX_STR_LEN];
+  // NOLINTNEXTLINE(*-c-arrays)
+  char filename[C10_CUDA_DSA_MAX_STR_LEN]{};
   /// Name of the function the assertion was in
-  char function_name[C10_CUDA_DSA_MAX_STR_LEN];
+  // NOLINTNEXTLINE(*-c-arrays)
+  char function_name[C10_CUDA_DSA_MAX_STR_LEN]{};
   /// Line number the assertion was at
-  int line_number;
+  int line_number{};
   /// Number uniquely identifying the kernel launch that triggered the assertion
-  uint32_t caller;
+  uint32_t caller{};
   /// block_id of the thread that failed the assertion
-  int32_t block_id[3];
+  // NOLINTNEXTLINE(*-c-arrays)
+  int32_t block_id[3]{};
   /// third_id of the thread that failed the assertion
-  int32_t thread_id[3];
+  // NOLINTNEXTLINE(*-c-arrays)
+  int32_t thread_id[3]{};
 };

 /// Used to hold assertions generated by the device
@@ -42,9 +49,10 @@ struct DeviceAssertionData {
 struct DeviceAssertionsData {
   /// Total number of assertions found; a subset of thse will be recorded
   /// in `assertions`
-  int32_t assertion_count;
+  int32_t assertion_count{};
   /// An array of assertions that will be written to in a race-free manner
-  DeviceAssertionData assertions[C10_CUDA_DSA_ASSERTION_COUNT];
+  // NOLINTNEXTLINE(*-c-arrays)
+  DeviceAssertionData assertions[C10_CUDA_DSA_ASSERTION_COUNT]{};
 };

 /// Use to hold info about kernel launches so that we can run kernels
diff --git a/c10/cuda/CUDAGraphsC10Utils.h b/c10/cuda/CUDAGraphsC10Utils.h
index 7aca8852fe89..79a36d80cead 100644
--- a/c10/cuda/CUDAGraphsC10Utils.h
+++ b/c10/cuda/CUDAGraphsC10Utils.h
@@ -19,8 +19,8 @@ using MempoolId_t = std::pair;
 // that controls the error-checking strictness of a capture.
 #if !defined(USE_ROCM) || ROCM_VERSION >= 50300
 struct C10_CUDA_API CUDAStreamCaptureModeGuard {
-  CUDAStreamCaptureModeGuard(cudaStreamCaptureMode desired) {
-    strictness_ = desired;
+  CUDAStreamCaptureModeGuard(cudaStreamCaptureMode desired)
+      : strictness_(desired) {
     C10_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&strictness_));
   }
   ~CUDAStreamCaptureModeGuard() {
@@ -79,7 +79,7 @@ inline std::ostream& operator<<(std::ostream& os, CaptureStatus status) {

 // Use this version where you're sure a CUDA context exists already.
 inline CaptureStatus currentStreamCaptureStatusMayInitCtx() {
 #if !defined(USE_ROCM) || ROCM_VERSION >= 50300
-  cudaStreamCaptureStatus is_capturing;
+  cudaStreamCaptureStatus is_capturing{cudaStreamCaptureStatusNone};
   C10_CUDA_CHECK(
       cudaStreamIsCapturing(c10::cuda::getCurrentCUDAStream(), &is_capturing));
   return CaptureStatus(is_capturing);
diff --git a/c10/cuda/impl/CUDAGuardImpl.h b/c10/cuda/impl/CUDAGuardImpl.h
index 1d580ef10a88..f98f6a98a3fa 100644
--- a/c10/cuda/impl/CUDAGuardImpl.h
+++ b/c10/cuda/impl/CUDAGuardImpl.h
@@ -1,6 +1,5 @@
 #pragma once

-#include
 #include
 #include
 #include
@@ -11,7 +10,13 @@
 #include
 #include

+#include
+#include
+#include
+#include
+#include
 #include
+#include

 namespace c10 {
 namespace cuda {
@@ -30,21 +35,21 @@ struct CUDAGuardImpl final : public c10::impl::DeviceGuardImplInterface {
   Device exchangeDevice(Device d) const override {
     TORCH_INTERNAL_ASSERT(d.is_cuda());
     int old_device_index = c10::cuda::ExchangeDevice(d.index());
-    return Device(DeviceType::CUDA, old_device_index);
+    return Device(DeviceType::CUDA, static_cast(old_device_index));
   }
   Device getDevice() const override {
-    int device;
+    int device = 0;
     C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
-    return Device(DeviceType::CUDA, device);
+    return Device(DeviceType::CUDA, static_cast(device));
   }
   c10::optional uncheckedGetDevice() const noexcept {
-    int device;
+    int device = 0;
     const auto err = C10_CUDA_ERROR_HANDLED(c10::cuda::GetDevice(&device));
     C10_CUDA_CHECK_WARN(err);
     if (err != cudaSuccess) {
       return c10::nullopt;
     }
-    return Device(DeviceType::CUDA, device);
+    return Device(DeviceType::CUDA, static_cast(device));
   }
   void setDevice(Device d) const override {
     TORCH_INTERNAL_ASSERT(d.is_cuda());
@@ -104,7 +109,7 @@ struct CUDAGuardImpl final : public c10::impl::DeviceGuardImplInterface {
     if (!event)
       return;
     auto cuda_event = static_cast(event);
-    int orig_device;
+    int orig_device = 0;
     C10_CUDA_CHECK_WARN(c10::cuda::GetDevice(&orig_device));
     C10_CUDA_CHECK_WARN(c10::cuda::SetDevice(device_index));
     const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
diff --git a/c10/mobile/CPUCachingAllocator.h b/c10/mobile/CPUCachingAllocator.h
index 94e1305d80c7..6e695af121fe 100644
--- a/c10/mobile/CPUCachingAllocator.h
+++ b/c10/mobile/CPUCachingAllocator.h
@@ -1,7 +1,9 @@
 #pragma once

+#include
 #include
+#include
 #include
 #include
diff --git a/c10/mobile/CPUProfilingAllocator.h b/c10/mobile/CPUProfilingAllocator.h
index f26c5d25e609..02e8ea4414a2 100644
--- a/c10/mobile/CPUProfilingAllocator.h
+++ b/c10/mobile/CPUProfilingAllocator.h
@@ -1,6 +1,9 @@
 #pragma once

+#include
 #include
+#include
+#include
 #include
 #include
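
Two patterns recur throughout the hunks above: value-initializing members with `{}` so a default-constructed struct holds zeros and null pointers instead of indeterminate values (TraceEntry, DeviceAssertionData, local `int device = 0`), and `std::move`-ing a by-value "sink" parameter into storage so that shared_ptr and string arguments are transferred rather than copied (setCheckpointPoolState, attachOutOfMemoryObserver, getIpcDevPtr). The standalone C++ sketch below illustrates both; it is not part of the patch, and Entry, Registry, and attach are hypothetical names, not PyTorch APIs.

// Standalone sketch of the two patterns applied by the patch.
// Entry, Registry, and attach are illustrative names only.
#include <cassert>
#include <cstdint>
#include <memory>
#include <string>
#include <utility>

// Pattern 1: {} value-initialization. Without the braces, `Entry e;`
// would leave every member indeterminate, which is what the patch
// fixes in TraceEntry and DeviceAssertionData.
struct Entry {
  int line_number{};           // zero, not garbage
  void* stream{};              // null, mirroring `cudaStream_t stream_{};`
  std::int32_t block_id[3]{};  // all elements zero
};

// Pattern 2: move a by-value sink parameter into the member it feeds.
// Copying a shared_ptr costs an atomic refcount increment/decrement
// pair; moving it transfers ownership for free.
class Registry {
 public:
  void attach(std::shared_ptr<std::string> observer) {
    observer_ = std::move(observer);  // transfer, no refcount bump
  }

 private:
  std::shared_ptr<std::string> observer_;
};

int main() {
  Entry e;
  assert(e.line_number == 0 && e.stream == nullptr && e.block_id[2] == 0);

  Registry r;
  r.attach(std::make_shared<std::string>("observer"));  // prvalue: moved in
  return 0;
}

The unmoved copies are the kind of thing clang-tidy's performance-unnecessary-value-param check flags, and the uninitialized members the kind cppcoreguidelines-pro-type-member-init flags; the NOLINTNEXTLINE(*-c-arrays) suppressions, meanwhile, mark the fixed-size C arrays as intentional in structs shared between host and device code.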