[Clang-tidy header][15/N] Enable clang-tidy on headers in c10/cuda and c10/mobile (#116602)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/116602
Approved by: https://github.com/ezyang
Author: cyy
Date: 2024-01-18 08:15:50 +00:00
Committed by: PyTorch MergeBot
Parent: 57ca455471
Commit: b72ddbab60

10 changed files with 64 additions and 43 deletions
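The hunks below apply a handful of recurring clang-tidy fixes to the newly covered headers: default member initializers, NOLINT suppressions where a C array is intentional, static declarations for helpers that never touch instance state, std::move on by-value parameters, and nullptr instead of 0 for pointer defaults. As a rough sketch of those patterns (illustrative only, not code from this commit; all names below are made up):

// Illustrative sketch only -- hypothetical types, not part of the diff below.
#include <cstdint>
#include <string>
#include <utility>
#include <vector>

struct ExampleRecord {
  // cppcoreguidelines-pro-type-member-init: members get default initializers.
  int32_t count{};
  void* stream{nullptr}; // prefer nullptr over 0 for pointer defaults

  // modernize-avoid-c-arrays: suppressed where a fixed-size C array is intentional.
  // NOLINTNEXTLINE(*-c-arrays)
  char message[64]{};

  // readability-convert-member-functions-to-static: helpers that do not use
  // instance state are declared static.
  static void lexTokens(const std::string& env, std::vector<std::string>& out);
};

// performance-unnecessary-value-param: a by-value parameter is moved into
// place instead of being copied a second time.
inline void storeHandle(std::string handle, std::vector<std::string>& sink) {
  sink.push_back(std::move(handle));
}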


@@ -246,8 +246,7 @@ code = 'CLANGTIDY'
 include_patterns = [
 'aten/src/ATen/core/*.cpp',
 'c10/**/*.cpp',
-'c10/core/**/*.h',
-'c10/util/**/*.h',
+'c10/**/*.h',
 # Enable coverage of headers in torch/csrc and excluding sub-directories for now.
 'torch/csrc/*.h',
 'torch/csrc/**/*.cpp',
@@ -258,10 +257,10 @@ exclude_patterns = [
 # CUDA files are also excluded.
 '**/fb/**',
 '**/*pb.h',
-'**/*CUDA*',
 '**/cuda/*pp',
 '**/*XPU*',
 '**/xpu/*pp',
+'c10/cuda/CUDAAlgorithm.h',
 'c10/util/complex_math.h',
 'c10/util/complex_utils.h',
 'c10/util/flat_hash_map.h',
@@ -272,6 +271,7 @@ exclude_patterns = [
 'c10/util/SmallVector.h',
 'c10/util/win32-headers.h',
 'c10/util/*inl.h',
+'c10/test/**/*.h',
 'aten/src/ATen/core/TensorImpl_test.cpp',
 'third_party/**/*',
 'torch/csrc/api/**',


@@ -1,4 +1,6 @@
 #include <c10/cuda/CUDAAllocatorConfig.h>
+#include <c10/cuda/CUDACachingAllocator.h>
+#include <c10/util/llvmMathExtras.h>
 #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
 #include <c10/cuda/driver_api.h>


@@ -1,14 +1,13 @@
 #pragma once
-#include <c10/cuda/CUDACachingAllocator.h>
-#include <c10/cuda/CUDAException.h>
 #include <c10/cuda/CUDAMacros.h>
 #include <c10/util/Exception.h>
-#include <c10/util/llvmMathExtras.h>
 #include <cuda_runtime_api.h>
 #include <atomic>
-#include <vector>
+#include <cstddef>
+#include <cstdlib>
+#include <string>
 namespace c10::cuda::CUDACachingAllocator {
@@ -74,8 +73,8 @@ class C10_CUDA_API CUDAAllocatorConfig {
 private:
 CUDAAllocatorConfig();
-void lexArgs(const char* env, std::vector<std::string>& config);
-void consumeToken(
+static void lexArgs(const char* env, std::vector<std::string>& config);
+static void consumeToken(
 const std::vector<std::string>& config,
 size_t i,
 const char c);


@@ -22,13 +22,10 @@
 #include <c10/util/Exception.h>
 #include <cuda_runtime_api.h>
 #include <algorithm>
-#include <bitset>
 #include <cstddef>
 #include <cstdint>
 #include <deque>
 #include <iostream>
-#include <iterator>
-#include <map>
 #include <memory>
 #include <mutex>
 #include <regex>


@@ -1,17 +1,22 @@
 #pragma once
 #include <c10/core/Allocator.h>
-#include <c10/core/StorageImpl.h>
 #include <c10/cuda/CUDAGraphsC10Utils.h>
 #include <c10/cuda/CUDAMacros.h>
 #include <c10/cuda/CUDAStream.h>
 #include <c10/util/ApproximateClock.h>
+#include <c10/util/Exception.h>
 #include <c10/util/Registry.h>
 #include <array>
-#include <mutex>
-#include <set>
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <string>
 #include <unordered_set>
+#include <utility>
 namespace c10 {
@@ -101,7 +106,7 @@ struct DeviceStats {
 int64_t max_split_size = 0;
 };
-typedef std::shared_ptr<GatheredContext> (*CreateContextFn)(void);
+typedef std::shared_ptr<GatheredContext> (*CreateContextFn)();
 // Struct containing info of an allocation block (i.e. a fractional part of a
 // cudaMalloc)..
@@ -123,7 +128,7 @@ struct SegmentInfo {
 int64_t requested_size = 0; // unrounded, actually requested size
 int64_t allocated_size = 0;
 int64_t active_size = 0;
-cudaStream_t stream = 0;
+cudaStream_t stream = nullptr;
 bool is_large = false;
 bool is_expandable = false;
 MempoolId_t owner_private_pool_id = {0, 0};
@@ -170,16 +175,16 @@ struct TraceEntry {
 addr_(addr),
 context_(std::move(context)),
 stream_(stream),
-size_(size) {
+size_(static_cast<int64_t>(size)) {
 time_.approx_t_ = time;
 }
 Action action_;
 int device_;
 int64_t addr_; // for OOM, this is the amount of free bytes reported by cuda
 std::shared_ptr<GatheredContext> context_;
-cudaStream_t stream_;
+cudaStream_t stream_{};
 int64_t size_;
-trace_time_ time_;
+trace_time_ time_{};
 };
 struct SnapshotInfo {
@@ -372,7 +377,7 @@ inline std::shared_ptr<AllocatorState> getCheckpointState(
 inline CheckpointDelta setCheckpointPoolState(
 int device,
 std::shared_ptr<AllocatorState> pps) {
-return get()->setCheckpointPoolState(device, pps);
+return get()->setCheckpointPoolState(device, std::move(pps));
 }
 // CUDAGraph interactions
@@ -409,11 +414,11 @@ inline bool checkPoolLiveAllocations(
 }
 inline void attachOutOfMemoryObserver(OutOfMemoryObserver observer) {
-return get()->attachOutOfMemoryObserver(observer);
+return get()->attachOutOfMemoryObserver(std::move(observer));
 }
 inline void attachAllocatorTraceTracker(AllocatorTraceTracker tracker) {
-return get()->attachAllocatorTraceTracker(tracker);
+return get()->attachAllocatorTraceTracker(std::move(tracker));
 }
 inline void releasePool(int device, MempoolId_t mempool_id) {
@@ -421,7 +426,7 @@ inline void releasePool(int device, MempoolId_t mempool_id) {
 }
 // Not part of CUDA_ALLOCATOR_BACKEND_INTERFACE
 inline std::shared_ptr<void> getIpcDevPtr(std::string handle) {
-return get()->getIpcDevPtr(handle);
+return get()->getIpcDevPtr(std::move(handle));
 }
 inline std::string name() {


@@ -2,9 +2,11 @@
 #include <c10/cuda/CUDAMacros.h>
+#include <cstdint>
 #include <memory>
 #include <mutex>
 #include <string>
+#include <utility>
 #include <vector>
 #ifdef USE_CUDA
@@ -22,19 +24,24 @@ namespace c10::cuda {
 /// Held in managed memory and access by both the CPU and the GPU.
 struct DeviceAssertionData {
 /// Stringification of the assertion
-char assertion_msg[C10_CUDA_DSA_MAX_STR_LEN];
+// NOLINTNEXTLINE(*-c-arrays)
+char assertion_msg[C10_CUDA_DSA_MAX_STR_LEN]{};
 /// File the assertion was in
-char filename[C10_CUDA_DSA_MAX_STR_LEN];
+// NOLINTNEXTLINE(*-c-arrays)
+char filename[C10_CUDA_DSA_MAX_STR_LEN]{};
 /// Name of the function the assertion was in
-char function_name[C10_CUDA_DSA_MAX_STR_LEN];
+// NOLINTNEXTLINE(*-c-arrays)
+char function_name[C10_CUDA_DSA_MAX_STR_LEN]{};
 /// Line number the assertion was at
-int line_number;
+int line_number{};
 /// Number uniquely identifying the kernel launch that triggered the assertion
-uint32_t caller;
+uint32_t caller{};
 /// block_id of the thread that failed the assertion
-int32_t block_id[3];
+// NOLINTNEXTLINE(*-c-arrays)
+int32_t block_id[3]{};
 /// third_id of the thread that failed the assertion
-int32_t thread_id[3];
+// NOLINTNEXTLINE(*-c-arrays)
+int32_t thread_id[3]{};
 };
 /// Used to hold assertions generated by the device
@@ -42,9 +49,10 @@ struct DeviceAssertionData {
 struct DeviceAssertionsData {
 /// Total number of assertions found; a subset of thse will be recorded
 /// in `assertions`
-int32_t assertion_count;
+int32_t assertion_count{};
 /// An array of assertions that will be written to in a race-free manner
-DeviceAssertionData assertions[C10_CUDA_DSA_ASSERTION_COUNT];
+// NOLINTNEXTLINE(*-c-arrays)
+DeviceAssertionData assertions[C10_CUDA_DSA_ASSERTION_COUNT]{};
 };
 /// Use to hold info about kernel launches so that we can run kernels


@@ -19,8 +19,8 @@ using MempoolId_t = std::pair<CaptureId_t, CaptureId_t>;
 // that controls the error-checking strictness of a capture.
 #if !defined(USE_ROCM) || ROCM_VERSION >= 50300
 struct C10_CUDA_API CUDAStreamCaptureModeGuard {
-CUDAStreamCaptureModeGuard(cudaStreamCaptureMode desired) {
-strictness_ = desired;
+CUDAStreamCaptureModeGuard(cudaStreamCaptureMode desired)
+: strictness_(desired) {
 C10_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&strictness_));
 }
 ~CUDAStreamCaptureModeGuard() {
@@ -79,7 +79,7 @@ inline std::ostream& operator<<(std::ostream& os, CaptureStatus status) {
 // Use this version where you're sure a CUDA context exists already.
 inline CaptureStatus currentStreamCaptureStatusMayInitCtx() {
 #if !defined(USE_ROCM) || ROCM_VERSION >= 50300
-cudaStreamCaptureStatus is_capturing;
+cudaStreamCaptureStatus is_capturing{cudaStreamCaptureStatusNone};
 C10_CUDA_CHECK(
 cudaStreamIsCapturing(c10::cuda::getCurrentCUDAStream(), &is_capturing));
 return CaptureStatus(is_capturing);


@@ -1,6 +1,5 @@
 #pragma once
-#include <c10/core/DeviceGuard.h>
 #include <c10/core/impl/DeviceGuardImplInterface.h>
 #include <c10/core/impl/GPUTrace.h>
 #include <c10/macros/Macros.h>
@@ -11,7 +10,13 @@
 #include <c10/cuda/CUDAFunctions.h>
 #include <c10/cuda/CUDAStream.h>
+#include <c10/core/Device.h>
+#include <c10/core/DeviceType.h>
+#include <c10/core/Stream.h>
+#include <c10/core/impl/PyInterpreter.h>
+#include <c10/util/Optional.h>
 #include <cuda_runtime_api.h>
+#include <cstdint>
 namespace c10 {
 namespace cuda {
@@ -30,21 +35,21 @@ struct CUDAGuardImpl final : public c10::impl::DeviceGuardImplInterface {
 Device exchangeDevice(Device d) const override {
 TORCH_INTERNAL_ASSERT(d.is_cuda());
 int old_device_index = c10::cuda::ExchangeDevice(d.index());
-return Device(DeviceType::CUDA, old_device_index);
+return Device(DeviceType::CUDA, static_cast<DeviceIndex>(old_device_index));
 }
 Device getDevice() const override {
-int device;
+int device = 0;
 C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
-return Device(DeviceType::CUDA, device);
+return Device(DeviceType::CUDA, static_cast<DeviceIndex>(device));
 }
 c10::optional<Device> uncheckedGetDevice() const noexcept {
-int device;
+int device = 0;
 const auto err = C10_CUDA_ERROR_HANDLED(c10::cuda::GetDevice(&device));
 C10_CUDA_CHECK_WARN(err);
 if (err != cudaSuccess) {
 return c10::nullopt;
 }
-return Device(DeviceType::CUDA, device);
+return Device(DeviceType::CUDA, static_cast<DeviceIndex>(device));
 }
 void setDevice(Device d) const override {
 TORCH_INTERNAL_ASSERT(d.is_cuda());
@@ -104,7 +109,7 @@ struct CUDAGuardImpl final : public c10::impl::DeviceGuardImplInterface {
 if (!event)
 return;
 auto cuda_event = static_cast<cudaEvent_t>(event);
-int orig_device;
+int orig_device = 0;
 C10_CUDA_CHECK_WARN(c10::cuda::GetDevice(&orig_device));
 C10_CUDA_CHECK_WARN(c10::cuda::SetDevice(device_index));
 const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();


@@ -1,7 +1,9 @@
 #pragma once
+#include <cstddef>
 #include <mutex>
+#include <c10/macros/Export.h>
 #include <c10/util/SmallVector.h>
 #include <c10/util/flat_hash_map.h>


@@ -1,6 +1,9 @@
 #pragma once
+#include <c10/macros/Export.h>
 #include <c10/util/flat_hash_map.h>
+#include <cstddef>
+#include <cstdint>
 #include <memory>
 #include <vector>