mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Check all CUDA API calls for errors in torch/ (#81560)
Summary:
Original commit changeset: 0bb770d2cdb2
Original Phabricator Diff: D35194935 (79e5b053b6)
Differential Revision: D35291874
Pull Request resolved: https://github.com/pytorch/pytorch/pull/81560
Approved by: https://github.com/ezyang
This commit is contained in:
committed by
PyTorch MergeBot
parent
4e3a0ff92e
commit
3ece9fb45d
@ -195,7 +195,7 @@ CudaIPCSentData::~CudaIPCSentData() {
|
||||
try {
|
||||
if (event_sync_required_) {
|
||||
at::cuda::CUDAGuard device_guard(device_.index());
|
||||
cudaEventDestroy(event_);
|
||||
C10_CUDA_CHECK(cudaEventDestroy(event_));
|
||||
if (!CudaIPCGlobalEntities::alive) {
|
||||
return;
|
||||
}
|
||||
|
||||
@ -3,6 +3,7 @@
|
||||
#include <torch/csrc/cuda/nccl.h>
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
#include <c10/cuda/CUDAException.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/hash.h>
|
||||
@ -142,7 +143,7 @@ struct NcclCommList {
|
||||
if (comms) {
|
||||
for (const auto i : c10::irange(ndevices)) {
|
||||
int dummy_var;
|
||||
if (cudaGetDevice(&dummy_var) != cudaSuccess) {
|
||||
if (C10_CUDA_ERROR_HANDLED(cudaGetDevice(&dummy_var)) != cudaSuccess) {
|
||||
/* there are cases when this destructor is called after the
|
||||
CUDA driver is already unloaded from the process.
|
||||
In these cases, skip ncclCommDestroy */
|
||||
|
||||
@ -71,25 +71,26 @@ void initCudartBindings(PyObject* module) {
|
||||
"cuda"
|
||||
"HostRegister",
|
||||
[](uintptr_t ptr, size_t size, unsigned int flags) -> cudaError_t {
|
||||
return cudaHostRegister((void*)ptr, size, flags);
|
||||
return C10_CUDA_ERROR_HANDLED(
|
||||
cudaHostRegister((void*)ptr, size, flags));
|
||||
});
|
||||
cudart.def(
|
||||
"cuda"
|
||||
"HostUnregister",
|
||||
[](uintptr_t ptr) -> cudaError_t {
|
||||
return cudaHostUnregister((void*)ptr);
|
||||
return C10_CUDA_ERROR_HANDLED(cudaHostUnregister((void*)ptr));
|
||||
});
|
||||
cudart.def(
|
||||
"cuda"
|
||||
"StreamCreate",
|
||||
[](uintptr_t ptr) -> cudaError_t {
|
||||
return cudaStreamCreate((cudaStream_t*)ptr);
|
||||
return C10_CUDA_ERROR_HANDLED(cudaStreamCreate((cudaStream_t*)ptr));
|
||||
});
|
||||
cudart.def(
|
||||
"cuda"
|
||||
"StreamDestroy",
|
||||
[](uintptr_t ptr) -> cudaError_t {
|
||||
return cudaStreamDestroy((cudaStream_t)ptr);
|
||||
return C10_CUDA_ERROR_HANDLED(cudaStreamDestroy((cudaStream_t)ptr));
|
||||
});
|
||||
#if !defined(USE_ROCM)
|
||||
cudart.def(
|
||||
@ -104,7 +105,7 @@ void initCudartBindings(PyObject* module) {
|
||||
c10::cuda::CUDAGuard guard(device);
|
||||
size_t device_free = 0;
|
||||
size_t device_total = 0;
|
||||
cudaMemGetInfo(&device_free, &device_total);
|
||||
C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
|
||||
return {device_free, device_total};
|
||||
});
|
||||
}
|
||||
|
||||
@ -1175,9 +1175,9 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
|
||||
if (measure_kernel_time_ ||
|
||||
isDebugDumpEnabled(DebugDumpOption::EffectiveBandwidth) ||
|
||||
isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) {
|
||||
cudaEventCreate(&start_event);
|
||||
cudaEventCreate(&finish_event);
|
||||
cudaEventRecord(start_event);
|
||||
C10_CUDA_CHECK(cudaEventCreate(&start_event));
|
||||
C10_CUDA_CHECK(cudaEventCreate(&finish_event));
|
||||
C10_CUDA_CHECK(cudaEventRecord(start_event));
|
||||
}
|
||||
|
||||
if (execute_kernel_) {
|
||||
@ -1233,12 +1233,13 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
|
||||
if (measure_kernel_time_ ||
|
||||
isDebugDumpEnabled(DebugDumpOption::EffectiveBandwidth) ||
|
||||
isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) {
|
||||
cudaEventRecord(finish_event);
|
||||
cudaEventSynchronize(start_event);
|
||||
cudaEventSynchronize(finish_event);
|
||||
cudaEventElapsedTime(&kernel_time_ms_, start_event, finish_event);
|
||||
cudaEventDestroy(start_event);
|
||||
cudaEventDestroy(finish_event);
|
||||
C10_CUDA_CHECK(cudaEventRecord(finish_event));
|
||||
C10_CUDA_CHECK(cudaEventSynchronize(start_event));
|
||||
C10_CUDA_CHECK(cudaEventSynchronize(finish_event));
|
||||
C10_CUDA_CHECK(
|
||||
cudaEventElapsedTime(&kernel_time_ms_, start_event, finish_event));
|
||||
C10_CUDA_CHECK(cudaEventDestroy(start_event));
|
||||
C10_CUDA_CHECK(cudaEventDestroy(finish_event));
|
||||
|
||||
bytes_processed_ = 0;
|
||||
// Figure how many bytes are inputs, outputs, and temporary buffers
|
||||
|
||||
@ -941,7 +941,7 @@ void initializeCudaContext() {
|
||||
if (!pctx) {
|
||||
std::unique_lock<std::mutex> cudaFreeMutexLock(
|
||||
*(c10::cuda::getFreeMutex()));
|
||||
cudaFree(nullptr);
|
||||
C10_CUDA_CHECK(cudaFree(nullptr));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -81,16 +81,14 @@ struct CUDAMethods : public ProfilerStubs {
|
||||
|
||||
void onEachDevice(std::function<void(int)> op) const override {
|
||||
at::cuda::OptionalCUDAGuard device_guard;
|
||||
// NOLINTNEXTLINE(bugprone-signed-char-misuse)
|
||||
int count = at::cuda::device_count();
|
||||
for (const auto i : c10::irange(count)) {
|
||||
for (const auto i : c10::irange(at::cuda::device_count())) {
|
||||
device_guard.set_index(i);
|
||||
op(i);
|
||||
}
|
||||
}
|
||||
|
||||
void synchronize() const override {
|
||||
cudaDeviceSynchronize();
|
||||
TORCH_CUDA_CHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
bool enabled() const override {
|
||||
|
||||
Reference in New Issue
Block a user