mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Add support for backend to register reducer timer
Currently, reducer timer registration is expected for every backend by default; if a timer is not registered, an assertion fails in set_runtime_stats_and_log(). To allow other backends to register a reducer timer, the timer registration has been moved to a separate file, decoupling it from the internal interface. Signed-off-by: Jeeja <jeejakp@habana.ai> Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/71700 Approved by: https://github.com/rohan-varma
This commit is contained in:
@ -14,6 +14,7 @@
|
||||
#include <c10d/Utils.hpp>
|
||||
#include <c10d/comm.hpp>
|
||||
#include <c10d/debug.h>
|
||||
#include <c10d/reducer_timer.hpp>
|
||||
#include <c10d/default_comm_hooks.hpp>
|
||||
#include <torch/csrc/autograd/function.h>
|
||||
#include <torch/csrc/autograd/profiler.h>
|
||||
@ -28,77 +29,10 @@ constexpr int kDefaultFirstBucketBytes = int(1024 * 1024);
|
||||
constexpr int kDefaultBucketBytesCap = int(25 * 1024 * 1024);
|
||||
// Collect runtime stats once for every kDDPRuntimeLoggingSampleRate iterations.
|
||||
constexpr int kDDPRuntimeLoggingSampleRate = 100;
|
||||
constexpr int kUnsetTime = -1;
|
||||
|
||||
inline int64_t current_time_in_nanos() {
|
||||
return torch::profiler::impl::getTime();
|
||||
}
|
||||
|
||||
// Forward declaration
|
||||
class Logger;
|
||||
|
||||
class TORCH_API Timer {
|
||||
private:
|
||||
// The timestamp of forward call start time in each iteration.
|
||||
int64_t forward_start_time = kUnsetTime;
|
||||
// The timestamp of backward computation start and end time in each
|
||||
// iteration.
|
||||
int64_t backward_compute_start_time = kUnsetTime;
|
||||
int64_t backward_compute_end_time = kUnsetTime;
|
||||
// The timestamp of first communication call start time in each iteration.
|
||||
int64_t backward_comm_start_time = kUnsetTime;
|
||||
// The timestamp of last communication call end time in each iteration.
|
||||
int64_t backward_comm_end_time = kUnsetTime;
|
||||
public:
|
||||
enum class Event {
|
||||
kForwardStart,
|
||||
kBackwardComputeStart,
|
||||
kBackwardComputeEnd,
|
||||
kBackwardCommStart,
|
||||
kBackwardCommEnd,
|
||||
};
|
||||
|
||||
// Record the current event, i.e., mark it as having occurred now. Default
|
||||
// CPU implementation.
|
||||
virtual void record(Event event) {
|
||||
getTimeRef(event) = current_time_in_nanos();
|
||||
}
|
||||
|
||||
// Return the difference between when two events occurred, in nanoseconds.
|
||||
// Or nullopt if one of them hasn't been recorded.
|
||||
virtual c10::optional<int64_t> measureDifference(Event start, Event end) = 0;
|
||||
|
||||
virtual ~Timer() = default;
|
||||
|
||||
// Return host-side timestamp, or nullopt if it has not yet been recorded.
|
||||
c10::optional<int64_t> getTimestamp(Event event) {
|
||||
auto time = getTimeRef(event);
|
||||
if (time == kUnsetTime) {
|
||||
return c10::nullopt;
|
||||
} else {
|
||||
return time;
|
||||
}
|
||||
}
|
||||
|
||||
// Return host-side time member variable corresponding to the given event.
|
||||
int64_t& getTimeRef(Event event) {
|
||||
switch (event) {
|
||||
case Event::kForwardStart:
|
||||
return forward_start_time;
|
||||
case Event::kBackwardComputeStart:
|
||||
return backward_compute_start_time;
|
||||
case Event::kBackwardComputeEnd:
|
||||
return backward_compute_end_time;
|
||||
case Event::kBackwardCommStart:
|
||||
return backward_comm_start_time;
|
||||
case Event::kBackwardCommEnd:
|
||||
return backward_comm_end_time;
|
||||
default:
|
||||
TORCH_INTERNAL_ASSERT(false);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Local accumulator type for a single bucket.
|
||||
struct BucketAccumulator {
|
||||
std::vector<size_t> indices;
|
||||
@ -106,8 +40,6 @@ struct BucketAccumulator {
|
||||
size_t size_limit = 0;
|
||||
};
|
||||
|
||||
C10_DECLARE_TYPED_REGISTRY(TimerRegistry, c10::DeviceType, Timer, std::unique_ptr, c10::Device);
|
||||
|
||||
class TORCH_API Reducer {
|
||||
public:
|
||||
// The constructor takes a list of variables (i.e. parameters) for this
|
||||
|
@ -1,4 +1,4 @@
|
||||
#include <c10d/reducer.hpp>
|
||||
#include <c10d/reducer_timer.hpp>
|
||||
|
||||
#include <c10/core/DeviceGuard.h>
|
||||
#include <ATen/cuda/CUDAEvent.h>
|
||||
|
75
torch/csrc/distributed/c10d/reducer_timer.hpp
Normal file
75
torch/csrc/distributed/c10d/reducer_timer.hpp
Normal file
@ -0,0 +1,75 @@
|
||||
#pragma once
|
||||
#include <torch/csrc/autograd/profiler.h>
|
||||
|
||||
namespace c10d {
|
||||
constexpr int kUnsetTime = -1;
|
||||
|
||||
inline int64_t current_time_in_nanos() {
|
||||
return torch::profiler::impl::getTime();
|
||||
}
|
||||
|
||||
class TORCH_API Timer {
|
||||
private:
|
||||
// The timestamp of forward call start time in each iteration.
|
||||
int64_t forward_start_time = kUnsetTime;
|
||||
// The timestamp of backward computation start and end time in each
|
||||
// iteration.
|
||||
int64_t backward_compute_start_time = kUnsetTime;
|
||||
int64_t backward_compute_end_time = kUnsetTime;
|
||||
// The timestamp of first communication call start time in each iteration.
|
||||
int64_t backward_comm_start_time = kUnsetTime;
|
||||
// The timestamp of last communication call end time in each iteration.
|
||||
int64_t backward_comm_end_time = kUnsetTime;
|
||||
|
||||
public:
|
||||
enum class Event {
|
||||
kForwardStart,
|
||||
kBackwardComputeStart,
|
||||
kBackwardComputeEnd,
|
||||
kBackwardCommStart,
|
||||
kBackwardCommEnd,
|
||||
};
|
||||
|
||||
// Record the current event, i.e., mark it as having occurred now. Default
|
||||
// CPU implementation.
|
||||
virtual void record(Event event) {
|
||||
getTimeRef(event) = current_time_in_nanos();
|
||||
}
|
||||
|
||||
// Return the difference between when two events occurred, in nanoseconds.
|
||||
// Or nullopt if one of them hasn't been recorded.
|
||||
virtual c10::optional<int64_t> measureDifference(Event start, Event end) = 0;
|
||||
|
||||
virtual ~Timer() = default;
|
||||
|
||||
// Return host-side timestamp, or nullopt if it has not yet been recorded.
|
||||
c10::optional<int64_t> getTimestamp(Event event) {
|
||||
auto time = getTimeRef(event);
|
||||
if (time == kUnsetTime) {
|
||||
return c10::nullopt;
|
||||
} else {
|
||||
return time;
|
||||
}
|
||||
}
|
||||
|
||||
// Return host-side time member variable corresponding to the given event.
|
||||
int64_t& getTimeRef(Event event) {
|
||||
switch (event) {
|
||||
case Event::kForwardStart:
|
||||
return forward_start_time;
|
||||
case Event::kBackwardComputeStart:
|
||||
return backward_compute_start_time;
|
||||
case Event::kBackwardComputeEnd:
|
||||
return backward_compute_end_time;
|
||||
case Event::kBackwardCommStart:
|
||||
return backward_comm_start_time;
|
||||
case Event::kBackwardCommEnd:
|
||||
return backward_comm_end_time;
|
||||
default:
|
||||
TORCH_INTERNAL_ASSERT(false);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
C10_DECLARE_TYPED_REGISTRY(TimerRegistry, c10::DeviceType, Timer, std::unique_ptr, c10::Device);
|
||||
} // namespace c10d
|
Reference in New Issue
Block a user