mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-21 05:34:18 +08:00
Add low priority XPU Stream (#141119)
# Motivation Due to the potential for the external SYCL queue to have a low priority, we need to support the low-priority SYCL queue for native XPU Streams to maintain consistency. Pull Request resolved: https://github.com/pytorch/pytorch/pull/141119 Approved by: https://github.com/gujinghui, https://github.com/albanD ghstack dependencies: #142347
This commit is contained in:
committed by
PyTorch MergeBot
parent
39450ae655
commit
a68c0ca497
@ -33,32 +33,51 @@ std::deque<
|
|||||||
|
|
||||||
thread_local std::unique_ptr<StreamId[]> current_streams = nullptr;
|
thread_local std::unique_ptr<StreamId[]> current_streams = nullptr;
|
||||||
|
|
||||||
// Note [StreamId assignment]
|
/*
|
||||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~
|
* Note [StreamId assignment]
|
||||||
// How do we assign stream IDs?
|
* ~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
//
|
* How do we assign stream IDs?
|
||||||
// -- 55 bits -- -- 5 bits -- -- 3 bits -- -- 1 bit --
|
*
|
||||||
// zeros StreamIdIndex StreamIdType Ext/native stream
|
* -- 55 bits -- -- 5 bits -- -- 3 bits -- -- 1 bit --
|
||||||
// ignored for ext ignored for ext
|
* zeros StreamIdIndex StreamIdType Ext/native stream
|
||||||
//
|
* ignored for ext ignored for ext
|
||||||
// Where StreamIdType:
|
*
|
||||||
// 000 = normal priority queue
|
* Where StreamIdType:
|
||||||
// 001 = high priority queue
|
* 000 = low priority queue
|
||||||
// 111 = external queue
|
* 001 = normal priority queue
|
||||||
//
|
* 010 = high priority queue
|
||||||
// For external stream, StreamID is a sycl::queue* pointer. This means that last
|
* 111 = external queue
|
||||||
// bit will always be 0. So when constructing StreamId for a native stream we
|
*
|
||||||
// set last bit to 1 to distinguish between native and external streams. For
|
* For external stream, StreamID is a sycl::queue* pointer. This means that last
|
||||||
// more details, see Note [External XPU Stream].
|
* bit will always be 0. So when constructing StreamId for a native stream we
|
||||||
//
|
* set last bit to 1 to distinguish between native and external streams. For
|
||||||
// StreamId is 64-bit, so we can just rely on regular promotion rules.
|
* more details, see Note [External XPU Stream].
|
||||||
// We rely on StreamIdIndex and StreamIdType being non-negative;
|
*
|
||||||
|
* StreamId is 64-bit, so we can just rely on regular promotion rules.
|
||||||
|
* We rely on StreamIdIndex and StreamIdType being non-negative;
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Note [XPU Stream priorities]
|
||||||
|
* XPU stream priority levels are defined based on the following design
|
||||||
|
* principles:
|
||||||
|
* 1. Higher priority number indicates lower priority.
|
||||||
|
* 2. The default priority, `normal`, corresponds to a priority number of 0.
|
||||||
|
* 3. StreamIdType and priority number are inversely related.
|
||||||
|
*
|
||||||
|
* This relationship can be summarized as follows:
|
||||||
|
* -- priority type -- -- priority number -- -- type number --
|
||||||
|
* low 1 0
|
||||||
|
* normal 0 1
|
||||||
|
* high -1 2
|
||||||
|
*/
|
||||||
|
|
||||||
using StreamIdIndex = uint8_t;
|
using StreamIdIndex = uint8_t;
|
||||||
enum class StreamIdType : uint8_t {
|
enum class StreamIdType : uint8_t {
|
||||||
// The higher the type number, the higher the priority for the native stream.
|
// The higher the type number, the higher the priority for the native stream.
|
||||||
NORMAL = 0x0,
|
LOW = 0x0,
|
||||||
HIGH = 0X1,
|
NORMAL = 0x1,
|
||||||
|
HIGH = 0x2,
|
||||||
// For an external stream, the last bit of StreamId is 0, whose priority is
|
// For an external stream, the last bit of StreamId is 0, whose priority is
|
||||||
// queried at runtime.
|
// queried at runtime.
|
||||||
EXT = 0x7,
|
EXT = 0x7,
|
||||||
@ -66,6 +85,8 @@ enum class StreamIdType : uint8_t {
|
|||||||
|
|
||||||
inline std::ostream& operator<<(std::ostream& stream, StreamIdType q) {
|
inline std::ostream& operator<<(std::ostream& stream, StreamIdType q) {
|
||||||
switch (q) {
|
switch (q) {
|
||||||
|
case StreamIdType::LOW:
|
||||||
|
return stream << "LOW";
|
||||||
case StreamIdType::NORMAL:
|
case StreamIdType::NORMAL:
|
||||||
return stream << "NORMAL";
|
return stream << "NORMAL";
|
||||||
case StreamIdType::HIGH:
|
case StreamIdType::HIGH:
|
||||||
@ -87,7 +108,8 @@ inline StreamIdType streamIdType(StreamId s) {
|
|||||||
int mask_for_type = (1 << kStreamTypeBits) - 1;
|
int mask_for_type = (1 << kStreamTypeBits) - 1;
|
||||||
auto st = static_cast<StreamIdType>((s >> 1) & mask_for_type);
|
auto st = static_cast<StreamIdType>((s >> 1) & mask_for_type);
|
||||||
TORCH_CHECK(
|
TORCH_CHECK(
|
||||||
st == StreamIdType::NORMAL || st == StreamIdType::HIGH,
|
st == StreamIdType::NORMAL || st == StreamIdType::HIGH ||
|
||||||
|
st == StreamIdType::LOW,
|
||||||
"invalid StreamId: ",
|
"invalid StreamId: ",
|
||||||
s);
|
s);
|
||||||
return st;
|
return st;
|
||||||
@ -116,8 +138,12 @@ void initDeviceStreamState(DeviceIndex device) {
|
|||||||
using namespace sycl::ext::oneapi::property;
|
using namespace sycl::ext::oneapi::property;
|
||||||
// Need to align with StreamIdType.
|
// Need to align with StreamIdType.
|
||||||
const std::vector<sycl::property_list> properties = {
|
const std::vector<sycl::property_list> properties = {
|
||||||
|
{sycl::property::queue::in_order(), queue::priority_low()},
|
||||||
{sycl::property::queue::in_order(), queue::priority_normal()},
|
{sycl::property::queue::in_order(), queue::priority_normal()},
|
||||||
{sycl::property::queue::in_order(), queue::priority_high()}};
|
{sycl::property::queue::in_order(), queue::priority_high()}};
|
||||||
|
TORCH_CHECK(
|
||||||
|
properties.size() == max_compile_time_stream_priorities,
|
||||||
|
"The number of stream priorities should be equal to max_compile_time_stream_priorities");
|
||||||
for (const auto p : c10::irange(max_compile_time_stream_priorities)) {
|
for (const auto p : c10::irange(max_compile_time_stream_priorities)) {
|
||||||
for (const auto i : c10::irange(kStreamsPerPool)) {
|
for (const auto i : c10::irange(kStreamsPerPool)) {
|
||||||
auto& stream = streams[device][p][i];
|
auto& stream = streams[device][p][i];
|
||||||
@ -186,16 +212,19 @@ int XPUStream::priority() const {
|
|||||||
if (C10_UNLIKELY(st == StreamIdType::EXT)) {
|
if (C10_UNLIKELY(st == StreamIdType::EXT)) {
|
||||||
// Query external stream priority
|
// Query external stream priority
|
||||||
using namespace sycl::ext::oneapi::property;
|
using namespace sycl::ext::oneapi::property;
|
||||||
// Default priority for SYCL queue is normal.
|
|
||||||
st = StreamIdType::NORMAL;
|
|
||||||
if (queue().has_property<queue::priority_normal>()) {
|
if (queue().has_property<queue::priority_normal>()) {
|
||||||
st = StreamIdType::NORMAL;
|
st = StreamIdType::NORMAL;
|
||||||
} else if (queue().has_property<queue::priority_high>()) {
|
} else if (queue().has_property<queue::priority_high>()) {
|
||||||
st = StreamIdType::HIGH;
|
st = StreamIdType::HIGH;
|
||||||
|
} else if (queue().has_property<queue::priority_low>()) {
|
||||||
|
st = StreamIdType::LOW;
|
||||||
|
} else {
|
||||||
|
// Default priority for SYCL queue is normal.
|
||||||
|
st = StreamIdType::NORMAL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// StreamIdType and priority number are inversely related.
|
// See Note [XPU Stream priorities]
|
||||||
return -static_cast<int>(st);
|
return -static_cast<int>(st) + 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// See Note [StreamId assignment]
|
// See Note [StreamId assignment]
|
||||||
@ -232,14 +261,11 @@ XPUStream getStreamFromPool(const int priority, DeviceIndex device) {
|
|||||||
device = c10::xpu::current_device();
|
device = c10::xpu::current_device();
|
||||||
}
|
}
|
||||||
check_device_index(device);
|
check_device_index(device);
|
||||||
TORCH_CHECK(
|
|
||||||
priority <= 0,
|
|
||||||
"Expected XPU stream priority to be less than or equal to 0, got ",
|
|
||||||
priority);
|
|
||||||
// Initializes the stream pools (once)
|
// Initializes the stream pools (once)
|
||||||
initDeviceStreamOnce(device);
|
initDeviceStreamOnce(device);
|
||||||
|
// See Note [XPU Stream priorities]
|
||||||
auto priority_idx =
|
auto priority_idx =
|
||||||
std::min(-priority, max_compile_time_stream_priorities - 1);
|
std::clamp(-priority + 1, 0, max_compile_time_stream_priorities - 1);
|
||||||
const auto idx = get_idx(priority_counters[device][priority_idx]);
|
const auto idx = get_idx(priority_counters[device][priority_idx]);
|
||||||
auto id_type = static_cast<StreamIdType>(priority_idx);
|
auto id_type = static_cast<StreamIdType>(priority_idx);
|
||||||
return XPUStreamForId(device, makeStreamId(id_type, idx));
|
return XPUStreamForId(device, makeStreamId(id_type, idx));
|
||||||
@ -248,7 +274,8 @@ XPUStream getStreamFromPool(const int priority, DeviceIndex device) {
|
|||||||
XPUStream getStreamFromPool(const bool isHighPriority, DeviceIndex device) {
|
XPUStream getStreamFromPool(const bool isHighPriority, DeviceIndex device) {
|
||||||
initXPUStreamsOnce();
|
initXPUStreamsOnce();
|
||||||
// If isHighPriority is true, return the stream with the highest priority.
|
// If isHighPriority is true, return the stream with the highest priority.
|
||||||
int priority = isHighPriority ? -max_compile_time_stream_priorities + 1 : 0;
|
// See Note [XPU Stream priorities]
|
||||||
|
int priority = isHighPriority ? -max_compile_time_stream_priorities + 2 : 0;
|
||||||
return getStreamFromPool(priority, device);
|
return getStreamFromPool(priority, device);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -27,7 +27,7 @@ namespace c10::xpu {
|
|||||||
* threads as the SYCL specification described.
|
* threads as the SYCL specification described.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
static constexpr int max_compile_time_stream_priorities = 2;
|
static constexpr int max_compile_time_stream_priorities = 3;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This serves as a wrapper around c10::Stream and acts as a representation for
|
* This serves as a wrapper around c10::Stream and acts as a representation for
|
||||||
@ -132,7 +132,8 @@ class C10_XPU_API XPUStream {
|
|||||||
|
|
||||||
/// Return the range of priority **supported by PyTorch**.
|
/// Return the range of priority **supported by PyTorch**.
|
||||||
static std::tuple<int, int> priority_range() {
|
static std::tuple<int, int> priority_range() {
|
||||||
return std::make_tuple(0, -max_compile_time_stream_priorities + 1);
|
// See Note [XPU Stream priorities]
|
||||||
|
return std::make_tuple(1, -max_compile_time_stream_priorities + 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -69,11 +69,24 @@ TEST(XPUStreamTest, StreamBehavior) {
|
|||||||
|
|
||||||
auto [least_priority, greatest_priority] =
|
auto [least_priority, greatest_priority] =
|
||||||
c10::xpu::XPUStream::priority_range();
|
c10::xpu::XPUStream::priority_range();
|
||||||
EXPECT_EQ(least_priority, 0);
|
EXPECT_EQ(least_priority, 1);
|
||||||
EXPECT_TRUE(greatest_priority < 0);
|
EXPECT_EQ(greatest_priority, -1);
|
||||||
|
|
||||||
stream = c10::xpu::getStreamFromPool(/* isHighPriority */ true);
|
stream = c10::xpu::getStreamFromPool(/* isHighPriority */ true);
|
||||||
EXPECT_TRUE(stream.priority() < 0);
|
EXPECT_EQ(stream.priority(), -1);
|
||||||
|
stream = c10::xpu::getStreamFromPool(/* isHighPriority */ false);
|
||||||
|
EXPECT_EQ(stream.priority(), 0);
|
||||||
|
|
||||||
|
stream = c10::xpu::getStreamFromPool(-1);
|
||||||
|
EXPECT_EQ(stream.priority(), -1);
|
||||||
|
stream = c10::xpu::getStreamFromPool(-10);
|
||||||
|
EXPECT_EQ(stream.priority(), -1);
|
||||||
|
stream = c10::xpu::getStreamFromPool(0);
|
||||||
|
EXPECT_EQ(stream.priority(), 0);
|
||||||
|
stream = c10::xpu::getStreamFromPool(1);
|
||||||
|
EXPECT_EQ(stream.priority(), 1);
|
||||||
|
stream = c10::xpu::getStreamFromPool(10);
|
||||||
|
EXPECT_EQ(stream.priority(), 1);
|
||||||
|
|
||||||
if (c10::xpu::device_count() <= 1) {
|
if (c10::xpu::device_count() <= 1) {
|
||||||
return;
|
return;
|
||||||
|
@ -21,9 +21,11 @@ class Stream(torch._C._XpuStreamBase):
|
|||||||
device(torch.device or int, optional): a device on which to allocate
|
device(torch.device or int, optional): a device on which to allocate
|
||||||
the stream. If :attr:`device` is ``None`` (default) or a negative
|
the stream. If :attr:`device` is ``None`` (default) or a negative
|
||||||
integer, this will use the current device.
|
integer, this will use the current device.
|
||||||
priority(int, optional): priority of the stream, should be 0 or
|
priority(int, optional): priority of the stream, which can be positive, 0, or negative.
|
||||||
negative, where negative numbers indicate higher priority. By default,
|
A lower number indicates a higher priority. By default, the priority is set to 0.
|
||||||
streams have priority 0.
|
If the value falls outside of the allowed priority range, it will automatically be
|
||||||
|
mapped to the nearest valid priority (lowest for large positive numbers or
|
||||||
|
highest for large negative numbers).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __new__(cls, device=None, priority=0, **kwargs):
|
def __new__(cls, device=None, priority=0, **kwargs):
|
||||||
|
Reference in New Issue
Block a user