mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
# Motivation According to [[1/2] Intel GPU Runtime Upstreaming for Stream](https://github.com/pytorch/pytorch/pull/117611), as mentioned in [[RFC] Intel GPU Runtime Upstreaming](https://github.com/pytorch/pytorch/issues/114842), the second PR covers the changes under `python frontend`. # Design Currently, it primarily offers stream-related APIs, including - `torch.xpu.StreamContext` - `torch.xpu.current_stream` - `torch.xpu.set_stream` - `torch.xpu.synchronize` - `torch._C._xpu_getCurrentRawStream` # Additional Context We will implement functions like `torch.xpu.Stream.wait_event`, `torch.xpu.Stream.wait_stream`, and `torch.xpu.Stream.record_event` in the next PR related with `Event`. The differences with CUDA: no default and external stream in XPU and lack of below APIs: - `torch.cuda.ExternalStream` - `torch.cuda.default_stream` - `toch.cuda.is_current_stream_capturing` Pull Request resolved: https://github.com/pytorch/pytorch/pull/117619 Approved by: https://github.com/EikanWang, https://github.com/jgong5, https://github.com/gujinghui, https://github.com/albanD ghstack dependencies: #117611
317 lines
10 KiB
C++
317 lines
10 KiB
C++
#include <ATen/ATen.h>
|
|
#include <ATen/xpu/XPUContext.h>
|
|
#include <c10/util/CallOnce.h>
|
|
#include <c10/xpu/XPUFunctions.h>
|
|
#include <torch/csrc/Module.h>
|
|
#include <torch/csrc/THP.h>
|
|
#include <torch/csrc/utils/device_lazy_init.h>
|
|
#include <torch/csrc/utils/pycfunction_helpers.h>
|
|
#include <torch/csrc/utils/python_numbers.h>
|
|
#include <torch/csrc/utils/python_strings.h>
|
|
|
|
#include <pthread.h>
|
|
|
|
using namespace torch;
|
|
|
|
static bool in_bad_fork = false; // True for children forked after xpu init
|
|
|
|
// Called in the forked child if xpu has already been initialized
|
|
static void forked_child() {
|
|
in_bad_fork = true;
|
|
torch::utils::set_requires_device_init(at::kXPU, true);
|
|
}
|
|
|
|
// Should be called before the first xpu call. It is mainly called in lazy_init.
|
|
// Note: This is distinct from initExtension because a stub xpu implementation
|
|
// has some working functions (e.g. device_count) but cannot fully initialize.
|
|
static void poison_fork() {
|
|
static c10::once_flag flag;
|
|
c10::call_once(flag, [] { pthread_atfork(nullptr, nullptr, forked_child); });
|
|
}
|
|
|
|
// XPU management methods
|
|
|
|
static PyObject* THXPModule_isInBadFork_wrap(PyObject* self, PyObject* noargs) {
|
|
HANDLE_TH_ERRORS
|
|
return PyBool_FromLong(in_bad_fork);
|
|
END_HANDLE_TH_ERRORS
|
|
}
|
|
|
|
PyObject* THXPModule_setDevice_wrap(PyObject* self, PyObject* arg) {
|
|
HANDLE_TH_ERRORS
|
|
TORCH_CHECK(THPUtils_checkLong(arg), "invalid argument to set_device");
|
|
|
|
int device = THPUtils_unpackInt(arg);
|
|
c10::xpu::set_device(static_cast<c10::DeviceIndex>(device));
|
|
|
|
Py_RETURN_NONE;
|
|
END_HANDLE_TH_ERRORS
|
|
}
|
|
|
|
PyObject* THXPModule_exchangeDevice_wrap(PyObject* self, PyObject* arg) {
|
|
HANDLE_TH_ERRORS
|
|
TORCH_CHECK(THPUtils_checkLong(arg), "invalid argument to exchange_device");
|
|
|
|
int device = THPUtils_unpackInt(arg);
|
|
if (device < 0) {
|
|
return THPUtils_packInt32(-1);
|
|
}
|
|
|
|
torch::utils::device_lazy_init(at::kXPU);
|
|
int current_device = c10::xpu::exchange_device(device);
|
|
|
|
return THPUtils_packInt32(current_device);
|
|
END_HANDLE_TH_ERRORS
|
|
}
|
|
|
|
PyObject* THXPModule_maybeExchangeDevice_wrap(PyObject* self, PyObject* arg) {
|
|
HANDLE_TH_ERRORS
|
|
TORCH_CHECK(
|
|
THPUtils_checkLong(arg), "invalid argument to maybe_exchange_device");
|
|
|
|
int device = THPUtils_unpackInt(arg);
|
|
if (device < 0) {
|
|
return THPUtils_packInt32(-1);
|
|
}
|
|
|
|
torch::utils::device_lazy_init(at::kXPU);
|
|
int current_device = c10::xpu::maybe_exchange_device(device);
|
|
|
|
return THPUtils_packInt32(current_device);
|
|
END_HANDLE_TH_ERRORS
|
|
}
|
|
|
|
PyObject* THXPModule_getDevice_wrap(PyObject* self, PyObject* noargs) {
|
|
HANDLE_TH_ERRORS
|
|
|
|
// NOLINTNEXTLINE(bugprone-signed-char-misuse)
|
|
auto device = static_cast<int32_t>(c10::xpu::current_device());
|
|
|
|
return THPUtils_packInt32(device);
|
|
END_HANDLE_TH_ERRORS
|
|
}
|
|
|
|
PyObject* THXPModule_getDeviceCount_wrap(PyObject* self, PyObject* noargs) {
|
|
HANDLE_TH_ERRORS
|
|
poison_fork();
|
|
return THPUtils_packUInt64(at::xpu::device_count());
|
|
END_HANDLE_TH_ERRORS
|
|
}
|
|
|
|
PyObject* THXPModule_getCurrentStream_wrap(
|
|
PyObject* self,
|
|
PyObject* device_index) {
|
|
HANDLE_TH_ERRORS
|
|
TORCH_CHECK(
|
|
THPUtils_checkLong(device_index), "invalid argument to current_stream");
|
|
int64_t device = THPUtils_unpackLong(device_index);
|
|
auto stream = at::xpu::getCurrentXPUStream(device);
|
|
PyObject* output_tuple = PyTuple_New(3);
|
|
PyTuple_SetItem(
|
|
output_tuple, 0, THPUtils_packInt64(static_cast<int64_t>(stream.id())));
|
|
PyTuple_SetItem(
|
|
output_tuple,
|
|
1,
|
|
THPUtils_packInt64(static_cast<int64_t>(stream.device_index())));
|
|
PyTuple_SetItem(
|
|
output_tuple,
|
|
2,
|
|
THPUtils_packInt64(static_cast<int64_t>(stream.device_type())));
|
|
return output_tuple;
|
|
END_HANDLE_TH_ERRORS
|
|
}
|
|
|
|
PyObject* THXPModule_getCurrentStream_raw(
|
|
PyObject* self,
|
|
PyObject* device_index) {
|
|
HANDLE_TH_ERRORS
|
|
TORCH_CHECK(
|
|
THPUtils_checkLong(device_index),
|
|
"invalid argument to getCurrentRawStream");
|
|
int64_t device = THPUtils_unpackLong(device_index);
|
|
return PyLong_FromVoidPtr(&at::xpu::getCurrentXPUStream(device).queue());
|
|
END_HANDLE_TH_ERRORS
|
|
}
|
|
|
|
PyObject* THXPModule_setStream_wrap(
|
|
PyObject* self,
|
|
PyObject* args,
|
|
PyObject* kwargs) {
|
|
HANDLE_TH_ERRORS
|
|
int64_t stream_id = 0;
|
|
int64_t device_index = 0;
|
|
int64_t device_type = 0;
|
|
|
|
// NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
|
|
constexpr const char* kwlist[] = {
|
|
"stream_id", "device_index", "device_type", nullptr};
|
|
if (!PyArg_ParseTupleAndKeywords(
|
|
args,
|
|
kwargs,
|
|
"|LLL",
|
|
const_cast<char**>(kwlist),
|
|
&stream_id,
|
|
&device_index,
|
|
&device_type)) {
|
|
}
|
|
|
|
auto stream = at::xpu::XPUStream::unpack3(
|
|
stream_id, device_index, static_cast<c10::DeviceType>(device_type));
|
|
|
|
auto device = c10::xpu::current_device();
|
|
if (device != stream.device_index()) {
|
|
c10::xpu::set_device(stream.device_index());
|
|
}
|
|
at::xpu::setCurrentXPUStream(stream);
|
|
Py_RETURN_NONE;
|
|
END_HANDLE_TH_ERRORS
|
|
}
|
|
|
|
PyObject* THXPModule_xpuSynchronize(PyObject* self, PyObject* arg) {
|
|
HANDLE_TH_ERRORS
|
|
TORCH_CHECK(THPUtils_checkLong(arg), "invalid argument to synchronize");
|
|
int device = THPUtils_unpackInt(arg);
|
|
{
|
|
pybind11::gil_scoped_release no_gil;
|
|
// Only the SYCL queues we have reserved will be synchronized, see Note
|
|
// [Synchronize Streams on Device].
|
|
c10::xpu::syncStreamsOnDevice(static_cast<c10::DeviceIndex>(device));
|
|
}
|
|
Py_RETURN_NONE;
|
|
END_HANDLE_TH_ERRORS
|
|
}
|
|
|
|
// XPU module initialization
|
|
|
|
static void registerXpuDeviceProperties(PyObject* module) {
|
|
// Add _xpuDevicePropertires class to torch._C
|
|
using namespace c10::xpu;
|
|
auto get_device_type = [](const DeviceProp& prop) {
|
|
std::ostringstream stream;
|
|
using namespace sycl::info;
|
|
switch (prop.device_type) {
|
|
case device_type::cpu:
|
|
stream << "cpu";
|
|
break;
|
|
case device_type::gpu:
|
|
stream << "gpu";
|
|
break;
|
|
case device_type::accelerator:
|
|
stream << "accelerator";
|
|
break;
|
|
case device_type::host:
|
|
stream << "host";
|
|
break;
|
|
default:
|
|
stream << "unknown device type:"
|
|
<< static_cast<typename std::underlying_type<device_type>::type>(
|
|
prop.device_type);
|
|
break;
|
|
}
|
|
return stream.str();
|
|
};
|
|
auto gpu_subslice_count = [](const DeviceProp& prop) {
|
|
return (prop.gpu_eu_count / prop.gpu_eu_count_per_subslice);
|
|
};
|
|
auto m = py::handle(module).cast<py::module>();
|
|
py::class_<DeviceProp>(m, "_XpuDeviceProperties")
|
|
.def_readonly("name", &DeviceProp::name)
|
|
.def_readonly("platform_name", &DeviceProp::platform_name)
|
|
.def_readonly("total_memory", &DeviceProp::global_mem_size)
|
|
.def_readonly("max_compute_units", &DeviceProp::max_compute_units)
|
|
.def_readonly("gpu_eu_count", &DeviceProp::gpu_eu_count)
|
|
.def_property_readonly("gpu_subslice_count", gpu_subslice_count)
|
|
.def_readonly("max_work_group_size", &DeviceProp::max_work_group_size)
|
|
.def_readonly("max_num_sub_groups", &DeviceProp::max_num_sub_groups)
|
|
.def_readonly("sub_group_sizes", &DeviceProp::sub_group_sizes)
|
|
.def_property_readonly("type", get_device_type)
|
|
.def(
|
|
"__repr__",
|
|
[&get_device_type, &gpu_subslice_count](const DeviceProp& prop) {
|
|
std::ostringstream stream;
|
|
stream << "_XpuDeviceProperties(name='" << prop.name
|
|
<< "', platform_name='" << prop.platform_name << "', type='"
|
|
<< get_device_type(prop)
|
|
<< ", total_memory=" << prop.global_mem_size / (1024 * 1024)
|
|
<< "MB, max_compute_units=" << prop.max_compute_units
|
|
<< ", gpu_eu_count=" << prop.gpu_eu_count
|
|
<< ", gpu_subslice_count=" << gpu_subslice_count(prop)
|
|
<< ", max_work_group_size=" << prop.max_work_group_size
|
|
<< ", max_num_sub_groups=" << prop.max_num_sub_groups
|
|
<< ", sub_group_sizes=[" << prop.sub_group_sizes << "])";
|
|
return stream.str();
|
|
});
|
|
}
|
|
|
|
static void bindGetDeviceProperties(PyObject* module) {
|
|
// Add method to torch.xpu
|
|
auto m = py::handle(module).cast<py::module>();
|
|
m.def(
|
|
"_get_device_properties",
|
|
[](int device) -> c10::xpu::DeviceProp* {
|
|
return at::xpu::getDeviceProperties(device);
|
|
},
|
|
py::return_value_policy::reference);
|
|
}
|
|
|
|
// Callback for python part. Used for additional initialization of python
|
|
// classes
|
|
static PyObject* THXPModule_initExtension(PyObject* self, PyObject* noargs) {
|
|
HANDLE_TH_ERRORS
|
|
TORCH_INTERNAL_ASSERT(!in_bad_fork); // Handled at python level
|
|
poison_fork();
|
|
|
|
auto m = THPObjectPtr(PyImport_ImportModule("torch.xpu"));
|
|
if (!m)
|
|
throw python_error();
|
|
|
|
bindGetDeviceProperties(m);
|
|
|
|
Py_RETURN_NONE;
|
|
END_HANDLE_TH_ERRORS
|
|
}
|
|
|
|
// NOLINTNEXTLINE(modernize-avoid-c-arrays,
|
|
// cppcoreguidelines-avoid-non-const-global-variables,
|
|
// cppcoreguidelines-avoid-c-arrays)
|
|
static struct PyMethodDef _THXPModule_methods[] = {
|
|
{"_xpu_init", THXPModule_initExtension, METH_NOARGS, nullptr},
|
|
{"_xpu_setDevice", THXPModule_setDevice_wrap, METH_O, nullptr},
|
|
{"_xpu_exchangeDevice", THXPModule_exchangeDevice_wrap, METH_O, nullptr},
|
|
{"_xpu_maybeExchangeDevice",
|
|
THXPModule_maybeExchangeDevice_wrap,
|
|
METH_O,
|
|
nullptr},
|
|
{"_xpu_getDevice", THXPModule_getDevice_wrap, METH_NOARGS, nullptr},
|
|
{"_xpu_getDeviceCount",
|
|
THXPModule_getDeviceCount_wrap,
|
|
METH_NOARGS,
|
|
nullptr},
|
|
{"_xpu_isInBadFork", THXPModule_isInBadFork_wrap, METH_NOARGS, nullptr},
|
|
{"_xpu_getCurrentStream",
|
|
THXPModule_getCurrentStream_wrap,
|
|
METH_O,
|
|
nullptr},
|
|
{"_xpu_getCurrentRawStream",
|
|
THXPModule_getCurrentStream_raw,
|
|
METH_O,
|
|
nullptr},
|
|
{"_xpu_setStream",
|
|
castPyCFunctionWithKeywords(THXPModule_setStream_wrap),
|
|
METH_VARARGS | METH_KEYWORDS,
|
|
nullptr},
|
|
{"_xpu_synchronize", THXPModule_xpuSynchronize, METH_O, nullptr},
|
|
{nullptr}};
|
|
|
|
PyMethodDef* THXPModule_methods() {
|
|
return _THXPModule_methods;
|
|
}
|
|
|
|
namespace torch::xpu {
|
|
|
|
void initModule(PyObject* module) {
|
|
registerXpuDeviceProperties(module);
|
|
}
|
|
|
|
} // namespace torch::xpu
|