[RELAND] refactor lazy init to device-agnostic (#119248)
# Motivation

This PR extends `cuda_lazy_init` to `device_lazy_init`, a device-agnostic API that can support any backend, and changes `maybe_initialize_cuda` to `maybe_initialize_device`, preserving lazy initialization for CUDA while remaining extensible to other backends.

# Design

We maintain a separate flag for each backend to track its lazy-initialization state independently.

# Additional Context

No additional UTs are needed. This is a reland; the original PR is [refactor lazy init to device-agnostic](https://github.com/pytorch/pytorch/pull/118846). This is a common (backend-neutral) PR and does not trigger the xpu ciflow.

Differential Revision: [D53478332](https://our.internmc.facebook.com/intern/diff/D53478332)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/119248
Approved by: https://github.com/EikanWang, https://github.com/gujinghui, https://github.com/jgong5, https://github.com/atalman
committed by PyTorch MergeBot
parent 3625ccfbea
commit 5c46600f84
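Before the file-by-file diff, a minimal sketch of the call-site migration this PR performs. The wrapper function below is hypothetical and for illustration only; `device_lazy_init`, `maybe_initialize_device`, and `at::kCUDA` are the real identifiers introduced in the diff.

```cpp
#include <torch/csrc/utils/device_lazy_init.h>

// Hypothetical call site, for illustration only.
void example_call_site(at::Device device) {
  // Before: torch::utils::cuda_lazy_init();  (CUDA-only)
  torch::utils::device_lazy_init(at::kCUDA);  // backend passed explicitly

  // Before: if (device.is_cuda()) { torch::utils::cuda_lazy_init(); }
  torch::utils::maybe_initialize_device(device);  // device-agnostic guard
}
```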
@@ -869,7 +869,7 @@ libtorch_python_core_sources = [
     "torch/csrc/utils/init.cpp",
     "torch/csrc/utils/throughput_benchmark.cpp",
     "torch/csrc/utils.cpp",
-    "torch/csrc/utils/cuda_lazy_init.cpp",
+    "torch/csrc/utils/device_lazy_init.cpp",
     "torch/csrc/utils/invalid_arguments.cpp",
     "torch/csrc/utils/nested.cpp",
     "torch/csrc/utils/object_ptr.cpp",
@@ -14,7 +14,7 @@
 #include "torch/csrc/utils/pycfunction_helpers.h"
 #include "torch/csrc/utils/python_arg_parser.h"
 #include "torch/csrc/utils/structseq.h"
-#include "torch/csrc/utils/cuda_lazy_init.h"
+#include "torch/csrc/utils/device_lazy_init.h"
 
 #include <ATen/core/Tensor.h>
@@ -14,7 +14,7 @@
 #include "torch/csrc/utils/pycfunction_helpers.h"
 #include "torch/csrc/utils/python_arg_parser.h"
 #include "torch/csrc/utils/structseq.h"
-#include "torch/csrc/utils/cuda_lazy_init.h"
+#include "torch/csrc/utils/device_lazy_init.h"
 
 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/Functions.h>
@@ -14,7 +14,7 @@
 #include "torch/csrc/utils/pycfunction_helpers.h"
 #include "torch/csrc/utils/python_arg_parser.h"
 #include "torch/csrc/utils/structseq.h"
-#include "torch/csrc/utils/cuda_lazy_init.h"
+#include "torch/csrc/utils/device_lazy_init.h"
 
 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/Functions.h>
@@ -31,7 +31,7 @@
 #include "torch/csrc/jit/frontend/tracer.h"
 #include "torch/csrc/autograd/generated/variable_factories.h"
 #include "torch/csrc/utils/structseq.h"
-#include "torch/csrc/utils/cuda_lazy_init.h"
+#include "torch/csrc/utils/device_lazy_init.h"
 #include "torch/csrc/autograd/generated/python_return_types.h"
 
 #include <ATen/core/Tensor.h>
@@ -21,7 +21,7 @@
 #ifdef USE_CUDA
 #include "torch/csrc/cuda/Event.h"
 #endif
-#include "torch/csrc/utils/cuda_lazy_init.h"
+#include "torch/csrc/utils/device_lazy_init.h"
 #include "torch/csrc/utils/object_ptr.h"
 #include "torch/csrc/utils/pycfunction_helpers.h"
 #include "torch/csrc/utils/python_arg_parser.h"
@@ -495,7 +495,7 @@ static PyObject * THPVariable_cuda(PyObject* self, PyObject* args, PyObject* kwa
   auto device = r.isNone(0) ? at::Device(at::DeviceType::CUDA) : r.device(0);
   auto opt_memory_format = r.memoryformatOptional(2);
   TORCH_CHECK(device.is_cuda(), "Invalid device, must be cuda device");
-  torch::utils::cuda_lazy_init();
+  torch::utils::device_lazy_init(at::kCUDA);
   return THPVariable_Wrap(dispatch_to(self_, device, r.toBool(1), false, opt_memory_format));
   END_HANDLE_TH_ERRORS
 }
@@ -973,9 +973,7 @@ static PyObject * THPVariable_to(PyObject* self, PyObject* args, PyObject* kwarg
   auto copy = std::get<3>(parsed);
   auto opt_memory_format = std::get<4>(parsed);
   auto& self_ = THPVariable_Unpack(self);
-  if (device && device->is_cuda()) {
-    torch::utils::cuda_lazy_init();
-  }
+  torch::utils::maybe_initialize_device(device);
   if (device && device->is_privateuseone()) {
     at::globalContext().lazyInitPrivateUse1();
   }
@@ -1057,9 +1055,7 @@ static PyObject * THPVariable_type(PyObject* self, PyObject* args, PyObject* kwa
   if (device_type != device.type()) {
     device = at::Device(device_type);
   }
-  if (device.is_cuda()) {
-    torch::utils::cuda_lazy_init();
-  }
+  torch::utils::maybe_initialize_device(device);
   if (device.is_privateuseone()) {
     at::globalContext().lazyInitPrivateUse1();
   }
@@ -8,7 +8,7 @@
 #include <torch/csrc/Storage.h>
 #include <torch/csrc/autograd/generated/VariableType.h>
 #include <torch/csrc/utils/cuda_enabled.h>
-#include <torch/csrc/utils/cuda_lazy_init.h>
+#include <torch/csrc/utils/device_lazy_init.h>
 #include <torch/csrc/utils/object_ptr.h>
 
 #include <ATen/ATen.h>
@@ -9,7 +9,7 @@
 #include <torch/csrc/autograd/python_variable.h>
 #include <torch/csrc/autograd/utils/wrap_outputs.h>
 #include <torch/csrc/jit/frontend/tracer.h>
-#include <torch/csrc/utils/cuda_lazy_init.h>
+#include <torch/csrc/utils/device_lazy_init.h>
 #include <torch/csrc/utils/out_types.h>
 #include <torch/csrc/utils/pybind.h>
 #include <torch/csrc/utils/pycfunction_helpers.h>
@@ -61,7 +61,7 @@ inline Tensor dispatch_range(
     const Scalar& end,
     const Scalar& step,
     const TensorOptions& options) {
-  torch::utils::maybe_initialize_cuda(options);
+  torch::utils::maybe_initialize_device(options);
   pybind11::gil_scoped_release no_gil;
   DeviceGuard device_guard(options.device());
   return torch::range(start, end, step, options);
@@ -39,7 +39,7 @@
 #include <torch/csrc/cuda/python_comm.h>
 #include <torch/csrc/profiler/python/combined_traceback.h>
 #include <torch/csrc/python_headers.h>
-#include <torch/csrc/utils/cuda_lazy_init.h>
+#include <torch/csrc/utils/device_lazy_init.h>
 #include <torch/csrc/utils/pybind.h>
 #include <torch/csrc/utils/pycfunction_helpers.h>
 #include <torch/csrc/utils/python_numbers.h>
@@ -62,7 +62,7 @@ static bool in_bad_fork = false; // True for children forked after cuda init
 // Called in the forked child if cuda has already been initialized
 static void forked_child() {
   in_bad_fork = true;
-  torch::utils::set_requires_cuda_init(true);
+  torch::utils::set_requires_device_init(at::kCUDA, true);
 }
 #endif
 
@@ -85,7 +85,7 @@ PyObject* THCPModule_setDevice_wrap(PyObject* self, PyObject* arg) {
   TORCH_CHECK(THPUtils_checkLong(arg), "invalid argument to setDevice");
   auto device = THPUtils_unpackLong(arg);
 
-  torch::utils::cuda_lazy_init();
+  torch::utils::device_lazy_init(at::kCUDA);
   c10::cuda::set_device(static_cast<c10::DeviceIndex>(device));
 
   Py_RETURN_NONE;
@@ -100,7 +100,7 @@ PyObject* THCPModule_exchangeDevice(PyObject* self, PyObject* arg) {
     return THPUtils_packInt32(-1);
   }
 
-  torch::utils::cuda_lazy_init();
+  torch::utils::device_lazy_init(at::kCUDA);
   int current_device = c10::cuda::ExchangeDevice(device);
 
   return THPUtils_packInt32(current_device);
@@ -115,7 +115,7 @@ PyObject* THCPModule_maybeExchangeDevice(PyObject* self, PyObject* arg) {
     return THPUtils_packInt32(-1);
   }
 
-  torch::utils::cuda_lazy_init();
+  torch::utils::device_lazy_init(at::kCUDA);
   int current_device = c10::cuda::MaybeExchangeDevice(device);
 
   return THPUtils_packInt32(current_device);
@@ -124,7 +124,7 @@
 
 PyObject* THCPModule_getDevice_wrap(PyObject* self, PyObject* noargs) {
   HANDLE_TH_ERRORS
-  torch::utils::cuda_lazy_init();
+  torch::utils::device_lazy_init(at::kCUDA);
   // NOLINTNEXTLINE(bugprone-signed-char-misuse)
   auto device = static_cast<int32_t>(c10::cuda::current_device());
   return THPUtils_packInt32(device);
@@ -151,7 +151,7 @@ PyObject* THCPModule_canDeviceAccessPeer_wrap(PyObject* self, PyObject* args) {
   int64_t device = THPUtils_unpackLong(arg1);
   int64_t peer_device = THPUtils_unpackLong(arg2);
 
-  torch::utils::cuda_lazy_init();
+  torch::utils::device_lazy_init(at::kCUDA);
   auto can_access = at::cuda::canDeviceAccessPeer(device, peer_device);
   return PyBool_FromLong(can_access);
   END_HANDLE_TH_ERRORS
@@ -13,7 +13,7 @@
 #include <torch/csrc/autograd/utils/wrap_outputs.h>
 #include <torch/csrc/autograd/variable.h>
 #include <torch/csrc/utils/cuda_enabled.h>
-#include <torch/csrc/utils/cuda_lazy_init.h>
+#include <torch/csrc/utils/device_lazy_init.h>
 #include <torch/csrc/utils/python_strings.h>
 #include <torch/csrc/utils/tensor_new.h>
 #include <torch/csrc/utils/tensor_types.h>
torch/csrc/utils/cuda_lazy_init.cpp (deleted, 43 lines)
@@ -1,43 +0,0 @@
-#include <torch/csrc/utils/cuda_lazy_init.h>
-
-#include <torch/csrc/Exceptions.h>
-#include <torch/csrc/python_headers.h>
-#include <torch/csrc/utils/object_ptr.h>
-
-namespace torch {
-namespace utils {
-namespace {
-
-bool is_initialized = false;
-
-}
-
-void cuda_lazy_init() {
-  pybind11::gil_scoped_acquire g;
-  // Protected by the GIL. We don't use call_once because under ASAN it
-  // has a buggy implementation that deadlocks if an instance throws an
-  // exception. In any case, call_once isn't necessary, because we
-  // have taken a lock.
-  if (is_initialized) {
-    return;
-  }
-
-  auto module = THPObjectPtr(PyImport_ImportModule("torch.cuda"));
-  if (!module) {
-    throw python_error();
-  }
-
-  auto res = THPObjectPtr(PyObject_CallMethod(module.get(), "_lazy_init", ""));
-  if (!res) {
-    throw python_error();
-  }
-
-  is_initialized = true;
-}
-
-void set_requires_cuda_init(bool value) {
-  is_initialized = !value;
-}
-
-} // namespace utils
-} // namespace torch
torch/csrc/utils/cuda_lazy_init.h (deleted, 33 lines)
@@ -1,33 +0,0 @@
-#pragma once
-
-#include <c10/core/TensorOptions.h>
-
-// cuda_lazy_init() is always compiled, even for CPU-only builds.
-// Thus, it does not live in the cuda/ folder.
-
-namespace torch {
-namespace utils {
-
-// The INVARIANT is that this function MUST be called before you attempt
-// to get a CUDA Type object from ATen, in any way. Here are some common
-// ways that a Type object may be retrieved:
-//
-//   - You call getNonVariableType or getNonVariableTypeOpt
-//   - You call toBackend() on a Type
-//
-// It's important to do this correctly, because if you forget to add it
-// you'll get an oblique error message about "Cannot initialize CUDA without
-// ATen_cuda library" if you try to use CUDA functionality from a CPU-only
-// build, which is not good UX.
-//
-void cuda_lazy_init();
-void set_requires_cuda_init(bool value);
-
-static void maybe_initialize_cuda(const at::TensorOptions& options) {
-  if (options.device().is_cuda()) {
-    torch::utils::cuda_lazy_init();
-  }
-}
-
-} // namespace utils
-} // namespace torch
torch/csrc/utils/device_lazy_init.cpp (new file, 42 lines)
@@ -0,0 +1,42 @@
+#include <torch/csrc/utils/device_lazy_init.h>
+
+#include <torch/csrc/Exceptions.h>
+#include <torch/csrc/python_headers.h>
+#include <torch/csrc/utils/object_ptr.h>
+#include <iostream>
+namespace torch::utils {
+namespace {
+
+std::array<bool, at::COMPILE_TIME_MAX_DEVICE_TYPES> is_initialized{};
+
+} // anonymous namespace
+
+void device_lazy_init(at::DeviceType device_type) {
+  pybind11::gil_scoped_acquire g;
+  // Protected by the GIL. We don't use call_once because under ASAN it
+  // has a buggy implementation that deadlocks if an instance throws an
+  // exception. In any case, call_once isn't necessary, because we
+  // have taken a lock.
+  if (is_initialized[static_cast<int>(device_type)]) {
+    return;
+  }
+
+  std::string module_name = "torch." + at::DeviceTypeName(device_type, true);
+  auto module = THPObjectPtr(PyImport_ImportModule(module_name.c_str()));
+  if (!module) {
+    throw python_error();
+  }
+
+  auto res = THPObjectPtr(PyObject_CallMethod(module.get(), "_lazy_init", ""));
+  if (!res) {
+    throw python_error();
+  }
+
+  is_initialized[static_cast<int>(device_type)] = true;
+}
+
+void set_requires_device_init(at::DeviceType device_type, bool value) {
+  is_initialized[static_cast<int>(device_type)] = !value;
+}
+
+} // namespace torch::utils
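For orientation: the module name above comes from `at::DeviceTypeName(device_type, /*lower_case=*/true)`, so each backend lazily imports its own Python module. A sketch of the resulting behavior, assuming the CUDA and XPU mappings; illustrative, not code from this diff:

```cpp
// device_lazy_init(at::kCUDA) imports "torch.cuda" and calls torch.cuda._lazy_init();
// device_lazy_init(at::kXPU) would import "torch.xpu" and call torch.xpu._lazy_init().
torch::utils::device_lazy_init(at::kCUDA);  // first call runs _lazy_init
torch::utils::device_lazy_init(at::kCUDA);  // returns early: per-backend flag already set
```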
torch/csrc/utils/device_lazy_init.h (new file, 48 lines)
@@ -0,0 +1,48 @@
+#pragma once
+
+#include <c10/core/TensorOptions.h>
+
+// device_lazy_init() is always compiled, even for CPU-only builds.
+
+namespace torch::utils {
+
+/**
+ * This mechanism of lazy initialization is designed for each device backend.
+ * Currently, CUDA and XPU follow this design. This function `device_lazy_init`
+ * MUST be called before you attempt to access any Type(CUDA or XPU) object
+ * from ATen, in any way. It guarantees that the device runtime status is
+ * lazily initialized when the first runtime API is requested.
+ *
+ * Here are some common ways that a device object may be retrieved:
+ *   - You call getNonVariableType or getNonVariableTypeOpt
+ *   - You call toBackend() on a Type
+ *
+ * It's important to do this correctly, because if you forget to add it you'll
+ * get an oblique error message seems like "Cannot initialize CUDA without
+ * ATen_cuda library" or "Cannot initialize XPU without ATen_xpu library" if
+ * you try to use CUDA or XPU functionality from a CPU-only build, which is
+ * not good UX.
+ */
+void device_lazy_init(at::DeviceType device_type);
+void set_requires_device_init(at::DeviceType device_type, bool value);
+
+static inline void maybe_initialize_device(at::Device& device) {
+  // Add more devices here to enable lazy initialization.
+  if (device.is_cuda()) {
+    device_lazy_init(device.type());
+  }
+}
+
+static inline void maybe_initialize_device(c10::optional<at::Device>& device) {
+  if (!device.has_value()) {
+    return;
+  }
+  maybe_initialize_device(device.value());
+}
+
+static inline void maybe_initialize_device(const at::TensorOptions& options) {
+  auto device = options.device();
+  maybe_initialize_device(device);
+}
+
+} // namespace torch::utils
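The `// Add more devices here` comment above is the intended extension point. A hedged sketch of how another backend (e.g. XPU) could opt in; this is hypothetical and not part of this PR:

```cpp
// Hypothetical extension, not in this diff: also enable lazy init for XPU.
static inline void maybe_initialize_device(at::Device& device) {
  if (device.is_cuda() || device.is_xpu()) {
    device_lazy_init(device.type());
  }
}
```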
@@ -7,7 +7,7 @@
 #include <torch/csrc/Size.h>
 #include <torch/csrc/autograd/generated/variable_factories.h>
 #include <torch/csrc/autograd/variable.h>
-#include <torch/csrc/utils/cuda_lazy_init.h>
+#include <torch/csrc/utils/device_lazy_init.h>
 #include <torch/csrc/utils/numpy_stub.h>
 #include <torch/csrc/utils/pybind.h>
 #include <torch/csrc/utils/python_arg_parser.h>
@@ -60,12 +60,6 @@ TensorOptions build_options(
   return options;
 }
 
-void maybe_initialize_cuda(const Device& device) {
-  if (device.is_cuda()) {
-    torch::utils::cuda_lazy_init();
-  }
-}
-
 // NB: It appears there is some consistency invariant between options and
 // device, where if device is non-empty, its type must be consistent with the
 // device type in options.
@@ -76,7 +70,7 @@ Tensor new_with_sizes(
     at::ScalarType scalar_type,
     const optional<Device>& device,
     c10::SymIntArrayRef sizes) {
-  maybe_initialize_cuda(options.device());
+  maybe_initialize_device(options.device());
   pybind11::gil_scoped_release no_gil;
   return at::empty_symint(sizes, build_options(options, scalar_type, device));
 }
@@ -319,7 +313,7 @@ Tensor internal_new_from_data(
       type_inference ? var.scalar_type() : scalar_type;
   auto device = device_opt.has_value() ? *device_opt : var.device();
   pybind11::gil_scoped_release no_gil;
-  maybe_initialize_cuda(device);
+  maybe_initialize_device(device);
   return var.to(
       device,
       inferred_scalar_type,
@@ -337,7 +331,7 @@ Tensor internal_new_from_data(
       type_inference ? tensor.scalar_type() : scalar_type;
   auto device = device_opt.has_value() ? *device_opt : options.device();
   pybind11::gil_scoped_release no_gil;
-  maybe_initialize_cuda(device);
+  maybe_initialize_device(device);
   return tensor.to(
       device,
       inferred_scalar_type,
@@ -353,7 +347,7 @@ Tensor internal_new_from_data(
       type_inference ? tensor.scalar_type() : scalar_type;
   auto device = device_opt.has_value() ? *device_opt : options.device();
   pybind11::gil_scoped_release no_gil;
-  maybe_initialize_cuda(device);
+  maybe_initialize_device(device);
   return tensor.to(
       device,
       inferred_scalar_type,
@@ -449,7 +443,7 @@ Tensor internal_new_from_data(
     }
   }
   pybind11::gil_scoped_release no_gil;
-  maybe_initialize_cuda(device);
+  maybe_initialize_device(device);
   // However, it is VERY important that we trace the to() call here (even
   // though the reason this is important is a hack). Without *some* factory
   // function call that is traced at construction time, we will consider
@@ -1640,9 +1634,7 @@ Tensor tensor_fromDLPack(PyObject* data) {
   // because cuda ATen types have not been registered in Python yet.
   // so if we have a cuda tensor, then we need to make sure
   // we have called _lazy_init here
-  if (atensor.is_cuda()) {
-    py::module::import("torch.cuda").attr("init")();
-  }
+  maybe_initialize_device(atensor.device());
   return atensor;
 }
 
@@ -1443,7 +1443,7 @@ const auto options = TensorOptions()
     .layout({arg_parser_outputs['layout'].expr})
     .requires_grad({arg_parser_outputs['requires_grad'].expr})
     .pinned_memory({arg_parser_outputs['pin_memory'].expr});
-torch::utils::maybe_initialize_cuda(options);
+torch::utils::maybe_initialize_device(options);
 """
 )
 lambda_args_exprs["options"] = "options"