[RELAND] refactor lazy init to device-agnostic (#119248)

# Motivation
This PR generalizes `cuda_lazy_init` into `device_lazy_init`, a device-agnostic API that can support any backend, and renames `maybe_initialize_cuda` to `maybe_initialize_device`, preserving lazy initialization for CUDA while keeping the mechanism extensible to other backends.
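
At call sites, the change looks like the sketch below (`warm_up_cuda` is an illustrative helper, not code from this PR; the two calls inside it are taken verbatim from the hunks further down):

```cpp
#include <torch/csrc/utils/device_lazy_init.h>

// Illustrative helper only: shows the before/after shape of call sites.
void warm_up_cuda(const at::TensorOptions& options) {
  // Before this PR: torch::utils::cuda_lazy_init();
  torch::utils::device_lazy_init(at::kCUDA);

  // Before this PR: torch::utils::maybe_initialize_cuda(options);
  torch::utils::maybe_initialize_device(options);
}
```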

# Design
We maintain one flag per backend, so each backend's lazy-initialization state is tracked independently.
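
A condensed excerpt of that bookkeeping, taken from the new `torch/csrc/utils/device_lazy_init.cpp` added further down in this diff (the `<array>` include is made explicit here):

```cpp
#include <torch/csrc/utils/device_lazy_init.h>

#include <array>

namespace torch::utils {
namespace {

// One flag per device type, indexed by at::DeviceType. Zero-initialized,
// so every backend starts out "not yet initialized".
std::array<bool, at::COMPILE_TIME_MAX_DEVICE_TYPES> is_initialized{};

} // anonymous namespace

// Resetting a flag (e.g. in a forked child process) forces the next
// device_lazy_init(device_type) call to re-run initialization for that
// backend only; other backends keep their state.
void set_requires_device_init(at::DeviceType device_type, bool value) {
  is_initialized[static_cast<int>(device_type)] = !value;
}

} // namespace torch::utils
```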

# Additional Context
No additional unit tests are needed.
This is a reland; the original PR is [refactor lazy init to device-agnostic](https://github.com/pytorch/pytorch/pull/118846).
This PR touches only common (device-agnostic) code and does not trigger the xpu ciflow.

Differential Revision: [D53478332](https://our.internmc.facebook.com/intern/diff/D53478332)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/119248
Approved by: https://github.com/EikanWang, https://github.com/gujinghui, https://github.com/jgong5, https://github.com/atalman
Author: Yu, Guangye
Date: 2024-02-06 09:41:28 +00:00
Committed by: PyTorch MergeBot
Parent: 3625ccfbea
Commit: 5c46600f84
16 changed files with 118 additions and 116 deletions

View File

@@ -869,7 +869,7 @@ libtorch_python_core_sources = [
"torch/csrc/utils/init.cpp",
"torch/csrc/utils/throughput_benchmark.cpp",
"torch/csrc/utils.cpp",
"torch/csrc/utils/cuda_lazy_init.cpp",
"torch/csrc/utils/device_lazy_init.cpp",
"torch/csrc/utils/invalid_arguments.cpp",
"torch/csrc/utils/nested.cpp",
"torch/csrc/utils/object_ptr.cpp",

View File

@@ -14,7 +14,7 @@
#include "torch/csrc/utils/pycfunction_helpers.h"
#include "torch/csrc/utils/python_arg_parser.h"
#include "torch/csrc/utils/structseq.h"
#include "torch/csrc/utils/cuda_lazy_init.h"
#include "torch/csrc/utils/device_lazy_init.h"
#include <ATen/core/Tensor.h>

View File

@@ -14,7 +14,7 @@
#include "torch/csrc/utils/pycfunction_helpers.h"
#include "torch/csrc/utils/python_arg_parser.h"
#include "torch/csrc/utils/structseq.h"
#include "torch/csrc/utils/cuda_lazy_init.h"
#include "torch/csrc/utils/device_lazy_init.h"
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>

View File

@@ -14,7 +14,7 @@
#include "torch/csrc/utils/pycfunction_helpers.h"
#include "torch/csrc/utils/python_arg_parser.h"
#include "torch/csrc/utils/structseq.h"
#include "torch/csrc/utils/cuda_lazy_init.h"
#include "torch/csrc/utils/device_lazy_init.h"
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>

View File

@@ -31,7 +31,7 @@
#include "torch/csrc/jit/frontend/tracer.h"
#include "torch/csrc/autograd/generated/variable_factories.h"
#include "torch/csrc/utils/structseq.h"
#include "torch/csrc/utils/cuda_lazy_init.h"
#include "torch/csrc/utils/device_lazy_init.h"
#include "torch/csrc/autograd/generated/python_return_types.h"
#include <ATen/core/Tensor.h>

View File

@@ -21,7 +21,7 @@
#ifdef USE_CUDA
#include "torch/csrc/cuda/Event.h"
#endif
#include "torch/csrc/utils/cuda_lazy_init.h"
#include "torch/csrc/utils/device_lazy_init.h"
#include "torch/csrc/utils/object_ptr.h"
#include "torch/csrc/utils/pycfunction_helpers.h"
#include "torch/csrc/utils/python_arg_parser.h"
@@ -495,7 +495,7 @@ static PyObject * THPVariable_cuda(PyObject* self, PyObject* args, PyObject* kwa
auto device = r.isNone(0) ? at::Device(at::DeviceType::CUDA) : r.device(0);
auto opt_memory_format = r.memoryformatOptional(2);
TORCH_CHECK(device.is_cuda(), "Invalid device, must be cuda device");
torch::utils::cuda_lazy_init();
torch::utils::device_lazy_init(at::kCUDA);
return THPVariable_Wrap(dispatch_to(self_, device, r.toBool(1), false, opt_memory_format));
END_HANDLE_TH_ERRORS
}
@@ -973,9 +973,7 @@ static PyObject * THPVariable_to(PyObject* self, PyObject* args, PyObject* kwarg
auto copy = std::get<3>(parsed);
auto opt_memory_format = std::get<4>(parsed);
auto& self_ = THPVariable_Unpack(self);
if (device && device->is_cuda()) {
torch::utils::cuda_lazy_init();
}
torch::utils::maybe_initialize_device(device);
if (device && device->is_privateuseone()) {
at::globalContext().lazyInitPrivateUse1();
}
@@ -1057,9 +1055,7 @@ static PyObject * THPVariable_type(PyObject* self, PyObject* args, PyObject* kwa
if (device_type != device.type()) {
device = at::Device(device_type);
}
if (device.is_cuda()) {
torch::utils::cuda_lazy_init();
}
torch::utils::maybe_initialize_device(device);
if (device.is_privateuseone()) {
at::globalContext().lazyInitPrivateUse1();
}

View File

@@ -8,7 +8,7 @@
#include <torch/csrc/Storage.h>
#include <torch/csrc/autograd/generated/VariableType.h>
#include <torch/csrc/utils/cuda_enabled.h>
#include <torch/csrc/utils/cuda_lazy_init.h>
#include <torch/csrc/utils/device_lazy_init.h>
#include <torch/csrc/utils/object_ptr.h>
#include <ATen/ATen.h>

View File

@@ -9,7 +9,7 @@
#include <torch/csrc/autograd/python_variable.h>
#include <torch/csrc/autograd/utils/wrap_outputs.h>
#include <torch/csrc/jit/frontend/tracer.h>
#include <torch/csrc/utils/cuda_lazy_init.h>
#include <torch/csrc/utils/device_lazy_init.h>
#include <torch/csrc/utils/out_types.h>
#include <torch/csrc/utils/pybind.h>
#include <torch/csrc/utils/pycfunction_helpers.h>
@@ -61,7 +61,7 @@ inline Tensor dispatch_range(
const Scalar& end,
const Scalar& step,
const TensorOptions& options) {
torch::utils::maybe_initialize_cuda(options);
torch::utils::maybe_initialize_device(options);
pybind11::gil_scoped_release no_gil;
DeviceGuard device_guard(options.device());
return torch::range(start, end, step, options);

View File

@@ -39,7 +39,7 @@
#include <torch/csrc/cuda/python_comm.h>
#include <torch/csrc/profiler/python/combined_traceback.h>
#include <torch/csrc/python_headers.h>
#include <torch/csrc/utils/cuda_lazy_init.h>
#include <torch/csrc/utils/device_lazy_init.h>
#include <torch/csrc/utils/pybind.h>
#include <torch/csrc/utils/pycfunction_helpers.h>
#include <torch/csrc/utils/python_numbers.h>
@@ -62,7 +62,7 @@ static bool in_bad_fork = false; // True for children forked after cuda init
// Called in the forked child if cuda has already been initialized
static void forked_child() {
in_bad_fork = true;
torch::utils::set_requires_cuda_init(true);
torch::utils::set_requires_device_init(at::kCUDA, true);
}
#endif
@@ -85,7 +85,7 @@ PyObject* THCPModule_setDevice_wrap(PyObject* self, PyObject* arg) {
TORCH_CHECK(THPUtils_checkLong(arg), "invalid argument to setDevice");
auto device = THPUtils_unpackLong(arg);
torch::utils::cuda_lazy_init();
torch::utils::device_lazy_init(at::kCUDA);
c10::cuda::set_device(static_cast<c10::DeviceIndex>(device));
Py_RETURN_NONE;
@@ -100,7 +100,7 @@ PyObject* THCPModule_exchangeDevice(PyObject* self, PyObject* arg) {
return THPUtils_packInt32(-1);
}
torch::utils::cuda_lazy_init();
torch::utils::device_lazy_init(at::kCUDA);
int current_device = c10::cuda::ExchangeDevice(device);
return THPUtils_packInt32(current_device);
@@ -115,7 +115,7 @@ PyObject* THCPModule_maybeExchangeDevice(PyObject* self, PyObject* arg) {
return THPUtils_packInt32(-1);
}
torch::utils::cuda_lazy_init();
torch::utils::device_lazy_init(at::kCUDA);
int current_device = c10::cuda::MaybeExchangeDevice(device);
return THPUtils_packInt32(current_device);
@@ -124,7 +124,7 @@ PyObject* THCPModule_maybeExchangeDevice(PyObject* self, PyObject* arg) {
PyObject* THCPModule_getDevice_wrap(PyObject* self, PyObject* noargs) {
HANDLE_TH_ERRORS
torch::utils::cuda_lazy_init();
torch::utils::device_lazy_init(at::kCUDA);
// NOLINTNEXTLINE(bugprone-signed-char-misuse)
auto device = static_cast<int32_t>(c10::cuda::current_device());
return THPUtils_packInt32(device);
@@ -151,7 +151,7 @@ PyObject* THCPModule_canDeviceAccessPeer_wrap(PyObject* self, PyObject* args) {
int64_t device = THPUtils_unpackLong(arg1);
int64_t peer_device = THPUtils_unpackLong(arg2);
torch::utils::cuda_lazy_init();
torch::utils::device_lazy_init(at::kCUDA);
auto can_access = at::cuda::canDeviceAccessPeer(device, peer_device);
return PyBool_FromLong(can_access);
END_HANDLE_TH_ERRORS

View File

@@ -13,7 +13,7 @@
#include <torch/csrc/autograd/utils/wrap_outputs.h>
#include <torch/csrc/autograd/variable.h>
#include <torch/csrc/utils/cuda_enabled.h>
#include <torch/csrc/utils/cuda_lazy_init.h>
#include <torch/csrc/utils/device_lazy_init.h>
#include <torch/csrc/utils/python_strings.h>
#include <torch/csrc/utils/tensor_new.h>
#include <torch/csrc/utils/tensor_types.h>

View File

@@ -1,43 +0,0 @@
#include <torch/csrc/utils/cuda_lazy_init.h>
#include <torch/csrc/Exceptions.h>
#include <torch/csrc/python_headers.h>
#include <torch/csrc/utils/object_ptr.h>
namespace torch {
namespace utils {
namespace {
bool is_initialized = false;
}
void cuda_lazy_init() {
pybind11::gil_scoped_acquire g;
// Protected by the GIL. We don't use call_once because under ASAN it
// has a buggy implementation that deadlocks if an instance throws an
// exception. In any case, call_once isn't necessary, because we
// have taken a lock.
if (is_initialized) {
return;
}
auto module = THPObjectPtr(PyImport_ImportModule("torch.cuda"));
if (!module) {
throw python_error();
}
auto res = THPObjectPtr(PyObject_CallMethod(module.get(), "_lazy_init", ""));
if (!res) {
throw python_error();
}
is_initialized = true;
}
void set_requires_cuda_init(bool value) {
is_initialized = !value;
}
} // namespace utils
} // namespace torch

View File

@@ -1,33 +0,0 @@
#pragma once
#include <c10/core/TensorOptions.h>
// cuda_lazy_init() is always compiled, even for CPU-only builds.
// Thus, it does not live in the cuda/ folder.
namespace torch {
namespace utils {
// The INVARIANT is that this function MUST be called before you attempt
// to get a CUDA Type object from ATen, in any way. Here are some common
// ways that a Type object may be retrieved:
//
// - You call getNonVariableType or getNonVariableTypeOpt
// - You call toBackend() on a Type
//
// It's important to do this correctly, because if you forget to add it
// you'll get an oblique error message about "Cannot initialize CUDA without
// ATen_cuda library" if you try to use CUDA functionality from a CPU-only
// build, which is not good UX.
//
void cuda_lazy_init();
void set_requires_cuda_init(bool value);
static void maybe_initialize_cuda(const at::TensorOptions& options) {
if (options.device().is_cuda()) {
torch::utils::cuda_lazy_init();
}
}
} // namespace utils
} // namespace torch

View File

@@ -0,0 +1,42 @@
#include <torch/csrc/utils/device_lazy_init.h>
#include <torch/csrc/Exceptions.h>
#include <torch/csrc/python_headers.h>
#include <torch/csrc/utils/object_ptr.h>
#include <iostream>
namespace torch::utils {
namespace {
std::array<bool, at::COMPILE_TIME_MAX_DEVICE_TYPES> is_initialized{};
} // anonymous namespace
void device_lazy_init(at::DeviceType device_type) {
pybind11::gil_scoped_acquire g;
// Protected by the GIL. We don't use call_once because under ASAN it
// has a buggy implementation that deadlocks if an instance throws an
// exception. In any case, call_once isn't necessary, because we
// have taken a lock.
if (is_initialized[static_cast<int>(device_type)]) {
return;
}
std::string module_name = "torch." + at::DeviceTypeName(device_type, true);
auto module = THPObjectPtr(PyImport_ImportModule(module_name.c_str()));
if (!module) {
throw python_error();
}
auto res = THPObjectPtr(PyObject_CallMethod(module.get(), "_lazy_init", ""));
if (!res) {
throw python_error();
}
is_initialized[static_cast<int>(device_type)] = true;
}
void set_requires_device_init(at::DeviceType device_type, bool value) {
is_initialized[static_cast<int>(device_type)] = !value;
}
} // namespace torch::utils

View File

@@ -0,0 +1,48 @@
#pragma once
#include <c10/core/TensorOptions.h>
// device_lazy_init() is always compiled, even for CPU-only builds.
namespace torch::utils {
/**
* This lazy-initialization mechanism is maintained separately for each device
* backend. Currently, CUDA and XPU follow this design. The function
* `device_lazy_init` MUST be called before you attempt to access any
* Type (CUDA or XPU) object from ATen, in any way. It guarantees that the
* device runtime is lazily initialized the first time a runtime API is
* requested.
*
* Here are some common ways that a device object may be retrieved:
* - You call getNonVariableType or getNonVariableTypeOpt
* - You call toBackend() on a Type
*
* It's important to do this correctly, because if you forget to add it you'll
* get an oblique error message like "Cannot initialize CUDA without
* ATen_cuda library" or "Cannot initialize XPU without ATen_xpu library" if you
* try to use CUDA or XPU functionality from a CPU-only build, which is not good
* UX.
*/
void device_lazy_init(at::DeviceType device_type);
void set_requires_device_init(at::DeviceType device_type, bool value);
static inline void maybe_initialize_device(at::Device& device) {
// Add more devices here to enable lazy initialization.
if (device.is_cuda()) {
device_lazy_init(device.type());
}
}
static inline void maybe_initialize_device(c10::optional<at::Device>& device) {
if (!device.has_value()) {
return;
}
maybe_initialize_device(device.value());
}
static inline void maybe_initialize_device(const at::TensorOptions& options) {
auto device = options.device();
maybe_initialize_device(device);
}
} // namespace torch::utils
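
For orientation, a small hypothetical usage sketch of this header (the helpers `reset_after_fork` and `move_to` are illustrative, not part of this commit; the calls they wrap mirror the `Module.cpp` and `THPVariable_to` hunks above):

```cpp
#include <torch/csrc/utils/device_lazy_init.h>

// Mirrors forked_child() in torch/csrc/cuda/Module.cpp: a forked child must
// re-initialize CUDA, so mark that backend as requiring init again.
void reset_after_fork() {
  torch::utils::set_requires_device_init(at::kCUDA, true);
}

// Mirrors THPVariable_to: the optional<Device> overload is a no-op when no
// device was supplied, and otherwise initializes only backends that
// participate in lazy init (currently CUDA).
void move_to(c10::optional<at::Device> device) {
  torch::utils::maybe_initialize_device(device);
}
```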

View File

@@ -7,7 +7,7 @@
#include <torch/csrc/Size.h>
#include <torch/csrc/autograd/generated/variable_factories.h>
#include <torch/csrc/autograd/variable.h>
#include <torch/csrc/utils/cuda_lazy_init.h>
#include <torch/csrc/utils/device_lazy_init.h>
#include <torch/csrc/utils/numpy_stub.h>
#include <torch/csrc/utils/pybind.h>
#include <torch/csrc/utils/python_arg_parser.h>
@@ -60,12 +60,6 @@ TensorOptions build_options(
return options;
}
void maybe_initialize_cuda(const Device& device) {
if (device.is_cuda()) {
torch::utils::cuda_lazy_init();
}
}
// NB: It appears there is some consistency invariant between options and
// device, where if device is non-empty, its type must be consistent with the
// device type in options.
@@ -76,7 +70,7 @@ Tensor new_with_sizes(
at::ScalarType scalar_type,
const optional<Device>& device,
c10::SymIntArrayRef sizes) {
maybe_initialize_cuda(options.device());
maybe_initialize_device(options.device());
pybind11::gil_scoped_release no_gil;
return at::empty_symint(sizes, build_options(options, scalar_type, device));
}
@@ -319,7 +313,7 @@ Tensor internal_new_from_data(
type_inference ? var.scalar_type() : scalar_type;
auto device = device_opt.has_value() ? *device_opt : var.device();
pybind11::gil_scoped_release no_gil;
maybe_initialize_cuda(device);
maybe_initialize_device(device);
return var.to(
device,
inferred_scalar_type,
@@ -337,7 +331,7 @@ Tensor internal_new_from_data(
type_inference ? tensor.scalar_type() : scalar_type;
auto device = device_opt.has_value() ? *device_opt : options.device();
pybind11::gil_scoped_release no_gil;
maybe_initialize_cuda(device);
maybe_initialize_device(device);
return tensor.to(
device,
inferred_scalar_type,
@@ -353,7 +347,7 @@ Tensor internal_new_from_data(
type_inference ? tensor.scalar_type() : scalar_type;
auto device = device_opt.has_value() ? *device_opt : options.device();
pybind11::gil_scoped_release no_gil;
maybe_initialize_cuda(device);
maybe_initialize_device(device);
return tensor.to(
device,
inferred_scalar_type,
@@ -449,7 +443,7 @@ Tensor internal_new_from_data(
}
}
pybind11::gil_scoped_release no_gil;
maybe_initialize_cuda(device);
maybe_initialize_device(device);
// However, it is VERY important that we trace the to() call here (even
// though the reason this is important is a hack). Without *some* factory
// function call that is traced at construction time, we will consider
@@ -1640,9 +1634,7 @@ Tensor tensor_fromDLPack(PyObject* data) {
// because cuda ATen types have not been registered in Python yet.
// so if we have a cuda tensor, then we need to make sure
// we have called _lazy_init here
if (atensor.is_cuda()) {
py::module::import("torch.cuda").attr("init")();
}
maybe_initialize_device(atensor.device());
return atensor;
}

View File

@@ -1443,7 +1443,7 @@ const auto options = TensorOptions()
.layout({arg_parser_outputs['layout'].expr})
.requires_grad({arg_parser_outputs['requires_grad'].expr})
.pinned_memory({arg_parser_outputs['pin_memory'].expr});
torch::utils::maybe_initialize_cuda(options);
torch::utils::maybe_initialize_device(options);
"""
)
lambda_args_exprs["options"] = "options"