[RELAND] refactor lazy init to device-agnostic (#119248)

# Motivation
This PR generalizes `cuda_lazy_init` into `device_lazy_init`, a device-agnostic API that can support any backend, and renames `maybe_initialize_cuda` to `maybe_initialize_device`, preserving lazy initialization for CUDA while keeping the mechanism extensible to other backends.
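
At call sites, the change looks like the sketch below (`warm_up_cuda` is an illustrative helper, not code from this PR; the two calls inside it are taken verbatim from the hunks further down):

```cpp
#include <torch/csrc/utils/device_lazy_init.h>

// Illustrative helper only: shows the before/after shape of call sites.
void warm_up_cuda(const at::TensorOptions& options) {
  // Before this PR: torch::utils::cuda_lazy_init();
  torch::utils::device_lazy_init(at::kCUDA);

  // Before this PR: torch::utils::maybe_initialize_cuda(options);
  torch::utils::maybe_initialize_device(options);
}
```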

# Design
We maintain one flag per backend, so each backend's lazy-initialization state is tracked independently.
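
A condensed excerpt of that bookkeeping, taken from the new `torch/csrc/utils/device_lazy_init.cpp` added further down in this diff (the `<array>` include is made explicit here):

```cpp
#include <torch/csrc/utils/device_lazy_init.h>

#include <array>

namespace torch::utils {
namespace {

// One flag per device type, indexed by at::DeviceType. Zero-initialized,
// so every backend starts out "not yet initialized".
std::array<bool, at::COMPILE_TIME_MAX_DEVICE_TYPES> is_initialized{};

} // anonymous namespace

// Resetting a flag (e.g. in a forked child process) forces the next
// device_lazy_init(device_type) call to re-run initialization for that
// backend only; other backends keep their state.
void set_requires_device_init(at::DeviceType device_type, bool value) {
  is_initialized[static_cast<int>(device_type)] = !value;
}

} // namespace torch::utils
```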

# Additional Context
No additional unit tests are needed.
This is a reland; the original PR is [refactor lazy init to device-agnostic](https://github.com/pytorch/pytorch/pull/118846).
This PR touches only common (device-agnostic) code and does not trigger the xpu ciflow.

Differential Revision: [D53478332](https://our.internmc.facebook.com/intern/diff/D53478332)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/119248
Approved by: https://github.com/EikanWang, https://github.com/gujinghui, https://github.com/jgong5, https://github.com/atalman
Author: Yu, Guangye
Date: 2024-02-06 09:41:28 +00:00
Committed by: PyTorch MergeBot
Parent: 3625ccfbea
Commit: 5c46600f84
16 changed files with 118 additions and 116 deletions

View File

@@ -869,7 +869,7 @@ libtorch_python_core_sources = [
"torch/csrc/utils/init.cpp",
"torch/csrc/utils/throughput_benchmark.cpp",
"torch/csrc/utils.cpp",
"torch/csrc/utils/cuda_lazy_init.cpp",
"torch/csrc/utils/device_lazy_init.cpp",
"torch/csrc/utils/invalid_arguments.cpp",
"torch/csrc/utils/nested.cpp",
"torch/csrc/utils/object_ptr.cpp",

View File

@@ -14,7 +14,7 @@
#include "torch/csrc/utils/pycfunction_helpers.h"
#include "torch/csrc/utils/python_arg_parser.h"
#include "torch/csrc/utils/structseq.h"
#include "torch/csrc/utils/cuda_lazy_init.h"
#include "torch/csrc/utils/device_lazy_init.h"
#include <ATen/core/Tensor.h>

View File

@@ -14,7 +14,7 @@
#include "torch/csrc/utils/pycfunction_helpers.h"
#include "torch/csrc/utils/python_arg_parser.h"
#include "torch/csrc/utils/structseq.h"
#include "torch/csrc/utils/cuda_lazy_init.h"
#include "torch/csrc/utils/device_lazy_init.h"
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>

View File

@@ -14,7 +14,7 @@
#include "torch/csrc/utils/pycfunction_helpers.h"
#include "torch/csrc/utils/python_arg_parser.h"
#include "torch/csrc/utils/structseq.h"
#include "torch/csrc/utils/cuda_lazy_init.h"
#include "torch/csrc/utils/device_lazy_init.h"
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>

View File

@@ -31,7 +31,7 @@
#include "torch/csrc/jit/frontend/tracer.h"
#include "torch/csrc/autograd/generated/variable_factories.h"
#include "torch/csrc/utils/structseq.h"
#include "torch/csrc/utils/cuda_lazy_init.h"
#include "torch/csrc/utils/device_lazy_init.h"
#include "torch/csrc/autograd/generated/python_return_types.h"
#include <ATen/core/Tensor.h>

View File

@@ -21,7 +21,7 @@
#ifdef USE_CUDA
#include "torch/csrc/cuda/Event.h"
#endif
#include "torch/csrc/utils/cuda_lazy_init.h"
#include "torch/csrc/utils/device_lazy_init.h"
#include "torch/csrc/utils/object_ptr.h"
#include "torch/csrc/utils/pycfunction_helpers.h"
#include "torch/csrc/utils/python_arg_parser.h"
@@ -495,7 +495,7 @@ static PyObject * THPVariable_cuda(PyObject* self, PyObject* args, PyObject* kwa
auto device = r.isNone(0) ? at::Device(at::DeviceType::CUDA) : r.device(0);
auto opt_memory_format = r.memoryformatOptional(2);
TORCH_CHECK(device.is_cuda(), "Invalid device, must be cuda device");
torch::utils::cuda_lazy_init();
torch::utils::device_lazy_init(at::kCUDA);
return THPVariable_Wrap(dispatch_to(self_, device, r.toBool(1), false, opt_memory_format));
END_HANDLE_TH_ERRORS
}
@@ -973,9 +973,7 @@ static PyObject * THPVariable_to(PyObject* self, PyObject* args, PyObject* kwarg
auto copy = std::get<3>(parsed);
auto opt_memory_format = std::get<4>(parsed);
auto& self_ = THPVariable_Unpack(self);
if (device && device->is_cuda()) {
torch::utils::cuda_lazy_init();
}
torch::utils::maybe_initialize_device(device);
if (device && device->is_privateuseone()) {
at::globalContext().lazyInitPrivateUse1();
}
@@ -1057,9 +1055,7 @@ static PyObject * THPVariable_type(PyObject* self, PyObject* args, PyObject* kwa
if (device_type != device.type()) {
device = at::Device(device_type);
}
if (device.is_cuda()) {
torch::utils::cuda_lazy_init();
}
torch::utils::maybe_initialize_device(device);
if (device.is_privateuseone()) {
at::globalContext().lazyInitPrivateUse1();
}

View File

@@ -8,7 +8,7 @@
#include <torch/csrc/Storage.h>
#include <torch/csrc/autograd/generated/VariableType.h>
#include <torch/csrc/utils/cuda_enabled.h>
#include <torch/csrc/utils/cuda_lazy_init.h>
#include <torch/csrc/utils/device_lazy_init.h>
#include <torch/csrc/utils/object_ptr.h>
#include <ATen/ATen.h>

View File

@@ -9,7 +9,7 @@
#include <torch/csrc/autograd/python_variable.h>
#include <torch/csrc/autograd/utils/wrap_outputs.h>
#include <torch/csrc/jit/frontend/tracer.h>
#include <torch/csrc/utils/cuda_lazy_init.h>
#include <torch/csrc/utils/device_lazy_init.h>
#include <torch/csrc/utils/out_types.h>
#include <torch/csrc/utils/pybind.h>
#include <torch/csrc/utils/pycfunction_helpers.h>
@@ -61,7 +61,7 @@ inline Tensor dispatch_range(
const Scalar& end,
const Scalar& step,
const TensorOptions& options) {
torch::utils::maybe_initialize_cuda(options);
torch::utils::maybe_initialize_device(options);
pybind11::gil_scoped_release no_gil;
DeviceGuard device_guard(options.device());
return torch::range(start, end, step, options);

View File

@@ -39,7 +39,7 @@
#include <torch/csrc/cuda/python_comm.h>
#include <torch/csrc/profiler/python/combined_traceback.h>
#include <torch/csrc/python_headers.h>
#include <torch/csrc/utils/cuda_lazy_init.h>
#include <torch/csrc/utils/device_lazy_init.h>
#include <torch/csrc/utils/pybind.h>
#include <torch/csrc/utils/pycfunction_helpers.h>
#include <torch/csrc/utils/python_numbers.h>
@@ -62,7 +62,7 @@ static bool in_bad_fork = false; // True for children forked after cuda init
// Called in the forked child if cuda has already been initialized
static void forked_child() {
in_bad_fork = true;
torch::utils::set_requires_cuda_init(true);
torch::utils::set_requires_device_init(at::kCUDA, true);
}
#endif
@@ -85,7 +85,7 @@ PyObject* THCPModule_setDevice_wrap(PyObject* self, PyObject* arg) {
TORCH_CHECK(THPUtils_checkLong(arg), "invalid argument to setDevice");
auto device = THPUtils_unpackLong(arg);
torch::utils::cuda_lazy_init();
torch::utils::device_lazy_init(at::kCUDA);
c10::cuda::set_device(static_cast<c10::DeviceIndex>(device));
Py_RETURN_NONE;
@@ -100,7 +100,7 @@ PyObject* THCPModule_exchangeDevice(PyObject* self, PyObject* arg) {
return THPUtils_packInt32(-1);
}
torch::utils::cuda_lazy_init();
torch::utils::device_lazy_init(at::kCUDA);
int current_device = c10::cuda::ExchangeDevice(device);
return THPUtils_packInt32(current_device);
@@ -115,7 +115,7 @@ PyObject* THCPModule_maybeExchangeDevice(PyObject* self, PyObject* arg) {
return THPUtils_packInt32(-1);
}
torch::utils::cuda_lazy_init();
torch::utils::device_lazy_init(at::kCUDA);
int current_device = c10::cuda::MaybeExchangeDevice(device);
return THPUtils_packInt32(current_device);
@@ -124,7 +124,7 @@ PyObject* THCPModule_maybeExchangeDevice(PyObject* self, PyObject* arg) {
PyObject* THCPModule_getDevice_wrap(PyObject* self, PyObject* noargs) {
HANDLE_TH_ERRORS
torch::utils::cuda_lazy_init();
torch::utils::device_lazy_init(at::kCUDA);
// NOLINTNEXTLINE(bugprone-signed-char-misuse)
auto device = static_cast<int32_t>(c10::cuda::current_device());
return THPUtils_packInt32(device);
@@ -151,7 +151,7 @@ PyObject* THCPModule_canDeviceAccessPeer_wrap(PyObject* self, PyObject* args) {
int64_t device = THPUtils_unpackLong(arg1);
int64_t peer_device = THPUtils_unpackLong(arg2);
torch::utils::cuda_lazy_init();
torch::utils::device_lazy_init(at::kCUDA);
auto can_access = at::cuda::canDeviceAccessPeer(device, peer_device);
return PyBool_FromLong(can_access);
END_HANDLE_TH_ERRORS

View File

@@ -13,7 +13,7 @@
#include <torch/csrc/autograd/utils/wrap_outputs.h>
#include <torch/csrc/autograd/variable.h>
#include <torch/csrc/utils/cuda_enabled.h>
#include <torch/csrc/utils/cuda_lazy_init.h>
#include <torch/csrc/utils/device_lazy_init.h>
#include <torch/csrc/utils/python_strings.h>
#include <torch/csrc/utils/tensor_new.h>
#include <torch/csrc/utils/tensor_types.h>

View File

@@ -1,43 +0,0 @@
#include <torch/csrc/utils/cuda_lazy_init.h>
#include <torch/csrc/Exceptions.h>
#include <torch/csrc/python_headers.h>
#include <torch/csrc/utils/object_ptr.h>
namespace torch {
namespace utils {
namespace {
bool is_initialized = false;
}
void cuda_lazy_init() {
pybind11::gil_scoped_acquire g;
// Protected by the GIL. We don't use call_once because under ASAN it
// has a buggy implementation that deadlocks if an instance throws an
// exception. In any case, call_once isn't necessary, because we
// have taken a lock.
if (is_initialized) {
return;
}
auto module = THPObjectPtr(PyImport_ImportModule("torch.cuda"));
if (!module) {
throw python_error();
}
auto res = THPObjectPtr(PyObject_CallMethod(module.get(), "_lazy_init", ""));
if (!res) {
throw python_error();
}
is_initialized = true;
}
void set_requires_cuda_init(bool value) {
is_initialized = !value;
}
} // namespace utils
} // namespace torch

View File

@@ -1,33 +0,0 @@
#pragma once
#include <c10/core/TensorOptions.h>
// cuda_lazy_init() is always compiled, even for CPU-only builds.
// Thus, it does not live in the cuda/ folder.
namespace torch {
namespace utils {
// The INVARIANT is that this function MUST be called before you attempt
// to get a CUDA Type object from ATen, in any way. Here are some common
// ways that a Type object may be retrieved:
//
// - You call getNonVariableType or getNonVariableTypeOpt
// - You call toBackend() on a Type
//
// It's important to do this correctly, because if you forget to add it
// you'll get an oblique error message about "Cannot initialize CUDA without
// ATen_cuda library" if you try to use CUDA functionality from a CPU-only
// build, which is not good UX.
//
void cuda_lazy_init();
void set_requires_cuda_init(bool value);
static void maybe_initialize_cuda(const at::TensorOptions& options) {
if (options.device().is_cuda()) {
torch::utils::cuda_lazy_init();
}
}
} // namespace utils
} // namespace torch

View File

@@ -0,0 +1,42 @@
#include <torch/csrc/utils/device_lazy_init.h>
#include <torch/csrc/Exceptions.h>
#include <torch/csrc/python_headers.h>
#include <torch/csrc/utils/object_ptr.h>
#include <iostream>
namespace torch::utils {
namespace {
std::array<bool, at::COMPILE_TIME_MAX_DEVICE_TYPES> is_initialized{};
} // anonymous namespace
void device_lazy_init(at::DeviceType device_type) {
pybind11::gil_scoped_acquire g;
// Protected by the GIL. We don't use call_once because under ASAN it
// has a buggy implementation that deadlocks if an instance throws an
// exception. In any case, call_once isn't necessary, because we
// have taken a lock.
if (is_initialized[static_cast<int>(device_type)]) {
return;
}
std::string module_name = "torch." + at::DeviceTypeName(device_type, true);
auto module = THPObjectPtr(PyImport_ImportModule(module_name.c_str()));
if (!module) {
throw python_error();
}
auto res = THPObjectPtr(PyObject_CallMethod(module.get(), "_lazy_init", ""));
if (!res) {
throw python_error();
}
is_initialized[static_cast<int>(device_type)] = true;
}
void set_requires_device_init(at::DeviceType device_type, bool value) {
is_initialized[static_cast<int>(device_type)] = !value;
}
} // namespace torch::utils

View File

@@ -0,0 +1,48 @@
#pragma once
#include <c10/core/TensorOptions.h>
// device_lazy_init() is always compiled, even for CPU-only builds.
namespace torch::utils {
/**
* This lazy-initialization mechanism is maintained separately for each device
* backend. Currently, CUDA and XPU follow this design. The function
* `device_lazy_init` MUST be called before you attempt to access any
* Type (CUDA or XPU) object from ATen, in any way. It guarantees that the
* device runtime is lazily initialized the first time a runtime API is
* requested.
*
* Here are some common ways that a device object may be retrieved:
* - You call getNonVariableType or getNonVariableTypeOpt
* - You call toBackend() on a Type
*
* It's important to do this correctly, because if you forget to add it you'll
* get an oblique error message like "Cannot initialize CUDA without
* ATen_cuda library" or "Cannot initialize XPU without ATen_xpu library" if you
* try to use CUDA or XPU functionality from a CPU-only build, which is not good
* UX.
*/
void device_lazy_init(at::DeviceType device_type);
void set_requires_device_init(at::DeviceType device_type, bool value);
static inline void maybe_initialize_device(at::Device& device) {
// Add more devices here to enable lazy initialization.
if (device.is_cuda()) {
device_lazy_init(device.type());
}
}
static inline void maybe_initialize_device(c10::optional<at::Device>& device) {
if (!device.has_value()) {
return;
}
maybe_initialize_device(device.value());
}
static inline void maybe_initialize_device(const at::TensorOptions& options) {
auto device = options.device();
maybe_initialize_device(device);
}
} // namespace torch::utils
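
For orientation, a small hypothetical usage sketch of this header (the helpers `reset_after_fork` and `move_to` are illustrative, not part of this commit; the calls they wrap mirror the `Module.cpp` and `THPVariable_to` hunks above):

```cpp
#include <torch/csrc/utils/device_lazy_init.h>

// Mirrors forked_child() in torch/csrc/cuda/Module.cpp: a forked child must
// re-initialize CUDA, so mark that backend as requiring init again.
void reset_after_fork() {
  torch::utils::set_requires_device_init(at::kCUDA, true);
}

// Mirrors THPVariable_to: the optional<Device> overload is a no-op when no
// device was supplied, and otherwise initializes only backends that
// participate in lazy init (currently CUDA).
void move_to(c10::optional<at::Device> device) {
  torch::utils::maybe_initialize_device(device);
}
```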

View File

@@ -7,7 +7,7 @@
#include <torch/csrc/Size.h>
#include <torch/csrc/autograd/generated/variable_factories.h>
#include <torch/csrc/autograd/variable.h>
#include <torch/csrc/utils/cuda_lazy_init.h>
#include <torch/csrc/utils/device_lazy_init.h>
#include <torch/csrc/utils/numpy_stub.h>
#include <torch/csrc/utils/pybind.h>
#include <torch/csrc/utils/python_arg_parser.h>
@@ -60,12 +60,6 @@ TensorOptions build_options(
return options;
}
void maybe_initialize_cuda(const Device& device) {
if (device.is_cuda()) {
torch::utils::cuda_lazy_init();
}
}
// NB: It appears there is some consistency invariant between options and
// device, where if device is non-empty, its type must be consistent with the
// device type in options.
@@ -76,7 +70,7 @@ Tensor new_with_sizes(
at::ScalarType scalar_type,
const optional<Device>& device,
c10::SymIntArrayRef sizes) {
maybe_initialize_cuda(options.device());
maybe_initialize_device(options.device());
pybind11::gil_scoped_release no_gil;
return at::empty_symint(sizes, build_options(options, scalar_type, device));
}
@@ -319,7 +313,7 @@ Tensor internal_new_from_data(
type_inference ? var.scalar_type() : scalar_type;
auto device = device_opt.has_value() ? *device_opt : var.device();
pybind11::gil_scoped_release no_gil;
maybe_initialize_cuda(device);
maybe_initialize_device(device);
return var.to(
device,
inferred_scalar_type,
@@ -337,7 +331,7 @@ Tensor internal_new_from_data(
type_inference ? tensor.scalar_type() : scalar_type;
auto device = device_opt.has_value() ? *device_opt : options.device();
pybind11::gil_scoped_release no_gil;
maybe_initialize_cuda(device);
maybe_initialize_device(device);
return tensor.to(
device,
inferred_scalar_type,
@@ -353,7 +347,7 @@ Tensor internal_new_from_data(
type_inference ? tensor.scalar_type() : scalar_type;
auto device = device_opt.has_value() ? *device_opt : options.device();
pybind11::gil_scoped_release no_gil;
maybe_initialize_cuda(device);
maybe_initialize_device(device);
return tensor.to(
device,
inferred_scalar_type,
@@ -449,7 +443,7 @@ Tensor internal_new_from_data(
}
}
pybind11::gil_scoped_release no_gil;
maybe_initialize_cuda(device);
maybe_initialize_device(device);
// However, it is VERY important that we trace the to() call here (even
// though the reason this is important is a hack). Without *some* factory
// function call that is traced at construction time, we will consider
@@ -1640,9 +1634,7 @@ Tensor tensor_fromDLPack(PyObject* data) {
// because cuda ATen types have not been registered in Python yet.
// so if we have a cuda tensor, then we need to make sure
// we have called _lazy_init here
if (atensor.is_cuda()) {
py::module::import("torch.cuda").attr("init")();
}
maybe_initialize_device(atensor.device());
return atensor;
}

View File

@@ -1443,7 +1443,7 @@ const auto options = TensorOptions()
.layout({arg_parser_outputs['layout'].expr})
.requires_grad({arg_parser_outputs['requires_grad'].expr})
.pinned_memory({arg_parser_outputs['pin_memory'].expr});
torch::utils::maybe_initialize_cuda(options);
torch::utils::maybe_initialize_device(options);
"""
)
lambda_args_exprs["options"] = "options"