Revert "Generalize poison fork logic for each device backend (#144664)"

This reverts commit 83bd0b63b55f224fada6d5f6dd7eb5b4cb3072fb. Reverted https://github.com/pytorch/pytorch/pull/144664 on behalf of https://github.com/atalman due to failing internal tests ([comment](https://github.com/pytorch/pytorch/pull/144664#issuecomment-2795157082))
2025-10-20 12:54:11 +08:00 · 2025-04-10 21:02:14 +00:00
parent 8efcf21fff
commit a0ab243c3a
7 changed files with 115 additions and 85 deletions
--- a/test/test_cpp_extensions_mtia_backend.py
+++ b/test/test_cpp_extensions_mtia_backend.py
@ -11,18 +11,20 @@ from torch.testing._internal.common_utils import (
    IS_ARM64,
    IS_LINUX,
    skipIfTorchDynamo,
+    TEST_CUDA,
    TEST_PRIVATEUSE1,
+    TEST_XPU,
 )
+from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME


-# This TestCase should be mutually exclusive with other backends.
-HAS_CUDA = torch.backends.cuda.is_built()
-HAS_XPU = torch.xpu._is_compiled()
-HAS_MPS = torch.backends.mps.is_built()
+# define TEST_ROCM before changing TEST_CUDA
+TEST_ROCM = TEST_CUDA and torch.version.hip is not None and ROCM_HOME is not None
+TEST_CUDA = TEST_CUDA and CUDA_HOME is not None


@unittest.skipIf(
-    IS_ARM64 or not IS_LINUX or HAS_CUDA or HAS_XPU or HAS_MPS or TEST_PRIVATEUSE1,
+    IS_ARM64 or not IS_LINUX or TEST_CUDA or TEST_PRIVATEUSE1 or TEST_ROCM or TEST_XPU,
    "Only on linux platform and mutual exclusive to other backends",
 )
@torch.testing._internal.common_utils.markDynamoStrictTest
--- a/torch/csrc/cuda/Module.cpp
+++ b/torch/csrc/cuda/Module.cpp
@ -51,9 +51,32 @@
 #include <sstream>
 #include <thread>
 #include <unordered_map>
+#ifndef WIN32
+#include <pthread.h>
+#endif

 using namespace torch;

+static bool in_bad_fork = false; // Set in a child forked after poison_fork()
+
+#ifndef WIN32
+// pthread_atfork child handler: sets in_bad_fork and requires CUDA re-init.
+static void forked_child() {
+  in_bad_fork = true; // read back by THCPModule_isInBadFork
+  torch::utils::set_requires_device_init(at::kCUDA, true);
+}
+#endif
+
+// Call before the first cuda call; registers forked_child at most once.
+// Note: This is distinct from initExtension because a stub cuda implementation
+// has some working functions (e.g. device_count) but cannot fully initialize.
+static void poison_fork() {
+#ifndef WIN32 // on WIN32 this function does nothing
+  static auto result [[maybe_unused]] = // static init: runs on first call only
+      pthread_atfork(nullptr, nullptr, forked_child);
+#endif
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // CUDA management methods
 ////////////////////////////////////////////////////////////////////////////////
@ -137,17 +160,14 @@ PyObject* THCPModule_canDeviceAccessPeer_wrap(PyObject* self, PyObject* args) {

 PyObject* THCPModule_getDeviceCount_wrap(PyObject* self, PyObject* noargs) {
  HANDLE_TH_ERRORS
-  // Note: This is distinct from initExtension because a stub cuda
-  // implementation has some working functions (e.g. device_count) but cannot
-  // fully initialize.
-  torch::utils::register_fork_handler_for_device_init(at::kCUDA);
+  poison_fork();
  return THPUtils_packUInt64(at::cuda::device_count());
  END_HANDLE_TH_ERRORS
 }

 PyObject* THCPModule_getArchFlags(PyObject* self, PyObject* noargs) {
  HANDLE_TH_ERRORS
-  torch::utils::register_fork_handler_for_device_init(at::kCUDA);
+  poison_fork();
 #ifdef CUDA_ARCH_FLAGS
  static const char* flags = C10_STRINGIZE(CUDA_ARCH_FLAGS);
  return THPUtils_packString(flags);
@ -159,7 +179,7 @@ PyObject* THCPModule_getArchFlags(PyObject* self, PyObject* noargs) {

 static PyObject* THCPModule_isInBadFork(PyObject* self, PyObject* noargs) {
  HANDLE_TH_ERRORS
-  return PyBool_FromLong(torch::utils::is_device_in_bad_fork(at::kCUDA));
+  return PyBool_FromLong(in_bad_fork);
  END_HANDLE_TH_ERRORS
 }

@ -1493,8 +1513,8 @@ static PyObject* THCPModule_initExtension(PyObject* self, PyObject* noargs) {
      "please rebuild pytorch without asan if you need to use this module");
 #endif
  HANDLE_TH_ERRORS
-  TORCH_INTERNAL_ASSERT(!torch::utils::is_device_in_bad_fork(at::kCUDA));
-  torch::utils::register_fork_handler_for_device_init(at::kCUDA);
+  TORCH_INTERNAL_ASSERT(!in_bad_fork); // Handled at python level
+  poison_fork();
  at::globalContext().lazyInitDevice(c10::DeviceType::CUDA);

  auto m = THPObjectPtr(PyImport_ImportModule("torch.cuda"));
--- a/torch/csrc/mps/Module.cpp
+++ b/torch/csrc/mps/Module.cpp
@ -6,12 +6,16 @@
 #include <torch/csrc/THP.h>
 #include <torch/csrc/mps/Module.h>
 #include <torch/csrc/python_headers.h>
-#include <torch/csrc/utils/device_lazy_init.h>
 #include <torch/csrc/utils/pybind.h>
 #include <torch/csrc/utils/python_numbers.h>
 #include <torch/csrc/utils/python_strings.h>
 #include <memory>

+// pthread.h is included for tracking bad forks
+#ifndef WIN32
+#include <pthread.h>
+#endif
+
 #ifdef USE_MPS
 #include <ATen/mps/MPSProfiler.h>
 #include <ATen/native/mps/MetalShaderLibrary.h>
@ -19,9 +23,27 @@

 namespace torch::mps {

+namespace {
+// Set in a child process forked after track_bad_mps_fork() has run.
+static bool in_bad_fork = false;
+
+// pthread_atfork child handler: only records that the fork happened.
+static void forked_mps_child() {
+  in_bad_fork = true;
+}
+
+// Call before the first mps call; registers forked_mps_child at most once.
+static void track_bad_mps_fork() {
+#ifndef WIN32 // on WIN32 this function does nothing
+  static auto result [[maybe_unused]] = // static init: runs on first call only
+      pthread_atfork(nullptr, nullptr, forked_mps_child);
+#endif
+}
+} // namespace
+
 static PyObject* MPSModule_isInBadFork(PyObject* self, PyObject* noargs) {
  HANDLE_TH_ERRORS
-  return PyBool_FromLong(torch::utils::is_device_in_bad_fork(at::kMPS));
+  return PyBool_FromLong(in_bad_fork);
  END_HANDLE_TH_ERRORS
 }

@ -29,7 +51,7 @@ static PyObject* MPSModule_getDefaultMPSGenerator(
    PyObject* _unused,
    PyObject* noargs) {
  HANDLE_TH_ERRORS
-  torch::utils::register_fork_handler_for_device_init(at::kMPS);
+  track_bad_mps_fork();
  return THPGenerator_initDefaultGenerator(
      at::detail::getMPSHooks().getDefaultGenerator());
  END_HANDLE_TH_ERRORS
@ -37,8 +59,8 @@ static PyObject* MPSModule_getDefaultMPSGenerator(

 static PyObject* MPSModule_isAvailable(PyObject* _unused, PyObject* noargs) {
  HANDLE_TH_ERRORS
+  track_bad_mps_fork();
  if (at::detail::getMPSHooks().hasMPS()) {
-    torch::utils::register_fork_handler_for_device_init(at::kMPS);
    Py_RETURN_TRUE;
  } else {
    Py_RETURN_FALSE;
--- a/torch/csrc/mtia/Module.cpp
+++ b/torch/csrc/mtia/Module.cpp
@ -7,15 +7,38 @@
 #include <torch/csrc/python_headers.h>
 #include <torch/csrc/utils/device_lazy_init.h>
 #include <torch/csrc/utils/pybind.h>
+#ifndef WIN32
+#include <pthread.h>
+#endif

 namespace torch::mtia {

+static bool in_bad_fork = false; // Set in a child forked after poison_fork()
+
+#ifndef WIN32
+// pthread_atfork child handler: sets in_bad_fork and requires MTIA re-init.
+static void forked_child() {
+  in_bad_fork = true; // read back by the _mtia_isInBadFork binding
+  torch::utils::set_requires_device_init(at::kMTIA, true);
+}
+#endif
+
+// Call before the first mtia call; registers forked_child at most once.
+// Note: This is distinct from initExtension because a stub mtia implementation
+// has some working functions (e.g. device_count) but cannot fully initialize.
+static void poison_fork() {
+#ifndef WIN32 // on WIN32 this function does nothing
+  static auto result [[maybe_unused]] = // static init: runs on first call only
+      pthread_atfork(nullptr, nullptr, forked_child);
+#endif
+}
+
 void initModule(PyObject* module) {
  auto m = py::handle(module).cast<py::module>();

  m.def("_mtia_init", []() {
-    TORCH_INTERNAL_ASSERT(!torch::utils::is_device_in_bad_fork(at::kMTIA));
-    torch::utils::register_fork_handler_for_device_init(at::kMTIA);
+    TORCH_INTERNAL_ASSERT(!in_bad_fork); // Handled at python level
+    poison_fork();
    at::globalContext().lazyInitDevice(c10::DeviceType::MTIA);
  });

@ -24,9 +47,7 @@ void initModule(PyObject* module) {
    return at::detail::isMTIAHooksBuilt();
  });

-  m.def("_mtia_isInBadFork", []() {
-    return torch::utils::is_device_in_bad_fork(at::kMTIA);
-  });
+  m.def("_mtia_isInBadFork", []() { return in_bad_fork; });

  m.def("_mtia_getCurrentStream", [](c10::DeviceIndex device_index) {
    torch::utils::device_lazy_init(at::kMTIA);
--- a/torch/csrc/utils/device_lazy_init.cpp
+++ b/torch/csrc/utils/device_lazy_init.cpp
@ -1,23 +1,13 @@
 #include <c10/core/impl/TorchDispatchModeTLS.h>
-#include <c10/util/CallOnce.h>
 #include <torch/csrc/utils/device_lazy_init.h>

 #include <torch/csrc/Exceptions.h>
 #include <torch/csrc/python_headers.h>
 #include <torch/csrc/utils/object_ptr.h>
-
-#ifndef WIN32
-#include <pthread.h>
-#endif
-
 namespace torch::utils {
 namespace {

 std::array<bool, at::COMPILE_TIME_MAX_DEVICE_TYPES> is_initialized{};
-std::array<bool, at::COMPILE_TIME_MAX_DEVICE_TYPES> is_in_bad_fork{};
-std::array<c10::once_flag, at::COMPILE_TIME_MAX_DEVICE_TYPES>
-    at_fork_once_flags{};
-std::optional<c10::DeviceType> at_fork_device_type{};

 } // anonymous namespace

@ -68,34 +58,4 @@ void set_requires_device_init(at::DeviceType device_type, bool value) {
  is_initialized[static_cast<int>(device_type)] = !value;
 }

-bool is_device_in_bad_fork(at::DeviceType device_type) {
-  return is_in_bad_fork[static_cast<int>(device_type)];
-}
-
-void set_device_in_bad_fork(at::DeviceType device_type, bool value) {
-  is_in_bad_fork[static_cast<int>(device_type)] = value;
-}
-
-// Should be called before the first device runtime call.
-void register_fork_handler_for_device_init(at::DeviceType device_type) {
-#ifndef WIN32
-  auto& flag = at_fork_once_flags[static_cast<int>(device_type)];
-  c10::call_once(flag, [device_type]() {
-    TORCH_CHECK(
-        !at_fork_device_type,
-        "Only one device type can be registered. But now, we have two device types: ",
-        at_fork_device_type.value(),
-        " and ",
-        device_type);
-    at_fork_device_type = device_type;
-    pthread_atfork(nullptr, nullptr, []() {
-      set_device_in_bad_fork(at_fork_device_type.value(), true);
-      if (is_device_lazy_init_supported(at_fork_device_type.value())) {
-        set_requires_device_init(at_fork_device_type.value(), true);
-      }
-    });
-  });
-#endif
-}
-
 } // namespace torch::utils
--- a/torch/csrc/utils/device_lazy_init.h
+++ b/torch/csrc/utils/device_lazy_init.h
@ -67,21 +67,4 @@ inline void maybe_initialize_device(

 bool is_device_initialized(at::DeviceType device_type);

-TORCH_PYTHON_API bool is_device_in_bad_fork(at::DeviceType device_type);
-
-TORCH_PYTHON_API void set_device_in_bad_fork(
-    at::DeviceType device_type,
-    bool value);
-
-TORCH_PYTHON_API void register_fork_handler_for_device_init(
-    at::DeviceType device_type);
-
-inline void maybe_register_fork_handler_for_device_init(
-    std::optional<at::DeviceType>& device_type) {
-  if (!device_type.has_value()) {
-    return;
-  }
-  register_fork_handler_for_device_init(device_type.value());
-}
-
 } // namespace torch::utils
--- a/torch/csrc/xpu/Module.cpp
+++ b/torch/csrc/xpu/Module.cpp
@ -11,8 +11,32 @@
 #include <torch/csrc/utils/python_strings.h>
 #include <torch/csrc/xpu/Module.h>

+#ifndef WIN32
+#include <pthread.h>
+#endif
+
 using namespace torch;

+static bool in_bad_fork = false; // Set in a child forked after poison_fork()
+
+#ifndef WIN32
+// pthread_atfork child handler: sets in_bad_fork and requires XPU re-init.
+static void forked_child() {
+  in_bad_fork = true; // read back by THXPModule_isInBadFork_wrap
+  torch::utils::set_requires_device_init(at::kXPU, true);
+}
+#endif
+
+// Call before the first xpu call, mainly in lazy_init; registers forked_child.
+// Note: This is distinct from initExtension because a stub xpu implementation
+// has some working functions (e.g. device_count) but cannot fully initialize.
+static void poison_fork() {
+#ifndef WIN32 // on WIN32 this function does nothing
+  static auto result [[maybe_unused]] = // static init: runs on first call only
+      pthread_atfork(nullptr, nullptr, forked_child);
+#endif
+}
+
 // XPU management methods

 static PyObject* THXPModule_getArchFlags(PyObject* self, PyObject* noargs) {
@ -28,7 +52,7 @@ static PyObject* THXPModule_getArchFlags(PyObject* self, PyObject* noargs) {

 static PyObject* THXPModule_isInBadFork_wrap(PyObject* self, PyObject* noargs) {
  HANDLE_TH_ERRORS
-  return PyBool_FromLong(torch::utils::is_device_in_bad_fork(at::kXPU));
+  return PyBool_FromLong(in_bad_fork);
  END_HANDLE_TH_ERRORS
 }

@ -91,9 +115,7 @@ static PyObject* THXPModule_getDeviceCount_wrap(
    PyObject* self,
    PyObject* noargs) {
  HANDLE_TH_ERRORS
-  // Note: This is distinct from initExtension because a stub xpu implementation
-  // has some working functions (e.g. device_count) but cannot fully initialize.
-  torch::utils::register_fork_handler_for_device_init(at::kXPU);
+  poison_fork();
  return THPUtils_packUInt64(at::xpu::device_count());
  END_HANDLE_TH_ERRORS
 }
@ -398,8 +420,8 @@ static void initXpuMethodBindings(PyObject* module) {
 // classes
 static PyObject* THXPModule_initExtension(PyObject* self, PyObject* noargs) {
  HANDLE_TH_ERRORS
-  TORCH_INTERNAL_ASSERT(!torch::utils::is_device_in_bad_fork(at::kXPU));
-  torch::utils::register_fork_handler_for_device_init(at::kXPU);
+  TORCH_INTERNAL_ASSERT(!in_bad_fork); // Handled at python level
+  poison_fork();
  at::globalContext().lazyInitDevice(c10::DeviceType::XPU);

  auto m = THPObjectPtr(PyImport_ImportModule("torch.xpu"));