Revert "Generalize poison fork logic for each device backend (#144664)"

This reverts commit 83bd0b63b55f224fada6d5f6dd7eb5b4cb3072fb.

Reverted https://github.com/pytorch/pytorch/pull/144664 on behalf of https://github.com/atalman due to failing internal tests ([comment](https://github.com/pytorch/pytorch/pull/144664#issuecomment-2795157082))
This commit is contained in:
PyTorch MergeBot
2025-04-10 21:02:14 +00:00
parent 8efcf21fff
commit a0ab243c3a
7 changed files with 115 additions and 85 deletions

View File

@ -11,18 +11,20 @@ from torch.testing._internal.common_utils import (
IS_ARM64,
IS_LINUX,
skipIfTorchDynamo,
TEST_CUDA,
TEST_PRIVATEUSE1,
TEST_XPU,
)
from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME
# This TestCase should be mutually exclusive with other backends.
HAS_CUDA = torch.backends.cuda.is_built()
HAS_XPU = torch.xpu._is_compiled()
HAS_MPS = torch.backends.mps.is_built()
# define TEST_ROCM before changing TEST_CUDA
TEST_ROCM = TEST_CUDA and torch.version.hip is not None and ROCM_HOME is not None
TEST_CUDA = TEST_CUDA and CUDA_HOME is not None
@unittest.skipIf(
IS_ARM64 or not IS_LINUX or HAS_CUDA or HAS_XPU or HAS_MPS or TEST_PRIVATEUSE1,
IS_ARM64 or not IS_LINUX or TEST_CUDA or TEST_PRIVATEUSE1 or TEST_ROCM or TEST_XPU,
"Only on linux platform and mutual exclusive to other backends",
)
@torch.testing._internal.common_utils.markDynamoStrictTest

View File

@ -51,9 +51,32 @@
#include <sstream>
#include <thread>
#include <unordered_map>
#ifndef WIN32
#include <pthread.h>
#endif
using namespace torch;
static bool in_bad_fork = false; // True for children forked after cuda init
#ifndef WIN32
// Child-side fork hook, installed through pthread_atfork by poison_fork().
// It runs in the forked child once cuda has already been touched in the
// parent: the child is flagged as a bad fork and cuda is marked as requiring
// device initialization again.
static void forked_child() {
  torch::utils::set_requires_device_init(at::kCUDA, true);
  in_bad_fork = true;
}
#endif
// Installs the fork-poisoning hook; call this ahead of the first cuda call.
// Kept separate from initExtension: a stub cuda implementation still has some
// working functions (e.g. device_count) even though it cannot fully
// initialize.
static void poison_fork() {
#ifndef WIN32
  // The initializer of a function-local static runs only once, so repeated
  // calls do not register the handler again.
  [[maybe_unused]] static const int atfork_status = pthread_atfork(
      /*prepare=*/nullptr, /*parent=*/nullptr, /*child=*/forked_child);
#endif
}
////////////////////////////////////////////////////////////////////////////////
// CUDA management methods
////////////////////////////////////////////////////////////////////////////////
@ -137,17 +160,14 @@ PyObject* THCPModule_canDeviceAccessPeer_wrap(PyObject* self, PyObject* args) {
PyObject* THCPModule_getDeviceCount_wrap(PyObject* self, PyObject* noargs) {
HANDLE_TH_ERRORS
// Note: This is distinct from initExtension because a stub cuda
// implementation has some working functions (e.g. device_count) but cannot
// fully initialize.
torch::utils::register_fork_handler_for_device_init(at::kCUDA);
poison_fork();
return THPUtils_packUInt64(at::cuda::device_count());
END_HANDLE_TH_ERRORS
}
PyObject* THCPModule_getArchFlags(PyObject* self, PyObject* noargs) {
HANDLE_TH_ERRORS
torch::utils::register_fork_handler_for_device_init(at::kCUDA);
poison_fork();
#ifdef CUDA_ARCH_FLAGS
static const char* flags = C10_STRINGIZE(CUDA_ARCH_FLAGS);
return THPUtils_packString(flags);
@ -159,7 +179,7 @@ PyObject* THCPModule_getArchFlags(PyObject* self, PyObject* noargs) {
static PyObject* THCPModule_isInBadFork(PyObject* self, PyObject* noargs) {
HANDLE_TH_ERRORS
return PyBool_FromLong(torch::utils::is_device_in_bad_fork(at::kCUDA));
return PyBool_FromLong(in_bad_fork);
END_HANDLE_TH_ERRORS
}
@ -1493,8 +1513,8 @@ static PyObject* THCPModule_initExtension(PyObject* self, PyObject* noargs) {
"please rebuild pytorch without asan if you need to use this module");
#endif
HANDLE_TH_ERRORS
TORCH_INTERNAL_ASSERT(!torch::utils::is_device_in_bad_fork(at::kCUDA));
torch::utils::register_fork_handler_for_device_init(at::kCUDA);
TORCH_INTERNAL_ASSERT(!in_bad_fork); // Handled at python level
poison_fork();
at::globalContext().lazyInitDevice(c10::DeviceType::CUDA);
auto m = THPObjectPtr(PyImport_ImportModule("torch.cuda"));

View File

@ -6,12 +6,16 @@
#include <torch/csrc/THP.h>
#include <torch/csrc/mps/Module.h>
#include <torch/csrc/python_headers.h>
#include <torch/csrc/utils/device_lazy_init.h>
#include <torch/csrc/utils/pybind.h>
#include <torch/csrc/utils/python_numbers.h>
#include <torch/csrc/utils/python_strings.h>
#include <memory>
// pthread.h is included for tracking bad forks
#ifndef WIN32
#include <pthread.h>
#endif
#ifdef USE_MPS
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/mps/MetalShaderLibrary.h>
@ -19,9 +23,27 @@
namespace torch::mps {
namespace {
// True for children forked after mps init
static bool in_bad_fork = false;
// Child-side fork hook, installed through pthread_atfork by
// track_bad_mps_fork(). It runs in the forked child once mps has already been
// touched in the parent and flags that child as a bad fork.
static void forked_mps_child() {
  in_bad_fork = true;
}
// Installs the bad-fork tracking hook; call this ahead of the first mps call.
static void track_bad_mps_fork() {
#ifndef WIN32
  // The initializer of a function-local static runs only once, so repeated
  // calls do not register the handler again.
  [[maybe_unused]] static const int atfork_status = pthread_atfork(
      /*prepare=*/nullptr, /*parent=*/nullptr, /*child=*/forked_mps_child);
#endif
}
} // namespace
static PyObject* MPSModule_isInBadFork(PyObject* self, PyObject* noargs) {
HANDLE_TH_ERRORS
return PyBool_FromLong(torch::utils::is_device_in_bad_fork(at::kMPS));
return PyBool_FromLong(in_bad_fork);
END_HANDLE_TH_ERRORS
}
@ -29,7 +51,7 @@ static PyObject* MPSModule_getDefaultMPSGenerator(
PyObject* _unused,
PyObject* noargs) {
HANDLE_TH_ERRORS
torch::utils::register_fork_handler_for_device_init(at::kMPS);
track_bad_mps_fork();
return THPGenerator_initDefaultGenerator(
at::detail::getMPSHooks().getDefaultGenerator());
END_HANDLE_TH_ERRORS
@ -37,8 +59,8 @@ static PyObject* MPSModule_getDefaultMPSGenerator(
static PyObject* MPSModule_isAvailable(PyObject* _unused, PyObject* noargs) {
HANDLE_TH_ERRORS
track_bad_mps_fork();
if (at::detail::getMPSHooks().hasMPS()) {
torch::utils::register_fork_handler_for_device_init(at::kMPS);
Py_RETURN_TRUE;
} else {
Py_RETURN_FALSE;

View File

@ -7,15 +7,38 @@
#include <torch/csrc/python_headers.h>
#include <torch/csrc/utils/device_lazy_init.h>
#include <torch/csrc/utils/pybind.h>
#ifndef WIN32
#include <pthread.h>
#endif
namespace torch::mtia {
static bool in_bad_fork = false; // True for children forked after mtia init
#ifndef WIN32
// Child-side fork hook, installed through pthread_atfork by poison_fork().
// It runs in the forked child once mtia has already been touched in the
// parent: the child is flagged as a bad fork and mtia is marked as requiring
// device initialization again.
static void forked_child() {
  torch::utils::set_requires_device_init(at::kMTIA, true);
  in_bad_fork = true;
}
#endif
// Installs the fork-poisoning hook; call this ahead of the first mtia call.
// Kept separate from initExtension: a stub mtia implementation still has some
// working functions (e.g. device_count) even though it cannot fully
// initialize.
static void poison_fork() {
#ifndef WIN32
  // The initializer of a function-local static runs only once, so repeated
  // calls do not register the handler again.
  [[maybe_unused]] static const int atfork_status = pthread_atfork(
      /*prepare=*/nullptr, /*parent=*/nullptr, /*child=*/forked_child);
#endif
}
void initModule(PyObject* module) {
auto m = py::handle(module).cast<py::module>();
m.def("_mtia_init", []() {
TORCH_INTERNAL_ASSERT(!torch::utils::is_device_in_bad_fork(at::kMTIA));
torch::utils::register_fork_handler_for_device_init(at::kMTIA);
TORCH_INTERNAL_ASSERT(!in_bad_fork); // Handled at python level
poison_fork();
at::globalContext().lazyInitDevice(c10::DeviceType::MTIA);
});
@ -24,9 +47,7 @@ void initModule(PyObject* module) {
return at::detail::isMTIAHooksBuilt();
});
m.def("_mtia_isInBadFork", []() {
return torch::utils::is_device_in_bad_fork(at::kMTIA);
});
m.def("_mtia_isInBadFork", []() { return in_bad_fork; });
m.def("_mtia_getCurrentStream", [](c10::DeviceIndex device_index) {
torch::utils::device_lazy_init(at::kMTIA);

View File

@ -1,23 +1,13 @@
#include <c10/core/impl/TorchDispatchModeTLS.h>
#include <c10/util/CallOnce.h>
#include <torch/csrc/utils/device_lazy_init.h>
#include <torch/csrc/Exceptions.h>
#include <torch/csrc/python_headers.h>
#include <torch/csrc/utils/object_ptr.h>
#ifndef WIN32
#include <pthread.h>
#endif
namespace torch::utils {
namespace {
std::array<bool, at::COMPILE_TIME_MAX_DEVICE_TYPES> is_initialized{};
std::array<bool, at::COMPILE_TIME_MAX_DEVICE_TYPES> is_in_bad_fork{};
std::array<c10::once_flag, at::COMPILE_TIME_MAX_DEVICE_TYPES>
at_fork_once_flags{};
std::optional<c10::DeviceType> at_fork_device_type{};
} // anonymous namespace
@ -68,34 +58,4 @@ void set_requires_device_init(at::DeviceType device_type, bool value) {
is_initialized[static_cast<int>(device_type)] = !value;
}
// Reads the bad-fork flag recorded for `device_type`.
bool is_device_in_bad_fork(at::DeviceType device_type) {
  const auto slot = static_cast<int>(device_type);
  return is_in_bad_fork[slot];
}
// Stores `value` as the bad-fork flag for `device_type`.
void set_device_in_bad_fork(at::DeviceType device_type, bool value) {
  const auto slot = static_cast<int>(device_type);
  is_in_bad_fork[slot] = value;
}
// Should be called before the first device runtime call.
//
// Installs a pthread_atfork child handler that, in a forked child, sets the
// bad-fork flag for `device_type`. The body is compiled out when WIN32 is
// defined, which makes the function a no-op there.
//
// Registration goes through a per-device-type once flag. In addition, only a
// single device type may register per process: if a device type has already
// been recorded, the TORCH_CHECK below fails for any other device type.
void register_fork_handler_for_device_init(at::DeviceType device_type) {
#ifndef WIN32
auto& flag = at_fork_once_flags[static_cast<int>(device_type)];
c10::call_once(flag, [device_type]() {
// Refuse to register when some device type has already been recorded.
TORCH_CHECK(
!at_fork_device_type,
"Only one device type can be registered. But now, we have two device types: ",
at_fork_device_type.value(),
" and ",
device_type);
// Stash the device type in the file-level variable: the atfork callback
// below is a captureless lambda and reads the device type from there.
at_fork_device_type = device_type;
// Only the child-side handler is installed; prepare/parent are null.
// NOTE(review): the return value of pthread_atfork is not checked.
pthread_atfork(nullptr, nullptr, []() {
set_device_in_bad_fork(at_fork_device_type.value(), true);
// Where lazy init is supported, also mark the device as requiring
// initialization again in the child.
if (is_device_lazy_init_supported(at_fork_device_type.value())) {
set_requires_device_init(at_fork_device_type.value(), true);
}
});
});
#endif
}
} // namespace torch::utils

View File

@ -67,21 +67,4 @@ inline void maybe_initialize_device(
bool is_device_initialized(at::DeviceType device_type);
TORCH_PYTHON_API bool is_device_in_bad_fork(at::DeviceType device_type);
TORCH_PYTHON_API void set_device_in_bad_fork(
at::DeviceType device_type,
bool value);
TORCH_PYTHON_API void register_fork_handler_for_device_init(
at::DeviceType device_type);
// Forwards to register_fork_handler_for_device_init when `device_type` holds
// a value; an empty optional is ignored.
inline void maybe_register_fork_handler_for_device_init(
    std::optional<at::DeviceType>& device_type) {
  if (device_type.has_value()) {
    register_fork_handler_for_device_init(*device_type);
  }
}
} // namespace torch::utils

View File

@ -11,8 +11,32 @@
#include <torch/csrc/utils/python_strings.h>
#include <torch/csrc/xpu/Module.h>
#ifndef WIN32
#include <pthread.h>
#endif
using namespace torch;
static bool in_bad_fork = false; // True for children forked after xpu init
#ifndef WIN32
// Child-side fork hook, installed through pthread_atfork by poison_fork().
// It runs in the forked child once xpu has already been touched in the
// parent: the child is flagged as a bad fork and xpu is marked as requiring
// device initialization again.
static void forked_child() {
  torch::utils::set_requires_device_init(at::kXPU, true);
  in_bad_fork = true;
}
#endif
// Installs the fork-poisoning hook; call this ahead of the first xpu call
// (it is mainly called in lazy_init).
// Kept separate from initExtension: a stub xpu implementation still has some
// working functions (e.g. device_count) even though it cannot fully
// initialize.
static void poison_fork() {
#ifndef WIN32
  // The initializer of a function-local static runs only once, so repeated
  // calls do not register the handler again.
  [[maybe_unused]] static const int atfork_status = pthread_atfork(
      /*prepare=*/nullptr, /*parent=*/nullptr, /*child=*/forked_child);
#endif
}
// XPU management methods
static PyObject* THXPModule_getArchFlags(PyObject* self, PyObject* noargs) {
@ -28,7 +52,7 @@ static PyObject* THXPModule_getArchFlags(PyObject* self, PyObject* noargs) {
static PyObject* THXPModule_isInBadFork_wrap(PyObject* self, PyObject* noargs) {
HANDLE_TH_ERRORS
return PyBool_FromLong(torch::utils::is_device_in_bad_fork(at::kXPU));
return PyBool_FromLong(in_bad_fork);
END_HANDLE_TH_ERRORS
}
@ -91,9 +115,7 @@ static PyObject* THXPModule_getDeviceCount_wrap(
PyObject* self,
PyObject* noargs) {
HANDLE_TH_ERRORS
// Note: This is distinct from initExtension because a stub xpu implementation
// has some working functions (e.g. device_count) but cannot fully initialize.
torch::utils::register_fork_handler_for_device_init(at::kXPU);
poison_fork();
return THPUtils_packUInt64(at::xpu::device_count());
END_HANDLE_TH_ERRORS
}
@ -398,8 +420,8 @@ static void initXpuMethodBindings(PyObject* module) {
// classes
static PyObject* THXPModule_initExtension(PyObject* self, PyObject* noargs) {
HANDLE_TH_ERRORS
TORCH_INTERNAL_ASSERT(!torch::utils::is_device_in_bad_fork(at::kXPU));
torch::utils::register_fork_handler_for_device_init(at::kXPU);
TORCH_INTERNAL_ASSERT(!in_bad_fork); // Handled at python level
poison_fork();
at::globalContext().lazyInitDevice(c10::DeviceType::XPU);
auto m = THPObjectPtr(PyImport_ImportModule("torch.xpu"));