[nativert] Move Placement to pytorch core (#152953)

Summary:
Move Placement to pytorch core.

We use the fully-qualified `torch::nativert::isSameDevice` in the code to avoid confusion with `isSameDevice` in the `torch` namespace.

Test Plan:
```
buck run fbcode//mode/dev-nosan  //caffe2/test/cpp/nativert:placement_test

./bin/test_nativert
```

OSS and internal CI

Differential Revision: D74190745

Pull Request resolved: https://github.com/pytorch/pytorch/pull/152953
Approved by: https://github.com/Skylion007, https://github.com/swolchok, https://github.com/zhxchen17, https://github.com/cyyever
This commit is contained in:
Shangdi Yu
2025-05-14 15:26:50 +00:00
committed by PyTorch MergeBot
parent ced90d23d3
commit 2e440e39a6
6 changed files with 263 additions and 0 deletions

View File

@ -590,6 +590,8 @@ libtorch_core_jit_sources = sorted(jit_sources_full)
libtorch_nativert_sources = [
"torch/nativert/graph/TensorMeta.cpp",
"torch/nativert/executor/Placement.cpp",
"torch/nativert/executor/PlacementUtils.cpp",
]
torch_mobile_tracer_sources = [

View File

@ -6,6 +6,7 @@ file(GLOB_RECURSE NATIVERT_ALL_TEST_FILES "${NATIVERT_TEST_ROOT}/test_*.cpp")
set(NATIVERT_TEST_SRCS
${NATIVERT_ALL_TEST_FILES}
${TORCH_ROOT}/torch/nativert/graph/TensorMeta.cpp
${TORCH_ROOT}/torch/nativert/executor/PlacementUtils.cpp
)
add_executable(test_nativert
@ -19,6 +20,7 @@ target_compile_definitions(test_nativert PRIVATE USE_GTEST)
set(NATIVERT_TEST_DEPENDENCIES torch gtest)
target_link_libraries(test_nativert PRIVATE ${NATIVERT_TEST_DEPENDENCIES})
target_link_libraries(test_nativert PRIVATE fmt::fmt-header-only)
target_include_directories(test_nativert PRIVATE ${ATen_CPU_INCLUDE})
if(USE_CUDA)

View File

@ -0,0 +1,104 @@
#include <c10/core/Device.h>
#include <gtest/gtest.h>
#include <unordered_map>
#include <torch/nativert/executor/Placement.h>
using namespace ::testing;
namespace torch::nativert {
// normalizeDevice: CPU devices lose any index; CUDA devices default a
// missing index to 0 and preserve an explicit one.
TEST(PlacementTest, NormalizeDevice) {
  const c10::Device cpu = c10::Device(c10::DeviceType::CPU);
  c10::Device cpuIndexed = c10::Device(c10::DeviceType::CPU);
  cpuIndexed.set_index(1);
  EXPECT_EQ(normalizeDevice(cpu), cpu);
  // The indexed CPU device is changed by normalization (index dropped).
  EXPECT_NE(normalizeDevice(cpuIndexed), cpuIndexed);

  const c10::Device cudaNoIndex = c10::Device(c10::DeviceType::CUDA);
  const c10::Device cudaOne = c10::Device(c10::DeviceType::CUDA, 1);
  EXPECT_EQ(
      normalizeDevice(cudaNoIndex), c10::Device(c10::DeviceType::CUDA, 0));
  EXPECT_EQ(normalizeDevice(cudaOne), c10::Device(c10::DeviceType::CUDA, 1));
  EXPECT_NE(normalizeDevice(cudaOne), c10::Device(c10::DeviceType::CUDA, 0));
}
// isSameDevice: CPU compares equal to CPU regardless of index; CUDA
// compares by index with a missing index treated as 0; CPU != CUDA.
TEST(PlacementTest, IsSameDevice) {
  c10::Device cpu = c10::Device(c10::DeviceType::CPU);
  c10::Device cpuIndexed = c10::Device(c10::DeviceType::CPU);
  cpuIndexed.set_index(1);
  EXPECT_TRUE(isSameDevice(cpu, cpu));
  EXPECT_TRUE(isSameDevice(cpu, cpuIndexed));

  const c10::Device cudaDefault = c10::Device(c10::DeviceType::CUDA);
  const c10::Device cudaZero = c10::Device(c10::DeviceType::CUDA, 0);
  const c10::Device cudaOne = c10::Device(c10::DeviceType::CUDA, 1);
  // "cuda" (no index) is the same device as "cuda:0".
  EXPECT_TRUE(isSameDevice(cudaDefault, cudaZero));
  EXPECT_FALSE(isSameDevice(cudaZero, cudaOne));
  EXPECT_FALSE(isSameDevice(cudaZero, cpu));
}
// A placement with only a default device maps every source device to that
// default, and prints as "|<default>" (empty source field).
TEST(PlacementTest, PlacementDefaultOnly) {
  const c10::Device cuda0 = c10::Device(c10::DeviceType::CUDA, 0);
  Placement placement(cuda0);

  std::ostringstream os;
  os << placement;
  EXPECT_EQ(os.str(), "|cuda:0");

  EXPECT_EQ(placement.getMappedDevice(cuda0), cuda0);
  EXPECT_EQ(
      placement.getMappedDevice(c10::Device(c10::DeviceType::CUDA, 1)), cuda0);
  EXPECT_EQ(
      placement.getMappedDevice(c10::Device(c10::DeviceType::CUDA, 2)), cuda0);
}
// An explicit device map routes matched devices; unmatched devices fall
// back to the default. Printed entries are sorted by source device string,
// with the default ("|cuda:0") last.
TEST(PlacementTest, PlacementBasic) {
  const c10::Device cpu = c10::Device(c10::DeviceType::CPU);
  const c10::Device cuda0 = c10::Device(c10::DeviceType::CUDA, 0);
  const c10::Device cuda1 = c10::Device(c10::DeviceType::CUDA, 1);
  const c10::Device cuda2 = c10::Device(c10::DeviceType::CUDA, 2);
  const c10::Device cuda3 = c10::Device(c10::DeviceType::CUDA, 3);

  Placement placement({{cpu, cpu}, {cuda0, cuda1}, {cuda1, cuda2}}, cuda0);

  std::ostringstream os;
  os << placement;
  EXPECT_EQ(os.str(), "cpu|cpu,cuda:0|cuda:1,cuda:1|cuda:2,|cuda:0");

  EXPECT_EQ(placement.getMappedDevice(cpu), cpu);
  EXPECT_EQ(placement.getMappedDevice(cuda0), cuda1);
  EXPECT_EQ(placement.getMappedDevice(cuda1), cuda2);
  // cuda:2 and cuda:3 have no explicit mapping -> default device cuda:0.
  EXPECT_EQ(placement.getMappedDevice(cuda2), cuda0);
  EXPECT_EQ(placement.getMappedDevice(cuda3), cuda0);
}
// String-constructed devices are normalized before lookup ("cuda" acts as
// "cuda:0"); with no default device, unmapped devices pass through as-is.
TEST(PlacementTest, Placement) {
  Placement p1(std::unordered_map<c10::Device, c10::Device>{
      {c10::Device("cuda:0"), c10::Device("cuda:1")}});
  EXPECT_EQ(p1.getMappedDevice(c10::Device("cpu")), c10::Device("cpu"));
  // "cuda" normalizes to "cuda:0" and therefore hits the mapping.
  EXPECT_EQ(p1.getMappedDevice(c10::Device("cuda")), c10::Device("cuda:1"));
  EXPECT_EQ(p1.getMappedDevice(c10::Device("cuda:0")), c10::Device("cuda:1"));

  Placement p2(std::unordered_map<c10::Device, c10::Device>{
      {c10::Device("cpu"), c10::Device("cuda")}});
  EXPECT_EQ(p2.getMappedDevice(c10::Device("cpu")), c10::Device("cuda:0"));
  EXPECT_EQ(p2.getMappedDevice(c10::Device("cuda:0")), c10::Device("cuda:0"));
  EXPECT_EQ(p2.getMappedDevice(c10::Device("cuda:1")), c10::Device("cuda:1"));
}
} // namespace torch::nativert

View File

@ -0,0 +1,61 @@
#include <torch/nativert/executor/Placement.h>

#include <fmt/ostream.h>

#include <algorithm>
#include <ostream>
#include <string>
#include <utility>
#include <vector>
namespace torch::nativert {
std::ostream& operator<<(std::ostream& os, const Placement& placement) {
std::vector<std::pair<std::string, c10::Device>> sorted_keys;
sorted_keys.reserve(placement.deviceMap_.size());
for (const auto& pair : placement.deviceMap_) {
sorted_keys.emplace_back(pair.first.str(), pair.first);
}
std::sort(
sorted_keys.begin(), sorted_keys.end(), [](const auto& a, const auto& b) {
return a.first < b.first;
});
bool first = true;
for (const auto& pair : sorted_keys) {
if (!first) {
fmt::print(os, ",");
}
first = false;
const auto& key = pair.second;
const auto& value = placement.deviceMap_.at(key);
fmt::print(os, "{}|{}", pair.first, value.str());
}
if (placement.defaultDevice_.has_value()) {
fmt::print(os, "{}|{}", first ? "" : ",", placement.defaultDevice_->str());
}
return os;
}
// Convenience constructor: no explicit per-device mapping, only an optional
// default device that every source device falls back to.
Placement::Placement(std::optional<c10::Device> defaultDevice)
    : Placement({}, defaultDevice) {}
// Build the placement table. Both keys and values are normalized (CPU loses
// its index; CUDA gets index 0 when unspecified) so lookups in
// getMappedDevice can compare normalized devices directly. try_emplace keeps
// the first entry when two source devices normalize to the same key.
Placement::Placement(
    const std::unordered_map<c10::Device, c10::Device>& deviceMap,
    std::optional<c10::Device> defaultDevice) {
  for (const auto& entry : deviceMap) {
    deviceMap_.try_emplace(
        normalizeDevice(entry.first), normalizeDevice(entry.second));
  }
  if (defaultDevice) {
    defaultDevice_ = normalizeDevice(*defaultDevice);
  }
}
// Resolve the target device for srcDevice: an explicit mapping wins, then
// the default device, and finally the source device itself passes through.
c10::Device Placement::getMappedDevice(const c10::Device& srcDevice) const {
  const auto it = deviceMap_.find(normalizeDevice(srcDevice));
  if (it != deviceMap_.end()) {
    return it->second;
  }
  return defaultDevice_.value_or(srcDevice);
}
} // namespace torch::nativert

View File

@ -0,0 +1,57 @@
#pragma once
#include <c10/core/Device.h>
#include <c10/util/Logging.h>
#include <optional>
#include <unordered_map>
namespace torch::nativert {
/**
 * This function returns a normalized version of the input device:
 * - For CPU devices, the returned device will have no index (i.e., the default
 * CPU device).
 * - For CUDA devices, if no index is specified, index 0 is assumed.
 * - For other device types, the function will raise an error.
 *
 * @param device The input c10::Device to normalize.
 * @return A normalized c10::Device with standardized indexing.
 *
 * @throws c10::Error If the device type is not CPU or CUDA.
 */
c10::Device normalizeDevice(const c10::Device& device);
/**
 * Returns true if the two devices are the same type and have the same device
 * index (for CUDA, a missing index is treated as index 0; CPU indices are
 * ignored).
 */
bool isSameDevice(const c10::Device& device1, const c10::Device& device2);
/**
 * @brief A utility class for managing device placement mappings.
 *
 * The Placement class provides a way to map source devices to target devices.
 * It supports both explicit per-device mappings and a default device fallback.
 * This is the argument taken in NativeRT to map from model artifact device to
 * the device it should run on.
 */
struct TORCH_API Placement {
Placement() = default;
// Default-device-only placement: every source device maps to defaultDevice.
explicit Placement(std::optional<c10::Device> defaultDevice);
// Explicit per-device mapping with an optional default fallback; devices
// are normalized (see normalizeDevice) before being stored.
explicit Placement(
const std::unordered_map<c10::Device, c10::Device>& deviceMap,
std::optional<c10::Device> defaultDevice = std::nullopt);
// Returns the mapped device for srcDevice: explicit mapping first, then
// the default device, else srcDevice unchanged.
c10::Device getMappedDevice(const c10::Device& srcDevice) const;
// Prints "src|dst,..." pairs sorted by source string, then "|default".
TORCH_API friend std::ostream& operator<<(
std::ostream& os,
const Placement& obj);
protected:
// Normalized source device -> normalized target device.
std::unordered_map<c10::Device, c10::Device> deviceMap_;
// Normalized fallback device for sources absent from deviceMap_.
std::optional<c10::Device> defaultDevice_;
};
} // namespace torch::nativert

View File

@ -0,0 +1,37 @@
#include <torch/nativert/executor/Placement.h>
#include <fmt/ostream.h>
namespace torch::nativert {
// Canonicalize a device so it can serve as a Placement map key:
// - CPU: drop any index (there is only one logical CPU device).
// - CUDA: default a missing index to 0 so "cuda" and "cuda:0" compare equal.
// - Any other device type raises a c10::Error.
c10::Device normalizeDevice(const c10::Device& device) {
  // cpu device doesn't have index
  // cuda device index must have a index
  if (device.is_cpu()) {
    return c10::Device(c10::DeviceType::CPU);
  } else if (device.is_cuda()) {
    return c10::Device(
        c10::DeviceType::CUDA,
        device.has_index() ? device.index() : static_cast<c10::DeviceIndex>(0));
  } else {
    // Fixed: add a ": " separator so the error reads
    // "Unsupported device type: xla:0" rather than "Unsupported device typexla:0".
    TORCH_CHECK(false, "Unsupported device type: ", device);
  }
}
// Device equality under normalization semantics: CPU devices are always the
// same regardless of index; CUDA devices compare by index with a missing
// index treated as 0; a CPU and a CUDA device are never the same.
// Any other device type raises a c10::Error.
bool isSameDevice(const c10::Device& a, const c10::Device& b) {
  if (a.is_cpu()) {
    return b.is_cpu();
  }
  if (a.is_cuda()) {
    // Early-out keeps the index comparison unindented.
    if (!b.is_cuda()) {
      return false;
    }
    auto aIndex = a.has_index() ? a.index() : 0;
    auto bIndex = b.has_index() ? b.index() : 0;
    return aIndex == bIndex;
  }
  // Fixed: add a ": " separator so the devices don't run into the message
  // ("Unsupported device type: xla:0 and ..." vs "...typexla:0 and ...").
  TORCH_CHECK(false, "Unsupported device type: ", a, " and ", b);
  return false; // unreachable; silences missing-return warnings
}
} // namespace torch::nativert