Add device and key for lazy tensors (#61621)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61621

Test Plan: CI

Reviewed By: mruberry

Differential Revision: D29912934

Pulled By: asuhan

fbshipit-source-id: 493c32063a3e756d93cbf1d876563a35eaafb537
Author: Alex Suhan
Date: 2021-07-26 22:59:10 -07:00
Committed by: Facebook GitHub Bot
parent 2945a73d90
commit b176feec1e
24 changed files with 187 additions and 70 deletions
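
For orientation, a minimal usage sketch (not part of this diff) of what the new wiring enables; it only uses names introduced or touched below (parse_type, DeviceType::Lazy, Device::supports_as_strided):

#include <c10/core/Device.h>
#include <c10/util/Exception.h>

void lazy_device_sketch() {
  // "lazy" now parses to DeviceType::Lazy (see the parse_type change below).
  c10::Device dev("lazy");
  TORCH_CHECK(dev.type() == c10::DeviceType::Lazy);
  // Like XLA, lazy devices report that they do not support arbitrary strides.
  TORCH_CHECK(!dev.supports_as_strided());
}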

View File

@ -73,6 +73,9 @@ class TORCH_API Context {
static bool hasXLA() {
return c10::impl::hasDeviceGuardImpl(at::DeviceType::XLA);
}
static bool hasLazy() {
return c10::impl::hasDeviceGuardImpl(at::DeviceType::Lazy);
}
static bool hasMLC() {
return c10::impl::hasDeviceGuardImpl(at::DeviceType::MLC);
}
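
A hedged usage sketch (not part of the diff): hasLazy() is queried the same way as hasXLA(), and only returns true once a backend has installed a device guard for the Lazy device type:

#include <ATen/Context.h>

bool lazy_backend_present() {
  // True only if some backend registered a DeviceGuardImpl for DeviceType::Lazy.
  return at::Context::hasLazy();
}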

View File

@ -1343,9 +1343,9 @@ void TensorIteratorBase::build(TensorIteratorConfig& config) {
if (is_meta_) return;
// XLA tensors don't have storage, so they don't have an underlying data pointer.
// XLA and lazy tensors don't have storage, so they don't have an underlying data pointer.
// Nothing beyond this point is important for meta functions, so it's fine to exit early here.
if (common_device_.type() == DeviceType::XLA) return;
if (common_device_.type() == DeviceType::XLA || common_device_.type() == DeviceType::Lazy) return;
for (auto& op : operands_) {
TORCH_INTERNAL_ASSERT(op.tensor->defined());

View File

@ -52,6 +52,10 @@ TORCH_LIBRARY_IMPL(_, AutogradXLA, m) {
m.fallback(torch::CppFunction::makeFallthrough());
}
TORCH_LIBRARY_IMPL(_, AutogradLazy, m) {
m.fallback(torch::CppFunction::makeFallthrough());
}
TORCH_LIBRARY_IMPL(_, AutogradMLC, m) {
m.fallback(torch::CppFunction::makeFallthrough());
}
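
With the AutogradLazy fallthrough registered above, autograd-enabled lazy tensors fall straight through to the backend kernel. An illustrative, hypothetical out-of-tree registration under the new key might look like this (my_lazy_add and its recording logic are assumptions, not part of this PR):

#include <torch/library.h>
#include <ATen/ATen.h>

// Hypothetical lazy kernel for aten::add.Tensor; a real backend would record
// the op into its lazy IR instead of returning the input unchanged.
at::Tensor my_lazy_add(const at::Tensor& self, const at::Tensor& /*other*/,
                       const at::Scalar& /*alpha*/) {
  return self;
}

TORCH_LIBRARY_IMPL(aten, Lazy, m) {
  m.impl("add.Tensor", my_lazy_add);
}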

View File

@ -257,6 +257,11 @@ TEST(OperatorRegistrationTest, whenRegisteringCPUTensorType_thenCanOnlyCallUnbox
" backend.");
}
std::string expectedMessageForBackend(DispatchKey key) {
std::string key_str(c10::toString(key));
return "Could not run '_test::dummy' with arguments from the '" + key_str + "' backend";
}
TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsInSameOpCallAndCalling_thenCallsCorrectKernel) {
bool called_kernel1 = false;
bool called_kernel2 = false;
@ -277,18 +282,20 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsInSameOpCallAndCall
EXPECT_FALSE(called_kernel1);
EXPECT_TRUE(called_kernel2);
expectThrows<c10::Error>([&] {
callOp(*op, dummyTensor(c10::DispatchKey::XLA));
}, "Could not run '_test::dummy' with arguments from the 'XLA'"
" backend.");
for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
std::string expectMessage = expectedMessageForBackend(key);
expectThrows<c10::Error>([&] {
callOp(*op, dummyTensor(key));
}, expectMessage.c_str());
// also assert that the error message contains the available tensor type ids, but don't assert their order
expectThrows<c10::Error>([&] {
callOp(*op, dummyTensor(c10::DispatchKey::XLA));
}, "CPU");
expectThrows<c10::Error>([&] {
callOp(*op, dummyTensor(c10::DispatchKey::XLA));
}, "CUDA");
// also assert that the error message contains the available tensor type ids, but don't assert their order
expectThrows<c10::Error>([&] {
callOp(*op, dummyTensor(key));
}, "CPU");
expectThrows<c10::Error>([&] {
callOp(*op, dummyTensor(key));
}, "CUDA");
}
}
bool called_stackbased_kernel = false;
@ -302,7 +309,8 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsByNameAndNoneCanInf
auto registrar1 = c10::RegisterOperators().op("_test::dummy", c10::RegisterOperators::options()
.kernel<&stackBasedKernel>(c10::DispatchKey::CPU)
.kernel<&stackBasedKernel>(c10::DispatchKey::CUDA)
.kernel<&stackBasedKernel>(c10::DispatchKey::XLA));
.kernel<&stackBasedKernel>(c10::DispatchKey::XLA)
.kernel<&stackBasedKernel>(c10::DispatchKey::Lazy));
}, "Cannot infer operator schema for this kind of kernel in registration of operator _test::dummy");
}
@ -311,7 +319,8 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsBySchemaAndNoneCanI
auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options()
.kernel<&stackBasedKernel>(c10::DispatchKey::CPU)
.kernel<&stackBasedKernel>(c10::DispatchKey::CUDA)
.kernel<&stackBasedKernel>(c10::DispatchKey::XLA));
.kernel<&stackBasedKernel>(c10::DispatchKey::XLA)
.kernel<&stackBasedKernel>(c10::DispatchKey::Lazy));
auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""});
ASSERT_TRUE(op.has_value()); // assert schema is registered
@ -326,10 +335,12 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsBySchemaAndNoneCanI
EXPECT_TRUE(called_stackbased_kernel);
EXPECT_FALSE(called_kernel);
called_kernel = called_stackbased_kernel = false;
callOp(*op, dummyTensor(c10::DispatchKey::XLA));
EXPECT_TRUE(called_stackbased_kernel);
EXPECT_FALSE(called_kernel);
for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
called_kernel = called_stackbased_kernel = false;
callOp(*op, dummyTensor(key));
EXPECT_TRUE(called_stackbased_kernel);
EXPECT_FALSE(called_kernel);
}
}
TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsByNameAndOnlyOneCanInferSchema_thenSucceeds) {
@ -337,7 +348,8 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsByNameAndOnlyOneCan
auto registrar1 = c10::RegisterOperators().op("_test::dummy", c10::RegisterOperators::options()
.kernel<&stackBasedKernel>(c10::DispatchKey::CPU)
.kernel<MockKernel>(c10::DispatchKey::CUDA, &called_kernel)
.kernel<&stackBasedKernel>(c10::DispatchKey::XLA));
.kernel<&stackBasedKernel>(c10::DispatchKey::XLA)
.kernel<&stackBasedKernel>(c10::DispatchKey::Lazy));
auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""});
ASSERT_TRUE(op.has_value()); // assert schema is registered
@ -352,10 +364,12 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsByNameAndOnlyOneCan
EXPECT_FALSE(called_stackbased_kernel);
EXPECT_TRUE(called_kernel);
called_kernel = called_stackbased_kernel = false;
callOp(*op, dummyTensor(c10::DispatchKey::XLA));
EXPECT_TRUE(called_stackbased_kernel);
EXPECT_FALSE(called_kernel);
for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
called_kernel = called_stackbased_kernel = false;
callOp(*op, dummyTensor(key));
EXPECT_TRUE(called_stackbased_kernel);
EXPECT_FALSE(called_kernel);
}
}
TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsBySchemaAndOnlyOneCanInferSchema_thenSucceeds) {
@ -363,7 +377,8 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsBySchemaAndOnlyOneC
auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options()
.kernel<&stackBasedKernel>(c10::DispatchKey::CPU)
.kernel<MockKernel>(c10::DispatchKey::CUDA, &called_kernel)
.kernel<&stackBasedKernel>(c10::DispatchKey::XLA));
.kernel<&stackBasedKernel>(c10::DispatchKey::XLA)
.kernel<&stackBasedKernel>(c10::DispatchKey::Lazy));
auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""});
ASSERT_TRUE(op.has_value()); // assert schema is registered
@ -378,10 +393,12 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsBySchemaAndOnlyOneC
EXPECT_FALSE(called_stackbased_kernel);
EXPECT_TRUE(called_kernel);
called_kernel = called_stackbased_kernel = false;
callOp(*op, dummyTensor(c10::DispatchKey::XLA));
EXPECT_TRUE(called_stackbased_kernel);
EXPECT_FALSE(called_kernel);
for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
called_kernel = called_stackbased_kernel = false;
callOp(*op, dummyTensor(key));
EXPECT_TRUE(called_stackbased_kernel);
EXPECT_FALSE(called_kernel);
}
}
struct DummyKernelWithIntParam final : OperatorKernel {
@ -570,21 +587,21 @@ TEST(OperatorRegistrationTest, AutogradBackendOverridesAutogradKernel) {
EXPECT_FALSE(called_nonautograd);
}
TEST(OperatorRegistrationTest, AutogradXLAOverridesAutogradKernel) {
void LazyBackendsAutogradOverridesAutogradKernel(DispatchKey key) {
auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options()
.kernel<decltype(nonautograd_kernel), &nonautograd_kernel>(DispatchKey::AutogradXLA)
.kernel<decltype(nonautograd_kernel), &nonautograd_kernel>(c10::getAutogradKeyFromBackend(key))
.kernel<decltype(autograd_kernel), &autograd_kernel>(DispatchKey::Autograd));
auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""});
ASSERT_TRUE(op.has_value());
std::string expectedMessage = expectedMessageForBackend(key);
expectThrows<c10::Error>([&] {
callOp(*op, dummyTensor(c10::DispatchKey::XLA));
}, "Could not run '_test::dummy' with arguments from the 'XLA'"
" backend.");
callOp(*op, dummyTensor(key));
}, expectedMessage.c_str());
called_nonautograd = called_autograd = false;
op->typed<void (Tensor)>().call(dummyTensor(DispatchKey::XLA, /*requires_grad=*/true));
op->typed<void (Tensor)>().call(dummyTensor(key, /*requires_grad=*/true));
EXPECT_TRUE(called_nonautograd);
EXPECT_FALSE(called_autograd);
@ -594,7 +611,15 @@ TEST(OperatorRegistrationTest, AutogradXLAOverridesAutogradKernel) {
EXPECT_FALSE(called_nonautograd);
}
TEST(OperatorRegistrationTest, whenRegisterWithXLAKernelAndCatchAll_AutogradXLAIsNotFilled) {
TEST(OperatorRegistrationTest, AutogradXLAOverridesAutogradKernel) {
LazyBackendsAutogradOverridesAutogradKernel(DispatchKey::XLA);
}
TEST(OperatorRegistrationTest, AutogradLazyOverridesAutogradKernel) {
LazyBackendsAutogradOverridesAutogradKernel(DispatchKey::Lazy);
}
void whenRegisterWithLazyBackendsAndCatchAll_AutogradLazyBackendsIsNotFilled(DispatchKey key) {
{
auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options()
.catchAllKernel<decltype(nonautograd_kernel), nonautograd_kernel>());
@ -603,38 +628,46 @@ TEST(OperatorRegistrationTest, whenRegisterWithXLAKernelAndCatchAll_AutogradXLAI
ASSERT_TRUE(op.has_value());
called_nonautograd = called_autograd = false;
op->typed<void (Tensor)>().call(dummyTensor(DispatchKey::XLA, /*requires_grad=*/true));
op->typed<void (Tensor)>().call(dummyTensor(key, /*requires_grad=*/true));
EXPECT_TRUE(called_nonautograd);
EXPECT_FALSE(called_autograd);
called_nonautograd = called_autograd = false;
op->typed<void (Tensor)>().call(dummyTensor(DispatchKey::XLA));
op->typed<void (Tensor)>().call(dummyTensor(key));
EXPECT_FALSE(called_autograd);
EXPECT_TRUE(called_nonautograd);
}
{
auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options()
.kernel<decltype(autograd_kernel), &autograd_kernel>(DispatchKey::XLA)
.kernel<decltype(autograd_kernel), &autograd_kernel>(key)
.catchAllKernel<decltype(nonautograd_kernel), nonautograd_kernel>());
auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""});
ASSERT_TRUE(op.has_value());
// When there's direct registration to XLA backend, AutogradXLA doesn't pick up catchAll
// When there's direct registration to XLA / Lazy backend, Autograd{XLA, Lazy} doesn't pick up catchAll
// kernel in precompute but just keeps fallthrough kernel from backend fallback.
// Thus it falls through AutogradXLA and reaches the kernel at XLA key.
// Thus it falls through Autograd{XLA, Lazy} and reaches the kernel at XLA / Lazy key.
called_nonautograd = called_autograd = false;
op->typed<void (Tensor)>().call(dummyTensor(DispatchKey::XLA, /*requires_grad=*/true));
op->typed<void (Tensor)>().call(dummyTensor(key, /*requires_grad=*/true));
EXPECT_FALSE(called_nonautograd);
EXPECT_TRUE(called_autograd);
called_nonautograd = called_autograd = false;
op->typed<void (Tensor)>().call(dummyTensor(DispatchKey::XLA));
op->typed<void (Tensor)>().call(dummyTensor(key));
EXPECT_TRUE(called_autograd);
EXPECT_FALSE(called_nonautograd);
}
}
TEST(OperatorRegistrationTest, whenRegisterWithXLAKernelAndCatchAll_AutogradXLAIsNotFilled) {
whenRegisterWithLazyBackendsAndCatchAll_AutogradLazyBackendsIsNotFilled(DispatchKey::XLA);
}
TEST(OperatorRegistrationTest, whenRegisterWithLazyKernelAndCatchAll_AutogradLazyIsNotFilled) {
whenRegisterWithLazyBackendsAndCatchAll_AutogradLazyBackendsIsNotFilled(DispatchKey::Lazy);
}
TEST(OperatorRegistrationTest, givenLambdaKernel_whenRegisteringWithMismatchingCppSignatures_thenFails) {
expectThrows<c10::Error>([] {
auto registrar = c10::RegisterOperators().op("_test::dummy", c10::RegisterOperators::options()
@ -1217,9 +1250,11 @@ TEST(NewOperatorRegistrationTest, testBasics) {
m.def("dummy4", [](const Tensor& self, const Tensor& other) { return other; });
m.impl("dummy", c10::DeviceType::CPU, [](const Tensor& self) { return self; });
m.impl("dummy", c10::DeviceType::XLA, [](const Tensor& self) { return self; });
m.impl("dummy", c10::DeviceType::Lazy, [](const Tensor& self) { return self; });
// Internal API
m.impl("dummy2", c10::DispatchKey::CPU, [](const Tensor& self) { return self; });
m.impl("dummy2", c10::DispatchKey::XLA, [](const Tensor& self) { return self; });
m.impl("dummy2", c10::DispatchKey::Lazy, [](const Tensor& self) { return self; });
ASSERT_TRUE(Dispatcher::singleton().findSchema({"_test::dummy", ""}).has_value());
// Should have a schema even if there are no impls
@ -1345,15 +1380,15 @@ TEST(NewOperatorRegistrationTest, dispatchWithCompositeImplicitAutogradKernel) {
ASSERT_TRUE(math_called);
}
{
for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
math_called = false;
callOp(*op, dummyTensor(c10::DispatchKey::XLA));
callOp(*op, dummyTensor(key));
ASSERT_TRUE(math_called);
}
{
for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
math_called = false;
callOp(*op, dummyTensor(c10::DispatchKey::XLA, /*requires_grad=*/true));
callOp(*op, dummyTensor(key, /*requires_grad=*/true));
ASSERT_TRUE(math_called);
}
@ -1523,16 +1558,16 @@ TEST(NewOperatorRegistrationTest, dispatchWithCompositeExplicitAutogradKernel) {
ASSERT_TRUE(called);
}
{
for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
called = false;
callOp(*op, dummyTensor(c10::DispatchKey::XLA));
callOp(*op, dummyTensor(key));
ASSERT_TRUE(called);
}
{
for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
called = false;
// AutogradXLA is fallthrough, calls XLA kernel
callOp(*op, dummyTensor(c10::DispatchKey::XLA, /*requires_grad=*/true));
// Autograd{XLA, Lazy} is fallthrough, calls XLA / Lazy kernel
callOp(*op, dummyTensor(key, /*requires_grad=*/true));
ASSERT_TRUE(called);
}
@ -1575,17 +1610,17 @@ TEST(NewOperatorRegistrationTest, dispatchWithCompositeExplicitAutogradAndCompos
ASSERT_TRUE(backend_called);
}
{
for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
backend_called = math_called = false;
callOp(*op, dummyTensor(c10::DispatchKey::XLA));
callOp(*op, dummyTensor(key));
ASSERT_TRUE(backend_called);
ASSERT_FALSE(math_called);
}
{
for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
backend_called = math_called = false;
// AutogradXLA is fallthrough, calls XLA kernel
callOp(*op, dummyTensor(c10::DispatchKey::XLA, /*requires_grad=*/true));
// Autograd{XLA, Lazy} is fallthrough, calls XLA / Lazy kernel
callOp(*op, dummyTensor(key, /*requires_grad=*/true));
ASSERT_FALSE(math_called);
ASSERT_TRUE(backend_called);
}
@ -1681,11 +1716,11 @@ TEST(NewOperatorRegistrationTest, dispatch) {
ASSERT_TRUE(autograd_called);
}
{
for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
autograd_called = false;
auto op = Dispatcher::singleton().findSchema({"test::fn_autograd", ""});
ASSERT_TRUE(op.has_value());
callOp(*op, dummyTensor(c10::DispatchKey::XLA, /*requires_grad=*/true));
callOp(*op, dummyTensor(key, /*requires_grad=*/true));
ASSERT_TRUE(autograd_called);
}
}

View File

@ -26,7 +26,7 @@ list(APPEND ATen_CPU_TEST_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/quantized_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/extension_backend_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/operators_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/xla_tensor_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/lazy_tensor_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/tensor_iterator_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/math_kernel_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/memory_overlapping_test.cpp

View File

@ -0,0 +1,23 @@
#include <gtest/gtest.h>
#include <ATen/ATen.h>
void LazyTensorTest(c10::DispatchKey dispatch_key, at::DeviceType device_type) {
auto tensor_impl =
c10::make_intrusive<c10::TensorImpl, c10::UndefinedTensorImpl>(
dispatch_key,
caffe2::TypeMeta::Make<float>(),
at::Device(device_type, 0));
at::Tensor t(std::move(tensor_impl));
ASSERT_TRUE(t.device() == at::Device(device_type, 0));
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(XlaTensorTest, TestNoStorage) {
LazyTensorTest(at::DispatchKey::XLA, at::DeviceType::XLA);
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(LazyTensorTest, TestNoStorage) {
LazyTensorTest(at::DispatchKey::Lazy, at::DeviceType::Lazy);
}

View File

@ -18,7 +18,7 @@ VALGRIND=${VALGRIND:=ON}
./tensor_interop_test
./undefined_tensor_test
./extension_backend_test
./xla_tensor_test
./lazy_tensor_test
./tensor_iterator_test
./Dimname_test
./Dict_test

View File

@ -51,6 +51,7 @@ enum class Backend {
MkldnnCPU,
MLC,
HPU,
Lazy,
NumOptions
};
@ -69,6 +70,8 @@ static inline Backend dispatchKeyToBackend(DispatchKey t) {
return Backend::MSNPU;
} else if (t == DispatchKey::XLA || t == DispatchKey::AutogradXLA) {
return Backend::XLA;
} else if (t == DispatchKey::Lazy || t == DispatchKey::AutogradLazy) {
return Backend::Lazy;
} else if (t == DispatchKey::MLC || t == DispatchKey::AutogradMLC) {
return Backend::MLC;
} else if (t == DispatchKey::Vulkan) {
@ -124,6 +127,8 @@ static inline DispatchKey backendToDispatchKey(Backend b) {
return DispatchKey::MSNPU;
case Backend::XLA:
return DispatchKey::XLA;
case Backend::Lazy:
return DispatchKey::Lazy;
case Backend::XPU:
return DispatchKey::XPU;
case Backend::SparseXPU:
@ -177,6 +182,8 @@ static inline DeviceType backendToDeviceType(Backend b) {
return DeviceType::MSNPU;
case Backend::XLA:
return DeviceType::XLA;
case Backend::Lazy:
return DeviceType::Lazy;
case Backend::SparseCPU:
return DeviceType::CPU;
case Backend::SparseCUDA:
@ -232,6 +239,8 @@ static inline const char* toString(Backend b) {
return "MSNPU";
case Backend::XLA:
return "XLA";
case Backend::Lazy:
return "Lazy";
case Backend::MLC:
return "MLC";
case Backend::SparseCPU:

View File

@ -45,6 +45,7 @@ DeviceType parse_type(const std::string& device_string) {
{"fpga", DeviceType::FPGA},
{"msnpu", DeviceType::MSNPU},
{"xla", DeviceType::XLA},
{"lazy", DeviceType::Lazy},
{"vulkan", DeviceType::Vulkan},
{"mlc", DeviceType::MLC},
{"meta", DeviceType::Meta},
@ -61,7 +62,7 @@ DeviceType parse_type(const std::string& device_string) {
}
TORCH_CHECK(
false,
"Expected one of cpu, cuda, xpu, mkldnn, opengl, opencl, ideep, hip, ve, msnpu, mlc, xla, vulkan, meta, hpu device type at start of device string: ",
"Expected one of cpu, cuda, xpu, mkldnn, opengl, opencl, ideep, hip, ve, msnpu, mlc, xla, lazy, vulkan, meta, hpu device type at start of device string: ",
device_string);
}
} // namespace

View File

@ -103,7 +103,7 @@ struct C10_API Device final {
/// Return true if the device supports arbitrary strides.
bool supports_as_strided() const noexcept {
return type_ != DeviceType::XLA;
return type_ != DeviceType::XLA && type_ != DeviceType::Lazy;
}
/// Same string as returned from operator<<.

View File

@ -29,6 +29,8 @@ std::string DeviceTypeName(DeviceType d, bool lower_case) {
return lower_case ? "msnpu" : "MSNPU";
case DeviceType::XLA:
return lower_case ? "xla" : "XLA";
case DeviceType::Lazy:
return lower_case ? "lazy" : "LAZY";
case DeviceType::MLC:
return lower_case ? "mlc" : "MLC";
case DeviceType::Vulkan:
@ -75,6 +77,7 @@ bool isValidDeviceType(DeviceType d) {
case DeviceType::FPGA:
case DeviceType::MSNPU:
case DeviceType::XLA:
case DeviceType::Lazy:
case DeviceType::MLC:
case DeviceType::Vulkan:
case DeviceType::Metal:

View File

@ -30,11 +30,12 @@ enum class DeviceType : int8_t {
Meta = 14, // Meta (tensors with no data)
HPU = 15, // HPU / HABANA
VE = 16, // SX-Aurora / NEC
Lazy = 17, // Lazy Tensors
// NB: If you add more devices:
// - Change the implementations of DeviceTypeName and isValidDeviceType
// in DeviceType.cpp
// - Change the number below
COMPILE_TIME_MAX_DEVICE_TYPES = 17,
COMPILE_TIME_MAX_DEVICE_TYPES = 18,
};
constexpr DeviceType kCPU = DeviceType::CPU;
@ -50,18 +51,19 @@ constexpr DeviceType kMetal = DeviceType::Metal;
constexpr DeviceType kXPU = DeviceType::XPU;
constexpr DeviceType kHPU = DeviceType::HPU;
constexpr DeviceType kVE = DeviceType::VE;
constexpr DeviceType kLazy = DeviceType::Lazy;
// define explicit int constant
constexpr int COMPILE_TIME_MAX_DEVICE_TYPES =
static_cast<int>(DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES);
static_assert(
COMPILE_TIME_MAX_DEVICE_TYPES <= 17,
COMPILE_TIME_MAX_DEVICE_TYPES <= 18,
"Hey! You seem to be adding a lot of new DeviceTypes. The intent was "
"for this constant to reflect the actual number of DeviceTypes we support "
"in PyTorch; it's important that this number is not too large as we "
"use this to allocate stack arrays in some places in our code. If you "
"are indeed just adding the 17th device type, feel free to change "
"are indeed just adding the 18th device type, feel free to change "
"the check to 32; but if you are adding some sort of extensible device "
"types registration, please be aware that you are affecting code that "
"this number is small. Try auditing uses of this constant.");

View File

@ -23,6 +23,8 @@ const char* toString(DispatchKey t) {
return "MSNPU";
case DispatchKey::XLA:
return "XLA";
case DispatchKey::Lazy:
return "Lazy";
case DispatchKey::MLC:
return "MLC";
case DispatchKey::HPU:
@ -91,6 +93,8 @@ const char* toString(DispatchKey t) {
return "AutogradCUDA";
case DispatchKey::AutogradXLA:
return "AutogradXLA";
case DispatchKey::AutogradLazy:
return "AutogradLazy";
case DispatchKey::AutogradMLC:
return "AutogradMLC";
case DispatchKey::AutogradHPU:
@ -179,6 +183,8 @@ DispatchKey getAutogradKeyFromBackend(DispatchKey t) {
return DispatchKey::AutogradCUDA;
case DispatchKey::XLA:
return DispatchKey::AutogradXLA;
case DispatchKey::Lazy:
return DispatchKey::AutogradLazy;
case DispatchKey::MLC:
return DispatchKey::AutogradMLC;
case DispatchKey::HPU:
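
A hedged sketch (not part of the diff) of the new mapping and the string names added in this file:

#include <cstring>
#include <c10/core/DispatchKey.h>
#include <c10/util/Exception.h>

void lazy_key_mapping_sketch() {
  // The Lazy backend key now has a matching per-backend autograd key...
  TORCH_CHECK(c10::getAutogradKeyFromBackend(c10::DispatchKey::Lazy) ==
              c10::DispatchKey::AutogradLazy);
  // ...and both keys stringify with the names added above.
  TORCH_CHECK(std::strcmp(c10::toString(c10::DispatchKey::Lazy), "Lazy") == 0);
  TORCH_CHECK(std::strcmp(c10::toString(c10::DispatchKey::AutogradLazy), "AutogradLazy") == 0);
}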

View File

@ -68,6 +68,7 @@ enum class DispatchKey : uint8_t {
XPU, // For out of tree Intel's heterogeneous computing plug-in
HPU, // For out of tree & closed source integration of HPU / Habana
VE, // For out of tree & closed source integration of SX-Aurora / NEC
Lazy, // For lazy tensor backends
// A meta tensor is a tensor without any data associated with it. (They
// have also colloquially been referred to as tensors on the "null" device).
@ -229,6 +230,7 @@ enum class DispatchKey : uint8_t {
AutogradCPU,
AutogradCUDA,
AutogradXLA,
AutogradLazy,
AutogradXPU,
AutogradMLC,
AutogradHPU,

View File

@ -12,6 +12,7 @@ constexpr DispatchKeySet backend_dispatch_keyset = autogradother_backends |
DispatchKey::CPU,
DispatchKey::CUDA,
DispatchKey::XLA,
DispatchKey::Lazy,
DispatchKey::XPU,
DispatchKey::PrivateUse1,
DispatchKey::PrivateUse2,
@ -57,6 +58,8 @@ DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t) {
return DispatchKeySet(DispatchKey::CUDA);
case DispatchKey::AutogradXLA:
return DispatchKeySet(DispatchKey::XLA);
case DispatchKey::AutogradLazy:
return DispatchKeySet(DispatchKey::Lazy);
case DispatchKey::AutogradMLC:
return DispatchKeySet(DispatchKey::MLC);
case DispatchKey::AutogradHPU:

View File

@ -212,6 +212,7 @@ constexpr DispatchKeySet autograd_dispatch_keyset = DispatchKeySet({
DispatchKey::AutogradCPU,
DispatchKey::AutogradCUDA,
DispatchKey::AutogradXLA,
DispatchKey::AutogradLazy,
DispatchKey::AutogradNestedTensor,
DispatchKey::AutogradMLC,
DispatchKey::AutogradHPU,

View File

@ -667,6 +667,8 @@ inline DispatchKey computeDispatchKey(
return DispatchKey::MSNPU;
case DeviceType::XLA:
return DispatchKey::XLA;
case DeviceType::Lazy:
return DispatchKey::Lazy;
case DeviceType::MLC:
return DispatchKey::MLC;
case DeviceType::Vulkan:
@ -768,6 +770,9 @@ inline DeviceType dispatchKeyToDeviceType(DispatchKey dispatch_key) {
case DispatchKey::XLA:
case DispatchKey::AutogradXLA:
return DeviceType::XLA;
case DispatchKey::Lazy:
case DispatchKey::AutogradLazy:
return DeviceType::Lazy;
case DispatchKey::Vulkan:
return DeviceType::Vulkan;
case DispatchKey::Meta:
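
A hedged round-trip sketch for the two helpers touched here (assuming their existing signatures in TensorOptions.h):

#include <c10/core/TensorOptions.h>
#include <c10/util/Exception.h>

void lazy_dispatch_roundtrip_sketch() {
  // DeviceType::Lazy now selects DispatchKey::Lazy (dtype/layout left at defaults)...
  c10::DispatchKey key = c10::computeDispatchKey(
      c10::nullopt, c10::nullopt, c10::Device(c10::DeviceType::Lazy));
  TORCH_CHECK(key == c10::DispatchKey::Lazy);
  // ...and both Lazy and AutogradLazy map back to DeviceType::Lazy.
  TORCH_CHECK(c10::dispatchKeyToDeviceType(c10::DispatchKey::AutogradLazy) ==
              c10::DeviceType::Lazy);
}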

View File

@ -789,7 +789,7 @@ CPU: registered at {}:5 :: () -> () [ boxed unboxed ]
class TestPythonDispatcher(TestCase):
def test_basic(self):
dispatcher = PythonDispatcher()
dispatcher.register(["CPU", "XLA", "CompositeImplicitAutograd"])
dispatcher.register(["CPU", "XLA", "Lazy", "CompositeImplicitAutograd"])
self.assertExpectedInline(
dispatcher.dispatchTable(),
'''\
@ -799,16 +799,18 @@ key kernel
---------------------------
CPU fn_CPU [kernel]
XLA fn_XLA [kernel]
Lazy fn_Lazy [kernel]
QuantizedCPU fn_CompositeImplicitAutograd [math kernel]
AutogradOther fn_CompositeImplicitAutograd [math kernel]
AutogradCPU fallthrough [backend fallback]
AutogradXLA fallthrough [backend fallback]
AutogradLazy fallthrough [backend fallback]
'''
)
def test_math_autogradcpu(self):
dispatcher = PythonDispatcher()
dispatcher.register(["CPU", "XLA", "CompositeImplicitAutograd", "AutogradCPU"])
dispatcher.register(["CPU", "XLA", "Lazy", "CompositeImplicitAutograd", "AutogradCPU"])
self.assertExpectedInline(
dispatcher.dispatchTable(),
'''\
@ -818,10 +820,12 @@ key kernel
---------------------------
CPU fn_CPU [kernel]
XLA fn_XLA [kernel]
Lazy fn_Lazy [kernel]
QuantizedCPU fn_CompositeImplicitAutograd [math kernel]
AutogradOther fn_CompositeImplicitAutograd [math kernel]
AutogradCPU fn_AutogradCPU [kernel]
AutogradXLA fallthrough [backend fallback]
AutogradLazy fallthrough [backend fallback]
'''
)
self.assertExpectedInline(
@ -833,6 +837,7 @@ key kernel
---------------------------
CPU fn_CPU
XLA fn_XLA
Lazy fn_Lazy
AutogradCPU fn_AutogradCPU
CompositeImplicitAutograd[alias] fn_CompositeImplicitAutograd
'''
@ -840,7 +845,7 @@ CompositeImplicitAutograd[alias] fn_CompositeImplicitAutograd
def test_defaultbackend_autogradcpu(self):
dispatcher = PythonDispatcher()
dispatcher.register(["CPU", "XLA", "CompositeExplicitAutograd", "AutogradCPU"])
dispatcher.register(["CPU", "XLA", "Lazy", "CompositeExplicitAutograd", "AutogradCPU"])
self.assertExpectedInline(
dispatcher.dispatchTable(),
'''\
@ -850,10 +855,12 @@ key kernel
---------------------------
CPU fn_CPU [kernel]
XLA fn_XLA [kernel]
Lazy fn_Lazy [kernel]
QuantizedCPU fn_CompositeExplicitAutograd [default backend kernel]
AutogradOther fallthrough [backend fallback]
AutogradCPU fn_AutogradCPU [kernel]
AutogradXLA fallthrough [backend fallback]
AutogradLazy fallthrough [backend fallback]
'''
)
@ -866,6 +873,7 @@ key kernel
---------------------------
CPU fn_CPU
XLA fn_XLA
Lazy fn_Lazy
AutogradCPU fn_AutogradCPU
CompositeExplicitAutograd[alias] fn_CompositeExplicitAutograd
'''
@ -883,10 +891,12 @@ key kernel
---------------------------
CPU fn_CPU [kernel]
XLA fn_CompositeImplicitAutograd [math kernel]
Lazy fn_CompositeImplicitAutograd [math kernel]
QuantizedCPU fn_QuantizedCPU [kernel]
AutogradOther ambiguous_autogradother [ambiguous autogradother]
AutogradCPU fallthrough [backend fallback]
AutogradXLA fn_CompositeImplicitAutograd [math kernel]
AutogradLazy fn_CompositeImplicitAutograd [math kernel]
'''
)

View File

@ -58,6 +58,7 @@ class DispatchKey(Enum):
FPGA = auto()
MSNPU = auto()
XLA = auto()
Lazy = auto()
Vulkan = auto()
Metal = auto()
XPU = auto()
@ -89,6 +90,7 @@ class DispatchKey(Enum):
AutogradCPU = auto()
AutogradCUDA = auto()
AutogradXLA = auto()
AutogradLazy = auto()
AutogradNestedTensor = auto()
AutogradXPU = auto()
AutogradPrivateUse1 = auto()

View File

@ -55,6 +55,7 @@ class PythonDispatcher:
"CPU", "AutogradCPU",
"QuantizedCPU", "AutogradOther",
"XLA", "AutogradXLA",
"Lazy", "AutogradLazy",
]
alias_keys = [
"CompositeExplicitAutograd",

View File

@ -109,6 +109,7 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) {
.value("FPGA", c10::DeviceType::FPGA)
.value("MSNPU", c10::DeviceType::MSNPU)
.value("XLA", c10::DeviceType::XLA)
.value("Lazy", c10::DeviceType::Lazy)
.value("MLC", c10::DeviceType::MLC)
.value("HPU", c10::DeviceType::HPU)
.value("Meta", c10::DeviceType::Meta)

View File

@ -32,6 +32,7 @@ c10::optional<c10::DispatchKey> parseDispatchKey(const std::string& k) {
{"CPU", c10::DispatchKey::CPU},
{"CUDA", c10::DispatchKey::CUDA},
{"XLA", c10::DispatchKey::XLA},
{"Lazy", c10::DispatchKey::Lazy},
{"QuantizedCPU", c10::DispatchKey::QuantizedCPU},
{"CompositeImplicitAutograd", c10::DispatchKey::CompositeImplicitAutograd},
{"Autograd", c10::DispatchKey::Autograd},

View File

@ -319,6 +319,7 @@ void check_base_legacy_new(c10::DispatchKey dispatch_key, at::Layout expected_la
dispatch_key == c10::DispatchKey::CUDA ||
dispatch_key == c10::DispatchKey::HIP ||
dispatch_key == c10::DispatchKey::XLA ||
dispatch_key == c10::DispatchKey::Lazy ||
dispatch_key == c10::DispatchKey::XPU,
"new(): expected DispatchKey: ",
c10::DispatchKey::CPU,
@ -329,11 +330,13 @@ void check_base_legacy_new(c10::DispatchKey dispatch_key, at::Layout expected_la
" or ",
c10::DispatchKey::XLA,
" or ",
c10::DispatchKey::Lazy,
" or ",
c10::DispatchKey::XPU,
" but got: ",
dispatch_key);
} else if(expected_layout == c10::kSparse) {
// NOTE: no sparse XLA
// NOTE: no sparse XLA or Lazy
TORCH_CHECK(
dispatch_key == c10::DispatchKey::SparseCPU ||
dispatch_key == c10::DispatchKey::SparseCUDA ||

View File

@ -292,6 +292,8 @@ inline CppFunction dispatch(c10::DeviceType type, Func&& raw_f) {
return c10::DispatchKey::CUDA;
case c10::DeviceType::XLA:
return c10::DispatchKey::XLA;
case c10::DeviceType::Lazy:
return c10::DispatchKey::Lazy;
case c10::DeviceType::MLC:
return c10::DispatchKey::MLC;
case c10::DeviceType::Meta:
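
Tying it together: the DeviceType-based impl overload used in the tests above goes through this dispatch() helper, which now routes DeviceType::Lazy to DispatchKey::Lazy. A hedged sketch in a hypothetical namespace (myns and the identity op are illustrative, not part of the PR):

#include <torch/library.h>
#include <ATen/ATen.h>

TORCH_LIBRARY(myns, m) {
  m.def("identity(Tensor self) -> Tensor");
  // Registers under DispatchKey::Lazy via the DeviceType overload shown in this hunk.
  m.impl("identity", c10::DeviceType::Lazy,
         [](const at::Tensor& self) { return self; });
}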