Mirror of https://github.com/pytorch/pytorch.git
Add device and key for lazy tensors (#61621)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61621

Test Plan: CI

Reviewed By: mruberry

Differential Revision: D29912934

Pulled By: asuhan

fbshipit-source-id: 493c32063a3e756d93cbf1d876563a35eaafb537
Committed by: Facebook GitHub Bot
Parent: 2945a73d90
Commit: b176feec1e
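Before the diff hunks, a brief orientation sketch of what the new device and dispatch key enable; the helper function below is hypothetical and the include paths are assumed from the usual c10 layout, but every symbol it uses is added or extended by this commit.

    #include <c10/core/Device.h>
    #include <c10/core/TensorOptions.h>
    #include <c10/util/Exception.h>

    // Hypothetical smoke check: "lazy" now parses as a device string, and a lazy
    // device resolves to the new backend dispatch key.
    void lazy_device_sketch() {
      c10::Device dev("lazy");  // parse_type() below now accepts "lazy"
      TORCH_INTERNAL_ASSERT(dev.type() == c10::DeviceType::Lazy);
      TORCH_INTERNAL_ASSERT(
          c10::computeDispatchKey(c10::nullopt, c10::nullopt, dev) ==
          c10::DispatchKey::Lazy);
    }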
@@ -73,6 +73,9 @@ class TORCH_API Context {
   static bool hasXLA() {
     return c10::impl::hasDeviceGuardImpl(at::DeviceType::XLA);
   }
+  static bool hasLazy() {
+    return c10::impl::hasDeviceGuardImpl(at::DeviceType::Lazy);
+  }
   static bool hasMLC() {
     return c10::impl::hasDeviceGuardImpl(at::DeviceType::MLC);
   }
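The new hasLazy() mirrors hasXLA() and hasMLC(): it only reports whether a device guard has been registered for DeviceType::Lazy. A hedged usage sketch (the wrapper function is hypothetical):

    #include <ATen/Context.h>

    // Hypothetical call site: only take the lazy-tensor path if some backend
    // has registered a device guard for DeviceType::Lazy, which is exactly
    // what hasLazy() checks.
    bool lazy_backend_available() {
      return at::globalContext().hasLazy();
    }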
@@ -1343,9 +1343,9 @@ void TensorIteratorBase::build(TensorIteratorConfig& config) {

   if (is_meta_) return;

-  // XLA tensors don't have storage, so they don't have an underlying data pointer.
+  // XLA and lazy tensors don't have storage, so they don't have an underlying data pointer.
   // Nothing beyond this point is important for meta functions, so it's fine to exit early here.
-  if (common_device_.type() == DeviceType::XLA) return;
+  if (common_device_.type() == DeviceType::XLA || common_device_.type() == DeviceType::Lazy) return;

   for (auto& op : operands_) {
     TORCH_INTERNAL_ASSERT(op.tensor->defined());
@@ -52,6 +52,10 @@ TORCH_LIBRARY_IMPL(_, AutogradXLA, m) {
   m.fallback(torch::CppFunction::makeFallthrough());
 }

+TORCH_LIBRARY_IMPL(_, AutogradLazy, m) {
+  m.fallback(torch::CppFunction::makeFallthrough());
+}
+
 TORCH_LIBRARY_IMPL(_, AutogradMLC, m) {
   m.fallback(torch::CppFunction::makeFallthrough());
 }
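With the AutogradLazy fallthrough installed, an out-of-tree lazy backend can target the Lazy key the same way XLA does. A minimal, hypothetical registration sketch (namespace, operator and kernel body are placeholders, not part of this commit):

    #include <torch/library.h>

    // Placeholder kernel: a real lazy backend would record the call into its
    // IR instead of returning the input eagerly.
    at::Tensor my_lazy_relu(const at::Tensor& self) {
      return self;
    }

    TORCH_LIBRARY(lazy_demo, m) {
      m.def("my_relu(Tensor self) -> Tensor");
    }

    // The new Lazy key can be targeted directly, just like XLA.
    TORCH_LIBRARY_IMPL(lazy_demo, Lazy, m) {
      m.impl("my_relu", my_lazy_relu);
    }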
@@ -257,6 +257,11 @@ TEST(OperatorRegistrationTest, whenRegisteringCPUTensorType_thenCanOnlyCallUnbox
   " backend.");
 }

+std::string expectedMessageForBackend(DispatchKey key) {
+  std::string key_str(c10::toString(key));
+  return "Could not run '_test::dummy' with arguments from the '" + key_str + "' backend";
+}
+
 TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsInSameOpCallAndCalling_thenCallsCorrectKernel) {
   bool called_kernel1 = false;
   bool called_kernel2 = false;

@@ -277,18 +282,20 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsInSameOpCallAndCall
   EXPECT_FALSE(called_kernel1);
   EXPECT_TRUE(called_kernel2);

-  expectThrows<c10::Error>([&] {
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA));
-  }, "Could not run '_test::dummy' with arguments from the 'XLA'"
-  " backend.");
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
+    std::string expectMessage = expectedMessageForBackend(key);
+    expectThrows<c10::Error>([&] {
+      callOp(*op, dummyTensor(key));
+    }, expectMessage.c_str());

-  // also assert that the error message contains the available tensor type ids, but don't assert their order
-  expectThrows<c10::Error>([&] {
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA));
-  }, "CPU");
-  expectThrows<c10::Error>([&] {
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA));
-  }, "CUDA");
+    // also assert that the error message contains the available tensor type ids, but don't assert their order
+    expectThrows<c10::Error>([&] {
+      callOp(*op, dummyTensor(key));
+    }, "CPU");
+    expectThrows<c10::Error>([&] {
+      callOp(*op, dummyTensor(key));
+    }, "CUDA");
+  }
 }

 bool called_stackbased_kernel = false;

@@ -302,7 +309,8 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsByNameAndNoneCanInf
     auto registrar1 = c10::RegisterOperators().op("_test::dummy", c10::RegisterOperators::options()
       .kernel<&stackBasedKernel>(c10::DispatchKey::CPU)
       .kernel<&stackBasedKernel>(c10::DispatchKey::CUDA)
-      .kernel<&stackBasedKernel>(c10::DispatchKey::XLA));
+      .kernel<&stackBasedKernel>(c10::DispatchKey::XLA)
+      .kernel<&stackBasedKernel>(c10::DispatchKey::Lazy));
   }, "Cannot infer operator schema for this kind of kernel in registration of operator _test::dummy");
 }

@@ -311,7 +319,8 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsBySchemaAndNoneCanI
   auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options()
     .kernel<&stackBasedKernel>(c10::DispatchKey::CPU)
     .kernel<&stackBasedKernel>(c10::DispatchKey::CUDA)
-    .kernel<&stackBasedKernel>(c10::DispatchKey::XLA));
+    .kernel<&stackBasedKernel>(c10::DispatchKey::XLA)
+    .kernel<&stackBasedKernel>(c10::DispatchKey::Lazy));

   auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""});
   ASSERT_TRUE(op.has_value()); // assert schema is registered

@@ -326,10 +335,12 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsBySchemaAndNoneCanI
   EXPECT_TRUE(called_stackbased_kernel);
   EXPECT_FALSE(called_kernel);

-  called_kernel = called_stackbased_kernel = false;
-  callOp(*op, dummyTensor(c10::DispatchKey::XLA));
-  EXPECT_TRUE(called_stackbased_kernel);
-  EXPECT_FALSE(called_kernel);
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
+    called_kernel = called_stackbased_kernel = false;
+    callOp(*op, dummyTensor(key));
+    EXPECT_TRUE(called_stackbased_kernel);
+    EXPECT_FALSE(called_kernel);
+  }
 }

 TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsByNameAndOnlyOneCanInferSchema_thenSucceeds) {

@@ -337,7 +348,8 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsByNameAndOnlyOneCan
   auto registrar1 = c10::RegisterOperators().op("_test::dummy", c10::RegisterOperators::options()
     .kernel<&stackBasedKernel>(c10::DispatchKey::CPU)
     .kernel<MockKernel>(c10::DispatchKey::CUDA, &called_kernel)
-    .kernel<&stackBasedKernel>(c10::DispatchKey::XLA));
+    .kernel<&stackBasedKernel>(c10::DispatchKey::XLA)
+    .kernel<&stackBasedKernel>(c10::DispatchKey::Lazy));

   auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""});
   ASSERT_TRUE(op.has_value()); // assert schema is registered

@@ -352,10 +364,12 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsByNameAndOnlyOneCan
   EXPECT_FALSE(called_stackbased_kernel);
   EXPECT_TRUE(called_kernel);

-  called_kernel = called_stackbased_kernel = false;
-  callOp(*op, dummyTensor(c10::DispatchKey::XLA));
-  EXPECT_TRUE(called_stackbased_kernel);
-  EXPECT_FALSE(called_kernel);
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
+    called_kernel = called_stackbased_kernel = false;
+    callOp(*op, dummyTensor(key));
+    EXPECT_TRUE(called_stackbased_kernel);
+    EXPECT_FALSE(called_kernel);
+  }
 }

 TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsBySchemaAndOnlyOneCanInferSchema_thenSucceeds) {

@@ -363,7 +377,8 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsBySchemaAndOnlyOneC
   auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options()
     .kernel<&stackBasedKernel>(c10::DispatchKey::CPU)
     .kernel<MockKernel>(c10::DispatchKey::CUDA, &called_kernel)
-    .kernel<&stackBasedKernel>(c10::DispatchKey::XLA));
+    .kernel<&stackBasedKernel>(c10::DispatchKey::XLA)
+    .kernel<&stackBasedKernel>(c10::DispatchKey::Lazy));

   auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""});
   ASSERT_TRUE(op.has_value()); // assert schema is registered

@@ -378,10 +393,12 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsBySchemaAndOnlyOneC
   EXPECT_FALSE(called_stackbased_kernel);
   EXPECT_TRUE(called_kernel);

-  called_kernel = called_stackbased_kernel = false;
-  callOp(*op, dummyTensor(c10::DispatchKey::XLA));
-  EXPECT_TRUE(called_stackbased_kernel);
-  EXPECT_FALSE(called_kernel);
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
+    called_kernel = called_stackbased_kernel = false;
+    callOp(*op, dummyTensor(key));
+    EXPECT_TRUE(called_stackbased_kernel);
+    EXPECT_FALSE(called_kernel);
+  }
 }

 struct DummyKernelWithIntParam final : OperatorKernel {
@@ -570,21 +587,21 @@ TEST(OperatorRegistrationTest, AutogradBackendOverridesAutogradKernel) {
   EXPECT_FALSE(called_nonautograd);
 }

-TEST(OperatorRegistrationTest, AutogradXLAOverridesAutogradKernel) {
+void LazyBackendsAutogradOverridesAutogradKernel(DispatchKey key) {
   auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options()
-    .kernel<decltype(nonautograd_kernel), &nonautograd_kernel>(DispatchKey::AutogradXLA)
+    .kernel<decltype(nonautograd_kernel), &nonautograd_kernel>(c10::getAutogradKeyFromBackend(key))
     .kernel<decltype(autograd_kernel), &autograd_kernel>(DispatchKey::Autograd));

   auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""});
   ASSERT_TRUE(op.has_value());

+  std::string expectedMessage = expectedMessageForBackend(key);
   expectThrows<c10::Error>([&] {
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA));
-  }, "Could not run '_test::dummy' with arguments from the 'XLA'"
-  " backend.");
+    callOp(*op, dummyTensor(key));
+  }, expectedMessage.c_str());

   called_nonautograd = called_autograd = false;
-  op->typed<void (Tensor)>().call(dummyTensor(DispatchKey::XLA, /*requires_grad=*/true));
+  op->typed<void (Tensor)>().call(dummyTensor(key, /*requires_grad=*/true));
   EXPECT_TRUE(called_nonautograd);
   EXPECT_FALSE(called_autograd);

@@ -594,7 +611,15 @@ TEST(OperatorRegistrationTest, AutogradXLAOverridesAutogradKernel) {
   EXPECT_FALSE(called_nonautograd);
 }

-TEST(OperatorRegistrationTest, whenRegisterWithXLAKernelAndCatchAll_AutogradXLAIsNotFilled) {
+TEST(OperatorRegistrationTest, AutogradXLAOverridesAutogradKernel) {
+  LazyBackendsAutogradOverridesAutogradKernel(DispatchKey::XLA);
+}
+
+TEST(OperatorRegistrationTest, AutogradLazyOverridesAutogradKernel) {
+  LazyBackendsAutogradOverridesAutogradKernel(DispatchKey::Lazy);
+}
+
+void whenRegisterWithLazyBackendsAndCatchAll_AutogradLazyBackendsIsNotFilled(DispatchKey key) {
   {
     auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options()
       .catchAllKernel<decltype(nonautograd_kernel), nonautograd_kernel>());

@@ -603,38 +628,46 @@ TEST(OperatorRegistrationTest, whenRegisterWithXLAKernelAndCatchAll_AutogradXLAI
     ASSERT_TRUE(op.has_value());

     called_nonautograd = called_autograd = false;
-    op->typed<void (Tensor)>().call(dummyTensor(DispatchKey::XLA, /*requires_grad=*/true));
+    op->typed<void (Tensor)>().call(dummyTensor(key, /*requires_grad=*/true));
     EXPECT_TRUE(called_nonautograd);
     EXPECT_FALSE(called_autograd);

     called_nonautograd = called_autograd = false;
-    op->typed<void (Tensor)>().call(dummyTensor(DispatchKey::XLA));
+    op->typed<void (Tensor)>().call(dummyTensor(key));
     EXPECT_FALSE(called_autograd);
     EXPECT_TRUE(called_nonautograd);
   }
   {
     auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options()
-      .kernel<decltype(autograd_kernel), &autograd_kernel>(DispatchKey::XLA)
+      .kernel<decltype(autograd_kernel), &autograd_kernel>(key)
       .catchAllKernel<decltype(nonautograd_kernel), nonautograd_kernel>());

     auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""});
     ASSERT_TRUE(op.has_value());

-    // When there's direct registration to XLA backend, AutogradXLA doesn't pick up catchAll
+    // When there's direct registration to XLA / Lazy backend, Autograd{XLA, Lazy} doesn't pick up catchAll
     // kernel in precompute but just keep fallthrough kernel from backend fallback.
-    // Thus it falls through AutogradXLA and reaches the kernel at XLA key.
+    // Thus it falls through Autograd{XLA, Lazy} and reaches the kernel at XLA / Lazy key.
     called_nonautograd = called_autograd = false;
-    op->typed<void (Tensor)>().call(dummyTensor(DispatchKey::XLA, /*requires_grad=*/true));
+    op->typed<void (Tensor)>().call(dummyTensor(key, /*requires_grad=*/true));
     EXPECT_FALSE(called_nonautograd);
     EXPECT_TRUE(called_autograd);

     called_nonautograd = called_autograd = false;
-    op->typed<void (Tensor)>().call(dummyTensor(DispatchKey::XLA));
+    op->typed<void (Tensor)>().call(dummyTensor(key));
     EXPECT_TRUE(called_autograd);
     EXPECT_FALSE(called_nonautograd);
   }
 }

+TEST(OperatorRegistrationTest, whenRegisterWithXLAKernelAndCatchAll_AutogradXLAIsNotFilled) {
+  whenRegisterWithLazyBackendsAndCatchAll_AutogradLazyBackendsIsNotFilled(DispatchKey::XLA);
+}
+
+TEST(OperatorRegistrationTest, whenRegisterWithLazyKernelAndCatchAll_AutogradLazyIsNotFilled) {
+  whenRegisterWithLazyBackendsAndCatchAll_AutogradLazyBackendsIsNotFilled(DispatchKey::Lazy);
+}
+
 TEST(OperatorRegistrationTest, givenLambdaKernel_whenRegisteringWithMismatchingCppSignatures_thenFails) {
   expectThrows<c10::Error>([] {
     auto registrar = c10::RegisterOperators().op("_test::dummy", c10::RegisterOperators::options()
@@ -1217,9 +1250,11 @@ TEST(NewOperatorRegistrationTest, testBasics) {
   m.def("dummy4", [](const Tensor& self, const Tensor& other) { return other; });
   m.impl("dummy", c10::DeviceType::CPU, [](const Tensor& self) { return self; });
   m.impl("dummy", c10::DeviceType::XLA, [](const Tensor& self) { return self; });
+  m.impl("dummy", c10::DeviceType::Lazy, [](const Tensor& self) { return self; });
   // Internal API
   m.impl("dummy2", c10::DispatchKey::CPU, [](const Tensor& self) { return self; });
   m.impl("dummy2", c10::DispatchKey::XLA, [](const Tensor& self) { return self; });
+  m.impl("dummy2", c10::DispatchKey::Lazy, [](const Tensor& self) { return self; });

   ASSERT_TRUE(Dispatcher::singleton().findSchema({"_test::dummy", ""}).has_value());
   // Should have a schema even if there are no impls

@@ -1345,15 +1380,15 @@ TEST(NewOperatorRegistrationTest, dispatchWithCompositeImplicitAutogradKernel) {
     ASSERT_TRUE(math_called);
   }

-  {
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
     math_called = false;
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA));
+    callOp(*op, dummyTensor(key));
     ASSERT_TRUE(math_called);
   }

-  {
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
     math_called = false;
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA, /*requires_grad=*/true));
+    callOp(*op, dummyTensor(key, /*requires_grad=*/true));
     ASSERT_TRUE(math_called);
   }

@@ -1523,16 +1558,16 @@ TEST(NewOperatorRegistrationTest, dispatchWithCompositeExplicitAutogradKernel) {
     ASSERT_TRUE(called);
   }

-  {
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
     called = false;
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA));
+    callOp(*op, dummyTensor(key));
     ASSERT_TRUE(called);
   }

-  {
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
     called = false;
-    // AutogradXLA is fallthrough, calls XLA kernel
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA, /*requires_grad=*/true));
+    // Autograd{XLA, Lazy} is fallthrough, calls XLA / Lazy kernel
+    callOp(*op, dummyTensor(key, /*requires_grad=*/true));
     ASSERT_TRUE(called);
   }

@@ -1575,17 +1610,17 @@ TEST(NewOperatorRegistrationTest, dispatchWithCompositeExplicitAutogradAndCompos
     ASSERT_TRUE(backend_called);
   }

-  {
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
     backend_called = math_called = false;
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA));
+    callOp(*op, dummyTensor(key));
     ASSERT_TRUE(backend_called);
     ASSERT_FALSE(math_called);
   }

-  {
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
     backend_called = math_called = false;
-    // AutogradXLA is fallthrough, calls XLA kernel
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA, /*requires_grad=*/true));
+    // Autograd{XLA, Lazy} is fallthrough, calls XLA / Lazy kernel
+    callOp(*op, dummyTensor(key, /*requires_grad=*/true));
     ASSERT_FALSE(math_called);
     ASSERT_TRUE(backend_called);
   }

@@ -1681,11 +1716,11 @@ TEST(NewOperatorRegistrationTest, dispatch) {
     ASSERT_TRUE(autograd_called);
   }

-  {
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
     autograd_called = false;
     auto op = Dispatcher::singleton().findSchema({"test::fn_autograd", ""});
     ASSERT_TRUE(op.has_value());
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA, /*requires_grad=*/true));
+    callOp(*op, dummyTensor(key, /*requires_grad=*/true));
     ASSERT_TRUE(autograd_called);
   }
 }
@@ -26,7 +26,7 @@ list(APPEND ATen_CPU_TEST_SRCS
   ${CMAKE_CURRENT_SOURCE_DIR}/quantized_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/extension_backend_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/operators_test.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/xla_tensor_test.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/lazy_tensor_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/tensor_iterator_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/math_kernel_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/memory_overlapping_test.cpp
aten/src/ATen/test/lazy_tensor_test.cpp (new file, 23 lines)

@@ -0,0 +1,23 @@
+#include <gtest/gtest.h>
+
+#include <ATen/ATen.h>
+
+void LazyTensorTest(c10::DispatchKey dispatch_key, at::DeviceType device_type) {
+  auto tensor_impl =
+      c10::make_intrusive<c10::TensorImpl, c10::UndefinedTensorImpl>(
+          dispatch_key,
+          caffe2::TypeMeta::Make<float>(),
+          at::Device(device_type, 0));
+  at::Tensor t(std::move(tensor_impl));
+  ASSERT_TRUE(t.device() == at::Device(device_type, 0));
+}
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+TEST(XlaTensorTest, TestNoStorage) {
+  LazyTensorTest(at::DispatchKey::XLA, at::DeviceType::XLA);
+}
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+TEST(LazyTensorTest, TestNoStorage) {
+  LazyTensorTest(at::DispatchKey::Lazy, at::DeviceType::Lazy);
+}
@@ -18,7 +18,7 @@ VALGRIND=${VALGRIND:=ON}
 ./tensor_interop_test
 ./undefined_tensor_test
 ./extension_backend_test
-./xla_tensor_test
+./lazy_tensor_test
 ./tensor_iterator_test
 ./Dimname_test
 ./Dict_test
@@ -51,6 +51,7 @@ enum class Backend {
   MkldnnCPU,
   MLC,
   HPU,
+  Lazy,
   NumOptions
 };

@@ -69,6 +70,8 @@ static inline Backend dispatchKeyToBackend(DispatchKey t) {
     return Backend::MSNPU;
   } else if (t == DispatchKey::XLA || t == DispatchKey::AutogradXLA) {
     return Backend::XLA;
+  } else if (t == DispatchKey::Lazy || t == DispatchKey::AutogradLazy) {
+    return Backend::Lazy;
   } else if (t == DispatchKey::MLC || t == DispatchKey::AutogradMLC) {
     return Backend::MLC;
   } else if (t == DispatchKey::Vulkan) {

@@ -124,6 +127,8 @@ static inline DispatchKey backendToDispatchKey(Backend b) {
       return DispatchKey::MSNPU;
     case Backend::XLA:
       return DispatchKey::XLA;
+    case Backend::Lazy:
+      return DispatchKey::Lazy;
     case Backend::XPU:
       return DispatchKey::XPU;
     case Backend::SparseXPU:

@@ -177,6 +182,8 @@ static inline DeviceType backendToDeviceType(Backend b) {
      return DeviceType::MSNPU;
     case Backend::XLA:
       return DeviceType::XLA;
+    case Backend::Lazy:
+      return DeviceType::Lazy;
     case Backend::SparseCPU:
       return DeviceType::CPU;
     case Backend::SparseCUDA:

@@ -232,6 +239,8 @@ static inline const char* toString(Backend b) {
       return "MSNPU";
     case Backend::XLA:
       return "XLA";
+    case Backend::Lazy:
+      return "Lazy";
     case Backend::MLC:
       return "MLC";
     case Backend::SparseCPU:
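Taken together, the Backend helpers above keep the usual conversions consistent for the new enumerator. An illustrative check, assuming the conventional c10 include paths (the function itself is hypothetical):

    #include <c10/core/Backend.h>
    #include <c10/util/Exception.h>
    #include <cstring>

    // Round trip through the helpers extended above.
    void lazy_backend_sketch() {
      c10::Backend b = c10::dispatchKeyToBackend(c10::DispatchKey::AutogradLazy);  // Backend::Lazy
      TORCH_INTERNAL_ASSERT(c10::backendToDispatchKey(b) == c10::DispatchKey::Lazy);
      TORCH_INTERNAL_ASSERT(c10::backendToDeviceType(b) == c10::DeviceType::Lazy);
      TORCH_INTERNAL_ASSERT(std::strcmp(c10::toString(b), "Lazy") == 0);
    }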
@@ -45,6 +45,7 @@ DeviceType parse_type(const std::string& device_string) {
       {"fpga", DeviceType::FPGA},
       {"msnpu", DeviceType::MSNPU},
       {"xla", DeviceType::XLA},
+      {"lazy", DeviceType::Lazy},
       {"vulkan", DeviceType::Vulkan},
       {"mlc", DeviceType::MLC},
       {"meta", DeviceType::Meta},

@@ -61,7 +62,7 @@ DeviceType parse_type(const std::string& device_string) {
   }
   TORCH_CHECK(
       false,
-      "Expected one of cpu, cuda, xpu, mkldnn, opengl, opencl, ideep, hip, ve, msnpu, mlc, xla, vulkan, meta, hpu device type at start of device string: ",
+      "Expected one of cpu, cuda, xpu, mkldnn, opengl, opencl, ideep, hip, ve, msnpu, mlc, xla, lazy, vulkan, meta, hpu device type at start of device string: ",
       device_string);
 }
 } // namespace
@@ -103,7 +103,7 @@ struct C10_API Device final {

   /// Return true if the device supports arbirtary strides.
   bool supports_as_strided() const noexcept {
-    return type_ != DeviceType::XLA;
+    return type_ != DeviceType::XLA && type_ != DeviceType::Lazy;
   }

   /// Same string as returned from operator<<.
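Because supports_as_strided() now returns false for lazy devices as well, callers that consult it should treat Lazy like XLA and avoid arbitrary-stride views. A small hypothetical helper as a sketch of such a call site:

    #include <c10/core/Device.h>

    // False for CPU/CUDA, true for DeviceType::XLA and DeviceType::Lazy,
    // which cannot honor arbitrary strides and need a copying fallback.
    bool needs_contiguous_fallback(const c10::Device& device) {
      return !device.supports_as_strided();
    }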
@@ -29,6 +29,8 @@ std::string DeviceTypeName(DeviceType d, bool lower_case) {
       return lower_case ? "msnpu" : "MSNPU";
     case DeviceType::XLA:
       return lower_case ? "xla" : "XLA";
+    case DeviceType::Lazy:
+      return lower_case ? "lazy" : "LAZY";
     case DeviceType::MLC:
       return lower_case ? "mlc" : "MLC";
     case DeviceType::Vulkan:

@@ -75,6 +77,7 @@ bool isValidDeviceType(DeviceType d) {
     case DeviceType::FPGA:
     case DeviceType::MSNPU:
     case DeviceType::XLA:
+    case DeviceType::Lazy:
     case DeviceType::MLC:
     case DeviceType::Vulkan:
     case DeviceType::Metal:
@@ -30,11 +30,12 @@ enum class DeviceType : int8_t {
   Meta = 14, // Meta (tensors with no data)
   HPU = 15, // HPU / HABANA
   VE = 16, // SX-Aurora / NEC
+  Lazy = 17, // Lazy Tensors
   // NB: If you add more devices:
   //  - Change the implementations of DeviceTypeName and isValidDeviceType
   //    in DeviceType.cpp
   //  - Change the number below
-  COMPILE_TIME_MAX_DEVICE_TYPES = 17,
+  COMPILE_TIME_MAX_DEVICE_TYPES = 18,
 };

 constexpr DeviceType kCPU = DeviceType::CPU;

@@ -50,18 +51,19 @@ constexpr DeviceType kMetal = DeviceType::Metal;
 constexpr DeviceType kXPU = DeviceType::XPU;
 constexpr DeviceType kHPU = DeviceType::HPU;
 constexpr DeviceType kVE = DeviceType::VE;
+constexpr DeviceType kLazy = DeviceType::Lazy;

 // define explicit int constant
 constexpr int COMPILE_TIME_MAX_DEVICE_TYPES =
     static_cast<int>(DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES);

 static_assert(
-    COMPILE_TIME_MAX_DEVICE_TYPES <= 17,
+    COMPILE_TIME_MAX_DEVICE_TYPES <= 18,
     "Hey! You seem to be adding a lot of new DeviceTypes. The intent was "
     "for this constant to reflect the actual number of DeviceTypes we support "
     "in PyTorch; it's important that this number is not too large as we "
     "use this to allocate stack arrays in some places in our code. If you "
-    "are indeed just adding the 17th device type, feel free to change "
+    "are indeed just adding the 18th device type, feel free to change "
     "the check to 32; but if you are adding some sort of extensible device "
     "types registration, please be aware that you are affecting code that "
     "this number is small. Try auditing uses of this constant.");
@@ -23,6 +23,8 @@ const char* toString(DispatchKey t) {
       return "MSNPU";
     case DispatchKey::XLA:
       return "XLA";
+    case DispatchKey::Lazy:
+      return "Lazy";
     case DispatchKey::MLC:
       return "MLC";
     case DispatchKey::HPU:

@@ -91,6 +93,8 @@ const char* toString(DispatchKey t) {
       return "AutogradCUDA";
     case DispatchKey::AutogradXLA:
       return "AutogradXLA";
+    case DispatchKey::AutogradLazy:
+      return "AutogradLazy";
     case DispatchKey::AutogradMLC:
       return "AutogradMLC";
     case DispatchKey::AutogradHPU:

@@ -179,6 +183,8 @@ DispatchKey getAutogradKeyFromBackend(DispatchKey t) {
       return DispatchKey::AutogradCUDA;
     case DispatchKey::XLA:
       return DispatchKey::AutogradXLA;
+    case DispatchKey::Lazy:
+      return DispatchKey::AutogradLazy;
     case DispatchKey::MLC:
       return DispatchKey::AutogradMLC;
     case DispatchKey::HPU:
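A short illustrative check of the two mappings extended above (the printed name and the autograd counterpart); the wrapper function is hypothetical:

    #include <c10/core/DispatchKey.h>
    #include <c10/util/Exception.h>
    #include <cstring>

    void lazy_dispatch_key_sketch() {
      // The new keys print as "Lazy" / "AutogradLazy" ...
      TORCH_INTERNAL_ASSERT(std::strcmp(c10::toString(c10::DispatchKey::Lazy), "Lazy") == 0);
      // ... and the backend key maps to its autograd counterpart.
      TORCH_INTERNAL_ASSERT(
          c10::getAutogradKeyFromBackend(c10::DispatchKey::Lazy) ==
          c10::DispatchKey::AutogradLazy);
    }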
@@ -68,6 +68,7 @@ enum class DispatchKey : uint8_t {
   XPU, // For out of tree Intel's heterogeneous computing plug-in
   HPU, // For out of tree & closed source integration of HPU / Habana
   VE, // For out of tree & closed source integration of SX-Aurora / NEC
+  Lazy, // For lazy tensor backends

   // A meta tensor is a tensor without any data associated with it. (They
   // have also colloquially been referred to as tensors on the "null" device).

@@ -229,6 +230,7 @@ enum class DispatchKey : uint8_t {
   AutogradCPU,
   AutogradCUDA,
   AutogradXLA,
+  AutogradLazy,
   AutogradXPU,
   AutogradMLC,
   AutogradHPU,
@@ -12,6 +12,7 @@ constexpr DispatchKeySet backend_dispatch_keyset = autogradother_backends |
     DispatchKey::CPU,
     DispatchKey::CUDA,
     DispatchKey::XLA,
+    DispatchKey::Lazy,
     DispatchKey::XPU,
     DispatchKey::PrivateUse1,
     DispatchKey::PrivateUse2,

@@ -57,6 +58,8 @@ DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t) {
       return DispatchKeySet(DispatchKey::CUDA);
     case DispatchKey::AutogradXLA:
       return DispatchKeySet(DispatchKey::XLA);
+    case DispatchKey::AutogradLazy:
+      return DispatchKeySet(DispatchKey::Lazy);
     case DispatchKey::AutogradMLC:
       return DispatchKeySet(DispatchKey::MLC);
     case DispatchKey::AutogradHPU:
@@ -212,6 +212,7 @@ constexpr DispatchKeySet autograd_dispatch_keyset = DispatchKeySet({
     DispatchKey::AutogradCPU,
     DispatchKey::AutogradCUDA,
     DispatchKey::AutogradXLA,
+    DispatchKey::AutogradLazy,
     DispatchKey::AutogradNestedTensor,
     DispatchKey::AutogradMLC,
     DispatchKey::AutogradHPU,
@@ -667,6 +667,8 @@ inline DispatchKey computeDispatchKey(
       return DispatchKey::MSNPU;
     case DeviceType::XLA:
       return DispatchKey::XLA;
+    case DeviceType::Lazy:
+      return DispatchKey::Lazy;
     case DeviceType::MLC:
       return DispatchKey::MLC;
     case DeviceType::Vulkan:

@@ -768,6 +770,9 @@ inline DeviceType dispatchKeyToDeviceType(DispatchKey dispatch_key) {
     case DispatchKey::XLA:
     case DispatchKey::AutogradXLA:
       return DeviceType::XLA;
+    case DispatchKey::Lazy:
+    case DispatchKey::AutogradLazy:
+      return DeviceType::Lazy;
     case DispatchKey::Vulkan:
       return DeviceType::Vulkan;
     case DispatchKey::Meta:
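The TensorOptions helpers now close the loop in both directions for the new key. An illustrative check, assuming the usual include path (the function is hypothetical):

    #include <c10/core/TensorOptions.h>
    #include <c10/util/Exception.h>

    // Both the backend key and its autograd counterpart resolve to the Lazy device type.
    void lazy_key_to_device_sketch() {
      TORCH_INTERNAL_ASSERT(
          c10::dispatchKeyToDeviceType(c10::DispatchKey::Lazy) == c10::DeviceType::Lazy);
      TORCH_INTERNAL_ASSERT(
          c10::dispatchKeyToDeviceType(c10::DispatchKey::AutogradLazy) == c10::DeviceType::Lazy);
    }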
@@ -789,7 +789,7 @@ CPU: registered at {}:5 :: () -> () [ boxed unboxed ]
 class TestPythonDispatcher(TestCase):
     def test_basic(self):
         dispatcher = PythonDispatcher()
-        dispatcher.register(["CPU", "XLA", "CompositeImplicitAutograd"])
+        dispatcher.register(["CPU", "XLA", "Lazy", "CompositeImplicitAutograd"])
         self.assertExpectedInline(
             dispatcher.dispatchTable(),
             '''\

@@ -799,16 +799,18 @@ key kernel
 ---------------------------
 CPU fn_CPU [kernel]
 XLA fn_XLA [kernel]
+Lazy fn_Lazy [kernel]
 QuantizedCPU fn_CompositeImplicitAutograd [math kernel]
 AutogradOther fn_CompositeImplicitAutograd [math kernel]
 AutogradCPU fallthrough [backend fallback]
 AutogradXLA fallthrough [backend fallback]
+AutogradLazy fallthrough [backend fallback]
 '''
         )

     def test_math_autogradcpu(self):
         dispatcher = PythonDispatcher()
-        dispatcher.register(["CPU", "XLA", "CompositeImplicitAutograd", "AutogradCPU"])
+        dispatcher.register(["CPU", "XLA", "Lazy", "CompositeImplicitAutograd", "AutogradCPU"])
         self.assertExpectedInline(
             dispatcher.dispatchTable(),
             '''\

@@ -818,10 +820,12 @@ key kernel
 ---------------------------
 CPU fn_CPU [kernel]
 XLA fn_XLA [kernel]
+Lazy fn_Lazy [kernel]
 QuantizedCPU fn_CompositeImplicitAutograd [math kernel]
 AutogradOther fn_CompositeImplicitAutograd [math kernel]
 AutogradCPU fn_AutogradCPU [kernel]
 AutogradXLA fallthrough [backend fallback]
+AutogradLazy fallthrough [backend fallback]
 '''
         )
         self.assertExpectedInline(

@@ -833,6 +837,7 @@ key kernel
 ---------------------------
 CPU fn_CPU
 XLA fn_XLA
+Lazy fn_Lazy
 AutogradCPU fn_AutogradCPU
 CompositeImplicitAutograd[alias] fn_CompositeImplicitAutograd
 '''

@@ -840,7 +845,7 @@ CompositeImplicitAutograd[alias] fn_CompositeImplicitAutograd

     def test_defaultbackend_autogradcpu(self):
         dispatcher = PythonDispatcher()
-        dispatcher.register(["CPU", "XLA", "CompositeExplicitAutograd", "AutogradCPU"])
+        dispatcher.register(["CPU", "XLA", "Lazy", "CompositeExplicitAutograd", "AutogradCPU"])
         self.assertExpectedInline(
             dispatcher.dispatchTable(),
             '''\

@@ -850,10 +855,12 @@ key kernel
 ---------------------------
 CPU fn_CPU [kernel]
 XLA fn_XLA [kernel]
+Lazy fn_Lazy [kernel]
 QuantizedCPU fn_CompositeExplicitAutograd [default backend kernel]
 AutogradOther fallthrough [backend fallback]
 AutogradCPU fn_AutogradCPU [kernel]
 AutogradXLA fallthrough [backend fallback]
+AutogradLazy fallthrough [backend fallback]
 '''
         )

@@ -866,6 +873,7 @@ key kernel
 ---------------------------
 CPU fn_CPU
 XLA fn_XLA
+Lazy fn_Lazy
 AutogradCPU fn_AutogradCPU
 CompositeExplicitAutograd[alias] fn_CompositeExplicitAutograd
 '''

@@ -883,10 +891,12 @@ key kernel
 ---------------------------
 CPU fn_CPU [kernel]
 XLA fn_CompositeImplicitAutograd [math kernel]
+Lazy fn_CompositeImplicitAutograd [math kernel]
 QuantizedCPU fn_QuantizedCPU [kernel]
 AutogradOther ambiguous_autogradother [ambiguous autogradother]
 AutogradCPU fallthrough [backend fallback]
 AutogradXLA fn_CompositeImplicitAutograd [math kernel]
+AutogradLazy fn_CompositeImplicitAutograd [math kernel]
 '''
         )

@@ -58,6 +58,7 @@ class DispatchKey(Enum):
     FPGA = auto()
     MSNPU = auto()
     XLA = auto()
+    Lazy = auto()
     Vulkan = auto()
     Metal = auto()
     XPU = auto()

@@ -89,6 +90,7 @@ class DispatchKey(Enum):
     AutogradCPU = auto()
     AutogradCUDA = auto()
     AutogradXLA = auto()
+    AutogradLazy = auto()
     AutogradNestedTensor = auto()
     AutogradXPU = auto()
     AutogradPrivateUse1 = auto()
@@ -55,6 +55,7 @@ class PythonDispatcher:
         "CPU", "AutogradCPU",
         "QuantizedCPU", "AutogradOther",
         "XLA", "AutogradXLA",
+        "Lazy", "AutogradLazy",
    ]
    alias_keys = [
        "CompositeExplicitAutograd",
@@ -109,6 +109,7 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) {
       .value("FPGA", c10::DeviceType::FPGA)
       .value("MSNPU", c10::DeviceType::MSNPU)
       .value("XLA", c10::DeviceType::XLA)
+      .value("Lazy", c10::DeviceType::Lazy)
       .value("MLC", c10::DeviceType::MLC)
       .value("HPU", c10::DeviceType::HPU)
       .value("Meta", c10::DeviceType::Meta)
@@ -32,6 +32,7 @@ c10::optional<c10::DispatchKey> parseDispatchKey(const std::string& k) {
       {"CPU", c10::DispatchKey::CPU},
       {"CUDA", c10::DispatchKey::CUDA},
       {"XLA", c10::DispatchKey::XLA},
+      {"Lazy", c10::DispatchKey::Lazy},
       {"QuantizedCPU", c10::DispatchKey::QuantizedCPU},
       {"CompositeImplicitAutograd", c10::DispatchKey::CompositeImplicitAutograd},
       {"Autograd", c10::DispatchKey::Autograd},
@@ -319,6 +319,7 @@ void check_base_legacy_new(c10::DispatchKey dispatch_key, at::Layout expected_la
         dispatch_key == c10::DispatchKey::CUDA ||
         dispatch_key == c10::DispatchKey::HIP ||
         dispatch_key == c10::DispatchKey::XLA ||
+        dispatch_key == c10::DispatchKey::Lazy ||
         dispatch_key == c10::DispatchKey::XPU,
         "new(): expected DispatchKey: ",
         c10::DispatchKey::CPU,

@@ -329,11 +330,13 @@ void check_base_legacy_new(c10::DispatchKey dispatch_key, at::Layout expected_la
         " or ",
         c10::DispatchKey::XLA,
         " or ",
+        c10::DispatchKey::Lazy,
+        " or ",
         c10::DispatchKey::XPU,
         " but got: ",
         dispatch_key);
   } else if(expected_layout == c10::kSparse) {
-    // NOTE: no sparse XLA
+    // NOTE: no sparse XLA or Lazy
     TORCH_CHECK(
         dispatch_key == c10::DispatchKey::SparseCPU ||
         dispatch_key == c10::DispatchKey::SparseCUDA ||
@@ -292,6 +292,8 @@ inline CppFunction dispatch(c10::DeviceType type, Func&& raw_f) {
       return c10::DispatchKey::CUDA;
     case c10::DeviceType::XLA:
       return c10::DispatchKey::XLA;
+    case c10::DeviceType::Lazy:
+      return c10::DispatchKey::Lazy;
     case c10::DeviceType::MLC:
       return c10::DispatchKey::MLC;
     case c10::DeviceType::Meta:
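Finally, torch::dispatch() now accepts the new device type, so kernels can be pinned to the Lazy key by device. A hypothetical registration sketch (library name, schema and lambda are placeholders):

    #include <torch/library.h>

    // Pin a kernel to the Lazy dispatch key via its device type; kLazy is the
    // new constant added in DeviceType.h above.
    TORCH_LIBRARY(lazy_dispatch_demo, m) {
      m.def("identity(Tensor t) -> Tensor");
      m.impl("identity",
             torch::dispatch(c10::kLazy,
                             [](const at::Tensor& t) { return t; }));
    }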