From b176feec1ebc0ef555f57e6301e29b3d8dc01236 Mon Sep 17 00:00:00 2001
From: Alex Suhan
Date: Mon, 26 Jul 2021 22:59:10 -0700
Subject: [PATCH] Add device and key for lazy tensors (#61621)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61621

Test Plan: CI

Reviewed By: mruberry

Differential Revision: D29912934

Pulled By: asuhan

fbshipit-source-id: 493c32063a3e756d93cbf1d876563a35eaafb537
---
 aten/src/ATen/Context.h                       |   3 +
 aten/src/ATen/TensorIterator.cpp              |   4 +-
 aten/src/ATen/core/VariableFallbackKernel.cpp |   4 +
 .../op_registration/op_registration_test.cpp  | 149 +++++++++++-------
 aten/src/ATen/test/CMakeLists.txt             |   2 +-
 aten/src/ATen/test/lazy_tensor_test.cpp       |  23 +++
 aten/tools/run_tests.sh                       |   2 +-
 c10/core/Backend.h                            |   9 ++
 c10/core/Device.cpp                           |   3 +-
 c10/core/Device.h                             |   2 +-
 c10/core/DeviceType.cpp                       |   3 +
 c10/core/DeviceType.h                         |   8 +-
 c10/core/DispatchKey.cpp                      |   6 +
 c10/core/DispatchKey.h                        |   2 +
 c10/core/DispatchKeySet.cpp                   |   3 +
 c10/core/DispatchKeySet.h                     |   1 +
 c10/core/TensorOptions.h                      |   5 +
 test/test_dispatch.py                         |  16 +-
 tools/codegen/model.py                        |   2 +
 torch/_python_dispatcher.py                   |   1 +
 torch/csrc/autograd/init.cpp                  |   1 +
 torch/csrc/utils/python_dispatch.cpp          |   1 +
 torch/csrc/utils/tensor_new.cpp               |   5 +-
 torch/library.h                               |   2 +
 24 files changed, 187 insertions(+), 70 deletions(-)
 create mode 100644 aten/src/ATen/test/lazy_tensor_test.cpp
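
Usage sketch (not part of the diff below; `my_ns::dummy` is a made-up operator
used purely for illustration): with the new dispatch key and device type in
place, an out-of-tree lazy tensor backend can register kernels against them
the same way the XLA backend already does, e.g.

    #include <torch/library.h>
    #include <ATen/ATen.h>

    TORCH_LIBRARY(my_ns, m) {
      m.def("dummy(Tensor self) -> Tensor");
    }

    TORCH_LIBRARY_IMPL(my_ns, Lazy, m) {
      // A real backend would record the call into its IR here instead of
      // returning the input unchanged.
      m.impl("dummy", [](const at::Tensor& self) { return self; });
    }

Calls on tensors carrying DispatchKey::Lazy then route to this kernel, and
autograd reaches it through the AutogradLazy fallthrough registered in
VariableFallbackKernel.cpp below.
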
diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h
index 67d8d6ce8c5f..26f1d11f92b4 100644
--- a/aten/src/ATen/Context.h
+++ b/aten/src/ATen/Context.h
@@ -73,6 +73,9 @@ class TORCH_API Context {
   static bool hasXLA() {
     return c10::impl::hasDeviceGuardImpl(at::DeviceType::XLA);
   }
+  static bool hasLazy() {
+    return c10::impl::hasDeviceGuardImpl(at::DeviceType::Lazy);
+  }
   static bool hasMLC() {
     return c10::impl::hasDeviceGuardImpl(at::DeviceType::MLC);
   }
diff --git a/aten/src/ATen/TensorIterator.cpp b/aten/src/ATen/TensorIterator.cpp
index d3cfa88a201c..735c5c601a60 100644
--- a/aten/src/ATen/TensorIterator.cpp
+++ b/aten/src/ATen/TensorIterator.cpp
@@ -1343,9 +1343,9 @@ void TensorIteratorBase::build(TensorIteratorConfig& config) {
   if (is_meta_) return;

-  // XLA tensors don't have storage, so they don't have an underlying data pointer.
+  // XLA and lazy tensors don't have storage, so they don't have an underlying data pointer.
   // Nothing beyond this point is important for meta functions, so it's fine to exit early here.
-  if (common_device_.type() == DeviceType::XLA) return;
+  if (common_device_.type() == DeviceType::XLA || common_device_.type() == DeviceType::Lazy) return;

   for (auto& op : operands_) {
     TORCH_INTERNAL_ASSERT(op.tensor->defined());
diff --git a/aten/src/ATen/core/VariableFallbackKernel.cpp b/aten/src/ATen/core/VariableFallbackKernel.cpp
index fba58950717c..432796dcbd78 100644
--- a/aten/src/ATen/core/VariableFallbackKernel.cpp
+++ b/aten/src/ATen/core/VariableFallbackKernel.cpp
@@ -52,6 +52,10 @@ TORCH_LIBRARY_IMPL(_, AutogradXLA, m) {
   m.fallback(torch::CppFunction::makeFallthrough());
 }

+TORCH_LIBRARY_IMPL(_, AutogradLazy, m) {
+  m.fallback(torch::CppFunction::makeFallthrough());
+}
+
 TORCH_LIBRARY_IMPL(_, AutogradMLC, m) {
   m.fallback(torch::CppFunction::makeFallthrough());
 }
diff --git a/aten/src/ATen/core/op_registration/op_registration_test.cpp b/aten/src/ATen/core/op_registration/op_registration_test.cpp
index 23d032e76949..9b21a06f284b 100644
--- a/aten/src/ATen/core/op_registration/op_registration_test.cpp
+++ b/aten/src/ATen/core/op_registration/op_registration_test.cpp
@@ -257,6 +257,11 @@ TEST(OperatorRegistrationTest, whenRegisteringCPUTensorType_thenCanOnlyCallUnbox
     " backend.");
 }

+std::string expectedMessageForBackend(DispatchKey key) {
+  std::string key_str(c10::toString(key));
+  return "Could not run '_test::dummy' with arguments from the '" + key_str + "' backend";
+}
+
 TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsInSameOpCallAndCalling_thenCallsCorrectKernel) {
   bool called_kernel1 = false;
   bool called_kernel2 = false;
@@ -277,18 +282,20 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsInSameOpCallAndCall
   EXPECT_FALSE(called_kernel1);
   EXPECT_TRUE(called_kernel2);

-  expectThrows<c10::Error>([&] {
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA));
-  }, "Could not run '_test::dummy' with arguments from the 'XLA'"
-  " backend.");
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
+    std::string expectMessage = expectedMessageForBackend(key);
+    expectThrows<c10::Error>([&] {
+      callOp(*op, dummyTensor(key));
+    }, expectMessage.c_str());

-  // also assert that the error message contains the available tensor type ids, but don't assert their order
-  expectThrows<c10::Error>([&] {
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA));
-  }, "CPU");
-  expectThrows<c10::Error>([&] {
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA));
-  }, "CUDA");
+    // also assert that the error message contains the available tensor type ids, but don't assert their order
+    expectThrows<c10::Error>([&] {
+      callOp(*op, dummyTensor(key));
+    }, "CPU");
+    expectThrows<c10::Error>([&] {
+      callOp(*op, dummyTensor(key));
+    }, "CUDA");
+  }
 }

 bool called_stackbased_kernel = false;
@@ -302,7 +309,8 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsByNameAndNoneCanInf
     auto registrar1 = c10::RegisterOperators().op("_test::dummy", c10::RegisterOperators::options()
       .kernel<&stackBasedKernel>(c10::DispatchKey::CPU)
       .kernel<&stackBasedKernel>(c10::DispatchKey::CUDA)
-      .kernel<&stackBasedKernel>(c10::DispatchKey::XLA));
+      .kernel<&stackBasedKernel>(c10::DispatchKey::XLA)
+      .kernel<&stackBasedKernel>(c10::DispatchKey::Lazy));
   }, "Cannot infer operator schema for this kind of kernel in registration of operator _test::dummy");
 }

@@ -311,7 +319,8 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsBySchemaAndNoneCanI
   auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options()
     .kernel<&stackBasedKernel>(c10::DispatchKey::CPU)
     .kernel<&stackBasedKernel>(c10::DispatchKey::CUDA)
-    .kernel<&stackBasedKernel>(c10::DispatchKey::XLA));
+    .kernel<&stackBasedKernel>(c10::DispatchKey::XLA)
+    .kernel<&stackBasedKernel>(c10::DispatchKey::Lazy));

   auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""});
   ASSERT_TRUE(op.has_value()); // assert schema is registered
@@ -326,10 +335,12 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsBySchemaAndNoneCanI
   EXPECT_TRUE(called_stackbased_kernel);
   EXPECT_FALSE(called_kernel);

-  called_kernel = called_stackbased_kernel = false;
-  callOp(*op, dummyTensor(c10::DispatchKey::XLA));
-  EXPECT_TRUE(called_stackbased_kernel);
-  EXPECT_FALSE(called_kernel);
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
+    called_kernel = called_stackbased_kernel = false;
+    callOp(*op, dummyTensor(key));
+    EXPECT_TRUE(called_stackbased_kernel);
+    EXPECT_FALSE(called_kernel);
+  }
 }

 TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsByNameAndOnlyOneCanInferSchema_thenSucceeds) {
@@ -337,7 +348,8 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsByNameAndOnlyOneCan
   auto registrar1 = c10::RegisterOperators().op("_test::dummy", c10::RegisterOperators::options()
     .kernel<&stackBasedKernel>(c10::DispatchKey::CPU)
     .kernel(c10::DispatchKey::CUDA, &called_kernel)
-    .kernel<&stackBasedKernel>(c10::DispatchKey::XLA));
+    .kernel<&stackBasedKernel>(c10::DispatchKey::XLA)
+    .kernel<&stackBasedKernel>(c10::DispatchKey::Lazy));

   auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""});
   ASSERT_TRUE(op.has_value()); // assert schema is registered
@@ -352,10 +364,12 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsByNameAndOnlyOneCan
   EXPECT_FALSE(called_stackbased_kernel);
   EXPECT_TRUE(called_kernel);

-  called_kernel = called_stackbased_kernel = false;
-  callOp(*op, dummyTensor(c10::DispatchKey::XLA));
-  EXPECT_TRUE(called_stackbased_kernel);
-  EXPECT_FALSE(called_kernel);
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
+    called_kernel = called_stackbased_kernel = false;
+    callOp(*op, dummyTensor(key));
+    EXPECT_TRUE(called_stackbased_kernel);
+    EXPECT_FALSE(called_kernel);
+  }
 }

 TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsBySchemaAndOnlyOneCanInferSchema_thenSucceeds) {
@@ -363,7 +377,8 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsBySchemaAndOnlyOneC
   auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options()
     .kernel<&stackBasedKernel>(c10::DispatchKey::CPU)
     .kernel(c10::DispatchKey::CUDA, &called_kernel)
-    .kernel<&stackBasedKernel>(c10::DispatchKey::XLA));
+    .kernel<&stackBasedKernel>(c10::DispatchKey::XLA)
+    .kernel<&stackBasedKernel>(c10::DispatchKey::Lazy));

   auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""});
   ASSERT_TRUE(op.has_value()); // assert schema is registered
@@ -378,10 +393,12 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsBySchemaAndOnlyOneC
   EXPECT_FALSE(called_stackbased_kernel);
   EXPECT_TRUE(called_kernel);

-  called_kernel = called_stackbased_kernel = false;
-  callOp(*op, dummyTensor(c10::DispatchKey::XLA));
-  EXPECT_TRUE(called_stackbased_kernel);
-  EXPECT_FALSE(called_kernel);
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
+    called_kernel = called_stackbased_kernel = false;
+    callOp(*op, dummyTensor(key));
+    EXPECT_TRUE(called_stackbased_kernel);
+    EXPECT_FALSE(called_kernel);
+  }
 }

 struct DummyKernelWithIntParam final : OperatorKernel {
@@ -570,21 +587,21 @@ TEST(OperatorRegistrationTest, AutogradBackendOverridesAutogradKernel) {
   EXPECT_FALSE(called_nonautograd);
 }

-TEST(OperatorRegistrationTest, AutogradXLAOverridesAutogradKernel) {
+void LazyBackendsAutogradOverridesAutogradKernel(DispatchKey key) {
   auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options()
-    .kernel(DispatchKey::AutogradXLA)
+    .kernel(c10::getAutogradKeyFromBackend(key))
     .kernel(DispatchKey::Autograd));

   auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""});
   ASSERT_TRUE(op.has_value());

+  std::string expectedMessage = expectedMessageForBackend(key);
   expectThrows<c10::Error>([&] {
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA));
-  }, "Could not run '_test::dummy' with arguments from the 'XLA'"
-  " backend.");
+    callOp(*op, dummyTensor(key));
+  }, expectedMessage.c_str());

   called_nonautograd = called_autograd = false;
-  op->typed<void (Tensor)>().call(dummyTensor(DispatchKey::XLA, /*requires_grad=*/true));
+  op->typed<void (Tensor)>().call(dummyTensor(key, /*requires_grad=*/true));
   EXPECT_TRUE(called_nonautograd);
   EXPECT_FALSE(called_autograd);

@@ -594,7 +611,15 @@ TEST(OperatorRegistrationTest, AutogradXLAOverridesAutogradKernel) {
   EXPECT_FALSE(called_nonautograd);
 }

-TEST(OperatorRegistrationTest, whenRegisterWithXLAKernelAndCatchAll_AutogradXLAIsNotFilled) {
+TEST(OperatorRegistrationTest, AutogradXLAOverridesAutogradKernel) {
+  LazyBackendsAutogradOverridesAutogradKernel(DispatchKey::XLA);
+}
+
+TEST(OperatorRegistrationTest, AutogradLazyOverridesAutogradKernel) {
+  LazyBackendsAutogradOverridesAutogradKernel(DispatchKey::Lazy);
+}
+
+void whenRegisterWithLazyBackendsAndCatchAll_AutogradLazyBackendsIsNotFilled(DispatchKey key) {
   {
     auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options()
       .catchAllKernel());
@@ -603,38 +628,46 @@ TEST(OperatorRegistrationTest, whenRegisterWithXLAKernelAndCatchAll_AutogradXLAI
     ASSERT_TRUE(op.has_value());

     called_nonautograd = called_autograd = false;
-    op->typed<void (Tensor)>().call(dummyTensor(DispatchKey::XLA, /*requires_grad=*/true));
+    op->typed<void (Tensor)>().call(dummyTensor(key, /*requires_grad=*/true));
     EXPECT_TRUE(called_nonautograd);
     EXPECT_FALSE(called_autograd);

     called_nonautograd = called_autograd = false;
-    op->typed<void (Tensor)>().call(dummyTensor(DispatchKey::XLA));
+    op->typed<void (Tensor)>().call(dummyTensor(key));
     EXPECT_FALSE(called_autograd);
     EXPECT_TRUE(called_nonautograd);
   }
   {
     auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options()
-      .kernel(DispatchKey::XLA)
+      .kernel(key)
       .catchAllKernel());
     auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""});
     ASSERT_TRUE(op.has_value());

-    // When there's direct registration to XLA backend, AutogradXLA doesn't pick up catchAll
+    // When there's direct registration to XLA / Lazy backend, Autograd{XLA, Lazy} doesn't pick up catchAll
     // kernel in precompute but just keep fallthrough kernel from backend fallback.
-    // Thus it falls through AutogradXLA and reaches the kernel at XLA key.
+    // Thus it falls through Autograd{XLA, Lazy} and reaches the kernel at XLA / Lazy key.
     called_nonautograd = called_autograd = false;
-    op->typed<void (Tensor)>().call(dummyTensor(DispatchKey::XLA, /*requires_grad=*/true));
+    op->typed<void (Tensor)>().call(dummyTensor(key, /*requires_grad=*/true));
     EXPECT_FALSE(called_nonautograd);
     EXPECT_TRUE(called_autograd);

     called_nonautograd = called_autograd = false;
-    op->typed<void (Tensor)>().call(dummyTensor(DispatchKey::XLA));
+    op->typed<void (Tensor)>().call(dummyTensor(key));
     EXPECT_TRUE(called_autograd);
     EXPECT_FALSE(called_nonautograd);
   }
 }

+TEST(OperatorRegistrationTest, whenRegisterWithXLAKernelAndCatchAll_AutogradXLAIsNotFilled) {
+  whenRegisterWithLazyBackendsAndCatchAll_AutogradLazyBackendsIsNotFilled(DispatchKey::XLA);
+}
+
+TEST(OperatorRegistrationTest, whenRegisterWithLazyKernelAndCatchAll_AutogradLazyIsNotFilled) {
+  whenRegisterWithLazyBackendsAndCatchAll_AutogradLazyBackendsIsNotFilled(DispatchKey::Lazy);
+}
+
 TEST(OperatorRegistrationTest, givenLambdaKernel_whenRegisteringWithMismatchingCppSignatures_thenFails) {
   expectThrows<c10::Error>([] {
     auto registrar = c10::RegisterOperators().op("_test::dummy", c10::RegisterOperators::options()
@@ -1217,9 +1250,11 @@ TEST(NewOperatorRegistrationTest, testBasics) {
   m.def("dummy4", [](const Tensor& self, const Tensor& other) { return other; });
   m.impl("dummy", c10::DeviceType::CPU, [](const Tensor& self) { return self; });
   m.impl("dummy", c10::DeviceType::XLA, [](const Tensor& self) { return self; });
+  m.impl("dummy", c10::DeviceType::Lazy, [](const Tensor& self) { return self; });
   // Internal API
   m.impl("dummy2", c10::DispatchKey::CPU, [](const Tensor& self) { return self; });
   m.impl("dummy2", c10::DispatchKey::XLA, [](const Tensor& self) { return self; });
+  m.impl("dummy2", c10::DispatchKey::Lazy, [](const Tensor& self) { return self; });

   ASSERT_TRUE(Dispatcher::singleton().findSchema({"_test::dummy", ""}).has_value());
   // Should have a schema even if there are no impls
@@ -1345,15 +1380,15 @@ TEST(NewOperatorRegistrationTest, dispatchWithCompositeImplicitAutogradKernel) {
     ASSERT_TRUE(math_called);
   }

-  {
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
     math_called = false;
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA));
+    callOp(*op, dummyTensor(key));
     ASSERT_TRUE(math_called);
   }

-  {
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
     math_called = false;
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA, /*requires_grad=*/true));
+    callOp(*op, dummyTensor(key, /*requires_grad=*/true));
     ASSERT_TRUE(math_called);
   }

@@ -1523,16 +1558,16 @@ TEST(NewOperatorRegistrationTest, dispatchWithCompositeExplicitAutogradKernel) {
     ASSERT_TRUE(called);
   }

-  {
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
     called = false;
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA));
+    callOp(*op, dummyTensor(key));
     ASSERT_TRUE(called);
   }

-  {
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
     called = false;
-    // AutogradXLA is fallthrough, calls XLA kernel
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA, /*requires_grad=*/true));
+    // Autograd{XLA, Lazy} is fallthrough, calls XLA / Lazy kernel
+    callOp(*op, dummyTensor(key, /*requires_grad=*/true));
     ASSERT_TRUE(called);
   }

@@ -1575,17 +1610,17 @@ TEST(NewOperatorRegistrationTest, dispatchWithCompositeExplicitAutogradAndCompos
     ASSERT_TRUE(backend_called);
   }

-  {
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
     backend_called = math_called = false;
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA));
+    callOp(*op, dummyTensor(key));
     ASSERT_TRUE(backend_called);
     ASSERT_FALSE(math_called);
   }

-  {
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
     backend_called = math_called = false;
-    // AutogradXLA is fallthrough, calls XLA kernel
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA, /*requires_grad=*/true));
+    // Autograd{XLA, Lazy} is fallthrough, calls XLA / Lazy kernel
+    callOp(*op, dummyTensor(key, /*requires_grad=*/true));
     ASSERT_FALSE(math_called);
     ASSERT_TRUE(backend_called);
   }
@@ -1681,11 +1716,11 @@ TEST(NewOperatorRegistrationTest, dispatch) {
     ASSERT_TRUE(autograd_called);
   }

-  {
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
     autograd_called = false;
     auto op = Dispatcher::singleton().findSchema({"test::fn_autograd", ""});
     ASSERT_TRUE(op.has_value());
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA, /*requires_grad=*/true));
+    callOp(*op, dummyTensor(key, /*requires_grad=*/true));
     ASSERT_TRUE(autograd_called);
   }
 }
diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt
index dc6595e37e4e..2fe6aa856b0b 100644
--- a/aten/src/ATen/test/CMakeLists.txt
+++ b/aten/src/ATen/test/CMakeLists.txt
@@ -26,7 +26,7 @@ list(APPEND ATen_CPU_TEST_SRCS
   ${CMAKE_CURRENT_SOURCE_DIR}/quantized_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/extension_backend_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/operators_test.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/xla_tensor_test.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/lazy_tensor_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/tensor_iterator_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/math_kernel_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/memory_overlapping_test.cpp
diff --git a/aten/src/ATen/test/lazy_tensor_test.cpp b/aten/src/ATen/test/lazy_tensor_test.cpp
new file mode 100644
index 000000000000..4c42fc4e7788
--- /dev/null
+++ b/aten/src/ATen/test/lazy_tensor_test.cpp
@@ -0,0 +1,23 @@
+#include <gtest/gtest.h>
+
+#include <ATen/ATen.h>
+
+void LazyTensorTest(c10::DispatchKey dispatch_key, at::DeviceType device_type) {
+  auto tensor_impl =
+      c10::make_intrusive<c10::TensorImpl, c10::UndefinedTensorImpl>(
+          dispatch_key,
+          caffe2::TypeMeta::Make<float>(),
+          at::Device(device_type, 0));
+  at::Tensor t(std::move(tensor_impl));
+  ASSERT_TRUE(t.device() == at::Device(device_type, 0));
+}
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+TEST(XlaTensorTest, TestNoStorage) {
+  LazyTensorTest(at::DispatchKey::XLA, at::DeviceType::XLA);
+}
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+TEST(LazyTensorTest, TestNoStorage) {
+  LazyTensorTest(at::DispatchKey::Lazy, at::DeviceType::Lazy);
+}
diff --git a/aten/tools/run_tests.sh b/aten/tools/run_tests.sh
index 5fc9e00c3b41..a3ae3ef839cc 100755
--- a/aten/tools/run_tests.sh
+++ b/aten/tools/run_tests.sh
@@ -18,7 +18,7 @@ VALGRIND=${VALGRIND:=ON}
 ./tensor_interop_test
 ./undefined_tensor_test
 ./extension_backend_test
-./xla_tensor_test
+./lazy_tensor_test
 ./tensor_iterator_test
 ./Dimname_test
 ./Dict_test
diff --git a/c10/core/Backend.h b/c10/core/Backend.h
index 6fa4cf59ed3c..2f071345311f 100644
--- a/c10/core/Backend.h
+++ b/c10/core/Backend.h
@@ -51,6 +51,7 @@ enum class Backend {
   MkldnnCPU,
   MLC,
   HPU,
+  Lazy,
   NumOptions
 };

@@ -69,6 +70,8 @@ static inline Backend dispatchKeyToBackend(DispatchKey t) {
     return Backend::MSNPU;
   } else if (t == DispatchKey::XLA || t == DispatchKey::AutogradXLA) {
     return Backend::XLA;
+  } else if (t == DispatchKey::Lazy || t == DispatchKey::AutogradLazy) {
+    return Backend::Lazy;
   } else if (t == DispatchKey::MLC || t == DispatchKey::AutogradMLC) {
     return Backend::MLC;
   } else if (t == DispatchKey::Vulkan) {
@@ -124,6 +127,8 @@ static inline DispatchKey backendToDispatchKey(Backend b) {
       return DispatchKey::MSNPU;
     case Backend::XLA:
       return DispatchKey::XLA;
+    case Backend::Lazy:
+      return DispatchKey::Lazy;
     case Backend::XPU:
       return DispatchKey::XPU;
     case Backend::SparseXPU:
@@ -177,6 +182,8 @@ static inline DeviceType backendToDeviceType(Backend b) {
       return DeviceType::MSNPU;
     case Backend::XLA:
       return DeviceType::XLA;
+    case Backend::Lazy:
+      return DeviceType::Lazy;
     case Backend::SparseCPU:
       return DeviceType::CPU;
     case Backend::SparseCUDA:
@@ -232,6 +239,8 @@ static inline const char* toString(Backend b) {
       return "MSNPU";
     case Backend::XLA:
       return "XLA";
+    case Backend::Lazy:
+      return "Lazy";
     case Backend::MLC:
       return "MLC";
     case Backend::SparseCPU:
diff --git a/c10/core/Device.cpp b/c10/core/Device.cpp
index 82cfc3c0572e..ee6f1b473fe0 100644
--- a/c10/core/Device.cpp
+++ b/c10/core/Device.cpp
@@ -45,6 +45,7 @@ DeviceType parse_type(const std::string& device_string) {
       {"fpga", DeviceType::FPGA},
       {"msnpu", DeviceType::MSNPU},
       {"xla", DeviceType::XLA},
+      {"lazy", DeviceType::Lazy},
       {"vulkan", DeviceType::Vulkan},
       {"mlc", DeviceType::MLC},
       {"meta", DeviceType::Meta},
@@ -61,7 +62,7 @@ DeviceType parse_type(const std::string& device_string) {
   }
   TORCH_CHECK(
       false,
-      "Expected one of cpu, cuda, xpu, mkldnn, opengl, opencl, ideep, hip, ve, msnpu, mlc, xla, vulkan, meta, hpu device type at start of device string: ",
+      "Expected one of cpu, cuda, xpu, mkldnn, opengl, opencl, ideep, hip, ve, msnpu, mlc, xla, lazy, vulkan, meta, hpu device type at start of device string: ",
       device_string);
 }
 } // namespace
diff --git a/c10/core/Device.h b/c10/core/Device.h
index 599219e8504f..227d2cc512cc 100644
--- a/c10/core/Device.h
+++ b/c10/core/Device.h
@@ -103,7 +103,7 @@ struct C10_API Device final {

   /// Return true if the device supports arbirtary strides.
   bool supports_as_strided() const noexcept {
-    return type_ != DeviceType::XLA;
+    return type_ != DeviceType::XLA && type_ != DeviceType::Lazy;
   }

   /// Same string as returned from operator<<.
diff --git a/c10/core/DeviceType.cpp b/c10/core/DeviceType.cpp
index 54b80d491c4b..4ff939806f98 100644
--- a/c10/core/DeviceType.cpp
+++ b/c10/core/DeviceType.cpp
@@ -29,6 +29,8 @@ std::string DeviceTypeName(DeviceType d, bool lower_case) {
       return lower_case ? "msnpu" : "MSNPU";
     case DeviceType::XLA:
       return lower_case ? "xla" : "XLA";
+    case DeviceType::Lazy:
+      return lower_case ? "lazy" : "LAZY";
    case DeviceType::MLC:
"mlc" : "MLC"; case DeviceType::Vulkan: @@ -75,6 +77,7 @@ bool isValidDeviceType(DeviceType d) { case DeviceType::FPGA: case DeviceType::MSNPU: case DeviceType::XLA: + case DeviceType::Lazy: case DeviceType::MLC: case DeviceType::Vulkan: case DeviceType::Metal: diff --git a/c10/core/DeviceType.h b/c10/core/DeviceType.h index 6f60399bad7f..2ae028d14402 100644 --- a/c10/core/DeviceType.h +++ b/c10/core/DeviceType.h @@ -30,11 +30,12 @@ enum class DeviceType : int8_t { Meta = 14, // Meta (tensors with no data) HPU = 15, // HPU / HABANA VE = 16, // SX-Aurora / NEC + Lazy = 17, // Lazy Tensors // NB: If you add more devices: // - Change the implementations of DeviceTypeName and isValidDeviceType // in DeviceType.cpp // - Change the number below - COMPILE_TIME_MAX_DEVICE_TYPES = 17, + COMPILE_TIME_MAX_DEVICE_TYPES = 18, }; constexpr DeviceType kCPU = DeviceType::CPU; @@ -50,18 +51,19 @@ constexpr DeviceType kMetal = DeviceType::Metal; constexpr DeviceType kXPU = DeviceType::XPU; constexpr DeviceType kHPU = DeviceType::HPU; constexpr DeviceType kVE = DeviceType::VE; +constexpr DeviceType kLazy = DeviceType::Lazy; // define explicit int constant constexpr int COMPILE_TIME_MAX_DEVICE_TYPES = static_cast(DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES); static_assert( - COMPILE_TIME_MAX_DEVICE_TYPES <= 17, + COMPILE_TIME_MAX_DEVICE_TYPES <= 18, "Hey! You seem to be adding a lot of new DeviceTypes. The intent was " "for this constant to reflect the actual number of DeviceTypes we support " "in PyTorch; it's important that this number is not too large as we " "use this to allocate stack arrays in some places in our code. If you " - "are indeed just adding the 17th device type, feel free to change " + "are indeed just adding the 18th device type, feel free to change " "the check to 32; but if you are adding some sort of extensible device " "types registration, please be aware that you are affecting code that " "this number is small. Try auditing uses of this constant."); diff --git a/c10/core/DispatchKey.cpp b/c10/core/DispatchKey.cpp index 9732cd2833c2..5c414484b38f 100644 --- a/c10/core/DispatchKey.cpp +++ b/c10/core/DispatchKey.cpp @@ -23,6 +23,8 @@ const char* toString(DispatchKey t) { return "MSNPU"; case DispatchKey::XLA: return "XLA"; + case DispatchKey::Lazy: + return "Lazy"; case DispatchKey::MLC: return "MLC"; case DispatchKey::HPU: @@ -91,6 +93,8 @@ const char* toString(DispatchKey t) { return "AutogradCUDA"; case DispatchKey::AutogradXLA: return "AutogradXLA"; + case DispatchKey::AutogradLazy: + return "AutogradLazy"; case DispatchKey::AutogradMLC: return "AutogradMLC"; case DispatchKey::AutogradHPU: @@ -179,6 +183,8 @@ DispatchKey getAutogradKeyFromBackend(DispatchKey t) { return DispatchKey::AutogradCUDA; case DispatchKey::XLA: return DispatchKey::AutogradXLA; + case DispatchKey::Lazy: + return DispatchKey::AutogradLazy; case DispatchKey::MLC: return DispatchKey::AutogradMLC; case DispatchKey::HPU: diff --git a/c10/core/DispatchKey.h b/c10/core/DispatchKey.h index 87f053a144be..b22778fb616e 100644 --- a/c10/core/DispatchKey.h +++ b/c10/core/DispatchKey.h @@ -68,6 +68,7 @@ enum class DispatchKey : uint8_t { XPU, // For out of tree Intel's heterogeneous computing plug-in HPU, // For out of tree & closed source integration of HPU / Habana VE, // For out of tree & closed source integration of SX-Aurora / NEC + Lazy, // For lazy tensor backends // A meta tensor is a tensor without any data associated with it. (They // have also colloquially been referred to as tensors on the "null" device). 
@@ -229,6 +230,7 @@ enum class DispatchKey : uint8_t {
   AutogradCPU,
   AutogradCUDA,
   AutogradXLA,
+  AutogradLazy,
   AutogradXPU,
   AutogradMLC,
   AutogradHPU,
diff --git a/c10/core/DispatchKeySet.cpp b/c10/core/DispatchKeySet.cpp
index 679560d4de7c..a4e399dd734b 100644
--- a/c10/core/DispatchKeySet.cpp
+++ b/c10/core/DispatchKeySet.cpp
@@ -12,6 +12,7 @@ constexpr DispatchKeySet backend_dispatch_keyset = autogradother_backends |
         DispatchKey::CPU,
         DispatchKey::CUDA,
         DispatchKey::XLA,
+        DispatchKey::Lazy,
         DispatchKey::XPU,
         DispatchKey::PrivateUse1,
         DispatchKey::PrivateUse2,
@@ -57,6 +58,8 @@ DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t) {
       return DispatchKeySet(DispatchKey::CUDA);
     case DispatchKey::AutogradXLA:
       return DispatchKeySet(DispatchKey::XLA);
+    case DispatchKey::AutogradLazy:
+      return DispatchKeySet(DispatchKey::Lazy);
     case DispatchKey::AutogradMLC:
       return DispatchKeySet(DispatchKey::MLC);
     case DispatchKey::AutogradHPU:
diff --git a/c10/core/DispatchKeySet.h b/c10/core/DispatchKeySet.h
index 07a77e26cf92..0d3a25ea9d8d 100644
--- a/c10/core/DispatchKeySet.h
+++ b/c10/core/DispatchKeySet.h
@@ -212,6 +212,7 @@ constexpr DispatchKeySet autograd_dispatch_keyset = DispatchKeySet({
     DispatchKey::AutogradCPU,
     DispatchKey::AutogradCUDA,
     DispatchKey::AutogradXLA,
+    DispatchKey::AutogradLazy,
     DispatchKey::AutogradNestedTensor,
     DispatchKey::AutogradMLC,
     DispatchKey::AutogradHPU,
diff --git a/c10/core/TensorOptions.h b/c10/core/TensorOptions.h
index 2d2077b769e1..fff9433e270f 100644
--- a/c10/core/TensorOptions.h
+++ b/c10/core/TensorOptions.h
@@ -667,6 +667,8 @@ inline DispatchKey computeDispatchKey(
       return DispatchKey::MSNPU;
     case DeviceType::XLA:
       return DispatchKey::XLA;
+    case DeviceType::Lazy:
+      return DispatchKey::Lazy;
     case DeviceType::MLC:
       return DispatchKey::MLC;
     case DeviceType::Vulkan:
@@ -768,6 +770,9 @@ inline DeviceType dispatchKeyToDeviceType(DispatchKey dispatch_key) {
     case DispatchKey::XLA:
     case DispatchKey::AutogradXLA:
       return DeviceType::XLA;
+    case DispatchKey::Lazy:
+    case DispatchKey::AutogradLazy:
+      return DeviceType::Lazy;
     case DispatchKey::Vulkan:
       return DeviceType::Vulkan;
     case DispatchKey::Meta:
diff --git a/test/test_dispatch.py b/test/test_dispatch.py
index 62cd5d64a1af..3b7ac1c83100 100644
--- a/test/test_dispatch.py
+++ b/test/test_dispatch.py
@@ -789,7 +789,7 @@ CPU: registered at {}:5 :: () -> () [ boxed unboxed ]
 class TestPythonDispatcher(TestCase):
     def test_basic(self):
         dispatcher = PythonDispatcher()
-        dispatcher.register(["CPU", "XLA", "CompositeImplicitAutograd"])
+        dispatcher.register(["CPU", "XLA", "Lazy", "CompositeImplicitAutograd"])
         self.assertExpectedInline(
             dispatcher.dispatchTable(),
             '''\
@@ -799,16 +799,18 @@ key             kernel
 ---------------------------
 CPU             fn_CPU [kernel]
 XLA             fn_XLA [kernel]
+Lazy            fn_Lazy [kernel]
 QuantizedCPU    fn_CompositeImplicitAutograd [math kernel]
 AutogradOther   fn_CompositeImplicitAutograd [math kernel]
 AutogradCPU     fallthrough [backend fallback]
 AutogradXLA     fallthrough [backend fallback]
+AutogradLazy    fallthrough [backend fallback]
 '''
         )

     def test_math_autogradcpu(self):
         dispatcher = PythonDispatcher()
-        dispatcher.register(["CPU", "XLA", "CompositeImplicitAutograd", "AutogradCPU"])
+        dispatcher.register(["CPU", "XLA", "Lazy", "CompositeImplicitAutograd", "AutogradCPU"])
         self.assertExpectedInline(
             dispatcher.dispatchTable(),
             '''\
@@ -818,10 +820,12 @@ key             kernel
 ---------------------------
 CPU             fn_CPU [kernel]
 XLA             fn_XLA [kernel]
+Lazy            fn_Lazy [kernel]
 QuantizedCPU    fn_CompositeImplicitAutograd [math kernel]
 AutogradOther   fn_CompositeImplicitAutograd [math kernel]
 AutogradCPU     fn_AutogradCPU [kernel]
 AutogradXLA     fallthrough [backend fallback]
+AutogradLazy    fallthrough [backend fallback]
 '''
         )
         self.assertExpectedInline(
@@ -833,6 +837,7 @@ key             kernel
 ---------------------------
 CPU             fn_CPU
 XLA             fn_XLA
+Lazy            fn_Lazy
 AutogradCPU     fn_AutogradCPU
 CompositeImplicitAutograd[alias] fn_CompositeImplicitAutograd
 '''
         )

     def test_defaultbackend_autogradcpu(self):
         dispatcher = PythonDispatcher()
-        dispatcher.register(["CPU", "XLA", "CompositeExplicitAutograd", "AutogradCPU"])
+        dispatcher.register(["CPU", "XLA", "Lazy", "CompositeExplicitAutograd", "AutogradCPU"])
         self.assertExpectedInline(
             dispatcher.dispatchTable(),
             '''\
@@ -850,10 +855,12 @@ key             kernel
 ---------------------------
 CPU             fn_CPU [kernel]
 XLA             fn_XLA [kernel]
+Lazy            fn_Lazy [kernel]
 QuantizedCPU    fn_CompositeExplicitAutograd [default backend kernel]
 AutogradOther   fallthrough [backend fallback]
 AutogradCPU     fn_AutogradCPU [kernel]
 AutogradXLA     fallthrough [backend fallback]
+AutogradLazy    fallthrough [backend fallback]
 '''
         )
@@ -866,6 +873,7 @@ key             kernel
 ---------------------------
 CPU             fn_CPU
 XLA             fn_XLA
+Lazy            fn_Lazy
 AutogradCPU     fn_AutogradCPU
 CompositeExplicitAutograd[alias] fn_CompositeExplicitAutograd
 '''
         )
@@ -883,10 +891,12 @@ key             kernel
 ---------------------------
 CPU             fn_CPU [kernel]
 XLA             fn_CompositeImplicitAutograd [math kernel]
+Lazy            fn_CompositeImplicitAutograd [math kernel]
 QuantizedCPU    fn_QuantizedCPU [kernel]
 AutogradOther   ambiguous_autogradother [ambiguous autogradother]
 AutogradCPU     fallthrough [backend fallback]
 AutogradXLA     fn_CompositeImplicitAutograd [math kernel]
+AutogradLazy    fn_CompositeImplicitAutograd [math kernel]
 '''
         )
diff --git a/tools/codegen/model.py b/tools/codegen/model.py
index 2c619371416e..d6f02d5a6898 100644
--- a/tools/codegen/model.py
+++ b/tools/codegen/model.py
@@ -58,6 +58,7 @@ class DispatchKey(Enum):
     FPGA = auto()
     MSNPU = auto()
     XLA = auto()
+    Lazy = auto()
     Vulkan = auto()
     Metal = auto()
     XPU = auto()
@@ -89,6 +90,7 @@ class DispatchKey(Enum):
     AutogradCPU = auto()
     AutogradCUDA = auto()
     AutogradXLA = auto()
+    AutogradLazy = auto()
     AutogradNestedTensor = auto()
     AutogradXPU = auto()
     AutogradPrivateUse1 = auto()
diff --git a/torch/_python_dispatcher.py b/torch/_python_dispatcher.py
index d13b0aa92673..aa19a18efb3b 100644
--- a/torch/_python_dispatcher.py
+++ b/torch/_python_dispatcher.py
@@ -55,6 +55,7 @@ class PythonDispatcher:
         "CPU", "AutogradCPU",
         "QuantizedCPU", "AutogradOther",
         "XLA", "AutogradXLA",
+        "Lazy", "AutogradLazy",
     ]
     alias_keys = [
         "CompositeExplicitAutograd",
diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp
index 2c37f59990f9..0778cbc6012a 100644
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -109,6 +109,7 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) {
       .value("FPGA", c10::DeviceType::FPGA)
       .value("MSNPU", c10::DeviceType::MSNPU)
       .value("XLA", c10::DeviceType::XLA)
+      .value("Lazy", c10::DeviceType::Lazy)
       .value("MLC", c10::DeviceType::MLC)
       .value("HPU", c10::DeviceType::HPU)
       .value("Meta", c10::DeviceType::Meta)
diff --git a/torch/csrc/utils/python_dispatch.cpp b/torch/csrc/utils/python_dispatch.cpp
index 82056f93ec40..61dac97b8dc7 100644
--- a/torch/csrc/utils/python_dispatch.cpp
+++ b/torch/csrc/utils/python_dispatch.cpp
@@ -32,6 +32,7 @@ c10::optional<c10::DispatchKey> parseDispatchKey(const std::string& k) {
     {"CPU", c10::DispatchKey::CPU},
     {"CUDA", c10::DispatchKey::CUDA},
{"XLA", c10::DispatchKey::XLA}, + {"Lazy", c10::DispatchKey::Lazy}, {"QuantizedCPU", c10::DispatchKey::QuantizedCPU}, {"CompositeImplicitAutograd", c10::DispatchKey::CompositeImplicitAutograd}, {"Autograd", c10::DispatchKey::Autograd}, diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp index e8ef72060b34..819a3a4c5da8 100644 --- a/torch/csrc/utils/tensor_new.cpp +++ b/torch/csrc/utils/tensor_new.cpp @@ -319,6 +319,7 @@ void check_base_legacy_new(c10::DispatchKey dispatch_key, at::Layout expected_la dispatch_key == c10::DispatchKey::CUDA || dispatch_key == c10::DispatchKey::HIP || dispatch_key == c10::DispatchKey::XLA || + dispatch_key == c10::DispatchKey::Lazy || dispatch_key == c10::DispatchKey::XPU, "new(): expected DispatchKey: ", c10::DispatchKey::CPU, @@ -329,11 +330,13 @@ void check_base_legacy_new(c10::DispatchKey dispatch_key, at::Layout expected_la " or ", c10::DispatchKey::XLA, " or ", + c10::DispatchKey::Lazy, + " or ", c10::DispatchKey::XPU, " but got: ", dispatch_key); } else if(expected_layout == c10::kSparse) { - // NOTE: no sparse XLA + // NOTE: no sparse XLA or Lazy TORCH_CHECK( dispatch_key == c10::DispatchKey::SparseCPU || dispatch_key == c10::DispatchKey::SparseCUDA || diff --git a/torch/library.h b/torch/library.h index 2135efcf882c..0e85910664ee 100644 --- a/torch/library.h +++ b/torch/library.h @@ -292,6 +292,8 @@ inline CppFunction dispatch(c10::DeviceType type, Func&& raw_f) { return c10::DispatchKey::CUDA; case c10::DeviceType::XLA: return c10::DispatchKey::XLA; + case c10::DeviceType::Lazy: + return c10::DispatchKey::Lazy; case c10::DeviceType::MLC: return c10::DispatchKey::MLC; case c10::DeviceType::Meta: