From b176feec1ebc0ef555f57e6301e29b3d8dc01236 Mon Sep 17 00:00:00 2001
From: Alex Suhan
Date: Mon, 26 Jul 2021 22:59:10 -0700
Subject: [PATCH] Add device and key for lazy tensors (#61621)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61621

Test Plan: CI

Reviewed By: mruberry

Differential Revision: D29912934

Pulled By: asuhan

fbshipit-source-id: 493c32063a3e756d93cbf1d876563a35eaafb537
---
 aten/src/ATen/Context.h                       |   3 +
 aten/src/ATen/TensorIterator.cpp              |   4 +-
 aten/src/ATen/core/VariableFallbackKernel.cpp |   4 +
 .../op_registration/op_registration_test.cpp  | 149 +++++++++++-------
 aten/src/ATen/test/CMakeLists.txt             |   2 +-
 aten/src/ATen/test/lazy_tensor_test.cpp       |  23 +++
 aten/tools/run_tests.sh                       |   2 +-
 c10/core/Backend.h                            |   9 ++
 c10/core/Device.cpp                           |   3 +-
 c10/core/Device.h                             |   2 +-
 c10/core/DeviceType.cpp                       |   3 +
 c10/core/DeviceType.h                         |   8 +-
 c10/core/DispatchKey.cpp                      |   6 +
 c10/core/DispatchKey.h                        |   2 +
 c10/core/DispatchKeySet.cpp                   |   3 +
 c10/core/DispatchKeySet.h                     |   1 +
 c10/core/TensorOptions.h                      |   5 +
 test/test_dispatch.py                         |  16 +-
 tools/codegen/model.py                        |   2 +
 torch/_python_dispatcher.py                   |   1 +
 torch/csrc/autograd/init.cpp                  |   1 +
 torch/csrc/utils/python_dispatch.cpp          |   1 +
 torch/csrc/utils/tensor_new.cpp               |   5 +-
 torch/library.h                               |   2 +
 24 files changed, 187 insertions(+), 70 deletions(-)
 create mode 100644 aten/src/ATen/test/lazy_tensor_test.cpp
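
Usage sketch (not part of the diff below; `my_ns::dummy` is a made-up operator
used purely for illustration): with the new dispatch key and device type in
place, an out-of-tree lazy tensor backend can register kernels against them
the same way the XLA backend already does, e.g.

    #include <torch/library.h>
    #include <ATen/ATen.h>

    TORCH_LIBRARY(my_ns, m) {
      m.def("dummy(Tensor self) -> Tensor");
    }

    TORCH_LIBRARY_IMPL(my_ns, Lazy, m) {
      // A real backend would record the call into its IR here instead of
      // returning the input unchanged.
      m.impl("dummy", [](const at::Tensor& self) { return self; });
    }

Calls on tensors carrying DispatchKey::Lazy then route to this kernel, and
autograd reaches it through the AutogradLazy fallthrough registered in
VariableFallbackKernel.cpp below.
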
diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h
index 67d8d6ce8c5f..26f1d11f92b4 100644
--- a/aten/src/ATen/Context.h
+++ b/aten/src/ATen/Context.h
@@ -73,6 +73,9 @@ class TORCH_API Context {
   static bool hasXLA() {
     return c10::impl::hasDeviceGuardImpl(at::DeviceType::XLA);
   }
+  static bool hasLazy() {
+    return c10::impl::hasDeviceGuardImpl(at::DeviceType::Lazy);
+  }
   static bool hasMLC() {
     return c10::impl::hasDeviceGuardImpl(at::DeviceType::MLC);
   }
diff --git a/aten/src/ATen/TensorIterator.cpp b/aten/src/ATen/TensorIterator.cpp
index d3cfa88a201c..735c5c601a60 100644
--- a/aten/src/ATen/TensorIterator.cpp
+++ b/aten/src/ATen/TensorIterator.cpp
@@ -1343,9 +1343,9 @@ void TensorIteratorBase::build(TensorIteratorConfig& config) {
   if (is_meta_) return;

-  // XLA tensors don't have storage, so they don't have an underlying data pointer.
+  // XLA and lazy tensors don't have storage, so they don't have an underlying data pointer.
   // Nothing beyond this point is important for meta functions, so it's fine to exit early here.
-  if (common_device_.type() == DeviceType::XLA) return;
+  if (common_device_.type() == DeviceType::XLA || common_device_.type() == DeviceType::Lazy) return;

   for (auto& op : operands_) {
     TORCH_INTERNAL_ASSERT(op.tensor->defined());
diff --git a/aten/src/ATen/core/VariableFallbackKernel.cpp b/aten/src/ATen/core/VariableFallbackKernel.cpp
index fba58950717c..432796dcbd78 100644
--- a/aten/src/ATen/core/VariableFallbackKernel.cpp
+++ b/aten/src/ATen/core/VariableFallbackKernel.cpp
@@ -52,6 +52,10 @@ TORCH_LIBRARY_IMPL(_, AutogradXLA, m) {
   m.fallback(torch::CppFunction::makeFallthrough());
 }

+TORCH_LIBRARY_IMPL(_, AutogradLazy, m) {
+  m.fallback(torch::CppFunction::makeFallthrough());
+}
+
 TORCH_LIBRARY_IMPL(_, AutogradMLC, m) {
   m.fallback(torch::CppFunction::makeFallthrough());
 }
diff --git a/aten/src/ATen/core/op_registration/op_registration_test.cpp b/aten/src/ATen/core/op_registration/op_registration_test.cpp
index 23d032e76949..9b21a06f284b 100644
--- a/aten/src/ATen/core/op_registration/op_registration_test.cpp
+++ b/aten/src/ATen/core/op_registration/op_registration_test.cpp
@@ -257,6 +257,11 @@ TEST(OperatorRegistrationTest, whenRegisteringCPUTensorType_thenCanOnlyCallUnbox
     " backend.");
 }

+std::string expectedMessageForBackend(DispatchKey key) {
+  std::string key_str(c10::toString(key));
+  return "Could not run '_test::dummy' with arguments from the '" + key_str + "' backend";
+}
+
 TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsInSameOpCallAndCalling_thenCallsCorrectKernel) {
   bool called_kernel1 = false;
   bool called_kernel2 = false;
@@ -277,18 +282,20 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsInSameOpCallAndCall
   EXPECT_FALSE(called_kernel1);
   EXPECT_TRUE(called_kernel2);

-  expectThrows<c10::Error>([&] {
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA));
-  }, "Could not run '_test::dummy' with arguments from the 'XLA'"
-  " backend.");
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
+    std::string expectMessage = expectedMessageForBackend(key);
+    expectThrows<c10::Error>([&] {
+      callOp(*op, dummyTensor(key));
+    }, expectMessage.c_str());

-  // also assert that the error message contains the available tensor type ids, but don't assert their order
-  expectThrows<c10::Error>([&] {
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA));
-  }, "CPU");
-  expectThrows<c10::Error>([&] {
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA));
-  }, "CUDA");
+    // also assert that the error message contains the available tensor type ids, but don't assert their order
+    expectThrows<c10::Error>([&] {
+      callOp(*op, dummyTensor(key));
+    }, "CPU");
+    expectThrows<c10::Error>([&] {
+      callOp(*op, dummyTensor(key));
+    }, "CUDA");
+  }
 }

 bool called_stackbased_kernel = false;
@@ -302,7 +309,8 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsByNameAndNoneCanInf
     auto registrar1 = c10::RegisterOperators().op("_test::dummy", c10::RegisterOperators::options()
       .kernel<&stackBasedKernel>(c10::DispatchKey::CPU)
       .kernel<&stackBasedKernel>(c10::DispatchKey::CUDA)
-      .kernel<&stackBasedKernel>(c10::DispatchKey::XLA));
+      .kernel<&stackBasedKernel>(c10::DispatchKey::XLA)
+      .kernel<&stackBasedKernel>(c10::DispatchKey::Lazy));
   }, "Cannot infer operator schema for this kind of kernel in registration of operator _test::dummy");
 }

@@ -311,7 +319,8 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsBySchemaAndNoneCanI
   auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options()
     .kernel<&stackBasedKernel>(c10::DispatchKey::CPU)
     .kernel<&stackBasedKernel>(c10::DispatchKey::CUDA)
-    .kernel<&stackBasedKernel>(c10::DispatchKey::XLA));
+    .kernel<&stackBasedKernel>(c10::DispatchKey::XLA)
+    .kernel<&stackBasedKernel>(c10::DispatchKey::Lazy));

   auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""});
   ASSERT_TRUE(op.has_value()); // assert schema is registered
@@ -326,10 +335,12 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsBySchemaAndNoneCanI
   EXPECT_TRUE(called_stackbased_kernel);
   EXPECT_FALSE(called_kernel);

-  called_kernel = called_stackbased_kernel = false;
-  callOp(*op, dummyTensor(c10::DispatchKey::XLA));
-  EXPECT_TRUE(called_stackbased_kernel);
-  EXPECT_FALSE(called_kernel);
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
+    called_kernel = called_stackbased_kernel = false;
+    callOp(*op, dummyTensor(key));
+    EXPECT_TRUE(called_stackbased_kernel);
+    EXPECT_FALSE(called_kernel);
+  }
 }

 TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsByNameAndOnlyOneCanInferSchema_thenSucceeds) {
@@ -337,7 +348,8 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsByNameAndOnlyOneCan
   auto registrar1 = c10::RegisterOperators().op("_test::dummy", c10::RegisterOperators::options()
     .kernel<&stackBasedKernel>(c10::DispatchKey::CPU)
     .kernel(c10::DispatchKey::CUDA, &called_kernel)
-    .kernel<&stackBasedKernel>(c10::DispatchKey::XLA));
+    .kernel<&stackBasedKernel>(c10::DispatchKey::XLA)
+    .kernel<&stackBasedKernel>(c10::DispatchKey::Lazy));

   auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""});
   ASSERT_TRUE(op.has_value()); // assert schema is registered
@@ -352,10 +364,12 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsByNameAndOnlyOneCan
   EXPECT_FALSE(called_stackbased_kernel);
   EXPECT_TRUE(called_kernel);

-  called_kernel = called_stackbased_kernel = false;
-  callOp(*op, dummyTensor(c10::DispatchKey::XLA));
-  EXPECT_TRUE(called_stackbased_kernel);
-  EXPECT_FALSE(called_kernel);
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
+    called_kernel = called_stackbased_kernel = false;
+    callOp(*op, dummyTensor(key));
+    EXPECT_TRUE(called_stackbased_kernel);
+    EXPECT_FALSE(called_kernel);
+  }
 }

 TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsBySchemaAndOnlyOneCanInferSchema_thenSucceeds) {
@@ -363,7 +377,8 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsBySchemaAndOnlyOneC
   auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options()
     .kernel<&stackBasedKernel>(c10::DispatchKey::CPU)
     .kernel(c10::DispatchKey::CUDA, &called_kernel)
-    .kernel<&stackBasedKernel>(c10::DispatchKey::XLA));
+    .kernel<&stackBasedKernel>(c10::DispatchKey::XLA)
+    .kernel<&stackBasedKernel>(c10::DispatchKey::Lazy));

   auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""});
   ASSERT_TRUE(op.has_value()); // assert schema is registered
@@ -378,10 +393,12 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsBySchemaAndOnlyOneC
   EXPECT_FALSE(called_stackbased_kernel);
   EXPECT_TRUE(called_kernel);

-  called_kernel = called_stackbased_kernel = false;
-  callOp(*op, dummyTensor(c10::DispatchKey::XLA));
-  EXPECT_TRUE(called_stackbased_kernel);
-  EXPECT_FALSE(called_kernel);
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
+    called_kernel = called_stackbased_kernel = false;
+    callOp(*op, dummyTensor(key));
+    EXPECT_TRUE(called_stackbased_kernel);
+    EXPECT_FALSE(called_kernel);
+  }
 }

 struct DummyKernelWithIntParam final : OperatorKernel {
@@ -570,21 +587,21 @@ TEST(OperatorRegistrationTest, AutogradBackendOverridesAutogradKernel) {
   EXPECT_FALSE(called_nonautograd);
 }

-TEST(OperatorRegistrationTest, AutogradXLAOverridesAutogradKernel) {
+void LazyBackendsAutogradOverridesAutogradKernel(DispatchKey key) {
   auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options()
-    .kernel(DispatchKey::AutogradXLA)
+    .kernel(c10::getAutogradKeyFromBackend(key))
     .kernel(DispatchKey::Autograd));

   auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""});
   ASSERT_TRUE(op.has_value());

+  std::string expectedMessage = expectedMessageForBackend(key);
   expectThrows<c10::Error>([&] {
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA));
-  }, "Could not run '_test::dummy' with arguments from the 'XLA'"
-  " backend.");
+    callOp(*op, dummyTensor(key));
+  }, expectedMessage.c_str());

   called_nonautograd = called_autograd = false;
-  op->typed<void (Tensor)>().call(dummyTensor(DispatchKey::XLA, /*requires_grad=*/true));
+  op->typed<void (Tensor)>().call(dummyTensor(key, /*requires_grad=*/true));
   EXPECT_TRUE(called_nonautograd);
   EXPECT_FALSE(called_autograd);

@@ -594,7 +611,15 @@ TEST(OperatorRegistrationTest, AutogradXLAOverridesAutogradKernel) {
   EXPECT_FALSE(called_nonautograd);
 }

-TEST(OperatorRegistrationTest, whenRegisterWithXLAKernelAndCatchAll_AutogradXLAIsNotFilled) {
+TEST(OperatorRegistrationTest, AutogradXLAOverridesAutogradKernel) {
+  LazyBackendsAutogradOverridesAutogradKernel(DispatchKey::XLA);
+}
+
+TEST(OperatorRegistrationTest, AutogradLazyOverridesAutogradKernel) {
+  LazyBackendsAutogradOverridesAutogradKernel(DispatchKey::Lazy);
+}
+
+void whenRegisterWithLazyBackendsAndCatchAll_AutogradLazyBackendsIsNotFilled(DispatchKey key) {
   {
     auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options()
       .catchAllKernel());
@@ -603,38 +628,46 @@ TEST(OperatorRegistrationTest, whenRegisterWithXLAKernelAndCatchAll_AutogradXLAI
     ASSERT_TRUE(op.has_value());

     called_nonautograd = called_autograd = false;
-    op->typed<void (Tensor)>().call(dummyTensor(DispatchKey::XLA, /*requires_grad=*/true));
+    op->typed<void (Tensor)>().call(dummyTensor(key, /*requires_grad=*/true));
     EXPECT_TRUE(called_nonautograd);
     EXPECT_FALSE(called_autograd);

     called_nonautograd = called_autograd = false;
-    op->typed<void (Tensor)>().call(dummyTensor(DispatchKey::XLA));
+    op->typed<void (Tensor)>().call(dummyTensor(key));
     EXPECT_FALSE(called_autograd);
     EXPECT_TRUE(called_nonautograd);
   }
   {
     auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options()
-      .kernel(DispatchKey::XLA)
+      .kernel(key)
       .catchAllKernel());
     auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""});
     ASSERT_TRUE(op.has_value());

-    // When there's direct registration to XLA backend, AutogradXLA doesn't pick up catchAll
+    // When there's direct registration to XLA / Lazy backend, Autograd{XLA, Lazy} doesn't pick up catchAll
     // kernel in precompute but just keep fallthrough kernel from backend fallback.
-    // Thus it falls through AutogradXLA and reaches the kernel at XLA key.
+    // Thus it falls through Autograd{XLA, Lazy} and reaches the kernel at XLA / Lazy key.
     called_nonautograd = called_autograd = false;
-    op->typed<void (Tensor)>().call(dummyTensor(DispatchKey::XLA, /*requires_grad=*/true));
+    op->typed<void (Tensor)>().call(dummyTensor(key, /*requires_grad=*/true));
     EXPECT_FALSE(called_nonautograd);
     EXPECT_TRUE(called_autograd);

     called_nonautograd = called_autograd = false;
-    op->typed<void (Tensor)>().call(dummyTensor(DispatchKey::XLA));
+    op->typed<void (Tensor)>().call(dummyTensor(key));
     EXPECT_TRUE(called_autograd);
     EXPECT_FALSE(called_nonautograd);
   }
 }

+TEST(OperatorRegistrationTest, whenRegisterWithXLAKernelAndCatchAll_AutogradXLAIsNotFilled) {
+  whenRegisterWithLazyBackendsAndCatchAll_AutogradLazyBackendsIsNotFilled(DispatchKey::XLA);
+}
+
+TEST(OperatorRegistrationTest, whenRegisterWithLazyKernelAndCatchAll_AutogradLazyIsNotFilled) {
+  whenRegisterWithLazyBackendsAndCatchAll_AutogradLazyBackendsIsNotFilled(DispatchKey::Lazy);
+}
+
 TEST(OperatorRegistrationTest, givenLambdaKernel_whenRegisteringWithMismatchingCppSignatures_thenFails) {
   expectThrows<c10::Error>([] {
     auto registrar = c10::RegisterOperators().op("_test::dummy", c10::RegisterOperators::options()
@@ -1217,9 +1250,11 @@ TEST(NewOperatorRegistrationTest, testBasics) {
   m.def("dummy4", [](const Tensor& self, const Tensor& other) { return other; });
   m.impl("dummy", c10::DeviceType::CPU, [](const Tensor& self) { return self; });
   m.impl("dummy", c10::DeviceType::XLA, [](const Tensor& self) { return self; });
+  m.impl("dummy", c10::DeviceType::Lazy, [](const Tensor& self) { return self; });
   // Internal API
   m.impl("dummy2", c10::DispatchKey::CPU, [](const Tensor& self) { return self; });
   m.impl("dummy2", c10::DispatchKey::XLA, [](const Tensor& self) { return self; });
+  m.impl("dummy2", c10::DispatchKey::Lazy, [](const Tensor& self) { return self; });

   ASSERT_TRUE(Dispatcher::singleton().findSchema({"_test::dummy", ""}).has_value());
   // Should have a schema even if there are no impls
@@ -1345,15 +1380,15 @@ TEST(NewOperatorRegistrationTest, dispatchWithCompositeImplicitAutogradKernel) {
     ASSERT_TRUE(math_called);
   }

-  {
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
     math_called = false;
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA));
+    callOp(*op, dummyTensor(key));
     ASSERT_TRUE(math_called);
   }

-  {
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
     math_called = false;
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA, /*requires_grad=*/true));
+    callOp(*op, dummyTensor(key, /*requires_grad=*/true));
     ASSERT_TRUE(math_called);
   }

@@ -1523,16 +1558,16 @@ TEST(NewOperatorRegistrationTest, dispatchWithCompositeExplicitAutogradKernel) {
     ASSERT_TRUE(called);
   }

-  {
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
     called = false;
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA));
+    callOp(*op, dummyTensor(key));
     ASSERT_TRUE(called);
   }

-  {
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
     called = false;
-    // AutogradXLA is fallthrough, calls XLA kernel
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA, /*requires_grad=*/true));
+    // Autograd{XLA, Lazy} is fallthrough, calls XLA / Lazy kernel
+    callOp(*op, dummyTensor(key, /*requires_grad=*/true));
     ASSERT_TRUE(called);
   }

@@ -1575,17 +1610,17 @@ TEST(NewOperatorRegistrationTest, dispatchWithCompositeExplicitAutogradAndCompos
     ASSERT_TRUE(backend_called);
   }

-  {
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
     backend_called = math_called = false;
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA));
+    callOp(*op, dummyTensor(key));
     ASSERT_TRUE(backend_called);
     ASSERT_FALSE(math_called);
   }

-  {
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
     backend_called = math_called = false;
-    // AutogradXLA is fallthrough, calls XLA kernel
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA, /*requires_grad=*/true));
+    // Autograd{XLA, Lazy} is fallthrough, calls XLA / Lazy kernel
+    callOp(*op, dummyTensor(key, /*requires_grad=*/true));
     ASSERT_FALSE(math_called);
     ASSERT_TRUE(backend_called);
   }
@@ -1681,11 +1716,11 @@ TEST(NewOperatorRegistrationTest, dispatch) {
     ASSERT_TRUE(autograd_called);
   }

-  {
+  for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
     autograd_called = false;
     auto op = Dispatcher::singleton().findSchema({"test::fn_autograd", ""});
     ASSERT_TRUE(op.has_value());
-    callOp(*op, dummyTensor(c10::DispatchKey::XLA, /*requires_grad=*/true));
+    callOp(*op, dummyTensor(key, /*requires_grad=*/true));
     ASSERT_TRUE(autograd_called);
   }
 }
diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt
index dc6595e37e4e..2fe6aa856b0b 100644
--- a/aten/src/ATen/test/CMakeLists.txt
+++ b/aten/src/ATen/test/CMakeLists.txt
@@ -26,7 +26,7 @@ list(APPEND ATen_CPU_TEST_SRCS
   ${CMAKE_CURRENT_SOURCE_DIR}/quantized_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/extension_backend_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/operators_test.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/xla_tensor_test.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/lazy_tensor_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/tensor_iterator_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/math_kernel_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/memory_overlapping_test.cpp
diff --git a/aten/src/ATen/test/lazy_tensor_test.cpp b/aten/src/ATen/test/lazy_tensor_test.cpp
new file mode 100644
index 000000000000..4c42fc4e7788
--- /dev/null
+++ b/aten/src/ATen/test/lazy_tensor_test.cpp
@@ -0,0 +1,23 @@
+#include <gtest/gtest.h>
+
+#include <ATen/ATen.h>
+
+void LazyTensorTest(c10::DispatchKey dispatch_key, at::DeviceType device_type) {
+  auto tensor_impl =
+      c10::make_intrusive<c10::TensorImpl, c10::UndefinedTensorImpl>(
+          dispatch_key,
+          caffe2::TypeMeta::Make<float>(),
+          at::Device(device_type, 0));
+  at::Tensor t(std::move(tensor_impl));
+  ASSERT_TRUE(t.device() == at::Device(device_type, 0));
+}
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+TEST(XlaTensorTest, TestNoStorage) {
+  LazyTensorTest(at::DispatchKey::XLA, at::DeviceType::XLA);
+}
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+TEST(LazyTensorTest, TestNoStorage) {
+  LazyTensorTest(at::DispatchKey::Lazy, at::DeviceType::Lazy);
+}
diff --git a/aten/tools/run_tests.sh b/aten/tools/run_tests.sh
index 5fc9e00c3b41..a3ae3ef839cc 100755
--- a/aten/tools/run_tests.sh
+++ b/aten/tools/run_tests.sh
@@ -18,7 +18,7 @@ VALGRIND=${VALGRIND:=ON}
 ./tensor_interop_test
 ./undefined_tensor_test
 ./extension_backend_test
-./xla_tensor_test
+./lazy_tensor_test
 ./tensor_iterator_test
 ./Dimname_test
 ./Dict_test
diff --git a/c10/core/Backend.h b/c10/core/Backend.h
index 6fa4cf59ed3c..2f071345311f 100644
--- a/c10/core/Backend.h
+++ b/c10/core/Backend.h
@@ -51,6 +51,7 @@ enum class Backend {
   MkldnnCPU,
   MLC,
   HPU,
+  Lazy,
   NumOptions
 };

@@ -69,6 +70,8 @@ static inline Backend dispatchKeyToBackend(DispatchKey t) {
     return Backend::MSNPU;
   } else if (t == DispatchKey::XLA || t == DispatchKey::AutogradXLA) {
     return Backend::XLA;
+  } else if (t == DispatchKey::Lazy || t == DispatchKey::AutogradLazy) {
+    return Backend::Lazy;
   } else if (t == DispatchKey::MLC || t == DispatchKey::AutogradMLC) {
     return Backend::MLC;
   } else if (t == DispatchKey::Vulkan) {
@@ -124,6 +127,8 @@ static inline DispatchKey backendToDispatchKey(Backend b) {
       return DispatchKey::MSNPU;
     case Backend::XLA:
       return DispatchKey::XLA;
+    case Backend::Lazy:
+      return DispatchKey::Lazy;
     case Backend::XPU:
       return DispatchKey::XPU;
     case Backend::SparseXPU:
@@ -177,6 +182,8 @@ static inline DeviceType backendToDeviceType(Backend b) {
       return DeviceType::MSNPU;
     case Backend::XLA:
       return DeviceType::XLA;
+    case Backend::Lazy:
+      return DeviceType::Lazy;
     case Backend::SparseCPU:
       return DeviceType::CPU;
     case Backend::SparseCUDA:
@@ -232,6 +239,8 @@ static inline const char* toString(Backend b) {
       return "MSNPU";
     case Backend::XLA:
       return "XLA";
+    case Backend::Lazy:
+      return "Lazy";
     case Backend::MLC:
       return "MLC";
     case Backend::SparseCPU:
diff --git a/c10/core/Device.cpp b/c10/core/Device.cpp
index 82cfc3c0572e..ee6f1b473fe0 100644
--- a/c10/core/Device.cpp
+++ b/c10/core/Device.cpp
@@ -45,6 +45,7 @@ DeviceType parse_type(const std::string& device_string) {
       {"fpga", DeviceType::FPGA},
       {"msnpu", DeviceType::MSNPU},
       {"xla", DeviceType::XLA},
+      {"lazy", DeviceType::Lazy},
       {"vulkan", DeviceType::Vulkan},
       {"mlc", DeviceType::MLC},
       {"meta", DeviceType::Meta},
@@ -61,7 +62,7 @@ DeviceType parse_type(const std::string& device_string) {
   }
   TORCH_CHECK(
       false,
-      "Expected one of cpu, cuda, xpu, mkldnn, opengl, opencl, ideep, hip, ve, msnpu, mlc, xla, vulkan, meta, hpu device type at start of device string: ",
+      "Expected one of cpu, cuda, xpu, mkldnn, opengl, opencl, ideep, hip, ve, msnpu, mlc, xla, lazy, vulkan, meta, hpu device type at start of device string: ",
       device_string);
 }
 } // namespace
diff --git a/c10/core/Device.h b/c10/core/Device.h
index 599219e8504f..227d2cc512cc 100644
--- a/c10/core/Device.h
+++ b/c10/core/Device.h
@@ -103,7 +103,7 @@ struct C10_API Device final {

   /// Return true if the device supports arbirtary strides.
   bool supports_as_strided() const noexcept {
-    return type_ != DeviceType::XLA;
+    return type_ != DeviceType::XLA && type_ != DeviceType::Lazy;
   }

   /// Same string as returned from operator<<.
diff --git a/c10/core/DeviceType.cpp b/c10/core/DeviceType.cpp
index 54b80d491c4b..4ff939806f98 100644
--- a/c10/core/DeviceType.cpp
+++ b/c10/core/DeviceType.cpp
@@ -29,6 +29,8 @@ std::string DeviceTypeName(DeviceType d, bool lower_case) {
       return lower_case ? "msnpu" : "MSNPU";
     case DeviceType::XLA:
       return lower_case ? "xla" : "XLA";
+    case DeviceType::Lazy:
+      return lower_case ? "lazy" : "LAZY";
    case DeviceType::MLC:
"mlc" : "MLC"; case DeviceType::Vulkan: @@ -75,6 +77,7 @@ bool isValidDeviceType(DeviceType d) { case DeviceType::FPGA: case DeviceType::MSNPU: case DeviceType::XLA: + case DeviceType::Lazy: case DeviceType::MLC: case DeviceType::Vulkan: case DeviceType::Metal: diff --git a/c10/core/DeviceType.h b/c10/core/DeviceType.h index 6f60399bad7f..2ae028d14402 100644 --- a/c10/core/DeviceType.h +++ b/c10/core/DeviceType.h @@ -30,11 +30,12 @@ enum class DeviceType : int8_t { Meta = 14, // Meta (tensors with no data) HPU = 15, // HPU / HABANA VE = 16, // SX-Aurora / NEC + Lazy = 17, // Lazy Tensors // NB: If you add more devices: // - Change the implementations of DeviceTypeName and isValidDeviceType // in DeviceType.cpp // - Change the number below - COMPILE_TIME_MAX_DEVICE_TYPES = 17, + COMPILE_TIME_MAX_DEVICE_TYPES = 18, }; constexpr DeviceType kCPU = DeviceType::CPU; @@ -50,18 +51,19 @@ constexpr DeviceType kMetal = DeviceType::Metal; constexpr DeviceType kXPU = DeviceType::XPU; constexpr DeviceType kHPU = DeviceType::HPU; constexpr DeviceType kVE = DeviceType::VE; +constexpr DeviceType kLazy = DeviceType::Lazy; // define explicit int constant constexpr int COMPILE_TIME_MAX_DEVICE_TYPES = static_cast(DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES); static_assert( - COMPILE_TIME_MAX_DEVICE_TYPES <= 17, + COMPILE_TIME_MAX_DEVICE_TYPES <= 18, "Hey! You seem to be adding a lot of new DeviceTypes. The intent was " "for this constant to reflect the actual number of DeviceTypes we support " "in PyTorch; it's important that this number is not too large as we " "use this to allocate stack arrays in some places in our code. If you " - "are indeed just adding the 17th device type, feel free to change " + "are indeed just adding the 18th device type, feel free to change " "the check to 32; but if you are adding some sort of extensible device " "types registration, please be aware that you are affecting code that " "this number is small. Try auditing uses of this constant."); diff --git a/c10/core/DispatchKey.cpp b/c10/core/DispatchKey.cpp index 9732cd2833c2..5c414484b38f 100644 --- a/c10/core/DispatchKey.cpp +++ b/c10/core/DispatchKey.cpp @@ -23,6 +23,8 @@ const char* toString(DispatchKey t) { return "MSNPU"; case DispatchKey::XLA: return "XLA"; + case DispatchKey::Lazy: + return "Lazy"; case DispatchKey::MLC: return "MLC"; case DispatchKey::HPU: @@ -91,6 +93,8 @@ const char* toString(DispatchKey t) { return "AutogradCUDA"; case DispatchKey::AutogradXLA: return "AutogradXLA"; + case DispatchKey::AutogradLazy: + return "AutogradLazy"; case DispatchKey::AutogradMLC: return "AutogradMLC"; case DispatchKey::AutogradHPU: @@ -179,6 +183,8 @@ DispatchKey getAutogradKeyFromBackend(DispatchKey t) { return DispatchKey::AutogradCUDA; case DispatchKey::XLA: return DispatchKey::AutogradXLA; + case DispatchKey::Lazy: + return DispatchKey::AutogradLazy; case DispatchKey::MLC: return DispatchKey::AutogradMLC; case DispatchKey::HPU: diff --git a/c10/core/DispatchKey.h b/c10/core/DispatchKey.h index 87f053a144be..b22778fb616e 100644 --- a/c10/core/DispatchKey.h +++ b/c10/core/DispatchKey.h @@ -68,6 +68,7 @@ enum class DispatchKey : uint8_t { XPU, // For out of tree Intel's heterogeneous computing plug-in HPU, // For out of tree & closed source integration of HPU / Habana VE, // For out of tree & closed source integration of SX-Aurora / NEC + Lazy, // For lazy tensor backends // A meta tensor is a tensor without any data associated with it. (They // have also colloquially been referred to as tensors on the "null" device). 
@@ -229,6 +230,7 @@ enum class DispatchKey : uint8_t {
   AutogradCPU,
   AutogradCUDA,
   AutogradXLA,
+  AutogradLazy,
   AutogradXPU,
   AutogradMLC,
   AutogradHPU,
diff --git a/c10/core/DispatchKeySet.cpp b/c10/core/DispatchKeySet.cpp
index 679560d4de7c..a4e399dd734b 100644
--- a/c10/core/DispatchKeySet.cpp
+++ b/c10/core/DispatchKeySet.cpp
@@ -12,6 +12,7 @@ constexpr DispatchKeySet backend_dispatch_keyset = autogradother_backends |
         DispatchKey::CPU,
         DispatchKey::CUDA,
         DispatchKey::XLA,
+        DispatchKey::Lazy,
         DispatchKey::XPU,
         DispatchKey::PrivateUse1,
         DispatchKey::PrivateUse2,
@@ -57,6 +58,8 @@ DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t) {
       return DispatchKeySet(DispatchKey::CUDA);
     case DispatchKey::AutogradXLA:
       return DispatchKeySet(DispatchKey::XLA);
+    case DispatchKey::AutogradLazy:
+      return DispatchKeySet(DispatchKey::Lazy);
     case DispatchKey::AutogradMLC:
       return DispatchKeySet(DispatchKey::MLC);
     case DispatchKey::AutogradHPU:
diff --git a/c10/core/DispatchKeySet.h b/c10/core/DispatchKeySet.h
index 07a77e26cf92..0d3a25ea9d8d 100644
--- a/c10/core/DispatchKeySet.h
+++ b/c10/core/DispatchKeySet.h
@@ -212,6 +212,7 @@ constexpr DispatchKeySet autograd_dispatch_keyset = DispatchKeySet({
     DispatchKey::AutogradCPU,
     DispatchKey::AutogradCUDA,
     DispatchKey::AutogradXLA,
+    DispatchKey::AutogradLazy,
     DispatchKey::AutogradNestedTensor,
     DispatchKey::AutogradMLC,
     DispatchKey::AutogradHPU,
diff --git a/c10/core/TensorOptions.h b/c10/core/TensorOptions.h
index 2d2077b769e1..fff9433e270f 100644
--- a/c10/core/TensorOptions.h
+++ b/c10/core/TensorOptions.h
@@ -667,6 +667,8 @@ inline DispatchKey computeDispatchKey(
       return DispatchKey::MSNPU;
     case DeviceType::XLA:
       return DispatchKey::XLA;
+    case DeviceType::Lazy:
+      return DispatchKey::Lazy;
     case DeviceType::MLC:
       return DispatchKey::MLC;
     case DeviceType::Vulkan:
@@ -768,6 +770,9 @@ inline DeviceType dispatchKeyToDeviceType(DispatchKey dispatch_key) {
     case DispatchKey::XLA:
     case DispatchKey::AutogradXLA:
       return DeviceType::XLA;
+    case DispatchKey::Lazy:
+    case DispatchKey::AutogradLazy:
+      return DeviceType::Lazy;
     case DispatchKey::Vulkan:
       return DeviceType::Vulkan;
     case DispatchKey::Meta:
diff --git a/test/test_dispatch.py b/test/test_dispatch.py
index 62cd5d64a1af..3b7ac1c83100 100644
--- a/test/test_dispatch.py
+++ b/test/test_dispatch.py
@@ -789,7 +789,7 @@ CPU: registered at {}:5 :: () -> () [ boxed unboxed ]
 class TestPythonDispatcher(TestCase):
     def test_basic(self):
         dispatcher = PythonDispatcher()
-        dispatcher.register(["CPU", "XLA", "CompositeImplicitAutograd"])
+        dispatcher.register(["CPU", "XLA", "Lazy", "CompositeImplicitAutograd"])
         self.assertExpectedInline(
             dispatcher.dispatchTable(),
             '''\
@@ -799,16 +799,18 @@ key             kernel
 ---------------------------
 CPU             fn_CPU [kernel]
 XLA             fn_XLA [kernel]
+Lazy            fn_Lazy [kernel]
 QuantizedCPU    fn_CompositeImplicitAutograd [math kernel]
 AutogradOther   fn_CompositeImplicitAutograd [math kernel]
 AutogradCPU     fallthrough [backend fallback]
 AutogradXLA     fallthrough [backend fallback]
+AutogradLazy    fallthrough [backend fallback]
 '''
         )

     def test_math_autogradcpu(self):
         dispatcher = PythonDispatcher()
-        dispatcher.register(["CPU", "XLA", "CompositeImplicitAutograd", "AutogradCPU"])
+        dispatcher.register(["CPU", "XLA", "Lazy", "CompositeImplicitAutograd", "AutogradCPU"])
         self.assertExpectedInline(
             dispatcher.dispatchTable(),
             '''\
@@ -818,10 +820,12 @@ key             kernel
 ---------------------------
 CPU             fn_CPU [kernel]
 XLA             fn_XLA [kernel]
+Lazy            fn_Lazy [kernel]
 QuantizedCPU    fn_CompositeImplicitAutograd [math kernel]
 AutogradOther   fn_CompositeImplicitAutograd [math kernel]
 AutogradCPU     fn_AutogradCPU [kernel]
 AutogradXLA     fallthrough [backend fallback]
+AutogradLazy    fallthrough [backend fallback]
 '''
         )
         self.assertExpectedInline(
@@ -833,6 +837,7 @@ key             kernel
 ---------------------------
 CPU             fn_CPU
 XLA             fn_XLA
+Lazy            fn_Lazy
 AutogradCPU     fn_AutogradCPU
 CompositeImplicitAutograd[alias] fn_CompositeImplicitAutograd
 '''
         )

     def test_defaultbackend_autogradcpu(self):
         dispatcher = PythonDispatcher()
-        dispatcher.register(["CPU", "XLA", "CompositeExplicitAutograd", "AutogradCPU"])
+        dispatcher.register(["CPU", "XLA", "Lazy", "CompositeExplicitAutograd", "AutogradCPU"])
         self.assertExpectedInline(
             dispatcher.dispatchTable(),
             '''\
@@ -850,10 +855,12 @@ key             kernel
 ---------------------------
 CPU             fn_CPU [kernel]
 XLA             fn_XLA [kernel]
+Lazy            fn_Lazy [kernel]
 QuantizedCPU    fn_CompositeExplicitAutograd [default backend kernel]
 AutogradOther   fallthrough [backend fallback]
 AutogradCPU     fn_AutogradCPU [kernel]
 AutogradXLA     fallthrough [backend fallback]
+AutogradLazy    fallthrough [backend fallback]
 '''
         )
@@ -866,6 +873,7 @@ key             kernel
 ---------------------------
 CPU             fn_CPU
 XLA             fn_XLA
+Lazy            fn_Lazy
 AutogradCPU     fn_AutogradCPU
 CompositeExplicitAutograd[alias] fn_CompositeExplicitAutograd
 '''
         )
@@ -883,10 +891,12 @@ key             kernel
 ---------------------------
 CPU             fn_CPU [kernel]
 XLA             fn_CompositeImplicitAutograd [math kernel]
+Lazy            fn_CompositeImplicitAutograd [math kernel]
 QuantizedCPU    fn_QuantizedCPU [kernel]
 AutogradOther   ambiguous_autogradother [ambiguous autogradother]
 AutogradCPU     fallthrough [backend fallback]
 AutogradXLA     fn_CompositeImplicitAutograd [math kernel]
+AutogradLazy    fn_CompositeImplicitAutograd [math kernel]
 '''
         )
diff --git a/tools/codegen/model.py b/tools/codegen/model.py
index 2c619371416e..d6f02d5a6898 100644
--- a/tools/codegen/model.py
+++ b/tools/codegen/model.py
@@ -58,6 +58,7 @@ class DispatchKey(Enum):
     FPGA = auto()
     MSNPU = auto()
     XLA = auto()
+    Lazy = auto()
     Vulkan = auto()
     Metal = auto()
     XPU = auto()
@@ -89,6 +90,7 @@ class DispatchKey(Enum):
     AutogradCPU = auto()
     AutogradCUDA = auto()
     AutogradXLA = auto()
+    AutogradLazy = auto()
     AutogradNestedTensor = auto()
     AutogradXPU = auto()
     AutogradPrivateUse1 = auto()
diff --git a/torch/_python_dispatcher.py b/torch/_python_dispatcher.py
index d13b0aa92673..aa19a18efb3b 100644
--- a/torch/_python_dispatcher.py
+++ b/torch/_python_dispatcher.py
@@ -55,6 +55,7 @@ class PythonDispatcher:
         "CPU", "AutogradCPU",
         "QuantizedCPU", "AutogradOther",
         "XLA", "AutogradXLA",
+        "Lazy", "AutogradLazy",
     ]
     alias_keys = [
         "CompositeExplicitAutograd",
diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp
index 2c37f59990f9..0778cbc6012a 100644
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -109,6 +109,7 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) {
       .value("FPGA", c10::DeviceType::FPGA)
       .value("MSNPU", c10::DeviceType::MSNPU)
       .value("XLA", c10::DeviceType::XLA)
+      .value("Lazy", c10::DeviceType::Lazy)
       .value("MLC", c10::DeviceType::MLC)
       .value("HPU", c10::DeviceType::HPU)
       .value("Meta", c10::DeviceType::Meta)
diff --git a/torch/csrc/utils/python_dispatch.cpp b/torch/csrc/utils/python_dispatch.cpp
index 82056f93ec40..61dac97b8dc7 100644
--- a/torch/csrc/utils/python_dispatch.cpp
+++ b/torch/csrc/utils/python_dispatch.cpp
@@ -32,6 +32,7 @@ c10::optional<c10::DispatchKey> parseDispatchKey(const std::string& k) {
     {"CPU", c10::DispatchKey::CPU},
     {"CUDA", c10::DispatchKey::CUDA},
{"XLA", c10::DispatchKey::XLA}, + {"Lazy", c10::DispatchKey::Lazy}, {"QuantizedCPU", c10::DispatchKey::QuantizedCPU}, {"CompositeImplicitAutograd", c10::DispatchKey::CompositeImplicitAutograd}, {"Autograd", c10::DispatchKey::Autograd}, diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp index e8ef72060b34..819a3a4c5da8 100644 --- a/torch/csrc/utils/tensor_new.cpp +++ b/torch/csrc/utils/tensor_new.cpp @@ -319,6 +319,7 @@ void check_base_legacy_new(c10::DispatchKey dispatch_key, at::Layout expected_la dispatch_key == c10::DispatchKey::CUDA || dispatch_key == c10::DispatchKey::HIP || dispatch_key == c10::DispatchKey::XLA || + dispatch_key == c10::DispatchKey::Lazy || dispatch_key == c10::DispatchKey::XPU, "new(): expected DispatchKey: ", c10::DispatchKey::CPU, @@ -329,11 +330,13 @@ void check_base_legacy_new(c10::DispatchKey dispatch_key, at::Layout expected_la " or ", c10::DispatchKey::XLA, " or ", + c10::DispatchKey::Lazy, + " or ", c10::DispatchKey::XPU, " but got: ", dispatch_key); } else if(expected_layout == c10::kSparse) { - // NOTE: no sparse XLA + // NOTE: no sparse XLA or Lazy TORCH_CHECK( dispatch_key == c10::DispatchKey::SparseCPU || dispatch_key == c10::DispatchKey::SparseCUDA || diff --git a/torch/library.h b/torch/library.h index 2135efcf882c..0e85910664ee 100644 --- a/torch/library.h +++ b/torch/library.h @@ -292,6 +292,8 @@ inline CppFunction dispatch(c10::DeviceType type, Func&& raw_f) { return c10::DispatchKey::CUDA; case c10::DeviceType::XLA: return c10::DispatchKey::XLA; + case c10::DeviceType::Lazy: + return c10::DispatchKey::Lazy; case c10::DeviceType::MLC: return c10::DispatchKey::MLC; case c10::DeviceType::Meta: