Add device and key for lazy tensors (#61621)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61621

Test Plan: CI

Reviewed By: mruberry

Differential Revision: D29912934

Pulled By: asuhan

fbshipit-source-id: 493c32063a3e756d93cbf1d876563a35eaafb537
Author: Alex Suhan
Date: 2021-07-26 22:59:10 -07:00
Committed by: Facebook GitHub Bot
parent 2945a73d90
commit b176feec1e
24 changed files with 187 additions and 70 deletions
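
For orientation, a minimal usage sketch (not part of this diff) of what the new wiring enables; it only uses names introduced or touched below (parse_type, DeviceType::Lazy, Device::supports_as_strided):

#include <c10/core/Device.h>
#include <c10/util/Exception.h>

void lazy_device_sketch() {
  // "lazy" now parses to DeviceType::Lazy (see the parse_type change below).
  c10::Device dev("lazy");
  TORCH_CHECK(dev.type() == c10::DeviceType::Lazy);
  // Like XLA, lazy devices report that they do not support arbitrary strides.
  TORCH_CHECK(!dev.supports_as_strided());
}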

View File

@ -73,6 +73,9 @@ class TORCH_API Context {
static bool hasXLA() {
return c10::impl::hasDeviceGuardImpl(at::DeviceType::XLA);
}
static bool hasLazy() {
return c10::impl::hasDeviceGuardImpl(at::DeviceType::Lazy);
}
static bool hasMLC() {
return c10::impl::hasDeviceGuardImpl(at::DeviceType::MLC);
}
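
A hedged usage sketch (not part of the diff): hasLazy() is queried the same way as hasXLA(), and only returns true once a backend has installed a device guard for the Lazy device type:

#include <ATen/Context.h>

bool lazy_backend_present() {
  // True only if some backend registered a DeviceGuardImpl for DeviceType::Lazy.
  return at::Context::hasLazy();
}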

View File

@ -1343,9 +1343,9 @@ void TensorIteratorBase::build(TensorIteratorConfig& config) {
if (is_meta_) return;
// XLA tensors don't have storage, so they don't have an underlying data pointer.
// XLA and lazy tensors don't have storage, so they don't have an underlying data pointer.
// Nothing beyond this point is important for meta functions, so it's fine to exit early here.
if (common_device_.type() == DeviceType::XLA) return;
if (common_device_.type() == DeviceType::XLA || common_device_.type() == DeviceType::Lazy) return;
for (auto& op : operands_) {
TORCH_INTERNAL_ASSERT(op.tensor->defined());

View File

@ -52,6 +52,10 @@ TORCH_LIBRARY_IMPL(_, AutogradXLA, m) {
m.fallback(torch::CppFunction::makeFallthrough());
}
TORCH_LIBRARY_IMPL(_, AutogradLazy, m) {
m.fallback(torch::CppFunction::makeFallthrough());
}
TORCH_LIBRARY_IMPL(_, AutogradMLC, m) {
m.fallback(torch::CppFunction::makeFallthrough());
}
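
With the AutogradLazy fallthrough registered above, autograd-enabled lazy tensors fall straight through to the backend kernel. An illustrative, hypothetical out-of-tree registration under the new key might look like this (my_lazy_add and its recording logic are assumptions, not part of this PR):

#include <torch/library.h>
#include <ATen/ATen.h>

// Hypothetical lazy kernel for aten::add.Tensor; a real backend would record
// the op into its lazy IR instead of returning the input unchanged.
at::Tensor my_lazy_add(const at::Tensor& self, const at::Tensor& /*other*/,
                       const at::Scalar& /*alpha*/) {
  return self;
}

TORCH_LIBRARY_IMPL(aten, Lazy, m) {
  m.impl("add.Tensor", my_lazy_add);
}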

View File

@ -257,6 +257,11 @@ TEST(OperatorRegistrationTest, whenRegisteringCPUTensorType_thenCanOnlyCallUnbox
" backend.");
}
std::string expectedMessageForBackend(DispatchKey key) {
std::string key_str(c10::toString(key));
return "Could not run '_test::dummy' with arguments from the '" + key_str + "' backend";
}
TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsInSameOpCallAndCalling_thenCallsCorrectKernel) {
bool called_kernel1 = false;
bool called_kernel2 = false;
@ -277,18 +282,20 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsInSameOpCallAndCall
EXPECT_FALSE(called_kernel1);
EXPECT_TRUE(called_kernel2);
expectThrows<c10::Error>([&] {
callOp(*op, dummyTensor(c10::DispatchKey::XLA));
}, "Could not run '_test::dummy' with arguments from the 'XLA'"
" backend.");
for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
std::string expectMessage = expectedMessageForBackend(key);
expectThrows<c10::Error>([&] {
callOp(*op, dummyTensor(key));
}, expectMessage.c_str());
// also assert that the error message contains the available tensor type ids, but don't assert their order
expectThrows<c10::Error>([&] {
callOp(*op, dummyTensor(c10::DispatchKey::XLA));
}, "CPU");
expectThrows<c10::Error>([&] {
callOp(*op, dummyTensor(c10::DispatchKey::XLA));
}, "CUDA");
// also assert that the error message contains the available tensor type ids, but don't assert their order
expectThrows<c10::Error>([&] {
callOp(*op, dummyTensor(key));
}, "CPU");
expectThrows<c10::Error>([&] {
callOp(*op, dummyTensor(key));
}, "CUDA");
}
}
bool called_stackbased_kernel = false;
@ -302,7 +309,8 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsByNameAndNoneCanInf
auto registrar1 = c10::RegisterOperators().op("_test::dummy", c10::RegisterOperators::options()
.kernel<&stackBasedKernel>(c10::DispatchKey::CPU)
.kernel<&stackBasedKernel>(c10::DispatchKey::CUDA)
.kernel<&stackBasedKernel>(c10::DispatchKey::XLA));
.kernel<&stackBasedKernel>(c10::DispatchKey::XLA)
.kernel<&stackBasedKernel>(c10::DispatchKey::Lazy));
}, "Cannot infer operator schema for this kind of kernel in registration of operator _test::dummy");
}
@ -311,7 +319,8 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsBySchemaAndNoneCanI
auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options()
.kernel<&stackBasedKernel>(c10::DispatchKey::CPU)
.kernel<&stackBasedKernel>(c10::DispatchKey::CUDA)
.kernel<&stackBasedKernel>(c10::DispatchKey::XLA));
.kernel<&stackBasedKernel>(c10::DispatchKey::XLA)
.kernel<&stackBasedKernel>(c10::DispatchKey::Lazy));
auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""});
ASSERT_TRUE(op.has_value()); // assert schema is registered
@ -326,10 +335,12 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsBySchemaAndNoneCanI
EXPECT_TRUE(called_stackbased_kernel);
EXPECT_FALSE(called_kernel);
called_kernel = called_stackbased_kernel = false;
callOp(*op, dummyTensor(c10::DispatchKey::XLA));
EXPECT_TRUE(called_stackbased_kernel);
EXPECT_FALSE(called_kernel);
for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
called_kernel = called_stackbased_kernel = false;
callOp(*op, dummyTensor(key));
EXPECT_TRUE(called_stackbased_kernel);
EXPECT_FALSE(called_kernel);
}
}
TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsByNameAndOnlyOneCanInferSchema_thenSucceeds) {
@ -337,7 +348,8 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsByNameAndOnlyOneCan
auto registrar1 = c10::RegisterOperators().op("_test::dummy", c10::RegisterOperators::options()
.kernel<&stackBasedKernel>(c10::DispatchKey::CPU)
.kernel<MockKernel>(c10::DispatchKey::CUDA, &called_kernel)
.kernel<&stackBasedKernel>(c10::DispatchKey::XLA));
.kernel<&stackBasedKernel>(c10::DispatchKey::XLA)
.kernel<&stackBasedKernel>(c10::DispatchKey::Lazy));
auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""});
ASSERT_TRUE(op.has_value()); // assert schema is registered
@ -352,10 +364,12 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsByNameAndOnlyOneCan
EXPECT_FALSE(called_stackbased_kernel);
EXPECT_TRUE(called_kernel);
called_kernel = called_stackbased_kernel = false;
callOp(*op, dummyTensor(c10::DispatchKey::XLA));
EXPECT_TRUE(called_stackbased_kernel);
EXPECT_FALSE(called_kernel);
for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
called_kernel = called_stackbased_kernel = false;
callOp(*op, dummyTensor(key));
EXPECT_TRUE(called_stackbased_kernel);
EXPECT_FALSE(called_kernel);
}
}
TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsBySchemaAndOnlyOneCanInferSchema_thenSucceeds) {
@ -363,7 +377,8 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsBySchemaAndOnlyOneC
auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options()
.kernel<&stackBasedKernel>(c10::DispatchKey::CPU)
.kernel<MockKernel>(c10::DispatchKey::CUDA, &called_kernel)
.kernel<&stackBasedKernel>(c10::DispatchKey::XLA));
.kernel<&stackBasedKernel>(c10::DispatchKey::XLA)
.kernel<&stackBasedKernel>(c10::DispatchKey::Lazy));
auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""});
ASSERT_TRUE(op.has_value()); // assert schema is registered
@ -378,10 +393,12 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsBySchemaAndOnlyOneC
EXPECT_FALSE(called_stackbased_kernel);
EXPECT_TRUE(called_kernel);
called_kernel = called_stackbased_kernel = false;
callOp(*op, dummyTensor(c10::DispatchKey::XLA));
EXPECT_TRUE(called_stackbased_kernel);
EXPECT_FALSE(called_kernel);
for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
called_kernel = called_stackbased_kernel = false;
callOp(*op, dummyTensor(key));
EXPECT_TRUE(called_stackbased_kernel);
EXPECT_FALSE(called_kernel);
}
}
struct DummyKernelWithIntParam final : OperatorKernel {
@ -570,21 +587,21 @@ TEST(OperatorRegistrationTest, AutogradBackendOverridesAutogradKernel) {
EXPECT_FALSE(called_nonautograd);
}
TEST(OperatorRegistrationTest, AutogradXLAOverridesAutogradKernel) {
void LazyBackendsAutogradOverridesAutogradKernel(DispatchKey key) {
auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options()
.kernel<decltype(nonautograd_kernel), &nonautograd_kernel>(DispatchKey::AutogradXLA)
.kernel<decltype(nonautograd_kernel), &nonautograd_kernel>(c10::getAutogradKeyFromBackend(key))
.kernel<decltype(autograd_kernel), &autograd_kernel>(DispatchKey::Autograd));
auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""});
ASSERT_TRUE(op.has_value());
std::string expectedMessage = expectedMessageForBackend(key);
expectThrows<c10::Error>([&] {
callOp(*op, dummyTensor(c10::DispatchKey::XLA));
}, "Could not run '_test::dummy' with arguments from the 'XLA'"
" backend.");
callOp(*op, dummyTensor(key));
}, expectedMessage.c_str());
called_nonautograd = called_autograd = false;
op->typed<void (Tensor)>().call(dummyTensor(DispatchKey::XLA, /*requires_grad=*/true));
op->typed<void (Tensor)>().call(dummyTensor(key, /*requires_grad=*/true));
EXPECT_TRUE(called_nonautograd);
EXPECT_FALSE(called_autograd);
@ -594,7 +611,15 @@ TEST(OperatorRegistrationTest, AutogradXLAOverridesAutogradKernel) {
EXPECT_FALSE(called_nonautograd);
}
TEST(OperatorRegistrationTest, whenRegisterWithXLAKernelAndCatchAll_AutogradXLAIsNotFilled) {
TEST(OperatorRegistrationTest, AutogradXLAOverridesAutogradKernel) {
LazyBackendsAutogradOverridesAutogradKernel(DispatchKey::XLA);
}
TEST(OperatorRegistrationTest, AutogradLazyOverridesAutogradKernel) {
LazyBackendsAutogradOverridesAutogradKernel(DispatchKey::Lazy);
}
void whenRegisterWithLazyBackendsAndCatchAll_AutogradLazyBackendsIsNotFilled(DispatchKey key) {
{
auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options()
.catchAllKernel<decltype(nonautograd_kernel), nonautograd_kernel>());
@ -603,38 +628,46 @@ TEST(OperatorRegistrationTest, whenRegisterWithXLAKernelAndCatchAll_AutogradXLAI
ASSERT_TRUE(op.has_value());
called_nonautograd = called_autograd = false;
op->typed<void (Tensor)>().call(dummyTensor(DispatchKey::XLA, /*requires_grad=*/true));
op->typed<void (Tensor)>().call(dummyTensor(key, /*requires_grad=*/true));
EXPECT_TRUE(called_nonautograd);
EXPECT_FALSE(called_autograd);
called_nonautograd = called_autograd = false;
op->typed<void (Tensor)>().call(dummyTensor(DispatchKey::XLA));
op->typed<void (Tensor)>().call(dummyTensor(key));
EXPECT_FALSE(called_autograd);
EXPECT_TRUE(called_nonautograd);
}
{
auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options()
.kernel<decltype(autograd_kernel), &autograd_kernel>(DispatchKey::XLA)
.kernel<decltype(autograd_kernel), &autograd_kernel>(key)
.catchAllKernel<decltype(nonautograd_kernel), nonautograd_kernel>());
auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""});
ASSERT_TRUE(op.has_value());
// When there's direct registration to XLA backend, AutogradXLA doesn't pick up catchAll
// When there's direct registration to XLA / Lazy backend, Autograd{XLA, Lazy} doesn't pick up catchAll
// kernel in precompute but just keeps fallthrough kernel from backend fallback.
// Thus it falls through AutogradXLA and reaches the kernel at XLA key.
// Thus it falls through Autograd{XLA, Lazy} and reaches the kernel at XLA / Lazy key.
called_nonautograd = called_autograd = false;
op->typed<void (Tensor)>().call(dummyTensor(DispatchKey::XLA, /*requires_grad=*/true));
op->typed<void (Tensor)>().call(dummyTensor(key, /*requires_grad=*/true));
EXPECT_FALSE(called_nonautograd);
EXPECT_TRUE(called_autograd);
called_nonautograd = called_autograd = false;
op->typed<void (Tensor)>().call(dummyTensor(DispatchKey::XLA));
op->typed<void (Tensor)>().call(dummyTensor(key));
EXPECT_TRUE(called_autograd);
EXPECT_FALSE(called_nonautograd);
}
}
TEST(OperatorRegistrationTest, whenRegisterWithXLAKernelAndCatchAll_AutogradXLAIsNotFilled) {
whenRegisterWithLazyBackendsAndCatchAll_AutogradLazyBackendsIsNotFilled(DispatchKey::XLA);
}
TEST(OperatorRegistrationTest, whenRegisterWithLazyKernelAndCatchAll_AutogradLazyIsNotFilled) {
whenRegisterWithLazyBackendsAndCatchAll_AutogradLazyBackendsIsNotFilled(DispatchKey::Lazy);
}
TEST(OperatorRegistrationTest, givenLambdaKernel_whenRegisteringWithMismatchingCppSignatures_thenFails) {
expectThrows<c10::Error>([] {
auto registrar = c10::RegisterOperators().op("_test::dummy", c10::RegisterOperators::options()
@ -1217,9 +1250,11 @@ TEST(NewOperatorRegistrationTest, testBasics) {
m.def("dummy4", [](const Tensor& self, const Tensor& other) { return other; });
m.impl("dummy", c10::DeviceType::CPU, [](const Tensor& self) { return self; });
m.impl("dummy", c10::DeviceType::XLA, [](const Tensor& self) { return self; });
m.impl("dummy", c10::DeviceType::Lazy, [](const Tensor& self) { return self; });
// Internal API
m.impl("dummy2", c10::DispatchKey::CPU, [](const Tensor& self) { return self; });
m.impl("dummy2", c10::DispatchKey::XLA, [](const Tensor& self) { return self; });
m.impl("dummy2", c10::DispatchKey::Lazy, [](const Tensor& self) { return self; });
ASSERT_TRUE(Dispatcher::singleton().findSchema({"_test::dummy", ""}).has_value());
// Should have a schema even if there are no impls
@ -1345,15 +1380,15 @@ TEST(NewOperatorRegistrationTest, dispatchWithCompositeImplicitAutogradKernel) {
ASSERT_TRUE(math_called);
}
{
for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
math_called = false;
callOp(*op, dummyTensor(c10::DispatchKey::XLA));
callOp(*op, dummyTensor(key));
ASSERT_TRUE(math_called);
}
{
for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
math_called = false;
callOp(*op, dummyTensor(c10::DispatchKey::XLA, /*requires_grad=*/true));
callOp(*op, dummyTensor(key, /*requires_grad=*/true));
ASSERT_TRUE(math_called);
}
@ -1523,16 +1558,16 @@ TEST(NewOperatorRegistrationTest, dispatchWithCompositeExplicitAutogradKernel) {
ASSERT_TRUE(called);
}
{
for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
called = false;
callOp(*op, dummyTensor(c10::DispatchKey::XLA));
callOp(*op, dummyTensor(key));
ASSERT_TRUE(called);
}
{
for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
called = false;
// AutogradXLA is fallthrough, calls XLA kernel
callOp(*op, dummyTensor(c10::DispatchKey::XLA, /*requires_grad=*/true));
// Autograd{XLA, Lazy} is fallthrough, calls XLA / Lazy kernel
callOp(*op, dummyTensor(key, /*requires_grad=*/true));
ASSERT_TRUE(called);
}
@ -1575,17 +1610,17 @@ TEST(NewOperatorRegistrationTest, dispatchWithCompositeExplicitAutogradAndCompos
ASSERT_TRUE(backend_called);
}
{
for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
backend_called = math_called = false;
callOp(*op, dummyTensor(c10::DispatchKey::XLA));
callOp(*op, dummyTensor(key));
ASSERT_TRUE(backend_called);
ASSERT_FALSE(math_called);
}
{
for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
backend_called = math_called = false;
// AutogradXLA is fallthrough, calls XLA kernel
callOp(*op, dummyTensor(c10::DispatchKey::XLA, /*requires_grad=*/true));
// Autograd{XLA, Lazy} is fallthrough, calls XLA / Lazy kernel
callOp(*op, dummyTensor(key, /*requires_grad=*/true));
ASSERT_FALSE(math_called);
ASSERT_TRUE(backend_called);
}
@ -1681,11 +1716,11 @@ TEST(NewOperatorRegistrationTest, dispatch) {
ASSERT_TRUE(autograd_called);
}
{
for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
autograd_called = false;
auto op = Dispatcher::singleton().findSchema({"test::fn_autograd", ""});
ASSERT_TRUE(op.has_value());
callOp(*op, dummyTensor(c10::DispatchKey::XLA, /*requires_grad=*/true));
callOp(*op, dummyTensor(key, /*requires_grad=*/true));
ASSERT_TRUE(autograd_called);
}
}

View File

@ -26,7 +26,7 @@ list(APPEND ATen_CPU_TEST_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/quantized_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/extension_backend_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/operators_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/xla_tensor_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/lazy_tensor_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/tensor_iterator_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/math_kernel_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/memory_overlapping_test.cpp

View File

@ -0,0 +1,23 @@
#include <gtest/gtest.h>
#include <ATen/ATen.h>
void LazyTensorTest(c10::DispatchKey dispatch_key, at::DeviceType device_type) {
auto tensor_impl =
c10::make_intrusive<c10::TensorImpl, c10::UndefinedTensorImpl>(
dispatch_key,
caffe2::TypeMeta::Make<float>(),
at::Device(device_type, 0));
at::Tensor t(std::move(tensor_impl));
ASSERT_TRUE(t.device() == at::Device(device_type, 0));
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(XlaTensorTest, TestNoStorage) {
LazyTensorTest(at::DispatchKey::XLA, at::DeviceType::XLA);
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(LazyTensorTest, TestNoStorage) {
LazyTensorTest(at::DispatchKey::Lazy, at::DeviceType::Lazy);
}

View File

@ -18,7 +18,7 @@ VALGRIND=${VALGRIND:=ON}
./tensor_interop_test
./undefined_tensor_test
./extension_backend_test
./xla_tensor_test
./lazy_tensor_test
./tensor_iterator_test
./Dimname_test
./Dict_test

View File

@ -51,6 +51,7 @@ enum class Backend {
MkldnnCPU,
MLC,
HPU,
Lazy,
NumOptions
};
@ -69,6 +70,8 @@ static inline Backend dispatchKeyToBackend(DispatchKey t) {
return Backend::MSNPU;
} else if (t == DispatchKey::XLA || t == DispatchKey::AutogradXLA) {
return Backend::XLA;
} else if (t == DispatchKey::Lazy || t == DispatchKey::AutogradLazy) {
return Backend::Lazy;
} else if (t == DispatchKey::MLC || t == DispatchKey::AutogradMLC) {
return Backend::MLC;
} else if (t == DispatchKey::Vulkan) {
@ -124,6 +127,8 @@ static inline DispatchKey backendToDispatchKey(Backend b) {
return DispatchKey::MSNPU;
case Backend::XLA:
return DispatchKey::XLA;
case Backend::Lazy:
return DispatchKey::Lazy;
case Backend::XPU:
return DispatchKey::XPU;
case Backend::SparseXPU:
@ -177,6 +182,8 @@ static inline DeviceType backendToDeviceType(Backend b) {
return DeviceType::MSNPU;
case Backend::XLA:
return DeviceType::XLA;
case Backend::Lazy:
return DeviceType::Lazy;
case Backend::SparseCPU:
return DeviceType::CPU;
case Backend::SparseCUDA:
@ -232,6 +239,8 @@ static inline const char* toString(Backend b) {
return "MSNPU";
case Backend::XLA:
return "XLA";
case Backend::Lazy:
return "Lazy";
case Backend::MLC:
return "MLC";
case Backend::SparseCPU:

View File

@ -45,6 +45,7 @@ DeviceType parse_type(const std::string& device_string) {
{"fpga", DeviceType::FPGA},
{"msnpu", DeviceType::MSNPU},
{"xla", DeviceType::XLA},
{"lazy", DeviceType::Lazy},
{"vulkan", DeviceType::Vulkan},
{"mlc", DeviceType::MLC},
{"meta", DeviceType::Meta},
@ -61,7 +62,7 @@ DeviceType parse_type(const std::string& device_string) {
}
TORCH_CHECK(
false,
"Expected one of cpu, cuda, xpu, mkldnn, opengl, opencl, ideep, hip, ve, msnpu, mlc, xla, vulkan, meta, hpu device type at start of device string: ",
"Expected one of cpu, cuda, xpu, mkldnn, opengl, opencl, ideep, hip, ve, msnpu, mlc, xla, lazy, vulkan, meta, hpu device type at start of device string: ",
device_string);
}
} // namespace

View File

@ -103,7 +103,7 @@ struct C10_API Device final {
/// Return true if the device supports arbitrary strides.
bool supports_as_strided() const noexcept {
return type_ != DeviceType::XLA;
return type_ != DeviceType::XLA && type_ != DeviceType::Lazy;
}
/// Same string as returned from operator<<.

View File

@ -29,6 +29,8 @@ std::string DeviceTypeName(DeviceType d, bool lower_case) {
return lower_case ? "msnpu" : "MSNPU";
case DeviceType::XLA:
return lower_case ? "xla" : "XLA";
case DeviceType::Lazy:
return lower_case ? "lazy" : "LAZY";
case DeviceType::MLC:
return lower_case ? "mlc" : "MLC";
case DeviceType::Vulkan:
@ -75,6 +77,7 @@ bool isValidDeviceType(DeviceType d) {
case DeviceType::FPGA:
case DeviceType::MSNPU:
case DeviceType::XLA:
case DeviceType::Lazy:
case DeviceType::MLC:
case DeviceType::Vulkan:
case DeviceType::Metal:

View File

@ -30,11 +30,12 @@ enum class DeviceType : int8_t {
Meta = 14, // Meta (tensors with no data)
HPU = 15, // HPU / HABANA
VE = 16, // SX-Aurora / NEC
Lazy = 17, // Lazy Tensors
// NB: If you add more devices:
// - Change the implementations of DeviceTypeName and isValidDeviceType
// in DeviceType.cpp
// - Change the number below
COMPILE_TIME_MAX_DEVICE_TYPES = 17,
COMPILE_TIME_MAX_DEVICE_TYPES = 18,
};
constexpr DeviceType kCPU = DeviceType::CPU;
@ -50,18 +51,19 @@ constexpr DeviceType kMetal = DeviceType::Metal;
constexpr DeviceType kXPU = DeviceType::XPU;
constexpr DeviceType kHPU = DeviceType::HPU;
constexpr DeviceType kVE = DeviceType::VE;
constexpr DeviceType kLazy = DeviceType::Lazy;
// define explicit int constant
constexpr int COMPILE_TIME_MAX_DEVICE_TYPES =
static_cast<int>(DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES);
static_assert(
COMPILE_TIME_MAX_DEVICE_TYPES <= 17,
COMPILE_TIME_MAX_DEVICE_TYPES <= 18,
"Hey! You seem to be adding a lot of new DeviceTypes. The intent was "
"for this constant to reflect the actual number of DeviceTypes we support "
"in PyTorch; it's important that this number is not too large as we "
"use this to allocate stack arrays in some places in our code. If you "
"are indeed just adding the 17th device type, feel free to change "
"are indeed just adding the 18th device type, feel free to change "
"the check to 32; but if you are adding some sort of extensible device "
"types registration, please be aware that you are affecting code that "
"this number is small. Try auditing uses of this constant.");

View File

@ -23,6 +23,8 @@ const char* toString(DispatchKey t) {
return "MSNPU";
case DispatchKey::XLA:
return "XLA";
case DispatchKey::Lazy:
return "Lazy";
case DispatchKey::MLC:
return "MLC";
case DispatchKey::HPU:
@ -91,6 +93,8 @@ const char* toString(DispatchKey t) {
return "AutogradCUDA";
case DispatchKey::AutogradXLA:
return "AutogradXLA";
case DispatchKey::AutogradLazy:
return "AutogradLazy";
case DispatchKey::AutogradMLC:
return "AutogradMLC";
case DispatchKey::AutogradHPU:
@ -179,6 +183,8 @@ DispatchKey getAutogradKeyFromBackend(DispatchKey t) {
return DispatchKey::AutogradCUDA;
case DispatchKey::XLA:
return DispatchKey::AutogradXLA;
case DispatchKey::Lazy:
return DispatchKey::AutogradLazy;
case DispatchKey::MLC:
return DispatchKey::AutogradMLC;
case DispatchKey::HPU:
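
A hedged sketch (not part of the diff) of the new mapping and the string names added in this file:

#include <cstring>
#include <c10/core/DispatchKey.h>
#include <c10/util/Exception.h>

void lazy_key_mapping_sketch() {
  // The Lazy backend key now has a matching per-backend autograd key...
  TORCH_CHECK(c10::getAutogradKeyFromBackend(c10::DispatchKey::Lazy) ==
              c10::DispatchKey::AutogradLazy);
  // ...and both keys stringify with the names added above.
  TORCH_CHECK(std::strcmp(c10::toString(c10::DispatchKey::Lazy), "Lazy") == 0);
  TORCH_CHECK(std::strcmp(c10::toString(c10::DispatchKey::AutogradLazy), "AutogradLazy") == 0);
}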

View File

@ -68,6 +68,7 @@ enum class DispatchKey : uint8_t {
XPU, // For out of tree Intel's heterogeneous computing plug-in
HPU, // For out of tree & closed source integration of HPU / Habana
VE, // For out of tree & closed source integration of SX-Aurora / NEC
Lazy, // For lazy tensor backends
// A meta tensor is a tensor without any data associated with it. (They
// have also colloquially been referred to as tensors on the "null" device).
@ -229,6 +230,7 @@ enum class DispatchKey : uint8_t {
AutogradCPU,
AutogradCUDA,
AutogradXLA,
AutogradLazy,
AutogradXPU,
AutogradMLC,
AutogradHPU,

View File

@ -12,6 +12,7 @@ constexpr DispatchKeySet backend_dispatch_keyset = autogradother_backends |
DispatchKey::CPU,
DispatchKey::CUDA,
DispatchKey::XLA,
DispatchKey::Lazy,
DispatchKey::XPU,
DispatchKey::PrivateUse1,
DispatchKey::PrivateUse2,
@ -57,6 +58,8 @@ DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t) {
return DispatchKeySet(DispatchKey::CUDA);
case DispatchKey::AutogradXLA:
return DispatchKeySet(DispatchKey::XLA);
case DispatchKey::AutogradLazy:
return DispatchKeySet(DispatchKey::Lazy);
case DispatchKey::AutogradMLC:
return DispatchKeySet(DispatchKey::MLC);
case DispatchKey::AutogradHPU:

View File

@ -212,6 +212,7 @@ constexpr DispatchKeySet autograd_dispatch_keyset = DispatchKeySet({
DispatchKey::AutogradCPU,
DispatchKey::AutogradCUDA,
DispatchKey::AutogradXLA,
DispatchKey::AutogradLazy,
DispatchKey::AutogradNestedTensor,
DispatchKey::AutogradMLC,
DispatchKey::AutogradHPU,

View File

@ -667,6 +667,8 @@ inline DispatchKey computeDispatchKey(
return DispatchKey::MSNPU;
case DeviceType::XLA:
return DispatchKey::XLA;
case DeviceType::Lazy:
return DispatchKey::Lazy;
case DeviceType::MLC:
return DispatchKey::MLC;
case DeviceType::Vulkan:
@ -768,6 +770,9 @@ inline DeviceType dispatchKeyToDeviceType(DispatchKey dispatch_key) {
case DispatchKey::XLA:
case DispatchKey::AutogradXLA:
return DeviceType::XLA;
case DispatchKey::Lazy:
case DispatchKey::AutogradLazy:
return DeviceType::Lazy;
case DispatchKey::Vulkan:
return DeviceType::Vulkan;
case DispatchKey::Meta:
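
A hedged round-trip sketch for the two helpers touched here (assuming their existing signatures in TensorOptions.h):

#include <c10/core/TensorOptions.h>
#include <c10/util/Exception.h>

void lazy_dispatch_roundtrip_sketch() {
  // DeviceType::Lazy now selects DispatchKey::Lazy (dtype/layout left at defaults)...
  c10::DispatchKey key = c10::computeDispatchKey(
      c10::nullopt, c10::nullopt, c10::Device(c10::DeviceType::Lazy));
  TORCH_CHECK(key == c10::DispatchKey::Lazy);
  // ...and both Lazy and AutogradLazy map back to DeviceType::Lazy.
  TORCH_CHECK(c10::dispatchKeyToDeviceType(c10::DispatchKey::AutogradLazy) ==
              c10::DeviceType::Lazy);
}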

View File

@ -789,7 +789,7 @@ CPU: registered at {}:5 :: () -> () [ boxed unboxed ]
class TestPythonDispatcher(TestCase):
def test_basic(self):
dispatcher = PythonDispatcher()
dispatcher.register(["CPU", "XLA", "CompositeImplicitAutograd"])
dispatcher.register(["CPU", "XLA", "Lazy", "CompositeImplicitAutograd"])
self.assertExpectedInline(
dispatcher.dispatchTable(),
'''\
@ -799,16 +799,18 @@ key kernel
---------------------------
CPU fn_CPU [kernel]
XLA fn_XLA [kernel]
Lazy fn_Lazy [kernel]
QuantizedCPU fn_CompositeImplicitAutograd [math kernel]
AutogradOther fn_CompositeImplicitAutograd [math kernel]
AutogradCPU fallthrough [backend fallback]
AutogradXLA fallthrough [backend fallback]
AutogradLazy fallthrough [backend fallback]
'''
)
def test_math_autogradcpu(self):
dispatcher = PythonDispatcher()
dispatcher.register(["CPU", "XLA", "CompositeImplicitAutograd", "AutogradCPU"])
dispatcher.register(["CPU", "XLA", "Lazy", "CompositeImplicitAutograd", "AutogradCPU"])
self.assertExpectedInline(
dispatcher.dispatchTable(),
'''\
@ -818,10 +820,12 @@ key kernel
---------------------------
CPU fn_CPU [kernel]
XLA fn_XLA [kernel]
Lazy fn_Lazy [kernel]
QuantizedCPU fn_CompositeImplicitAutograd [math kernel]
AutogradOther fn_CompositeImplicitAutograd [math kernel]
AutogradCPU fn_AutogradCPU [kernel]
AutogradXLA fallthrough [backend fallback]
AutogradLazy fallthrough [backend fallback]
'''
)
self.assertExpectedInline(
@ -833,6 +837,7 @@ key kernel
---------------------------
CPU fn_CPU
XLA fn_XLA
Lazy fn_Lazy
AutogradCPU fn_AutogradCPU
CompositeImplicitAutograd[alias] fn_CompositeImplicitAutograd
'''
@ -840,7 +845,7 @@ CompositeImplicitAutograd[alias] fn_CompositeImplicitAutograd
def test_defaultbackend_autogradcpu(self):
dispatcher = PythonDispatcher()
dispatcher.register(["CPU", "XLA", "CompositeExplicitAutograd", "AutogradCPU"])
dispatcher.register(["CPU", "XLA", "Lazy", "CompositeExplicitAutograd", "AutogradCPU"])
self.assertExpectedInline(
dispatcher.dispatchTable(),
'''\
@ -850,10 +855,12 @@ key kernel
---------------------------
CPU fn_CPU [kernel]
XLA fn_XLA [kernel]
Lazy fn_Lazy [kernel]
QuantizedCPU fn_CompositeExplicitAutograd [default backend kernel]
AutogradOther fallthrough [backend fallback]
AutogradCPU fn_AutogradCPU [kernel]
AutogradXLA fallthrough [backend fallback]
AutogradLazy fallthrough [backend fallback]
'''
)
@ -866,6 +873,7 @@ key kernel
---------------------------
CPU fn_CPU
XLA fn_XLA
Lazy fn_Lazy
AutogradCPU fn_AutogradCPU
CompositeExplicitAutograd[alias] fn_CompositeExplicitAutograd
'''
@ -883,10 +891,12 @@ key kernel
---------------------------
CPU fn_CPU [kernel]
XLA fn_CompositeImplicitAutograd [math kernel]
Lazy fn_CompositeImplicitAutograd [math kernel]
QuantizedCPU fn_QuantizedCPU [kernel]
AutogradOther ambiguous_autogradother [ambiguous autogradother]
AutogradCPU fallthrough [backend fallback]
AutogradXLA fn_CompositeImplicitAutograd [math kernel]
AutogradLazy fn_CompositeImplicitAutograd [math kernel]
'''
)

View File

@ -58,6 +58,7 @@ class DispatchKey(Enum):
FPGA = auto()
MSNPU = auto()
XLA = auto()
Lazy = auto()
Vulkan = auto()
Metal = auto()
XPU = auto()
@ -89,6 +90,7 @@ class DispatchKey(Enum):
AutogradCPU = auto()
AutogradCUDA = auto()
AutogradXLA = auto()
AutogradLazy = auto()
AutogradNestedTensor = auto()
AutogradXPU = auto()
AutogradPrivateUse1 = auto()

View File

@ -55,6 +55,7 @@ class PythonDispatcher:
"CPU", "AutogradCPU",
"QuantizedCPU", "AutogradOther",
"XLA", "AutogradXLA",
"Lazy", "AutogradLazy",
]
alias_keys = [
"CompositeExplicitAutograd",

View File

@ -109,6 +109,7 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) {
.value("FPGA", c10::DeviceType::FPGA)
.value("MSNPU", c10::DeviceType::MSNPU)
.value("XLA", c10::DeviceType::XLA)
.value("Lazy", c10::DeviceType::Lazy)
.value("MLC", c10::DeviceType::MLC)
.value("HPU", c10::DeviceType::HPU)
.value("Meta", c10::DeviceType::Meta)

View File

@ -32,6 +32,7 @@ c10::optional<c10::DispatchKey> parseDispatchKey(const std::string& k) {
{"CPU", c10::DispatchKey::CPU},
{"CUDA", c10::DispatchKey::CUDA},
{"XLA", c10::DispatchKey::XLA},
{"Lazy", c10::DispatchKey::Lazy},
{"QuantizedCPU", c10::DispatchKey::QuantizedCPU},
{"CompositeImplicitAutograd", c10::DispatchKey::CompositeImplicitAutograd},
{"Autograd", c10::DispatchKey::Autograd},

View File

@ -319,6 +319,7 @@ void check_base_legacy_new(c10::DispatchKey dispatch_key, at::Layout expected_la
dispatch_key == c10::DispatchKey::CUDA ||
dispatch_key == c10::DispatchKey::HIP ||
dispatch_key == c10::DispatchKey::XLA ||
dispatch_key == c10::DispatchKey::Lazy ||
dispatch_key == c10::DispatchKey::XPU,
"new(): expected DispatchKey: ",
c10::DispatchKey::CPU,
@ -329,11 +330,13 @@ void check_base_legacy_new(c10::DispatchKey dispatch_key, at::Layout expected_la
" or ",
c10::DispatchKey::XLA,
" or ",
c10::DispatchKey::Lazy,
" or ",
c10::DispatchKey::XPU,
" but got: ",
dispatch_key);
} else if(expected_layout == c10::kSparse) {
// NOTE: no sparse XLA
// NOTE: no sparse XLA or Lazy
TORCH_CHECK(
dispatch_key == c10::DispatchKey::SparseCPU ||
dispatch_key == c10::DispatchKey::SparseCUDA ||

View File

@ -292,6 +292,8 @@ inline CppFunction dispatch(c10::DeviceType type, Func&& raw_f) {
return c10::DispatchKey::CUDA;
case c10::DeviceType::XLA:
return c10::DispatchKey::XLA;
case c10::DeviceType::Lazy:
return c10::DispatchKey::Lazy;
case c10::DeviceType::MLC:
return c10::DispatchKey::MLC;
case c10::DeviceType::Meta:
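
Tying it together: the DeviceType-based impl overload used in the tests above goes through this dispatch() helper, which now routes DeviceType::Lazy to DispatchKey::Lazy. A hedged sketch in a hypothetical namespace (myns and the identity op are illustrative, not part of the PR):

#include <torch/library.h>
#include <ATen/ATen.h>

TORCH_LIBRARY(myns, m) {
  m.def("identity(Tensor self) -> Tensor");
  // Registers under DispatchKey::Lazy via the DeviceType overload shown in this hunk.
  m.impl("identity", c10::DeviceType::Lazy,
         [](const at::Tensor& self) { return self; });
}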