Revert "Add Accelerator device and shell hooks (#119329)"

This reverts commit 4b9568a360c4a90220e78e43435be8c56bc33fb2.

Reverted https://github.com/pytorch/pytorch/pull/119329 on behalf of https://github.com/huydhn because it breaks an internal build and requires an OSS file update to fix ([comment](https://github.com/pytorch/pytorch/pull/119329#issuecomment-1940278598))
Author: PyTorch MergeBot
Date: 2024-02-13 02:23:45 +00:00
Parent: 7d4b666870
Commit: 214f06ae3a
16 changed files with 103 additions and 185 deletions

View File

@@ -1,13 +1,11 @@
#pragma once
#include <ATen/CPUGeneratorImpl.h>
#include <ATen/DeviceAccelerator.h>
#include <ATen/LinalgBackend.h>
#include <ATen/core/ATenGeneral.h>
#include <ATen/core/DeprecatedTypeProperties.h>
#include <ATen/core/Generator.h>
#include <ATen/core/LegacyTypeDispatch.h>
#include <ATen/detail/AcceleratorHooksInterface.h>
#include <ATen/detail/CUDAHooksInterface.h>
#include <ATen/detail/HIPHooksInterface.h>
#include <ATen/detail/IPUHooksInterface.h>
@@ -58,22 +56,6 @@ class TORCH_API Context {
AT_ERROR(c10::DeviceTypeName(device_type), " device type not enabled.");
}
}
const AcceleratorHooksInterface& getAcceleratorHooksInterface(
c10::optional<c10::DeviceType> opt_device_type = c10::nullopt) {
c10::DeviceType device_type = opt_device_type.has_value()
? opt_device_type.value()
: at::getAccelerator(true).value();
if (device_type == at::kCUDA) {
return at::detail::getCUDAHooks();
} else if (device_type == at::kMPS) {
return at::detail::getMPSHooks();
} else if (device_type == at::kPrivateUse1) {
return at::detail::getPrivateUse1Hooks();
} else {
AT_ERROR(
c10::DeviceTypeName(device_type), " device type not an accelerator.");
}
}
Device getDeviceFromPtr(void* data, c10::DeviceType device_type) {
initCUDAIfNeeded(device_type);
initHIPIfNeeded(device_type);
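
For reference, the removed getAcceleratorHooksInterface() shown above was the single accelerator-generic entry point into the per-backend hooks. A minimal sketch of a caller, assuming the reverted API is still present (the helper name is hypothetical):

#include <ATen/Context.h>

// Asks the currently active accelerator (CUDA, MPS or PrivateUse1) whether
// device `idx` already has a primary context, without branching on the
// concrete backend. Only compiles against the reverted change above.
bool accelerator_device_initialized(c10::DeviceIndex idx) {
  const auto& hooks = at::globalContext().getAcceleratorHooksInterface();
  return hooks.hasPrimaryContext(idx);
}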

View File

@@ -1,31 +0,0 @@
#include <ATen/DeviceAccelerator.h>
#include <ATen/Context.h>
namespace at {
C10_API std::optional<DeviceType> getAccelerator(bool checked) {
#define CHECK_NO_CUDA \
TORCH_CHECK(!at::hasCUDA(), "Cannot have both CUDA and PrivateUse1");
#define CHECK_NO_PU1 \
TORCH_CHECK(!is_privateuse1_backend_registered(), "Cannot have both CUDA and PrivateUse1");
if (is_privateuse1_backend_registered()) {
// We explicitly allow PrivateUse1 and another device at the same time
// as we use this for testing.
// Whenever a PrivateUse1 device is registered, use it first.
return kPrivateUse1;
} else if (at::hasCUDA()) {
CHECK_NO_PU1
return kCUDA;
} else {
TORCH_CHECK(!checked, "Cannot access accelerator device when none is available.")
return std::nullopt;
}
#undef CHECK_NO_CUDA
#undef CHECK_NO_PU1
}
} // namespace at

View File

@@ -1,27 +0,0 @@
#pragma once
#include <c10/core/DeviceType.h>
#include <c10/macros/Macros.h>
#include <ATen/detail/MTIAHooksInterface.h>
#include <optional>
// This file defines the top level Accelerator concept for PyTorch.
// A device is an accelerator per the definition here if:
// - It is mutually exclusive with all other accelerators
// - It performs asynchronous compute via a Stream/Event system
// - It provides a set of common APIs as defined by AcceleratorHooksInterface
//
// As of today, accelerator devices are (in no particular order):
// CUDA, MTIA, PrivateUse1
// We want to add the following once all the proper APIs are supported and tested:
// HIP, MPS, XPU
namespace at {
// Ensures that only one accelerator is available (at
// compile time if possible) and returns it.
// When checked is true, the returned optional always has a value.
TORCH_API std::optional<DeviceType> getAccelerator(bool checked = false);
} // namespace at
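
As a reading aid, this is how the reverted at::getAccelerator() was intended to be consumed. A minimal sketch, assuming a build that still contains DeviceAccelerator.h (the function name below is hypothetical):

#include <ATen/DeviceAccelerator.h>
#include <c10/util/Exception.h>

void describe_accelerator() {
  // Unchecked query: returns std::nullopt when no accelerator is built/registered.
  if (const auto acc = at::getAccelerator(/*checked=*/false)) {
    // e.g. kCUDA or kPrivateUse1, following the selection rules in DeviceAccelerator.cpp.
    TORCH_WARN("Active accelerator: ", c10::DeviceTypeName(*acc));
  } else {
    // With checked=true this branch would instead raise an error.
    TORCH_WARN("No accelerator available; falling back to CPU.");
  }
}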

View File

@@ -1,21 +0,0 @@
#pragma once
#include <c10/core/Device.h>
namespace at {
// AcceleratorHooksInterface is a shared interface provided by all
// accelerators to allow generic code.
// This interface is hook-based as it corresponds to all the functions
// that are going to be called in a generic way from the CPU code.
struct TORCH_API AcceleratorHooksInterface {
// This should never actually be implemented, but it is used to
// squelch -Werror=non-virtual-dtor
virtual ~AcceleratorHooksInterface() = default;
// Whether the device at device_index is fully initialized or not.
virtual bool hasPrimaryContext(DeviceIndex device_index) const = 0;
};
} // namespace at
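
To make the removed interface concrete: an out-of-tree backend would have implemented it roughly as below. This is a sketch only; FooHooks and its policy are invented for illustration, and the header exists only before this revert.

#include <ATen/detail/AcceleratorHooksInterface.h>

// Hypothetical backend hooks: at this point the only contract is
// hasPrimaryContext(), i.e. "is this device index fully initialized?".
struct FooHooks final : at::AcceleratorHooksInterface {
  ~FooHooks() override = default;
  bool hasPrimaryContext(c10::DeviceIndex device_index) const override {
    // A real backend would consult its runtime; this sketch simply reports
    // that only device 0 is initialized.
    return device_index == 0;
  }
};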

View File

@@ -4,8 +4,6 @@
#include <c10/util/Exception.h>
#include <c10/util/Registry.h>
#include <ATen/detail/AcceleratorHooksInterface.h>
// Forward-declares at::Generator and at::cuda::NVRTC
namespace at {
struct Generator;
@@ -59,7 +57,7 @@ constexpr const char* CUDA_HELP =
// TODO: Consider putting the stub definitions in another class, so that one
// never forgets to implement each virtual function in the real implementation
// in CUDAHooks. This probably doesn't buy us much though.
struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
struct TORCH_API CUDAHooksInterface {
// This should never actually be implemented, but it is used to
// squelch -Werror=non-virtual-dtor
virtual ~CUDAHooksInterface() = default;
@@ -109,7 +107,7 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
TORCH_CHECK(false, "NVRTC requires CUDA. ", CUDA_HELP);
}
virtual bool hasPrimaryContext(DeviceIndex device_index) const override {
virtual bool hasPrimaryContext(DeviceIndex device_index) const {
TORCH_CHECK(false, "Cannot call hasPrimaryContext(", device_index, ") without ATen_cuda library. ", CUDA_HELP);
}

View File

@@ -4,7 +4,6 @@
#include <c10/core/Allocator.h>
#include <ATen/core/Generator.h>
#include <ATen/detail/AcceleratorHooksInterface.h>
#include <c10/util/Exception.h>
#include <c10/util/Registry.h>
@@ -12,7 +11,7 @@
namespace at {
struct TORCH_API MPSHooksInterface : AcceleratorHooksInterface {
struct TORCH_API MPSHooksInterface {
// This macro makes each stub fail if an MPSHooks function is called while
// the MPS backend is not present.
#define FAIL_MPSHOOKS_FUNC(func) \
@@ -87,9 +86,7 @@ struct TORCH_API MPSHooksInterface : AcceleratorHooksInterface {
virtual double elapsedTimeOfEvents(uint32_t start_event_id, uint32_t end_event_id) const {
FAIL_MPSHOOKS_FUNC(__func__);
}
virtual bool hasPrimaryContext(DeviceIndex device_index) const override {
FAIL_MPSHOOKS_FUNC(__func__);
}
#undef FAIL_MPSHOOKS_FUNC
};

View File

@@ -4,8 +4,6 @@
#include <c10/util/Registry.h>
#include <ATen/detail/AcceleratorHooksInterface.h>
#include <string>
namespace at {
@@ -19,7 +17,7 @@ constexpr const char* MTIA_HELP =
"this error has occurred because you are trying "
"to use some MTIA's functionality without MTIA extension included.";
struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface {
struct TORCH_API MTIAHooksInterface {
virtual ~MTIAHooksInterface() = default;
virtual void initMTIA() const {
@@ -39,14 +37,6 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface {
"Cannot query detailed MTIA version without MTIA Extension for PyTorch.",
MTIA_HELP);
}
virtual bool hasPrimaryContext(DeviceIndex device_index) const override {
TORCH_CHECK(
false,
"Cannot check MTIA primary context without MTIA Extension for PyTorch.",
MTIA_HELP);
}
};
struct TORCH_API MTIAHooksArgs {};

View File

@@ -22,15 +22,4 @@ TORCH_API bool isPrivateUse1HooksRegistered() {
return privateuse1_hooks != nullptr;
}
namespace detail {
TORCH_API const at::PrivateUse1HooksInterface& getPrivateUse1Hooks() {
TORCH_CHECK(
privateuse1_hooks != nullptr,
"Please register PrivateUse1HooksInterface by `RegisterPrivateUse1HooksInterface` first.");
return *privateuse1_hooks;
}
} // namespace detail
} // namespace at

View File

@@ -1,14 +1,13 @@
#pragma once
#include <ATen/core/Generator.h>
#include <ATen/detail/AcceleratorHooksInterface.h>
#include <c10/core/Allocator.h>
#include <c10/core/Device.h>
#include <c10/core/Storage.h>
#include <c10/util/Exception.h>
namespace at {
struct TORCH_API PrivateUse1HooksInterface : AcceleratorHooksInterface {
struct TORCH_API PrivateUse1HooksInterface {
virtual ~PrivateUse1HooksInterface() = default;
virtual const at::Generator& getDefaultGenerator(
c10::DeviceIndex device_index) {
@@ -29,7 +28,7 @@ struct TORCH_API PrivateUse1HooksInterface : AcceleratorHooksInterface {
"You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `getPinnedMemoryAllocator`.");
}
virtual bool hasPrimaryContext(DeviceIndex device_index) const override {
virtual bool hasPrimaryContext(DeviceIndex device_index) const {
TORCH_CHECK_NOT_IMPLEMENTED(
false,
"You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `hasPrimaryContext`.");
@@ -52,10 +51,4 @@ TORCH_API at::PrivateUse1HooksInterface* GetPrivateUse1HooksInterface();
TORCH_API bool isPrivateUse1HooksRegistered();
namespace detail {
TORCH_API const at::PrivateUse1HooksInterface& getPrivateUse1Hooks();
} // namespace detail
} // namespace at
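
For orientation, the registration flow implied by the error messages above looks roughly like this. It is a sketch under two assumptions: the backend class and its policy are invented, and RegisterPrivateUse1HooksInterface has its usual signature taking an at::PrivateUse1HooksInterface* (the declaration is not part of this hunk).

#include <ATen/detail/PrivateUse1HooksInterface.h>

// Invented backend hooks: override only hasPrimaryContext(); the other
// virtuals keep their default "register first" error behavior.
struct MyBackendHooks : at::PrivateUse1HooksInterface {
  bool hasPrimaryContext(c10::DeviceIndex device_index) const override {
    return true;  // sketch: report every device as initialized
  }
};

void register_my_backend_hooks() {
  static MyBackendHooks hooks;  // process-lifetime instance, registered once
  at::RegisterPrivateUse1HooksInterface(&hooks);
}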

View File

@@ -46,12 +46,6 @@ struct MPSHooks : public at::MPSHooksInterface {
void synchronizeEvent(uint32_t event_id) const override;
bool queryEvent(uint32_t event_id) const override;
double elapsedTimeOfEvents(uint32_t start_event_id, uint32_t end_event_id) const override;
// Compatibility with Accelerator API
bool hasPrimaryContext(DeviceIndex device_index) const override {
// When MPS is available, it is always in use for the one device.
return true;
}
};
} // namespace at::mps

View File

@@ -970,7 +970,6 @@ aten_cpu_source_non_codegen_list = [
"aten/src/ATen/AccumulateType.cpp",
"aten/src/ATen/LegacyBatchedTensorImpl.cpp",
"aten/src/ATen/CPUGeneratorImpl.cpp",
"aten/src/ATen/DeviceAccelerator.cpp",
"aten/src/ATen/Context.cpp",
"aten/src/ATen/DLConvertor.cpp",
"aten/src/ATen/EmptyTensor.cpp",

View File

@@ -8,7 +8,6 @@
#include <torch/csrc/autograd/variable.h>
#include <torch/csrc/dynamo/compiled_autograd.h>
#include <ATen/DeviceAccelerator.h>
#include <ATen/DeviceGuard.h>
#include <ATen/ExpandUtils.h>
#include <ATen/Parallel.h>
@@ -47,7 +46,8 @@
#include <unordered_set>
#include <utility>
namespace torch::autograd {
namespace torch {
namespace autograd {
namespace {
static bool in_bad_autograd_fork =
@@ -991,7 +991,10 @@ void Engine::evaluate_function(
// ensure they're safe to consume in the context of the present
// func's stream (if applicable). So we guard onto that stream
// before working with the grads in any capacity.
auto opt_parent_stream = (*func).stream();
auto opt_parent_stream = (*func).stream(c10::DeviceType::CUDA);
if (!opt_parent_stream.has_value()) {
opt_parent_stream = (*func).stream(c10::DeviceType::PrivateUse1);
}
c10::OptionalStreamGuard parent_stream_guard{opt_parent_stream};
// If exec_info_ is not empty, we have to instrument the execution
@ -1009,7 +1012,10 @@ void Engine::evaluate_function(
*func, InputBuffer::variables(std::move(inputs)));
}
if (auto* capture_vec = fn_info.captures_.get()) {
auto opt_parent_stream = (*func).stream();
auto opt_parent_stream = (*func).stream(c10::DeviceType::CUDA);
if (!opt_parent_stream.has_value()) {
opt_parent_stream = (*func).stream(c10::DeviceType::PrivateUse1);
}
// Lock mutex for writing to graph_task->captured_vars_.
std::lock_guard<std::mutex> lock(graph_task->mutex_);
for (const auto& capture : *capture_vec) {
@@ -1101,7 +1107,10 @@ void Engine::evaluate_function(
InputBuffer input_buffer(next.function->num_inputs());
// Accumulates into buffer
auto opt_next_stream = next.function->stream();
auto opt_next_stream = next.function->stream(c10::DeviceType::CUDA);
if (!opt_next_stream.has_value()) {
opt_next_stream = next.function->stream(c10::DeviceType::PrivateUse1);
}
input_buffer.add(
next.input_nr, std::move(output), opt_parent_stream, opt_next_stream);
@ -1117,7 +1126,10 @@ void Engine::evaluate_function(
auto& input_buffer = not_ready_it->second;
// Accumulates into buffer
auto opt_next_stream = next.function->stream();
auto opt_next_stream = next.function->stream(c10::DeviceType::CUDA);
if (!opt_next_stream.has_value()) {
opt_next_stream = next.function->stream(c10::DeviceType::PrivateUse1);
}
input_buffer.add(
next.input_nr, std::move(output), opt_parent_stream, opt_next_stream);
if (is_ready) {
@@ -1149,7 +1161,10 @@ auto Engine::compute_dependencies(
uint64_t min_topo_nr) -> void {
// Computes the number of dependencies for each function which requires grad
std::vector<Node*> queue{root};
bool will_use_accelerator = false;
bool might_use_cuda = at::globalContext().hasCUDA();
bool might_use_privateuse1 = at::isPrivateUse1HooksRegistered();
bool will_use_cuda = false;
bool will_use_privateuse1 = false;
// Queue contains all nodes that will start propagating gradients.
// We no longer have to expand functions that don't require grad.
@ -1160,8 +1175,12 @@ auto Engine::compute_dependencies(
if (fn->topological_nr() < min_topo_nr) {
continue;
}
if (!will_use_accelerator) {
will_use_accelerator = fn->stream().has_value();
if (might_use_cuda && !will_use_cuda) {
will_use_cuda = fn->stream(c10::DeviceType::CUDA).has_value();
}
if (might_use_privateuse1 && !will_use_privateuse1) {
will_use_privateuse1 =
fn->stream(c10::DeviceType::PrivateUse1).has_value();
}
for (const auto& edge : fn->next_edges()) {
if (auto next_ptr = edge.function.get()) {
@ -1173,11 +1192,21 @@ auto Engine::compute_dependencies(
}
}
if (will_use_accelerator) {
// Collects current streams for devices where this process has a
if (will_use_cuda) {
// Collects current streams for CUDA/ROCM devices where this process has a
// context, so GraphTask::exec_post_processing can sync them with
// leaf_streams.
task.stash_current_streams();
task.stash_current_cuda_streams();
}
// Assume that two devices will not be used simultaneously.
TORCH_CHECK(
!(will_use_cuda && will_use_privateuse1),
"CUDA and privateuse1 cannot be used simultaneously in streaming backwards");
if (will_use_privateuse1) {
// Collects current streams for privateuse1.
task.stash_current_privateuse1_streams();
}
}
@@ -1267,7 +1296,12 @@ auto Engine::execute(
auto input = inputs.at(0);
const auto input_stream = InputMetadata(input).stream();
auto opt_next_stream = root_edges.at(0).function->stream();
auto opt_next_stream =
root_edges.at(0).function->stream(c10::DeviceType::CUDA);
if (!opt_next_stream.has_value()) {
opt_next_stream =
root_edges.at(0).function->stream(c10::DeviceType::PrivateUse1);
}
input_buffer.add(
root_edges.at(0).input_nr,
std::move(input),
@@ -1535,15 +1569,15 @@ void Engine::add_thread_pool_task(const std::weak_ptr<GraphTask>& graph_task) {
thread_pool_shared_->work_.notify_one();
}
// Remembers current streams on all devices where a context has been created.
// This function assumes the accelerator device is available.
void GraphTask::stash_current_streams() {
const auto accelerator = at::getAccelerator(true).value();
const auto guard = c10::impl::VirtualGuardImpl{accelerator};
auto num_devices = guard.deviceCount();
caller_current_streams_.resize(num_devices);
if (num_devices > 0) {
for (c10::DeviceIndex idx = 0; idx < num_devices; idx++) {
// Remembers current streams on all CUDA/ROCM devices where a context has been
// created. Only called if Engine::execute detects at least one node runs on a
// cuda/rocm stream.
void GraphTask::stash_current_cuda_streams() {
const auto guard = c10::impl::VirtualGuardImpl{c10::DeviceType::CUDA};
auto num_gpus = guard.deviceCount();
caller_current_streams_.resize(num_gpus);
if (num_gpus > 0) {
for (c10::DeviceIndex idx = 0; idx < num_gpus; idx++) {
#if defined(USE_ROCM) && (ROCM_VERSION < 50000)
// If the build targets ROCM, stash streams for all visible devices
// unconditionally, to work around
@@ -1552,10 +1586,28 @@ void GraphTask::stash_current_streams() {
// https://github.com/pytorch/pytorch/issues/59750 is fixed.
if (true) {
#else
if (at::globalContext().getAcceleratorHooksInterface().hasPrimaryContext(
idx)) {
if (at::detail::getCUDAHooks().hasPrimaryContext(idx)) {
#endif
caller_current_streams_[idx] = guard.getStream({accelerator, idx});
caller_current_streams_[idx] =
guard.getStream({c10::DeviceType::CUDA, idx});
} else {
caller_current_streams_[idx] = c10::nullopt;
}
}
}
}
// Remembers current streams on all devices where a context has been created for
// privateuse1
void GraphTask::stash_current_privateuse1_streams() {
const auto guard = c10::impl::VirtualGuardImpl{c10::DeviceType::PrivateUse1};
auto num_devices = guard.deviceCount();
caller_current_streams_.resize(num_devices);
if (num_devices > 0) {
for (c10::DeviceIndex idx = 0; idx < num_devices; idx++) {
if (at::GetPrivateUse1HooksInterface()->hasPrimaryContext(idx)) {
caller_current_streams_[idx] =
guard.getStream({c10::DeviceType::PrivateUse1, idx});
} else {
caller_current_streams_[idx] = c10::nullopt;
}
@@ -1675,4 +1727,5 @@ void GraphTask::init_to_execute(
}
}
} // namespace torch::autograd
} // namespace autograd
} // namespace torch
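
Net effect of the engine.cpp changes above: the accelerator-generic Node::stream() query (backed by at::getAccelerator) is replaced by the older per-backend probe that tries CUDA first and then PrivateUse1. A condensed sketch of the restored pattern; the helper name is invented and NodeT stands in for torch::autograd::Node:

#include <c10/core/DeviceType.h>
#include <c10/core/Stream.h>
#include <c10/util/Optional.h>

// Restored lookup order: prefer a CUDA stream recorded on the node's inputs,
// otherwise fall back to a PrivateUse1 stream, otherwise c10::nullopt.
template <typename NodeT>
c10::optional<c10::Stream> pick_stream(NodeT& node) {
  auto stream = node.stream(c10::DeviceType::CUDA);
  if (!stream.has_value()) {
    stream = node.stream(c10::DeviceType::PrivateUse1);
  }
  return stream;
}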

View File

@@ -239,13 +239,9 @@ struct TORCH_API Node : std::enable_shared_from_this<Node> {
* elements are on different devices (across multiple GPUs, for example)
* they may have different streams.
*/
c10::optional<c10::Stream> stream() {
auto opt_device_type = at::getAccelerator();
if (!opt_device_type.has_value()) {
return c10::nullopt;
}
c10::optional<c10::Stream> stream(const c10::DeviceType device_type) {
for (const auto& metadata : input_metadata_) {
if (metadata.device().type() == opt_device_type.value())
if (metadata.device().type() == device_type)
return metadata.stream();
}

View File

@@ -127,8 +127,11 @@ struct GraphTask : std::enable_shared_from_this<GraphTask> {
// These will be synced with leaf_streams in exec_post_processing.
std::vector<c10::optional<c10::Stream>> caller_current_streams_;
// Collects caller_current_streams_ for the accelerator device.
void stash_current_streams();
// Collects caller_current_streams_ for cuda/rocm.
void stash_current_cuda_streams();
// Collects caller_current_streams_ for privateuse1.
void stash_current_privateuse1_streams();
void init_to_execute(
Node& graph_root,

View File

@@ -90,7 +90,8 @@ void DistAutogradContext::accumulateGrad(
// CUDA stream restoration from autograd function. Hence, we manually
// call it here to get the streams correct.
auto forward_stream =
torch::autograd::impl::grad_accumulator(variable)->stream();
torch::autograd::impl::grad_accumulator(variable)->stream(
grad.device().type());
c10::OptionalStreamGuard stream_guard(forward_stream);
// No higher order gradients supported in distributed autograd.

View File

@@ -219,7 +219,8 @@ void DistEngine::computeDependencies(
queue.push(mapEntry.second.get());
}
bool will_use_accelerator = false;
bool might_use_cuda = at::globalContext().hasCUDA();
bool will_use_cuda = false;
edge_list recvBackwardEdges;
// Traverse the graph.
@ -228,8 +229,8 @@ void DistEngine::computeDependencies(
auto fn = queue.front();
queue.pop();
if (!will_use_accelerator) {
will_use_accelerator = fn->stream().has_value();
if (might_use_cuda && !will_use_cuda) {
will_use_cuda = fn->stream(c10::DeviceType::CUDA).has_value();
}
for (const auto& edge : fn->next_edges()) {
@ -268,11 +269,11 @@ void DistEngine::computeDependencies(
}
}
if (will_use_accelerator) {
if (will_use_cuda) {
// Collects current streams for CUDA/ROCM devices where this process has a
// context, so graphTask::exec_post_processing can sync them with
// leaf_streams.
graphTask->stash_current_streams();
graphTask->stash_current_cuda_streams();
}
// Now lets compute which functions need to be executed. The algorithm is as
@@ -460,7 +461,8 @@ c10::intrusive_ptr<c10::ivalue::Future> DistEngine::executeSendFunctionAsync(
// inputs might have been retrieved over the wire on a separate stream and the
// sendFunction itself runs on a different stream. As a result, we need to
// manually synchronize those two streams here.
const auto& send_backward_stream = sendFunction->stream();
const auto& send_backward_stream =
sendFunction->stream(c10::DeviceType::CUDA);
if (send_backward_stream) {
for (const auto& grad : sendFunction->getGrads()) {
const auto guard = c10::impl::VirtualGuardImpl{c10::DeviceType::CUDA};