Remove calls to CopyFrom that can be sync (#13205)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/13205

CopyFrom without a context argument does a synchronous copy on the current GPU - exactly what most call sites need.

This diff simplifies about 60% of CopyFrom usages. The most common pattern is a GPU->CPU copy followed by a FinishDeviceComputation call - the latter can simply be dropped.
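
A minimal before/after sketch of that pattern (tensor and context names are illustrative, not from any one call site):

    // before: the copy is enqueued on the context's stream and must be
    // explicitly synced before the CPU side can read the data
    cpu_tensor.CopyFrom(gpu_tensor, &context_);
    context_.FinishDeviceComputation();

    // after: omitting the context argument makes CopyFrom synchronous
    cpu_tensor.CopyFrom(gpu_tensor);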

Reviewed By: Yangqing

Differential Revision: D11236076

fbshipit-source-id: eb790ca494dfc5d5e3a7d850b45d6f73221bb204
Dmytro Dzhulgakov
2018-10-29 13:54:47 -07:00
committed by Facebook Github Bot
parent 8ad69a80e3
commit 5a2b2aa6af
14 changed files with 37 additions and 65 deletions

View File

@@ -847,6 +847,8 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
    * preserves the DeviceType of the source tensor (so, e.g., if you allocate
    * a tensor on CPU and then CopyFrom a CUDA tensor, that will to a
    * CUDA-to-CPU transfer).
+   *
+   * If the function is invoked without `context` the copy would be synchronous
    */
   void CopyFrom(const TensorImpl& src, at::BaseContext* context = nullptr) {
     AT_ASSERT(!is_variable());
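
For illustration, the two modes this default enables (tensor/context names are hypothetical):

    Tensor host{CPU};
    // no context: synchronous copy, host is safe to read immediately
    host.CopyFrom(cuda_tensor);
    // with a context: asynchronous copy on that context's stream; the caller
    // must still sync (e.g. FinishDeviceComputation) before reading host
    host.CopyFrom(cuda_tensor, &cuda_context);
    cuda_context.FinishDeviceComputation();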

View File

@@ -86,7 +86,6 @@ class ImageInputOp final
   unique_ptr<db::DBReader> owned_reader_;
   const db::DBReader* reader_;
-  CPUContext cpu_context_;
   Tensor prefetched_image_{CPU};
   Tensor prefetched_label_{CPU};
   vector<TensorCPU> prefetched_additional_outputs_;
@@ -1208,12 +1207,13 @@ bool ImageInputOp<Context>::Prefetch() {
   // If the context is not CPUContext, we will need to do a copy in the
   // prefetch function as well.
   if (!std::is_same<Context, CPUContext>::value) {
-    prefetched_image_on_device_.CopyFrom(prefetched_image_, &cpu_context_);
-    prefetched_label_on_device_.CopyFrom(prefetched_label_, &cpu_context_);
+    // do sync copies
+    prefetched_image_on_device_.CopyFrom(prefetched_image_);
+    prefetched_label_on_device_.CopyFrom(prefetched_label_);
     for (int i = 0; i < prefetched_additional_outputs_on_device_.size(); ++i) {
       prefetched_additional_outputs_on_device_[i].CopyFrom(
-          prefetched_additional_outputs_[i], &cpu_context_);
+          prefetched_additional_outputs_[i]);
     }
   }

View File

@@ -961,10 +961,10 @@ class CollectTensorOp final : public Operator<Context> {
     } else if (pos >= tensorVector->size()) {
       // append
       tensorVector->emplace_back(Context::GetDeviceType());
-      tensorVector->back().CopyFrom(tensor, &context_);
+      tensorVector->back().CopyFrom(tensor); // sync copy
     } else {
       // replace
-      tensorVector->at(pos).CopyFrom(tensor, &context_);
+      tensorVector->at(pos).CopyFrom(tensor); // sync copy
     }
   }

View File

@@ -7,7 +7,7 @@ namespace caffe2 {
 template <>
 template <typename T>
 bool EnforceFiniteOp<CUDAContext>::DoRunWithType() {
-  buffer_.CopyFrom(Input(0), &context_);
+  buffer_.CopyFrom(Input(0)); // sync copy
   EnforceOnCPU<T>(buffer_);
   return true;
 }

View File

@@ -15,8 +15,7 @@ bool LengthsTileOp<CPUContext>::RunOnDevice() {
   // Context::CopyFrom and math::Sum need the same context to avoid race
   // conditions
   // why? CPUContext is not used in Sum
-  lengths_host_.CopyFrom(lengths, &context_);
-  context_.FinishDeviceComputation();
+  lengths_host_.CopyFrom(lengths); // sync copy
   auto lengths_size = lengths_host_.numel();
   auto* lengths_data = lengths_host_.data<int32_t>();

View File

@@ -28,8 +28,7 @@ bool LengthsTileOp<CUDAContext>::RunOnDevice() {
   CAFFE_ENFORCE_GE(data.ndim(), 1, "DATA should be at least 1-D");
   CAFFE_ENFORCE_EQ(lengths.size(), data.dim(0));
-  lengths_host_.CopyFrom(lengths, &context_);
-  context_.FinishDeviceComputation();
+  lengths_host_.CopyFrom(lengths); // sync copy
   auto lengths_size = lengths_host_.size();
   auto* lengths_data = lengths_host_.data<int32_t>();

View File

@@ -62,12 +62,10 @@ class GPUFallbackOpEx final : public Operator<CUDAContext> {
   }
   bool RunOnDevice() override {
-    bool need_sync = false;
     for (int i = 0; i < InputSize(); ++i) {
       if (this->InputIsTensorType(i, CUDA)) {
-        BlobGetMutableTensor(local_input_blobs_[i], CPU)
-            ->CopyFrom(Input(i), &context_);
-        need_sync = true;
+        // use sync copy
+        BlobGetMutableTensor(local_input_blobs_[i], CPU)->CopyFrom(Input(i));
       } else {
         VLOG(1) << "Input " << i << " is not TensorCUDA. Skipping copy.";
         // Note(jiayq): This removes a const but conceptually
@@ -79,11 +77,6 @@ class GPUFallbackOpEx final : public Operator<CUDAContext> {
       }
     }
-    // Sync to make sure copies are done.
-    if (need_sync) {
-      context_.FinishDeviceComputation();
-    }
     if (!base_op_->Run()) {
       LOG(ERROR) << "Base op run failed in GPUFallbackOp. Def: "
                  << ProtoDebugString(this->debug_def());
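
The need_sync flag existed only to gate a single FinishDeviceComputation after the input-copy loop; with synchronous copies every CUDA input already sits in its CPU blob when the loop exits. Condensed, the resulting flow is roughly (an illustrative sketch, not the full operator):

    // copy each CUDA input to its CPU staging blob, synchronously
    for (int i = 0; i < InputSize(); ++i) {
      if (this->InputIsTensorType(i, CUDA)) {
        BlobGetMutableTensor(local_input_blobs_[i], CPU)->CopyFrom(Input(i));
      }
    }
    // no explicit sync step needed before running the CPU implementation
    return base_op_->Run();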

View File

@@ -190,7 +190,7 @@ void CreateAndRun(
   EXPECT_NE(nullptr, Y_blob);
   auto& Y = Y_blob->Get<Tensor>();
-  outResult->CopyFrom(Y, &context);
+  outResult->CopyFrom(Y);
 }
 } // namespace

View File

@@ -813,7 +813,7 @@ bool NanCheckOp<CPUContext>::RunOnDevice() {
   }
   if (&X != Y) {
-    Y->CopyFrom(X, &context_);
+    Y->CopyFrom(X);
   }
   return true;
 }

View File

@@ -107,9 +107,8 @@ bool NanCheckOp<CUDAContext>::RunOnDevice() {
     {
       std::lock_guard<std::mutex> lock(CUDAContext::mutex());
-      cpu_X.CopyFrom(Input(j), &context_);
+      cpu_X.CopyFrom(Input(j)); // sync copy
     }
-    context_.FinishDeviceComputation();
     std::cerr << "Input tensor: " << j << ": [" << this->debug_def().input(j)
               << "]" << std::endl;
     tensorPrinter_.Print<float>(cpu_X);

View File

@@ -132,9 +132,8 @@ class PrintOp final : public Operator<Context> {
     if (this->InputIsTensorType(0, CPU)) {
       tensor = &this->template Input<Tensor>(0, CPU);
     } else {
-      tensor_copy_if_needed.CopyFrom(Input(0), &context_);
-      // Make sure that the copy is finished.
-      context_.FinishDeviceComputation();
+      // sync copy
+      tensor_copy_if_needed.CopyFrom(Input(0));
       tensor = &tensor_copy_if_needed;
     }
     tensor_printer_.Print<T>(*tensor);

View File

@@ -60,8 +60,7 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 10) << i;
@@ -81,8 +80,7 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 15) << i;
@@ -102,8 +100,7 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) {
      tensorY->mutable_data<float>(),
      &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 20) << i;
@@ -160,8 +157,7 @@ TEST(MathROCBLASTest, GemmNoTransTrans) {
      tensorY->mutable_data<float>(),
      &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 10) << i;
@@ -181,8 +177,7 @@ TEST(MathROCBLASTest, GemmNoTransTrans) {
      tensorY->mutable_data<float>(),
      &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 15) << i;
@@ -201,8 +196,7 @@ TEST(MathROCBLASTest, GemmNoTransTrans) {
      tensorY->mutable_data<float>(),
      &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 20) << i;
@@ -256,8 +250,7 @@ TEST(MathROCBLASTest, GemvNoTrans) {
      tensorY->mutable_data<float>(),
      &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 10) << i;
   }
@@ -274,8 +267,7 @@ TEST(MathROCBLASTest, GemvNoTrans) {
      tensorY->mutable_data<float>(),
      &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 15) << i;
   }
@@ -292,8 +284,7 @@ TEST(MathROCBLASTest, GemvNoTrans) {
      tensorY->mutable_data<float>(),
      &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 20) << i;
   }
@@ -346,8 +337,7 @@ TEST(MathROCBLASTest, GemvTrans) {
      tensorY->mutable_data<float>(),
      &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 6) << i;
   }
@@ -364,8 +354,7 @@ TEST(MathROCBLASTest, GemvTrans) {
      tensorY->mutable_data<float>(),
      &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 9) << i;
   }
@@ -382,8 +371,7 @@ TEST(MathROCBLASTest, GemvTrans) {
      tensorY->mutable_data<float>(),
      &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 12) << i;
   }

View File

@@ -72,8 +72,7 @@ void executeGpuBinaryOpTest(
   // Copy result to CPU so we can inspect it
   auto* tensory_host = BlobGetMutableTensor(bloby_host, CPU);
-  tensory_host->CopyFrom(*tensory, &context);
-  context.FinishDeviceComputation();
+  tensory_host->CopyFrom(*tensory);
   for (int i = 0; i < shapey; ++i) {
     EXPECT_EQ(tensory_host->data<float>()[i], correct_output(i));
@@ -126,8 +125,7 @@ TEST(MathUtilGPUTest, testAddStripedBatch) {
   // Copy result to CPU so we can inspect it
   auto* tensory_host = BlobGetMutableTensor(bloby_host, CPU);
-  tensory_host->CopyFrom(*tensory, &context);
-  context.FinishDeviceComputation();
+  tensory_host->CopyFrom(*tensory);
   for (int k = 0; k < 33; k++) {
     for (int i = 0; i < 25; i++) {
@@ -403,8 +401,7 @@ class ReduceTensorGPUTest : public testing::Test {
   void VerifyResult(const std::vector<float>& expected_output) {
     Blob* blob_y_host = ws_.CreateBlob("Y_host");
     auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU);
-    Y_host->CopyFrom(*Y_, cuda_context_.get());
-    cuda_context_->FinishDeviceComputation();
+    Y_host->CopyFrom(*Y_);
     ASSERT_EQ(expected_output.size(), Y_host->size());
     for (std::size_t i = 0; i < expected_output.size(); ++i) {
       EXPECT_FLOAT_EQ(expected_output[i], Y_host->data<float>()[i]);
@@ -682,8 +679,7 @@ class BroadcastGPUTest : public testing::Test {
   void VerifyResult(const std::vector<float>& expected_output) {
     Blob* blob_y_host = ws_.CreateBlob("Y_host");
     auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU);
-    Y_host->CopyFrom(*Y_, cuda_context_.get());
-    cuda_context_->FinishDeviceComputation();
+    Y_host->CopyFrom(*Y_);
     ASSERT_EQ(expected_output.size(), Y_host->size());
     for (std::size_t i = 0; i < expected_output.size(); ++i) {
       EXPECT_FLOAT_EQ(expected_output[i], Y_host->data<float>()[i]);
@@ -767,11 +763,10 @@ class MomentsGPUTest : public testing::Test {
       const std::vector<float>& variance_data) {
     Blob* blob_mean_host = ws_.CreateBlob("mean_host");
     auto* mean_host = BlobGetMutableTensor(blob_mean_host, CPU);
-    mean_host->CopyFrom(*mean_, cuda_context_.get());
+    mean_host->CopyFrom(*mean_);
     Blob* blob_variance_host = ws_.CreateBlob("variance_host");
     auto* variance_host = BlobGetMutableTensor(blob_variance_host, CPU);
-    variance_host->CopyFrom(*variance_, cuda_context_.get());
-    cuda_context_->FinishDeviceComputation();
+    variance_host->CopyFrom(*variance_);
     ASSERT_EQ(mean_data.size(), mean_host->size());
     for (std::size_t i = 0; i < mean_data.size(); ++i) {
@@ -891,8 +886,7 @@ class TransposeGPUTest : public testing::Test {
   void VerifyResult(const std::vector<float>& expected_output) {
     Blob* blob_y_host = ws_.CreateBlob("Y_host");
     auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU);
-    Y_host->CopyFrom(*Y_, cuda_context_.get());
-    cuda_context_->FinishDeviceComputation();
+    Y_host->CopyFrom(*Y_);
     ASSERT_EQ(expected_output.size(), Y_host->size());
     for (std::size_t i = 0; i < expected_output.size(); ++i) {
       EXPECT_FLOAT_EQ(expected_output[i], Y_host->data<float>()[i]);

View File

@@ -51,7 +51,6 @@ class VideoInputOp final : public PrefetchOperator<Context> {
       std::bernoulli_distribution* mirror_this_clip);
   const db::DBReader* reader_;
-  CPUContext cpu_context_;
   Tensor prefetched_clip_rgb_{CPU};
   Tensor prefetched_clip_of_{CPU};
   Tensor prefetched_label_{CPU};