Remove calls to CopyFrom that can be sync (#13205)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/13205

CopyFrom without a context argument does a synchronous copy on the current GPU - exactly what most call sites need.

This diff simplifies about 60% of CopyFrom usages. The most common pattern is a GPU->CPU copy followed by a FinishDeviceComputation call - the latter can simply be dropped.
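
A minimal before/after sketch of that pattern (tensor and context names are illustrative, not from any one call site):

    // before: the copy is enqueued on the context's stream and must be
    // explicitly synced before the CPU side can read the data
    cpu_tensor.CopyFrom(gpu_tensor, &context_);
    context_.FinishDeviceComputation();

    // after: omitting the context argument makes CopyFrom synchronous
    cpu_tensor.CopyFrom(gpu_tensor);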

Reviewed By: Yangqing

Differential Revision: D11236076

fbshipit-source-id: eb790ca494dfc5d5e3a7d850b45d6f73221bb204
Dmytro Dzhulgakov
2018-10-29 13:54:47 -07:00
committed by Facebook Github Bot
parent 8ad69a80e3
commit 5a2b2aa6af
14 changed files with 37 additions and 65 deletions

View File

@@ -847,6 +847,8 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
    * preserves the DeviceType of the source tensor (so, e.g., if you allocate
    * a tensor on CPU and then CopyFrom a CUDA tensor, that will to a
    * CUDA-to-CPU transfer).
+   *
+   * If the function is invoked without `context` the copy would be synchronous
    */
   void CopyFrom(const TensorImpl& src, at::BaseContext* context = nullptr) {
     AT_ASSERT(!is_variable());
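
For illustration, the two modes this default enables (tensor/context names are hypothetical):

    Tensor host{CPU};
    // no context: synchronous copy, host is safe to read immediately
    host.CopyFrom(cuda_tensor);
    // with a context: asynchronous copy on that context's stream; the caller
    // must still sync (e.g. FinishDeviceComputation) before reading host
    host.CopyFrom(cuda_tensor, &cuda_context);
    cuda_context.FinishDeviceComputation();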

View File

@@ -86,7 +86,6 @@ class ImageInputOp final
   unique_ptr<db::DBReader> owned_reader_;
   const db::DBReader* reader_;
-  CPUContext cpu_context_;
   Tensor prefetched_image_{CPU};
   Tensor prefetched_label_{CPU};
   vector<TensorCPU> prefetched_additional_outputs_;
@@ -1208,12 +1207,13 @@ bool ImageInputOp<Context>::Prefetch() {
   // If the context is not CPUContext, we will need to do a copy in the
   // prefetch function as well.
   if (!std::is_same<Context, CPUContext>::value) {
-    prefetched_image_on_device_.CopyFrom(prefetched_image_, &cpu_context_);
-    prefetched_label_on_device_.CopyFrom(prefetched_label_, &cpu_context_);
+    // do sync copies
+    prefetched_image_on_device_.CopyFrom(prefetched_image_);
+    prefetched_label_on_device_.CopyFrom(prefetched_label_);
     for (int i = 0; i < prefetched_additional_outputs_on_device_.size(); ++i) {
       prefetched_additional_outputs_on_device_[i].CopyFrom(
-          prefetched_additional_outputs_[i], &cpu_context_);
+          prefetched_additional_outputs_[i]);
     }
   }

View File

@@ -961,10 +961,10 @@ class CollectTensorOp final : public Operator<Context> {
     } else if (pos >= tensorVector->size()) {
       // append
       tensorVector->emplace_back(Context::GetDeviceType());
-      tensorVector->back().CopyFrom(tensor, &context_);
+      tensorVector->back().CopyFrom(tensor); // sync copy
     } else {
       // replace
-      tensorVector->at(pos).CopyFrom(tensor, &context_);
+      tensorVector->at(pos).CopyFrom(tensor); // sync copy
     }
   }

View File

@@ -7,7 +7,7 @@ namespace caffe2 {
 template <>
 template <typename T>
 bool EnforceFiniteOp<CUDAContext>::DoRunWithType() {
-  buffer_.CopyFrom(Input(0), &context_);
+  buffer_.CopyFrom(Input(0)); // sync copy
   EnforceOnCPU<T>(buffer_);
   return true;
 }

View File

@@ -15,8 +15,7 @@ bool LengthsTileOp<CPUContext>::RunOnDevice() {
   // Context::CopyFrom and math::Sum need the same context to avoid race
   // conditions
   // why? CPUContext is not used in Sum
-  lengths_host_.CopyFrom(lengths, &context_);
-  context_.FinishDeviceComputation();
+  lengths_host_.CopyFrom(lengths); // sync copy
   auto lengths_size = lengths_host_.numel();
   auto* lengths_data = lengths_host_.data<int32_t>();

View File

@@ -28,8 +28,7 @@ bool LengthsTileOp<CUDAContext>::RunOnDevice() {
   CAFFE_ENFORCE_GE(data.ndim(), 1, "DATA should be at least 1-D");
   CAFFE_ENFORCE_EQ(lengths.size(), data.dim(0));
-  lengths_host_.CopyFrom(lengths, &context_);
-  context_.FinishDeviceComputation();
+  lengths_host_.CopyFrom(lengths); // sync copy
   auto lengths_size = lengths_host_.size();
   auto* lengths_data = lengths_host_.data<int32_t>();

View File

@@ -62,12 +62,10 @@ class GPUFallbackOpEx final : public Operator<CUDAContext> {
   }
   bool RunOnDevice() override {
-    bool need_sync = false;
     for (int i = 0; i < InputSize(); ++i) {
       if (this->InputIsTensorType(i, CUDA)) {
-        BlobGetMutableTensor(local_input_blobs_[i], CPU)
-            ->CopyFrom(Input(i), &context_);
-        need_sync = true;
+        // use sync copy
+        BlobGetMutableTensor(local_input_blobs_[i], CPU)->CopyFrom(Input(i));
       } else {
         VLOG(1) << "Input " << i << " is not TensorCUDA. Skipping copy.";
         // Note(jiayq): This removes a const but conceptually
@@ -79,11 +77,6 @@ class GPUFallbackOpEx final : public Operator<CUDAContext> {
       }
     }
-    // Sync to make sure copies are done.
-    if (need_sync) {
-      context_.FinishDeviceComputation();
-    }
     if (!base_op_->Run()) {
       LOG(ERROR) << "Base op run failed in GPUFallbackOp. Def: "
                  << ProtoDebugString(this->debug_def());
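
The need_sync flag existed only to gate a single FinishDeviceComputation after the input-copy loop; with synchronous copies every CUDA input already sits in its CPU blob when the loop exits. Condensed, the resulting flow is roughly (an illustrative sketch, not the full operator):

    // copy each CUDA input to its CPU staging blob, synchronously
    for (int i = 0; i < InputSize(); ++i) {
      if (this->InputIsTensorType(i, CUDA)) {
        BlobGetMutableTensor(local_input_blobs_[i], CPU)->CopyFrom(Input(i));
      }
    }
    // no explicit sync step needed before running the CPU implementation
    return base_op_->Run();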

View File

@@ -190,7 +190,7 @@ void CreateAndRun(
   EXPECT_NE(nullptr, Y_blob);
   auto& Y = Y_blob->Get<Tensor>();
-  outResult->CopyFrom(Y, &context);
+  outResult->CopyFrom(Y);
 }
 } // namespace

View File

@@ -813,7 +813,7 @@ bool NanCheckOp<CPUContext>::RunOnDevice() {
   }
   if (&X != Y) {
-    Y->CopyFrom(X, &context_);
+    Y->CopyFrom(X);
   }
   return true;
 }

View File

@@ -107,9 +107,8 @@ bool NanCheckOp<CUDAContext>::RunOnDevice() {
     {
       std::lock_guard<std::mutex> lock(CUDAContext::mutex());
-      cpu_X.CopyFrom(Input(j), &context_);
+      cpu_X.CopyFrom(Input(j)); // sync copy
     }
-    context_.FinishDeviceComputation();
     std::cerr << "Input tensor: " << j << ": [" << this->debug_def().input(j)
               << "]" << std::endl;
     tensorPrinter_.Print<float>(cpu_X);

View File

@@ -132,9 +132,8 @@ class PrintOp final : public Operator<Context> {
     if (this->InputIsTensorType(0, CPU)) {
       tensor = &this->template Input<Tensor>(0, CPU);
     } else {
-      tensor_copy_if_needed.CopyFrom(Input(0), &context_);
-      // Make sure that the copy is finished.
-      context_.FinishDeviceComputation();
+      // sync copy
+      tensor_copy_if_needed.CopyFrom(Input(0));
       tensor = &tensor_copy_if_needed;
     }
     tensor_printer_.Print<T>(*tensor);

View File

@@ -60,8 +60,7 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 10) << i;
@@ -81,8 +80,7 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 15) << i;
@@ -102,8 +100,7 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) {
      tensorY->mutable_data<float>(),
      &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 20) << i;
@@ -160,8 +157,7 @@ TEST(MathROCBLASTest, GemmNoTransTrans) {
      tensorY->mutable_data<float>(),
      &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 10) << i;
@@ -181,8 +177,7 @@ TEST(MathROCBLASTest, GemmNoTransTrans) {
      tensorY->mutable_data<float>(),
      &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 15) << i;
@@ -201,8 +196,7 @@ TEST(MathROCBLASTest, GemmNoTransTrans) {
      tensorY->mutable_data<float>(),
      &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 20) << i;
@@ -256,8 +250,7 @@ TEST(MathROCBLASTest, GemvNoTrans) {
      tensorY->mutable_data<float>(),
      &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 10) << i;
   }
@@ -274,8 +267,7 @@ TEST(MathROCBLASTest, GemvNoTrans) {
      tensorY->mutable_data<float>(),
      &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 15) << i;
   }
@@ -292,8 +284,7 @@ TEST(MathROCBLASTest, GemvNoTrans) {
      tensorY->mutable_data<float>(),
      &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 20) << i;
   }
@@ -346,8 +337,7 @@ TEST(MathROCBLASTest, GemvTrans) {
      tensorY->mutable_data<float>(),
      &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 6) << i;
   }
@@ -364,8 +354,7 @@ TEST(MathROCBLASTest, GemvTrans) {
      tensorY->mutable_data<float>(),
      &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 9) << i;
   }
@@ -382,8 +371,7 @@ TEST(MathROCBLASTest, GemvTrans) {
      tensorY->mutable_data<float>(),
      &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 12) << i;
   }

View File

@@ -72,8 +72,7 @@ void executeGpuBinaryOpTest(
   // Copy result to CPU so we can inspect it
   auto* tensory_host = BlobGetMutableTensor(bloby_host, CPU);
-  tensory_host->CopyFrom(*tensory, &context);
-  context.FinishDeviceComputation();
+  tensory_host->CopyFrom(*tensory);
   for (int i = 0; i < shapey; ++i) {
     EXPECT_EQ(tensory_host->data<float>()[i], correct_output(i));
@@ -126,8 +125,7 @@ TEST(MathUtilGPUTest, testAddStripedBatch) {
   // Copy result to CPU so we can inspect it
   auto* tensory_host = BlobGetMutableTensor(bloby_host, CPU);
-  tensory_host->CopyFrom(*tensory, &context);
-  context.FinishDeviceComputation();
+  tensory_host->CopyFrom(*tensory);
   for (int k = 0; k < 33; k++) {
     for (int i = 0; i < 25; i++) {
@@ -403,8 +401,7 @@ class ReduceTensorGPUTest : public testing::Test {
   void VerifyResult(const std::vector<float>& expected_output) {
     Blob* blob_y_host = ws_.CreateBlob("Y_host");
     auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU);
-    Y_host->CopyFrom(*Y_, cuda_context_.get());
-    cuda_context_->FinishDeviceComputation();
+    Y_host->CopyFrom(*Y_);
     ASSERT_EQ(expected_output.size(), Y_host->size());
     for (std::size_t i = 0; i < expected_output.size(); ++i) {
       EXPECT_FLOAT_EQ(expected_output[i], Y_host->data<float>()[i]);
@@ -682,8 +679,7 @@ class BroadcastGPUTest : public testing::Test {
   void VerifyResult(const std::vector<float>& expected_output) {
     Blob* blob_y_host = ws_.CreateBlob("Y_host");
     auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU);
-    Y_host->CopyFrom(*Y_, cuda_context_.get());
-    cuda_context_->FinishDeviceComputation();
+    Y_host->CopyFrom(*Y_);
     ASSERT_EQ(expected_output.size(), Y_host->size());
     for (std::size_t i = 0; i < expected_output.size(); ++i) {
       EXPECT_FLOAT_EQ(expected_output[i], Y_host->data<float>()[i]);
@@ -767,11 +763,10 @@ class MomentsGPUTest : public testing::Test {
       const std::vector<float>& variance_data) {
     Blob* blob_mean_host = ws_.CreateBlob("mean_host");
     auto* mean_host = BlobGetMutableTensor(blob_mean_host, CPU);
-    mean_host->CopyFrom(*mean_, cuda_context_.get());
+    mean_host->CopyFrom(*mean_);
     Blob* blob_variance_host = ws_.CreateBlob("variance_host");
     auto* variance_host = BlobGetMutableTensor(blob_variance_host, CPU);
-    variance_host->CopyFrom(*variance_, cuda_context_.get());
-    cuda_context_->FinishDeviceComputation();
+    variance_host->CopyFrom(*variance_);
     ASSERT_EQ(mean_data.size(), mean_host->size());
     for (std::size_t i = 0; i < mean_data.size(); ++i) {
@@ -891,8 +886,7 @@ class TransposeGPUTest : public testing::Test {
   void VerifyResult(const std::vector<float>& expected_output) {
     Blob* blob_y_host = ws_.CreateBlob("Y_host");
     auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU);
-    Y_host->CopyFrom(*Y_, cuda_context_.get());
-    cuda_context_->FinishDeviceComputation();
+    Y_host->CopyFrom(*Y_);
     ASSERT_EQ(expected_output.size(), Y_host->size());
     for (std::size_t i = 0; i < expected_output.size(); ++i) {
       EXPECT_FLOAT_EQ(expected_output[i], Y_host->data<float>()[i]);

View File

@@ -51,7 +51,6 @@ class VideoInputOp final : public PrefetchOperator<Context> {
       std::bernoulli_distribution* mirror_this_clip);
   const db::DBReader* reader_;
-  CPUContext cpu_context_;
   Tensor prefetched_clip_rgb_{CPU};
   Tensor prefetched_clip_of_{CPU};
   Tensor prefetched_label_{CPU};