Remove calls to CopyFrom that can be sync (#13205)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/13205

CopyFrom without a context argument performs a synchronous copy on the current GPU, which is exactly what most call sites need. This diff removes about 60% of the CopyFrom usages that passed a context. The most common pattern is a GPU->CPU copy followed by FinishDeviceComputation; the latter can simply be dropped.

Reviewed By: Yangqing

Differential Revision: D11236076

fbshipit-source-id: eb790ca494dfc5d5e3a7d850b45d6f73221bb204
Committed by: Facebook Github Bot
Parent: 8ad69a80e3
Commit: 5a2b2aa6af
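Not part of the commit itself: a minimal C++ sketch of the GPU->CPU readback pattern this diff simplifies, assuming the Caffe2 Tensor/CUDAContext API as it appears in the hunks below. The include paths, function name, and variable names are illustrative only.

#include "caffe2/core/context_gpu.h"
#include "caffe2/core/tensor.h"

namespace caffe2 {

// Hypothetical helper: read a GPU tensor back to the host for inspection.
void ReadBackToHost(const Tensor& device_tensor, CUDAContext* context) {
  Tensor host_tensor(CPU);

  // Old pattern: copy through the context, then block until the copy lands.
  //   host_tensor.CopyFrom(device_tensor, context);
  //   context->FinishDeviceComputation();

  // New pattern: CopyFrom without a context argument is a synchronous copy,
  // so the explicit FinishDeviceComputation call can be dropped.
  host_tensor.CopyFrom(device_tensor);
}

} // namespace caffe2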
@@ -847,6 +847,8 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
    * preserves the DeviceType of the source tensor (so, e.g., if you allocate
    * a tensor on CPU and then CopyFrom a CUDA tensor, that will to a
    * CUDA-to-CPU transfer).
+   *
+   * If the function is invoked without `context` the copy would be synchronous
    */
   void CopyFrom(const TensorImpl& src, at::BaseContext* context = nullptr) {
     AT_ASSERT(!is_variable());
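Also not part of the diff: a hedged sketch of the behaviour the doc comment above describes — the destination tensor keeps its device, so a CPU-allocated tensor that does CopyFrom on a CUDA tensor performs a CUDA-to-CPU transfer, synchronously when no context is passed. The names below are made up for illustration.

#include "caffe2/core/tensor.h"

namespace caffe2 {

// Hypothetical helper: inspect a CUDA tensor's contents from the host.
void InspectOnHost(const Tensor& cuda_tensor) {
  Tensor host_copy(CPU);            // destination stays a CPU tensor
  host_copy.CopyFrom(cuda_tensor);  // CUDA->CPU transfer, synchronous (no context)

  // Host data is valid as soon as the synchronous copy returns.
  const float* data = host_copy.data<float>();
  (void)data;
}

} // namespace caffe2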
@@ -86,7 +86,6 @@ class ImageInputOp final

   unique_ptr<db::DBReader> owned_reader_;
   const db::DBReader* reader_;
-  CPUContext cpu_context_;
   Tensor prefetched_image_{CPU};
   Tensor prefetched_label_{CPU};
   vector<TensorCPU> prefetched_additional_outputs_;
@@ -1208,12 +1207,13 @@ bool ImageInputOp<Context>::Prefetch() {
   // If the context is not CPUContext, we will need to do a copy in the
   // prefetch function as well.
   if (!std::is_same<Context, CPUContext>::value) {
-    prefetched_image_on_device_.CopyFrom(prefetched_image_, &cpu_context_);
-    prefetched_label_on_device_.CopyFrom(prefetched_label_, &cpu_context_);
+    // do sync copies
+    prefetched_image_on_device_.CopyFrom(prefetched_image_);
+    prefetched_label_on_device_.CopyFrom(prefetched_label_);

     for (int i = 0; i < prefetched_additional_outputs_on_device_.size(); ++i) {
       prefetched_additional_outputs_on_device_[i].CopyFrom(
-          prefetched_additional_outputs_[i], &cpu_context_);
+          prefetched_additional_outputs_[i]);
     }
   }
@@ -961,10 +961,10 @@ class CollectTensorOp final : public Operator<Context> {
     } else if (pos >= tensorVector->size()) {
       // append
       tensorVector->emplace_back(Context::GetDeviceType());
-      tensorVector->back().CopyFrom(tensor, &context_);
+      tensorVector->back().CopyFrom(tensor); // sync copy
     } else {
       // replace
-      tensorVector->at(pos).CopyFrom(tensor, &context_);
+      tensorVector->at(pos).CopyFrom(tensor); // sync copy
     }
   }
@@ -7,7 +7,7 @@ namespace caffe2 {
 template <>
 template <typename T>
 bool EnforceFiniteOp<CUDAContext>::DoRunWithType() {
-  buffer_.CopyFrom(Input(0), &context_);
+  buffer_.CopyFrom(Input(0)); // sync copy
   EnforceOnCPU<T>(buffer_);
   return true;
 }
@@ -15,8 +15,7 @@ bool LengthsTileOp<CPUContext>::RunOnDevice() {
   // Context::CopyFrom and math::Sum need the same context to avoid race
   // conditions
   // why? CPUContext is not used in Sum
-  lengths_host_.CopyFrom(lengths, &context_);
-  context_.FinishDeviceComputation();
+  lengths_host_.CopyFrom(lengths); // sync copy
   auto lengths_size = lengths_host_.numel();
   auto* lengths_data = lengths_host_.data<int32_t>();
@@ -28,8 +28,7 @@ bool LengthsTileOp<CUDAContext>::RunOnDevice() {
   CAFFE_ENFORCE_GE(data.ndim(), 1, "DATA should be at least 1-D");
   CAFFE_ENFORCE_EQ(lengths.size(), data.dim(0));

-  lengths_host_.CopyFrom(lengths, &context_);
-  context_.FinishDeviceComputation();
+  lengths_host_.CopyFrom(lengths); // sync copy
   auto lengths_size = lengths_host_.size();
   auto* lengths_data = lengths_host_.data<int32_t>();
@@ -62,12 +62,10 @@ class GPUFallbackOpEx final : public Operator<CUDAContext> {
   }

   bool RunOnDevice() override {
-    bool need_sync = false;
     for (int i = 0; i < InputSize(); ++i) {
       if (this->InputIsTensorType(i, CUDA)) {
-        BlobGetMutableTensor(local_input_blobs_[i], CPU)
-            ->CopyFrom(Input(i), &context_);
-        need_sync = true;
+        // use sync copy
+        BlobGetMutableTensor(local_input_blobs_[i], CPU)->CopyFrom(Input(i));
       } else {
         VLOG(1) << "Input " << i << " is not TensorCUDA. Skipping copy.";
         // Note(jiayq): This removes a const but conceptually
@@ -79,11 +77,6 @@ class GPUFallbackOpEx final : public Operator<CUDAContext> {
       }
     }

-    // Sync to make sure copies are done.
-    if (need_sync) {
-      context_.FinishDeviceComputation();
-    }
-
     if (!base_op_->Run()) {
       LOG(ERROR) << "Base op run failed in GPUFallbackOp. Def: "
                  << ProtoDebugString(this->debug_def());
@@ -190,7 +190,7 @@ void CreateAndRun(
   EXPECT_NE(nullptr, Y_blob);

   auto& Y = Y_blob->Get<Tensor>();
-  outResult->CopyFrom(Y, &context);
+  outResult->CopyFrom(Y);
 }

 } // namespace
@@ -813,7 +813,7 @@ bool NanCheckOp<CPUContext>::RunOnDevice() {
   }

   if (&X != Y) {
-    Y->CopyFrom(X, &context_);
+    Y->CopyFrom(X);
   }
   return true;
 }
@@ -107,9 +107,8 @@ bool NanCheckOp<CUDAContext>::RunOnDevice() {

       {
         std::lock_guard<std::mutex> lock(CUDAContext::mutex());
-        cpu_X.CopyFrom(Input(j), &context_);
+        cpu_X.CopyFrom(Input(j)); // sync copy
       }
-      context_.FinishDeviceComputation();
       std::cerr << "Input tensor: " << j << ": [" << this->debug_def().input(j)
                 << "]" << std::endl;
       tensorPrinter_.Print<float>(cpu_X);
@@ -132,9 +132,8 @@ class PrintOp final : public Operator<Context> {
     if (this->InputIsTensorType(0, CPU)) {
       tensor = &this->template Input<Tensor>(0, CPU);
     } else {
-      tensor_copy_if_needed.CopyFrom(Input(0), &context_);
-      // Make sure that the copy is finished.
-      context_.FinishDeviceComputation();
+      // sync copy
+      tensor_copy_if_needed.CopyFrom(Input(0));
       tensor = &tensor_copy_if_needed;
     }
     tensor_printer_.Print<T>(*tensor);
@@ -60,8 +60,7 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 10) << i;
@@ -81,8 +80,7 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) {
      tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 15) << i;
@@ -102,8 +100,7 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 20) << i;
@@ -160,8 +157,7 @@ TEST(MathROCBLASTest, GemmNoTransTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 10) << i;
@@ -181,8 +177,7 @@ TEST(MathROCBLASTest, GemmNoTransTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 15) << i;
@@ -201,8 +196,7 @@ TEST(MathROCBLASTest, GemmNoTransTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 20) << i;
@@ -256,8 +250,7 @@ TEST(MathROCBLASTest, GemvNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 10) << i;
   }
@@ -274,8 +267,7 @@ TEST(MathROCBLASTest, GemvNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 15) << i;
   }
@@ -292,8 +284,7 @@ TEST(MathROCBLASTest, GemvNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 20) << i;
   }
@@ -346,8 +337,7 @@ TEST(MathROCBLASTest, GemvTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 6) << i;
   }
@@ -364,8 +354,7 @@ TEST(MathROCBLASTest, GemvTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 9) << i;
   }
@@ -382,8 +371,7 @@ TEST(MathROCBLASTest, GemvTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 12) << i;
   }
@@ -72,8 +72,7 @@ void executeGpuBinaryOpTest(

   // Copy result to CPU so we can inspect it
   auto* tensory_host = BlobGetMutableTensor(bloby_host, CPU);
-  tensory_host->CopyFrom(*tensory, &context);
-  context.FinishDeviceComputation();
+  tensory_host->CopyFrom(*tensory);

   for (int i = 0; i < shapey; ++i) {
     EXPECT_EQ(tensory_host->data<float>()[i], correct_output(i));
@@ -126,8 +125,7 @@ TEST(MathUtilGPUTest, testAddStripedBatch) {

   // Copy result to CPU so we can inspect it
   auto* tensory_host = BlobGetMutableTensor(bloby_host, CPU);
-  tensory_host->CopyFrom(*tensory, &context);
-  context.FinishDeviceComputation();
+  tensory_host->CopyFrom(*tensory);

   for (int k = 0; k < 33; k++) {
     for (int i = 0; i < 25; i++) {
@@ -403,8 +401,7 @@ class ReduceTensorGPUTest : public testing::Test {
   void VerifyResult(const std::vector<float>& expected_output) {
     Blob* blob_y_host = ws_.CreateBlob("Y_host");
     auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU);
-    Y_host->CopyFrom(*Y_, cuda_context_.get());
-    cuda_context_->FinishDeviceComputation();
+    Y_host->CopyFrom(*Y_);
     ASSERT_EQ(expected_output.size(), Y_host->size());
     for (std::size_t i = 0; i < expected_output.size(); ++i) {
       EXPECT_FLOAT_EQ(expected_output[i], Y_host->data<float>()[i]);
@@ -682,8 +679,7 @@ class BroadcastGPUTest : public testing::Test {
   void VerifyResult(const std::vector<float>& expected_output) {
     Blob* blob_y_host = ws_.CreateBlob("Y_host");
     auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU);
-    Y_host->CopyFrom(*Y_, cuda_context_.get());
-    cuda_context_->FinishDeviceComputation();
+    Y_host->CopyFrom(*Y_);
     ASSERT_EQ(expected_output.size(), Y_host->size());
     for (std::size_t i = 0; i < expected_output.size(); ++i) {
       EXPECT_FLOAT_EQ(expected_output[i], Y_host->data<float>()[i]);
@@ -767,11 +763,10 @@ class MomentsGPUTest : public testing::Test {
       const std::vector<float>& variance_data) {
     Blob* blob_mean_host = ws_.CreateBlob("mean_host");
     auto* mean_host = BlobGetMutableTensor(blob_mean_host, CPU);
-    mean_host->CopyFrom(*mean_, cuda_context_.get());
+    mean_host->CopyFrom(*mean_);
     Blob* blob_variance_host = ws_.CreateBlob("variance_host");
     auto* variance_host = BlobGetMutableTensor(blob_variance_host, CPU);
-    variance_host->CopyFrom(*variance_, cuda_context_.get());
-    cuda_context_->FinishDeviceComputation();
+    variance_host->CopyFrom(*variance_);

     ASSERT_EQ(mean_data.size(), mean_host->size());
     for (std::size_t i = 0; i < mean_data.size(); ++i) {
@@ -891,8 +886,7 @@ class TransposeGPUTest : public testing::Test {
   void VerifyResult(const std::vector<float>& expected_output) {
     Blob* blob_y_host = ws_.CreateBlob("Y_host");
     auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU);
-    Y_host->CopyFrom(*Y_, cuda_context_.get());
-    cuda_context_->FinishDeviceComputation();
+    Y_host->CopyFrom(*Y_);
     ASSERT_EQ(expected_output.size(), Y_host->size());
     for (std::size_t i = 0; i < expected_output.size(); ++i) {
       EXPECT_FLOAT_EQ(expected_output[i], Y_host->data<float>()[i]);
@@ -51,7 +51,6 @@ class VideoInputOp final : public PrefetchOperator<Context> {
       std::bernoulli_distribution* mirror_this_clip);

   const db::DBReader* reader_;
-  CPUContext cpu_context_;
   Tensor prefetched_clip_rgb_{CPU};
   Tensor prefetched_clip_of_{CPU};
   Tensor prefetched_label_{CPU};