Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12103

This defers lookup of defaults to the site where we read out of
TensorOptions. THIS IS A BC-BREAKING BEHAVIOR CHANGE, but we expect the bulk
of uses of OptionsGuard don't allocate TensorOptions inside the OptionsGuard
region and then use it outside of the region (the situation where behavior
could change).

I also optimize the size of TensorOptions by rearranging fields, so that we
always fit in two 64-bit words.

Reviewed By: goldsborough

Differential Revision: D10052523

fbshipit-source-id: f454a15b4dbf8cd17bc902ab7d2016f2f689ed13
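
To illustrate the BC-breaking case, here is a minimal sketch (not code from
this diff; it assumes an OptionsGuard constructible from a TensorOptions,
and the exact spelling of that API at this revision may differ):

// Hypothetical illustration only. OptionsGuard sets thread-local default
// tensor options for the duration of its scope.
torch::TensorOptions options;
{
  torch::OptionsGuard guard(torch::device(torch::kCUDA));
  options = torch::TensorOptions(); // constructed inside the guard region
}
// Previously, `options` captured the CUDA default when it was constructed.
// After this change, defaults are resolved when the options are *read*, so
// this allocation outside the guard sees the current (CPU) defaults instead:
auto tensor = torch::ones(3, options);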
232 lines
7.5 KiB
C++
#include <gtest/gtest.h>

#include <torch/csrc/autograd/functions/comm.h>
#include <torch/nn/module.h>
#include <torch/nn/modules/linear.h>
#include <torch/nn/parallel/data_parallel.h>
#include <torch/nn/pimpl.h>
#include <torch/tensor.h>

#include <test/cpp/api/support.h>

#include <iostream>
#include <memory>
#include <utility>
#include <vector>

using namespace torch::autograd;
using namespace torch::nn;

struct ParallelTest : torch::test::SeedingFixture {};
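
// Scatter should split the input tensor across the two CUDA devices, and the
// backward pass should deliver the gradient back to the input's (CPU) device.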
TEST_F(ParallelTest, DifferentiableScatter_MultiCUDA) {
  Scatter scatter(
      {torch::Device(torch::kCUDA, 0), torch::Device(torch::kCUDA, 1)});

  auto input = torch::ones(10, torch::requires_grad(true));
  auto output = scatter.apply({input});

  ASSERT_EQ(output.size(), 2);
  ASSERT_EQ(output[0].size(0), 5);
  ASSERT_EQ(output[1].size(0), 5);

  ASSERT_TRUE(torch::cat({output[0].to(torch::kCPU), output[1].to(torch::kCPU)})
                  .allclose(input));

  torch::Tensor sum = output[0].to({torch::kCUDA, 1}) + output[1];
  sum.backward();

  ASSERT_TRUE(input.grad().defined());
  ASSERT_TRUE(input.grad().device().is_cpu());
  ASSERT_EQ(input.grad().sum().item<int32_t>(), 10);
}
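
// Gather is the inverse operation: it concatenates per-device tensors onto a
// single target device, and its backward routes each gradient chunk back to
// the device its input came from.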
TEST_F(ParallelTest, DifferentiableGather_MultiCUDA) {
  Gather gather(torch::Device(torch::kCUDA, 1));

  auto a = torch::ones(5, torch::requires_grad(true).device({torch::kCUDA, 0}));
  auto b = torch::ones(5, torch::requires_grad(true).device({torch::kCUDA, 1}));

  auto outputs = gather.apply({a, b});
  ASSERT_EQ(outputs.size(), 1);
  torch::Tensor output = outputs.front();

  ASSERT_EQ(output.size(0), 10);
  ASSERT_EQ(output.device(), torch::Device(torch::kCUDA, 1));

  auto chunks = output.chunk(2);
  ASSERT_TRUE(chunks[0].to({torch::kCUDA, 0}).allclose(a));
  ASSERT_TRUE(chunks[1].allclose(b));

  output.backward();

  ASSERT_TRUE(a.grad().defined());
  ASSERT_EQ(a.grad().device(), torch::Device(torch::kCUDA, 0));
  ASSERT_EQ(a.grad().sum().item<int32_t>(), 5);

  ASSERT_TRUE(b.grad().defined());
  ASSERT_EQ(b.grad().device(), torch::Device(torch::kCUDA, 1));
  ASSERT_EQ(b.grad().sum().item<int32_t>(), 5);
}
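
// replicate() should produce one copy of the module per device; each
// replica's parameters match the original's values but do not share its
// storage.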
TEST_F(ParallelTest, Replicate_MultiCUDA) {
  Linear linear(3, 4);
  auto replicas = parallel::replicate(
      linear, {torch::Device(torch::kCUDA, 0), torch::Device(torch::kCUDA, 1)});
  ASSERT_EQ(replicas.size(), 2);

  auto original_parameters = linear->parameters();

  auto replica1_parameters = replicas[0]->parameters();
  for (auto& parameter : replica1_parameters) {
    ASSERT_EQ(parameter->device(), torch::Device(torch::kCUDA, 0));
  }
  replicas[0]->to(torch::kCPU);
  ASSERT_EQ(replica1_parameters.size(), original_parameters.size());
  for (size_t i = 0; i < original_parameters.size(); ++i) {
    ASSERT_TRUE(replica1_parameters[i]->allclose(*original_parameters[i]));
    ASSERT_TRUE(
        replica1_parameters[i]->data<float>() !=
        original_parameters[i]->data<float>());
  }

  auto replica2_parameters = replicas[1]->parameters();
  for (auto& parameter : replica2_parameters) {
    ASSERT_EQ(parameter->device(), torch::Device(torch::kCUDA, 1));
  }
  replicas[1]->to(torch::kCPU);
  ASSERT_EQ(replica2_parameters.size(), original_parameters.size());
  for (size_t i = 0; i < original_parameters.size(); ++i) {
    ASSERT_TRUE(replica2_parameters[i]->allclose(*original_parameters[i]));
    ASSERT_TRUE(
        replica2_parameters[i]->data<float>() !=
        original_parameters[i]->data<float>());
  }
}
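
// parallel_apply() runs each module on its corresponding input; with no
// explicit device list, each output stays on its input's device.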
TEST_F(ParallelTest, ParallelApply_MultiCUDA) {
  Linear a(3, 4);

  Linear b(std::dynamic_pointer_cast<LinearImpl>(a->clone()));
  b->to({torch::kCUDA, 0});

  Linear c(std::dynamic_pointer_cast<LinearImpl>(a->clone()));
  c->to({torch::kCUDA, 1});

  std::vector<Linear> modules = {a, b, c};
  std::vector<torch::Tensor> inputs = {
      torch::ones({2, 3}),
      torch::ones({2, 3}, torch::device({torch::kCUDA, 0})),
      torch::ones({2, 3}, torch::device({torch::kCUDA, 1}))};

  auto outputs = parallel::parallel_apply(modules, inputs);

  ASSERT_EQ(outputs.size(), 3);
  ASSERT_TRUE(outputs[0].device().is_cpu());

  ASSERT_EQ(outputs[1].device(), torch::Device(torch::kCUDA, 0));
  ASSERT_TRUE(outputs[1].to(torch::kCPU).allclose(outputs[0]));

  ASSERT_EQ(outputs[2].device(), torch::Device(torch::kCUDA, 1));
  ASSERT_TRUE(outputs[2].to(torch::kCPU).allclose(outputs[0]));
}
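
// With an explicit device list, each module's forward() runs with that device
// as the default, so each output lands on the requested device even though
// forward() never specifies one.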
TEST_F(ParallelTest, ParallelApplyWithDifferentOutputDevice_MultiCUDA) {
  struct M : torch::nn::Module {
    torch::Tensor forward(torch::Tensor input) {
      return torch::ones({5}, torch::dtype(torch::kInt32));
    }
  };

  std::vector<std::shared_ptr<M>> modules = {
      std::make_shared<M>(), std::make_shared<M>(), std::make_shared<M>()};
  std::vector<torch::Tensor> inputs = {
      torch::empty({}), torch::empty({}), torch::empty({})};
  std::vector<torch::Device> devices = {
      {torch::kCUDA, 1}, {torch::kCUDA, 0}, {torch::kCPU}};

  auto outputs = parallel::parallel_apply(modules, inputs, devices);

  ASSERT_EQ(outputs.size(), 3);
  ASSERT_TRUE(outputs[0].device().is_cuda());
  ASSERT_EQ(outputs[0].device(), torch::Device(torch::kCUDA, 1));

  ASSERT_TRUE(outputs[1].device().is_cuda());
  ASSERT_EQ(outputs[1].device(), torch::Device(torch::kCUDA, 0));

  ASSERT_TRUE(outputs[2].device().is_cpu());
}
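
// An exception thrown inside a replica's forward() should propagate out of
// data_parallel() to the caller.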
TEST_F(ParallelTest, ParallelApplyRethrowsException_MultiCUDA) {
  struct M : torch::nn::Cloneable<M> {
    void reset() override {}
    torch::Tensor forward(torch::Tensor input) {
      throw std::runtime_error("Badness!");
    }
  };

  auto m = std::make_shared<M>();
  auto input = torch::ones({10, 3});
  ASSERT_THROWS_WITH(parallel::data_parallel(m, input), "Badness!");
}
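
// data_parallel() computes on the replica devices but should place the final
// output on the explicitly requested output device.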
TEST_F(
    ParallelTest,
    DataParallelPlacesTheOutputOnTheRequestedDevice_MultiCUDA) {
  struct M : torch::nn::Cloneable<M> {
    void reset() override {}
    torch::Tensor forward(torch::Tensor input) {
      // Intermediate tensors should be on the replica's current device.
      intermediate_tensor = torch::rand(5);
      // The returned tensor should be on the output device.
      return torch::ones(3);
    }
    torch::Tensor intermediate_tensor;
  };
  auto m = std::make_shared<M>();
  auto input = torch::ones({10, 3});
  {
    auto output = parallel::data_parallel(
        m,
        input,
        /*devices=*/at::nullopt,
        /*output_device=*/torch::Device(torch::kCUDA, 1));
    ASSERT_TRUE(output.defined());
    ASSERT_TRUE(output.device().is_cuda());
    ASSERT_EQ(output.device().index(), 1);
  }
  {
    // Verify for the single-device case (where we don't scatter/gather).
    auto output = parallel::data_parallel(
        m,
        input,
        /*devices=*/std::vector<torch::Device>{torch::Device(torch::kCUDA, 0)},
        /*output_device=*/torch::Device(torch::kCUDA, 1));
    ASSERT_TRUE(m->intermediate_tensor.defined());
    ASSERT_TRUE(m->intermediate_tensor.device().is_cuda());
    ASSERT_EQ(m->intermediate_tensor.device().index(), 0);
    ASSERT_TRUE(output.defined());
    ASSERT_TRUE(output.device().is_cuda());
    ASSERT_EQ(output.device().index(), 1);
  }
}
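
// Without a device list, data_parallel() should use every available CUDA
// device; each replica runs with its own device as the thread-local default.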
TEST_F(ParallelTest, DataParallelUsesAllAvailableCUDADevices_CUDA) {
  struct M : torch::nn::Cloneable<M> {
    void reset() override {}
    torch::Tensor forward(torch::Tensor input) {
      return torch::tensor(torch::getDefaultTensorOptions().device().index());
    }
  };

  auto m = std::make_shared<M>();
  auto input = torch::ones({10, 3});
  auto output = parallel::data_parallel(m, input);

  const auto device_count = torch::cuda::device_count();
  ASSERT_EQ(output.numel(), device_count);
  for (size_t i = 0; i < device_count; ++i) {
    ASSERT_EQ(output[i].item<int32_t>(), i);
  }
}