#include <gtest/gtest.h>

#include <torch/torch.h>

#include <test/cpp/api/optim_baseline.h>
#include <test/cpp/api/support.h>

#include <cmath>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <memory>
#include <random>
#include <vector>

using namespace torch::nn;
using namespace torch::optim;

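// Trains a small 2-8-1 sigmoid MLP on randomly sampled XOR batches with the
// given optimizer. Returns true once the exponentially smoothed loss drops
// below 0.1, and false if that has not happened after
// kMaximumNumberOfEpochs epochs.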
template <typename OptimizerClass, typename Options>
bool test_optimizer_xor(Options options) {
  torch::manual_seed(0);

  Sequential model(
      Linear(2, 8),
      Functional(torch::sigmoid),
      Linear(8, 1),
      Functional(torch::sigmoid));

  const int64_t kBatchSize = 200;
  const int64_t kMaximumNumberOfEpochs = 3000;

  OptimizerClass optimizer(model->parameters(), options);

  float running_loss = 1;
  int epoch = 0;
  while (running_loss > 0.1) {
    auto inputs = torch::empty({kBatchSize, 2});
    auto labels = torch::empty({kBatchSize});
    for (size_t i = 0; i < kBatchSize; i++) {
      inputs[i] = torch::randint(2, {2}, torch::kInt64);
      labels[i] = inputs[i][0].item<int64_t>() ^ inputs[i][1].item<int64_t>();
    }

    inputs.set_requires_grad(true);

    auto step = [&](OptimizerClass& optimizer,
                    Sequential model,
                    torch::Tensor inputs,
                    torch::Tensor labels) {
      auto closure = [&]() {
        optimizer.zero_grad();
        auto x = model->forward(inputs);
        auto loss = torch::binary_cross_entropy(x, labels);
        loss.backward();
        return loss;
      };
      return optimizer.step(closure);
    };

    torch::Tensor loss = step(optimizer, model, inputs, labels);

    running_loss = running_loss * 0.99 + loss.item<float>() * 0.01;
    if (epoch > kMaximumNumberOfEpochs) {
      std::cout << "Loss is too high after epoch " << epoch << ": "
                << running_loss << std::endl;
      return false;
    }
    epoch++;
  }
  return true;
}

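// Overwrites a named parameter of `parameters` with fixed values so that the
// exact-value tests below do not depend on random initialization.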
template <typename Parameters>
void assign_parameter(
    const Parameters& parameters,
    const char* name,
    torch::Tensor new_tensor) {
  auto parameter = parameters[name];
  parameter.set_requires_grad(false);
  parameter.flatten().copy_(new_tensor);
  parameter.set_requires_grad(true);
}

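// Runs kIterations optimization steps on a small fixed model and, every
// kSampleEvery steps, checks each parameter against the expected values from
// test/cpp/api/optim_baseline.h (baselines recorded from the Python
// optimizer implementations).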
template <typename OptimizerClass, typename Options>
void check_exact_values(
    Options options,
    std::vector<std::vector<torch::Tensor>> expected_parameters) {
  const size_t kIterations = 1001;
  const size_t kSampleEvery = 100;

  torch::manual_seed(0);

  Sequential model(
      Linear(2, 3),
      Functional(torch::sigmoid),
      Linear(3, 1),
      Functional(torch::sigmoid));

  model->to(torch::kFloat64);

  // Use exact input values because matching random values is hard.
  auto parameters = model->named_parameters();
  assign_parameter(
      parameters,
      "0.weight",
      torch::tensor(
          {-0.2109, -0.4976, -0.1413, -0.3420, -0.2524, 0.6976},
          torch::kFloat64));
  assign_parameter(
      parameters,
      "0.bias",
      torch::tensor({-0.1085, -0.2979, 0.6892}, torch::kFloat64));
  assign_parameter(
      parameters,
      "2.weight",
      torch::tensor({-0.0508, -0.3941, -0.2843}, torch::kFloat64));
  assign_parameter(
      parameters, "2.bias", torch::tensor({-0.0711}, torch::kFloat64));

  auto optimizer = OptimizerClass(parameters.values(), options);
  torch::Tensor input =
      torch::tensor({0.1, 0.2, 0.3, 0.4, 0.5, 0.6}, torch::kFloat64)
          .reshape({3, 2});

  for (size_t i = 0; i < kIterations; ++i) {
    optimizer.zero_grad();
    auto output = model->forward(input);
    auto loss = output.sum();
    loss.backward();

    auto closure = []() { return torch::tensor({10}); };
    optimizer.step(closure);

    if (i % kSampleEvery == 0) {
      ASSERT_TRUE(
          expected_parameters.at(i / kSampleEvery).size() == parameters.size());
      for (size_t p = 0; p < parameters.size(); ++p) {
        ASSERT_TRUE(parameters[p]->defined());
        // Always compare using double dtype, regardless of the original dtype
        // of the tensors.
        auto computed = parameters[p]->flatten().to(torch::kFloat64);
        auto expected =
            expected_parameters.at(i / kSampleEvery).at(p).to(torch::kFloat64);
        if (!computed.allclose(expected, /*rtol=*/1e-3, /*atol=*/5e-4)) {
          std::cout << "Iteration " << i << ": " << computed
                    << " != " << expected << " (parameter " << p << ")"
                    << std::endl;
          ASSERT_TRUE(false);
        }
      }
    }
  }
}

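// Exercises the optimizer accessor API: defaults(), param_groups(), state(),
// and the duplicate-parameter check in add_param_group(), in both their
// non-const and const flavors.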
TEST(OptimTest, OptimizerAccessors) {
  auto options = AdagradOptions(1.0);
  std::vector<torch::Tensor> params;
  for (size_t i = 0; i < 3; i++) {
    params.push_back(torch::randn(10));
  }
  auto optimizer = Adagrad(params, options);
  // test for defaults() method with non-const reference
  auto& options_ = static_cast<AdagradOptions&>(optimizer.defaults());
  ASSERT_TRUE(options == options_);
  // test for param_groups() with non-const reference return
  auto& params_groups = optimizer.param_groups();
  params_groups.push_back(OptimizerParamGroup(params));
  auto& params_1 = params_groups[1].params();
  for (size_t i = 0; i < params_1.size(); i++) {
    ASSERT_TRUE(torch::equal(params[i], params_1[i]));
  }

  // test for add_param_group() when one or more params existing in another
  // param_group are passed in the new param group to be added
  ASSERT_THROWS_WITH(
      optimizer.add_param_group(OptimizerParamGroup(params)),
      "some parameters appear in more than one parameter group");

  // test for state() with non-const reference return
  auto& state_ = static_cast<AdagradParamState&>(*(optimizer.state()[c10::guts::to_string(params_1[0].unsafeGetTensorImpl())]));
  state_.step(state_.step() + 1);

  const auto& optimizer_ = Adagrad(params, options);
  optimizer_.defaults();
  // test for param_groups() with const reference return
  const auto& params_2 = optimizer_.param_groups();
  // test for state() with const reference return
  optimizer_.state();
}

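// Runs `func` while capturing warnings and asserts that exactly one
// deprecation warning containing "will be removed" was emitted.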
#define OLD_INTERFACE_WARNING_CHECK(func)       \
  {                                             \
    torch::test::WarningCapture warnings;       \
    func;                                       \
    ASSERT_EQ(                                  \
        torch::test::count_substr_occurrences(  \
            warnings.str(), "will be removed"), \
        1);                                     \
  }

struct MyOptimizerOptions : public OptimizerCloneableOptions<MyOptimizerOptions> {
  MyOptimizerOptions(double lr = 1.0) : lr_(lr) {}
  TORCH_ARG(double, lr) = 1.0;
};

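// The legacy Optimizer interface (size(), parameters(), add_parameters()) is
// deprecated; each call below should still work but emit a single removal
// warning, which OLD_INTERFACE_WARNING_CHECK verifies.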
TEST(OptimTest, OldInterface) {
  struct MyOptimizer : Optimizer {
    using Optimizer::Optimizer;
    torch::Tensor step(LossClosure closure = nullptr) override {
      return {};
    }
    explicit MyOptimizer(
        std::vector<at::Tensor> params, MyOptimizerOptions defaults = {})
        : Optimizer(
              {std::move(OptimizerParamGroup(params))},
              std::make_unique<MyOptimizerOptions>(defaults)) {}
  };
  std::vector<torch::Tensor> parameters = {
      torch::ones({2, 3}), torch::zeros({2, 3}), torch::rand({2, 3})};
  {
    MyOptimizer optimizer(parameters);
    size_t size;
    OLD_INTERFACE_WARNING_CHECK(size = optimizer.size());
    ASSERT_EQ(size, parameters.size());
  }
  {
    std::vector<at::Tensor> params;
    MyOptimizer optimizer(params);

    size_t size;
    OLD_INTERFACE_WARNING_CHECK(size = optimizer.size());
    ASSERT_EQ(size, 0);

    OLD_INTERFACE_WARNING_CHECK(optimizer.add_parameters(parameters));

    OLD_INTERFACE_WARNING_CHECK(size = optimizer.size());
    ASSERT_EQ(size, parameters.size());

    std::vector<torch::Tensor> params_;
    OLD_INTERFACE_WARNING_CHECK(params_ = optimizer.parameters());
    for (size_t p = 0; p < size; ++p) {
      ASSERT_TRUE(params_[p].allclose(parameters[p]));
    }
  }
  {
    Linear linear(3, 4);
    MyOptimizer optimizer(linear->parameters());

    size_t size;
    OLD_INTERFACE_WARNING_CHECK(size = optimizer.size());
    ASSERT_EQ(size, linear->parameters().size());
  }
}

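// Convergence tests: each optimizer must drive the XOR model's smoothed loss
// below 0.1 within 3000 epochs (see test_optimizer_xor above).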
TEST(OptimTest, XORConvergence_SGD) {
  ASSERT_TRUE(test_optimizer_xor<SGD>(
      SGDOptions(0.1).momentum(0.9).nesterov(true).weight_decay(1e-6)));
}

TEST(OptimTest, XORConvergence_LBFGS) {
  ASSERT_TRUE(test_optimizer_xor<LBFGS>(LBFGSOptions(1.0)));
  ASSERT_TRUE(test_optimizer_xor<LBFGS>(
      LBFGSOptions(1.0).line_search_fn("strong_wolfe")));
}

TEST(OptimTest, XORConvergence_Adagrad) {
  ASSERT_TRUE(test_optimizer_xor<Adagrad>(
      AdagradOptions(1.0).weight_decay(1e-6).lr_decay(1e-3)));
}

TEST(OptimTest, XORConvergence_RMSprop) {
  ASSERT_TRUE(test_optimizer_xor<RMSprop>(RMSpropOptions(0.1).centered(true)));
}

TEST(OptimTest, XORConvergence_RMSpropWithMomentum) {
  ASSERT_TRUE(test_optimizer_xor<RMSprop>(
      RMSpropOptions(0.1).momentum(0.9).weight_decay(1e-6)));
}

TEST(OptimTest, XORConvergence_Adam) {
  ASSERT_TRUE(test_optimizer_xor<Adam>(AdamOptions(0.1).weight_decay(1e-6)));
}

TEST(OptimTest, XORConvergence_AdamWithAmsgrad) {
  ASSERT_TRUE(test_optimizer_xor<Adam>(
      AdamOptions(0.1).weight_decay(1e-6).amsgrad(true)));
}

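// Exact-value tests: after every sampled step, parameters must match the
// recorded baselines within rtol=1e-3 / atol=5e-4 (see check_exact_values
// above).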
TEST(OptimTest, ProducesPyTorchValues_Adam) {
  check_exact_values<Adam>(AdamOptions(1.0), expected_parameters::Adam());
}

TEST(OptimTest, ProducesPyTorchValues_AdamWithWeightDecay) {
  check_exact_values<Adam>(
      AdamOptions(1.0).weight_decay(1e-2),
      expected_parameters::Adam_with_weight_decay());
}

TEST(OptimTest, ProducesPyTorchValues_AdamWithWeightDecayAndAMSGrad) {
  check_exact_values<Adam>(
      AdamOptions(1.0).weight_decay(1e-6).amsgrad(true),
      expected_parameters::Adam_with_weight_decay_and_amsgrad());
}

TEST(OptimTest, XORConvergence_AdamW) {
  ASSERT_TRUE(test_optimizer_xor<AdamW>(AdamWOptions(0.1)));
}

TEST(OptimTest, XORConvergence_AdamWWithAmsgrad) {
  ASSERT_TRUE(test_optimizer_xor<AdamW>(
      AdamWOptions(0.1).amsgrad(true)));
}

TEST(OptimTest, ProducesPyTorchValues_AdamW) {
  check_exact_values<AdamW>(AdamWOptions(1.0), expected_parameters::AdamW());
}

TEST(OptimTest, ProducesPyTorchValues_AdamWWithoutWeightDecay) {
  check_exact_values<AdamW>(
      AdamWOptions(1.0).weight_decay(0),
      expected_parameters::AdamW_without_weight_decay());
}

TEST(OptimTest, ProducesPyTorchValues_AdamWWithAMSGrad) {
  check_exact_values<AdamW>(
      AdamWOptions(1.0).amsgrad(true),
      expected_parameters::AdamW_with_amsgrad());
}

TEST(OptimTest, ProducesPyTorchValues_Adagrad) {
  check_exact_values<Adagrad>(
      AdagradOptions(1.0), expected_parameters::Adagrad());
}

TEST(OptimTest, ProducesPyTorchValues_AdagradWithWeightDecay) {
  check_exact_values<Adagrad>(
      AdagradOptions(1.0).weight_decay(1e-2),
      expected_parameters::Adagrad_with_weight_decay());
}

TEST(OptimTest, ProducesPyTorchValues_AdagradWithWeightDecayAndLRDecay) {
  check_exact_values<Adagrad>(
      AdagradOptions(1.0).weight_decay(1e-6).lr_decay(1e-3),
      expected_parameters::Adagrad_with_weight_decay_and_lr_decay());
}

TEST(OptimTest, ProducesPyTorchValues_RMSprop) {
  check_exact_values<RMSprop>(
      RMSpropOptions(0.1), expected_parameters::RMSprop());
}

TEST(OptimTest, ProducesPyTorchValues_RMSpropWithWeightDecay) {
  check_exact_values<RMSprop>(
      RMSpropOptions(0.1).weight_decay(1e-2),
      expected_parameters::RMSprop_with_weight_decay());
}

TEST(OptimTest, ProducesPyTorchValues_RMSpropWithWeightDecayAndCentered) {
  check_exact_values<RMSprop>(
      RMSpropOptions(0.1).weight_decay(1e-6).centered(true),
      expected_parameters::RMSprop_with_weight_decay_and_centered());
}

TEST(
    OptimTest,
    ProducesPyTorchValues_RMSpropWithWeightDecayAndCenteredAndMomentum) {
  check_exact_values<RMSprop>(
      RMSpropOptions(0.1).weight_decay(1e-6).centered(true).momentum(0.9),
      expected_parameters::
          RMSprop_with_weight_decay_and_centered_and_momentum());
}

TEST(OptimTest, ProducesPyTorchValues_SGD) {
  check_exact_values<SGD>(SGDOptions(0.1), expected_parameters::SGD());
}

TEST(OptimTest, ProducesPyTorchValues_SGDWithWeightDecay) {
  check_exact_values<SGD>(
      SGDOptions(0.1).weight_decay(1e-2),
      expected_parameters::SGD_with_weight_decay());
}

TEST(OptimTest, ProducesPyTorchValues_SGDWithWeightDecayAndMomentum) {
  check_exact_values<SGD>(
      SGDOptions(0.1).weight_decay(1e-2).momentum(0.9),
      expected_parameters::SGD_with_weight_decay_and_momentum());
}

TEST(OptimTest, ProducesPyTorchValues_SGDWithWeightDecayAndNesterovMomentum) {
  check_exact_values<SGD>(
      SGDOptions(0.1).weight_decay(1e-6).momentum(0.9).nesterov(true),
      expected_parameters::SGD_with_weight_decay_and_nesterov_momentum());
}

TEST(OptimTest, ProducesPyTorchValues_LBFGS) {
  check_exact_values<LBFGS>(
      LBFGSOptions(1.0), expected_parameters::LBFGS());
}

TEST(OptimTest, ProducesPyTorchValues_LBFGS_with_line_search) {
  check_exact_values<LBFGS>(
      LBFGSOptions(1.0).line_search_fn("strong_wolfe"),
      expected_parameters::LBFGS_with_line_search());
}

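// zero_grad() should reset existing gradients to zero without making them
// undefined.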
TEST(OptimTest, ZeroGrad) {
  torch::manual_seed(0);

  Linear model(2, 8);
  SGD optimizer(model->parameters(), 0.1);

  for (const auto& parameter : model->parameters()) {
    ASSERT_FALSE(parameter.grad().defined());
  }

  auto output = model->forward(torch::ones({5, 2}));
  auto loss = output.sum();
  loss.backward();

  for (const auto& parameter : model->parameters()) {
    ASSERT_TRUE(parameter.grad().defined());
    ASSERT_GT(parameter.grad().sum().item<float>(), 0);
  }

  optimizer.zero_grad();

  for (const auto& parameter : model->parameters()) {
    ASSERT_TRUE(parameter.grad().defined());
    ASSERT_EQ(parameter.grad().sum().item<float>(), 0);
  }
}

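// With unit gradients and lr = 1.0, a single SGD step should subtract exactly
// 1.0 from every element of the externally owned tensors.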
TEST(OptimTest, ExternalVectorOfParameters) {
  torch::manual_seed(0);

  std::vector<torch::Tensor> parameters = {
      torch::randn({2, 2}), torch::randn({3, 3}), torch::randn({4, 4})};
  std::vector<torch::Tensor> original_parameters = {
      parameters[0].clone(), parameters[1].clone(), parameters[2].clone()};

  // Set all gradients to one
  for (auto& parameter : parameters) {
    parameter.grad() = torch::ones_like(parameter);
  }

  SGD optimizer(parameters, 1.0);

  optimizer.step();

  ASSERT_TRUE(parameters[0].allclose(original_parameters[0] - 1.0));
  ASSERT_TRUE(parameters[1].allclose(original_parameters[1] - 1.0));
  ASSERT_TRUE(parameters[2].allclose(original_parameters[2] - 1.0));
}

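// Regression test: adding parameters to an initially empty LBFGS via the
// deprecated add_parameters() and then taking a step should not throw.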
TEST(OptimTest, AddParameter_LBFGS) {
  torch::manual_seed(0);

  std::vector<torch::Tensor> parameters = {torch::randn({5, 5})};
  std::vector<torch::Tensor> original_parameters = {parameters[0].clone()};

  // Set all gradients to one
  for (auto& parameter : parameters) {
    parameter.grad() = torch::ones_like(parameter);
  }

  LBFGS optimizer(std::vector<torch::Tensor>{}, 1.0);
  OLD_INTERFACE_WARNING_CHECK(optimizer.add_parameters(parameters));

  optimizer.step([]() { return torch::tensor(1); });

  // REQUIRE this doesn't throw
}