Add validator for optimizers when parameters are shared
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/18497

Reviewed By: kennyhorror

Differential Revision: D14614738

fbshipit-source-id: beddd8349827dcc8ccae36f21e5d29627056afcd

Committed by: Facebook Github Bot
Commit: c48e1679f9
Parent: 2787f1d8ed
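The diff below adds a consistency check for parameters that are shared across name scopes: once a parameter blob has been registered with one optimizer, registering it again with a conflicting optimizer now fails fast instead of silently producing ambiguous training behavior. A minimal sketch of the situation the validator catches, modeled on the test added in this commit (`model` stands for a `LayerModelHelper` such as `self.model` in the test; this snippet is an illustration, not code from the commit):

    from caffe2.python import scope
    from caffe2.python.modeling.parameter_sharing import ParameterSharing
    from caffe2.python.optimizer import AdagradOptimizer, AdamOptimizer

    # 'model' is assumed to be a LayerModelHelper instance, as in the test below.
    with scope.NameScope('global_scope'):
        with ParameterSharing({'scope_1': 'scope_0'}):
            with scope.NameScope('scope_0'):
                # first use of the shared weight: registered with Adagrad
                fc1 = model.FC(model.input_feature_schema.float_features, 2,
                               weight_optim=AdagradOptimizer(alpha=0.004, epsilon=0.02))
            with scope.NameScope('scope_1'):
                # same underlying weight blob, but a different optimizer type;
                # with this commit, building this layer raises an assertion error
                fc2 = model.FC(model.input_feature_schema.float_features, 2,
                               weight_optim=AdamOptimizer())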
@@ -14,7 +14,7 @@ from caffe2.python.modeling.parameter_sharing import (
 )
 from caffe2.python.modeling.net_modifier import NetModifier

-from caffe2.python.optimizer import get_param_device
+from caffe2.python.optimizer import get_param_device, Optimizer
 from caffe2.python.regularizer import Regularizer, RegularizationBy
 from caffe2.python.layers import layers
 from caffe2.proto import caffe2_pb2
@@ -228,6 +228,66 @@ class LayerModelHelper(model_helper.ModelHelper):
                     scope.CurrentNameScope(), param_name, ref_shape, shape)
             )

+    def _validate_param_optim(self, param_name, optim):
+        # there are three possible values for optim:
+        # 1) None (which will use self._default_optimizer after this layer is instantiated)
+        # 2) self.NoOptim
+        # 3) an instance of Optimizer class such as AdagradOptimizer
+
+        # this implies this parameter is not shared with any other parameter so far
+        if param_name not in self.param_to_optim:
+            return
+
+        logger.info("{} shares the same parameter with another parameter. "
+                    "Validating if the same optimizer has been specified for them.".format(
+                        param_name,
+                    ))
+
+        ref_optim = self.param_to_optim[param_name]
+
+        if optim is None:
+            assert ref_optim == self._default_optimizer, (
+                "Optim for {} is None which will fall back to use default_optimizer. "
+                "However, the optimizer that has been specified for this shared parameter "
+                "is {} which is different from default_optimizer {}. "
+                "Please check the optimizers specified for parameters shared "
+                "with {} and the default_optimizer to ensure the consistency.".format(
+                    param_name, ref_optim, self._default_optimizer, param_name
+                )
+            )
+        elif optim == self.NoOptim:
+            assert ref_optim == self.NoOptim, (
+                "Optim for {} is NoOptim. However, the optimizer for the parameters "
+                "shared with {} is {} which is different from NoOptim. "
+                "Please check the optimizer specified for other parameters in the "
+                "shared group to ensure consistency.".format(
+                    param_name, param_name, ref_optim
+                )
+            )
+        elif isinstance(optim, Optimizer):
+            assert isinstance(ref_optim, Optimizer), (
+                "Optim for {} is an instance of Optimizer. However, the optimizer "
+                "for the parameters shared with {} is {} which is not an instance "
+                "of Optimizer. Please check the optimizer specified for other "
+                " parameters in the shared group to ensure consistency.".format(
+                    param_name, param_name, ref_optim, optim
+                )
+            )
+
+            assert type(optim) is type(ref_optim) and optim.attributes == ref_optim.attributes, (
+                "Optim for {} is an instance of Optimizer. However, the optimizer "
+                "for the parameters shared with {} is {}. "
+                "This optimizer either doesn't have the same type as the current optimizer: "
+                "{} vs {}, or its attributes such as learning rate are different from "
+                "that of current optimizer which is {} vs {}. "
+                "Please check the optimizer specified for other parameters in the "
+                "shared group to ensure consistency.".format(
+                    param_name, param_name, ref_optim, type(optim), type(ref_optim), optim.attributes, ref_optim.attributes
+                )
+            )
+        else:
+            raise ValueError("optim should be either None, NoOptim, or an instance of Optimizer, Got {} ".format(optim))
+
     def create_param(self, param_name, shape, initializer, optimizer=None,
                      ps_param=None, regularizer=None):
         if isinstance(param_name, core.BlobReference):
@@ -270,6 +330,8 @@ class LayerModelHelper(model_helper.ModelHelper):

         self._validate_param_shape(param_name, shape)

+        self._validate_param_optim(param_name, optimizer)
+
         self._param_to_shape[param_name] = shape

         return param
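A reading aid for the hunk above (not part of the commit): when `create_param` sees a parameter name that is already recorded in `self.param_to_optim`, the new `_validate_param_optim` hook requires the incoming `optim` to be consistent with what was recorded. The same decision tree, written as a standalone, hypothetical helper over plain objects so the rules are visible without the Caffe2 plumbing:

    # Hypothetical sketch of the consistency rules enforced above; `recorded` is the
    # optimizer already stored for the shared blob, `new` is the one being passed now.
    def check_shared_optim(recorded, new, default, no_optim):
        if new is None:
            # None falls back to the default optimizer, so the shared blob must use it too
            assert recorded == default, "shared param does not use the default optimizer"
        elif new == no_optim:
            assert recorded == no_optim, "shared param was not registered with NoOptim"
        else:
            # an optimizer instance: same concrete type and same init-time attributes
            assert type(new) is type(recorded) and new.attributes == recorded.attributes, \
                "shared param was registered with a differently configured optimizer"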
@@ -7,6 +7,7 @@ from caffe2.python import core, scope
 from caffe2.python.modeling.parameter_sharing import (
     ParameterSharing,
 )
+from caffe2.python.optimizer import AdagradOptimizer, AdamOptimizer
 from caffe2.python.layer_test_util import LayersTestCase
 import six

@@ -149,3 +150,84 @@ class ParameterSharingTest(LayersTestCase):
             sorted(op_outputs),
             ['global_scope/shared_fc/b', 'global_scope/shared_fc/w']
         )
+
+    def test_layer_shared_parameter_optim_validator(self):
+        """
+        This test is to cover the _validate_param_optim function in
+        layer_model_helper class.
+        """
+
+        output_dims = 2
+
+        adagrad_optim = AdagradOptimizer(
+            alpha=0.004,
+            epsilon=0.02,
+        )
+
+        self.model.default_optimizer = adagrad_optim
+
+        # the following covers the branch -- optim is None
+        with scope.NameScope('global_scope_0'):
+            with ParameterSharing({'scope_1': 'scope_0'}):
+                with scope.NameScope('scope_0'):
+                    fc1_output = self.model.FC(
+                        self.model.input_feature_schema.float_features,
+                        output_dims,
+                        weight_optim=self.model.NoOptim,
+                    )
+
+                with scope.NameScope('scope_1'), self.assertRaises(Exception):
+                    fc2_output = self.model.FC(
+                        self.model.input_feature_schema.float_features,
+                        output_dims
+                    )
+
+        # the following covers the branch -- optim is NoOptim
+        with scope.NameScope('global_scope_1'):
+            with ParameterSharing({'scope_1': 'scope_0'}):
+                with scope.NameScope('scope_0'):
+                    fc1_output = self.model.FC(
+                        self.model.input_feature_schema.float_features,
+                        output_dims,
+                        weight_optim=None,
+                    )
+
+                with scope.NameScope('scope_1'), self.assertRaises(Exception):
+                    fc2_output = self.model.FC(
+                        self.model.input_feature_schema.float_features,
+                        output_dims,
+                        weight_optim=self.model.NoOptim,
+                    )
+
+        # the following covers the branch -- optim is an instance of Optimizer
+        adagrad_optim_2 = AdagradOptimizer(
+            alpha=0.005,
+            epsilon=0.02,
+        )
+
+        adam_optim = AdamOptimizer()
+
+        self.model.default_optimizer = adagrad_optim_2
+
+        with scope.NameScope('global_scope_2'):
+            with ParameterSharing({'scope_1': 'scope_0', 'scope_2': 'scope_0'}):
+                with scope.NameScope('scope_0'):
+                    fc1_output = self.model.FC(
+                        self.model.input_feature_schema.float_features,
+                        output_dims,
+                        weight_optim=None,  # it will use adagrad_optim_2
+                    )
+
+                with scope.NameScope('scope_1'), self.assertRaises(Exception):
+                    fc2_output = self.model.FC(
+                        self.model.input_feature_schema.float_features,
+                        output_dims,
+                        weight_optim=adagrad_optim,
+                    )
+
+                with scope.NameScope('scope_2'), self.assertRaises(Exception):
+                    fc2_output = self.model.FC(
+                        self.model.input_feature_schema.float_features,
+                        output_dims,
+                        weight_optim=adam_optim,
+                    )
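A note on why each `assertRaises` in the test above fires (an interpretation of the diff, not text from the commit):

    # global_scope_0: scope_0 registers the shared weight with NoOptim; scope_1 then passes
    #                 optim=None, which must match default_optimizer (adagrad_optim) -> mismatch.
    # global_scope_1: scope_0 registers the weight with the default optimizer (optim=None);
    #                 scope_1 then passes NoOptim -> mismatch.
    # global_scope_2: scope_0 registers the weight with adagrad_optim_2 (via optim=None);
    #                 scope_1 passes adagrad_optim, same type but different alpha so .attributes
    #                 differ; scope_2 passes adam_optim, a different Optimizer type -> both fail.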
@@ -9,6 +9,7 @@ from collections import namedtuple, defaultdict
 from past.builtins import basestring

 import logging
+import copy

 import numpy as np

@@ -72,6 +73,13 @@ class Optimizer(object):
             classname, self._instance_num, base_str, node_name, gpu_id,
         )

+    @property
+    def attributes(self):
+        # return a dict that contains attributes related to init args only
+        attr = copy.deepcopy(self.__dict__)
+        del attr['_instance_num']
+        return attr
+
     def make_unique_blob_name(self, base_str):
         """
         Returns a blob name that will be unique to the current device
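The `attributes` property above lets two independently constructed optimizers be compared by configuration rather than by object identity, which is what the shared-parameter validator relies on. A minimal sketch of that comparison on freshly constructed optimizers (an illustration, not code from the commit):

    from caffe2.python.optimizer import AdagradOptimizer

    a = AdagradOptimizer(alpha=0.004, epsilon=0.02)
    b = AdagradOptimizer(alpha=0.004, epsilon=0.02)
    c = AdagradOptimizer(alpha=0.005, epsilon=0.02)

    # a and b are distinct objects, but their init-time configuration matches;
    # _instance_num differs per instance, which is why attributes removes it.
    assert type(a) is type(b) and a.attributes == b.attributes
    assert a.attributes != c.attributes  # alpha differs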