## @package gradient_checker
# Module caffe2.python.gradient_checker

import os

import numpy as np

from caffe2.python import core, workspace, net_drawer
from caffe2.proto import caffe2_pb2


def getGradientForOp(op):
    return core.GradientRegistry.GetGradientForOp(
        op, [s + '_grad' for s in op.output])


def _get_grad_blob(grad_map, input_to_check):
    grad_blob = grad_map[input_to_check]

    if isinstance(grad_blob, core.BlobReference):
        return workspace.blobs[grad_blob]

    # If grad_blob is not a single blob, it should be a gradient slice.
    # To make it comparable with the estimated gradient, which is dense,
    # we need to first convert grad_blob to a dense gradient.
    assert isinstance(grad_blob, core.GradientSlice)
    dense_grad = 'tmp_dense_grad'
    sparse_to_dense_op = core.CreateOperator(
        'SparseToDense',
        [grad_blob.indices, grad_blob.values, input_to_check],
        dense_grad,
    )
    workspace.RunOperatorOnce(sparse_to_dense_op)
    return workspace.blobs[dense_grad]


def _get_grad(net, outputs, outputs_with_grad, input_values, inputs_with_grads):
    grad_net = net.Clone(net.Name() + "_copy")
    grad_map = grad_net.AddGradientOperators(outputs_with_grad)

    for name, value in (input_values or {}).items():
        workspace.blobs[name] = value

    for input_to_check in inputs_with_grads:
        assert input_to_check in grad_map, (
            '{} has no gradient, cannot check net gradient.'.format(
                input_to_check))
        assert str(input_to_check) in workspace.blobs

    workspace.RunNetOnce(grad_net)
    forward_results = [(output, workspace.blobs[output]) for output in outputs]
    grads = {input_to_check: _get_grad_blob(grad_map, input_to_check)
             for input_to_check in inputs_with_grads}

    return forward_results, grads, grad_net


def _assert_close(value1, value2, threshold, err_msg=''):
    np.testing.assert_allclose(
        value1, value2,
        atol=threshold, rtol=threshold,
        err_msg=err_msg,
    )

    delta = np.abs(value1 - value2).flatten()
    return np.mean(delta), max(delta)


class NetGradientChecker(object):
    @staticmethod
    def CompareNets(nets, outputs, outputs_with_grad_ids,
                    inputs_with_grads, input_values=None,
                    threshold=0.0000001, print_net_images=False):
        def _get_output_with_grad_names(net_outputs):
            return [net_outputs[i] for i in outputs_with_grad_ids]

        if print_net_images:
            for i, net in enumerate(nets):
                png = net_drawer.GetPydotGraph(net).create_png()
                with open("caffe2_net_forward_" + str(i) + net.Name() + ".png",
                          'wb') as f:
                    f.write(png)

        results = [
            _get_grad(net, net_outputs,
                      _get_output_with_grad_names(net_outputs),
                      input_values, inputs_with_grads)
            for net, net_outputs in zip(nets, outputs)
        ]

        if print_net_images:
            _, _, backward_nets = zip(*results)
            for i, net in enumerate(backward_nets):
                png = net_drawer.GetPydotGraph(net).create_png()
                with open("caffe2_net_" + str(i) + net.Name() + ".png",
                          'wb') as f:
                    f.write(png)

        first_net_results, first_net_grads, _ = results[0]
        for net_results, net_grads, _ in results[1:]:
            assert len(net_results) == len(first_net_results)
            for idx, ((blob1, blob_value1), (blob2, blob_value2)) in enumerate(
                    zip(first_net_results, net_results)):
                _assert_close(
                    blob_value1, blob_value2, threshold,
                    err_msg="Different forward pass results for output id {}. "
" "Corresponding output blobs: {} and {}".format( idx, blob1, blob2)) assert net_grads.keys() == first_net_grads.keys() for blob, blob_grad_value in net_grads.items(): _assert_close( first_net_grads[blob], blob_grad_value, threshold, err_msg="Different gradients for input {}".format(blob)) @staticmethod def Check(net, outputs_with_grad, input_values, input_to_check, step_size=0.0001, threshold=0.05, print_net=True): net_results, net_grads, full_net = _get_grad( net, [], outputs_with_grad, input_values, [input_to_check]) analytic_grad = net_grads[input_to_check] def GetLoss(new_value): workspace.blobs[input_to_check] = new_value workspace.RunNetOnce(full_net) return sum([ workspace.blobs[output] for output in outputs_with_grad ]).sum() def GetValue(dim, delta): input_value = input_values[input_to_check].copy() input_value.flat[dim] += delta return input_value grad_estimate = np.zeros_like(input_values[input_to_check]) for dim in range(input_values[input_to_check].size): pos_loss = GetLoss(GetValue(dim, step_size)) neg_loss = GetLoss(GetValue(dim, -step_size)) grad_estimate.flat[dim] = (pos_loss - neg_loss) / step_size / 2 err_msg = "Error in gradient check for net_copy {}".format( net.Name()) if print_net: err_msg += ": {}".format(net.Proto()) return _assert_close(analytic_grad, grad_estimate, threshold, err_msg) class GradientChecker: """A gradient checker in Python. This is not the most efficient way to check gradients, as the Python interface will involve a lot of copies back and forth operations. Use at your own risk. """ def __init__( self, stepsize, threshold, device_option=None, workspace_name="gradient_check", input_device_options=None, ): self._stepsize = stepsize self._threshold = threshold self._device_option = device_option or caffe2_pb2.DeviceOption() self._workspace_name = workspace_name if input_device_options is None: self._input_device_options = {} else: self._input_device_options = input_device_options def GetLossAndGrad( self, op, grad_ops, inputs, input_names, input_to_check, grad_name, outputs_with_grads ): for i in range(len(inputs)): workspace.FeedBlob(input_names[i], inputs[i], self._input_device_options.get( input_names[i], self._device_option)) x = inputs[input_to_check] # Run. workspace.RunOperatorOnce(op) loss = 0. # Get Loss and feed in the gradients, run gradient ops. for idx in outputs_with_grads: name = op.output[idx] arr = workspace.FetchBlob(name) loss += (arr**2).sum() workspace.FeedBlob(name + '_grad', arr, self._device_option) loss /= 2. 
        # Run gradient ops.
        workspace.RunOperatorsOnce(grad_ops)
        # Get gradients.
        if isinstance(grad_name, core.GradientSlice):
            workspace.FeedBlob('zeros', np.zeros_like(x, dtype=np.float32))
            workspace.FeedBlob('ones', np.ones(1, dtype=np.float32))
            gv_cpu_op = core.CreateOperator(
                'EnsureCPUOutput',
                grad_name.values, grad_name.values + '_cpu',
                device_option=self._device_option
            )
            gi_cpu_op = core.CreateOperator(
                'EnsureCPUOutput',
                grad_name.indices, grad_name.indices + '_cpu',
                device_option=self._device_option
            )
            sparse_to_dense_op = core.CreateOperator(
                'ScatterWeightedSum',
                [
                    'zeros', 'ones', grad_name.indices + '_cpu',
                    grad_name.values + '_cpu', 'ones'
                ],
                'zeros',
            )
            workspace.RunOperatorOnce(gv_cpu_op)
            workspace.RunOperatorOnce(gi_cpu_op)
            workspace.RunOperatorOnce(sparse_to_dense_op)
            grad = workspace.FetchBlob('zeros')
        else:
            grad = workspace.FetchBlob(grad_name)
        return loss, grad

    def CheckSimple(
        self,
        op,
        inputs,
        input_to_check,
        outputs_with_grads,
        grad_ops=None,
        input_device_options=None,
        ensure_outputs_are_inferred=False,
    ):
        """Checks the operator in a very simple fashion by stacking a sum
        of squares on the top.

        Inputs:
          op: the operator to be checked.
          inputs: the input data as numpy arrays.
          input_to_check: an index specifying which input blob we should
              check.
          outputs_with_grads: indices specifying which output blobs we need
              to check gradients for. For these outputs, we will collect a
              squared sum and also feed in their gradients.
          grad_ops: the gradient operators. If not given, we will get them
              from the gradient registry.
          input_device_options: an optional mapping from input names to
              DeviceOptions (to override the default DeviceOption).
          ensure_outputs_are_inferred: if set, asserts that the gradient
              output shapes match the inferred shapes.
        Outputs:
          boolean: True if it passes, False if it does not pass.
        """
        # Entering the checker workspace.
        old_ws_name = workspace.CurrentWorkspace()
        if self._workspace_name != old_ws_name:
            workspace.SwitchWorkspace(self._workspace_name, True)

        op.device_option.CopyFrom(self._device_option)
        if grad_ops is None:
            # TODO(jiayq): use the gradient registration instead of the old
            # hack.
            grad_ops, g_input = getGradientForOp(op)

        _input_device_options = input_device_options or \
            core.InferOpBlobDevicesAsDict(op)[0]
        # First, feed in the input.
        for i, arr in enumerate(inputs):
            workspace.FeedBlob(
                op.input[i], arr,
                _input_device_options.get(op.input[i], self._device_option))

        # Get the loss and gradient for the original.
        grad_name = g_input[input_to_check]
        loss, grad = self.GetLossAndGrad(
            op, grad_ops, inputs, op.input, input_to_check, grad_name,
            outputs_with_grads,
        )

        grad_estimate = np.zeros_like(inputs[input_to_check])
        if grad_estimate.shape != grad.shape:
            raise Exception(
                "Mismatched gradient shapes: estimated ({}), grad ({})".format(
                    grad_estimate.shape, grad.shape))

        if ensure_outputs_are_inferred:
            self._assertInferTensorChecks(op, grad_ops)

        full_grad_check = os.getenv('CAFFE2_FULL_GRAD_CHECK') == '1'
        dims_to_check = inputs[input_to_check].size
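        # The loop below forms a central-difference estimate of the gradient:
        # each checked element x_i is perturbed by +/- stepsize, and dL/dx_i
        # is approximated by
        # (L(x_i + stepsize) - L(x_i - stepsize)) / (2 * stepsize).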
        for current_dim in range(dims_to_check):
            # Grad check is very expensive (as it involves running the op from
            # scratch for each of the input tensor elements). Thus, by default
            # we run it only on a small subset of dimensions. Here we apply a
            # very scientific approach: the first and the last 3 elements of
            # each tensor. Pass the CAFFE2_FULL_GRAD_CHECK=1 env var to enable
            # the full check.
            if not full_grad_check and current_dim >= 3 and \
                    current_dim + 3 < dims_to_check:
                grad_estimate.flat[current_dim] = grad.flat[current_dim]
                continue
            # Positive perturbation.
            inputs[input_to_check].flat[current_dim] += self._stepsize
            pos_loss, _ = self.GetLossAndGrad(
                op, grad_ops, inputs, op.input, input_to_check, grad_name,
                outputs_with_grads
            )
            # Negative perturbation.
            inputs[input_to_check].flat[current_dim] -= self._stepsize * 2
            neg_loss, _ = self.GetLossAndGrad(
                op, grad_ops, inputs, op.input, input_to_check, grad_name,
                outputs_with_grads
            )
            # Recover the original value.
            inputs[input_to_check].flat[current_dim] += self._stepsize
            grad_estimate.flat[current_dim] = (
                pos_loss - neg_loss) / self._stepsize / 2

        # Now, check correctness.
        fail_mat = ~np.isclose(
            grad, grad_estimate, atol=self._threshold, rtol=self._threshold)
        if np.any(fail_mat):
            idx = np.flatnonzero(fail_mat)
            print('Failed. [idx, grad, grad_estimate] are:')
            print(np.vstack([idx, grad.flat[idx], grad_estimate.flat[idx]]).T)
            ret = False
        else:
            ret = True

        # After finishing, clean things up.
        if self._workspace_name != old_ws_name:
            # We reset the workspace to make sure everything intermediate is
            # cleaned up. Note that there is no need to delete a workspace -
            # when empty it takes a very limited amount of memory.
            workspace.ResetWorkspace()
            workspace.SwitchWorkspace(old_ws_name)
        return ret, grad, grad_estimate

    def _assertInferTensorChecks(self, op, grad_ops):
        tmp_net = caffe2_pb2.NetDef()
        tmp_net.op.extend([op])
        tmp_net.op.extend(grad_ops)
        inferred_shapes, inferred_types = workspace.InferShapesAndTypes(
            [tmp_net],
            nets_proto=True,
        )

        outputs = set()
        for grad_op in grad_ops:
            outputs.update(grad_op.output)

        for output in outputs:
            if output not in inferred_shapes:
                raise Exception(
                    "expected output {} to be inferred".format(output))
            blob = workspace.FetchBlob(output)
            correct_shape = list(blob.shape)
            inferred_shape = list(inferred_shapes[output])
            if correct_shape != inferred_shape:
                raise Exception(
                    "Mismatched inferred shape: want({}), got({})".format(
                        correct_shape, inferred_shape))

            if type(blob) is np.ndarray:
                if blob.dtype == np.dtype('float64'):
                    correct_type = caffe2_pb2.TensorProto.DOUBLE
                elif blob.dtype == np.dtype('float32'):
                    correct_type = caffe2_pb2.TensorProto.FLOAT
                elif blob.dtype == np.dtype('int32'):
                    correct_type = caffe2_pb2.TensorProto.INT32
                elif blob.dtype == np.dtype('int64'):
                    correct_type = caffe2_pb2.TensorProto.INT64
                else:
                    correct_type = "unknown {}".format(blob.dtype)
            else:
                correct_type = str(type(blob))

            inferred_type = inferred_types[output]
            if correct_type != inferred_type:
                raise Exception(
                    "Mismatched inferred type: want({}), got({})".format(
                        correct_type, inferred_type))
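

# The block below is a minimal usage sketch, not part of the checker API:
# the 'Tanh' operator and the stepsize/threshold values are illustrative
# assumptions. NetGradientChecker.Check can be used analogously on a whole
# core.Net instead of a single operator.
if __name__ == "__main__":
    # Build a simple operator and a random input to check.
    example_op = core.CreateOperator('Tanh', ['X'], ['Y'])
    X = np.random.rand(4, 5).astype(np.float32)

    # Compare the registered analytic gradient against the numerical
    # central-difference estimate.
    checker = GradientChecker(stepsize=0.005, threshold=0.005)
    passed, analytic_grad, numeric_grad = checker.CheckSimple(
        example_op, [X], input_to_check=0, outputs_with_grads=[0])
    print('Gradient check passed:', passed)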