Enable junk fill for the default CPU allocator (#13377)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/13377

* Enable junk fill for the default CPU allocator. This first diff only enables it for the tests; a second diff will change the zero-fill default to false.
* Fix tests to use the 64-bit counters that IterOp and LearningRateOp demand.
* Fix kernels that use uninitialized memory.

Reviewed By: salexspb

Differential Revision: D10866512

fbshipit-source-id: 17860e77e63a203edf46d0da0335608f77884821
Author: Xiaoqiang Zheng
Date: 2018-11-08 00:00:39 -08:00
Committed by: Facebook Github Bot
Parent: 21991c05a9
Commit: de41d1ae0b
11 changed files with 65 additions and 9 deletions

View File

@@ -13,8 +13,30 @@ C10_DEFINE_bool(
     true,
     "If set, do memory zerofilling when allocating on CPU");
 
+C10_DEFINE_bool(
+    caffe2_cpu_allocator_do_junk_fill,
+    false,
+    "If set, fill memory with deterministic junk when allocating on CPU");
+
 namespace caffe2 {
 
+void memset_junk(void* data, size_t num) {
+  // This garbage pattern is NaN when interpreted as floating point values,
+  // or as very large integer values.
+  static constexpr int32_t kJunkPattern = 0x7fedbeef;
+  static constexpr int64_t kJunkPattern64 =
+      static_cast<int64_t>(kJunkPattern) << 32 | kJunkPattern;
+  int32_t int64_count = num / sizeof(kJunkPattern64);
+  int32_t remaining_bytes = num % sizeof(kJunkPattern64);
+  int64_t* data_i64 = reinterpret_cast<int64_t*>(data);
+  for (int i = 0; i < int64_count; i++) {
+    data_i64[i] = kJunkPattern64;
+  }
+  if (remaining_bytes > 0) {
+    memcpy(data_i64 + int64_count, &kJunkPattern64, remaining_bytes);
+  }
+}
+
 void NoDelete(void*) {}
 
 at::Allocator* GetCPUAllocator() {
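
A minimal standalone check (not part of the diff) of the two properties the comment above claims: the 32-bit pattern reads as NaN when its bits are viewed as a float, and the 64-bit pattern is an enormous value when viewed as an integer.

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

int main() {
  constexpr int32_t kJunkPattern = 0x7fedbeef;
  constexpr int64_t kJunkPattern64 =
      static_cast<int64_t>(kJunkPattern) << 32 | kJunkPattern;
  float f;
  std::memcpy(&f, &kJunkPattern, sizeof(f));  // reinterpret the bit pattern
  assert(std::isnan(f));                      // exponent bits are all ones
  assert(kJunkPattern64 > INT64_C(1) << 62);  // enormous as an integer
  return 0;
}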

View File

@@ -10,6 +10,7 @@
 C10_DECLARE_bool(caffe2_report_cpu_memory_usage);
 C10_DECLARE_bool(caffe2_cpu_allocator_do_zero_fill);
+C10_DECLARE_bool(caffe2_cpu_allocator_do_junk_fill);
 
 namespace caffe2 {
@@ -43,6 +44,11 @@ class CAFFE2_API MemoryAllocationReporter {
   size_t allocated_;
 };
 
+// Fill the data memory region of num bytes with a particular garbage pattern.
+// The garbage value is chosen to be NaN if interpreted as a floating point
+// value, or as a very large integer.
+CAFFE2_API void memset_junk(void* data, size_t num);
+
 struct CAFFE2_API DefaultCPUAllocator final : at::Allocator {
   DefaultCPUAllocator() {}
   ~DefaultCPUAllocator() override {}
@@ -58,8 +64,14 @@ struct CAFFE2_API DefaultCPUAllocator final : at::Allocator {
     CAFFE_ENFORCE(data);
     // move data to a thread's NUMA node
     NUMAMove(data, nbytes, GetCurrentNUMANode());
+    CHECK(
+        !FLAGS_caffe2_cpu_allocator_do_zero_fill ||
+        !FLAGS_caffe2_cpu_allocator_do_junk_fill)
+        << "Cannot request both zero-fill and junk-fill at the same time";
     if (FLAGS_caffe2_cpu_allocator_do_zero_fill) {
       memset(data, 0, nbytes);
+    } else if (FLAGS_caffe2_cpu_allocator_do_junk_fill) {
+      memset_junk(data, nbytes);
     }
     if (FLAGS_caffe2_report_cpu_memory_usage) {
       reporter_.New(data, nbytes);
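
A hedged sketch of how a C++ caller could opt into the new mode. The flag symbols and GetCPUAllocator() come from this diff; the allocate()/DataPtr usage is the usual at::Allocator interface, and the surrounding glue is illustrative only. Setting the FLAGS_ globals directly mirrors what the test harness later in this commit does via GlobalInit arguments.

#include <cassert>
#include <cmath>
#include "caffe2/core/allocator.h"

void check_junk_fill() {
  // The CHECK above forbids enabling both fills at once.
  FLAGS_caffe2_cpu_allocator_do_zero_fill = false;
  FLAGS_caffe2_cpu_allocator_do_junk_fill = true;
  at::DataPtr ptr = caffe2::GetCPUAllocator()->allocate(16 * sizeof(float));
  const float* data = static_cast<const float*>(ptr.get());
  assert(std::isnan(data[0]));  // fresh memory now reads as NaN
}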

View File

@@ -92,13 +92,23 @@ bool InstanceNormGradientOp<T, Context>::RunOnDeviceWithOrderNHWC() {
     // for each channel
     // dl/dbias = sum_j dl/dy_j
-    bias_grad_arr += output_grad_mat.rowwise().sum();
+    auto bias_grad_delta = output_grad_mat.rowwise().sum();
+    if (n == 0) {
+      bias_grad_arr = bias_grad_delta;
+    } else {
+      bias_grad_arr += bias_grad_delta;
+    }
 
     // for each channel
     // dl/dscale = sum_j dl/dy_j (x_j - mu) / stdev
-    scale_grad_arr +=
+    auto scale_grad_delta =
         ((input_grad_mat.colwise() * inv_stdev_arr) * output_grad_mat)
             .rowwise()
             .sum();
+    if (n == 0) {
+      scale_grad_arr = scale_grad_delta;
+    } else {
+      scale_grad_arr += scale_grad_delta;
+    }
 
     // dl/dx_j = this gross thing
     // Derived gradient and manually massaged it to minimize extra storage
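
The fix pattern here generalizes: once allocations may contain junk, an accumulator must be assigned on the first pass rather than accumulated into. A minimal sketch of the idiom (hypothetical names, not from the diff):

#include <vector>

void accumulate(std::vector<float>& acc, const std::vector<float>& delta, int n) {
  for (size_t i = 0; i < acc.size(); ++i) {
    if (n == 0) {
      acc[i] = delta[i];   // first sample overwrites whatever junk is there
    } else {
      acc[i] += delta[i];  // later samples accumulate as before
    }
  }
}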

View File

@@ -205,8 +205,8 @@ bool PadEmptySamplesOp<CPUContext>::RunOnDevice() {
     // copy data and add padding index as zero
     Tensor zero{CPU};
     zero.Resize(block_size);
-    auto zeroPtr =
-        static_cast<const char*>(zero.raw_mutable_data(features.dtype()));
+    auto zeroPtr = static_cast<char*>(zero.raw_mutable_data(features.dtype()));
+    memset(zeroPtr, 0, zero.nbytes());
     int start_dest = 0;
     int start_src = 0;
     for (int i = 0; i < lengths.numel(); ++i) {

View File

@@ -26,10 +26,15 @@ class GatherPaddingOp final : public Operator<Context> {
   bool RunOnDevice() override {
     if (startPaddingWidth_ == 0 && endPaddingWidth_ == 0) {
       Output(0)->Resize(std::vector<int64_t>(0));
-      Output(0)->template mutable_data<int64_t>();
+      auto output_0_data = Output(0)->template mutable_data<int64_t>();
+      // TODO(zhengxq): as suggested by salex@, change this to a loop.
+      math::Set<int64_t, Context>(
+          Output(0)->numel(), 0, output_0_data, &context_);
       if (OutputSize() == 2) {
         Output(1)->Resize(std::vector<int64_t>(0));
-        Output(1)->template mutable_data<int64_t>();
+        auto output_1_data = Output(1)->template mutable_data<int64_t>();
+        math::Set<int64_t, Context>(
+            Output(1)->numel(), 0, output_1_data, &context_);
       }
       return true;
     }

View File

@@ -252,6 +252,7 @@ class SparseToDenseMaskGradientOp : public SparseToDenseMaskBase<Context> {
     char* output_data =
         static_cast<char*>(output->raw_mutable_data(gradient_output.dtype()));
-    memset(output_data, 0, output->nbytes());
+    math::Set<char, Context>(
+        default_length * gradient_output.itemsize(), 0, output_data, &context_);
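
math::Set is the device-generic counterpart of memset here: it writes a typed fill value through the Context, so the same operator code works beyond CPU. A minimal sketch under that assumption, with the signature as used in the hunk above:

#include "caffe2/core/context.h"
#include "caffe2/utils/math.h"

// Zero out nbytes of freshly allocated output, as the hunk above does for
// the default segment only; equivalent to memset(data, 0, nbytes) on CPU.
void zero_output(char* data, size_t nbytes, caffe2::CPUContext* context) {
  caffe2::math::Set<char, caffe2::CPUContext>(nbytes, 0, data, context);
}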

View File

@@ -81,7 +81,8 @@ bool fuseConvBNHelper(repr::NNModule* nn, caffe2::Workspace* ws) {
       // Get output channel
       size_t c = filterTensor->dim32(0);
       tensor->Resize(c);
-      tensor->mutable_data<float>();
+      float* tensor_data = tensor->mutable_data<float>();
+      memset(tensor_data, 0, tensor->nbytes());
       break;
     }
     convOrder++;

View File

@@ -458,7 +458,7 @@ class TestIf(test_util.TestCase):
         init_net = init_nb.get()[0]
         ITER = init_net.ConstantFill(
-            [], "ITER", shape=[1], value=0, dtype=core.DataType.INT32)
+            [], "ITER", shape=[1], value=0, dtype=core.DataType.INT64)
         train_net.Iter(ITER, ITER)
         LR = train_net.LearningRate(ITER, "LR", base_lr=-0.1,
                                     policy="step", stepsize=20, gamma=0.9)

View File

@@ -48,6 +48,8 @@ class TestCase(unittest.TestCase):
         workspace.GlobalInit([
             'caffe2',
             '--caffe2_log_level=0',
+            '--caffe2_cpu_allocator_do_zero_fill=0',
+            '--caffe2_cpu_allocator_do_junk_fill=1',
         ])
         # clear the default engine settings to separate out their
         # effect from the ops tests

View File

@@ -24,7 +24,7 @@ class TestToyRegression(test_util.TestCase):
         LR = init_net.ConstantFill([], "LR", shape=[1], value=-0.1)
         ONE = init_net.ConstantFill([], "ONE", shape=[1], value=1.)
         ITER = init_net.ConstantFill([], "ITER", shape=[1], value=0,
-                                     dtype=core.DataType.INT32)
+                                     dtype=core.DataType.INT64)
         train_net = core.Net("train")
         X = train_net.GaussianFill([], "X", shape=[64, 2], mean=0.0, std=1.0)

View File

@@ -117,6 +117,9 @@ class TestWorkspace(unittest.TestCase):
         """ test in-place initialization """
         tensor.init([2, 3], core.DataType.INT32)
+        for x in range(2):
+            for y in range(3):
+                tensor.data[x, y] = 0
         tensor.data[1, 1] = 100
         val = np.zeros([2, 3], dtype=np.int32)
         val[1, 1] = 100