Enable junk fill for the default CPU allocator (#13377)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/13377

* Enable junk fill for the default CPU allocator. This first diff only enables it for the tests; a second diff will change the zero-fill default to false.
* Fix tests to use the 64-bit counters that IterOp and LearningRateOp demand.
* Fix kernels that use uninitialized memory.

Reviewed By: salexspb

Differential Revision: D10866512

fbshipit-source-id: 17860e77e63a203edf46d0da0335608f77884821
Author: Xiaoqiang Zheng
Date: 2018-11-08 00:00:39 -08:00
Committed by: Facebook Github Bot
Parent: 21991c05a9
Commit: de41d1ae0b
11 changed files with 65 additions and 9 deletions

View File

@@ -13,8 +13,30 @@ C10_DEFINE_bool(
     true,
     "If set, do memory zerofilling when allocating on CPU");
 
+C10_DEFINE_bool(
+    caffe2_cpu_allocator_do_junk_fill,
+    false,
+    "If set, fill memory with deterministic junk when allocating on CPU");
+
 namespace caffe2 {
 
+void memset_junk(void* data, size_t num) {
+  // This garbage pattern is NaN when interpreted as floating point values,
+  // or as very large integer values.
+  static constexpr int32_t kJunkPattern = 0x7fedbeef;
+  static constexpr int64_t kJunkPattern64 =
+      static_cast<int64_t>(kJunkPattern) << 32 | kJunkPattern;
+  int32_t int64_count = num / sizeof(kJunkPattern64);
+  int32_t remaining_bytes = num % sizeof(kJunkPattern64);
+  int64_t* data_i64 = reinterpret_cast<int64_t*>(data);
+  for (int i = 0; i < int64_count; i++) {
+    data_i64[i] = kJunkPattern64;
+  }
+  if (remaining_bytes > 0) {
+    memcpy(data_i64 + int64_count, &kJunkPattern64, remaining_bytes);
+  }
+}
+
 void NoDelete(void*) {}
 
 at::Allocator* GetCPUAllocator() {
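
A minimal standalone check (not part of the diff) of the two properties the comment above claims: the 32-bit pattern reads as NaN when its bits are viewed as a float, and the 64-bit pattern is an enormous value when viewed as an integer.

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

int main() {
  constexpr int32_t kJunkPattern = 0x7fedbeef;
  constexpr int64_t kJunkPattern64 =
      static_cast<int64_t>(kJunkPattern) << 32 | kJunkPattern;
  float f;
  std::memcpy(&f, &kJunkPattern, sizeof(f));  // reinterpret the bit pattern
  assert(std::isnan(f));                      // exponent bits are all ones
  assert(kJunkPattern64 > INT64_C(1) << 62);  // enormous as an integer
  return 0;
}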

View File

@@ -10,6 +10,7 @@
 C10_DECLARE_bool(caffe2_report_cpu_memory_usage);
 C10_DECLARE_bool(caffe2_cpu_allocator_do_zero_fill);
+C10_DECLARE_bool(caffe2_cpu_allocator_do_junk_fill);
 
 namespace caffe2 {
@@ -43,6 +44,11 @@ class CAFFE2_API MemoryAllocationReporter {
   size_t allocated_;
 };
 
+// Fill the data memory region of num bytes with a particular garbage pattern.
+// The garbage value is chosen to be NaN if interpreted as a floating point
+// value, or as a very large integer.
+CAFFE2_API void memset_junk(void* data, size_t num);
+
 struct CAFFE2_API DefaultCPUAllocator final : at::Allocator {
   DefaultCPUAllocator() {}
   ~DefaultCPUAllocator() override {}
@@ -58,8 +64,14 @@ struct CAFFE2_API DefaultCPUAllocator final : at::Allocator {
     CAFFE_ENFORCE(data);
     // move data to a thread's NUMA node
     NUMAMove(data, nbytes, GetCurrentNUMANode());
+    CHECK(
+        !FLAGS_caffe2_cpu_allocator_do_zero_fill ||
+        !FLAGS_caffe2_cpu_allocator_do_junk_fill)
+        << "Cannot request both zero-fill and junk-fill at the same time";
     if (FLAGS_caffe2_cpu_allocator_do_zero_fill) {
       memset(data, 0, nbytes);
+    } else if (FLAGS_caffe2_cpu_allocator_do_junk_fill) {
+      memset_junk(data, nbytes);
     }
     if (FLAGS_caffe2_report_cpu_memory_usage) {
       reporter_.New(data, nbytes);
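
A hedged sketch of how a C++ caller could opt into the new mode. The flag symbols and GetCPUAllocator() come from this diff; the allocate()/DataPtr usage is the usual at::Allocator interface, and the surrounding glue is illustrative only. Setting the FLAGS_ globals directly mirrors what the test harness later in this commit does via GlobalInit arguments.

#include <cassert>
#include <cmath>
#include "caffe2/core/allocator.h"

void check_junk_fill() {
  // The CHECK above forbids enabling both fills at once.
  FLAGS_caffe2_cpu_allocator_do_zero_fill = false;
  FLAGS_caffe2_cpu_allocator_do_junk_fill = true;
  at::DataPtr ptr = caffe2::GetCPUAllocator()->allocate(16 * sizeof(float));
  const float* data = static_cast<const float*>(ptr.get());
  assert(std::isnan(data[0]));  // fresh memory now reads as NaN
}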

View File

@@ -92,13 +92,23 @@ bool InstanceNormGradientOp<T, Context>::RunOnDeviceWithOrderNHWC() {
     // for each channel
     // dl/dbias = sum_j dl/dy_j
-    bias_grad_arr += output_grad_mat.rowwise().sum();
+    auto bias_grad_delta = output_grad_mat.rowwise().sum();
+    if (n == 0) {
+      bias_grad_arr = bias_grad_delta;
+    } else {
+      bias_grad_arr += bias_grad_delta;
+    }
 
     // for each channel
     // dl/dscale = sum_j dl/dy_j (x_j - mu) / stdev
-    scale_grad_arr +=
+    auto scale_grad_delta =
         ((input_grad_mat.colwise() * inv_stdev_arr) * output_grad_mat)
             .rowwise()
             .sum();
+    if (n == 0) {
+      scale_grad_arr = scale_grad_delta;
+    } else {
+      scale_grad_arr += scale_grad_delta;
+    }
 
     // dl/dx_j = this gross thing
     // Derived gradient and manually massaged it to minimize extra storage
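
The fix pattern here generalizes: once allocations may contain junk, an accumulator must be assigned on the first pass rather than accumulated into. A minimal sketch of the idiom (hypothetical names, not from the diff):

#include <vector>

void accumulate(std::vector<float>& acc, const std::vector<float>& delta, int n) {
  for (size_t i = 0; i < acc.size(); ++i) {
    if (n == 0) {
      acc[i] = delta[i];   // first sample overwrites whatever junk is there
    } else {
      acc[i] += delta[i];  // later samples accumulate as before
    }
  }
}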

View File

@@ -205,8 +205,8 @@ bool PadEmptySamplesOp<CPUContext>::RunOnDevice() {
     // copy data and add padding index as zero
     Tensor zero{CPU};
     zero.Resize(block_size);
-    auto zeroPtr =
-        static_cast<const char*>(zero.raw_mutable_data(features.dtype()));
+    auto zeroPtr = static_cast<char*>(zero.raw_mutable_data(features.dtype()));
+    memset(zeroPtr, 0, zero.nbytes());
     int start_dest = 0;
     int start_src = 0;
     for (int i = 0; i < lengths.numel(); ++i) {

View File

@@ -26,10 +26,15 @@ class GatherPaddingOp final : public Operator<Context> {
   bool RunOnDevice() override {
     if (startPaddingWidth_ == 0 && endPaddingWidth_ == 0) {
       Output(0)->Resize(std::vector<int64_t>(0));
-      Output(0)->template mutable_data<int64_t>();
+      auto output_0_data = Output(0)->template mutable_data<int64_t>();
+      // TODO(zhengxq): as suggested by salex@, change this to a loop.
+      math::Set<int64_t, Context>(
+          Output(0)->numel(), 0, output_0_data, &context_);
       if (OutputSize() == 2) {
         Output(1)->Resize(std::vector<int64_t>(0));
-        Output(1)->template mutable_data<int64_t>();
+        auto output_1_data = Output(1)->template mutable_data<int64_t>();
+        math::Set<int64_t, Context>(
+            Output(1)->numel(), 0, output_1_data, &context_);
       }
       return true;
     }

View File

@@ -252,6 +252,7 @@ class SparseToDenseMaskGradientOp : public SparseToDenseMaskBase<Context> {
     char* output_data =
         static_cast<char*>(output->raw_mutable_data(gradient_output.dtype()));
-    memset(output_data, 0, output->nbytes());
+    math::Set<char, Context>(
+        default_length * gradient_output.itemsize(), 0, output_data, &context_);
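
math::Set is the device-generic counterpart of memset here: it writes a typed fill value through the Context, so the same operator code works beyond CPU. A minimal sketch under that assumption, with the signature as used in the hunk above:

#include "caffe2/core/context.h"
#include "caffe2/utils/math.h"

// Zero out nbytes of freshly allocated output, as the hunk above does for
// the default segment only; equivalent to memset(data, 0, nbytes) on CPU.
void zero_output(char* data, size_t nbytes, caffe2::CPUContext* context) {
  caffe2::math::Set<char, caffe2::CPUContext>(nbytes, 0, data, context);
}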

View File

@@ -81,7 +81,8 @@ bool fuseConvBNHelper(repr::NNModule* nn, caffe2::Workspace* ws) {
       // Get output channel
       size_t c = filterTensor->dim32(0);
       tensor->Resize(c);
-      tensor->mutable_data<float>();
+      float* tensor_data = tensor->mutable_data<float>();
+      memset(tensor_data, 0, tensor->nbytes());
       break;
     }
     convOrder++;

View File

@@ -458,7 +458,7 @@ class TestIf(test_util.TestCase):
         init_net = init_nb.get()[0]
         ITER = init_net.ConstantFill(
-            [], "ITER", shape=[1], value=0, dtype=core.DataType.INT32)
+            [], "ITER", shape=[1], value=0, dtype=core.DataType.INT64)
         train_net.Iter(ITER, ITER)
         LR = train_net.LearningRate(ITER, "LR", base_lr=-0.1,
                                     policy="step", stepsize=20, gamma=0.9)

View File

@@ -48,6 +48,8 @@ class TestCase(unittest.TestCase):
         workspace.GlobalInit([
             'caffe2',
             '--caffe2_log_level=0',
+            '--caffe2_cpu_allocator_do_zero_fill=0',
+            '--caffe2_cpu_allocator_do_junk_fill=1',
         ])
         # clear the default engine settings to separate out their
         # effect from the ops tests

View File

@@ -24,7 +24,7 @@ class TestToyRegression(test_util.TestCase):
         LR = init_net.ConstantFill([], "LR", shape=[1], value=-0.1)
         ONE = init_net.ConstantFill([], "ONE", shape=[1], value=1.)
         ITER = init_net.ConstantFill([], "ITER", shape=[1], value=0,
-                                     dtype=core.DataType.INT32)
+                                     dtype=core.DataType.INT64)
         train_net = core.Net("train")
         X = train_net.GaussianFill([], "X", shape=[64, 2], mean=0.0, std=1.0)

View File

@@ -117,6 +117,9 @@ class TestWorkspace(unittest.TestCase):
         """ test in-place initialization """
         tensor.init([2, 3], core.DataType.INT32)
+        for x in range(2):
+            for y in range(3):
+                tensor.data[x, y] = 0
         tensor.data[1, 1] = 100
         val = np.zeros([2, 3], dtype=np.int32)
         val[1, 1] = 100