Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 21:14:14 +08:00)
Enable junk fill for the default CPU allocator (#13377)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/13377

* Enable junk fill for the default CPU allocator. This first diff only enables it for the tests; a second diff will change the default of zero-fill to false.
* Fix tests to use the 64-bit counters that IterOp and LearningRateOp demand.
* Fix kernels that use uninitialized memory.

Reviewed By: salexspb

Differential Revision: D10866512

fbshipit-source-id: 17860e77e63a203edf46d0da0335608f77884821
Commit de41d1ae0b (parent 21991c05a9), committed by Facebook Github Bot.
@@ -13,8 +13,30 @@ C10_DEFINE_bool(
     true,
     "If set, do memory zerofilling when allocating on CPU");
 
+C10_DEFINE_bool(
+    caffe2_cpu_allocator_do_junk_fill,
+    false,
+    "If set, fill memory with deterministic junk when allocating on CPU");
+
 namespace caffe2 {
 
+void memset_junk(void* data, size_t num) {
+  // This garbage pattern is NaN when interpreted as floating point values,
+  // or as very large integer values.
+  static constexpr int32_t kJunkPattern = 0x7fedbeef;
+  static constexpr int64_t kJunkPattern64 =
+      static_cast<int64_t>(kJunkPattern) << 32 | kJunkPattern;
+  int32_t int64_count = num / sizeof(kJunkPattern64);
+  int32_t remaining_bytes = num % sizeof(kJunkPattern64);
+  int64_t* data_i64 = reinterpret_cast<int64_t*>(data);
+  for (int i = 0; i < int64_count; i++) {
+    data_i64[i] = kJunkPattern64;
+  }
+  if (remaining_bytes > 0) {
+    memcpy(data_i64 + int64_count, &kJunkPattern64, remaining_bytes);
+  }
+}
+
 void NoDelete(void*) {}
 
 at::Allocator* GetCPUAllocator() {
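The choice of 0x7fedbeef can be sanity-checked outside of caffe2: reinterpreted as an IEEE-754 float32, the pattern has all exponent bits set and a nonzero mantissa, i.e. it is a NaN, so floating-point math over junk-filled memory is immediately poisoned rather than silently plausible. A minimal standalone sketch (not part of the diff):

    import math
    import struct

    # Reinterpret the 32-bit junk pattern as a float32: all exponent bits set
    # plus a nonzero mantissa is the definition of a NaN.
    as_float = struct.unpack('<f', struct.pack('<I', 0x7fedbeef))[0]
    assert math.isnan(as_float)

    # Read back as a signed 32-bit integer, it is a very large value.
    as_int = struct.unpack('<i', struct.pack('<I', 0x7fedbeef))[0]
    print(as_int)  # 2146352879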
@@ -10,6 +10,7 @@
 
 C10_DECLARE_bool(caffe2_report_cpu_memory_usage);
 C10_DECLARE_bool(caffe2_cpu_allocator_do_zero_fill);
+C10_DECLARE_bool(caffe2_cpu_allocator_do_junk_fill);
 
 namespace caffe2 {
 
@@ -43,6 +44,11 @@ class CAFFE2_API MemoryAllocationReporter {
   size_t allocated_;
 };
 
+// Fill the data memory region of num bytes with a particular garbage pattern.
+// The garbage value is chosen to be NaN if interpreted as floating point value,
+// or a very large integer.
+CAFFE2_API void memset_junk(void* data, size_t num);
+
 struct CAFFE2_API DefaultCPUAllocator final : at::Allocator {
   DefaultCPUAllocator() {}
   ~DefaultCPUAllocator() override {}
@@ -58,8 +64,14 @@ struct CAFFE2_API DefaultCPUAllocator final : at::Allocator {
     CAFFE_ENFORCE(data);
     // move data to a thread's NUMA node
     NUMAMove(data, nbytes, GetCurrentNUMANode());
+    CHECK(
+        !FLAGS_caffe2_cpu_allocator_do_zero_fill ||
+        !FLAGS_caffe2_cpu_allocator_do_junk_fill)
+        << "Cannot request both zero-fill and junk-fill at the same time";
     if (FLAGS_caffe2_cpu_allocator_do_zero_fill) {
       memset(data, 0, nbytes);
-    }
+    } else if (FLAGS_caffe2_cpu_allocator_do_junk_fill) {
+      memset_junk(data, nbytes);
+    }
     if (FLAGS_caffe2_report_cpu_memory_usage) {
       reporter_.New(data, nbytes);
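As the CHECK above enforces, zero fill and junk fill are mutually exclusive. A minimal usage sketch from the Python side, mirroring the test setup that appears later in this diff:

    from caffe2.python import workspace

    # Junk fill and zero fill cannot both be on (the allocator CHECKs this),
    # so zero fill is explicitly disabled when junk fill is enabled.
    workspace.GlobalInit([
        'caffe2',
        '--caffe2_cpu_allocator_do_zero_fill=0',
        '--caffe2_cpu_allocator_do_junk_fill=1',
    ])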
@@ -92,13 +92,23 @@ bool InstanceNormGradientOp<T, Context>::RunOnDeviceWithOrderNHWC() {
 
   // for each channel
   // dl/dbias = sum_j dl/dy_j
-  bias_grad_arr += output_grad_mat.rowwise().sum();
+  auto bias_grad_delta = output_grad_mat.rowwise().sum();
+  if (n == 0) {
+    bias_grad_arr = bias_grad_delta;
+  } else {
+    bias_grad_arr += bias_grad_delta;
+  }
   // for each channel
   // dl/dscale = sum_j dl/dy_j (x_j - mu) / stdev
-  scale_grad_arr +=
+  auto scale_grad_delta =
       ((input_grad_mat.colwise() * inv_stdev_arr) * output_grad_mat)
           .rowwise()
           .sum();
+  if (n == 0) {
+    scale_grad_arr = scale_grad_delta;
+  } else {
+    scale_grad_arr += scale_grad_delta;
+  }
 
   // dl/dx_j = this gross thing
   // Derived gradient and manually massaged it to minimize extra storage
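This hunk is the general shape of the bug class that junk fill exposes: accumulating into a freshly allocated gradient buffer with += silently depended on zero fill, and under junk fill the accumulator now starts out as NaN, so the first pass must assign rather than accumulate. An illustrative NumPy sketch of the fix pattern (not caffe2 code):

    import numpy as np

    acc = np.full(3, np.nan)      # stands in for a junk-filled allocation
    batches = [np.ones(3), np.ones(3)]
    for n, batch in enumerate(batches):
        if n == 0:
            acc[:] = batch        # first pass: overwrite the junk
        else:
            acc += batch          # later passes: accumulate
    assert not np.isnan(acc).any()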
@@ -205,8 +205,8 @@ bool PadEmptySamplesOp<CPUContext>::RunOnDevice() {
   // copy data and add padding index as zero
   Tensor zero{CPU};
   zero.Resize(block_size);
-  auto zeroPtr =
-      static_cast<const char*>(zero.raw_mutable_data(features.dtype()));
+  auto zeroPtr = static_cast<char*>(zero.raw_mutable_data(features.dtype()));
+  memset(zeroPtr, 0, zero.nbytes());
   int start_dest = 0;
   int start_src = 0;
   for (int i = 0; i < lengths.numel(); ++i) {
@@ -26,10 +26,15 @@ class GatherPaddingOp final : public Operator<Context> {
   bool RunOnDevice() override {
     if (startPaddingWidth_ == 0 && endPaddingWidth_ == 0) {
       Output(0)->Resize(std::vector<int64_t>(0));
-      Output(0)->template mutable_data<int64_t>();
+      auto output_0_data = Output(0)->template mutable_data<int64_t>();
+      // TODO(zhengxq): as suggested by salex@, change this to a loop.
+      math::Set<int64_t, Context>(
+          Output(0)->numel(), 0, output_0_data, &context_);
       if (OutputSize() == 2) {
         Output(1)->Resize(std::vector<int64_t>(0));
-        Output(1)->template mutable_data<int64_t>();
+        auto output_1_data = Output(1)->template mutable_data<int64_t>();
+        math::Set<int64_t, Context>(
+            Output(1)->numel(), 0, output_1_data, &context_);
       }
       return true;
     }
@@ -252,6 +252,7 @@ class SparseToDenseMaskGradientOp : public SparseToDenseMaskBase<Context> {
 
     char* output_data =
         static_cast<char*>(output->raw_mutable_data(gradient_output.dtype()));
-    memset(output_data, 0, output->nbytes());
+    math::Set<char, Context>(
+        default_length * gradient_output.itemsize(), 0, output_data, &context_);
 
@@ -81,7 +81,8 @@ bool fuseConvBNHelper(repr::NNModule* nn, caffe2::Workspace* ws) {
       // Get output channel
       size_t c = filterTensor->dim32(0);
       tensor->Resize(c);
-      tensor->mutable_data<float>();
+      float* tensor_data = tensor->mutable_data<float>();
+      memset(tensor_data, 0, tensor->nbytes());
       break;
     }
     convOrder++;
@@ -458,7 +458,7 @@ class TestIf(test_util.TestCase):
 
         init_net = init_nb.get()[0]
         ITER = init_net.ConstantFill(
-            [], "ITER", shape=[1], value=0, dtype=core.DataType.INT32)
+            [], "ITER", shape=[1], value=0, dtype=core.DataType.INT64)
         train_net.Iter(ITER, ITER)
         LR = train_net.LearningRate(ITER, "LR", base_lr=-0.1,
                                     policy="step", stepsize=20, gamma=0.9)
@@ -48,6 +48,8 @@ class TestCase(unittest.TestCase):
         workspace.GlobalInit([
             'caffe2',
             '--caffe2_log_level=0',
+            '--caffe2_cpu_allocator_do_zero_fill=0',
+            '--caffe2_cpu_allocator_do_junk_fill=1',
         ])
         # clear the default engines settings to separate out its
         # effect from the ops tests
@@ -24,7 +24,7 @@ class TestToyRegression(test_util.TestCase):
         LR = init_net.ConstantFill([], "LR", shape=[1], value=-0.1)
         ONE = init_net.ConstantFill([], "ONE", shape=[1], value=1.)
         ITER = init_net.ConstantFill([], "ITER", shape=[1], value=0,
-                                     dtype=core.DataType.INT32)
+                                     dtype=core.DataType.INT64)
 
         train_net = core.Net("train")
         X = train_net.GaussianFill([], "X", shape=[64, 2], mean=0.0, std=1.0)
@@ -117,6 +117,9 @@ class TestWorkspace(unittest.TestCase):
 
         """ test in-place initialization """
        tensor.init([2, 3], core.DataType.INT32)
+        for x in range(2):
+            for y in range(3):
+                tensor.data[x, y] = 0
         tensor.data[1, 1] = 100
         val = np.zeros([2, 3], dtype=np.int32)
         val[1, 1] = 100