mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-21 05:34:18 +08:00
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/20180 Adding ShufflenetV2 (by Ma et al., 2018) to Caffe2's benchmark suite. To run, use: `buck run mode/opt caffe2/caffe2/python/examples:imagenet_trainer -- --train_data null --batch_size 128 --epoch_size 3200 --num_epochs 2 --num_gpus 2 --model shufflenet` Reviewed By: bddppq, xw285cornell Differential Revision: D15094282 fbshipit-source-id: 0e1ce9c5975868e917b0f179e2c5b15647a76b4e
201 lines
5.8 KiB
Python
201 lines
5.8 KiB
Python
from __future__ import absolute_import
|
|
from __future__ import division
|
|
from __future__ import print_function
|
|
from __future__ import unicode_literals
|
|
|
|
import numpy as np
|
|
import time
|
|
|
|
from caffe2.python import workspace, cnn, memonger, core
|
|
|
|
def has_blob(proto, needle):
    """Return True if any operator in ``proto.op`` references ``needle``.

    Each operator's ``input`` names are checked before its ``output`` names,
    and the scan stops at the first match.

    Args:
        proto: object exposing an iterable ``op`` attribute whose items have
            iterable ``input`` and ``output`` attributes.
        needle: blob name to look for (compared with ``==``).

    Returns:
        True if ``needle`` equals some input or output name, otherwise False.
    """
    return any(
        blob == needle
        for op in proto.op
        for blob_names in (op.input, op.output)
        for blob in blob_names
    )
|
|
|
|
|
|
def count_blobs(proto):
    """Return the number of distinct blob names used by ``proto``'s operators.

    A blob is counted once no matter how many operators read or write it.

    Args:
        proto: object exposing an iterable ``op`` attribute whose items have
            iterable ``input`` and ``output`` attributes of hashable names.

    Returns:
        Count of unique names appearing in any operator input or output.
    """
    blobs = set()
    for op in proto.op:
        # Grow the set in place; rebuilding it with union() on every operator
        # copies all names seen so far and makes the scan quadratic.
        blobs.update(op.input, op.output)
    return len(blobs)
|
|
|
|
|
|
def count_shared_blobs(proto):
    """Return how many distinct blob names in ``proto`` contain ``"_shared"``.

    Args:
        proto: object exposing an iterable ``op`` attribute whose items have
            iterable ``input`` and ``output`` attributes of string names.

    Returns:
        Number of unique input/output names that include the substring
        ``"_shared"``.
    """
    blobs = set()
    for op in proto.op:
        # Grow the set in place instead of rebuilding it with union() on every
        # operator, which copies all names seen so far each time.
        blobs.update(op.input, op.output)
    # Count matches directly rather than building a list only to measure it.
    return sum(1 for name in blobs if "_shared" in name)
|
|
|
|
|
|
def test_shared_grads(
    with_shapes,
    create_model,
    conv_blob,
    last_out_blob,
    data_blob='gpu_0/data',
    label_blob='gpu_0/label',
    num_labels=1000,
):
    """Build a training net, apply ``memonger.share_grad_blobs`` and run both.

    The unoptimized net and the optimized proto are each run once on the same
    randomly generated batch; the values needed to compare them are returned
    so the caller can make the assertions.

    Args:
        with_shapes: when truthy, the inferred blob shapes are passed to
            ``share_grad_blobs`` as ``blob_shapes``; otherwise ``None`` is.
        create_model: callable invoked as ``create_model(model, data,
            num_input_channels=3, num_labels=..., label=..., is_test=False)``
            that returns a ``(softmax, loss)`` pair.
        conv_blob: key into the param-to-gradient mapping; its gradient blob
            is excluded from sharing and fetched after each run.
        last_out_blob: name of the blob fetched as the "loss" after each run.
        data_blob: name of the external input blob holding the images.
        label_blob: name of the external input blob holding the labels.
        num_labels: number of classes, forwarded to ``create_model`` and used
            to scale the random labels.

    Returns:
        A list of three pairs:
        ``[(count_after, count_before), (loss1, optimized_loss1),
        (conv1_w_grad, optim_conv1_w_grad)]``.
    """
    model = cnn.CNNModelHelper(
        order="NCHW",
        name="test",
        cudnn_exhaustive_search=True,
    )
    with core.NameScope("gpu_0"):
        data_ref = model.net.AddExternalInput(data_blob)
        label_ref = model.net.AddExternalInput(label_blob)
        _softmax, loss = create_model(
            model,
            data_ref,
            num_input_channels=3,
            num_labels=num_labels,
            label=label_ref,
            is_test=False,
        )

    param_to_grad = model.AddGradientOperators([loss])

    shapes, _types = workspace.InferShapesAndTypes(
        [model.param_init_net, model.net],
        {
            data_blob: [4, 3, 227, 227],
            label_blob: [4],
        },
    )

    count_before = count_blobs(model.net.Proto())
    blob_shapes = shapes if with_shapes else None
    optim_proto = memonger.share_grad_blobs(
        model.net,
        ["gpu_0/loss"],
        set(model.param_to_grad.values()),
        "gpu_0/",
        share_activations=True,
        # Keep the gradient we fetch below out of the sharing pool.
        dont_share_blobs={str(param_to_grad[conv_blob])},
        blob_shapes=blob_shapes,
    )
    count_after = count_blobs(optim_proto)

    # Run both nets on the same inputs. The caller compares the loss and the
    # gradient of `conv_blob` between the two runs.
    workspace.RunNetOnce(model.param_init_net)
    data_batch = np.random.rand(4, 3, 227, 227).astype(np.float32)
    label_batch = (np.random.rand(4) * num_labels).astype(np.int32)

    workspace.FeedBlob(data_blob, data_batch)
    workspace.FeedBlob(label_blob, label_batch)

    workspace.RunNetOnce(model.net)
    # NOTE(review): these fields are assigned after model.net has already been
    # run and after optim_proto was built, so they do not influence the
    # RunNetOnce call above -- confirm whether an earlier placement was meant.
    model.net.Proto().type = 'dag'
    model.net.Proto().num_workers = 4
    loss1 = workspace.FetchBlob(last_out_blob)
    conv1_w_grad = workspace.FetchBlob(param_to_grad[conv_blob])
    # Overwrite the gradient blob before the second run; presumably so that a
    # stale value from the first run cannot pass for the optimized result.
    workspace.FeedBlob(param_to_grad[conv_blob], np.array([0.0]))

    workspace.RunNetOnce(optim_proto)
    optimized_loss1 = workspace.FetchBlob(last_out_blob)
    optim_conv1_w_grad = workspace.FetchBlob(param_to_grad[conv_blob])

    blob_counts = (count_after, count_before)
    losses = (loss1, optimized_loss1)
    conv_grads = (conv1_w_grad, optim_conv1_w_grad)
    return [blob_counts, losses, conv_grads]
|
|
|
|
|
|
def test_forward_only(
    create_model,
    last_out_blob,
    data_blob='gpu_0/data',
    num_labels=1000,
):
    """Build an inference net, apply ``optimize_inference_for_dag``, run both.

    The unoptimized net and the optimized proto are each run once on the same
    randomly generated batch; the values needed to compare them are returned
    so the caller can make the assertions.

    Args:
        create_model: callable invoked as ``create_model(model, data,
            num_input_channels=3, num_labels=..., is_test=True)``.
        last_out_blob: name of the blob fetched after each run.
        data_blob: name of the external input blob holding the images.
        num_labels: number of classes, forwarded to ``create_model``.

    Returns:
        A three-element list:
        ``[(count_after, count_before), num_shared_blobs,
        (loss1, optimized_loss1)]``.
    """
    model = cnn.CNNModelHelper(
        order="NCHW",
        name="test",
        cudnn_exhaustive_search=True,
    )
    with core.NameScope("gpu_0"):
        data_ref = model.net.AddExternalInput(data_blob)
        create_model(
            model,
            data_ref,
            num_input_channels=3,
            num_labels=num_labels,
            is_test=True,
        )

    count_before = count_blobs(model.net.Proto())
    optim_proto = memonger.optimize_inference_for_dag(
        model.net,
        [data_blob],
        "gpu_0/",
    )
    count_after = count_blobs(optim_proto)
    num_shared_blobs = count_shared_blobs(optim_proto)

    # Run both nets on the same input so the caller can compare the outputs.
    workspace.RunNetOnce(model.param_init_net)
    data_batch = np.random.rand(4, 3, 227, 227).astype(np.float32)

    workspace.FeedBlob(data_blob, data_batch)
    workspace.RunNetOnce(model.net)
    # NOTE(review): these fields are assigned after model.net has already been
    # run and after optim_proto was built, so they do not influence the
    # RunNetOnce call above -- confirm whether an earlier placement was meant.
    model.net.Proto().type = 'dag'
    model.net.Proto().num_workers = 4
    loss1 = workspace.FetchBlob(last_out_blob)

    workspace.RunNetOnce(optim_proto)
    optimized_loss1 = workspace.FetchBlob(last_out_blob)

    blob_counts = (count_after, count_before)
    losses = (loss1, optimized_loss1)
    # The middle element is the bare count, not a 1-tuple.
    return [blob_counts, num_shared_blobs, losses]
|
|
|
|
|
|
def test_forward_only_fast_simplenet(
    create_model,
    last_out_blob,
    data_blob="gpu_0/data",
    num_labels=1000,
):
    """Build an inference net, apply ``optimize_inference_fast``, run both.

    The optimization is timed and the elapsed time and blob counts are
    printed. The unoptimized net (with its type set to ``'simple'``) and the
    optimized proto are then each run once on the same random batch.

    Args:
        create_model: callable invoked as ``create_model(model, data,
            num_input_channels=3, num_labels=..., is_test=True)``.
        last_out_blob: name of the blob fetched after each run; it is also
            passed to the optimizer together with ``data_blob`` and the net's
            external inputs.
        data_blob: name of the external input blob holding the images.
        num_labels: number of classes, forwarded to ``create_model``.

    Returns:
        A three-element list:
        ``[(count_after, count_before), num_shared_blobs,
        (loss1, optimized_loss1)]``.
    """
    model = cnn.CNNModelHelper(
        order="NCHW",
        name="test",
        cudnn_exhaustive_search=True,
    )
    with core.NameScope("gpu_0"):
        data_ref = model.net.AddExternalInput(data_blob)
        create_model(
            model,
            data_ref,
            num_input_channels=3,
            num_labels=num_labels,
            is_test=True,
        )

    count_before = count_blobs(model.net.Proto())
    started_at = time.time()
    optim_proto = memonger.optimize_inference_fast(
        model.net.Proto(),
        # Second argument: the data blob, the output blob and every external
        # input of the net.
        {data_blob, last_out_blob} | set(model.net.Proto().external_input),
    )
    print("Optimization took {} secs".format(time.time() - started_at))
    count_after = count_blobs(optim_proto)
    num_shared_blobs = count_shared_blobs(optim_proto)

    print(count_after, count_before, num_shared_blobs)

    # Run both nets on the same input so the caller can compare the outputs.
    workspace.RunNetOnce(model.param_init_net)
    data_batch = np.random.rand(4, 3, 227, 227).astype(np.float32)

    workspace.FeedBlob(data_blob, data_batch)
    # Unlike the DAG variants, the net type is set before the baseline run.
    model.net.Proto().type = 'simple'

    workspace.RunNetOnce(model.net)
    loss1 = workspace.FetchBlob(last_out_blob)

    workspace.RunNetOnce(optim_proto)
    optimized_loss1 = workspace.FetchBlob(last_out_blob)

    blob_counts = (count_after, count_before)
    losses = (loss1, optimized_loss1)
    # The middle element is the bare count, not a 1-tuple.
    return [blob_counts, num_shared_blobs, losses]
|