Mirror of https://github.com/pytorch/pytorch.git
Adding ShufflenetV2 to caffe2's benchmark suite. (#20180)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/20180

Adds ShuffleNetV2 (Ma et al., 2018) to Caffe2's benchmark suite. To run, use:

`buck run mode/opt caffe2/caffe2/python/examples:imagenet_trainer -- --train_data null --batch_size 128 --epoch_size 3200 --num_epochs 2 --num_gpus 2 --model shufflenet`

Reviewed By: bddppq, xw285cornell
Differential Revision: D15094282
fbshipit-source-id: 0e1ce9c5975868e917b0f179e2c5b15647a76b4e
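For a rough sense of what that example command works out to inside the trainer added below, here is a back-of-the-envelope sketch. The numbers come from the command above (single machine, so num_shards = 1), and the formulas mirror the `epoch_iters` and `stepsz` computations in `imagenet_trainer.py`; nothing here is taken from outside this diff.

```python
# Arithmetic for the example invocation in the summary:
#   --batch_size 128 --epoch_size 3200 --num_epochs 2 --num_gpus 2 --model shufflenet
batch_size = 128      # total over all GPUs
epoch_size = 3200
num_shards = 1        # single machine
num_gpus = 2

batch_per_device = batch_size // num_gpus                  # 64 images per GPU per step
epoch_iters = int(epoch_size / batch_size / num_shards)    # 25 iterations per epoch
lr_step = int(30 * epoch_size / batch_size / num_shards)   # LR decays every 750 iterations

print(batch_per_device, epoch_iters, lr_step)  # 64 25 750
```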
Committed by Facebook Github Bot. Parent: 3aa7ee6fe6. Commit: f93e0619f3.

caffe2/python/examples/imagenet_trainer.py (727 lines, new file)
@@ -0,0 +1,727 @@
# Module caffe2.python.examples.imagenet_trainer
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import argparse
import logging
import numpy as np
import time
import os

from caffe2.python import core, workspace, experiment_util, data_parallel_model
from caffe2.python import dyndep, optimizer
from caffe2.python import timeout_guard, model_helper, brew
from caffe2.proto import caffe2_pb2

import caffe2.python.models.resnet as resnet
import caffe2.python.models.shufflenet as shufflenet
from caffe2.python.modeling.initializers import Initializer, PseudoFP16Initializer
import caffe2.python.predictor.predictor_exporter as pred_exp
import caffe2.python.predictor.predictor_py_utils as pred_utils
from caffe2.python.predictor_constants import predictor_constants as predictor_constants

'''
Parallelized multi-GPU distributed trainer for ResNe(X)t and ShuffleNet.
Can be used to train on imagenet data, for example.
The default parameters can train a standard ResNet-50 (1x64d), and parameters
can be provided to train ResNe(X)t models (e.g., ResNeXt-101 32x4d).

Run the trainer in single-machine multi-GPU mode by setting num_shards = 1.

To run the trainer in multi-machine multi-GPU mode with M machines,
run the same program on all machines, specifying num_shards = M, and
shard_id = a unique integer in the set [0, M-1].

For rendezvous (the trainer processes have to know about each other),
you can either use a directory path that is visible to all processes
(e.g. an NFS directory), or use a Redis instance. Use the former by
passing the `file_store_path` argument. Use the latter by passing the
`redis_host` and `redis_port` arguments.
'''

logging.basicConfig()
log = logging.getLogger("Imagenet_trainer")
log.setLevel(logging.DEBUG)

dyndep.InitOpsLibrary('@/caffe2/caffe2/distributed:file_store_handler_ops')
dyndep.InitOpsLibrary('@/caffe2/caffe2/distributed:redis_store_handler_ops')


def AddImageInput(
    model,
    reader,
    batch_size,
    img_size,
    dtype,
    is_test,
    mean_per_channel=None,
    std_per_channel=None,
):
    '''
    The image input operator loads image and label data from the reader and
    applies transformations to the images (random cropping, mirroring, ...).
    '''
    data, label = brew.image_input(
        model,
        reader, ["data", "label"],
        batch_size=batch_size,
        output_type=dtype,
        use_gpu_transform=True if core.IsGPUDeviceType(model._device_type) else False,
        use_caffe_datum=True,
        mean_per_channel=mean_per_channel,
        std_per_channel=std_per_channel,
        # mean_per_channel takes precedence over mean
        mean=128.,
        std=128.,
        scale=256,
        crop=img_size,
        mirror=1,
        is_test=is_test,
    )

    data = model.StopGradient(data, data)


def AddNullInput(model, reader, batch_size, img_size, dtype):
    '''
    The null input function uses a gaussian fill operator to emulate real image
    input. A label blob is hardcoded to a single value. This is useful if you
    want to test compute throughput or don't have a dataset available.
    '''
    suffix = "_fp16" if dtype == "float16" else ""
    model.param_init_net.GaussianFill(
        [],
        ["data" + suffix],
        shape=[batch_size, 3, img_size, img_size],
    )
    if dtype == "float16":
        model.param_init_net.FloatToHalf("data" + suffix, "data")

    model.param_init_net.ConstantFill(
        [],
        ["label"],
        shape=[batch_size],
        value=1,
        dtype=core.DataType.INT32,
    )


def SaveModel(args, train_model, epoch, use_ideep):
    prefix = "{}_{}".format(train_model._device_prefix, train_model._devices[0])
    predictor_export_meta = pred_exp.PredictorExportMeta(
        predict_net=train_model.net.Proto(),
        parameters=data_parallel_model.GetCheckpointParams(train_model),
        inputs=[prefix + "/data"],
        outputs=[prefix + "/softmax"],
        shapes={
            prefix + "/softmax": (1, args.num_labels),
            prefix + "/data": (args.num_channels, args.image_size, args.image_size)
        }
    )

    # save the train_model for the current epoch
    model_path = "%s/%s_%d.mdl" % (
        args.file_store_path,
        args.save_model_name,
        epoch,
    )

    # set db_type to be "minidb" instead of "log_file_db", which breaks
    # the serialization in save_to_db. Need to switch back to log_file_db
    # after migration
    pred_exp.save_to_db(
        db_type="minidb",
        db_destination=model_path,
        predictor_export_meta=predictor_export_meta,
        use_ideep=use_ideep
    )


def LoadModel(path, model, use_ideep):
    '''
    Load pretrained model from file
    '''
    log.info("Loading path: {}".format(path))
    meta_net_def = pred_exp.load_from_db(path, 'minidb')
    init_net = core.Net(pred_utils.GetNet(
        meta_net_def, predictor_constants.GLOBAL_INIT_NET_TYPE))
    predict_init_net = core.Net(pred_utils.GetNet(
        meta_net_def, predictor_constants.PREDICT_INIT_NET_TYPE))

    if use_ideep:
        predict_init_net.RunAllOnIDEEP()
    else:
        predict_init_net.RunAllOnGPU()
    if use_ideep:
        init_net.RunAllOnIDEEP()
    else:
        init_net.RunAllOnGPU()

    assert workspace.RunNetOnce(predict_init_net)
    assert workspace.RunNetOnce(init_net)

    # Hack: fix iteration counter which is in CUDA context after load model
    itercnt = workspace.FetchBlob("optimizer_iteration")
    workspace.FeedBlob(
        "optimizer_iteration",
        itercnt,
        device_option=core.DeviceOption(caffe2_pb2.CPU, 0)
    )


def RunEpoch(
    args,
    epoch,
    train_model,
    test_model,
    total_batch_size,
    num_shards,
    expname,
    explog,
):
    '''
    Run one epoch of the trainer.
    TODO: add checkpointing here.
    '''
    # TODO: add loading from checkpoint
    log.info("Starting epoch {}/{}".format(epoch, args.num_epochs))
    epoch_iters = int(args.epoch_size / total_batch_size / num_shards)
    test_epoch_iters = int(args.test_epoch_size / total_batch_size / num_shards)
    for i in range(epoch_iters):
        # This timeout is required (temporarily) since CUDA-NCCL
        # operators might deadlock when synchronizing between GPUs.
        timeout = args.first_iter_timeout if i == 0 else args.timeout
        with timeout_guard.CompleteInTimeOrDie(timeout):
            t1 = time.time()
            workspace.RunNet(train_model.net.Proto().name)
            t2 = time.time()
            dt = t2 - t1

        fmt = "Finished iteration {}/{} of epoch {} ({:.2f} images/sec)"
        log.info(fmt.format(i + 1, epoch_iters, epoch, total_batch_size / dt))
        prefix = "{}_{}".format(
            train_model._device_prefix,
            train_model._devices[0])
        accuracy = workspace.FetchBlob(prefix + '/accuracy')
        loss = workspace.FetchBlob(prefix + '/loss')
        train_fmt = "Training loss: {}, accuracy: {}"
        log.info(train_fmt.format(loss, accuracy))

    num_images = epoch * epoch_iters * total_batch_size
    prefix = "{}_{}".format(train_model._device_prefix, train_model._devices[0])
    accuracy = workspace.FetchBlob(prefix + '/accuracy')
    loss = workspace.FetchBlob(prefix + '/loss')
    learning_rate = workspace.FetchBlob(
        data_parallel_model.GetLearningRateBlobNames(train_model)[0]
    )
    test_accuracy = 0
    test_accuracy_top5 = 0
    if test_model is not None:
        # Run the test net
        ntests = 0
        for _ in range(test_epoch_iters):
            workspace.RunNet(test_model.net.Proto().name)
            for g in test_model._devices:
                test_accuracy += np.asscalar(workspace.FetchBlob(
                    "{}_{}".format(test_model._device_prefix, g) + '/accuracy'
                ))
                test_accuracy_top5 += np.asscalar(workspace.FetchBlob(
                    "{}_{}".format(test_model._device_prefix, g) + '/accuracy_top5'
                ))
                ntests += 1
        test_accuracy /= ntests
        test_accuracy_top5 /= ntests
    else:
        test_accuracy = (-1)
        test_accuracy_top5 = (-1)

    explog.log(
        input_count=num_images,
        batch_count=(i + epoch * epoch_iters),
        additional_values={
            'accuracy': accuracy,
            'loss': loss,
            'learning_rate': learning_rate,
            'epoch': epoch,
            'top1_test_accuracy': test_accuracy,
            'top5_test_accuracy': test_accuracy_top5,
        }
    )
    assert loss < 40, "Exploded gradients :("

    # TODO: add checkpointing
    return epoch + 1


def Train(args):
    if args.model == "resnext":
        model_name = "resnext" + str(args.num_layers)
    elif args.model == "shufflenet":
        model_name = "shufflenet"

    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Verify valid image mean/std per channel
    if args.image_mean_per_channel:
        assert \
            len(args.image_mean_per_channel) == args.num_channels, \
            "The number of channels of image mean doesn't match input"

    if args.image_std_per_channel:
        assert \
            len(args.image_std_per_channel) == args.num_channels, \
            "The number of channels of image std doesn't match input"

    # Round down epoch size to closest multiple of batch size across machines
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)

    assert \
        epoch_iters > 0, \
        "Epoch size must be larger than batch size times shard count"

    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create ModelHelper object
    if args.use_ideep:
        train_arg_scope = {
            'use_cudnn': False,
            'cudnn_exhaustive_search': False,
            'training_mode': 1
        }
    else:
        train_arg_scope = {
            'order': 'NCHW',
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
            'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
        }
    train_model = model_helper.ModelHelper(
        name=model_name, arg_scope=train_arg_scope
    )

    num_shards = args.num_shards
    shard_id = args.shard_id

    # Expect interfaces to be comma separated.
    # Use of multiple network interfaces is not yet complete,
    # so simply use the first one in the list.
    interfaces = args.distributed_interfaces.split(",")

    # Rendezvous using MPI when run with mpirun
    if os.getenv("OMPI_COMM_WORLD_SIZE") is not None:
        num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1))
        shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0))
        if num_shards > 1:
            rendezvous = dict(
                kv_handler=None,
                num_shards=num_shards,
                shard_id=shard_id,
                engine="GLOO",
                transport=args.distributed_transport,
                interface=interfaces[0],
                mpi_rendezvous=True,
                exit_nets=None)

    elif num_shards > 1:
        # Create rendezvous for distributed computation
        store_handler = "store_handler"
        if args.redis_host is not None:
            # Use Redis for rendezvous if Redis host is specified
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate", [], [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                )
            )
        else:
            # Use filesystem for rendezvous otherwise
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate", [], [store_handler],
                    path=args.file_store_path,
                    prefix=args.run_id,
                )
            )

        rendezvous = dict(
            kv_handler=store_handler,
            shard_id=shard_id,
            num_shards=num_shards,
            engine="GLOO",
            transport=args.distributed_transport,
            interface=interfaces[0],
            exit_nets=None)

    else:
        rendezvous = None

    # Model building functions
    def create_resnext_model_ops(model, loss_scale):
        initializer = (PseudoFP16Initializer if args.dtype == 'float16'
                       else Initializer)

        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=args.enable_tensor_core,
                            float16_compute=args.float16_compute):
            pred = resnet.create_resnext(
                model,
                "data",
                num_input_channels=args.num_channels,
                num_labels=args.num_labels,
                num_layers=args.num_layers,
                num_groups=args.resnext_num_groups,
                num_width_per_group=args.resnext_width_per_group,
                no_bias=True,
                no_loss=True,
            )

        if args.dtype == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
                                              ['softmax', 'loss'])
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy", top_k=1)
        brew.accuracy(model, [softmax, "label"], "accuracy_top5", top_k=5)
        return [loss]

    def create_shufflenet_model_ops(model, loss_scale):
        initializer = (PseudoFP16Initializer if args.dtype == 'float16'
                       else Initializer)

        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=args.enable_tensor_core,
                            float16_compute=args.float16_compute):
            pred = shufflenet.create_shufflenet(
                model,
                "data",
                num_input_channels=args.num_channels,
                num_labels=args.num_labels,
                no_loss=True,
            )

        if args.dtype == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
                                              ['softmax', 'loss'])
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy", top_k=1)
        brew.accuracy(model, [softmax, "label"], "accuracy_top5", top_k=5)
        return [loss]

    def add_optimizer(model):
        stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)

        if args.float16_compute:
            # TODO: merge with multi-precision optimizer
            opt = optimizer.build_fp16_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                weight_decay=args.weight_decay,   # weight decay included
                policy="step",
                stepsize=stepsz,
                gamma=0.1
            )
        else:
            optimizer.add_weight_decay(model, args.weight_decay)
            opt = optimizer.build_multi_precision_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                policy="step",
                stepsize=stepsz,
                gamma=0.1
            )
        return opt

    # Define add_image_input function.
    # Depends on the "train_data" argument.
    # Note that the reader will be shared between all GPUs.
    if args.train_data == "null":
        def add_image_input(model):
            AddNullInput(
                model,
                None,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
            )
    else:
        reader = train_model.CreateDB(
            "reader",
            db=args.train_data,
            db_type=args.db_type,
            num_shards=num_shards,
            shard_id=shard_id,
        )

        def add_image_input(model):
            AddImageInput(
                model,
                reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=False,
                mean_per_channel=args.image_mean_per_channel,
                std_per_channel=args.image_std_per_channel,
            )

    def add_post_sync_ops(model):
        """Add ops applied after initial parameter sync."""
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob,
                    param_info.blob_copy[core.DataType.FLOAT]
                )

    data_parallel_model.Parallelize(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_resnext_model_ops
        if args.model == "resnext" else create_shufflenet_model_ops,
        optimizer_builder_fun=add_optimizer,
        post_sync_builder_fun=add_post_sync_ops,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=False,
        cpu_device=args.use_cpu,
        ideep=args.use_ideep,
        shared_model=args.use_cpu,
        combine_spatial_bn=args.use_cpu,
    )

    data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    # Add test model, if specified
    test_model = None
    if (args.test_data is not None):
        log.info("----- Create test net ----")
        if args.use_ideep:
            test_arg_scope = {
                'use_cudnn': False,
                'cudnn_exhaustive_search': False,
            }
        else:
            test_arg_scope = {
                'order': "NCHW",
                'use_cudnn': True,
                'cudnn_exhaustive_search': True,
            }
        test_model = model_helper.ModelHelper(
            name=model_name + "_test",
            arg_scope=test_arg_scope,
            init_params=False,
        )

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=True,
                mean_per_channel=args.image_mean_per_channel,
                std_per_channel=args.image_std_per_channel,
            )

        data_parallel_model.Parallelize(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_resnext_model_ops
            if args.model == "resnext" else create_shufflenet_model_ops,
            post_sync_builder_fun=add_post_sync_ops,
            param_update_builder_fun=None,
            devices=gpus,
            cpu_device=args.use_cpu,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    epoch = 0
    # load the pre-trained model and reset epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model, args.use_ideep)

        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(train_model)

        # reset epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    expname = "%s_gpu%d_b%d_L%d_lr%.2f_v2" % (
        model_name,
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )

    explog = experiment_util.ModelTrainerLog(expname, args)

    # Run the training one epoch at a time
    while epoch < args.num_epochs:
        epoch = RunEpoch(
            args,
            epoch,
            train_model,
            test_model,
            total_batch_size,
            num_shards,
            expname,
            explog
        )

        # Save the model for each epoch
        SaveModel(args, train_model, epoch, args.use_ideep)

        model_path = "%s/%s_" % (
            args.file_store_path,
            args.save_model_name
        )
        # remove the saved model from the previous epoch if it exists
        if os.path.isfile(model_path + str(epoch - 1) + ".mdl"):
            os.remove(model_path + str(epoch - 1) + ".mdl")


def main():
    # TODO: use argv
    parser = argparse.ArgumentParser(
        description="Caffe2: ImageNet Trainer"
    )
    parser.add_argument("--train_data", type=str, default=None, required=True,
                        help="Path to training data (or 'null' to simulate)")
    parser.add_argument("--num_layers", type=int, default=50,
                        help="The number of layers in ResNe(X)t model")
    parser.add_argument("--resnext_num_groups", type=int, default=1,
                        help="The cardinality of resnext")
    parser.add_argument("--resnext_width_per_group", type=int, default=64,
                        help="The width per group of resnext")
    parser.add_argument("--test_data", type=str, default=None,
                        help="Path to test data")
    parser.add_argument("--image_mean_per_channel", type=float, nargs='+',
                        help="The per channel mean for the images")
    parser.add_argument("--image_std_per_channel", type=float, nargs='+',
                        help="The per channel standard deviation for the images")
    parser.add_argument("--test_epoch_size", type=int, default=50000,
                        help="Number of test images")
    parser.add_argument("--db_type", type=str, default="lmdb",
                        help="Database type (such as lmdb or leveldb)")
    parser.add_argument("--gpus", type=str,
                        help="Comma separated list of GPU devices to use")
    parser.add_argument("--num_gpus", type=int, default=1,
                        help="Number of GPU devices (instead of --gpus)")
    parser.add_argument("--num_channels", type=int, default=3,
                        help="Number of color channels")
    parser.add_argument("--image_size", type=int, default=224,
                        help="Input image size (to crop to)")
    parser.add_argument("--num_labels", type=int, default=1000,
                        help="Number of labels")
    parser.add_argument("--batch_size", type=int, default=32,
                        help="Batch size, total over all GPUs")
    parser.add_argument("--epoch_size", type=int, default=1500000,
                        help="Number of images/epoch, total over all machines")
    parser.add_argument("--num_epochs", type=int, default=1000,
                        help="Num epochs.")
    parser.add_argument("--base_learning_rate", type=float, default=0.1,
                        help="Initial learning rate.")
    parser.add_argument("--weight_decay", type=float, default=1e-4,
                        help="Weight decay (L2 regularization)")
    parser.add_argument("--cudnn_workspace_limit_mb", type=int, default=64,
                        help="CuDNN workspace limit in MBs")
    parser.add_argument("--num_shards", type=int, default=1,
                        help="Number of machines in distributed run")
    parser.add_argument("--shard_id", type=int, default=0,
                        help="Shard id.")
    parser.add_argument("--run_id", type=str,
                        help="Unique run identifier (e.g. uuid)")
    parser.add_argument("--redis_host", type=str,
                        help="Host of Redis server (for rendezvous)")
    parser.add_argument("--redis_port", type=int, default=6379,
                        help="Port of Redis server (for rendezvous)")
    parser.add_argument("--file_store_path", type=str, default="/tmp",
                        help="Path to directory to use for rendezvous")
    parser.add_argument("--save_model_name", type=str, default="resnext_model",
                        help="Save the trained model to a given name")
    parser.add_argument("--load_model_path", type=str, default=None,
                        help="Load previously saved model to continue training")
    parser.add_argument("--use_cpu", type=bool, default=False,
                        help="Use CPU instead of GPU")
    parser.add_argument("--use_ideep", type=bool, default=False,
                        help="Use ideep")
    parser.add_argument('--dtype', default='float',
                        choices=['float', 'float16'],
                        help='Data type used for training')
    parser.add_argument('--float16_compute', action='store_true',
                        help="Use float 16 compute, if available")
    parser.add_argument('--enable_tensor_core', action='store_true',
                        help='Enable Tensor Core math for Conv and FC ops')
    parser.add_argument("--distributed_transport", type=str, default="tcp",
                        help="Transport to use for distributed run [tcp|ibverbs]")
    parser.add_argument("--distributed_interfaces", type=str, default="",
                        help="Network interfaces to use for distributed run")

    parser.add_argument("--first_iter_timeout", type=int, default=600,
                        help="Timeout (secs) of the first iteration "
                             "(default: %(default)s)")
    parser.add_argument("--timeout", type=int, default=60,
                        help="Timeout (secs) of each (except the first) iteration "
                             "(default: %(default)s)")
    parser.add_argument("--model",
                        default="resnext", const="resnext", nargs="?",
                        choices=["shufflenet", "resnext"],
                        help="Model to train: resnext or shufflenet")
    args = parser.parse_args()

    Train(args)


if __name__ == '__main__':
    workspace.GlobalInit(['caffe2', '--caffe2_log_level=2'])
    main()
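Worth noting: with `--train_data null`, no dataset is read at all; `AddNullInput` above fills the data blob with Gaussian noise and hard-codes every label to 1. A minimal NumPy sketch of the equivalent synthetic batch follows (224 is the `--image_size` default from the file above; the per-GPU batch of 64 is just an example value, not taken from anywhere else):

```python
import numpy as np

batch_per_device = 64   # example value; the trainer derives this from --batch_size / num GPUs
image_size = 224        # default of --image_size

# Equivalent of the GaussianFill op: a synthetic NCHW image batch
data = np.random.randn(batch_per_device, 3, image_size, image_size).astype(np.float32)

# Equivalent of the ConstantFill op: every label is hard-coded to 1, dtype INT32
label = np.ones(batch_per_device, dtype=np.int32)

print(data.shape, label.shape)  # (64, 3, 224, 224) (64,)
```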
@@ -1,690 +0,0 @@
# Module caffe2.python.examples.resnet50_trainer
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import argparse
import logging
import numpy as np
import time
import os

from caffe2.python import core, workspace, experiment_util, data_parallel_model
from caffe2.python import dyndep, optimizer
from caffe2.python import timeout_guard, model_helper, brew
from caffe2.proto import caffe2_pb2

import caffe2.python.models.resnet as resnet
from caffe2.python.modeling.initializers import Initializer, PseudoFP16Initializer
import caffe2.python.predictor.predictor_exporter as pred_exp
import caffe2.python.predictor.predictor_py_utils as pred_utils
from caffe2.python.predictor_constants import predictor_constants as predictor_constants

'''
Parallelized multi-GPU distributed trainer for Resne(X)t.
Can be used to train on imagenet data, for example.
The default parameters can train a standard Resnet-50 (1x64d), and parameters
can be provided to train ResNe(X)t models (e.g., ResNeXt-101 32x4d).

To run the trainer in single-machine multi-gpu mode by setting num_shards = 1.

To run the trainer in multi-machine multi-gpu mode with M machines,
run the same program on all machines, specifying num_shards = M, and
shard_id = a unique integer in the set [0, M-1].

For rendezvous (the trainer processes have to know about each other),
you can either use a directory path that is visible to all processes
(e.g. NFS directory), or use a Redis instance. Use the former by
passing the `file_store_path` argument. Use the latter by passing the
`redis_host` and `redis_port` arguments.
'''

logging.basicConfig()
log = logging.getLogger("ResNe(X)t_trainer")
log.setLevel(logging.DEBUG)

dyndep.InitOpsLibrary('@/caffe2/caffe2/distributed:file_store_handler_ops')
dyndep.InitOpsLibrary('@/caffe2/caffe2/distributed:redis_store_handler_ops')


def AddImageInput(
    model,
    reader,
    batch_size,
    img_size,
    dtype,
    is_test,
    mean_per_channel=None,
    std_per_channel=None,
):
    '''
    The image input operator loads image and label data from the reader and
    applies transformations to the images (random cropping, mirroring, ...).
    '''
    data, label = brew.image_input(
        model,
        reader, ["data", "label"],
        batch_size=batch_size,
        output_type=dtype,
        use_gpu_transform=True if core.IsGPUDeviceType(model._device_type) else False,
        use_caffe_datum=True,
        mean_per_channel=mean_per_channel,
        std_per_channel=std_per_channel,
        # mean_per_channel takes precedence over mean
        mean=128.,
        std=128.,
        scale=256,
        crop=img_size,
        mirror=1,
        is_test=is_test,
    )

    data = model.StopGradient(data, data)


def AddNullInput(model, reader, batch_size, img_size, dtype):
    '''
    The null input function uses a gaussian fill operator to emulate real image
    input. A label blob is hardcoded to a single value. This is useful if you
    want to test compute throughput or don't have a dataset available.
    '''
    suffix = "_fp16" if dtype == "float16" else ""
    model.param_init_net.GaussianFill(
        [],
        ["data" + suffix],
        shape=[batch_size, 3, img_size, img_size],
    )
    if dtype == "float16":
        model.param_init_net.FloatToHalf("data" + suffix, "data")

    model.param_init_net.ConstantFill(
        [],
        ["label"],
        shape=[batch_size],
        value=1,
        dtype=core.DataType.INT32,
    )


def SaveModel(args, train_model, epoch, use_ideep):
    prefix = "[]_{}".format(train_model._device_prefix, train_model._devices[0])
    predictor_export_meta = pred_exp.PredictorExportMeta(
        predict_net=train_model.net.Proto(),
        parameters=data_parallel_model.GetCheckpointParams(train_model),
        inputs=[prefix + "/data"],
        outputs=[prefix + "/softmax"],
        shapes={
            prefix + "/softmax": (1, args.num_labels),
            prefix + "/data": (args.num_channels, args.image_size, args.image_size)
        }
    )

    # save the train_model for the current epoch
    model_path = "%s/%s_%d.mdl" % (
        args.file_store_path,
        args.save_model_name,
        epoch,
    )

    # set db_type to be "minidb" instead of "log_file_db", which breaks
    # the serialization in save_to_db. Need to switch back to log_file_db
    # after migration
    pred_exp.save_to_db(
        db_type="minidb",
        db_destination=model_path,
        predictor_export_meta=predictor_export_meta,
        use_ideep = use_ideep
    )


def LoadModel(path, model, use_ideep):
    '''
    Load pretrained model from file
    '''
    log.info("Loading path: {}".format(path))
    meta_net_def = pred_exp.load_from_db(path, 'minidb')
    init_net = core.Net(pred_utils.GetNet(
        meta_net_def, predictor_constants.GLOBAL_INIT_NET_TYPE))
    predict_init_net = core.Net(pred_utils.GetNet(
        meta_net_def, predictor_constants.PREDICT_INIT_NET_TYPE))

    if use_ideep:
        predict_init_net.RunAllOnIDEEP()
    else:
        predict_init_net.RunAllOnGPU()
    if use_ideep:
        init_net.RunAllOnIDEEP()
    else:
        init_net.RunAllOnGPU()

    assert workspace.RunNetOnce(predict_init_net)
    assert workspace.RunNetOnce(init_net)

    # Hack: fix iteration counter which is in CUDA context after load model
    itercnt = workspace.FetchBlob("optimizer_iteration")
    workspace.FeedBlob(
        "optimizer_iteration",
        itercnt,
        device_option=core.DeviceOption(caffe2_pb2.CPU, 0)
    )


def RunEpoch(
    args,
    epoch,
    train_model,
    test_model,
    total_batch_size,
    num_shards,
    expname,
    explog,
):
    '''
    Run one epoch of the trainer.
    TODO: add checkpointing here.
    '''
    # TODO: add loading from checkpoint
    log.info("Starting epoch {}/{}".format(epoch, args.num_epochs))
    epoch_iters = int(args.epoch_size / total_batch_size / num_shards)
    test_epoch_iters = int(args.test_epoch_size / total_batch_size / num_shards)
    for i in range(epoch_iters):
        # This timeout is required (temporarily) since CUDA-NCCL
        # operators might deadlock when synchronizing between GPUs.
        timeout = args.first_iter_timeout if i == 0 else args.timeout
        with timeout_guard.CompleteInTimeOrDie(timeout):
            t1 = time.time()
            workspace.RunNet(train_model.net.Proto().name)
            t2 = time.time()
            dt = t2 - t1

        fmt = "Finished iteration {}/{} of epoch {} ({:.2f} images/sec)"
        log.info(fmt.format(i + 1, epoch_iters, epoch, total_batch_size / dt))
        prefix = "{}_{}".format(
            train_model._device_prefix,
            train_model._devices[0])
        accuracy = workspace.FetchBlob(prefix + '/accuracy')
        loss = workspace.FetchBlob(prefix + '/loss')
        train_fmt = "Training loss: {}, accuracy: {}"
        log.info(train_fmt.format(loss, accuracy))

    num_images = epoch * epoch_iters * total_batch_size
    prefix = "{}_{}".format(train_model._device_prefix, train_model._devices[0])
    accuracy = workspace.FetchBlob(prefix + '/accuracy')
    loss = workspace.FetchBlob(prefix + '/loss')
    learning_rate = workspace.FetchBlob(
        data_parallel_model.GetLearningRateBlobNames(train_model)[0]
    )
    test_accuracy = 0
    test_accuracy_top5 = 0
    if test_model is not None:
        # Run 100 iters of testing
        ntests = 0
        for _ in range(test_epoch_iters):
            workspace.RunNet(test_model.net.Proto().name)
            for g in test_model._devices:
                test_accuracy += np.asscalar(workspace.FetchBlob(
                    "{}_{}".format(test_model._device_prefix, g) + '/accuracy'
                ))
                test_accuracy_top5 += np.asscalar(workspace.FetchBlob(
                    "{}_{}".format(test_model._device_prefix, g) + '/accuracy_top5'
                ))
                ntests += 1
        test_accuracy /= ntests
        test_accuracy_top5 /= ntests
    else:
        test_accuracy = (-1)
        test_accuracy_top5 = (-1)

    explog.log(
        input_count=num_images,
        batch_count=(i + epoch * epoch_iters),
        additional_values={
            'accuracy': accuracy,
            'loss': loss,
            'learning_rate': learning_rate,
            'epoch': epoch,
            'top1_test_accuracy': test_accuracy,
            'top5_test_accuracy': test_accuracy_top5,
        }
    )
    assert loss < 40, "Exploded gradients :("

    # TODO: add checkpointing
    return epoch + 1


def Train(args):
    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Verify valid image mean/std per channel
    if args.image_mean_per_channel:
        assert \
            len(args.image_mean_per_channel) == args.num_channels, \
            "The number of channels of image mean doesn't match input"

    if args.image_std_per_channel:
        assert \
            len(args.image_std_per_channel) == args.num_channels, \
            "The number of channels of image std doesn't match input"

    # Round down epoch size to closest multiple of batch size across machines
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)

    assert \
        epoch_iters > 0, \
        "Epoch size must be larger than batch size times shard count"

    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create ModelHelper object
    if args.use_ideep:
        train_arg_scope = {
            'use_cudnn': False,
            'cudnn_exhaustive_search': False,
            'training_mode': 1
        }
    else:
        train_arg_scope = {
            'order': 'NCHW',
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
            'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
        }
    train_model = model_helper.ModelHelper(
        name='resnext' + str(args.num_layers), arg_scope=train_arg_scope
    )

    num_shards = args.num_shards
    shard_id = args.shard_id

    # Expect interfaces to be comma separated.
    # Use of multiple network interfaces is not yet complete,
    # so simply use the first one in the list.
    interfaces = args.distributed_interfaces.split(",")

    # Rendezvous using MPI when run with mpirun
    if os.getenv("OMPI_COMM_WORLD_SIZE") is not None:
        num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1))
        shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0))
        if num_shards > 1:
            rendezvous = dict(
                kv_handler=None,
                num_shards=num_shards,
                shard_id=shard_id,
                engine="GLOO",
                transport=args.distributed_transport,
                interface=interfaces[0],
                mpi_rendezvous=True,
                exit_nets=None)

    elif num_shards > 1:
        # Create rendezvous for distributed computation
        store_handler = "store_handler"
        if args.redis_host is not None:
            # Use Redis for rendezvous if Redis host is specified
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate", [], [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                )
            )
        else:
            # Use filesystem for rendezvous otherwise
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate", [], [store_handler],
                    path=args.file_store_path,
                    prefix=args.run_id,
                )
            )

        rendezvous = dict(
            kv_handler=store_handler,
            shard_id=shard_id,
            num_shards=num_shards,
            engine="GLOO",
            transport=args.distributed_transport,
            interface=interfaces[0],
            exit_nets=None)

    else:
        rendezvous = None

    # Model building functions
    def create_resnext_model_ops(model, loss_scale):
        initializer = (PseudoFP16Initializer if args.dtype == 'float16'
                       else Initializer)

        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=args.enable_tensor_core,
                            float16_compute=args.float16_compute):
            pred = resnet.create_resnext(
                model,
                "data",
                num_input_channels=args.num_channels,
                num_labels=args.num_labels,
                num_layers=args.num_layers,
                num_groups=args.resnext_num_groups,
                num_width_per_group=args.resnext_width_per_group,
                no_bias=True,
                no_loss=True,
            )

        if args.dtype == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
                                              ['softmax', 'loss'])
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy", top_k=1)
        brew.accuracy(model, [softmax, "label"], "accuracy_top5", top_k=5)
        return [loss]

    def add_optimizer(model):
        stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)

        if args.float16_compute:
            # TODO: merge with multi-prceision optimizer
            opt = optimizer.build_fp16_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                weight_decay=args.weight_decay,   # weight decay included
                policy="step",
                stepsize=stepsz,
                gamma=0.1
            )
        else:
            optimizer.add_weight_decay(model, args.weight_decay)
            opt = optimizer.build_multi_precision_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                policy="step",
                stepsize=stepsz,
                gamma=0.1
            )
        return opt

    # Define add_image_input function.
    # Depends on the "train_data" argument.
    # Note that the reader will be shared with between all GPUS.
    if args.train_data == "null":
        def add_image_input(model):
            AddNullInput(
                model,
                None,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
            )
    else:
        reader = train_model.CreateDB(
            "reader",
            db=args.train_data,
            db_type=args.db_type,
            num_shards=num_shards,
            shard_id=shard_id,
        )

        def add_image_input(model):
            AddImageInput(
                model,
                reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=False,
                mean_per_channel=args.image_mean_per_channel,
                std_per_channel=args.image_std_per_channel,
            )

    def add_post_sync_ops(model):
        """Add ops applied after initial parameter sync."""
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob,
                    param_info.blob_copy[core.DataType.FLOAT]
                )

    # Create parallelized model
    data_parallel_model.Parallelize(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_resnext_model_ops,
        optimizer_builder_fun=add_optimizer,
        post_sync_builder_fun=add_post_sync_ops,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=False,
        cpu_device=args.use_cpu,
        ideep=args.use_ideep,
        shared_model=args.use_cpu,
        combine_spatial_bn=args.use_cpu,
    )

    data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    # Add test model, if specified
    test_model = None
    if (args.test_data is not None):
        log.info("----- Create test net ----")
        if args.use_ideep:
            test_arg_scope = {
                'use_cudnn': False,
                'cudnn_exhaustive_search': False,
            }
        else:
            test_arg_scope = {
                'order': "NCHW",
                'use_cudnn': True,
                'cudnn_exhaustive_search': True,
            }
        test_model = model_helper.ModelHelper(
            name='resnext' + str(args.num_layers) + "_test",
            arg_scope=test_arg_scope,
            init_params=False,
        )

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=True,
                mean_per_channel=args.image_mean_per_channel,
                std_per_channel=args.image_std_per_channel,
            )

        data_parallel_model.Parallelize(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_resnext_model_ops,
            post_sync_builder_fun=add_post_sync_ops,
            param_update_builder_fun=None,
            devices=gpus,
            cpu_device=args.use_cpu,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    epoch = 0
    # load the pre-trained model and reset epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model, args.use_ideep)

        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(train_model)

        # reset epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    expname = "resnext_%d_gpu%d_b%d_L%d_lr%.2f_v2" % (
        args.num_layers,
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )

    explog = experiment_util.ModelTrainerLog(expname, args)

    # Run the training one epoch a time
    while epoch < args.num_epochs:
        epoch = RunEpoch(
            args,
            epoch,
            train_model,
            test_model,
            total_batch_size,
            num_shards,
            expname,
            explog
        )

        # Save the model for each epoch
        SaveModel(args, train_model, epoch, args.use_ideep)

        model_path = "%s/%s_" % (
            args.file_store_path,
            args.save_model_name
        )
        # remove the saved model from the previous epoch if it exists
        if os.path.isfile(model_path + str(epoch - 1) + ".mdl"):
            os.remove(model_path + str(epoch - 1) + ".mdl")


def main():
    # TODO: use argv
    parser = argparse.ArgumentParser(
        description="Caffe2: ResNe(X)t training"
    )
    parser.add_argument("--train_data", type=str, default=None, required=True,
                        help="Path to training data (or 'null' to simulate)")
    parser.add_argument("--num_layers", type=int, default=50,
                        help="The number of layers in ResNe(X)t model")
    parser.add_argument("--resnext_num_groups", type=int, default=1,
                        help="The cardinality of resnext")
    parser.add_argument("--resnext_width_per_group", type=int, default=64,
                        help="The cardinality of resnext")
    parser.add_argument("--test_data", type=str, default=None,
                        help="Path to test data")
    parser.add_argument("--image_mean_per_channel", type=float, nargs='+',
                        help="The per channel mean for the images")
    parser.add_argument("--image_std_per_channel", type=float, nargs='+',
                        help="The per channel standard deviation for the images")
    parser.add_argument("--test_epoch_size", type=int, default=50000,
                        help="Number of test images")
    parser.add_argument("--db_type", type=str, default="lmdb",
                        help="Database type (such as lmdb or leveldb)")
    parser.add_argument("--gpus", type=str,
                        help="Comma separated list of GPU devices to use")
    parser.add_argument("--num_gpus", type=int, default=1,
                        help="Number of GPU devices (instead of --gpus)")
    parser.add_argument("--num_channels", type=int, default=3,
                        help="Number of color channels")
    parser.add_argument("--image_size", type=int, default=224,
                        help="Input image size (to crop to)")
    parser.add_argument("--num_labels", type=int, default=1000,
                        help="Number of labels")
    parser.add_argument("--batch_size", type=int, default=32,
                        help="Batch size, total over all GPUs")
    parser.add_argument("--epoch_size", type=int, default=1500000,
                        help="Number of images/epoch, total over all machines")
    parser.add_argument("--num_epochs", type=int, default=1000,
                        help="Num epochs.")
    parser.add_argument("--base_learning_rate", type=float, default=0.1,
                        help="Initial learning rate.")
    parser.add_argument("--weight_decay", type=float, default=1e-4,
                        help="Weight decay (L2 regularization)")
    parser.add_argument("--cudnn_workspace_limit_mb", type=int, default=64,
                        help="CuDNN workspace limit in MBs")
    parser.add_argument("--num_shards", type=int, default=1,
                        help="Number of machines in distributed run")
    parser.add_argument("--shard_id", type=int, default=0,
                        help="Shard id.")
    parser.add_argument("--run_id", type=str,
                        help="Unique run identifier (e.g. uuid)")
    parser.add_argument("--redis_host", type=str,
                        help="Host of Redis server (for rendezvous)")
    parser.add_argument("--redis_port", type=int, default=6379,
                        help="Port of Redis server (for rendezvous)")
    parser.add_argument("--file_store_path", type=str, default="/tmp",
                        help="Path to directory to use for rendezvous")
    parser.add_argument("--save_model_name", type=str, default="resnext_model",
                        help="Save the trained model to a given name")
    parser.add_argument("--load_model_path", type=str, default=None,
                        help="Load previously saved model to continue training")
    parser.add_argument("--use_cpu", type=bool, default=False,
                        help="Use CPU instead of GPU")
    parser.add_argument("--use_ideep", type=bool, default=False,
                        help="Use ideep")
    parser.add_argument('--dtype', default='float',
                        choices=['float', 'float16'],
                        help='Data type used for training')
    parser.add_argument('--float16_compute', action='store_true',
                        help="Use float 16 compute, if available")
    parser.add_argument('--enable_tensor_core', action='store_true',
                        help='Enable Tensor Core math for Conv and FC ops')
    parser.add_argument("--distributed_transport", type=str, default="tcp",
                        help="Transport to use for distributed run [tcp|ibverbs]")
    parser.add_argument("--distributed_interfaces", type=str, default="",
                        help="Network interfaces to use for distributed run")

    parser.add_argument("--first_iter_timeout", type=int, default=600,
                        help="Timeout (secs) of the first iteration "
                             "(default: %(default)s)")
    parser.add_argument("--timeout", type=int, default=60,
                        help="Timeout (secs) of each (except the first) iteration "
                             "(default: %(default)s)")

    args = parser.parse_args()

    Train(args)


if __name__ == '__main__':
    workspace.GlobalInit(['caffe2', '--caffe2_log_level=2'])
    main()
caffe2/python/examples/resnet50_trainer.py (1 line, symbolic link)
@@ -0,0 +1 @@
imagenet_trainer.py
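The symlink keeps the old entry point working: `resnet50_trainer.py` now simply resolves to `imagenet_trainer.py`. A small sketch for checking that from a local checkout (the relative path assumes the working directory is the repository root; that location is an assumption, not something stated in the diff):

```python
import os

# Path inside a pytorch checkout; adjust to wherever your clone lives.
link = "caffe2/python/examples/resnet50_trainer.py"

print(os.path.islink(link))    # True: the file is now a symbolic link
print(os.readlink(link))       # 'imagenet_trainer.py', the target recorded in this diff
print(os.path.realpath(link))  # absolute path of caffe2/python/examples/imagenet_trainer.py
```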
caffe2/python/models/imagenet_trainer_test_utils.py (200 lines, new file)
@ -0,0 +1,200 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import numpy as np
|
||||
import time
|
||||
|
||||
from caffe2.python import workspace, cnn, memonger, core
|
||||
|
||||
def has_blob(proto, needle):
|
||||
for op in proto.op:
|
||||
for inp in op.input:
|
||||
if inp == needle:
|
||||
return True
|
||||
for outp in op.output:
|
||||
if outp == needle:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def count_blobs(proto):
|
||||
blobs = set()
|
||||
for op in proto.op:
|
||||
blobs = blobs.union(set(op.input)).union(set(op.output))
|
||||
return len(blobs)
|
||||
|
||||
|
||||
def count_shared_blobs(proto):
|
||||
blobs = set()
|
||||
for op in proto.op:
|
||||
blobs = blobs.union(set(op.input)).union(set(op.output))
|
||||
return len([b for b in blobs if "_shared" in b])
|
||||
|
||||
|
||||
def test_shared_grads(
|
||||
with_shapes,
|
||||
create_model,
|
||||
conv_blob,
|
||||
last_out_blob,
|
||||
data_blob='gpu_0/data',
|
||||
label_blob='gpu_0/label',
|
||||
num_labels=1000,
|
||||
):
|
||||
model = cnn.CNNModelHelper(
|
||||
order="NCHW",
|
||||
name="test",
|
||||
cudnn_exhaustive_search=True,
|
||||
)
|
||||
with core.NameScope("gpu_0"):
|
||||
data = model.net.AddExternalInput(data_blob)
|
||||
label = model.net.AddExternalInput(label_blob)
|
||||
(_softmax, loss) = create_model(
|
||||
model,
|
||||
data,
|
||||
num_input_channels=3,
|
||||
num_labels=num_labels,
|
||||
label=label,
|
||||
is_test=False,
|
||||
)
|
||||
|
||||
param_to_grad = model.AddGradientOperators([loss])
|
||||
|
||||
(shapes, types) = workspace.InferShapesAndTypes(
|
||||
[model.param_init_net, model.net],
|
||||
{data_blob: [4, 3, 227, 227],
|
||||
label_blob: [4]},
|
||||
)
|
||||
|
||||
count_before = count_blobs(model.net.Proto())
|
||||
optim_proto = memonger.share_grad_blobs(
|
||||
model.net,
|
||||
["gpu_0/loss"],
|
||||
set(model.param_to_grad.values()),
|
||||
"gpu_0/",
|
||||
share_activations=True,
|
||||
dont_share_blobs=set([str(param_to_grad[conv_blob])]),
|
||||
blob_shapes=shapes if with_shapes else None,
|
||||
)
|
||||
count_after = count_blobs(optim_proto)
|
||||
|
||||
# Run model and compare results. We check that the loss is same
|
||||
# and also that the final gradient (conv1_w_grad is same)
|
||||
workspace.RunNetOnce(model.param_init_net)
|
||||
data = np.random.rand(4, 3, 227, 227).astype(np.float32)
|
||||
label = (np.random.rand(4) * num_labels).astype(np.int32)
|
||||
|
||||
workspace.FeedBlob(data_blob, data)
|
||||
workspace.FeedBlob(label_blob, label)
|
||||
|
||||
workspace.RunNetOnce(model.net)
|
||||
model.net.Proto().type = 'dag'
|
||||
model.net.Proto().num_workers = 4
|
||||
loss1 = workspace.FetchBlob(last_out_blob)
|
||||
conv1_w_grad = workspace.FetchBlob(param_to_grad[conv_blob])
|
||||
workspace.FeedBlob(param_to_grad[conv_blob], np.array([0.0]))
|
||||
|
||||
workspace.RunNetOnce(optim_proto)
|
||||
optimized_loss1 = workspace.FetchBlob(last_out_blob)
|
||||
optim_conv1_w_grad = workspace.FetchBlob(param_to_grad[conv_blob])
|
||||
|
||||
return [(count_after, count_before),
|
||||
(loss1, optimized_loss1),
|
||||
(conv1_w_grad, optim_conv1_w_grad)]
|
||||
|
||||
|
||||
def test_forward_only(
|
||||
create_model,
|
||||
last_out_blob,
|
||||
data_blob='gpu_0/data',
|
||||
num_labels=1000,
|
||||
):
|
||||
model = cnn.CNNModelHelper(
|
||||
order="NCHW",
|
||||
name="test",
|
||||
cudnn_exhaustive_search=True,
|
||||
)
|
||||
with core.NameScope("gpu_0"):
|
||||
data = model.net.AddExternalInput(data_blob)
|
||||
create_model(
|
||||
model,
|
||||
data,
|
||||
num_input_channels=3,
|
||||
num_labels=num_labels,
|
||||
is_test=True
|
||||
)
|
||||
|
||||
count_before = count_blobs(model.net.Proto())
|
||||
optim_proto = memonger.optimize_inference_for_dag(
|
||||
model.net, [data_blob], "gpu_0/"
|
||||
)
|
||||
count_after = count_blobs(optim_proto)
|
||||
num_shared_blobs = count_shared_blobs(optim_proto)
|
||||
|
||||
# Run model and compare results
|
||||
workspace.RunNetOnce(model.param_init_net)
|
||||
data = np.random.rand(4, 3, 227, 227).astype(np.float32)
|
||||
|
||||
workspace.FeedBlob(data_blob, data)
|
||||
workspace.RunNetOnce(model.net)
|
||||
model.net.Proto().type = 'dag'
|
||||
model.net.Proto().num_workers = 4
|
||||
loss1 = workspace.FetchBlob(last_out_blob)
|
||||
|
||||
workspace.RunNetOnce(optim_proto)
|
||||
optimized_loss1 = workspace.FetchBlob(last_out_blob)
|
||||
return [(count_after, count_before),
|
||||
(num_shared_blobs),
|
||||
(loss1, optimized_loss1)]
|
||||
|
||||
|
||||
def test_forward_only_fast_simplenet(
    create_model,
    last_out_blob,
    data_blob="gpu_0/data",
    num_labels=1000,
):
    model = cnn.CNNModelHelper(
        order="NCHW",
        name="test",
        cudnn_exhaustive_search=True,
    )
    with core.NameScope("gpu_0"):
        data = model.net.AddExternalInput(data_blob)
        create_model(
            model,
            data,
            num_input_channels=3,
            num_labels=num_labels,
            is_test=True
        )

    count_before = count_blobs(model.net.Proto())
    t = time.time()
    optim_proto = memonger.optimize_inference_fast(
        model.net.Proto(),
        set([data_blob, last_out_blob]).union(
            set(model.net.Proto().external_input))
    )
    print("Optimization took {} secs".format(time.time() - t))
    count_after = count_blobs(optim_proto)
    num_shared_blobs = count_shared_blobs(optim_proto)

    print(count_after, count_before, num_shared_blobs)

    # Run model and compare results
    workspace.RunNetOnce(model.param_init_net)
    data = np.random.rand(4, 3, 227, 227).astype(np.float32)

    workspace.FeedBlob(data_blob, data)
    model.net.Proto().type = 'simple'

    workspace.RunNetOnce(model.net)
    loss1 = workspace.FetchBlob(last_out_blob)

    workspace.RunNetOnce(optim_proto)
    optimized_loss1 = workspace.FetchBlob(last_out_blob)
    return [(count_after, count_before),
            (num_shared_blobs),
            (loss1, optimized_loss1)]

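Each helper above follows the same pattern: build a model with cnn.CNNModelHelper, count the blobs in the net, run one of the memonger passes, and verify that the optimized protobuf still produces identical outputs. The following sketch shows that pattern on a toy net; it is not part of this commit, and the net and blob names ("toy", "x", "h1", "h2", "y") are purely illustrative.

import numpy as np

from caffe2.python import core, memonger, workspace

# Toy net: three chained Relus, so the intermediates "h1" and "h2" are
# activations the memonger is allowed to recycle.
net = core.Net("toy")
net.AddExternalInput("x")
net.Relu("x", "h1")
net.Relu("h1", "h2")
net.Relu("h2", "y")

workspace.FeedBlob("x", np.random.rand(4, 8).astype(np.float32))
workspace.RunNetOnce(net)
reference = workspace.FetchBlob("y")

# Keep the external input and the final output out of the sharing pool,
# as the helpers above do for data_blob and last_out_blob.
optim_proto = memonger.optimize_inference_fast(
    net.Proto(), set(["x", "y"])
)
workspace.RunNetOnce(optim_proto)
np.testing.assert_almost_equal(reference, workspace.FetchBlob("y"))

The (loss1, optimized_loss1) pairs returned by the helpers feed the same equality check in the tests that follow.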
@@ -4,194 +4,49 @@ from __future__ import print_function
 from __future__ import unicode_literals

 import numpy as np
-import time

 from caffe2.python import workspace, cnn, memonger, core
 import caffe2.python.models.resnet as resnet
 import hypothesis.strategies as st
 from hypothesis import given, settings
 import caffe2.python.hypothesis_test_util as hu
-
-
-def has_blob(proto, needle):
-    for op in proto.op:
-        for inp in op.input:
-            if inp == needle:
-                return True
-        for outp in op.output:
-            if outp == needle:
-                return True
-    return False
-
-
-def count_blobs(proto):
-    blobs = set()
-    for op in proto.op:
-        blobs = blobs.union(set(op.input)).union(set(op.output))
-    return len(blobs)
-
-
-def count_shared_blobs(proto):
-    blobs = set()
-    for op in proto.op:
-        blobs = blobs.union(set(op.input)).union(set(op.output))
-    return len([b for b in blobs if "_shared" in b])
+import caffe2.python.models.imagenet_trainer_test_utils as utils


 class ResnetMemongerTest(hu.HypothesisTestCase):

     @given(with_shapes=st.booleans(), **hu.gcs_cpu_only)
     @settings(max_examples=2, timeout=120)
     def test_resnet_shared_grads(self, with_shapes, gc, dc):
-        model = cnn.CNNModelHelper(
-            order="NCHW",
-            name="test",
-            cudnn_exhaustive_search=True,
+        results = utils.test_shared_grads(
+            with_shapes,
+            resnet.create_resnet50,
+            'gpu_0/conv1_w',
+            'gpu_0/last_out_L1000'
         )
-        with core.NameScope("gpu_0"):
-            data = model.net.AddExternalInput("gpu_0/data")
-            label = model.net.AddExternalInput("gpu_0/label")
-            (_softmax, loss) = resnet.create_resnet50(
-                model,
-                data,
-                num_input_channels=3,
-                num_labels=1000,
-                label=label,
-                is_test=False,
-            )
-
-        param_to_grad = model.AddGradientOperators([loss])
-
-        (shapes, types) = workspace.InferShapesAndTypes(
-            [model.param_init_net, model.net],
-            {'gpu_0/data': [4, 3, 227, 227],
-             'gpu_0/label': [4]},
-        )
-
-        count_before = count_blobs(model.net.Proto())
-        optim_proto = memonger.share_grad_blobs(
-            model.net,
-            ["gpu_0/loss"],
-            set(model.param_to_grad.values()),
-            "gpu_0/",
-            share_activations=True,
-            dont_share_blobs=set([str(param_to_grad["gpu_0/conv1_w"])]),
-            blob_shapes=shapes if with_shapes else None,
-        )
-        count_after = count_blobs(optim_proto)
-        self.assertTrue(count_after < count_before)
-
-        # Run model and compare results. We check that the loss is same
-        # and also that the final gradient (conv1_w_grad is same)
-        workspace.RunNetOnce(model.param_init_net)
-        data = np.random.rand(4, 3, 227, 227).astype(np.float32)
-        label = (np.random.rand(4) * 1000).astype(np.int32)
-
-        workspace.FeedBlob("gpu_0/data", data)
-        workspace.FeedBlob("gpu_0/label", label)
-
-        workspace.RunNetOnce(model.net)
-        model.net.Proto().type = 'dag'
-        model.net.Proto().num_workers = 4
-        loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")
-        conv1_w_grad = workspace.FetchBlob(param_to_grad["gpu_0/conv1_w"])
-        workspace.FeedBlob(param_to_grad["gpu_0/conv1_w"], np.array([0.0]))
-
-        workspace.RunNetOnce(optim_proto)
-        optimized_loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")
-        optim_conv1_w_grad = workspace.FetchBlob(param_to_grad["gpu_0/conv1_w"])
-
-        print("before: {} after: {}".format(count_before, count_after))
-
-        np.testing.assert_almost_equal(loss1, optimized_loss1)
-        np.testing.assert_almost_equal(conv1_w_grad, optim_conv1_w_grad)
+        self.assertTrue(results[0][0] < results[0][1])
+        np.testing.assert_almost_equal(results[1][0], results[1][1])
+        np.testing.assert_almost_equal(results[2][0], results[2][1])

     def test_resnet_forward_only(self):
-        model = cnn.CNNModelHelper(
-            order="NCHW",
-            name="test",
-            cudnn_exhaustive_search=True,
+        results = utils.test_forward_only(
+            resnet.create_resnet50,
+            'gpu_0/last_out_L1000'
         )
-        with core.NameScope("gpu_0"):
-            data = model.net.AddExternalInput("gpu_0/data")
-            resnet.create_resnet50(
-                model,
-                data,
-                num_input_channels=3,
-                num_labels=1000,
-                is_test=True
-            )
-
-        count_before = count_blobs(model.net.Proto())
-        optim_proto = memonger.optimize_inference_for_dag(
-            model.net, ["gpu_0/data"], "gpu_0/"
-        )
-        count_after = count_blobs(optim_proto)
-        num_shared_blobs = count_shared_blobs(optim_proto)
-
-        # Run model and compare results
-        workspace.RunNetOnce(model.param_init_net)
-        data = np.random.rand(4, 3, 227, 227).astype(np.float32)
-
-        workspace.FeedBlob("gpu_0/data", data)
-        workspace.RunNetOnce(model.net)
-        model.net.Proto().type = 'dag'
-        model.net.Proto().num_workers = 4
-        loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")
-
-        workspace.RunNetOnce(optim_proto)
-        optimized_loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")
-        self.assertTrue(count_after < count_before)
-        self.assertTrue(num_shared_blobs < 7 and num_shared_blobs > 0)
-        np.testing.assert_almost_equal(loss1, optimized_loss1)
+        self.assertTrue(results[0][0] < results[0][1])
+        self.assertTrue(results[1] < 7 and results[1] > 0)
+        np.testing.assert_almost_equal(results[2][0], results[2][1])

     def test_resnet_forward_only_fast_simplenet(self):
         '''
         Test C++ memonger that is only for simple nets
         '''
-        model = cnn.CNNModelHelper(
-            order="NCHW",
-            name="test",
-            cudnn_exhaustive_search=True,
+        results = utils.test_forward_only_fast_simplenet(
+            resnet.create_resnet50,
+            'gpu_0/last_out_L1000'
         )
-        with core.NameScope("gpu_0"):
-            data = model.net.AddExternalInput("gpu_0/data")
-            resnet.create_resnet50(
-                model,
-                data,
-                num_input_channels=3,
-                num_labels=1000,
-                is_test=True
-            )
-
-        count_before = count_blobs(model.net.Proto())
-        t = time.time()
-        optim_proto = memonger.optimize_inference_fast(
-            model.net.Proto(),
-            set(["gpu_0/data", "gpu_0/last_out_L1000"]).union(
-                set(model.net.Proto().external_input))
-        )
-        print("Optimization took {} secs".format(time.time() - t))
-        count_after = count_blobs(optim_proto)
-        num_shared_blobs = count_shared_blobs(optim_proto)
-
-        self.assertTrue(count_after < count_before)
-        print(count_after, count_before, num_shared_blobs)
-        self.assertTrue(num_shared_blobs < 7 and num_shared_blobs > 0)
-
-        # Run model and compare results
-        workspace.RunNetOnce(model.param_init_net)
-        data = np.random.rand(4, 3, 227, 227).astype(np.float32)
-
-        workspace.FeedBlob("gpu_0/data", data)
-        model.net.Proto().type = 'simple'
-
-        workspace.RunNetOnce(model.net)
-        loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")
-
-        workspace.RunNetOnce(optim_proto)
-        optimized_loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")
-        np.testing.assert_almost_equal(loss1, optimized_loss1)
+        self.assertTrue(results[0][0] < results[0][1])
+        self.assertTrue(results[1] < 4 and results[1] > 0)
+        np.testing.assert_almost_equal(results[2][0], results[2][1])


 if __name__ == "__main__":

198  caffe2/python/models/shufflenet.py  Normal file
@@ -0,0 +1,198 @@
# Module caffe2.python.models.shufflenet

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from caffe2.python import brew

"""
Utility for creating ShuffleNet
"ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design" by Ma et al., 2018
"""

OUTPUT_CHANNELS = {
    '0.5x': [24, 48, 96, 192, 1024],
    '1.0x': [24, 116, 232, 464, 1024],
    '1.5x': [24, 176, 352, 704, 1024],
    '2.0x': [24, 244, 488, 976, 2048],
}


class ShuffleNetV2Builder():
    def __init__(
        self,
        model,
        data,
        num_input_channels,
        num_labels,
        num_groups=2,
        width='1.0x',
        is_test=False,
        detection=False,
        bn_epsilon=1e-5,
    ):
        self.model = model
        self.prev_blob = data
        self.num_input_channels = num_input_channels
        self.num_labels = num_labels
        self.num_groups = num_groups
        self.output_channels = OUTPUT_CHANNELS[width]
        self.stage_repeats = [3, 7, 3]
        self.is_test = is_test
        self.detection = detection
        self.bn_epsilon = bn_epsilon

    def create(self):
        in_channels = self.output_channels[0]

        self.prev_blob = brew.conv(self.model, self.prev_blob, 'stage1_conv',
                                   self.num_input_channels, in_channels,
                                   weight_init=("MSRAFill", {}),
                                   kernel=3, stride=2)
        self.prev_blob = brew.max_pool(self.model, self.prev_blob,
                                       'stage1_pool', kernel=3, stride=2)

        # adds stage#{2,3,4}; see table 5 of the ShufflenetV2 paper.
        for idx, (out_channels, n_repeats) in enumerate(zip(
            self.output_channels[1:4], self.stage_repeats
        )):
            prefix = 'stage{}_stride{}'.format(idx + 2, 2)
            self.add_spatial_ds_unit(prefix, in_channels, out_channels)
            in_channels = out_channels
            for i in range(n_repeats):
                prefix = 'stage{}_stride{}_repeat{}'.format(
                    idx + 2, 1, i + 1
                )
                self.add_basic_unit(prefix, in_channels)

        self.last_conv = brew.conv(self.model, self.prev_blob, 'conv5',
                                   in_channels, self.output_channels[4],
                                   kernel=1)
        self.avg_pool = self.model.AveragePool(self.last_conv, 'avg_pool',
                                               kernel=7)
        self.last_out = brew.fc(self.model,
                                self.avg_pool,
                                'last_out_L{}'.format(self.num_labels),
                                self.output_channels[4],
                                self.num_labels)

    # spatial down sampling unit with stride=2
    def add_spatial_ds_unit(self, prefix, in_channels, out_channels, stride=2):
        right = left = self.prev_blob
        out_channels = out_channels // 2

        # Enlarge the receptive field for detection task
        if self.detection:
            left = self.add_detection_unit(left, prefix + '_left_detection',
                                           in_channels, in_channels)

        left = self.add_dwconv3x3_bn(left, prefix + 'left_dwconv',
                                     in_channels, stride)
        left = self.add_conv1x1_bn(left, prefix + '_left_conv1', in_channels,
                                   out_channels)

        if self.detection:
            right = self.add_detection_unit(right, prefix + '_right_detection',
                                            in_channels, in_channels)

        right = self.add_conv1x1_bn(right, prefix + '_right_conv1',
                                    in_channels, out_channels)
        right = self.add_dwconv3x3_bn(right, prefix + '_right_dwconv',
                                      out_channels, stride)
        right = self.add_conv1x1_bn(right, prefix + '_right_conv2',
                                    out_channels, out_channels)

        self.prev_blob = brew.concat(self.model, [right, left],
                                     prefix + '_concat')
        self.prev_blob = self.model.net.ChannelShuffle(
            self.prev_blob, prefix + '_ch_shuffle',
            group=self.num_groups, kernel=1
        )

    # basic unit with stride=1
    def add_basic_unit(self, prefix, in_channels, stride=1):
        in_channels = in_channels // 2
        left = prefix + '_left'
        right = prefix + '_right'
        self.model.net.Split(self.prev_blob, [left, right])

        if self.detection:
            right = self.add_detection_unit(right, prefix + '_right_detection',
                                            in_channels, in_channels)

        right = self.add_conv1x1_bn(right, prefix + '_right_conv1',
                                    in_channels, in_channels)
        right = self.add_dwconv3x3_bn(right, prefix + '_right_dwconv',
                                      in_channels, stride)
        right = self.add_conv1x1_bn(right, prefix + '_right_conv2',
                                    in_channels, in_channels)

        self.prev_blob = brew.concat(self.model, [right, left],
                                     prefix + '_concat')

        self.prev_blob = self.model.net.ChannelShuffle(
            self.prev_blob, prefix + '_ch_shuffle',
            group=self.num_groups, kernel=1
        )

    # helper functions to create net's units
    def add_detection_unit(self, prev_blob, prefix, in_channels, out_channels,
                           kernel=3, pad=1):
        out_blob = brew.conv(self.model, prev_blob, prefix + '_conv',
                             in_channels, out_channels, kernel=kernel,
                             weight_init=("MSRAFill", {}),
                             group=in_channels, pad=pad)
        out_blob = brew.spatial_bn(self.model, out_blob, prefix + '_bn',
                                   out_channels, epsilon=self.bn_epsilon,
                                   is_test=self.is_test)
        return out_blob

    def add_conv1x1_bn(self, prev_blob, blob, in_channels, out_channels):
        prev_blob = brew.conv(self.model, prev_blob, blob, in_channels,
                              out_channels, kernel=1,
                              weight_init=("MSRAFill", {}))
        prev_blob = brew.spatial_bn(self.model, prev_blob, prev_blob + '_bn',
                                    out_channels,
                                    epsilon=self.bn_epsilon,
                                    is_test=self.is_test)
        prev_blob = brew.relu(self.model, prev_blob, prev_blob)
        return prev_blob

    def add_dwconv3x3_bn(self, prev_blob, blob, channels, stride):
        prev_blob = brew.conv(self.model, prev_blob, blob, channels,
                              channels, kernel=3,
                              weight_init=("MSRAFill", {}),
                              stride=stride, group=channels, pad=1)
        prev_blob = brew.spatial_bn(self.model, prev_blob,
                                    prev_blob + '_bn',
                                    channels,
                                    epsilon=self.bn_epsilon,
                                    is_test=self.is_test)
        return prev_blob


def create_shufflenet(
    model,
    data,
    num_input_channels,
    num_labels,
    label=None,
    is_test=False,
    no_loss=False,
):
    builder = ShuffleNetV2Builder(model, data, num_input_channels,
                                  num_labels,
                                  is_test=is_test)
    builder.create()

    if no_loss:
        return builder.last_out

    if (label is not None):
        (softmax, loss) = model.SoftmaxWithLoss(
            [builder.last_out, label],
            ["softmax", "loss"],
        )
        return (softmax, loss)

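As a point of reference, create_shufflenet can be exercised on a single device the same way the memonger tests below drive it. This sketch is not part of the commit; it assumes a 227x227 input (as in those tests), reuses their gpu_0 scoping and blob names, and uses the 1.0x width default.

import numpy as np

from caffe2.python import cnn, core, workspace
import caffe2.python.models.shufflenet as shufflenet

# Build the ShuffleNetV2 graph under the gpu_0 name scope, mirroring the
# pattern used by imagenet_trainer_test_utils above.
model = cnn.CNNModelHelper(order="NCHW", name="shufflenet_example")
with core.NameScope("gpu_0"):
    data = model.net.AddExternalInput("gpu_0/data")
    label = model.net.AddExternalInput("gpu_0/label")
    softmax, loss = shufflenet.create_shufflenet(
        model,
        data,
        num_input_channels=3,
        num_labels=1000,
        label=label,
        is_test=False,
    )
model.AddGradientOperators([loss])

# One forward/backward pass on random data (batch of 4, 227x227 images).
workspace.RunNetOnce(model.param_init_net)
workspace.FeedBlob("gpu_0/data",
                   np.random.rand(4, 3, 227, 227).astype(np.float32))
workspace.FeedBlob("gpu_0/label",
                   (np.random.rand(4) * 1000).astype(np.int32))
workspace.RunNetOnce(model.net)
print("loss:", workspace.FetchBlob("gpu_0/loss"))

In the trainer itself the create function is not called directly like this; it is handed to data_parallel_model, which takes care of per-device scoping and gradient synchronization.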
60  caffe2/python/models/shufflenet_test.py  Normal file
@@ -0,0 +1,60 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import numpy as np

from caffe2.python import workspace
import caffe2.python.models.shufflenet as shufflenet
import hypothesis.strategies as st
from hypothesis import given, settings
import caffe2.python.hypothesis_test_util as hu
import caffe2.python.models.imagenet_trainer_test_utils as utils


class ShufflenetMemongerTest(hu.HypothesisTestCase):
    @given(with_shapes=st.booleans(), **hu.gcs_cpu_only)
    @settings(max_examples=2, timeout=120)
    def test_shufflenet_shared_grads(self, with_shapes, gc, dc):
        results = utils.test_shared_grads(
            with_shapes,
            shufflenet.create_shufflenet,
            'gpu_0/stage1_conv_w',
            'gpu_0/last_out_L1000'
        )
        self.assertTrue(results[0][0] < results[0][1])
        np.testing.assert_almost_equal(results[1][0], results[1][1])
        np.testing.assert_almost_equal(results[2][0], results[2][1])

    def test_shufflenet_forward_only(self):
        results = utils.test_forward_only(
            shufflenet.create_shufflenet,
            'gpu_0/last_out_L1000'
        )
        self.assertTrue(results[0][0] < results[0][1])
        self.assertTrue(results[1] < 10 and results[1] > 0)
        np.testing.assert_almost_equal(results[2][0], results[2][1])

    def test_shufflenet_forward_only_fast_simplenet(self):
        '''
        Test C++ memonger that is only for simple nets
        '''
        results = utils.test_forward_only_fast_simplenet(
            shufflenet.create_shufflenet,
            'gpu_0/last_out_L1000'
        )

        self.assertTrue(results[0][0] < results[0][1])
        self.assertTrue(results[1] < 4 and results[1] > 0)
        np.testing.assert_almost_equal(results[2][0], results[2][1])


if __name__ == "__main__":
    import unittest
    import random
    random.seed(2006)
    workspace.GlobalInit([
        'caffe2',
        '--caffe2_log_level=0',
        '--caffe2_print_blob_sizes_at_exit=0',
        '--caffe2_gpu_memory_tracking=1'])
    unittest.main()