Mirror of https://github.com/pytorch/pytorch.git
Adding ShufflenetV2 to caffe2's benchmark suite. (#20180)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/20180

Adds ShuffleNetV2 (Ma et al., 2018) to Caffe2's benchmark suite. To run, use:

`buck run mode/opt caffe2/caffe2/python/examples:imagenet_trainer -- --train_data null --batch_size 128 --epoch_size 3200 --num_epochs 2 --num_gpus 2 --model shufflenet`

Reviewed By: bddppq, xw285cornell
Differential Revision: D15094282
fbshipit-source-id: 0e1ce9c5975868e917b0f179e2c5b15647a76b4e
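For a rough sense of what that example command works out to inside the trainer added below, here is a back-of-the-envelope sketch. The numbers come from the command above (single machine, so num_shards = 1), and the formulas mirror the `epoch_iters` and `stepsz` computations in `imagenet_trainer.py`; nothing here is taken from outside this diff.

```python
# Arithmetic for the example invocation in the summary:
#   --batch_size 128 --epoch_size 3200 --num_epochs 2 --num_gpus 2 --model shufflenet
batch_size = 128      # total over all GPUs
epoch_size = 3200
num_shards = 1        # single machine
num_gpus = 2

batch_per_device = batch_size // num_gpus                  # 64 images per GPU per step
epoch_iters = int(epoch_size / batch_size / num_shards)    # 25 iterations per epoch
lr_step = int(30 * epoch_size / batch_size / num_shards)   # LR decays every 750 iterations

print(batch_per_device, epoch_iters, lr_step)  # 64 25 750
```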
Committed by Facebook Github Bot. Parent: 3aa7ee6fe6. Commit: f93e0619f3.

caffe2/python/examples/imagenet_trainer.py (727 lines, new file)
@@ -0,0 +1,727 @@
# Module caffe2.python.examples.imagenet_trainer
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import argparse
import logging
import numpy as np
import time
import os

from caffe2.python import core, workspace, experiment_util, data_parallel_model
from caffe2.python import dyndep, optimizer
from caffe2.python import timeout_guard, model_helper, brew
from caffe2.proto import caffe2_pb2

import caffe2.python.models.resnet as resnet
import caffe2.python.models.shufflenet as shufflenet
from caffe2.python.modeling.initializers import Initializer, PseudoFP16Initializer
import caffe2.python.predictor.predictor_exporter as pred_exp
import caffe2.python.predictor.predictor_py_utils as pred_utils
from caffe2.python.predictor_constants import predictor_constants as predictor_constants

'''
Parallelized multi-GPU distributed trainer for ResNe(X)t and ShuffleNet.
Can be used to train on imagenet data, for example.
The default parameters can train a standard ResNet-50 (1x64d), and parameters
can be provided to train ResNe(X)t models (e.g., ResNeXt-101 32x4d).

Run the trainer in single-machine multi-GPU mode by setting num_shards = 1.

To run the trainer in multi-machine multi-GPU mode with M machines,
run the same program on all machines, specifying num_shards = M, and
shard_id = a unique integer in the set [0, M-1].

For rendezvous (the trainer processes have to know about each other),
you can either use a directory path that is visible to all processes
(e.g. an NFS directory), or use a Redis instance. Use the former by
passing the `file_store_path` argument. Use the latter by passing the
`redis_host` and `redis_port` arguments.
'''

logging.basicConfig()
log = logging.getLogger("Imagenet_trainer")
log.setLevel(logging.DEBUG)

dyndep.InitOpsLibrary('@/caffe2/caffe2/distributed:file_store_handler_ops')
dyndep.InitOpsLibrary('@/caffe2/caffe2/distributed:redis_store_handler_ops')


def AddImageInput(
    model,
    reader,
    batch_size,
    img_size,
    dtype,
    is_test,
    mean_per_channel=None,
    std_per_channel=None,
):
    '''
    The image input operator loads image and label data from the reader and
    applies transformations to the images (random cropping, mirroring, ...).
    '''
    data, label = brew.image_input(
        model,
        reader, ["data", "label"],
        batch_size=batch_size,
        output_type=dtype,
        use_gpu_transform=True if core.IsGPUDeviceType(model._device_type) else False,
        use_caffe_datum=True,
        mean_per_channel=mean_per_channel,
        std_per_channel=std_per_channel,
        # mean_per_channel takes precedence over mean
        mean=128.,
        std=128.,
        scale=256,
        crop=img_size,
        mirror=1,
        is_test=is_test,
    )

    data = model.StopGradient(data, data)


def AddNullInput(model, reader, batch_size, img_size, dtype):
    '''
    The null input function uses a gaussian fill operator to emulate real image
    input. A label blob is hardcoded to a single value. This is useful if you
    want to test compute throughput or don't have a dataset available.
    '''
    suffix = "_fp16" if dtype == "float16" else ""
    model.param_init_net.GaussianFill(
        [],
        ["data" + suffix],
        shape=[batch_size, 3, img_size, img_size],
    )
    if dtype == "float16":
        model.param_init_net.FloatToHalf("data" + suffix, "data")

    model.param_init_net.ConstantFill(
        [],
        ["label"],
        shape=[batch_size],
        value=1,
        dtype=core.DataType.INT32,
    )


def SaveModel(args, train_model, epoch, use_ideep):
    prefix = "{}_{}".format(train_model._device_prefix, train_model._devices[0])
    predictor_export_meta = pred_exp.PredictorExportMeta(
        predict_net=train_model.net.Proto(),
        parameters=data_parallel_model.GetCheckpointParams(train_model),
        inputs=[prefix + "/data"],
        outputs=[prefix + "/softmax"],
        shapes={
            prefix + "/softmax": (1, args.num_labels),
            prefix + "/data": (args.num_channels, args.image_size, args.image_size)
        }
    )

    # save the train_model for the current epoch
    model_path = "%s/%s_%d.mdl" % (
        args.file_store_path,
        args.save_model_name,
        epoch,
    )

    # set db_type to be "minidb" instead of "log_file_db", which breaks
    # the serialization in save_to_db. Need to switch back to log_file_db
    # after migration
    pred_exp.save_to_db(
        db_type="minidb",
        db_destination=model_path,
        predictor_export_meta=predictor_export_meta,
        use_ideep=use_ideep
    )


def LoadModel(path, model, use_ideep):
    '''
    Load pretrained model from file
    '''
    log.info("Loading path: {}".format(path))
    meta_net_def = pred_exp.load_from_db(path, 'minidb')
    init_net = core.Net(pred_utils.GetNet(
        meta_net_def, predictor_constants.GLOBAL_INIT_NET_TYPE))
    predict_init_net = core.Net(pred_utils.GetNet(
        meta_net_def, predictor_constants.PREDICT_INIT_NET_TYPE))

    if use_ideep:
        predict_init_net.RunAllOnIDEEP()
    else:
        predict_init_net.RunAllOnGPU()
    if use_ideep:
        init_net.RunAllOnIDEEP()
    else:
        init_net.RunAllOnGPU()

    assert workspace.RunNetOnce(predict_init_net)
    assert workspace.RunNetOnce(init_net)

    # Hack: fix iteration counter which is in CUDA context after load model
    itercnt = workspace.FetchBlob("optimizer_iteration")
    workspace.FeedBlob(
        "optimizer_iteration",
        itercnt,
        device_option=core.DeviceOption(caffe2_pb2.CPU, 0)
    )


def RunEpoch(
    args,
    epoch,
    train_model,
    test_model,
    total_batch_size,
    num_shards,
    expname,
    explog,
):
    '''
    Run one epoch of the trainer.
    TODO: add checkpointing here.
    '''
    # TODO: add loading from checkpoint
    log.info("Starting epoch {}/{}".format(epoch, args.num_epochs))
    epoch_iters = int(args.epoch_size / total_batch_size / num_shards)
    test_epoch_iters = int(args.test_epoch_size / total_batch_size / num_shards)
    for i in range(epoch_iters):
        # This timeout is required (temporarily) since CUDA-NCCL
        # operators might deadlock when synchronizing between GPUs.
        timeout = args.first_iter_timeout if i == 0 else args.timeout
        with timeout_guard.CompleteInTimeOrDie(timeout):
            t1 = time.time()
            workspace.RunNet(train_model.net.Proto().name)
            t2 = time.time()
            dt = t2 - t1

        fmt = "Finished iteration {}/{} of epoch {} ({:.2f} images/sec)"
        log.info(fmt.format(i + 1, epoch_iters, epoch, total_batch_size / dt))
        prefix = "{}_{}".format(
            train_model._device_prefix,
            train_model._devices[0])
        accuracy = workspace.FetchBlob(prefix + '/accuracy')
        loss = workspace.FetchBlob(prefix + '/loss')
        train_fmt = "Training loss: {}, accuracy: {}"
        log.info(train_fmt.format(loss, accuracy))

    num_images = epoch * epoch_iters * total_batch_size
    prefix = "{}_{}".format(train_model._device_prefix, train_model._devices[0])
    accuracy = workspace.FetchBlob(prefix + '/accuracy')
    loss = workspace.FetchBlob(prefix + '/loss')
    learning_rate = workspace.FetchBlob(
        data_parallel_model.GetLearningRateBlobNames(train_model)[0]
    )
    test_accuracy = 0
    test_accuracy_top5 = 0
    if test_model is not None:
        # Run the test net
        ntests = 0
        for _ in range(test_epoch_iters):
            workspace.RunNet(test_model.net.Proto().name)
            for g in test_model._devices:
                test_accuracy += np.asscalar(workspace.FetchBlob(
                    "{}_{}".format(test_model._device_prefix, g) + '/accuracy'
                ))
                test_accuracy_top5 += np.asscalar(workspace.FetchBlob(
                    "{}_{}".format(test_model._device_prefix, g) + '/accuracy_top5'
                ))
                ntests += 1
        test_accuracy /= ntests
        test_accuracy_top5 /= ntests
    else:
        test_accuracy = (-1)
        test_accuracy_top5 = (-1)

    explog.log(
        input_count=num_images,
        batch_count=(i + epoch * epoch_iters),
        additional_values={
            'accuracy': accuracy,
            'loss': loss,
            'learning_rate': learning_rate,
            'epoch': epoch,
            'top1_test_accuracy': test_accuracy,
            'top5_test_accuracy': test_accuracy_top5,
        }
    )
    assert loss < 40, "Exploded gradients :("

    # TODO: add checkpointing
    return epoch + 1


def Train(args):
    if args.model == "resnext":
        model_name = "resnext" + str(args.num_layers)
    elif args.model == "shufflenet":
        model_name = "shufflenet"

    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Verify valid image mean/std per channel
    if args.image_mean_per_channel:
        assert \
            len(args.image_mean_per_channel) == args.num_channels, \
            "The number of channels of image mean doesn't match input"

    if args.image_std_per_channel:
        assert \
            len(args.image_std_per_channel) == args.num_channels, \
            "The number of channels of image std doesn't match input"

    # Round down epoch size to closest multiple of batch size across machines
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)

    assert \
        epoch_iters > 0, \
        "Epoch size must be larger than batch size times shard count"

    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create ModelHelper object
    if args.use_ideep:
        train_arg_scope = {
            'use_cudnn': False,
            'cudnn_exhaustive_search': False,
            'training_mode': 1
        }
    else:
        train_arg_scope = {
            'order': 'NCHW',
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
            'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
        }
    train_model = model_helper.ModelHelper(
        name=model_name, arg_scope=train_arg_scope
    )

    num_shards = args.num_shards
    shard_id = args.shard_id

    # Expect interfaces to be comma separated.
    # Use of multiple network interfaces is not yet complete,
    # so simply use the first one in the list.
    interfaces = args.distributed_interfaces.split(",")

    # Rendezvous using MPI when run with mpirun
    if os.getenv("OMPI_COMM_WORLD_SIZE") is not None:
        num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1))
        shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0))
        if num_shards > 1:
            rendezvous = dict(
                kv_handler=None,
                num_shards=num_shards,
                shard_id=shard_id,
                engine="GLOO",
                transport=args.distributed_transport,
                interface=interfaces[0],
                mpi_rendezvous=True,
                exit_nets=None)

    elif num_shards > 1:
        # Create rendezvous for distributed computation
        store_handler = "store_handler"
        if args.redis_host is not None:
            # Use Redis for rendezvous if Redis host is specified
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate", [], [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                )
            )
        else:
            # Use filesystem for rendezvous otherwise
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate", [], [store_handler],
                    path=args.file_store_path,
                    prefix=args.run_id,
                )
            )

        rendezvous = dict(
            kv_handler=store_handler,
            shard_id=shard_id,
            num_shards=num_shards,
            engine="GLOO",
            transport=args.distributed_transport,
            interface=interfaces[0],
            exit_nets=None)

    else:
        rendezvous = None

    # Model building functions
    def create_resnext_model_ops(model, loss_scale):
        initializer = (PseudoFP16Initializer if args.dtype == 'float16'
                       else Initializer)

        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=args.enable_tensor_core,
                            float16_compute=args.float16_compute):
            pred = resnet.create_resnext(
                model,
                "data",
                num_input_channels=args.num_channels,
                num_labels=args.num_labels,
                num_layers=args.num_layers,
                num_groups=args.resnext_num_groups,
                num_width_per_group=args.resnext_width_per_group,
                no_bias=True,
                no_loss=True,
            )

        if args.dtype == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
                                              ['softmax', 'loss'])
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy", top_k=1)
        brew.accuracy(model, [softmax, "label"], "accuracy_top5", top_k=5)
        return [loss]

    def create_shufflenet_model_ops(model, loss_scale):
        initializer = (PseudoFP16Initializer if args.dtype == 'float16'
                       else Initializer)

        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=args.enable_tensor_core,
                            float16_compute=args.float16_compute):
            pred = shufflenet.create_shufflenet(
                model,
                "data",
                num_input_channels=args.num_channels,
                num_labels=args.num_labels,
                no_loss=True,
            )

        if args.dtype == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
                                              ['softmax', 'loss'])
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy", top_k=1)
        brew.accuracy(model, [softmax, "label"], "accuracy_top5", top_k=5)
        return [loss]

    def add_optimizer(model):
        stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)

        if args.float16_compute:
            # TODO: merge with multi-precision optimizer
            opt = optimizer.build_fp16_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                weight_decay=args.weight_decay,   # weight decay included
                policy="step",
                stepsize=stepsz,
                gamma=0.1
            )
        else:
            optimizer.add_weight_decay(model, args.weight_decay)
            opt = optimizer.build_multi_precision_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                policy="step",
                stepsize=stepsz,
                gamma=0.1
            )
        return opt

    # Define add_image_input function.
    # Depends on the "train_data" argument.
    # Note that the reader will be shared between all GPUs.
    if args.train_data == "null":
        def add_image_input(model):
            AddNullInput(
                model,
                None,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
            )
    else:
        reader = train_model.CreateDB(
            "reader",
            db=args.train_data,
            db_type=args.db_type,
            num_shards=num_shards,
            shard_id=shard_id,
        )

        def add_image_input(model):
            AddImageInput(
                model,
                reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=False,
                mean_per_channel=args.image_mean_per_channel,
                std_per_channel=args.image_std_per_channel,
            )

    def add_post_sync_ops(model):
        """Add ops applied after initial parameter sync."""
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob,
                    param_info.blob_copy[core.DataType.FLOAT]
                )

    data_parallel_model.Parallelize(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_resnext_model_ops
        if args.model == "resnext" else create_shufflenet_model_ops,
        optimizer_builder_fun=add_optimizer,
        post_sync_builder_fun=add_post_sync_ops,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=False,
        cpu_device=args.use_cpu,
        ideep=args.use_ideep,
        shared_model=args.use_cpu,
        combine_spatial_bn=args.use_cpu,
    )

    data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    # Add test model, if specified
    test_model = None
    if (args.test_data is not None):
        log.info("----- Create test net ----")
        if args.use_ideep:
            test_arg_scope = {
                'use_cudnn': False,
                'cudnn_exhaustive_search': False,
            }
        else:
            test_arg_scope = {
                'order': "NCHW",
                'use_cudnn': True,
                'cudnn_exhaustive_search': True,
            }
        test_model = model_helper.ModelHelper(
            name=model_name + "_test",
            arg_scope=test_arg_scope,
            init_params=False,
        )

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=True,
                mean_per_channel=args.image_mean_per_channel,
                std_per_channel=args.image_std_per_channel,
            )

        data_parallel_model.Parallelize(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_resnext_model_ops
            if args.model == "resnext" else create_shufflenet_model_ops,
            post_sync_builder_fun=add_post_sync_ops,
            param_update_builder_fun=None,
            devices=gpus,
            cpu_device=args.use_cpu,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    epoch = 0
    # load the pre-trained model and reset epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model, args.use_ideep)

        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(train_model)

        # reset epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    expname = "%s_gpu%d_b%d_L%d_lr%.2f_v2" % (
        model_name,
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )

    explog = experiment_util.ModelTrainerLog(expname, args)

    # Run the training one epoch at a time
    while epoch < args.num_epochs:
        epoch = RunEpoch(
            args,
            epoch,
            train_model,
            test_model,
            total_batch_size,
            num_shards,
            expname,
            explog
        )

        # Save the model for each epoch
        SaveModel(args, train_model, epoch, args.use_ideep)

        model_path = "%s/%s_" % (
            args.file_store_path,
            args.save_model_name
        )
        # remove the saved model from the previous epoch if it exists
        if os.path.isfile(model_path + str(epoch - 1) + ".mdl"):
            os.remove(model_path + str(epoch - 1) + ".mdl")


def main():
    # TODO: use argv
    parser = argparse.ArgumentParser(
        description="Caffe2: ImageNet Trainer"
    )
    parser.add_argument("--train_data", type=str, default=None, required=True,
                        help="Path to training data (or 'null' to simulate)")
    parser.add_argument("--num_layers", type=int, default=50,
                        help="The number of layers in ResNe(X)t model")
    parser.add_argument("--resnext_num_groups", type=int, default=1,
                        help="The cardinality of resnext")
    parser.add_argument("--resnext_width_per_group", type=int, default=64,
                        help="The width per group of resnext")
    parser.add_argument("--test_data", type=str, default=None,
                        help="Path to test data")
    parser.add_argument("--image_mean_per_channel", type=float, nargs='+',
                        help="The per channel mean for the images")
    parser.add_argument("--image_std_per_channel", type=float, nargs='+',
                        help="The per channel standard deviation for the images")
    parser.add_argument("--test_epoch_size", type=int, default=50000,
                        help="Number of test images")
    parser.add_argument("--db_type", type=str, default="lmdb",
                        help="Database type (such as lmdb or leveldb)")
    parser.add_argument("--gpus", type=str,
                        help="Comma separated list of GPU devices to use")
    parser.add_argument("--num_gpus", type=int, default=1,
                        help="Number of GPU devices (instead of --gpus)")
    parser.add_argument("--num_channels", type=int, default=3,
                        help="Number of color channels")
    parser.add_argument("--image_size", type=int, default=224,
                        help="Input image size (to crop to)")
    parser.add_argument("--num_labels", type=int, default=1000,
                        help="Number of labels")
    parser.add_argument("--batch_size", type=int, default=32,
                        help="Batch size, total over all GPUs")
    parser.add_argument("--epoch_size", type=int, default=1500000,
                        help="Number of images/epoch, total over all machines")
    parser.add_argument("--num_epochs", type=int, default=1000,
                        help="Num epochs.")
    parser.add_argument("--base_learning_rate", type=float, default=0.1,
                        help="Initial learning rate.")
    parser.add_argument("--weight_decay", type=float, default=1e-4,
                        help="Weight decay (L2 regularization)")
    parser.add_argument("--cudnn_workspace_limit_mb", type=int, default=64,
                        help="CuDNN workspace limit in MBs")
    parser.add_argument("--num_shards", type=int, default=1,
                        help="Number of machines in distributed run")
    parser.add_argument("--shard_id", type=int, default=0,
                        help="Shard id.")
    parser.add_argument("--run_id", type=str,
                        help="Unique run identifier (e.g. uuid)")
    parser.add_argument("--redis_host", type=str,
                        help="Host of Redis server (for rendezvous)")
    parser.add_argument("--redis_port", type=int, default=6379,
                        help="Port of Redis server (for rendezvous)")
    parser.add_argument("--file_store_path", type=str, default="/tmp",
                        help="Path to directory to use for rendezvous")
    parser.add_argument("--save_model_name", type=str, default="resnext_model",
                        help="Save the trained model to a given name")
    parser.add_argument("--load_model_path", type=str, default=None,
                        help="Load previously saved model to continue training")
    parser.add_argument("--use_cpu", type=bool, default=False,
                        help="Use CPU instead of GPU")
    parser.add_argument("--use_ideep", type=bool, default=False,
                        help="Use ideep")
    parser.add_argument('--dtype', default='float',
                        choices=['float', 'float16'],
                        help='Data type used for training')
    parser.add_argument('--float16_compute', action='store_true',
                        help="Use float 16 compute, if available")
    parser.add_argument('--enable_tensor_core', action='store_true',
                        help='Enable Tensor Core math for Conv and FC ops')
    parser.add_argument("--distributed_transport", type=str, default="tcp",
                        help="Transport to use for distributed run [tcp|ibverbs]")
    parser.add_argument("--distributed_interfaces", type=str, default="",
                        help="Network interfaces to use for distributed run")

    parser.add_argument("--first_iter_timeout", type=int, default=600,
                        help="Timeout (secs) of the first iteration "
                             "(default: %(default)s)")
    parser.add_argument("--timeout", type=int, default=60,
                        help="Timeout (secs) of each (except the first) iteration "
                             "(default: %(default)s)")
    parser.add_argument("--model",
                        default="resnext", const="resnext", nargs="?",
                        choices=["shufflenet", "resnext"],
                        help="Model to train: resnext or shufflenet")
    args = parser.parse_args()

    Train(args)


if __name__ == '__main__':
    workspace.GlobalInit(['caffe2', '--caffe2_log_level=2'])
    main()
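Worth noting: with `--train_data null`, no dataset is read at all; `AddNullInput` above fills the data blob with Gaussian noise and hard-codes every label to 1. A minimal NumPy sketch of the equivalent synthetic batch follows (224 is the `--image_size` default from the file above; the per-GPU batch of 64 is just an example value, not taken from anywhere else):

```python
import numpy as np

batch_per_device = 64   # example value; the trainer derives this from --batch_size / num GPUs
image_size = 224        # default of --image_size

# Equivalent of the GaussianFill op: a synthetic NCHW image batch
data = np.random.randn(batch_per_device, 3, image_size, image_size).astype(np.float32)

# Equivalent of the ConstantFill op: every label is hard-coded to 1, dtype INT32
label = np.ones(batch_per_device, dtype=np.int32)

print(data.shape, label.shape)  # (64, 3, 224, 224) (64,)
```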
@@ -1,690 +0,0 @@
# Module caffe2.python.examples.resnet50_trainer
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import argparse
import logging
import numpy as np
import time
import os

from caffe2.python import core, workspace, experiment_util, data_parallel_model
from caffe2.python import dyndep, optimizer
from caffe2.python import timeout_guard, model_helper, brew
from caffe2.proto import caffe2_pb2

import caffe2.python.models.resnet as resnet
from caffe2.python.modeling.initializers import Initializer, PseudoFP16Initializer
import caffe2.python.predictor.predictor_exporter as pred_exp
import caffe2.python.predictor.predictor_py_utils as pred_utils
from caffe2.python.predictor_constants import predictor_constants as predictor_constants

'''
Parallelized multi-GPU distributed trainer for Resne(X)t.
Can be used to train on imagenet data, for example.
The default parameters can train a standard Resnet-50 (1x64d), and parameters
can be provided to train ResNe(X)t models (e.g., ResNeXt-101 32x4d).

To run the trainer in single-machine multi-gpu mode by setting num_shards = 1.

To run the trainer in multi-machine multi-gpu mode with M machines,
run the same program on all machines, specifying num_shards = M, and
shard_id = a unique integer in the set [0, M-1].

For rendezvous (the trainer processes have to know about each other),
you can either use a directory path that is visible to all processes
(e.g. NFS directory), or use a Redis instance. Use the former by
passing the `file_store_path` argument. Use the latter by passing the
`redis_host` and `redis_port` arguments.
'''

logging.basicConfig()
log = logging.getLogger("ResNe(X)t_trainer")
log.setLevel(logging.DEBUG)

dyndep.InitOpsLibrary('@/caffe2/caffe2/distributed:file_store_handler_ops')
dyndep.InitOpsLibrary('@/caffe2/caffe2/distributed:redis_store_handler_ops')


def AddImageInput(
    model,
    reader,
    batch_size,
    img_size,
    dtype,
    is_test,
    mean_per_channel=None,
    std_per_channel=None,
):
    '''
    The image input operator loads image and label data from the reader and
    applies transformations to the images (random cropping, mirroring, ...).
    '''
    data, label = brew.image_input(
        model,
        reader, ["data", "label"],
        batch_size=batch_size,
        output_type=dtype,
        use_gpu_transform=True if core.IsGPUDeviceType(model._device_type) else False,
        use_caffe_datum=True,
        mean_per_channel=mean_per_channel,
        std_per_channel=std_per_channel,
        # mean_per_channel takes precedence over mean
        mean=128.,
        std=128.,
        scale=256,
        crop=img_size,
        mirror=1,
        is_test=is_test,
    )

    data = model.StopGradient(data, data)


def AddNullInput(model, reader, batch_size, img_size, dtype):
    '''
    The null input function uses a gaussian fill operator to emulate real image
    input. A label blob is hardcoded to a single value. This is useful if you
    want to test compute throughput or don't have a dataset available.
    '''
    suffix = "_fp16" if dtype == "float16" else ""
    model.param_init_net.GaussianFill(
        [],
        ["data" + suffix],
        shape=[batch_size, 3, img_size, img_size],
    )
    if dtype == "float16":
        model.param_init_net.FloatToHalf("data" + suffix, "data")

    model.param_init_net.ConstantFill(
        [],
        ["label"],
        shape=[batch_size],
        value=1,
        dtype=core.DataType.INT32,
    )


def SaveModel(args, train_model, epoch, use_ideep):
    prefix = "[]_{}".format(train_model._device_prefix, train_model._devices[0])
    predictor_export_meta = pred_exp.PredictorExportMeta(
        predict_net=train_model.net.Proto(),
        parameters=data_parallel_model.GetCheckpointParams(train_model),
        inputs=[prefix + "/data"],
        outputs=[prefix + "/softmax"],
        shapes={
            prefix + "/softmax": (1, args.num_labels),
            prefix + "/data": (args.num_channels, args.image_size, args.image_size)
        }
    )

    # save the train_model for the current epoch
    model_path = "%s/%s_%d.mdl" % (
        args.file_store_path,
        args.save_model_name,
        epoch,
    )

    # set db_type to be "minidb" instead of "log_file_db", which breaks
    # the serialization in save_to_db. Need to switch back to log_file_db
    # after migration
    pred_exp.save_to_db(
        db_type="minidb",
        db_destination=model_path,
        predictor_export_meta=predictor_export_meta,
        use_ideep = use_ideep
    )


def LoadModel(path, model, use_ideep):
    '''
    Load pretrained model from file
    '''
    log.info("Loading path: {}".format(path))
    meta_net_def = pred_exp.load_from_db(path, 'minidb')
    init_net = core.Net(pred_utils.GetNet(
        meta_net_def, predictor_constants.GLOBAL_INIT_NET_TYPE))
    predict_init_net = core.Net(pred_utils.GetNet(
        meta_net_def, predictor_constants.PREDICT_INIT_NET_TYPE))

    if use_ideep:
        predict_init_net.RunAllOnIDEEP()
    else:
        predict_init_net.RunAllOnGPU()
    if use_ideep:
        init_net.RunAllOnIDEEP()
    else:
        init_net.RunAllOnGPU()

    assert workspace.RunNetOnce(predict_init_net)
    assert workspace.RunNetOnce(init_net)

    # Hack: fix iteration counter which is in CUDA context after load model
    itercnt = workspace.FetchBlob("optimizer_iteration")
    workspace.FeedBlob(
        "optimizer_iteration",
        itercnt,
        device_option=core.DeviceOption(caffe2_pb2.CPU, 0)
    )


def RunEpoch(
    args,
    epoch,
    train_model,
    test_model,
    total_batch_size,
    num_shards,
    expname,
    explog,
):
    '''
    Run one epoch of the trainer.
    TODO: add checkpointing here.
    '''
    # TODO: add loading from checkpoint
    log.info("Starting epoch {}/{}".format(epoch, args.num_epochs))
    epoch_iters = int(args.epoch_size / total_batch_size / num_shards)
    test_epoch_iters = int(args.test_epoch_size / total_batch_size / num_shards)
    for i in range(epoch_iters):
        # This timeout is required (temporarily) since CUDA-NCCL
        # operators might deadlock when synchronizing between GPUs.
        timeout = args.first_iter_timeout if i == 0 else args.timeout
        with timeout_guard.CompleteInTimeOrDie(timeout):
            t1 = time.time()
            workspace.RunNet(train_model.net.Proto().name)
            t2 = time.time()
            dt = t2 - t1

        fmt = "Finished iteration {}/{} of epoch {} ({:.2f} images/sec)"
        log.info(fmt.format(i + 1, epoch_iters, epoch, total_batch_size / dt))
        prefix = "{}_{}".format(
            train_model._device_prefix,
            train_model._devices[0])
        accuracy = workspace.FetchBlob(prefix + '/accuracy')
        loss = workspace.FetchBlob(prefix + '/loss')
        train_fmt = "Training loss: {}, accuracy: {}"
        log.info(train_fmt.format(loss, accuracy))

    num_images = epoch * epoch_iters * total_batch_size
    prefix = "{}_{}".format(train_model._device_prefix, train_model._devices[0])
    accuracy = workspace.FetchBlob(prefix + '/accuracy')
    loss = workspace.FetchBlob(prefix + '/loss')
    learning_rate = workspace.FetchBlob(
        data_parallel_model.GetLearningRateBlobNames(train_model)[0]
    )
    test_accuracy = 0
    test_accuracy_top5 = 0
    if test_model is not None:
        # Run 100 iters of testing
        ntests = 0
        for _ in range(test_epoch_iters):
            workspace.RunNet(test_model.net.Proto().name)
            for g in test_model._devices:
                test_accuracy += np.asscalar(workspace.FetchBlob(
                    "{}_{}".format(test_model._device_prefix, g) + '/accuracy'
                ))
                test_accuracy_top5 += np.asscalar(workspace.FetchBlob(
                    "{}_{}".format(test_model._device_prefix, g) + '/accuracy_top5'
                ))
                ntests += 1
        test_accuracy /= ntests
        test_accuracy_top5 /= ntests
    else:
        test_accuracy = (-1)
        test_accuracy_top5 = (-1)

    explog.log(
        input_count=num_images,
        batch_count=(i + epoch * epoch_iters),
        additional_values={
            'accuracy': accuracy,
            'loss': loss,
            'learning_rate': learning_rate,
            'epoch': epoch,
            'top1_test_accuracy': test_accuracy,
            'top5_test_accuracy': test_accuracy_top5,
        }
    )
    assert loss < 40, "Exploded gradients :("

    # TODO: add checkpointing
    return epoch + 1


def Train(args):
    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Verify valid image mean/std per channel
    if args.image_mean_per_channel:
        assert \
            len(args.image_mean_per_channel) == args.num_channels, \
            "The number of channels of image mean doesn't match input"

    if args.image_std_per_channel:
        assert \
            len(args.image_std_per_channel) == args.num_channels, \
            "The number of channels of image std doesn't match input"

    # Round down epoch size to closest multiple of batch size across machines
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)

    assert \
        epoch_iters > 0, \
        "Epoch size must be larger than batch size times shard count"

    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create ModelHelper object
    if args.use_ideep:
        train_arg_scope = {
            'use_cudnn': False,
            'cudnn_exhaustive_search': False,
            'training_mode': 1
        }
    else:
        train_arg_scope = {
            'order': 'NCHW',
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
            'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
        }
    train_model = model_helper.ModelHelper(
        name='resnext' + str(args.num_layers), arg_scope=train_arg_scope
    )

    num_shards = args.num_shards
    shard_id = args.shard_id

    # Expect interfaces to be comma separated.
    # Use of multiple network interfaces is not yet complete,
    # so simply use the first one in the list.
    interfaces = args.distributed_interfaces.split(",")

    # Rendezvous using MPI when run with mpirun
    if os.getenv("OMPI_COMM_WORLD_SIZE") is not None:
        num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1))
        shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0))
        if num_shards > 1:
            rendezvous = dict(
                kv_handler=None,
                num_shards=num_shards,
                shard_id=shard_id,
                engine="GLOO",
                transport=args.distributed_transport,
                interface=interfaces[0],
                mpi_rendezvous=True,
                exit_nets=None)

    elif num_shards > 1:
        # Create rendezvous for distributed computation
        store_handler = "store_handler"
        if args.redis_host is not None:
            # Use Redis for rendezvous if Redis host is specified
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate", [], [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                )
            )
        else:
            # Use filesystem for rendezvous otherwise
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate", [], [store_handler],
                    path=args.file_store_path,
                    prefix=args.run_id,
                )
            )

        rendezvous = dict(
            kv_handler=store_handler,
            shard_id=shard_id,
            num_shards=num_shards,
            engine="GLOO",
            transport=args.distributed_transport,
            interface=interfaces[0],
            exit_nets=None)

    else:
        rendezvous = None

    # Model building functions
    def create_resnext_model_ops(model, loss_scale):
        initializer = (PseudoFP16Initializer if args.dtype == 'float16'
                       else Initializer)

        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=args.enable_tensor_core,
                            float16_compute=args.float16_compute):
            pred = resnet.create_resnext(
                model,
                "data",
                num_input_channels=args.num_channels,
                num_labels=args.num_labels,
                num_layers=args.num_layers,
                num_groups=args.resnext_num_groups,
                num_width_per_group=args.resnext_width_per_group,
                no_bias=True,
                no_loss=True,
            )

        if args.dtype == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
                                              ['softmax', 'loss'])
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy", top_k=1)
        brew.accuracy(model, [softmax, "label"], "accuracy_top5", top_k=5)
        return [loss]

    def add_optimizer(model):
        stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)

        if args.float16_compute:
            # TODO: merge with multi-prceision optimizer
            opt = optimizer.build_fp16_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                weight_decay=args.weight_decay,   # weight decay included
                policy="step",
                stepsize=stepsz,
                gamma=0.1
            )
        else:
            optimizer.add_weight_decay(model, args.weight_decay)
            opt = optimizer.build_multi_precision_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                policy="step",
                stepsize=stepsz,
                gamma=0.1
            )
        return opt

    # Define add_image_input function.
    # Depends on the "train_data" argument.
    # Note that the reader will be shared with between all GPUS.
    if args.train_data == "null":
        def add_image_input(model):
            AddNullInput(
                model,
                None,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
            )
    else:
        reader = train_model.CreateDB(
            "reader",
            db=args.train_data,
            db_type=args.db_type,
            num_shards=num_shards,
            shard_id=shard_id,
        )

        def add_image_input(model):
            AddImageInput(
                model,
                reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=False,
                mean_per_channel=args.image_mean_per_channel,
                std_per_channel=args.image_std_per_channel,
            )

    def add_post_sync_ops(model):
        """Add ops applied after initial parameter sync."""
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob,
                    param_info.blob_copy[core.DataType.FLOAT]
                )

    # Create parallelized model
    data_parallel_model.Parallelize(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_resnext_model_ops,
        optimizer_builder_fun=add_optimizer,
        post_sync_builder_fun=add_post_sync_ops,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=False,
        cpu_device=args.use_cpu,
        ideep=args.use_ideep,
        shared_model=args.use_cpu,
        combine_spatial_bn=args.use_cpu,
    )

    data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    # Add test model, if specified
    test_model = None
    if (args.test_data is not None):
        log.info("----- Create test net ----")
        if args.use_ideep:
            test_arg_scope = {
                'use_cudnn': False,
                'cudnn_exhaustive_search': False,
            }
        else:
            test_arg_scope = {
                'order': "NCHW",
                'use_cudnn': True,
                'cudnn_exhaustive_search': True,
            }
        test_model = model_helper.ModelHelper(
            name='resnext' + str(args.num_layers) + "_test",
            arg_scope=test_arg_scope,
            init_params=False,
        )

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=True,
                mean_per_channel=args.image_mean_per_channel,
                std_per_channel=args.image_std_per_channel,
            )

        data_parallel_model.Parallelize(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_resnext_model_ops,
            post_sync_builder_fun=add_post_sync_ops,
            param_update_builder_fun=None,
            devices=gpus,
            cpu_device=args.use_cpu,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    epoch = 0
    # load the pre-trained model and reset epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model, args.use_ideep)

        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(train_model)

        # reset epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    expname = "resnext_%d_gpu%d_b%d_L%d_lr%.2f_v2" % (
        args.num_layers,
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )

    explog = experiment_util.ModelTrainerLog(expname, args)

    # Run the training one epoch a time
    while epoch < args.num_epochs:
        epoch = RunEpoch(
            args,
            epoch,
            train_model,
            test_model,
            total_batch_size,
            num_shards,
            expname,
            explog
        )

        # Save the model for each epoch
        SaveModel(args, train_model, epoch, args.use_ideep)

        model_path = "%s/%s_" % (
            args.file_store_path,
            args.save_model_name
        )
        # remove the saved model from the previous epoch if it exists
        if os.path.isfile(model_path + str(epoch - 1) + ".mdl"):
            os.remove(model_path + str(epoch - 1) + ".mdl")


def main():
    # TODO: use argv
    parser = argparse.ArgumentParser(
        description="Caffe2: ResNe(X)t training"
    )
    parser.add_argument("--train_data", type=str, default=None, required=True,
                        help="Path to training data (or 'null' to simulate)")
    parser.add_argument("--num_layers", type=int, default=50,
                        help="The number of layers in ResNe(X)t model")
    parser.add_argument("--resnext_num_groups", type=int, default=1,
                        help="The cardinality of resnext")
    parser.add_argument("--resnext_width_per_group", type=int, default=64,
                        help="The cardinality of resnext")
    parser.add_argument("--test_data", type=str, default=None,
                        help="Path to test data")
    parser.add_argument("--image_mean_per_channel", type=float, nargs='+',
                        help="The per channel mean for the images")
    parser.add_argument("--image_std_per_channel", type=float, nargs='+',
                        help="The per channel standard deviation for the images")
    parser.add_argument("--test_epoch_size", type=int, default=50000,
                        help="Number of test images")
    parser.add_argument("--db_type", type=str, default="lmdb",
                        help="Database type (such as lmdb or leveldb)")
    parser.add_argument("--gpus", type=str,
                        help="Comma separated list of GPU devices to use")
    parser.add_argument("--num_gpus", type=int, default=1,
                        help="Number of GPU devices (instead of --gpus)")
    parser.add_argument("--num_channels", type=int, default=3,
                        help="Number of color channels")
    parser.add_argument("--image_size", type=int, default=224,
                        help="Input image size (to crop to)")
    parser.add_argument("--num_labels", type=int, default=1000,
                        help="Number of labels")
    parser.add_argument("--batch_size", type=int, default=32,
                        help="Batch size, total over all GPUs")
    parser.add_argument("--epoch_size", type=int, default=1500000,
                        help="Number of images/epoch, total over all machines")
    parser.add_argument("--num_epochs", type=int, default=1000,
                        help="Num epochs.")
    parser.add_argument("--base_learning_rate", type=float, default=0.1,
                        help="Initial learning rate.")
    parser.add_argument("--weight_decay", type=float, default=1e-4,
                        help="Weight decay (L2 regularization)")
    parser.add_argument("--cudnn_workspace_limit_mb", type=int, default=64,
                        help="CuDNN workspace limit in MBs")
    parser.add_argument("--num_shards", type=int, default=1,
                        help="Number of machines in distributed run")
    parser.add_argument("--shard_id", type=int, default=0,
                        help="Shard id.")
    parser.add_argument("--run_id", type=str,
                        help="Unique run identifier (e.g. uuid)")
    parser.add_argument("--redis_host", type=str,
                        help="Host of Redis server (for rendezvous)")
    parser.add_argument("--redis_port", type=int, default=6379,
                        help="Port of Redis server (for rendezvous)")
    parser.add_argument("--file_store_path", type=str, default="/tmp",
                        help="Path to directory to use for rendezvous")
    parser.add_argument("--save_model_name", type=str, default="resnext_model",
                        help="Save the trained model to a given name")
    parser.add_argument("--load_model_path", type=str, default=None,
                        help="Load previously saved model to continue training")
    parser.add_argument("--use_cpu", type=bool, default=False,
                        help="Use CPU instead of GPU")
    parser.add_argument("--use_ideep", type=bool, default=False,
                        help="Use ideep")
    parser.add_argument('--dtype', default='float',
                        choices=['float', 'float16'],
                        help='Data type used for training')
    parser.add_argument('--float16_compute', action='store_true',
                        help="Use float 16 compute, if available")
    parser.add_argument('--enable_tensor_core', action='store_true',
                        help='Enable Tensor Core math for Conv and FC ops')
    parser.add_argument("--distributed_transport", type=str, default="tcp",
                        help="Transport to use for distributed run [tcp|ibverbs]")
    parser.add_argument("--distributed_interfaces", type=str, default="",
                        help="Network interfaces to use for distributed run")

    parser.add_argument("--first_iter_timeout", type=int, default=600,
                        help="Timeout (secs) of the first iteration "
                             "(default: %(default)s)")
    parser.add_argument("--timeout", type=int, default=60,
                        help="Timeout (secs) of each (except the first) iteration "
                             "(default: %(default)s)")

    args = parser.parse_args()

    Train(args)


if __name__ == '__main__':
    workspace.GlobalInit(['caffe2', '--caffe2_log_level=2'])
    main()
caffe2/python/examples/resnet50_trainer.py (1 line, symbolic link)
@@ -0,0 +1 @@
imagenet_trainer.py
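The symlink keeps the old entry point working: `resnet50_trainer.py` now simply resolves to `imagenet_trainer.py`. A small sketch for checking that from a local checkout (the relative path assumes the working directory is the repository root; that location is an assumption, not something stated in the diff):

```python
import os

# Path inside a pytorch checkout; adjust to wherever your clone lives.
link = "caffe2/python/examples/resnet50_trainer.py"

print(os.path.islink(link))    # True: the file is now a symbolic link
print(os.readlink(link))       # 'imagenet_trainer.py', the target recorded in this diff
print(os.path.realpath(link))  # absolute path of caffe2/python/examples/imagenet_trainer.py
```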
caffe2/python/models/imagenet_trainer_test_utils.py (200 lines, new file)
@ -0,0 +1,200 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import numpy as np
|
||||
import time
|
||||
|
||||
from caffe2.python import workspace, cnn, memonger, core
|
||||
|
||||
def has_blob(proto, needle):
|
||||
for op in proto.op:
|
||||
for inp in op.input:
|
||||
if inp == needle:
|
||||
return True
|
||||
for outp in op.output:
|
||||
if outp == needle:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def count_blobs(proto):
|
||||
blobs = set()
|
||||
for op in proto.op:
|
||||
blobs = blobs.union(set(op.input)).union(set(op.output))
|
||||
return len(blobs)
|
||||
|
||||
|
||||
def count_shared_blobs(proto):
|
||||
blobs = set()
|
||||
for op in proto.op:
|
||||
blobs = blobs.union(set(op.input)).union(set(op.output))
|
||||
return len([b for b in blobs if "_shared" in b])
|
||||
|
||||
|
||||
def test_shared_grads(
|
||||
with_shapes,
|
||||
create_model,
|
||||
conv_blob,
|
||||
last_out_blob,
|
||||
data_blob='gpu_0/data',
|
||||
label_blob='gpu_0/label',
|
||||
num_labels=1000,
|
||||
):
|
||||
model = cnn.CNNModelHelper(
|
||||
order="NCHW",
|
||||
name="test",
|
||||
cudnn_exhaustive_search=True,
|
||||
)
|
||||
with core.NameScope("gpu_0"):
|
||||
data = model.net.AddExternalInput(data_blob)
|
||||
label = model.net.AddExternalInput(label_blob)
|
||||
(_softmax, loss) = create_model(
|
||||
model,
|
||||
data,
|
||||
num_input_channels=3,
|
||||
num_labels=num_labels,
|
||||
label=label,
|
||||
is_test=False,
|
||||
)
|
||||
|
||||
param_to_grad = model.AddGradientOperators([loss])
|
||||
|
||||
(shapes, types) = workspace.InferShapesAndTypes(
|
||||
[model.param_init_net, model.net],
|
||||
{data_blob: [4, 3, 227, 227],
|
||||
label_blob: [4]},
|
||||
)
|
||||
|
||||
count_before = count_blobs(model.net.Proto())
|
||||
optim_proto = memonger.share_grad_blobs(
|
||||
model.net,
|
||||
["gpu_0/loss"],
|
||||
set(model.param_to_grad.values()),
|
||||
"gpu_0/",
|
||||
share_activations=True,
|
||||
dont_share_blobs=set([str(param_to_grad[conv_blob])]),
|
||||
blob_shapes=shapes if with_shapes else None,
|
||||
)
|
||||
count_after = count_blobs(optim_proto)
|
||||
|
||||
# Run model and compare results. We check that the loss is same
|
||||
# and also that the final gradient (conv1_w_grad is same)
|
||||
workspace.RunNetOnce(model.param_init_net)
|
||||
data = np.random.rand(4, 3, 227, 227).astype(np.float32)
|
||||
label = (np.random.rand(4) * num_labels).astype(np.int32)
|
||||
|
||||
workspace.FeedBlob(data_blob, data)
|
||||
workspace.FeedBlob(label_blob, label)
|
||||
|
||||
workspace.RunNetOnce(model.net)
|
||||
model.net.Proto().type = 'dag'
|
||||
model.net.Proto().num_workers = 4
|
||||
loss1 = workspace.FetchBlob(last_out_blob)
|
||||
conv1_w_grad = workspace.FetchBlob(param_to_grad[conv_blob])
|
||||
workspace.FeedBlob(param_to_grad[conv_blob], np.array([0.0]))
|
||||
|
||||
workspace.RunNetOnce(optim_proto)
|
||||
optimized_loss1 = workspace.FetchBlob(last_out_blob)
|
||||
optim_conv1_w_grad = workspace.FetchBlob(param_to_grad[conv_blob])
|
||||
|
||||
return [(count_after, count_before),
|
||||
(loss1, optimized_loss1),
|
||||
(conv1_w_grad, optim_conv1_w_grad)]
|
||||
|
||||
|
||||
def test_forward_only(
|
||||
create_model,
|
||||
last_out_blob,
|
||||
data_blob='gpu_0/data',
|
||||
num_labels=1000,
|
||||
):
|
||||
model = cnn.CNNModelHelper(
|
||||
order="NCHW",
|
||||
name="test",
|
||||
cudnn_exhaustive_search=True,
|
||||
)
|
||||
with core.NameScope("gpu_0"):
|
||||
data = model.net.AddExternalInput(data_blob)
|
||||
create_model(
|
||||
model,
|
||||
data,
|
||||
num_input_channels=3,
|
||||
num_labels=num_labels,
|
||||
is_test=True
|
||||
)
|
||||
|
||||
count_before = count_blobs(model.net.Proto())
|
||||
optim_proto = memonger.optimize_inference_for_dag(
|
||||
model.net, [data_blob], "gpu_0/"
|
||||
)
|
||||
count_after = count_blobs(optim_proto)
|
||||
num_shared_blobs = count_shared_blobs(optim_proto)
|
||||
|
||||
# Run model and compare results
|
||||
workspace.RunNetOnce(model.param_init_net)
|
||||
data = np.random.rand(4, 3, 227, 227).astype(np.float32)
|
||||
|
||||
workspace.FeedBlob(data_blob, data)
|
||||
workspace.RunNetOnce(model.net)
|
||||
model.net.Proto().type = 'dag'
|
||||
model.net.Proto().num_workers = 4
|
||||
loss1 = workspace.FetchBlob(last_out_blob)
|
||||
|
||||
workspace.RunNetOnce(optim_proto)
|
||||
optimized_loss1 = workspace.FetchBlob(last_out_blob)
|
||||
return [(count_after, count_before),
|
||||
(num_shared_blobs),
|
||||
(loss1, optimized_loss1)]
|
||||
|
||||
|
||||
def test_forward_only_fast_simplenet(
    create_model,
    last_out_blob,
    data_blob="gpu_0/data",
    num_labels=1000,
):
    model = cnn.CNNModelHelper(
        order="NCHW",
        name="test",
        cudnn_exhaustive_search=True,
    )
    with core.NameScope("gpu_0"):
        data = model.net.AddExternalInput(data_blob)
        create_model(
            model,
            data,
            num_input_channels=3,
            num_labels=num_labels,
            is_test=True
        )

    count_before = count_blobs(model.net.Proto())
    t = time.time()
    optim_proto = memonger.optimize_inference_fast(
        model.net.Proto(),
        set([data_blob, last_out_blob]).union(
            set(model.net.Proto().external_input))
    )
    print("Optimization took {} secs".format(time.time() - t))
    count_after = count_blobs(optim_proto)
    num_shared_blobs = count_shared_blobs(optim_proto)

    print(count_after, count_before, num_shared_blobs)

    # Run model and compare results
    workspace.RunNetOnce(model.param_init_net)
    data = np.random.rand(4, 3, 227, 227).astype(np.float32)

    workspace.FeedBlob(data_blob, data)
    model.net.Proto().type = 'simple'

    workspace.RunNetOnce(model.net)
    loss1 = workspace.FetchBlob(last_out_blob)

    workspace.RunNetOnce(optim_proto)
    optimized_loss1 = workspace.FetchBlob(last_out_blob)
    return [(count_after, count_before),
            (num_shared_blobs),
            (loss1, optimized_loss1)]

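Each helper above follows the same pattern: build a model with cnn.CNNModelHelper, count the blobs in the net, run one of the memonger passes, and verify that the optimized protobuf still produces identical outputs. The following sketch shows that pattern on a toy net; it is not part of this commit, and the net and blob names ("toy", "x", "h1", "h2", "y") are purely illustrative.

import numpy as np

from caffe2.python import core, memonger, workspace

# Toy net: three chained Relus, so the intermediates "h1" and "h2" are
# activations the memonger is allowed to recycle.
net = core.Net("toy")
net.AddExternalInput("x")
net.Relu("x", "h1")
net.Relu("h1", "h2")
net.Relu("h2", "y")

workspace.FeedBlob("x", np.random.rand(4, 8).astype(np.float32))
workspace.RunNetOnce(net)
reference = workspace.FetchBlob("y")

# Keep the external input and the final output out of the sharing pool,
# as the helpers above do for data_blob and last_out_blob.
optim_proto = memonger.optimize_inference_fast(
    net.Proto(), set(["x", "y"])
)
workspace.RunNetOnce(optim_proto)
np.testing.assert_almost_equal(reference, workspace.FetchBlob("y"))

The (loss1, optimized_loss1) pairs returned by the helpers feed the same equality check in the tests that follow.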
@@ -4,194 +4,49 @@ from __future__ import print_function
 from __future__ import unicode_literals

 import numpy as np
-import time

 from caffe2.python import workspace, cnn, memonger, core
 import caffe2.python.models.resnet as resnet
 import hypothesis.strategies as st
 from hypothesis import given, settings
 import caffe2.python.hypothesis_test_util as hu
-
-
-def has_blob(proto, needle):
-    for op in proto.op:
-        for inp in op.input:
-            if inp == needle:
-                return True
-        for outp in op.output:
-            if outp == needle:
-                return True
-    return False
-
-
-def count_blobs(proto):
-    blobs = set()
-    for op in proto.op:
-        blobs = blobs.union(set(op.input)).union(set(op.output))
-    return len(blobs)
-
-
-def count_shared_blobs(proto):
-    blobs = set()
-    for op in proto.op:
-        blobs = blobs.union(set(op.input)).union(set(op.output))
-    return len([b for b in blobs if "_shared" in b])
+import caffe2.python.models.imagenet_trainer_test_utils as utils


 class ResnetMemongerTest(hu.HypothesisTestCase):

     @given(with_shapes=st.booleans(), **hu.gcs_cpu_only)
     @settings(max_examples=2, timeout=120)
     def test_resnet_shared_grads(self, with_shapes, gc, dc):
-        model = cnn.CNNModelHelper(
-            order="NCHW",
-            name="test",
-            cudnn_exhaustive_search=True,
+        results = utils.test_shared_grads(
+            with_shapes,
+            resnet.create_resnet50,
+            'gpu_0/conv1_w',
+            'gpu_0/last_out_L1000'
         )
-        with core.NameScope("gpu_0"):
-            data = model.net.AddExternalInput("gpu_0/data")
-            label = model.net.AddExternalInput("gpu_0/label")
-            (_softmax, loss) = resnet.create_resnet50(
-                model,
-                data,
-                num_input_channels=3,
-                num_labels=1000,
-                label=label,
-                is_test=False,
-            )
-
-        param_to_grad = model.AddGradientOperators([loss])
-
-        (shapes, types) = workspace.InferShapesAndTypes(
-            [model.param_init_net, model.net],
-            {'gpu_0/data': [4, 3, 227, 227],
-             'gpu_0/label': [4]},
-        )
-
-        count_before = count_blobs(model.net.Proto())
-        optim_proto = memonger.share_grad_blobs(
-            model.net,
-            ["gpu_0/loss"],
-            set(model.param_to_grad.values()),
-            "gpu_0/",
-            share_activations=True,
-            dont_share_blobs=set([str(param_to_grad["gpu_0/conv1_w"])]),
-            blob_shapes=shapes if with_shapes else None,
-        )
-        count_after = count_blobs(optim_proto)
-        self.assertTrue(count_after < count_before)
-
-        # Run model and compare results. We check that the loss is same
-        # and also that the final gradient (conv1_w_grad is same)
-        workspace.RunNetOnce(model.param_init_net)
-        data = np.random.rand(4, 3, 227, 227).astype(np.float32)
-        label = (np.random.rand(4) * 1000).astype(np.int32)
-
-        workspace.FeedBlob("gpu_0/data", data)
-        workspace.FeedBlob("gpu_0/label", label)
-
-        workspace.RunNetOnce(model.net)
-        model.net.Proto().type = 'dag'
-        model.net.Proto().num_workers = 4
-        loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")
-        conv1_w_grad = workspace.FetchBlob(param_to_grad["gpu_0/conv1_w"])
-        workspace.FeedBlob(param_to_grad["gpu_0/conv1_w"], np.array([0.0]))
-
-        workspace.RunNetOnce(optim_proto)
-        optimized_loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")
-        optim_conv1_w_grad = workspace.FetchBlob(param_to_grad["gpu_0/conv1_w"])
-
-        print("before: {} after: {}".format(count_before, count_after))
-
-        np.testing.assert_almost_equal(loss1, optimized_loss1)
-        np.testing.assert_almost_equal(conv1_w_grad, optim_conv1_w_grad)
+        self.assertTrue(results[0][0] < results[0][1])
+        np.testing.assert_almost_equal(results[1][0], results[1][1])
+        np.testing.assert_almost_equal(results[2][0], results[2][1])

     def test_resnet_forward_only(self):
-        model = cnn.CNNModelHelper(
-            order="NCHW",
-            name="test",
-            cudnn_exhaustive_search=True,
+        results = utils.test_forward_only(
+            resnet.create_resnet50,
+            'gpu_0/last_out_L1000'
         )
-        with core.NameScope("gpu_0"):
-            data = model.net.AddExternalInput("gpu_0/data")
-            resnet.create_resnet50(
-                model,
-                data,
-                num_input_channels=3,
-                num_labels=1000,
-                is_test=True
-            )
-
-        count_before = count_blobs(model.net.Proto())
-        optim_proto = memonger.optimize_inference_for_dag(
-            model.net, ["gpu_0/data"], "gpu_0/"
-        )
-        count_after = count_blobs(optim_proto)
-        num_shared_blobs = count_shared_blobs(optim_proto)
-
-        # Run model and compare results
-        workspace.RunNetOnce(model.param_init_net)
-        data = np.random.rand(4, 3, 227, 227).astype(np.float32)
-
-        workspace.FeedBlob("gpu_0/data", data)
-        workspace.RunNetOnce(model.net)
-        model.net.Proto().type = 'dag'
-        model.net.Proto().num_workers = 4
-        loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")
-
-        workspace.RunNetOnce(optim_proto)
-        optimized_loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")
-        self.assertTrue(count_after < count_before)
-        self.assertTrue(num_shared_blobs < 7 and num_shared_blobs > 0)
-        np.testing.assert_almost_equal(loss1, optimized_loss1)
+        self.assertTrue(results[0][0] < results[0][1])
+        self.assertTrue(results[1] < 7 and results[1] > 0)
+        np.testing.assert_almost_equal(results[2][0], results[2][1])

     def test_resnet_forward_only_fast_simplenet(self):
         '''
         Test C++ memonger that is only for simple nets
         '''
-        model = cnn.CNNModelHelper(
-            order="NCHW",
-            name="test",
-            cudnn_exhaustive_search=True,
+        results = utils.test_forward_only_fast_simplenet(
+            resnet.create_resnet50,
+            'gpu_0/last_out_L1000'
         )
-        with core.NameScope("gpu_0"):
-            data = model.net.AddExternalInput("gpu_0/data")
-            resnet.create_resnet50(
-                model,
-                data,
-                num_input_channels=3,
-                num_labels=1000,
-                is_test=True
-            )
-
-        count_before = count_blobs(model.net.Proto())
-        t = time.time()
-        optim_proto = memonger.optimize_inference_fast(
-            model.net.Proto(),
-            set(["gpu_0/data", "gpu_0/last_out_L1000"]).union(
-                set(model.net.Proto().external_input))
-        )
-        print("Optimization took {} secs".format(time.time() - t))
-        count_after = count_blobs(optim_proto)
-        num_shared_blobs = count_shared_blobs(optim_proto)
-
-        self.assertTrue(count_after < count_before)
-        print(count_after, count_before, num_shared_blobs)
-        self.assertTrue(num_shared_blobs < 7 and num_shared_blobs > 0)
-
-        # Run model and compare results
-        workspace.RunNetOnce(model.param_init_net)
-        data = np.random.rand(4, 3, 227, 227).astype(np.float32)
-
-        workspace.FeedBlob("gpu_0/data", data)
-        model.net.Proto().type = 'simple'
-
-        workspace.RunNetOnce(model.net)
-        loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")
-
-        workspace.RunNetOnce(optim_proto)
-        optimized_loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")
-        np.testing.assert_almost_equal(loss1, optimized_loss1)
+        self.assertTrue(results[0][0] < results[0][1])
+        self.assertTrue(results[1] < 4 and results[1] > 0)
+        np.testing.assert_almost_equal(results[2][0], results[2][1])


 if __name__ == "__main__":

198  caffe2/python/models/shufflenet.py  Normal file
@@ -0,0 +1,198 @@
# Module caffe2.python.models.shufflenet

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from caffe2.python import brew

"""
Utility for creating ShuffleNet
"ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design" by Ma et al., 2018
"""

OUTPUT_CHANNELS = {
    '0.5x': [24, 48, 96, 192, 1024],
    '1.0x': [24, 116, 232, 464, 1024],
    '1.5x': [24, 176, 352, 704, 1024],
    '2.0x': [24, 244, 488, 976, 2048],
}


class ShuffleNetV2Builder():
    def __init__(
        self,
        model,
        data,
        num_input_channels,
        num_labels,
        num_groups=2,
        width='1.0x',
        is_test=False,
        detection=False,
        bn_epsilon=1e-5,
    ):
        self.model = model
        self.prev_blob = data
        self.num_input_channels = num_input_channels
        self.num_labels = num_labels
        self.num_groups = num_groups
        self.output_channels = OUTPUT_CHANNELS[width]
        self.stage_repeats = [3, 7, 3]
        self.is_test = is_test
        self.detection = detection
        self.bn_epsilon = bn_epsilon

    def create(self):
        in_channels = self.output_channels[0]

        self.prev_blob = brew.conv(self.model, self.prev_blob, 'stage1_conv',
                                   self.num_input_channels, in_channels,
                                   weight_init=("MSRAFill", {}),
                                   kernel=3, stride=2)
        self.prev_blob = brew.max_pool(self.model, self.prev_blob,
                                       'stage1_pool', kernel=3, stride=2)

        # adds stage#{2,3,4}; see table 5 of the ShufflenetV2 paper.
        for idx, (out_channels, n_repeats) in enumerate(zip(
            self.output_channels[1:4], self.stage_repeats
        )):
            prefix = 'stage{}_stride{}'.format(idx + 2, 2)
            self.add_spatial_ds_unit(prefix, in_channels, out_channels)
            in_channels = out_channels
            for i in range(n_repeats):
                prefix = 'stage{}_stride{}_repeat{}'.format(
                    idx + 2, 1, i + 1
                )
                self.add_basic_unit(prefix, in_channels)

        self.last_conv = brew.conv(self.model, self.prev_blob, 'conv5',
                                   in_channels, self.output_channels[4],
                                   kernel=1)
        self.avg_pool = self.model.AveragePool(self.last_conv, 'avg_pool',
                                               kernel=7)
        self.last_out = brew.fc(self.model,
                                self.avg_pool,
                                'last_out_L{}'.format(self.num_labels),
                                self.output_channels[4],
                                self.num_labels)

    # spatial down sampling unit with stride=2
    def add_spatial_ds_unit(self, prefix, in_channels, out_channels, stride=2):
        right = left = self.prev_blob
        out_channels = out_channels // 2

        # Enlarge the receptive field for detection task
        if self.detection:
            left = self.add_detection_unit(left, prefix + '_left_detection',
                                           in_channels, in_channels)

        left = self.add_dwconv3x3_bn(left, prefix + 'left_dwconv',
                                     in_channels, stride)
        left = self.add_conv1x1_bn(left, prefix + '_left_conv1', in_channels,
                                   out_channels)

        if self.detection:
            right = self.add_detection_unit(right, prefix + '_right_detection',
                                            in_channels, in_channels)

        right = self.add_conv1x1_bn(right, prefix + '_right_conv1',
                                    in_channels, out_channels)
        right = self.add_dwconv3x3_bn(right, prefix + '_right_dwconv',
                                      out_channels, stride)
        right = self.add_conv1x1_bn(right, prefix + '_right_conv2',
                                    out_channels, out_channels)

        self.prev_blob = brew.concat(self.model, [right, left],
                                     prefix + '_concat')
        self.prev_blob = self.model.net.ChannelShuffle(
            self.prev_blob, prefix + '_ch_shuffle',
            group=self.num_groups, kernel=1
        )

    # basic unit with stride=1
    def add_basic_unit(self, prefix, in_channels, stride=1):
        in_channels = in_channels // 2
        left = prefix + '_left'
        right = prefix + '_right'
        self.model.net.Split(self.prev_blob, [left, right])

        if self.detection:
            right = self.add_detection_unit(right, prefix + '_right_detection',
                                            in_channels, in_channels)

        right = self.add_conv1x1_bn(right, prefix + '_right_conv1',
                                    in_channels, in_channels)
        right = self.add_dwconv3x3_bn(right, prefix + '_right_dwconv',
                                      in_channels, stride)
        right = self.add_conv1x1_bn(right, prefix + '_right_conv2',
                                    in_channels, in_channels)

        self.prev_blob = brew.concat(self.model, [right, left],
                                     prefix + '_concat')

        self.prev_blob = self.model.net.ChannelShuffle(
            self.prev_blob, prefix + '_ch_shuffle',
            group=self.num_groups, kernel=1
        )

    # helper functions to create net's units
    def add_detection_unit(self, prev_blob, prefix, in_channels, out_channels,
                           kernel=3, pad=1):
        out_blob = brew.conv(self.model, prev_blob, prefix + '_conv',
                             in_channels, out_channels, kernel=kernel,
                             weight_init=("MSRAFill", {}),
                             group=in_channels, pad=pad)
        out_blob = brew.spatial_bn(self.model, out_blob, prefix + '_bn',
                                   out_channels, epsilon=self.bn_epsilon,
                                   is_test=self.is_test)
        return out_blob

    def add_conv1x1_bn(self, prev_blob, blob, in_channels, out_channels):
        prev_blob = brew.conv(self.model, prev_blob, blob, in_channels,
                              out_channels, kernel=1,
                              weight_init=("MSRAFill", {}))
        prev_blob = brew.spatial_bn(self.model, prev_blob, prev_blob + '_bn',
                                    out_channels,
                                    epsilon=self.bn_epsilon,
                                    is_test=self.is_test)
        prev_blob = brew.relu(self.model, prev_blob, prev_blob)
        return prev_blob

    def add_dwconv3x3_bn(self, prev_blob, blob, channels, stride):
        prev_blob = brew.conv(self.model, prev_blob, blob, channels,
                              channels, kernel=3,
                              weight_init=("MSRAFill", {}),
                              stride=stride, group=channels, pad=1)
        prev_blob = brew.spatial_bn(self.model, prev_blob,
                                    prev_blob + '_bn',
                                    channels,
                                    epsilon=self.bn_epsilon,
                                    is_test=self.is_test)
        return prev_blob


def create_shufflenet(
    model,
    data,
    num_input_channels,
    num_labels,
    label=None,
    is_test=False,
    no_loss=False,
):
    builder = ShuffleNetV2Builder(model, data, num_input_channels,
                                  num_labels,
                                  is_test=is_test)
    builder.create()

    if no_loss:
        return builder.last_out

    if (label is not None):
        (softmax, loss) = model.SoftmaxWithLoss(
            [builder.last_out, label],
            ["softmax", "loss"],
        )
        return (softmax, loss)

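As a point of reference, create_shufflenet can be exercised on a single device the same way the memonger tests below drive it. This sketch is not part of the commit; it assumes a 227x227 input (as in those tests), reuses their gpu_0 scoping and blob names, and uses the 1.0x width default.

import numpy as np

from caffe2.python import cnn, core, workspace
import caffe2.python.models.shufflenet as shufflenet

# Build the ShuffleNetV2 graph under the gpu_0 name scope, mirroring the
# pattern used by imagenet_trainer_test_utils above.
model = cnn.CNNModelHelper(order="NCHW", name="shufflenet_example")
with core.NameScope("gpu_0"):
    data = model.net.AddExternalInput("gpu_0/data")
    label = model.net.AddExternalInput("gpu_0/label")
    softmax, loss = shufflenet.create_shufflenet(
        model,
        data,
        num_input_channels=3,
        num_labels=1000,
        label=label,
        is_test=False,
    )
model.AddGradientOperators([loss])

# One forward/backward pass on random data (batch of 4, 227x227 images).
workspace.RunNetOnce(model.param_init_net)
workspace.FeedBlob("gpu_0/data",
                   np.random.rand(4, 3, 227, 227).astype(np.float32))
workspace.FeedBlob("gpu_0/label",
                   (np.random.rand(4) * 1000).astype(np.int32))
workspace.RunNetOnce(model.net)
print("loss:", workspace.FetchBlob("gpu_0/loss"))

In the trainer itself the create function is not called directly like this; it is handed to data_parallel_model, which takes care of per-device scoping and gradient synchronization.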
60  caffe2/python/models/shufflenet_test.py  Normal file
@@ -0,0 +1,60 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import numpy as np

from caffe2.python import workspace
import caffe2.python.models.shufflenet as shufflenet
import hypothesis.strategies as st
from hypothesis import given, settings
import caffe2.python.hypothesis_test_util as hu
import caffe2.python.models.imagenet_trainer_test_utils as utils


class ShufflenetMemongerTest(hu.HypothesisTestCase):
    @given(with_shapes=st.booleans(), **hu.gcs_cpu_only)
    @settings(max_examples=2, timeout=120)
    def test_shufflenet_shared_grads(self, with_shapes, gc, dc):
        results = utils.test_shared_grads(
            with_shapes,
            shufflenet.create_shufflenet,
            'gpu_0/stage1_conv_w',
            'gpu_0/last_out_L1000'
        )
        self.assertTrue(results[0][0] < results[0][1])
        np.testing.assert_almost_equal(results[1][0], results[1][1])
        np.testing.assert_almost_equal(results[2][0], results[2][1])

    def test_shufflenet_forward_only(self):
        results = utils.test_forward_only(
            shufflenet.create_shufflenet,
            'gpu_0/last_out_L1000'
        )
        self.assertTrue(results[0][0] < results[0][1])
        self.assertTrue(results[1] < 10 and results[1] > 0)
        np.testing.assert_almost_equal(results[2][0], results[2][1])

    def test_shufflenet_forward_only_fast_simplenet(self):
        '''
        Test C++ memonger that is only for simple nets
        '''
        results = utils.test_forward_only_fast_simplenet(
            shufflenet.create_shufflenet,
            'gpu_0/last_out_L1000'
        )

        self.assertTrue(results[0][0] < results[0][1])
        self.assertTrue(results[1] < 4 and results[1] > 0)
        np.testing.assert_almost_equal(results[2][0], results[2][1])


if __name__ == "__main__":
    import unittest
    import random
    random.seed(2006)
    workspace.GlobalInit([
        'caffe2',
        '--caffe2_log_level=0',
        '--caffe2_print_blob_sizes_at_exit=0',
        '--caffe2_gpu_memory_tracking=1'])
    unittest.main()