mirror of
https://github.com/deepspeedai/DeepSpeed.git
synced 2025-10-20 23:53:48 +08:00
82 lines
2.4 KiB
Python
82 lines
2.4 KiB
Python
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

#########################################
# Elasticity
#########################################
''' Elasticity Utility in DeepSpeed can be used to create highly elastic jobs compatible
with a large number of GPUs. For elastic jobs, DeepSpeed will provide a batch size that
can support a large number of GPUs based on the user specified parameters
'''

# Human-readable usage example shown to users when their elasticity config is invalid.
# NOTE(review): the example shows "version": 0.1 while LATEST_ELASTICITY_VERSION below
# is 0.2 — confirm whether the example text should be bumped to match.
FORMAT = '''
Elasticity should be enabled as:
"elasticity": {
"enabled": true,
"max_train_batch_size": 2000,
"micro_batch_sizes": [2,4,6],
"min_gpus": 1,
"max_gpus" : 10000,
"min_time": 20,
"prefer_larger_batch": true,
"ignore_non_elastic_batch_info": false,
"version": 0.1
}
'''

# Top-level config key under which all elasticity settings live.
ELASTICITY = 'elasticity'

# Current elasticity version
LATEST_ELASTICITY_VERSION = 0.2

# Whether elasticity is turned on at all.
ENABLED = 'enabled'
ENABLED_DEFAULT = False

# Max acceptable train_batch_size
MAX_ACCEPTABLE_BATCH_SIZE = 'max_train_batch_size'
MAX_ACCEPTABLE_BATCH_SIZE_DEFAULT = 2000

# Acceptable micro batch sizes, same as train_micro_batch_size_per_gpu
MICRO_BATCHES = 'micro_batch_sizes'
MICRO_BATCHES_DEFAULT = [2, 4, 6]

# Min/max of GPUs to search over
MIN_GPUS = 'min_gpus'
MIN_GPUS_DEFAULT = 1
MAX_GPUS = 'max_gpus'
MAX_GPUS_DEFAULT = 10000

# GPUs available on each node of the job.
NUM_GPUS_PER_NODE = 'num_gpus_per_node'
NUM_GPUS_PER_NODE_DEFAULT = 1

# Degree of model parallelism the job uses.
MODEL_PARALLEL_SIZE = "model_parallel_size"
MODEL_PARALLEL_SIZE_DEFAULT = 1

# Minimum running time (minutes) before the scheduler will scale us, 0 implies it's unknown
MIN_TIME = "min_time"
MIN_TIME_DEFAULT = 0

# When finding a suitable batch size, attempt to find one that is closest
# to the max train batch size given.
PREFER_LARGER_BATCH = 'prefer_larger_batch'
PREFER_LARGER_BATCH_DEFAULT = True

# In order to reduce confusion, if elastic mode is enabled we
# require (via assert) that no batch info is set outside of the
# elastic config. You can turn off this assert via this config
# but keep in mind that all batch info defined outside the
# elastic mode *will be ignored*.
IGNORE_NON_ELASTIC_BATCH_INFO = 'ignore_non_elastic_batch_info'
IGNORE_NON_ELASTIC_BATCH_INFO_DEFAULT = False

# Version of elastic logic to use
VERSION = "version"
VERSION_DEFAULT = LATEST_ELASTICITY_VERSION

# Minimum deepspeed version to use elasticity
MINIMUM_DEEPSPEED_VERSION = "0.3.8"

# Environment variable storing elastic config from resource scheduler
DEEPSPEED_ELASTICITY_CONFIG = "DEEPSPEED_ELASTICITY_CONFIG"