mirror of
https://github.com/deepspeedai/DeepSpeed.git
synced 2025-10-20 23:53:48 +08:00
82 lines
2.4 KiB
Python
82 lines
2.4 KiB
Python
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

#########################################
# Elasticity
#########################################
''' Elasticity Utility in DeepSpeed can be used to create highly elastic jobs compatible
with a large number of GPUs. For elastic jobs, DeepSpeed will provide a batch size that
can support a large number of GPUs based on the user specified parameters
'''

# Human-readable usage example shown to users when their elasticity config is invalid.
# NOTE(review): the example shows "version": 0.1 while LATEST_ELASTICITY_VERSION below
# is 0.2 — confirm whether the example text should be bumped to match.
FORMAT = '''
Elasticity should be enabled as:
"elasticity": {
"enabled": true,
"max_train_batch_size": 2000,
"micro_batch_sizes": [2,4,6],
"min_gpus": 1,
"max_gpus" : 10000,
"min_time": 20,
"prefer_larger_batch": true,
"ignore_non_elastic_batch_info": false,
"version": 0.1
}
'''

# Top-level config key under which all elasticity settings live.
ELASTICITY = 'elasticity'

# Current elasticity version
LATEST_ELASTICITY_VERSION = 0.2

# Whether elasticity is turned on at all.
ENABLED = 'enabled'
ENABLED_DEFAULT = False

# Max acceptable train_batch_size
MAX_ACCEPTABLE_BATCH_SIZE = 'max_train_batch_size'
MAX_ACCEPTABLE_BATCH_SIZE_DEFAULT = 2000

# Acceptable micro batch sizes, same as train_micro_batch_size_per_gpu
MICRO_BATCHES = 'micro_batch_sizes'
MICRO_BATCHES_DEFAULT = [2, 4, 6]

# Min/max of GPUs to search over
MIN_GPUS = 'min_gpus'
MIN_GPUS_DEFAULT = 1
MAX_GPUS = 'max_gpus'
MAX_GPUS_DEFAULT = 10000

# GPUs available on each node of the job.
NUM_GPUS_PER_NODE = 'num_gpus_per_node'
NUM_GPUS_PER_NODE_DEFAULT = 1

# Degree of model parallelism the job uses.
MODEL_PARALLEL_SIZE = "model_parallel_size"
MODEL_PARALLEL_SIZE_DEFAULT = 1

# Minimum running time (minutes) before the scheduler will scale us, 0 implies it's unknown
MIN_TIME = "min_time"
MIN_TIME_DEFAULT = 0

# When finding a suitable batch size, attempt to find one that is closest
# to the max train batch size given.
PREFER_LARGER_BATCH = 'prefer_larger_batch'
PREFER_LARGER_BATCH_DEFAULT = True

# In order to reduce confusion, if elastic mode is enabled we
# require (via assert) that no batch info is set outside of the
# elastic config. You can turn off this assert via this config
# but keep in mind that all batch info defined outside the
# elastic mode *will be ignored*.
IGNORE_NON_ELASTIC_BATCH_INFO = 'ignore_non_elastic_batch_info'
IGNORE_NON_ELASTIC_BATCH_INFO_DEFAULT = False

# Version of elastic logic to use
VERSION = "version"
VERSION_DEFAULT = LATEST_ELASTICITY_VERSION

# Minimum deepspeed version to use elasticity
MINIMUM_DEEPSPEED_VERSION = "0.3.8"

# Environment variable storing elastic config from resource scheduler
DEEPSPEED_ELASTICITY_CONFIG = "DEEPSPEED_ELASTICITY_CONFIG"