Mirror of https://github.com/deepspeedai/DeepSpeed.git, synced 2025-10-20 23:53:48 +08:00
Reduce Unit Test Times (Part 3) (#3850)
* add coverage report
* define env vars in shared action
* reduce time for longest running tests
* fix broken shared action
* reduce test time
* reducing Pipeline test times
* further reducing test times
* rework Z3 test
* testing new mp.pool and persistent dist envs
* fix import
* reuse distributed environment for tests with lots of param combos
* fix for dist teardown
* fix pickling issue with pool cache
* actually fix pickling problem
* avoid running pool cache stuff on non-distributed tests
* fix issues with nested mp.pool
* fix for nested pools in Pipeline Engine
* re-add params
* update workflows with pytest opts
* implement feedback
* resolve race condition with port selection
* Update tests/unit/common.py

Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com>
.github/workflows/amd-mi100.yml (8 changes)

@@ -23,7 +23,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.1.1
+          pip install --cache-dir $TORCH_CACHE torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.1.1
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
@@ -50,7 +50,7 @@ jobs:
       # Runs a set of commands using the runners shell
       - name: Unit tests
         run: |
-          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
+          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          TORCH_EXTENSIONS_DIR=./torch-extensions pytest -n 4 --verbose unit/
-          TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'sequential' unit/
+          pytest $PYTEST_OPTS -n 4 --verbose unit/
+          pytest $PYTEST_OPTS -m 'sequential' unit/

.github/workflows/amd-mi200.yml (8 changes)

@@ -23,7 +23,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install -U --cache-dir /blob/torch_cache torch torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.4.2
+          pip install -U --cache-dir $TORCH_CACHE torch torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.4.2
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
@@ -58,7 +58,7 @@ jobs:
       # Runs a set of commands using the runners shell
       - name: Unit tests
         run: |
-          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
+          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          TORCH_EXTENSIONS_DIR=./torch-extensions pytest -n 4 --verbose unit/
-          TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'sequential' unit/
+          pytest $PYTEST_OPTS -n 4 --verbose unit/
+          pytest $PYTEST_OPTS -m 'sequential' unit/

.github/workflows/cpu-inference.yml (5 changes)

@@ -75,7 +75,6 @@ jobs:
         run: |
           source oneCCL/build/_install/env/setvars.sh
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
-          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
           cd tests
-          TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference' unit/inference/test_inference_config.py
-          TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -k TestDistAllReduce unit/comm/test_dist.py
+          TRANSFORMERS_CACHE=~/tmp/transformers_cache/ pytest $PYTEST_OPTS -m 'inference' unit/inference/test_inference_config.py
+          TRANSFORMERS_CACHE=~/tmp/transformers_cache/ pytest $PYTEST_OPTS -k TestDistAllReduce unit/comm/test_dist.py

.github/workflows/nv-accelerate-v100.yml (6 changes)

@@ -26,7 +26,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install -U --cache-dir /blob/torch_cache torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111
+          pip install -U --cache-dir $TORCH_CACHE torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
@@ -41,7 +41,7 @@ jobs:
 
       - name: HF Accelerate tests
         run: |
-          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
+          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           git clone https://github.com/huggingface/accelerate
           cd accelerate
           git rev-parse --short HEAD
@@ -52,4 +52,4 @@ jobs:
           # tmp fix: force newer datasets version
           #pip install "datasets>=2.0.0"
           pip list
-          HF_DATASETS_CACHE=/blob/datasets_cache/ TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose tests/deepspeed
+          pytest $PYTEST_OPTS --color=yes --durations=0 --verbose tests/deepspeed

.github/workflows/nv-h100.yml (5 changes)

@@ -46,7 +46,6 @@ jobs:
       - name: Unit tests
         run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
-          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
           cd tests
-          TORCH_EXTENSIONS_DIR=./torch-extensions python -m pytest -n 4 unit/ --torch_ver="2.0" --cuda_ver="12"
-          TORCH_EXTENSIONS_DIR=./torch-extensions python -m pytest -m 'sequential' unit/ --torch_ver="2.0" --cuda_ver="12"
+          python -m pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.0" --cuda_ver="12"
+          python -m pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.0" --cuda_ver="12"

.github/workflows/nv-inference.yml (15 changes)

@@ -26,7 +26,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install -U --cache-dir /blob/torch_cache torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+          pip install -U --cache-dir $TORCH_CACHE torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
@@ -49,8 +49,13 @@ jobs:
       - name: Unit tests
         run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
-          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
           cd tests
-          TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' unit/ --torch_ver="1.13" --cuda_ver="11.6"
-          TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference_ops' unit/ --torch_ver="1.13" --cuda_ver="11.6"
-          TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --forked -n 4 -m 'inference' unit/ --torch_ver="1.13" --cuda_ver="11.6"
+          coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS -m 'seq_inference' unit/ --torch_ver="1.13" --cuda_ver="11.6"
+          coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS -m 'inference_ops' unit/ --torch_ver="1.13" --cuda_ver="11.6"
+          coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS --forked -n 4 -m 'inference' unit/ --torch_ver="1.13" --cuda_ver="11.6"
 
+      - name: Coverage report
+        run: |
+          cd tests
+          coverage combine
+          coverage report -m

.github/workflows/nv-lightning-v100.yml (6 changes)

@@ -26,7 +26,7 @@ jobs:
 
      - name: Install pytorch
        run: |
-          pip install -U --cache-dir /blob/torch_cache torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+          pip install -U --cache-dir $TORCH_CACHE torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
@@ -41,8 +41,8 @@ jobs:
 
      - name: PyTorch Lightning Tests
        run: |
-          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
+          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           pip install pytorch-lightning
           pip install "protobuf<4.21.0"
           cd tests
-          TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose lightning/
+          pytest $PYTEST_OPTS lightning/

.github/workflows/nv-megatron.yml (5 changes)

@@ -26,7 +26,7 @@ jobs:
 
      - name: Install pytorch
        run: |
-          pip install -U --cache-dir /blob/torch_cache torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+          pip install -U --cache-dir $TORCH_CACHE torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
@@ -57,6 +57,5 @@ jobs:
           cd Megatron-DeepSpeed
           pip install .
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
-          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
           cd tests
-          MEGATRON_CKPT_DIR=/blob/megatron_ckpt/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose ./
+          pytest $PYTEST_OPTS ./

.github/workflows/nv-mii.yml (5 changes)

@@ -26,7 +26,7 @@ jobs:
 
      - name: Install pytorch
        run: |
-          pip3 install -U --cache-dir /blob/torch_cache torch
+          pip3 install -U --cache-dir $TORCH_CACHE torch
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
@@ -54,6 +54,5 @@ jobs:
           cd DeepSpeed-MII
           pip install .[dev]
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
-          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
           cd tests
-          TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m "deepspeed" ./
+          pytest $PYTEST_OPTS --forked -m "deepspeed" ./

.github/workflows/nv-nightly.yml (5 changes)

@@ -20,7 +20,7 @@ jobs:
 
      - name: Install pytorch
        run: |
-          pip install -U --cache-dir /blob/torch_cache torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+          pip install -U --cache-dir $TORCH_CACHE torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
@@ -45,6 +45,5 @@ jobs:
      - name: Unit tests
        run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
-          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
           cd tests
-          TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --forked -m 'nightly' unit/ --torch_ver="1.13" --cuda_ver="11.6"
+          pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="1.13" --cuda_ver="11.6"

.github/workflows/nv-torch-latest-cpu.yml (5 changes)

@@ -42,7 +42,6 @@ jobs:
      - name: Unit tests
        run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
-          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
           cd tests
-          TORCH_EXTENSIONS_DIR=./torch-extensions pytest -n 4 unit/ --torch_ver="1.12"
-          TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'sequential' unit/ --torch_ver="1.12"
+          TRANSFORMERS_CACHE=/tmp/transformers_cache/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="1.12"
+          TRANSFORMERS_CACHE=/tmp/transformers_cache/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="1.12"

.github/workflows/nv-torch-latest-v100.yml (13 changes)

@@ -26,7 +26,7 @@ jobs:
 
      - name: Install pytorch
        run: |
-          pip install -U --cache-dir /blob/torch_cache torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+          pip install -U --cache-dir $TORCH_CACHE torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
@@ -51,7 +51,12 @@ jobs:
      - name: Unit tests
        run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
-          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
           cd tests
-          TORCH_EXTENSIONS_DIR=./torch-extensions pytest --forked -n 4 unit/ --torch_ver="2.0" --cuda_ver="11.7"
-          TORCH_EXTENSIONS_DIR=./torch-extensions pytest --forked -m 'sequential' unit/ --torch_ver="2.0" --cuda_ver="11.7"
+          coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.0" --cuda_ver="11.7"
+          coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.0" --cuda_ver="11.7"
 
+      - name: Coverage report
+        run: |
+          cd tests
+          coverage combine
+          coverage report -m

.github/workflows/nv-torch-nightly-v100.yml (5 changes)

@@ -45,7 +45,6 @@ jobs:
      - name: Unit tests
        run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
-          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
           cd tests
-          TORCH_EXTENSIONS_DIR=./torch-extensions pytest --forked -n 4 unit/
-          TORCH_EXTENSIONS_DIR=./torch-extensions pytest --forked -m 'sequential' unit/
+          pytest $PYTEST_OPTS --forked -n 4 unit/
+          pytest $PYTEST_OPTS --forked -m 'sequential' unit/

.github/workflows/nv-torch19-p40.yml (6 changes)

@@ -20,7 +20,7 @@ jobs:
 
      - name: Install pytorch
        run: |
-          pip install -U --cache-dir /blob/torch_cache torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html
+          pip install -U --cache-dir $TORCH_CACHE torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
@@ -44,6 +44,6 @@ jobs:
 
      - name: Unit tests
        run: |
-          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
+          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          TORCH_EXTENSIONS_DIR=./torch-extensions pytest --forked -n 4 unit/ --torch_ver="1.9" --cuda_ver="11.1"
+          pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="1.9" --cuda_ver="11.1"

.github/workflows/nv-torch19-v100.yml (7 changes)

@@ -20,7 +20,7 @@ jobs:
 
      - name: Install pytorch
        run: |
-          pip install -U --cache-dir /blob/torch_cache torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html
+          pip install -U --cache-dir $TORCH_CACHE torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
@@ -45,7 +45,6 @@ jobs:
      - name: Unit tests
        run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
-          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
           cd tests
-          TORCH_EXTENSIONS_DIR=./torch-extensions pytest --forked -n 4 unit/ --torch_ver="1.9" --cuda_ver="11"
-          TORCH_EXTENSIONS_DIR=./torch-extensions pytest --forked -m 'sequential' unit/ --torch_ver="1.9" --cuda_ver="11"
+          pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="1.9" --cuda_ver="11"
+          pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="1.9" --cuda_ver="11"

.github/workflows/nv-transformers-v100.yml (6 changes)

@@ -27,7 +27,7 @@ jobs:
      - name: Install pytorch
        run: |
           # use the same pytorch version as transformers CI
-          pip install -U --cache-dir /blob/torch_cache torch torchvision torchaudio -f https://download.pytorch.org/whl/torch_stable.html
+          pip install -U --cache-dir $TORCH_CACHE torch torchvision torchaudio -f https://download.pytorch.org/whl/torch_stable.html
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
@@ -42,7 +42,7 @@ jobs:
 
      - name: HF transformers tests
        run: |
-          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
+          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           git clone https://github.com/huggingface/transformers
           cd transformers
           # if needed switch to the last known good SHA until transformers@master is fixed
@@ -57,4 +57,4 @@ jobs:
           # force protobuf version due to issues
           pip install "protobuf<4.21.0"
           pip list
-          HF_DATASETS_CACHE=/blob/datasets_cache/ TRANSFORMERS_CACHE=/blob/transformers_cache/ WANDB_DISABLED=true TORCH_EXTENSIONS_DIR=./torch-extensions RUN_SLOW=1 pytest --color=yes --durations=0 --verbose tests/deepspeed
+          WANDB_DISABLED=true RUN_SLOW=1 pytest $PYTEST_OPTS tests/deepspeed

.github/workflows/setup-venv/action.yml (10 changes)

@@ -18,6 +18,16 @@ runs:
       pip install wheel # required after pip>=23.1
       echo PATH=$PATH >> $GITHUB_ENV # Make it so venv is inherited for other steps
     shell: bash
+  - id: set-env-vars
+    run: |
+      echo TEST_DATA_DIR=/blob/ >> $GITHUB_ENV
+      echo TRANSFORMERS_CACHE=/blob/transformers_cache/ >> $GITHUB_ENV
+      echo TORCH_EXTENSIONS_DIR=./torch-extensions/ >> $GITHUB_ENV
+      echo TORCH_CACHE=/blob/torch_cache/ >> $GITHUB_ENV
+      echo HF_DATASETS_CACHE=/blob/datasets_cache/ >> $GITHUB_ENV
+      echo MEGATRON_CKPT_DIR=/blob/megatron_ckpt/ >> $GITHUB_ENV
+      echo PYTEST_OPTS="--color=yes --durations=0 --verbose -rF" >> $GITHUB_ENV
+    shell: bash
   - id: print-env
     run: |
       which python

@@ -1,4 +1,5 @@
 clang-format==16.0.2
+coverage
 docutils<0.18
 future
 importlib-metadata>=4

tests/.coveragerc (new file, 5 lines)

@@ -0,0 +1,5 @@
+# .coveragerc to control coverage.py
+[run]
+parallel = True
+sigterm = True
+source = deepspeed

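With parallel = True each test process writes its own .coverage.* data file, and sigterm = True lets long-lived worker processes flush their data when they are terminated; the workflow steps above then merge and print them with coverage combine / coverage report. As a minimal sketch, the same combine-and-report step could also be driven from Python's coverage API (assuming the coverage package is installed and the command runs from tests/ so this .coveragerc is picked up):

import coverage

cov = coverage.Coverage()      # reads the [run] settings from .coveragerc in the cwd
cov.combine()                  # merge the per-process .coverage.* data files
cov.report(show_missing=True)  # equivalent of `coverage report -m`
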
@@ -70,10 +70,18 @@ def pytest_runtest_call(item):
         item.runtest = lambda: True  # Dummy function so test is not run twice
 
 
+# We allow DistributedTest to reuse distributed environments. When the last
+# test for a class is run, we want to make sure those distributed environments
+# are destroyed.
+def pytest_runtest_teardown(item, nextitem):
+    if getattr(item.cls, "reuse_dist_env", False) and not nextitem:
+        dist_test_class = item.cls()
+        for num_procs, pool in dist_test_class._pool_cache.items():
+            dist_test_class._close_pool(pool, num_procs, force=True)
+
+
 @pytest.hookimpl(tryfirst=True)
 def pytest_fixture_setup(fixturedef, request):
     if getattr(fixturedef.func, "is_dist_fixture", False):
-        #for val in dir(request):
-        #    print(val.upper(), getattr(request, val), "\n")
         dist_fixture_class = fixturedef.func()
         dist_fixture_class(request)

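For reference, a test class opts in to the reused environments through the reuse_dist_env attribute that this hook checks. A minimal usage sketch (hypothetical class name; assumes it runs under this conftest with DistributedTest importable from the unit test package, as in the other test files):

from unit.common import DistributedTest


class TestManyParamCombos(DistributedTest):  # hypothetical example class
    world_size = 1
    reuse_dist_env = True  # keep the per-world-size worker pool alive across parametrizations;
                           # the teardown hook above closes it after the last test of the class

    def test(self):
        pass  # each parametrization reuses the same distributed workers
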
@@ -4,6 +4,7 @@
 # DeepSpeed Team
 
 import pytest
+import os
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -98,7 +99,12 @@ def cifar_trainset(fp16=False):
         dist.barrier()
     if local_rank != 0:
         dist.barrier()
-    trainset = torchvision.datasets.CIFAR10(root='/blob/cifar10-data', train=True, download=True, transform=transform)
+
+    data_root = os.getenv("TEST_DATA_DIR", "/tmp/")
+    trainset = torchvision.datasets.CIFAR10(root=os.path.join(data_root, "cifar10-data"),
+                                            train=True,
+                                            download=True,
+                                            transform=transform)
     if local_rank == 0:
         dist.barrier()
     return trainset
@@ -114,6 +120,18 @@ def train_cifar(model, config, num_steps=400, average_dp_losses=True, fp16=True,
     trainset = cifar_trainset(fp16=fp16)
     config['local_rank'] = dist.get_rank()
 
+    # deepspeed_io defaults to creating a dataloader that uses a
+    # multiprocessing pool. Our tests use pools and we cannot nest pools in
+    # python. Therefore we're injecting this kwarg to ensure that no pools
+    # are used in the dataloader.
+    old_method = deepspeed.runtime.engine.DeepSpeedEngine.deepspeed_io
+
+    def new_method(*args, **kwargs):
+        kwargs["num_local_io_workers"] = 0
+        return old_method(*args, **kwargs)
+
+    deepspeed.runtime.engine.DeepSpeedEngine.deepspeed_io = new_method
+
     engine, _, _, _ = deepspeed.initialize(config=config,
                                            model=model,
                                            model_parameters=[p for p in model.parameters()],

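The nested-pool restriction referenced in the new comment is a general Python multiprocessing rule rather than anything DeepSpeed-specific: pool workers are daemonic processes, and daemonic processes may not spawn children. A standalone sketch of the failure mode (no DeepSpeed involved):

import multiprocessing as mp


def inner(_):
    # Creating a second pool inside a pool worker fails, because the worker
    # is a daemonic process and may not have children of its own.
    with mp.Pool(2) as nested:
        return nested.map(abs, [-1, -2])


if __name__ == "__main__":
    with mp.Pool(2) as outer:
        try:
            outer.map(inner, [0])
        except AssertionError as e:
            print(e)  # "daemonic processes are not allowed to have children"
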
@@ -4,8 +4,11 @@
 # DeepSpeed Team
 
 import os
+import re
 import time
 import inspect
+import socket
+import subprocess
 from abc import ABC, abstractmethod
 from pathlib import Path
 
@@ -14,7 +17,6 @@ import torch.multiprocessing as mp
 import deepspeed
 from deepspeed.accelerator import get_accelerator
 import deepspeed.comm as dist
-from torch.multiprocessing import Process
 
 import pytest
 from _pytest.outcomes import Skipped
@@ -40,11 +42,10 @@ def get_xdist_worker_id():
 
 
 def get_master_port():
-    master_port = os.environ.get('DS_TEST_PORT', '29503')
-    xdist_worker_id = get_xdist_worker_id()
-    if xdist_worker_id is not None:
-        master_port = str(int(master_port) + xdist_worker_id)
-    return master_port
+    # Select a random open port
+    with socket.socket() as s:
+        s.bind(('', 0))
+        return str(s.getsockname()[1])
 
 
 def set_accelerator_visible():
@@ -54,7 +55,6 @@ def set_accelerator_visible():
     xdist_worker_id = 0
     if cuda_visible is None:
         # CUDA_VISIBLE_DEVICES is not set, discover it using accelerator specific command instead
-        import subprocess
         if get_accelerator().device_name() == 'cuda':
             if is_rocm_pytorch():
                 rocm_smi = subprocess.check_output(['rocm-smi', '--showid'])
@@ -64,7 +64,6 @@ def set_accelerator_visible():
             nvidia_smi = subprocess.check_output(['nvidia-smi', '--list-gpus'])
             num_accelerators = len(nvidia_smi.decode('utf-8').strip().split('\n'))
         elif get_accelerator().device_name() == 'xpu':
-            import re
             clinfo = subprocess.check_output(['clinfo'])
             lines = clinfo.decode('utf-8').strip().split('\n')
             num_accelerators = 0
@@ -100,6 +99,8 @@ class DistributedExec(ABC):
     init_distributed = True
     set_dist_env = True
     requires_cuda_env = True
+    reuse_dist_env = False
+    _pool_cache = {}
 
     @abstractmethod
     def run(self):
@@ -115,7 +116,6 @@ class DistributedExec(ABC):
             world_size = [world_size]
         for procs in world_size:
             self._launch_procs(procs)
-            time.sleep(0.5)
 
     def _get_fixture_kwargs(self, request, func):
         if not request:
@@ -132,60 +132,53 @@ class DistributedExec(ABC):
         return fixture_kwargs
 
     def _launch_procs(self, num_procs):
+        # Verify we have enough accelerator devices to run this test
         if get_accelerator().is_available() and get_accelerator().device_count() < num_procs:
             pytest.skip(
                 f"Skipping test because not enough GPUs are available: {num_procs} required, {get_accelerator().device_count()} available"
             )
+
+        # Set start method to `forkserver` (or `fork`)
         mp.set_start_method('forkserver', force=True)
-        skip_msg = mp.Queue()  # Allows forked processes to share pytest.skip reason
-        processes = []
-        for local_rank in range(num_procs):
-            p = Process(target=self._dist_init, args=(local_rank, num_procs, skip_msg))
-            p.start()
-            processes.append(p)
 
-        # Now loop and wait for a test to complete. The spin-wait here isn't a big
-        # deal because the number of processes will be O(#GPUs) << O(#CPUs).
-        any_done = False
-        start = time.time()
-        while (not any_done) and ((time.time() - start) < DEEPSPEED_TEST_TIMEOUT):
-            for p in processes:
-                if not p.is_alive():
-                    any_done = True
-                    break
-            time.sleep(.1)  # So we don't hog CPU
+        # Create process pool or use cached one
+        master_port = None
+        if self.reuse_dist_env:
+            if num_procs not in self._pool_cache:
+                self._pool_cache[num_procs] = mp.Pool(processes=num_procs)
+                master_port = get_master_port()
+            pool = self._pool_cache[num_procs]
+        else:
+            pool = mp.Pool(processes=num_procs)
+            master_port = get_master_port()
 
-        # If we hit the timeout, then presume a test is hanged
-        if not any_done:
-            for p in processes:
-                p.terminate()
+        # Run the test
+        args = [(local_rank, num_procs, master_port) for local_rank in range(num_procs)]
+        skip_msgs_async = pool.starmap_async(self._dist_run, args)
+
+        try:
+            skip_msgs = skip_msgs_async.get(DEEPSPEED_TEST_TIMEOUT)
+        except mp.TimeoutError:
+            # Shortcut to exit pytest in the case of a hanged test. This
+            # usually means an environment error and the rest of tests will
+            # hang (causing super long unit test runtimes)
             pytest.exit("Test hanged, exiting", returncode=0)
 
-        # Wait for all other processes to complete
-        for p in processes:
-            p.join(DEEPSPEED_UNIT_WORKER_TIMEOUT)
+        # Tear down distributed environment and close process pools
+        self._close_pool(pool, num_procs)
 
-        failed = [(rank, p) for rank, p in enumerate(processes) if p.exitcode != 0]
-        for rank, p in failed:
-            # If it still hasn't terminated, kill it because it hung.
-            if p.exitcode is None:
-                p.terminate()
-                pytest.fail(f'Worker {rank} hung.', pytrace=False)
-            if p.exitcode < 0:
-                pytest.fail(f'Worker {rank} killed by signal {-p.exitcode}', pytrace=False)
-            if p.exitcode > 0:
-                pytest.fail(f'Worker {rank} exited with code {p.exitcode}', pytrace=False)
+        # If we skipped a test, propagate that to this process
+        if any(skip_msgs):
+            assert len(set(skip_msgs)) == 1, "Multiple different skip messages received"
+            pytest.skip(skip_msgs[0])
 
-        if not skip_msg.empty():
-            # This assumed all skip messages are the same, it may be useful to
-            # add a check here to assert all exit messages are equal
-            pytest.skip(skip_msg.get())
-
-    def _dist_init(self, local_rank, num_procs, skip_msg):
+    def _dist_run(self, local_rank, num_procs, master_port):
+        skip_msg = ''
+        if not dist.is_initialized():
         """ Initialize deepspeed.comm and execute the user function. """
         if self.set_dist_env:
             os.environ['MASTER_ADDR'] = '127.0.0.1'
-            os.environ['MASTER_PORT'] = get_master_port()
+            os.environ['MASTER_PORT'] = str(master_port)
             os.environ['LOCAL_RANK'] = str(local_rank)
             # NOTE: unit tests don't support multi-node so local_rank == global rank
             os.environ['RANK'] = str(local_rank)
@@ -208,16 +201,23 @@ class DistributedExec(ABC):
             self.run(**self._fixture_kwargs)
         except BaseException as e:
             if isinstance(e, Skipped):
-                skip_msg.put(e.msg)
+                skip_msg = e.msg
             else:
                 raise e
 
-        if self.init_distributed or dist.is_initialized():
-            # make sure all ranks finish at the same time
+        return skip_msg
+
+    def _dist_destroy(self):
+        if (dist is not None) and dist.is_initialized():
             dist.barrier()
-            # tear down after test completes
             dist.destroy_process_group()
 
+    def _close_pool(self, pool, num_procs, force=False):
+        if force or not self.reuse_dist_env:
+            msg = pool.starmap(self._dist_destroy, [() for _ in range(num_procs)])
+            pool.close()
+            pool.join()
+
 
 class DistributedFixture(DistributedExec):
     """

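The core of this rework is replacing hand-managed Process objects with a multiprocessing pool plus a bounded wait. Stripped of the DeepSpeed specifics, the pattern looks roughly like this (hypothetical work function, literal timeout in place of DEEPSPEED_TEST_TIMEOUT):

import multiprocessing as mp


def work(rank, world_size):
    # stand-in for the per-rank test body; returns '' on success, a skip reason otherwise
    return "" if rank >= 0 else f"skip rank {rank}/{world_size}"


if __name__ == "__main__":
    num_procs = 2
    mp.set_start_method("forkserver", force=True)
    pool = mp.Pool(processes=num_procs)
    async_result = pool.starmap_async(work, [(r, num_procs) for r in range(num_procs)])
    try:
        results = async_result.get(60)  # bounded wait so a hung worker cannot stall the whole suite
    except mp.TimeoutError:
        results = None
        print("workers hung")
    pool.close()
    pool.join()
    print(results)
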
@@ -244,9 +244,7 @@ def run_backward(ds_config, seq_len, atol=1e-2, verbose=False):
     check_equal(base_grads, ds_grads, atol=atol, verbose=verbose)
 
 
-#test_backward[3-1024-120-16-24-True-True-0.05]
-#test_backward[3-1024-52-16-24-False-True-0.2]
-# 3-128-54-2-24-False-True-0.2
+# NOTE: Keep these different params as they have helped find divergence in behavior between AMD and NVIDIA.
 @pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16, atol',
                          [
                              (64,160,128,2,24,False,True, 0.2),
@@ -254,12 +252,6 @@ def run_backward(ds_config, seq_len, atol=1e-2, verbose=False):
                              (8,1600,128,25,3,True,True, 0.05),
                              (8,160,128,2,3,True,True, 0.1),
                              (8,1600,128,2,3,True,True, 0.05),
-                             #(3,1024,119,16,24,True,False, 0.05),
-                             #(3,1024,115,16,24,True,True, 0.05),
-                             #(1024,128,10,2,2,False,False, 0.1),
-                             #(3,1024,52,16,24,False,True, 0.2),
-                             #(3,128,51,2,24,False,False, 0.1),
-                             #(3,128,54,2,24,False,True, 0.2),
                          ])  # yapf: disable
 class TestCUDABackward(DistributedTest):
     world_size = 1
@@ -267,7 +259,7 @@ class TestCUDABackward(DistributedTest):
     #This is to flush denorms in forward pass. Please refer to https://github.com/pytorch/pytorch/blob/main/docs/source/notes/numerical_accuracy.rst#reduced-precision-fp16-and-bf16-gemms-and-convolutions-on-amd-instinct-mi200-devices
     os.environ['ROCBLAS_INTERNAL_FP16_ALT_IMPL'] = '1'
 
-    def test_backward(self, batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16, atol):
+    def test_backward(self, is_preln, use_fp16, batch_size, hidden_size, seq_len, heads, num_layers, atol):
         # Only run fp16 test cases on devices with FP16 capability.
         if not get_accelerator().is_fp16_supported() and (use_fp16 is True or is_preln is False):
             return
@@ -286,38 +278,3 @@ class TestCUDABackward(DistributedTest):
         ds_config.fp16 = use_fp16
 
         run_backward(ds_config, seq_len, atol=atol, verbose=True)
-
-
-# [
-#  (3,1024,128,16,24,True,False, 0.07),
-#  (3,1024,128,16,24,True,True, 0.05),
-#  (3,1024,128,16,24,False,False, 0.1),
-#  (3,1024,128,16,24,False,True, 0.2),
-# ])  # yapf: disable
-#def test_backward_stochastic(batch_size,
-#                             hidden_size,
-#                             seq_len,
-#                             heads,
-#                             num_layers,
-#                             is_preln,
-#                             use_fp16,
-#                             atol):
-#    # Only run fp16 test cases on devices with FP16 capability.
-#    if not get_accelerator().is_fp16_supported() and use_fp16 is True:
-#        return
-#
-#    ds_config = DeepSpeedTransformerConfig()
-#    ds_config.layer_id = None
-#    ds_config.batch_size = batch_size
-#    ds_config.hidden_size = hidden_size
-#    ds_config.intermediate_size = 4 * hidden_size
-#    ds_config.max_seq_length = seq_len
-#    ds_config.heads = heads
-#    ds_config.attn_dropout_ratio = 0.0
-#    ds_config.hidden_dropout_ratio = 0.0
-#    ds_config.num_hidden_layers = num_layers
-#    ds_config.pre_layer_norm = is_preln
-#    ds_config.initializer_range = 0.02
-#    ds_config.fp16 = use_fp16
-#    ds_config.stochastic_mode = True
-#
-#    run_backward(ds_config, atol=atol)

@@ -224,6 +224,7 @@ def run_forward(ds_config, seq_len, atol=1e-2, verbose=False, test_bsz=None):
                          ])  # yapf: disable
 class TestCUDAForward(DistributedTest):
     world_size = 1
+    reuse_dist_env = True
 
     def test_forward(self, batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16):
         # Only run fp16 test cases on devices with FP16 capability.

@@ -34,17 +34,7 @@ class TestCPUAdagrad(DistributedTest):
     init_distributed = False
     set_dist_env = False
 
-    @pytest.mark.parametrize('model_size',
-                             [
-                                 (64),
-                                 (22),
-                                 (55),
-                                 (127),
-                                 (1024),
-                                 (1048576),
-                                 (30000000),
-                             ])  # yapf: disable
-    def test_cpu_adagrad_opt(self, model_size):
+    def test_cpu_adagrad_opt(self, model_size=64):
         device = 'cpu'
         rng_state = torch.get_rng_state()
         param = torch.nn.Parameter(torch.randn(model_size, device=device))
@@ -65,14 +55,7 @@ class TestCPUAdagrad(DistributedTest):
 
         check_equal(param, param1, atol=1e-2, verbose=True)
 
-    @pytest.mark.parametrize('model_size,vocabulary_size,dim',
-                             [
-                                 (16 * 2, 16 * 4, 16),
-                                 (16 * 32, 16 * 256, 16),
-                                 (16 * 256, 16 * 16384, 16),
-                             ])  # yapf: disable
-    def test_cpu_adagrad_opt_sparse_embedding(self, model_size, vocabulary_size, dim):
+    def test_cpu_adagrad_opt_sparse_embedding(self, model_size=32, vocabulary_size=64, dim=16):
         device = 'cpu'
         rng_state = torch.get_rng_state()
 

@@ -36,6 +36,7 @@ adam_configs = [["AdamW", False, False, False, (FusedAdam, True)],
                 adam_configs)
 class TestAdamConfigs(DistributedTest):
     world_size = 1
+    reuse_dist_env = True
 
     def test(self,
              optimizer,

@@ -55,6 +55,7 @@ def _compare_optimizers(model_size, param1, optimizer1, param2, optimizer2):
                          ])  # yapf: disable
 class TestCPUAdam(DistributedTest):
     world_size = 1
+    reuse_dist_env = True
     requires_cuda_env = False
     if not get_accelerator().is_available():
         init_distributed = False

|
|||||||
@pytest.mark.parametrize("overlap_events", [True, False])
|
@pytest.mark.parametrize("overlap_events", [True, False])
|
||||||
class TestRead(DistributedTest):
|
class TestRead(DistributedTest):
|
||||||
world_size = 1
|
world_size = 1
|
||||||
|
reuse_dist_env = True
|
||||||
requires_cuda_env = False
|
requires_cuda_env = False
|
||||||
if not get_accelerator().is_available():
|
if not get_accelerator().is_available():
|
||||||
init_distributed = False
|
init_distributed = False
|
||||||
@ -148,6 +149,7 @@ class TestRead(DistributedTest):
|
|||||||
@pytest.mark.parametrize("overlap_events", [True, False])
|
@pytest.mark.parametrize("overlap_events", [True, False])
|
||||||
class TestWrite(DistributedTest):
|
class TestWrite(DistributedTest):
|
||||||
world_size = 1
|
world_size = 1
|
||||||
|
reuse_dist_env = True
|
||||||
requires_cuda_env = False
|
requires_cuda_env = False
|
||||||
if not get_accelerator().is_available():
|
if not get_accelerator().is_available():
|
||||||
init_distributed = False
|
init_distributed = False
|
||||||
|
@@ -8,7 +8,6 @@ import torch.nn as nn
 import deepspeed.comm as dist
 import deepspeed
 import pytest
-import copy
 import os
 import numpy as np
 
@@ -334,18 +333,10 @@ class TestOneBitAdamCheckpointing(DistributedTest):
 @pytest.mark.parametrize(
     "topo_config",
     [
-        {
-            "num_pp": 1,
-            "num_dp": 4
-        },
         {
             "num_pp": 2,
             "num_dp": 2
         },
-        {
-            "num_pp": 4,
-            "num_dp": 1
-        },
     ],
 )
 class TestOneBitAdamFP16Pipeline(DistributedTest):
@@ -353,8 +344,8 @@ class TestOneBitAdamFP16Pipeline(DistributedTest):
 
     def test(self, topo_config):
         config_dict = {
-            "train_batch_size": 16,
-            "train_micro_batch_size_per_gpu": 4,
+            "train_batch_size": 4,
+            "grandient_accumulation_steps": 1,
             "steps_per_print": 20,
             "optimizer": {
                 "type": "OneBitAdam",
@@ -384,20 +375,12 @@ class TestOneBitAdamFP16Pipeline(DistributedTest):
         }
 
         topo = PipeTopo(**topo_config)
-        steps = 500  # Must be >=100
+        steps = 100
 
-        # Allocate model for consistent initial weights.
-        init_net = AlexNetPipe()
-
-        test_net = copy.deepcopy(init_net)
+        # TODO: Add correctness tests/asserts comparing with baseline?
+        test_net = AlexNetPipe()
         test_model = PipelineModule(layers=test_net.to_layers(), topology=topo, loss_fn=nn.CrossEntropyLoss())
-
-        test_losses = train_cifar(
-            test_model,
-            config=config_dict,
-            num_steps=steps,
-            fp16=config_dict["fp16"]["enabled"],
-        )
+        test_losses = train_cifar(test_model, config=config_dict, num_steps=steps, fp16=config_dict['fp16']['enabled'])
 
 
 @pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=["fp32", "fp16"])
@@ -707,18 +690,10 @@ class TestZeroOneAdamCheckpointing(DistributedTest):
 @pytest.mark.parametrize(
     "topo_config",
     [
-        {
-            "num_pp": 1,
-            "num_dp": 4
-        },
         {
             "num_pp": 2,
             "num_dp": 2
         },
-        {
-            "num_pp": 4,
-            "num_dp": 1
-        },
     ],
 )
 class TestZeroOneAdamFP16Pipeline(DistributedTest):
@@ -726,8 +701,8 @@ class TestZeroOneAdamFP16Pipeline(DistributedTest):
 
     def test(self, topo_config):
         config_dict = {
-            "train_batch_size": 16,
-            "train_micro_batch_size_per_gpu": 4,
+            "train_batch_size": 4,
+            "grandient_accumulation_steps": 1,
             "steps_per_print": 20,
             "optimizer": {
                 "type": "ZeroOneAdam",
@@ -760,20 +735,12 @@ class TestZeroOneAdamFP16Pipeline(DistributedTest):
         }
 
         topo = PipeTopo(**topo_config)
-        steps = 500  # Must be >=100
+        steps = 100
 
-        # Allocate model for consistent initial weights.
-        init_net = AlexNetPipe()
-
-        test_net = copy.deepcopy(init_net)
+        # TODO: Add correctness tests/asserts comparing with baseline?
+        test_net = AlexNetPipe()
         test_model = PipelineModule(layers=test_net.to_layers(), topology=topo, loss_fn=nn.CrossEntropyLoss())
-
-        test_losses = train_cifar(
-            test_model,
-            config=config_dict,
-            num_steps=steps,
-            fp16=config_dict["fp16"]["enabled"],
-        )
+        test_losses = train_cifar(test_model, config=config_dict, num_steps=steps, fp16=config_dict['fp16']['enabled'])
 
 
 @pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=["fp32", "fp16"])
@@ -1109,18 +1076,10 @@ class TestOneBitLambCheckpointing(DistributedTest):
 @pytest.mark.parametrize(
     "topo_config",
     [
-        {
-            "num_pp": 1,
-            "num_dp": 4
-        },
         {
             "num_pp": 2,
             "num_dp": 2
         },
-        {
-            "num_pp": 4,
-            "num_dp": 1
-        },
     ],
 )
 class TestOneBitLambFP16Pipeline(DistributedTest):
@@ -1128,8 +1087,8 @@ class TestOneBitLambFP16Pipeline(DistributedTest):
 
     def test(self, topo_config):
         config_dict = {
-            "train_batch_size": 16,
-            "train_micro_batch_size_per_gpu": 4,
+            "train_batch_size": 4,
+            "grandient_accumulation_steps": 1,
             "steps_per_print": 20,
             "optimizer": {
                 "type": "OneBitLamb",
@@ -1159,20 +1118,12 @@ class TestOneBitLambFP16Pipeline(DistributedTest):
         }
 
         topo = PipeTopo(**topo_config)
-        steps = 500  # Must be >=100
+        steps = 100
 
-        # Allocate model for consistent initial weights.
-        init_net = AlexNetPipe()
-
-        test_net = copy.deepcopy(init_net)
+        # TODO: Add correctness tests/asserts comparing with baseline?
+        test_net = AlexNetPipe()
         test_model = PipelineModule(layers=test_net.to_layers(), topology=topo, loss_fn=nn.CrossEntropyLoss())
-
-        test_losses = train_cifar(
-            test_model,
-            config=config_dict,
-            num_steps=steps,
-            fp16=config_dict["fp16"]["enabled"],
-        )
+        test_losses = train_cifar(test_model, config=config_dict, num_steps=steps, fp16=config_dict['fp16']['enabled'])
 
 
 @pytest.mark.sequential

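The batch-size reductions above follow DeepSpeed's batch-size convention, in which the effective train_batch_size is the product of the per-GPU micro-batch size, the gradient accumulation steps, and the data-parallel degree; shrinking any factor shortens these pipeline tests. A small illustrative check (generic numbers chosen for the example, not tied to a specific topology in these tests):

def effective_train_batch_size(micro_batch_per_gpu: int, grad_accum_steps: int, dp_world_size: int) -> int:
    # DeepSpeed convention: train_batch_size = micro_batch_per_gpu * grad_accum_steps * dp_world_size
    return micro_batch_per_gpu * grad_accum_steps * dp_world_size


assert effective_train_batch_size(4, 1, 4) == 16  # e.g. a larger setting like the old configs
assert effective_train_batch_size(2, 1, 2) == 4   # e.g. a smaller, faster setting
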
@@ -319,7 +319,7 @@ class TestAdamFP16ZeroOneCycleCompatibility(DistributedTest):
 
         model = SimpleModel(hidden_dim)
         model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters())
-        data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device)
+        data_loader = random_dataloader(model=model, total_samples=10, hidden_dim=hidden_dim, device=model.device)
         for n, batch in enumerate(data_loader):
             loss = model(batch[0], batch[1])
             model.backward(loss)
@@ -328,11 +328,10 @@ class TestAdamFP16ZeroOneCycleCompatibility(DistributedTest):
 
 @pytest.mark.parametrize("zero_stage", [1, 2, 3])
 @pytest.mark.parametrize("use_cpu_offload", [True, False])
-@pytest.mark.parametrize("hidden_dim", [9, 10])
 class TestZeroStaticScale(DistributedTest):
     world_size = 1
 
-    def test(self, zero_stage, use_cpu_offload, hidden_dim):
+    def test(self, zero_stage, use_cpu_offload, hidden_dim=4):
         if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
             pytest.skip("cpu-adam is not compatible")
 

@@ -42,8 +42,8 @@ class TestPipeCifar10(DistributedTest):
         skip_on_arch(min_arch=7)
 
         config_dict = {
-            "train_batch_size": 16,
-            "train_micro_batch_size_per_gpu": 4,
+            "train_batch_size": 4,
+            "grandient_accumulation_steps": 1,
             "steps_per_print": 20,
             "optimizer": {
                 "type": "Adam",
@@ -67,7 +67,7 @@ class TestPipeCifar10(DistributedTest):
         }
 
         topo = PipeTopo(**topo_config)
-        steps = 500  # must be >=100
+        steps = 100  # must be >=100
 
         # Allocate model for consistent initial weights.
         init_net = AlexNetPipe()

@ -42,6 +42,7 @@ class TestDataLoaderDropLast(DistributedTest):
|
|||||||
model=model,
|
model=model,
|
||||||
training_data=train_dataset,
|
training_data=train_dataset,
|
||||||
optimizer=optimizer)
|
optimizer=optimizer)
|
||||||
|
training_dataloader.num_local_io_workers = 0 # We can't do nested mp.pool
|
||||||
for n, batch in enumerate(training_dataloader):
|
for n, batch in enumerate(training_dataloader):
|
||||||
x = batch[0].to(get_accelerator().current_device_name())
|
x = batch[0].to(get_accelerator().current_device_name())
|
||||||
y = batch[1].to(get_accelerator().current_device_name())
|
y = batch[1].to(get_accelerator().current_device_name())
|
||||||
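Editor's note: the added line training_dataloader.num_local_io_workers = 0 works around the fact that a process spawned by a multiprocessing pool cannot spawn worker processes of its own. A minimal sketch of that limitation, assuming only the Python standard library (names are illustrative):

    import multiprocessing as mp


    def inner(_):
        return 1


    def outer(_):
        try:
            # Pool workers are daemonic; a daemonic process may not spawn
            # children, so this nested pool fails to start.
            with mp.Pool(2) as nested:
                return nested.map(inner, range(2))
        except Exception as err:  # "daemonic processes are not allowed to have children"
            return repr(err)


    if __name__ == "__main__":
        with mp.Pool(2) as pool:
            print(pool.map(outer, range(2)))

Setting the DataLoader's worker count to zero keeps data loading inside the already-pooled test process, which sidesteps exactly this failure.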
|
@ -117,6 +117,7 @@ class TestConfigOptimizer(DistributedTest):
|
|||||||
@pytest.mark.parametrize('grad_accum_dtype', [None, 'fp16', 'bf16', 'fp32'])
|
@pytest.mark.parametrize('grad_accum_dtype', [None, 'fp16', 'bf16', 'fp32'])
|
||||||
class TestOptimizerImplementation(DistributedTest):
|
class TestOptimizerImplementation(DistributedTest):
|
||||||
world_size = 1
|
world_size = 1
|
||||||
|
reuse_dist_env = True
|
||||||
|
|
||||||
def test(self, optimizer_extension, model_dtype, grad_accum_dtype):
|
def test(self, optimizer_extension, model_dtype, grad_accum_dtype):
|
||||||
if optimizer_extension == 'zero1':
|
if optimizer_extension == 'zero1':
|
||||||
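Editor's note: reuse_dist_env = True asks the test harness to keep the distributed environment alive across this class's many parameter combinations instead of re-initializing it per case. A hedged sketch of the general idea, expressed as a session-scoped pytest fixture with invented names (not DeepSpeed's DistributedTest internals; backend and port are arbitrary choices for the sketch):

    import pytest
    import torch.distributed as dist


    @pytest.fixture(scope="session")
    def shared_dist_env():
        # Pay the (slow) process-group setup once per pytest session.
        if not dist.is_initialized():
            dist.init_process_group(backend="gloo",
                                    init_method="tcp://127.0.0.1:29512",
                                    rank=0,
                                    world_size=1)
        yield
        dist.destroy_process_group()  # torn down once, after the last test


    @pytest.mark.parametrize("hidden_dim", [8, 16, 32])
    def test_reuses_env(shared_dist_env, hidden_dim):
        assert dist.get_world_size() == 1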
@ -125,9 +126,9 @@ class TestOptimizerImplementation(DistributedTest):
|
|||||||
zero_stage = 2
|
zero_stage = 2
|
||||||
else:
|
else:
|
||||||
zero_stage = 0
|
zero_stage = 0
|
||||||
amp = True if optimizer_extension == 'amp' else False
|
amp = (optimizer_extension == 'amp')
|
||||||
fp16 = True if model_dtype == 'fp16' else False
|
fp16 = (model_dtype == 'fp16')
|
||||||
bf16 = True if model_dtype == 'bf16' else False
|
bf16 = (model_dtype == 'bf16')
|
||||||
# Skip checks
|
# Skip checks
|
||||||
if bf16 and not bf16_required_version_check():
|
if bf16 and not bf16_required_version_check():
|
||||||
pytest.skip(
|
pytest.skip(
|
||||||
|
@ -52,7 +52,7 @@ def dump_state_dict(model):
|
|||||||
print(f"{name} {param.data}")
|
print(f"{name} {param.data}")
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('zero_stage', [1, 2, 3])
|
@pytest.mark.parametrize("zero_stage", [1, 2, 3])
|
||||||
class TestZeroUnbalancedGradients(DistributedTest):
|
class TestZeroUnbalancedGradients(DistributedTest):
|
||||||
world_size = 1
|
world_size = 1
|
||||||
|
|
||||||
@ -73,7 +73,7 @@ class TestZeroUnbalancedGradients(DistributedTest):
|
|||||||
"fp16": {
|
"fp16": {
|
||||||
"enabled": True,
|
"enabled": True,
|
||||||
"initial_scale_power": 8
|
"initial_scale_power": 8
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
hidden_dim = 4
|
hidden_dim = 4
|
||||||
|
|
||||||
@ -96,7 +96,7 @@ class TestZero3RepeatForwardLoop(DistributedTest):
|
|||||||
"steps_per_print": 1,
|
"steps_per_print": 1,
|
||||||
"zero_optimization": {
|
"zero_optimization": {
|
||||||
"stage": zero_stage,
|
"stage": zero_stage,
|
||||||
"stage3_param_persistence_threshold": 0
|
"stage3_param_persistence_threshold": 0,
|
||||||
},
|
},
|
||||||
"optimizer": {
|
"optimizer": {
|
||||||
"type": "Adam",
|
"type": "Adam",
|
||||||
@ -107,7 +107,7 @@ class TestZero3RepeatForwardLoop(DistributedTest):
|
|||||||
"fp16": {
|
"fp16": {
|
||||||
"enabled": True,
|
"enabled": True,
|
||||||
"initial_scale_power": 8
|
"initial_scale_power": 8
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
hidden_dim = 4
|
hidden_dim = 4
|
||||||
|
|
||||||
@ -137,8 +137,8 @@ class TestZero3RepeatForwardLoop(DistributedTest):
|
|||||||
|
|
||||||
# testing the fix https://github.com/microsoft/DeepSpeed/pull/1227
|
# testing the fix https://github.com/microsoft/DeepSpeed/pull/1227
|
||||||
# also reproduces the https://github.com/microsoft/DeepSpeed/pull/1372
|
# also reproduces the https://github.com/microsoft/DeepSpeed/pull/1372
|
||||||
@pytest.mark.parametrize('zero_stage', [2, 3])
|
@pytest.mark.parametrize("zero_stage", [2, 3])
|
||||||
@pytest.mark.parametrize('freeze_params', [True, False])
|
@pytest.mark.parametrize("freeze_params", [True, False])
|
||||||
class TestZeroToFP32(DistributedTest):
|
class TestZeroToFP32(DistributedTest):
|
||||||
world_size = 2
|
world_size = 2
|
||||||
|
|
||||||
@ -151,7 +151,7 @@ class TestZeroToFP32(DistributedTest):
|
|||||||
"steps_per_print": 1,
|
"steps_per_print": 1,
|
||||||
"zero_optimization": {
|
"zero_optimization": {
|
||||||
"stage": zero_stage,
|
"stage": zero_stage,
|
||||||
"stage3_param_persistence_threshold": 0
|
"stage3_param_persistence_threshold": 0,
|
||||||
},
|
},
|
||||||
"optimizer": {
|
"optimizer": {
|
||||||
"type": "Adam",
|
"type": "Adam",
|
||||||
@ -162,7 +162,7 @@ class TestZeroToFP32(DistributedTest):
|
|||||||
"fp16": {
|
"fp16": {
|
||||||
"enabled": True,
|
"enabled": True,
|
||||||
"initial_scale_power": 8
|
"initial_scale_power": 8
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
class MyModel(torch.nn.Module):
|
class MyModel(torch.nn.Module):
|
||||||
@ -245,7 +245,7 @@ class TestZeroToFP32(DistributedTest):
|
|||||||
"zero_allow_untested_optimizer": 1,
|
"zero_allow_untested_optimizer": 1,
|
||||||
"zero_optimization": {
|
"zero_optimization": {
|
||||||
"stage": zero_stage,
|
"stage": zero_stage,
|
||||||
"stage3_param_persistence_threshold": 0
|
"stage3_param_persistence_threshold": 0,
|
||||||
},
|
},
|
||||||
"optimizer": {
|
"optimizer": {
|
||||||
"type": "Adam",
|
"type": "Adam",
|
||||||
@ -256,7 +256,7 @@ class TestZeroToFP32(DistributedTest):
|
|||||||
"fp16": {
|
"fp16": {
|
||||||
"enabled": True,
|
"enabled": True,
|
||||||
"initial_scale_power": 8
|
"initial_scale_power": 8
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
class MyModel(torch.nn.Module):
|
class MyModel(torch.nn.Module):
|
||||||
@ -293,10 +293,12 @@ class TestZeroToFP32(DistributedTest):
|
|||||||
]
|
]
|
||||||
optim = torch.optim.SGD(optim_groups, lr=0.1)
|
optim = torch.optim.SGD(optim_groups, lr=0.1)
|
||||||
|
|
||||||
model, _, _, _ = deepspeed.initialize(model=model,
|
model, _, _, _ = deepspeed.initialize(
|
||||||
|
model=model,
|
||||||
model_parameters=model.parameters(),
|
model_parameters=model.parameters(),
|
||||||
optimizer=optim,
|
optimizer=optim,
|
||||||
config=config_dict)
|
config=config_dict,
|
||||||
|
)
|
||||||
model.empty_partition_cache()
|
model.empty_partition_cache()
|
||||||
|
|
||||||
data_loader = random_dataloader(model=model, total_samples=16, hidden_dim=hidden_dim, device=model.device)
|
data_loader = random_dataloader(model=model, total_samples=16, hidden_dim=hidden_dim, device=model.device)
|
||||||
@ -349,7 +351,7 @@ class TestIncorectAllgatherBucketSize(DistributedTest):
|
|||||||
"steps_per_print": 1,
|
"steps_per_print": 1,
|
||||||
"zero_optimization": {
|
"zero_optimization": {
|
||||||
"stage": zero_stage,
|
"stage": zero_stage,
|
||||||
"allgather_bucket_size": allgather_bucket_size
|
"allgather_bucket_size": allgather_bucket_size,
|
||||||
},
|
},
|
||||||
"optimizer": {
|
"optimizer": {
|
||||||
"type": "Adam",
|
"type": "Adam",
|
||||||
@ -360,7 +362,7 @@ class TestIncorectAllgatherBucketSize(DistributedTest):
|
|||||||
"fp16": {
|
"fp16": {
|
||||||
"enabled": True,
|
"enabled": True,
|
||||||
"initial_scale_power": 8
|
"initial_scale_power": 8
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
hidden_dim = 4
|
hidden_dim = 4
|
||||||
|
|
||||||
@ -372,7 +374,7 @@ class TestIncorectAllgatherBucketSize(DistributedTest):
|
|||||||
model, _, _, _ = deepspeed.initialize(config=config_dict,
|
model, _, _, _ = deepspeed.initialize(config=config_dict,
|
||||||
model=model,
|
model=model,
|
||||||
model_parameters=model.parameters())
|
model_parameters=model.parameters())
|
||||||
assert "allgather_bucket_size must be a multiple of nccl_start_alignment_factor" in str(assertinfo)
|
assert ("allgather_bucket_size must be a multiple of nccl_start_alignment_factor" in str(assertinfo))
|
||||||
|
|
||||||
|
|
||||||
class TestPartitionNcclAlignment(DistributedTest):
|
class TestPartitionNcclAlignment(DistributedTest):
|
||||||
@ -395,7 +397,7 @@ class TestPartitionNcclAlignment(DistributedTest):
|
|||||||
"fp16": {
|
"fp16": {
|
||||||
"enabled": True,
|
"enabled": True,
|
||||||
"initial_scale_power": 8
|
"initial_scale_power": 8
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
hidden_dim = 4
|
hidden_dim = 4
|
||||||
|
|
||||||
@ -405,7 +407,8 @@ class TestPartitionNcclAlignment(DistributedTest):
|
|||||||
# get nccl all-gather send buffers alignment factor
|
# get nccl all-gather send buffers alignment factor
|
||||||
nccl_start_alignment_factor = model.optimizer.nccl_start_alignment_factor
|
nccl_start_alignment_factor = model.optimizer.nccl_start_alignment_factor
|
||||||
|
|
||||||
parallel_partitioned_bit16_groups = model.optimizer.parallel_partitioned_bit16_groups if zero_stage == 2 else model.optimizer.parallel_partitioned_fp16_groups
|
parallel_partitioned_bit16_groups = (model.optimizer.parallel_partitioned_bit16_groups
|
||||||
|
if zero_stage == 2 else model.optimizer.parallel_partitioned_fp16_groups)
|
||||||
for data_parallel_partitions in parallel_partitioned_bit16_groups:
|
for data_parallel_partitions in parallel_partitioned_bit16_groups:
|
||||||
for partition_id, partitioned_data in enumerate(data_parallel_partitions):
|
for partition_id, partitioned_data in enumerate(data_parallel_partitions):
|
||||||
# verify that data partition start locations are 4-byte aligned
|
# verify that data partition start locations are 4-byte aligned
|
||||||
@ -458,9 +461,14 @@ class EltwiseMultiplicationTestNetwork_Dict(Module):
|
|||||||
self.loss = L1Loss(reduction="none")
|
self.loss = L1Loss(reduction="none")
|
||||||
|
|
||||||
def forward(self, x: Tensor, y: Tensor, use_module_trace: bool, param_prefetching: bool) -> Dict[str, Tensor]:
|
def forward(self, x: Tensor, y: Tensor, use_module_trace: bool, param_prefetching: bool) -> Dict[str, Tensor]:
|
||||||
_assert_partition_status(self,
|
_assert_partition_status(
|
||||||
{ZeroParamStatus.NOT_AVAILABLE, ZeroParamStatus.INFLIGHT, ZeroParamStatus.AVAILABLE}
|
self,
|
||||||
if use_module_trace else {ZeroParamStatus.NOT_AVAILABLE})
|
{
|
||||||
|
ZeroParamStatus.NOT_AVAILABLE,
|
||||||
|
ZeroParamStatus.INFLIGHT,
|
||||||
|
ZeroParamStatus.AVAILABLE,
|
||||||
|
} if use_module_trace else {ZeroParamStatus.NOT_AVAILABLE},
|
||||||
|
)
|
||||||
|
|
||||||
pre_layer_expected_states = {
|
pre_layer_expected_states = {
|
||||||
ZeroParamStatus.INFLIGHT if param_prefetching else ZeroParamStatus.NOT_AVAILABLE,
|
ZeroParamStatus.INFLIGHT if param_prefetching else ZeroParamStatus.NOT_AVAILABLE,
|
||||||
@ -485,9 +493,14 @@ class EltwiseMultiplicationTestNetwork_Dict(Module):
|
|||||||
|
|
||||||
loss = self.loss(y_hat, y)
|
loss = self.loss(y_hat, y)
|
||||||
|
|
||||||
_assert_partition_status(self,
|
_assert_partition_status(
|
||||||
{ZeroParamStatus.NOT_AVAILABLE, ZeroParamStatus.INFLIGHT, ZeroParamStatus.AVAILABLE}
|
self,
|
||||||
if use_module_trace else {ZeroParamStatus.NOT_AVAILABLE})
|
{
|
||||||
|
ZeroParamStatus.NOT_AVAILABLE,
|
||||||
|
ZeroParamStatus.INFLIGHT,
|
||||||
|
ZeroParamStatus.AVAILABLE,
|
||||||
|
} if use_module_trace else {ZeroParamStatus.NOT_AVAILABLE},
|
||||||
|
)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"hidden1": hidden1,
|
"hidden1": hidden1,
|
||||||
@ -512,10 +525,12 @@ class EltwiseMultiplicationTestNetwork_NamedTuple(EltwiseMultiplicationTestNetwo
|
|||||||
|
|
||||||
def forward(self, *args, **kwargs) -> EltwiseMultiplicationNamedTuple:
|
def forward(self, *args, **kwargs) -> EltwiseMultiplicationNamedTuple:
|
||||||
outputs_dicts = super().forward(*args, **kwargs)
|
outputs_dicts = super().forward(*args, **kwargs)
|
||||||
return EltwiseMultiplicationNamedTuple(hidden1=outputs_dicts['hidden1'],
|
return EltwiseMultiplicationNamedTuple(
|
||||||
hidden2=outputs_dicts['hidden2'],
|
hidden1=outputs_dicts["hidden1"],
|
||||||
y_hat=outputs_dicts['y_hat'],
|
hidden2=outputs_dicts["hidden2"],
|
||||||
loss=outputs_dicts['loss'])
|
y_hat=outputs_dicts["y_hat"],
|
||||||
|
loss=outputs_dicts["loss"],
|
||||||
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def to_dict(outputs: EltwiseMultiplicationNamedTuple) -> Dict[str, Tensor]:
|
def to_dict(outputs: EltwiseMultiplicationNamedTuple) -> Dict[str, Tensor]:
|
||||||
@ -527,18 +542,20 @@ class EltwiseMultiplicationTestNetwork_NamedTuple(EltwiseMultiplicationTestNetwo
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
EltwiseMultiplication_namedtuple = namedtuple('EltwiseMultiplication_namedtuple',
|
EltwiseMultiplication_namedtuple = namedtuple("EltwiseMultiplication_namedtuple",
|
||||||
['hidden1', 'hidden2', 'y_hat', 'loss'])
|
["hidden1", "hidden2", "y_hat", "loss"])
|
||||||
|
|
||||||
|
|
||||||
class EltwiseMultiplicationTestNetwork_namedtuple(EltwiseMultiplicationTestNetwork_Dict):
|
class EltwiseMultiplicationTestNetwork_namedtuple(EltwiseMultiplicationTestNetwork_Dict):
|
||||||
|
|
||||||
def forward(self, *args, **kwargs) -> EltwiseMultiplication_namedtuple:
|
def forward(self, *args, **kwargs) -> EltwiseMultiplication_namedtuple:
|
||||||
outputs_dicts = super().forward(*args, **kwargs)
|
outputs_dicts = super().forward(*args, **kwargs)
|
||||||
return EltwiseMultiplication_namedtuple(hidden1=outputs_dicts['hidden1'],
|
return EltwiseMultiplication_namedtuple(
|
||||||
hidden2=outputs_dicts['hidden2'],
|
hidden1=outputs_dicts["hidden1"],
|
||||||
y_hat=outputs_dicts['y_hat'],
|
hidden2=outputs_dicts["hidden2"],
|
||||||
loss=outputs_dicts['loss'])
|
y_hat=outputs_dicts["y_hat"],
|
||||||
|
loss=outputs_dicts["loss"],
|
||||||
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def to_dict(outputs: EltwiseMultiplicationNamedTuple) -> Dict[str, Tensor]:
|
def to_dict(outputs: EltwiseMultiplicationNamedTuple) -> Dict[str, Tensor]:
|
||||||
@ -554,7 +571,12 @@ class EltwiseMultiplicationTestNetwork_Tuple(EltwiseMultiplicationTestNetwork_Di
|
|||||||
|
|
||||||
def forward(self, *args, **kwargs) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
|
def forward(self, *args, **kwargs) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
|
||||||
outputs_dicts = super().forward(*args, **kwargs)
|
outputs_dicts = super().forward(*args, **kwargs)
|
||||||
return (outputs_dicts['hidden1'], outputs_dicts['hidden2'], outputs_dicts['y_hat'], outputs_dicts['loss'])
|
return (
|
||||||
|
outputs_dicts["hidden1"],
|
||||||
|
outputs_dicts["hidden2"],
|
||||||
|
outputs_dicts["y_hat"],
|
||||||
|
outputs_dicts["loss"],
|
||||||
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def to_dict(outputs: Tuple[Tensor, Tensor, Tensor, Tensor]) -> Dict[str, Tensor]:
|
def to_dict(outputs: Tuple[Tensor, Tensor, Tensor, Tensor]) -> Dict[str, Tensor]:
|
||||||
@ -570,7 +592,12 @@ class EltwiseMultiplicationTestNetwork_List(EltwiseMultiplicationTestNetwork_Dic
|
|||||||
|
|
||||||
def forward(self, *args, **kwargs) -> List[Tensor]:
|
def forward(self, *args, **kwargs) -> List[Tensor]:
|
||||||
outputs_dicts = super().forward(*args, **kwargs)
|
outputs_dicts = super().forward(*args, **kwargs)
|
||||||
return [outputs_dicts['hidden1'], outputs_dicts['hidden2'], outputs_dicts['y_hat'], outputs_dicts['loss']]
|
return [
|
||||||
|
outputs_dicts["hidden1"],
|
||||||
|
outputs_dicts["hidden2"],
|
||||||
|
outputs_dicts["y_hat"],
|
||||||
|
outputs_dicts["loss"],
|
||||||
|
]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def to_dict(outputs: List[Tensor]) -> Dict[str, Tensor]:
|
def to_dict(outputs: List[Tensor]) -> Dict[str, Tensor]:
|
||||||
@ -582,31 +609,55 @@ class EltwiseMultiplicationTestNetwork_List(EltwiseMultiplicationTestNetwork_Dic
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class TestZero3ParamPartitioningBase(DistributedTest):
|
||||||
|
world_size = 2
|
||||||
|
|
||||||
@pytest.mark.parametrize("param_persistence_threshold", [0, 10])
|
@pytest.mark.parametrize("param_persistence_threshold", [0, 10])
|
||||||
|
def test_param_persistence_threshold(self, param_persistence_threshold):
|
||||||
|
self._test(param_persistence_threshold=param_persistence_threshold)
|
||||||
|
|
||||||
@pytest.mark.parametrize("fp16_enabled", [True, False])
|
@pytest.mark.parametrize("fp16_enabled", [True, False])
|
||||||
|
def test_fp16_enabled(self, fp16_enabled):
|
||||||
|
self._test(fp16_enabled=fp16_enabled)
|
||||||
|
|
||||||
@pytest.mark.parametrize("contiguous_gradients", [True, False])
|
@pytest.mark.parametrize("contiguous_gradients", [True, False])
|
||||||
|
def test_contiguous_gradients(self, contiguous_gradients):
|
||||||
|
self._test(contiguous_gradients=contiguous_gradients)
|
||||||
|
|
||||||
@pytest.mark.parametrize("offload_optimizer", [True, False])
|
@pytest.mark.parametrize("offload_optimizer", [True, False])
|
||||||
|
def test_offload_optimizer(self, offload_optimizer):
|
||||||
|
self._test(offload_optimizer=offload_optimizer)
|
||||||
|
|
||||||
@pytest.mark.parametrize("zero_grad", [True, False])
|
@pytest.mark.parametrize("zero_grad", [True, False])
|
||||||
|
def test_zero_grad(self, zero_grad):
|
||||||
|
self._test(zero_grad=zero_grad)
|
||||||
|
|
||||||
@pytest.mark.parametrize("prefetching", [True, False])
|
@pytest.mark.parametrize("prefetching", [True, False])
|
||||||
|
def test_prefetching(self, prefetching):
|
||||||
|
self._test(prefetching=prefetching)
|
||||||
|
|
||||||
@pytest.mark.parametrize("reduce_scatter", [True, False])
|
@pytest.mark.parametrize("reduce_scatter", [True, False])
|
||||||
|
def test_reduce_scatter(self, reduce_scatter):
|
||||||
|
self._test(reduce_scatter=reduce_scatter)
|
||||||
|
|
||||||
@pytest.mark.parametrize("model_class", [
|
@pytest.mark.parametrize("model_class", [
|
||||||
EltwiseMultiplicationTestNetwork_Dict, EltwiseMultiplicationTestNetwork_NamedTuple,
|
EltwiseMultiplicationTestNetwork_Dict, EltwiseMultiplicationTestNetwork_NamedTuple,
|
||||||
EltwiseMultiplicationTestNetwork_namedtuple, EltwiseMultiplicationTestNetwork_Tuple,
|
EltwiseMultiplicationTestNetwork_namedtuple, EltwiseMultiplicationTestNetwork_Tuple,
|
||||||
EltwiseMultiplicationTestNetwork_List
|
EltwiseMultiplicationTestNetwork_List
|
||||||
])
|
])
|
||||||
class TestZero3ParamPartitioningBase(DistributedTest):
|
def test_model_class(self, model_class):
|
||||||
world_size = 2
|
self._test(model_class=model_class)
|
||||||
|
|
||||||
def test(
|
def _test(
|
||||||
self,
|
self,
|
||||||
param_persistence_threshold: int,
|
param_persistence_threshold: int = 0,
|
||||||
fp16_enabled: bool,
|
fp16_enabled: bool = False,
|
||||||
contiguous_gradients: bool,
|
contiguous_gradients: bool = False,
|
||||||
offload_optimizer: bool,
|
offload_optimizer: bool = False,
|
||||||
zero_grad: bool,
|
zero_grad: bool = False,
|
||||||
prefetching: bool,
|
prefetching: bool = False,
|
||||||
reduce_scatter: bool,
|
reduce_scatter: bool = False,
|
||||||
model_class: EltwiseMultiplicationTestNetwork_Dict,
|
model_class: EltwiseMultiplicationTestNetwork_Dict = EltwiseMultiplicationTestNetwork_Dict,
|
||||||
) -> None:
|
) -> None:
|
||||||
if offload_optimizer and not contiguous_gradients:
|
if offload_optimizer and not contiguous_gradients:
|
||||||
return
|
return
|
||||||
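Editor's note: the rewritten class above replaces a single test parametrized over every option with small per-option tests that delegate to a shared _test helper whose remaining arguments default to cheap values. A standalone sketch of that pattern, with hypothetical names:

    import pytest


    def _run_case(fp16_enabled: bool = False,
                  offload_optimizer: bool = False,
                  prefetching: bool = False) -> None:
        # Shared body with cheap defaults; a real test would build the ZeRO-3
        # config from these flags and train for a few iterations.
        assert isinstance(fp16_enabled, bool)


    @pytest.mark.parametrize("fp16_enabled", [True, False])
    def test_fp16_enabled(fp16_enabled):
        _run_case(fp16_enabled=fp16_enabled)


    @pytest.mark.parametrize("offload_optimizer", [True, False])
    def test_offload_optimizer(offload_optimizer):
        _run_case(offload_optimizer=offload_optimizer)

With seven binary options the full product is 2**7 = 128 cases, versus 2*7 = 14 when each option is exercised independently, which is the source of the time saving here.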
@ -624,18 +675,18 @@ class TestZero3ParamPartitioningBase(DistributedTest):
|
|||||||
"stage3_param_persistence_threshold": param_persistence_threshold,
|
"stage3_param_persistence_threshold": param_persistence_threshold,
|
||||||
"contiguous_gradients": contiguous_gradients,
|
"contiguous_gradients": contiguous_gradients,
|
||||||
"stage3_prefetch_bucket_size": prefetch_bucket_size if prefetching else 0,
|
"stage3_prefetch_bucket_size": prefetch_bucket_size if prefetching else 0,
|
||||||
"reduce_scatter": reduce_scatter
|
"reduce_scatter": reduce_scatter,
|
||||||
},
|
},
|
||||||
"optimizer": {
|
"optimizer": {
|
||||||
"type": "Adam",
|
"type": "Adam",
|
||||||
"params": {
|
"params": {
|
||||||
"lr": 1.
|
"lr": 1.0
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"fp16": {
|
"fp16": {
|
||||||
"enabled": fp16_enabled,
|
"enabled": fp16_enabled,
|
||||||
"loss_scale": 1.,
|
"loss_scale": 1.0,
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
if offload_optimizer:
|
if offload_optimizer:
|
||||||
@ -649,9 +700,11 @@ class TestZero3ParamPartitioningBase(DistributedTest):
|
|||||||
weight.ds_tensor.data = torch.full_like(weight.ds_tensor.data, (i + 1) * (1 + dist.get_rank()))
|
weight.ds_tensor.data = torch.full_like(weight.ds_tensor.data, (i + 1) * (1 + dist.get_rank()))
|
||||||
|
|
||||||
def create_tensor(vals, dtype: torch.dtype = None) -> Tensor:
|
def create_tensor(vals, dtype: torch.dtype = None) -> Tensor:
|
||||||
return torch.as_tensor(vals,
|
return torch.as_tensor(
|
||||||
|
vals,
|
||||||
dtype=dtype or (torch.float16 if fp16_enabled else torch.float32),
|
dtype=dtype or (torch.float16 if fp16_enabled else torch.float32),
|
||||||
device=ds_engine.device)
|
device=ds_engine.device,
|
||||||
|
)
|
||||||
|
|
||||||
expected_hidden1 = create_tensor([
|
expected_hidden1 = create_tensor([
|
||||||
[1, 1, 1, 1, 1],
|
[1, 1, 1, 1, 1],
|
||||||
@ -672,8 +725,16 @@ class TestZero3ParamPartitioningBase(DistributedTest):
|
|||||||
|
|
||||||
for train_iter in range(3):
|
for train_iter in range(3):
|
||||||
activations = ds_engine(
|
activations = ds_engine(
|
||||||
x=torch.ones((m, n), dtype=torch.float16 if fp16_enabled else torch.float32, device=ds_engine.device),
|
x=torch.ones(
|
||||||
y=torch.ones((m, n), dtype=torch.float16 if fp16_enabled else torch.float32, device=ds_engine.device),
|
(m, n),
|
||||||
|
dtype=torch.float16 if fp16_enabled else torch.float32,
|
||||||
|
device=ds_engine.device,
|
||||||
|
),
|
||||||
|
y=torch.ones(
|
||||||
|
(m, n),
|
||||||
|
dtype=torch.float16 if fp16_enabled else torch.float32,
|
||||||
|
device=ds_engine.device,
|
||||||
|
),
|
||||||
use_module_trace=train_iter > 0,
|
use_module_trace=train_iter > 0,
|
||||||
param_prefetching=prefetching and train_iter > 0,
|
param_prefetching=prefetching and train_iter > 0,
|
||||||
)
|
)
|
||||||
@ -708,21 +769,33 @@ class TestZero3ParamPartitioningBase(DistributedTest):
|
|||||||
|
|
||||||
grad_multiplier = 1 if zero_grad else (train_iter + 1)
|
grad_multiplier = 1 if zero_grad else (train_iter + 1)
|
||||||
if dist.get_rank() == 0:
|
if dist.get_rank() == 0:
|
||||||
assert torch.allclose(dloss_wrt_layer3.to(get_accelerator().device_name()),
|
assert torch.allclose(
|
||||||
grad_multiplier * create_tensor([2] * 8, torch.float))
|
dloss_wrt_layer3.to(get_accelerator().device_name()),
|
||||||
assert torch.allclose(dloss_wrt_layer2.to(get_accelerator().device_name()),
|
grad_multiplier * create_tensor([2] * 8, torch.float),
|
||||||
grad_multiplier * create_tensor([3 * 1] * 8, torch.float))
|
)
|
||||||
assert torch.allclose(dloss_wrt_layer1.to(get_accelerator().device_name()),
|
assert torch.allclose(
|
||||||
grad_multiplier * create_tensor([3 * 2 * 1] * 8, torch.float))
|
dloss_wrt_layer2.to(get_accelerator().device_name()),
|
||||||
|
grad_multiplier * create_tensor([3 * 1] * 8, torch.float),
|
||||||
|
)
|
||||||
|
assert torch.allclose(
|
||||||
|
dloss_wrt_layer1.to(get_accelerator().device_name()),
|
||||||
|
grad_multiplier * create_tensor([3 * 2 * 1] * 8, torch.float),
|
||||||
|
)
|
||||||
elif dist.get_rank() == 1:
|
elif dist.get_rank() == 1:
|
||||||
# parameters dont split evenly across ranks so rank 1 has a zero-padded
|
# parameters dont split evenly across ranks so rank 1 has a zero-padded
|
||||||
# partition
|
# partition
|
||||||
assert torch.allclose(dloss_wrt_layer3.to(get_accelerator().device_name()),
|
assert torch.allclose(
|
||||||
grad_multiplier * create_tensor(([8] * 7) + [0], torch.float))
|
dloss_wrt_layer3.to(get_accelerator().device_name()),
|
||||||
assert torch.allclose(dloss_wrt_layer2.to(get_accelerator().device_name()),
|
grad_multiplier * create_tensor(([8] * 7) + [0], torch.float),
|
||||||
grad_multiplier * create_tensor(([6 * 2] * 7) + [0], torch.float))
|
)
|
||||||
assert torch.allclose(dloss_wrt_layer1.to(get_accelerator().device_name()),
|
assert torch.allclose(
|
||||||
grad_multiplier * create_tensor(([6 * 4 * 1] * 7) + [0], torch.float))
|
dloss_wrt_layer2.to(get_accelerator().device_name()),
|
||||||
|
grad_multiplier * create_tensor(([6 * 2] * 7) + [0], torch.float),
|
||||||
|
)
|
||||||
|
assert torch.allclose(
|
||||||
|
dloss_wrt_layer1.to(get_accelerator().device_name()),
|
||||||
|
grad_multiplier * create_tensor(([6 * 4 * 1] * 7) + [0], torch.float),
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
raise RuntimeError("test has world size of two")
|
raise RuntimeError("test has world size of two")
|
||||||
|
|
||||||
@ -776,13 +849,13 @@ class TestZero3ParamPartitioningLargeParam(DistributedTest):
|
|||||||
"optimizer": {
|
"optimizer": {
|
||||||
"type": "Adam",
|
"type": "Adam",
|
||||||
"params": {
|
"params": {
|
||||||
"lr": 1.
|
"lr": 1.0
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"fp16": {
|
"fp16": {
|
||||||
"enabled": True,
|
"enabled": True,
|
||||||
"loss_scale": 1.,
|
"loss_scale": 1.0,
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
with deepspeed.zero.Init(mem_efficient_linear=False, enabled=init_context_manager):
|
with deepspeed.zero.Init(mem_efficient_linear=False, enabled=init_context_manager):
|
||||||
model = LargeParamModel()
|
model = LargeParamModel()
|
||||||
@ -794,26 +867,27 @@ class TestZero3ParamPartitioningLargeParam(DistributedTest):
|
|||||||
partition_sz = math.ceil(param_sz / self.world_size)
|
partition_sz = math.ceil(param_sz / self.world_size)
|
||||||
for rank_idx, start_idx in enumerate(range(0, param_sz, partition_sz)):
|
for rank_idx, start_idx in enumerate(range(0, param_sz, partition_sz)):
|
||||||
activation_from_partition = activation[start_idx:start_idx + partition_sz]
|
activation_from_partition = activation[start_idx:start_idx + partition_sz]
|
||||||
assert torch.allclose(activation_from_partition, torch.full_like(activation_from_partition, rank_idx))
|
assert torch.allclose(
|
||||||
|
activation_from_partition,
|
||||||
|
torch.full_like(activation_from_partition, rank_idx),
|
||||||
|
)
|
||||||
|
|
||||||
ds_engine.backward(activation.sum())
|
ds_engine.backward(activation.sum())
|
||||||
ds_engine.allreduce_gradients()
|
ds_engine.allreduce_gradients()
|
||||||
|
|
||||||
avgd_gradients = ds_engine.optimizer.averaged_gradients
|
avgd_gradients = ds_engine.optimizer.averaged_gradients
|
||||||
assert set(avgd_gradients.keys()) == {0}, "should only have one parameter group"
|
assert set(avgd_gradients.keys()) == {0}, "should only have one parameter group"
|
||||||
weight_gradient, = avgd_gradients[0]
|
(weight_gradient, ) = avgd_gradients[0]
|
||||||
expected_weight_gradient = (train_iter + 1) * torch.full_like(weight_gradient, 1)
|
expected_weight_gradient = (train_iter + 1) * torch.full_like(weight_gradient, 1)
|
||||||
|
|
||||||
assert torch.allclose(weight_gradient, expected_weight_gradient)
|
assert torch.allclose(weight_gradient, expected_weight_gradient)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("param_sz", [100, 1_000, 10_000])
|
|
||||||
@pytest.mark.parametrize("n_layers", [100, 1_000])
|
|
||||||
@pytest.mark.parametrize("init_context_manager", [True, False])
|
@pytest.mark.parametrize("init_context_manager", [True, False])
|
||||||
class TestZero3ParamPartitioningManyParams(DistributedTest):
|
class TestZero3ParamPartitioningManyParams(DistributedTest):
|
||||||
world_size = 4
|
world_size = 2
|
||||||
|
|
||||||
def test(self, param_sz: int, n_layers: int, init_context_manager: bool) -> None:
|
def test(self, init_context_manager: bool, param_sz: int = 100, n_layers: int = 100) -> None:
|
||||||
|
|
||||||
class ManyParamModel(Module):
|
class ManyParamModel(Module):
|
||||||
|
|
||||||
@ -854,13 +928,13 @@ class TestZero3ParamPartitioningManyParams(DistributedTest):
|
|||||||
"optimizer": {
|
"optimizer": {
|
||||||
"type": "Adam",
|
"type": "Adam",
|
||||||
"params": {
|
"params": {
|
||||||
"lr": 1.
|
"lr": 1.0
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"fp16": {
|
"fp16": {
|
||||||
"enabled": True,
|
"enabled": True,
|
||||||
"loss_scale": 1.,
|
"loss_scale": 1.0,
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
with deepspeed.zero.Init(config=ds_cfg, mem_efficient_linear=False, enabled=init_context_manager):
|
with deepspeed.zero.Init(config=ds_cfg, mem_efficient_linear=False, enabled=init_context_manager):
|
||||||
@ -923,20 +997,23 @@ class TestZero3InitForParentWeightInitialization(DistributedTest):
|
|||||||
"optimizer": {
|
"optimizer": {
|
||||||
"type": "Adam",
|
"type": "Adam",
|
||||||
"params": {
|
"params": {
|
||||||
"lr": 1.
|
"lr": 1.0
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"fp16": {
|
"fp16": {
|
||||||
"enabled": True,
|
"enabled": True,
|
||||||
"loss_scale": 1.,
|
"loss_scale": 1.0,
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
with deepspeed.zero.Init(config=ds_cfg, mem_efficient_linear=False, enabled=True):
|
with deepspeed.zero.Init(config=ds_cfg, mem_efficient_linear=False, enabled=True):
|
||||||
model = ModelWhereParentInitializesChildWeights()
|
model = ModelWhereParentInitializesChildWeights()
|
||||||
|
|
||||||
assert model.linear.weight.ds_tensor.numel() == math.ceil(12 / self.world_size)
|
assert model.linear.weight.ds_tensor.numel() == math.ceil(12 / self.world_size)
|
||||||
assert torch.allclose(model.linear.weight.ds_tensor, torch.full_like(model.linear.weight.ds_tensor, 1))
|
assert torch.allclose(
|
||||||
|
model.linear.weight.ds_tensor,
|
||||||
|
torch.full_like(model.linear.weight.ds_tensor, 1),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skip("not working")
|
@pytest.mark.skip("not working")
|
||||||
@ -946,17 +1023,29 @@ class TestZero3InitForParentWeightInitialization(DistributedTest):
|
|||||||
@pytest.mark.parametrize("zero_grad", [True, False])
|
@pytest.mark.parametrize("zero_grad", [True, False])
|
||||||
@pytest.mark.parametrize("prefetching", [True, False])
|
@pytest.mark.parametrize("prefetching", [True, False])
|
||||||
@pytest.mark.parametrize("reduce_scatter", [True, False])
|
@pytest.mark.parametrize("reduce_scatter", [True, False])
|
||||||
@pytest.mark.parametrize("model_class", [
|
@pytest.mark.parametrize(
|
||||||
EltwiseMultiplicationTestNetwork_Dict, EltwiseMultiplicationTestNetwork_NamedTuple,
|
"model_class",
|
||||||
EltwiseMultiplicationTestNetwork_namedtuple, EltwiseMultiplicationTestNetwork_Tuple,
|
[
|
||||||
EltwiseMultiplicationTestNetwork_List
|
EltwiseMultiplicationTestNetwork_Dict,
|
||||||
])
|
EltwiseMultiplicationTestNetwork_NamedTuple,
|
||||||
|
EltwiseMultiplicationTestNetwork_namedtuple,
|
||||||
|
EltwiseMultiplicationTestNetwork_Tuple,
|
||||||
|
EltwiseMultiplicationTestNetwork_List,
|
||||||
|
],
|
||||||
|
)
|
||||||
class TestZero3ParamPartitioningBaseBF16(DistributedTest):
|
class TestZero3ParamPartitioningBaseBF16(DistributedTest):
|
||||||
world_size = 2
|
world_size = 2
|
||||||
|
|
||||||
def test(self, param_persistence_threshold: int, contiguous_gradients: bool, offload_optimizer: bool,
|
def test(
|
||||||
zero_grad: bool, prefetching: bool, reduce_scatter: bool,
|
self,
|
||||||
model_class: EltwiseMultiplicationTestNetwork_Dict) -> None:
|
param_persistence_threshold: int,
|
||||||
|
contiguous_gradients: bool,
|
||||||
|
offload_optimizer: bool,
|
||||||
|
zero_grad: bool,
|
||||||
|
prefetching: bool,
|
||||||
|
reduce_scatter: bool,
|
||||||
|
model_class: EltwiseMultiplicationTestNetwork_Dict,
|
||||||
|
) -> None:
|
||||||
if offload_optimizer and not contiguous_gradients:
|
if offload_optimizer and not contiguous_gradients:
|
||||||
return
|
return
|
||||||
|
|
||||||
@ -973,18 +1062,18 @@ class TestZero3ParamPartitioningBaseBF16(DistributedTest):
|
|||||||
"stage3_param_persistence_threshold": param_persistence_threshold,
|
"stage3_param_persistence_threshold": param_persistence_threshold,
|
||||||
"contiguous_gradients": contiguous_gradients,
|
"contiguous_gradients": contiguous_gradients,
|
||||||
"stage3_prefetch_bucket_size": prefetch_bucket_size if prefetching else 0,
|
"stage3_prefetch_bucket_size": prefetch_bucket_size if prefetching else 0,
|
||||||
"reduce_scatter": reduce_scatter
|
"reduce_scatter": reduce_scatter,
|
||||||
},
|
},
|
||||||
"optimizer": {
|
"optimizer": {
|
||||||
"type": "Adam",
|
"type": "Adam",
|
||||||
"params": {
|
"params": {
|
||||||
"lr": 1.
|
"lr": 1.0
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"bf16": {
|
"bf16": {
|
||||||
"enabled": True,
|
"enabled": True,
|
||||||
"loss_scale": 1.,
|
"loss_scale": 1.0,
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
if offload_optimizer:
|
if offload_optimizer:
|
||||||
@ -1055,21 +1144,33 @@ class TestZero3ParamPartitioningBaseBF16(DistributedTest):
|
|||||||
|
|
||||||
grad_multiplier = 1 if zero_grad else (train_iter + 1)
|
grad_multiplier = 1 if zero_grad else (train_iter + 1)
|
||||||
if dist.get_rank() == 0:
|
if dist.get_rank() == 0:
|
||||||
assert torch.allclose(dloss_wrt_layer3.to(get_accelerator().device_name()),
|
assert torch.allclose(
|
||||||
grad_multiplier * create_tensor([2] * 8).to(expected_grad_dtype))
|
dloss_wrt_layer3.to(get_accelerator().device_name()),
|
||||||
assert torch.allclose(dloss_wrt_layer2.to(get_accelerator().device_name()),
|
grad_multiplier * create_tensor([2] * 8).to(expected_grad_dtype),
|
||||||
grad_multiplier * create_tensor([3 * 1] * 8).to(expected_grad_dtype))
|
)
|
||||||
assert torch.allclose(dloss_wrt_layer1.to(get_accelerator().device_name()),
|
assert torch.allclose(
|
||||||
grad_multiplier * create_tensor([3 * 2 * 1] * 8).to(expected_grad_dtype))
|
dloss_wrt_layer2.to(get_accelerator().device_name()),
|
||||||
|
grad_multiplier * create_tensor([3 * 1] * 8).to(expected_grad_dtype),
|
||||||
|
)
|
||||||
|
assert torch.allclose(
|
||||||
|
dloss_wrt_layer1.to(get_accelerator().device_name()),
|
||||||
|
grad_multiplier * create_tensor([3 * 2 * 1] * 8).to(expected_grad_dtype),
|
||||||
|
)
|
||||||
elif dist.get_rank() == 1:
|
elif dist.get_rank() == 1:
|
||||||
# parameters dont split evenly across ranks so rank 1 has a zero-padded
|
# parameters dont split evenly across ranks so rank 1 has a zero-padded
|
||||||
# partition
|
# partition
|
||||||
assert torch.allclose(dloss_wrt_layer3.to(get_accelerator().device_name()),
|
assert torch.allclose(
|
||||||
grad_multiplier * create_tensor(([8] * 7) + [0]).to(expected_grad_dtype))
|
dloss_wrt_layer3.to(get_accelerator().device_name()),
|
||||||
assert torch.allclose(dloss_wrt_layer2.to(get_accelerator().device_name()),
|
grad_multiplier * create_tensor(([8] * 7) + [0]).to(expected_grad_dtype),
|
||||||
grad_multiplier * create_tensor(([6 * 2] * 7) + [0]).to(expected_grad_dtype))
|
)
|
||||||
assert torch.allclose(dloss_wrt_layer1.to(get_accelerator().device_name()),
|
assert torch.allclose(
|
||||||
grad_multiplier * create_tensor(([6 * 4 * 1] * 7) + [0]).to(expected_grad_dtype))
|
dloss_wrt_layer2.to(get_accelerator().device_name()),
|
||||||
|
grad_multiplier * create_tensor(([6 * 2] * 7) + [0]).to(expected_grad_dtype),
|
||||||
|
)
|
||||||
|
assert torch.allclose(
|
||||||
|
dloss_wrt_layer1.to(get_accelerator().device_name()),
|
||||||
|
grad_multiplier * create_tensor(([6 * 4 * 1] * 7) + [0]).to(expected_grad_dtype),
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
raise RuntimeError("test has world size of two")
|
raise RuntimeError("test has world size of two")
|
||||||
|
|
||||||
@ -1104,7 +1205,7 @@ class TestZeroOffloadStage1(DistributedTest):
|
|||||||
"offload_optimizer": {
|
"offload_optimizer": {
|
||||||
"device": "cpu"
|
"device": "cpu"
|
||||||
}
|
}
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
hidden_dim = 10
|
hidden_dim = 10
|
||||||
|
|
||||||
@ -1118,7 +1219,7 @@ class TestZeroOffloadStage1(DistributedTest):
|
|||||||
model.step()
|
model.step()
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('return_type', [tuple, list, dict])
|
@pytest.mark.parametrize("return_type", [tuple, list, dict])
|
||||||
class TestZero3DictFwd(DistributedTest):
|
class TestZero3DictFwd(DistributedTest):
|
||||||
world_size = 1
|
world_size = 1
|
||||||
|
|
||||||
@ -1137,7 +1238,7 @@ class TestZero3DictFwd(DistributedTest):
|
|||||||
},
|
},
|
||||||
"zero_optimization": {
|
"zero_optimization": {
|
||||||
"stage": 3
|
"stage": 3
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
hidden_dim = 10
|
hidden_dim = 10
|
||||||
|
|
||||||
@ -1152,7 +1253,7 @@ class TestZero3DictFwd(DistributedTest):
|
|||||||
x = self.l1(x)
|
x = self.l1(x)
|
||||||
loss = self.cel(x, y)
|
loss = self.cel(x, y)
|
||||||
if return_type == dict:
|
if return_type == dict:
|
||||||
val = {'a': x, 'loss': loss, 'b': 1, 'c': None}
|
val = {"a": x, "loss": loss, "b": 1, "c": None}
|
||||||
elif return_type == list:
|
elif return_type == list:
|
||||||
val = [x, loss]
|
val = [x, loss]
|
||||||
elif return_type == tuple:
|
elif return_type == tuple:
|
||||||
@ -1170,14 +1271,14 @@ class TestZero3DictFwd(DistributedTest):
|
|||||||
for n, batch in enumerate(data_loader):
|
for n, batch in enumerate(data_loader):
|
||||||
loss = model(batch[0], batch[1])
|
loss = model(batch[0], batch[1])
|
||||||
if return_type == dict:
|
if return_type == dict:
|
||||||
loss = loss['loss']
|
loss = loss["loss"]
|
||||||
else:
|
else:
|
||||||
loss = loss[1]
|
loss = loss[1]
|
||||||
model.backward(loss)
|
model.backward(loss)
|
||||||
model.step()
|
model.step()
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('zero_stage', [1, 2, 3])
|
@pytest.mark.parametrize("zero_stage", [1, 2, 3])
|
||||||
class TestZeroAdamOptimizerStepCount(DistributedTest):
|
class TestZeroAdamOptimizerStepCount(DistributedTest):
|
||||||
world_size = 1
|
world_size = 1
|
||||||
|
|
||||||
@ -1201,7 +1302,7 @@ class TestZeroAdamOptimizerStepCount(DistributedTest):
|
|||||||
"fp16": {
|
"fp16": {
|
||||||
"enabled": True,
|
"enabled": True,
|
||||||
"initial_scale_power": 8
|
"initial_scale_power": 8
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
hidden_dim = 4
|
hidden_dim = 4
|
||||||
|
|
||||||
@ -1221,13 +1322,13 @@ class TestZeroAdamOptimizerStepCount(DistributedTest):
|
|||||||
for sub_group_id, _ in enumerate(optimizer.fp16_groups):
|
for sub_group_id, _ in enumerate(optimizer.fp16_groups):
|
||||||
fp32_param = optimizer.fp32_partitioned_groups_flat[sub_group_id]
|
fp32_param = optimizer.fp32_partitioned_groups_flat[sub_group_id]
|
||||||
state = optimizer.optimizer.state[fp32_param]
|
state = optimizer.optimizer.state[fp32_param]
|
||||||
step_counts.append(state['step'])
|
step_counts.append(state["step"])
|
||||||
assert all(step == step_counts[0] for step in step_counts)
|
assert all(step == step_counts[0] for step in step_counts)
|
||||||
elif zero_stage == 1 or zero_stage == 2:
|
elif zero_stage == 1 or zero_stage == 2:
|
||||||
for param_group in optimizer.optimizer.param_groups:
|
for param_group in optimizer.optimizer.param_groups:
|
||||||
for param in param_group['params']:
|
for param in param_group["params"]:
|
||||||
state = optimizer.optimizer.state[param]
|
state = optimizer.optimizer.state[param]
|
||||||
step_counts.append(state['step'])
|
step_counts.append(state["step"])
|
||||||
assert all(step == step_counts[0] for step in step_counts)
|
assert all(step == step_counts[0] for step in step_counts)
|
||||||
|
|
||||||
|
|
||||||
@ -1249,7 +1350,7 @@ class TestZeroFrozenWeights(DistributedTest):
|
|||||||
},
|
},
|
||||||
"zero_optimization": {
|
"zero_optimization": {
|
||||||
"stage": 3
|
"stage": 3
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
hidden_dim = 10
|
hidden_dim = 10
|
||||||
|
|
||||||
@ -1287,7 +1388,7 @@ class TestZeroFrozenWeights(DistributedTest):
|
|||||||
model.step()
|
model.step()
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('force_ds_optim', [True, False])
|
@pytest.mark.parametrize("force_ds_optim", [True, False])
|
||||||
class TestZeroOffloadOptim(DistributedTest):
|
class TestZeroOffloadOptim(DistributedTest):
|
||||||
world_size = 1
|
world_size = 1
|
||||||
|
|
||||||
@ -1320,7 +1421,7 @@ class TestZeroOffloadOptim(DistributedTest):
|
|||||||
model, _, _, _ = deepspeed.initialize(model=model, optimizer=optimizer, config=config_dict)
|
model, _, _, _ = deepspeed.initialize(model=model, optimizer=optimizer, config=config_dict)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('training', [True, False])
|
@pytest.mark.parametrize("training", [True, False])
|
||||||
class TestZeroPartitionCache(DistributedTest):
|
class TestZeroPartitionCache(DistributedTest):
|
||||||
world_size = 1
|
world_size = 1
|
||||||
|
|
||||||
@ -1334,8 +1435,8 @@ class TestZeroPartitionCache(DistributedTest):
|
|||||||
},
|
},
|
||||||
"zero_optimization": {
|
"zero_optimization": {
|
||||||
"stage": 3,
|
"stage": 3,
|
||||||
"stage3_param_persistence_threshold": hidden_dim
|
"stage3_param_persistence_threshold": hidden_dim,
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
if training:
|
if training:
|
||||||
config_dict["optimizer"] = {"type": "Adam"}
|
config_dict["optimizer"] = {"type": "Adam"}
|
||||||
@ -1346,11 +1447,13 @@ class TestZeroPartitionCache(DistributedTest):
|
|||||||
model, _, _, _ = deepspeed.initialize(model=model, config=config_dict)
|
model, _, _, _ = deepspeed.initialize(model=model, config=config_dict)
|
||||||
|
|
||||||
dtype = torch.half
|
dtype = torch.half
|
||||||
data_loader = random_dataloader(model=model,
|
data_loader = random_dataloader(
|
||||||
|
model=model,
|
||||||
total_samples=6,
|
total_samples=6,
|
||||||
hidden_dim=hidden_dim,
|
hidden_dim=hidden_dim,
|
||||||
device=model.device,
|
device=model.device,
|
||||||
dtype=dtype)
|
dtype=dtype,
|
||||||
|
)
|
||||||
|
|
||||||
for _, batch in enumerate(data_loader):
|
for _, batch in enumerate(data_loader):
|
||||||
loss = model(batch[0], batch[1])
|
loss = model(batch[0], batch[1])
|
||||||
|
@ -68,11 +68,15 @@ def run_fragmented_model(model, config_dict, hidden_dim, dtype):
|
|||||||
validate_full_tensors(model)
|
validate_full_tensors(model)
|
||||||
model.step()
|
model.step()
|
||||||
|
|
||||||
|
# Needed in ZeRO 3. Not doing so can give memory leak
|
||||||
|
model.destroy()
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('frozen_weights', [True, False])
|
@pytest.mark.parametrize('frozen_weights', [True, False])
|
||||||
class TestTensorFragment(DistributedTest):
|
class TestTensorFragment(DistributedTest):
|
||||||
# Need multiple gpus to test possible hanging
|
# Need multiple gpus to test possible hanging
|
||||||
world_size = 2
|
world_size = 2
|
||||||
|
reuse_dist_env = True
|
||||||
|
|
||||||
@pytest.mark.parametrize('zero_stage', [1, 2, 3])
|
@pytest.mark.parametrize('zero_stage', [1, 2, 3])
|
||||||
@pytest.mark.parametrize('offload_device', [OffloadDeviceEnum.none, OffloadDeviceEnum.cpu, OffloadDeviceEnum.nvme])
|
@pytest.mark.parametrize('offload_device', [OffloadDeviceEnum.none, OffloadDeviceEnum.cpu, OffloadDeviceEnum.nvme])
|
||||||
|