Merge branch 'master' into loadams/reenable-py311-312

Olatunji Ruwase
2025-08-11 19:53:51 -04:00
committed by GitHub
333 changed files with 18655 additions and 3116 deletions

View File

@ -1,103 +0,0 @@
name: cpu-inference
on:
workflow_dispatch:
pull_request:
paths:
- '.github/workflows/cpu-inference.yml'
- 'requirements/**'
- 'deepspeed/__init__.py'
- 'deepspeed/inference/**'
- '!deepspeed/inference/v2/**' # exclude v2 dir
- 'tests/unit/inference/**'
- '!tests/unit/inference/v2/**' # exclude v2 tests dir
merge_group:
branches: [ master ]
schedule:
- cron: "0 0 * * 0"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
unit-tests:
runs-on: [self-hosted, cpu]
env: {ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true} # Allow using Node16 actions
steps:
- uses: actions/checkout@v4
- id: setup-venv
uses: ./.github/workflows/setup-venv
- name: Install gcc-9
run: |
sudo add-apt-repository -u ppa:ubuntu-toolchain-r/test
sudo apt install -y gcc-9 g++-9
# set gcc-9 and g++9 to default
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 99
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 99
- name: Check gcc version
run: |
# Get gcc version
gcc --version
g++ --version
- name: Detect instruction sets on instance
run: |
lscpu
- name: Install numactl
run: |
sudo apt-get install -y numactl
- name: Install dependencies
run: |
pip install torch
# check installed version
pip list |grep \\\<torch\\\>
- name: Install oneCCL
run: |
pip install cmake
git clone https://github.com/oneapi-src/oneCCL
cd oneCCL
mkdir build
cd build
cmake ..
make -j install
- name: Install transformers
run: |
git clone https://github.com/huggingface/transformers
cd transformers
git rev-parse --short HEAD
pip install .
- name: Install deepspeed
run: |
# check why the host does not have AVX2 support
pip install .[dev,1bit,autotuning,inf]
ds_report
- name: Python environment check
run: |
pip list
source oneCCL/build/_install/env/setvars.sh
export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6
# check whether the environment is properly setup
python -c "import deepspeed;from deepspeed.accelerator import get_accelerator;print(get_accelerator().device_name());print(get_accelerator().is_available())"
- name: Unit tests
run: |
# prep oneCCL for CCLBackend comm ops building
source oneCCL/build/_install/env/setvars.sh
export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
# LOCAL_SIZE=2 enforce CPU to report 2 devices, this helps run the test on github default runner
LOCAL_SIZE=2 COLUMNS=240 HF_HOME=~/tmp/hf_home/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' unit/
LOCAL_SIZE=2 COLUMNS=240 HF_HOME=~/tmp/hf_home/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference_ops' -m 'inference' unit/

View File

@ -33,7 +33,7 @@ jobs:
- name: Install pytorch
run: |
pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu
pip install torch==2.7.1 torchvision==0.22.1 --index-url https://download.pytorch.org/whl/cpu
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@ -42,7 +42,7 @@ jobs:
git clone https://github.com/huggingface/transformers
cd transformers
# if needed switch to the last known good SHA until transformers@master is fixed
git checkout 981c276
# git checkout 981c276
git rev-parse --short HEAD
pip install .
@ -59,5 +59,5 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.6"
HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.6"
HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.7.1+cpu"
HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.7.1+cpu"

View File

@ -21,7 +21,7 @@ jobs:
# The type of runner that the job will run on
runs-on: [self-hosted, intel, gaudi2]
container:
image: vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
image: vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
ports:
- 80
options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice
@ -45,6 +45,8 @@ jobs:
test_zero_leaf_module.py
test_zero_offloadpp.py
test_zero_tiled.py
test_autotp_training.py
test_ulysses.py
# Steps represent a sequence of tasks that will be executed as part of the job
steps:

View File

@ -39,7 +39,7 @@ jobs:
# The type of runner that the job will run on
runs-on: [self-hosted, intel, gaudi2]
container:
image: vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
image: vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
ports:
- 80
options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice
@ -94,6 +94,8 @@ jobs:
test_zero_nesting_init.py
test_zeropp.py
(test_zero.py and (TestZero3ParamPartitioningLargeParam or TestZero3ParamPartitioningLargeParam))
(test_linear.py and (TestLoRALinear or TestBasicLinear))
(test_ctx.py and TestEngine)
# Steps represent a sequence of tasks that will be executed as part of the job
steps:
@ -112,7 +114,7 @@ jobs:
git clone https://github.com/huggingface/transformers
cd transformers
# if needed switch to the last known good SHA until transformers@master is fixed
git checkout 981c276
# git checkout 981c276
git rev-parse --short HEAD
pip install .

99
.github/workflows/modal-accelerate.yml vendored Normal file
View File

@ -0,0 +1,99 @@
name: modal-accelerate
# This CI is running on modal.com's GPUs.
#
# It's set up here on github actions and then the cloned repo is sent to modal and everything
# happens on their hw - see deepspeed/modal_ci/accelerate.py for where the actual vm is loaded, updated and the tests are
# run.
#
# Both files are annotated to what's important and how one might change or update things if needed.
#
# Note that since this is a Required job we can't use `on.push.path` file filter - we are using
# collect-tests job to do the filtering for us so that the job can be skipped and satisfy the
# Required status for PRs to pass.
#
on:
workflow_dispatch:
push:
branches:
- master
pull_request:
paths-ignore:
- 'docs/**'
- 'blogs/**'
- 'deepspeed/inference/v2/**'
- 'tests/unit/inference/v2/**'
types: [draft, opened, ready_for_review, synchronize]
branches:
- master
concurrency:
group: ${{ github.workflow }}-${{ github.ref || github.run_id }}
cancel-in-progress: true
jobs:
collect-tests:
name: Collect tests to run
runs-on: ubuntu-latest
permissions:
contents: read
pull-requests: read
outputs:
deepspeed: ${{ steps.filter.outputs.deepspeed }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
lfs: true
- name: Filter changed files
uses: dorny/paths-filter@v2
id: filter
with:
token: ${{ secrets.GITHUB_TOKEN }}
filters: |
deepspeed:
- 'deepspeed/**'
- '.github/workflows/modal*.yml'
- 'ci/**'
- 'tests/unit/**'
- 'csrc/**'
deploy:
name: DeepSpeedAI CI
runs-on: ubuntu-latest
needs: collect-tests
env:
# these are created at https://modal.com/settings/deepspeedai/tokens
# they are then added to the repo's secrets at https://github.com/deepspeedai/deepspeed/settings/secrets/actions
MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
# this one comes from https://huggingface.co/settings/profile of the bot user
# and it too is then updated at https://github.com/deepspeedai/deepspeed/settings/secrets/actions
HF_TOKEN: ${{ secrets.HF_TOKEN }}
if: needs.collect-tests.outputs.deepspeed == 'true'
steps:
- name: Checkout Repository
uses: actions/checkout@v4
with:
lfs: true
- name: Install Python
uses: actions/setup-python@v5
with:
python-version: "3.10"
cache: 'pip' # caching pip dependencies
- name: Install build dependencies
run: |
pip install uv # much faster than pip
uv pip install --system modal
- name: Run tests
run: |
modal run -m ci.accelerate
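For readers unfamiliar with Modal, here is a minimal sketch of what an entry module like `ci.accelerate` could look like. The app name, image contents, GPU type, secret name, and test command below are illustrative assumptions, not the repository's actual file; only the Modal primitives themselves (`modal.App`, `modal.Image.debian_slim`, `@app.function`, `@app.local_entrypoint`) are real API.

```python
# Hypothetical sketch of a Modal CI entry module, invoked via `modal run -m ci.accelerate`.
import modal

app = modal.App("deepspeed-accelerate-ci")  # assumed app name

# Test image; a real setup would also copy or mount the checked-out repository.
image = (
    modal.Image.debian_slim(python_version="3.10")
    .pip_install("torch", "deepspeed", "accelerate[testing]", "pytest")
)

@app.function(
    image=image,
    gpu="A10G",                                    # assumed GPU type
    timeout=3600,
    secrets=[modal.Secret.from_name("hf-token")],  # assumed secret carrying HF_TOKEN
)
def run_tests():
    import subprocess
    # Assumed test target; the real module selects the DeepSpeed-specific accelerate tests.
    subprocess.run(["python", "-m", "pytest", "tests/deepspeed"], check=True)

@app.local_entrypoint()
def main():
    run_tests.remote()
```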

View File

@ -0,0 +1,99 @@
name: modal-torch-latest
# This CI is running on modal.com's GPUs.
#
# It's set up here on github actions and then the cloned repo is sent to modal and everything
# happens on their hw - see deepspeed/modal_ci/torch_latest.py for where the actual vm is loaded, updated and the tests are
# run.
#
# Both files are annotated to what's important and how one might change or update things if needed.
#
# Note that since this is a Required job we can't use `on.push.path` file filter - we are using
# collect-tests job to do the filtering for us so that the job can be skipped and satisfy the
# Required status for PRs to pass.
#
on:
workflow_dispatch:
push:
branches:
- master
pull_request:
paths-ignore:
- 'docs/**'
- 'blogs/**'
- 'deepspeed/inference/v2/**'
- 'tests/unit/inference/v2/**'
types: [draft, opened, ready_for_review, synchronize]
branches:
- master
concurrency:
group: ${{ github.workflow }}-${{ github.ref || github.run_id }}
cancel-in-progress: true
jobs:
collect-tests:
name: Collect tests to run
runs-on: ubuntu-latest
permissions:
contents: read
pull-requests: read
outputs:
deepspeed: ${{ steps.filter.outputs.deepspeed }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
lfs: true
- name: Filter changed files
uses: dorny/paths-filter@v2
id: filter
with:
token: ${{ secrets.GITHUB_TOKEN }}
filters: |
deepspeed:
- 'deepspeed/**'
- '.github/workflows/modal*.yml'
- 'ci/**'
- 'tests/unit/**'
- 'csrc/**'
deploy:
name: DeepSpeedAI CI
runs-on: ubuntu-latest
needs: collect-tests
env:
# these are created at https://modal.com/settings/deepspeedai/tokens
# they are then added to the repo's secrets at https://github.com/deepspeedai/deepspeed/settings/secrets/actions
MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
# this one comes from https://huggingface.co/settings/profile of the bot user
# and it too is then updated at https://github.com/deepspeedai/deepspeed/settings/secrets/actions
HF_TOKEN: ${{ secrets.HF_TOKEN }}
if: needs.collect-tests.outputs.deepspeed == 'true'
steps:
- name: Checkout Repository
uses: actions/checkout@v4
with:
lfs: true
- name: Install Python
uses: actions/setup-python@v5
with:
python-version: "3.10"
cache: 'pip' # caching pip dependencies
- name: Install build dependencies
run: |
pip install uv # much faster than pip
uv pip install --system modal
- name: Run tests
run: |
modal run -m ci.torch_latest

View File

@ -23,7 +23,7 @@ jobs:
unit-tests:
runs-on: [self-hosted, nvidia, a6000]
container:
image: nvcr.io/nvidia/pytorch:24.09-py3
image: nvcr.io/nvidia/pytorch:25.01-py3
ports:
- 80
options: --gpus all --shm-size "8G"
@ -43,7 +43,7 @@ jobs:
git clone https://github.com/huggingface/transformers
cd transformers
# if you need to use an older transformers version temporarily in case of breakage
git checkout 981c276
# git checkout 981c276
git rev-parse --short HEAD
python -m pip install .
- name: Install deepspeed
@ -58,8 +58,8 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.5" --cuda_ver="12"
python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.5" --cuda_ver="12"
python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.6" --cuda_ver="12"
python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.6" --cuda_ver="12"
- name: MII unit tests
run: |
BRANCH="main"

View File

@ -48,6 +48,10 @@ jobs:
git clone https://github.com/huggingface/accelerate
cd accelerate
git rev-parse --short HEAD
# temp workaround until this is resolved https://github.com/huggingface/accelerate/issues/3676
pip install datasets==3.6.0
# installing dependencies
pip install .[testing]
# force protobuf version due to issues

View File

@ -43,8 +43,8 @@ jobs:
- name: Install deepspeed
run: |
pip install transformers==4.48.3
pip install .[dev]
pip install transformers==4.48.3
ds_report
- name: Install deepspeed-chat

View File

@ -18,7 +18,7 @@ jobs:
unit-tests:
runs-on: [self-hosted, nvidia, a6000]
container:
image: nvcr.io/nvidia/pytorch:24.09-py3
image: nvcr.io/nvidia/pytorch:24.12-py3
ports:
- 80
options: --gpus all --shm-size "8G"
@ -33,16 +33,19 @@ jobs:
nvidia-smi
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
- name: Install transformers
run: |
git clone --depth=1 https://github.com/huggingface/transformers
cd transformers
git rev-parse --short HEAD
python -m pip install .
- name: Install deepspeed
run: |
python -m pip install .[dev]
ds_report
# install transformers after deepspeed so that the right version of transformers is installed
- name: Install transformers
run: |
python -m pip install transformers==4.50.0
- name: Install FlashAttention
run: |
python -m pip install flash-attn
@ -53,7 +56,7 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
python -m pytest --color=yes --durations=0 --verbose -rF unit/sequence_parallelism/test_ulysses.py --torch_ver="2.5" --cuda_ver="12"
python -m pytest --color=yes --durations=0 --verbose -rF unit/sequence_parallelism/test_ulysses.py --torch_ver="2.6" --cuda_ver="12"
- name: Open GitHub issue if nightly CI fails
if: ${{ failure() && (github.event_name == 'schedule') }}
uses: JasonEtco/create-an-issue@v2

View File

@ -1,65 +0,0 @@
name: nv-h100
on:
workflow_dispatch:
schedule:
- cron: "0 0 * * *"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
permissions:
contents: read
issues: write
jobs:
unit-tests:
runs-on: [self-hosted, nvidia, h100]
container:
image: nvcr.io/nvidia/pytorch:23.03-py3
ports:
- 80
options: --gpus all --shm-size "8G"
steps:
- uses: actions/checkout@v4
- name: Check container state
run: |
nvidia-smi
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
- name: Install transformers
run: |
git clone https://github.com/huggingface/transformers
cd transformers
git rev-parse --short HEAD
python -m pip install .
- name: Install deepspeed
run: |
python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja
python -m pip install .[dev,1bit,autotuning]
ds_report
- name: Python environment
run: |
python -m pip list
- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
python -m pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.0" --cuda_ver="12"
python -m pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.0" --cuda_ver="12"
- name: Open GitHub issue if nightly CI fails
if: ${{ failure() && (github.event_name == 'schedule') }}
uses: JasonEtco/create-an-issue@v2
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
update_existing: true

View File

@ -1,53 +0,0 @@
name: nv-human-eval
on:
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
unit-tests:
runs-on: [self-hosted, nvidia, a6000]
container:
image: nvcr.io/nvidia/pytorch:24.09-py3
ports:
- 80
options: --gpus all --shm-size "8G"
steps:
- uses: actions/checkout@v4
- name: Check container state
run: |
ldd --version
nvcc --version
nvidia-smi
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
- name: Install transformers
run: |
git clone --depth=1 https://github.com/huggingface/transformers
cd transformers
git rev-parse --short HEAD
python -m pip install .
- name: Clone Human Eval
run: |
git clone --depth=1 https://github.com/openai/human-eval.git
sed -i '/exec(check_program, exec_globals)/ s/^# //' human-eval/human_eval/execution.py
cd human-eval
git rev-parse --short HEAD
python -m pip install .
- name: Install deepspeed
run: |
python -m pip install .[dev,1bit,autotuning]
ds_report
- name: Python environment
run: |
python -m pip list
- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.5" --cuda_ver="12"

View File

@ -41,6 +41,12 @@ jobs:
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
- name: Install deepspeed
run: |
pip install .[dev]
ds_report
# install transformers after deepspeed so that the right version of transformers is installed
- name: Install transformers
run: |
git clone https://github.com/huggingface/transformers
@ -50,11 +56,6 @@ jobs:
git rev-parse --short HEAD
pip install .
- name: Install deepspeed
run: |
pip install .[dev]
ds_report
- name: Python environment
run: |
pip list

View File

@ -36,7 +36,7 @@ jobs:
#python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
- name: Compile DeepSpeed Ops
run: |
DS_ACCELERATOR=cuda DS_ENABLE_NINJA=1 TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_FP_QUANTIZER=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_GDS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 pip3 install .
DS_ACCELERATOR=cuda DS_ENABLE_NINJA=1 TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_FP_QUANTIZER=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_GDS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 DS_BUILD_DEEP_COMPILE=0 pip3 install .
- name: DS Report
run: |
ds_report

View File

@ -44,7 +44,8 @@ jobs:
- name: Install deepspeed
run: |
pip install .[dev,1bit,autotuning]
pip install .[dev,1bit,autotuning,deepcompile]
pip install pytest-timeout pytest-instafail
ds_report
- name: Python environment
@ -55,5 +56,5 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
pytest $PYTEST_OPTS --forked -n 8 unit/ --torch_ver="2.6" --cuda_ver="12.4"
pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.6" --cuda_ver="12.4"
pytest -x $PYTEST_OPTS --instafail --timeout 600 --forked -n 8 unit/ --torch_ver="2.6" --cuda_ver="12.4"
pytest $PYTEST_OPTS --instafail --timeout 600 --forked -m 'sequential' unit/ --torch_ver="2.6" --cuda_ver="12.4"

View File

@ -37,7 +37,7 @@ jobs:
git clone https://github.com/huggingface/transformers
cd transformers
# if needed switch to the last known good SHA until transformers@master is fixed
git checkout 981c276
# git checkout 981c276
git rev-parse --short HEAD
pip install .

View File

@ -6,7 +6,9 @@ runs:
- id: update-env
run: |
sudo apt-get update
sudo apt-get install -y libaio-dev
# Temporary disable nvme UTs
# sudo apt-get install -y libaio-dev
sudo apt remove -y libaio-dev
python -m pip install --user --upgrade pip
python -m pip install --user --upgrade virtualenv
shell: bash
@ -28,7 +30,7 @@ runs:
echo HF_DATASETS_CACHE=/blob/datasets_cache/ >> $GITHUB_ENV
echo MEGATRON_CKPT_DIR=/blob/megatron_ckpt/ >> $GITHUB_ENV
echo CRITIC_CKPT_DIR=/blob/step2_opt_125m_ckpt/ >> $GITHUB_ENV
echo PYTEST_OPTS="--color=yes --durations=0 --verbose -rF" >> $GITHUB_ENV
echo PYTEST_OPTS="--maxfail=100 --color=yes --durations=0 --verbose -rF" >> $GITHUB_ENV
shell: bash
- id: print-env
run: |

View File

@ -36,7 +36,7 @@ jobs:
unit-tests:
runs-on: [self-hosted, intel, xpu]
container:
image: intel/oneapi-basekit:2025.0.1-0-devel-ubuntu24.04
image: intel/oneapi-basekit:2025.0.2-0-devel-ubuntu22.04
ports:
- 80
options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL
@ -47,20 +47,16 @@ jobs:
shell: bash
run: |
apt-get update
apt-get install clinfo libaio-dev python3-pip python3.12-venv -y
python3 -m venv ~/ds_env
source ~/ds_env/bin/activate
pip install torch==2.5.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torch/
pip install intel-extension-for-pytorch==2.5.10+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/intel-extension-for-pytorch/
pip install oneccl_bind_pt==2.5.0+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/oneccl-bind-pt/
pip install torchvision==0.20.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torchvision/
pip install py-cpuinfo numpy
apt-get install -y python3.11 python3.11-dev python3-pip clinfo libaio-dev
pip install --upgrade pip
pip install py-cpuinfo
pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/xpu
pip install intel-extension-for-pytorch==2.7.10+xpu oneccl_bind_pt==2.7.0+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us
pip install .[dev,autotuning]
- name: Check container state
shell: bash
run: |
source ~/ds_env/bin/activate
ldd --version
ds_report
python3 -c "import torch; print('torch:', torch.__version__, torch)"
@ -71,8 +67,9 @@ jobs:
- name: Unit tests
shell: bash
run: |
source ~/ds_env/bin/activate
cd tests/unit
export FI_PROVIDER="tcp"
export I_MPI_SHM=off
pytest --verbose accelerator/*
pytest --verbose autotuning/*
pytest --verbose checkpoint/test_reshape_checkpoint.py

View File

@ -9,3 +9,4 @@
| Minjia Zhang | [minjiazhang](https://github.com/minjiazhang) | UIUC |
| Ashwin Aji | [ashwinma](https://github.com/ashwinma) | AMD |
| Sam Foreman | [saforem2](https://github.com/saforem2) | Argonne National Laboratory |
| Zhipeng Wang | [PKUWZP](https://github.com/PKUWZP) | LinkedIn |

View File

@ -19,6 +19,12 @@ If a formatting test fails, it will fix the modified code in place and abort
the `git commit`. After looking over the changes, you can `git add <modified files>`
and then repeat the previous `git commit` command.
You can also run:
```
make format
```
which does the same as above and also automatically builds a `venv` Python environment if you
don't already have one, isolating this project's requirements from those of other projects.
## Testing
DeepSpeed tracks two types of tests: unit tests and more costly model convergence tests.
@ -38,6 +44,11 @@ You can also provide the `-v` flag to `pytest` to see additional information abo
tests. Note that [pytest-forked](https://github.com/pytest-dev/pytest-forked) and the
`--forked` flag are required to test CUDA functionality in distributed tests.
You can also run:
```
make test
```
### Model Tests
To execute model tests, first [install DeepSpeed](#installation). The
[DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples/) repository is cloned
@ -48,16 +59,15 @@ pytest run_sanity_check.py
```
Note that the `--forked` flag is not necessary for the model tests.
## Contributor License Agreement
This project welcomes contributions and suggestions. Most contributions require you to
agree to a Contributor License Agreement (CLA) declaring that you have the right to, and
actually do, grant us the rights to use your contribution. For details, visit
https://cla.opensource.microsoft.com.
## Developer Certificate of Origin
This project welcomes contributions and suggestions. All contributions to deepspeedai projects
require commits to be signed off with a [Developer Certificate of Origin](https://en.wikipedia.org/wiki/Developer_Certificate_of_Origin)
(DCO) declaring that you have the right to, and actually do, grant us the rights to use your contribution.
When you submit a pull request, a CLA bot will automatically determine whether you need
to provide a CLA and decorate the PR appropriately (e.g., status check, comment). Simply
follow the instructions provided by the bot. You will only need to do this once across
all repos using our CLA.
When you submit a pull request, the DCO app will check for the presence of signed commits.
Information about how this check works is here: https://github.com/dcoapp/app?tab=readme-ov-file#how-it-works
To sign commits, you will need to include `-s` when running `git commit`. For example, `git commit -s -m "Commit message"`. Note that creating PRs via the GitHub web interface does not appear to include this option. If you forget this, clicking on the failing check in your PR will point you to commands you can run to rebase and sign previous commits.
## Code of Conduct
This project has adopted the [Microsoft Open Source Code of

23
Makefile Normal file
View File

@ -0,0 +1,23 @@
# usage: make help
.PHONY: help test format
.DEFAULT_GOAL := help
help: ## this help
@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m<target>\033[0m\n"} /^[0-9a-zA-Z_-]+:.*?##/ { printf " \033[36m%-22s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
echo $(MAKEFILE_LIST)
test: ## run tests
pytest --forked tests/unit/
format: ## fix formatting
@if [ ! -d "venv" ]; then \
python -m venv venv; \
. venv/bin/activate; \
pip install pre-commit -U; \
pre-commit clean; \
pre-commit uninstall; \
pre-commit install; \
deactivate; \
fi
. venv/bin/activate && pre-commit run --files $$(git diff --name-only master) && deactivate

View File

@ -6,6 +6,7 @@
[![Twitter](https://img.shields.io/twitter/follow/DeepSpeedAI)](https://twitter.com/intent/follow?screen_name=DeepSpeedAI)
[![Japanese Twitter](https://img.shields.io/badge/%E6%97%A5%E6%9C%AC%E8%AA%9ETwitter-%40DeepSpeedAI_JP-blue)](https://twitter.com/DeepSpeedAI_JP)
[![Chinese Zhihu](https://img.shields.io/badge/%E7%9F%A5%E4%B9%8E-%E5%BE%AE%E8%BD%AFDeepSpeed-blue)](https://www.zhihu.com/people/deepspeed)
[![Slack](https://img.shields.io/badge/Slack-4A154B?style=for-the-badge&logo=slack&logoColor=white)](https://join.slack.com/t/deepspeedworkspace/shared_invite/zt-3a8pjd8dd-PCj2hMvR4Y2syPwVnjEoww)
<div align="center">
@ -15,32 +16,23 @@
## Latest News
<b> <span style="color:orange" > DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-chat)</span>.</b>
* [2025/06] [Arctic Long Sequence Training (ALST) with DeepSpeed: Scalable And Efficient Training For Multi-Million Token Sequences](https://www.snowflake.com/en/engineering-blog/arctic-long-sequence-training-multi-million-token-ai/)
* [2025/04] [DeepCompile: Unlocking Compiler Optimization for Distributed Training](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepcompile/README.md)
* [2025/03] [DeepSpeed-AutoTP: Automatic Tensor Parallel Training of Hugging Face models](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/huggingface-tp/README.md)
* [2024/12] [Ulysses-Offload: Democratizing Long Context LLM Training ](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/ulysses-offload/README.md)
* [2024/12] [DeepSpeed-Domino: Communication-Free LLM Training Engine](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-domino/README.md)
* [2024/08] [DeepSpeed on Windows](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/windows/08-2024/README.md) [[日本語](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/windows/08-2024/japanese/README.md)] [[中文](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/windows/08-2024/chinese/README.md)]
* [2024/08] [DeepNVMe: Improving DL Applications through I/O Optimizations](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-gds/README.md) [[日本語](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-gds/japanese/README.md)]
* [2024/07] [DeepSpeed Universal Checkpointing: Efficient and Flexible Checkpointing for Large Scale Distributed Training](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-ucp/README.md) [[中文](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-ucp/chinese/README.md)] [[日本語](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-ucp/japanese/README.md)]
* [2024/03] [DeepSpeed-FP6:The power of FP6-Centric Serving for Large Language Models](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024) [[English](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README.md)] [[中文](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README-Chinese.md)]
* [2024/01] [DeepSpeed-FastGen: Introducing Mixtral, Phi-2, and Falcon support with major performance and feature enhancements.](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19)
* [2023/11] [Llama 2 Inference on 4th Gen Intel® Xeon® Scalable Processor with DeepSpeed](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/intel-inference) [[Intel version]](https://www.intel.com/content/www/us/en/developer/articles/technical/xllama-2-on-xeon-scalable-processor-with-deepspeed.html)
* [2023/11] [DeepSpeed ZeRO-Offload++: 6x Higher Training Throughput via Collaborative CPU/GPU Twin-Flow](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-offloadpp)
* [2023/11] [DeepSpeed-FastGen: High-throughput Text Generation for LLMs via MII and DeepSpeed-Inference](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen) [[English](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen)] [[中文](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen/chinese/README.md)] [[日本語](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen/japanese/README.md)]
* [2023/10] [DeepSpeed-VisualChat: Improve Your Chat Experience with Multi-Round Multi-Image Inputs](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md) [[English](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md)] [[中文](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md)] [[日本語](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md)]
* [2023/09] Announcing the DeepSpeed4Science Initiative: Enabling large-scale scientific discovery through sophisticated AI system technologies [[Tutorials](https://www.deepspeed.ai/deepspeed4science/)] [[White paper](https://arxiv.org/abs/2310.04610)] [[Blog](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/)] [[中文](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed4science/chinese/README.md)] [[日本語](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed4science/japanese/README.md)]
<!-- NOTE: we must use html for news items otherwise links will be broken in the 'more news' section -->
<details>
<summary>More news</summary>
<ul>
<li>[2023/08] <a href="https://github.com/deepspeedai/DeepSpeedExamples/blob/master/inference/huggingface/zero_inference/README.md">DeepSpeed ZeRO-Inference: 20x faster inference through weight quantization and KV cache offloading</a></li>
<li> [2024/08] <a href="https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-gds/README.md"> DeepNVMe: Improving DL Applications through I/O Optimizations</a> [<a href="https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-gds/japanese/README.md"> 日本語 </a>] [<a href="https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-gds/japanese/README.md"> 中文 </a>]</li>
<li>[2023/08] <a href="https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-chat/ds-chat-release-8-31/README.md">DeepSpeed-Chat: Llama/Llama-2 system support, efficiency boost, and training stability improvements</a></li>
<li> [2024/07] <a href="https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/README.md"> DeepSpeed Universal Checkpointing: Efficient and Flexible Checkpointing for Large Scale Distributed Training</a> [<a href="https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/japanese/README.md"> 日本語 </a>] </li>
<li>[2023/08] <a href="https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-ulysses">DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models</a> [<a href="https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-ulysses/chinese/README.md">中文</a>] [<a href="https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-ulysses/japanese/README.md">日本語</a>]</li>
<li> [2024/03] <a href="https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README.md"> DeepSpeed-FP6: The Power of FP6-Centric Serving for Large Language Models</a> [<a href="https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README-Chinese.md"> 中文 </a>] </li>
<li>[2023/06] <a href="https://www.microsoft.com/en-us/research/blog/deepspeed-zero-a-leap-in-speed-for-llm-and-chat-model-training-with-4x-less-communication/">ZeRO++: A leap in speed for LLM and chat model training with 4X less communication</a> [<a href="https://www.microsoft.com/en-us/research/blog/deepspeed-zero-a-leap-in-speed-for-llm-and-chat-model-training-with-4x-less-communication/">English</a>] [<a href="https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/zeropp/chinese/README.md">中文</a>] [<a href="https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/zeropp/japanese/README.md">日本語</a>]</li>
</ul>
</details>
@ -135,9 +127,9 @@ DeepSpeed has been integrated with several different popular open-source DL fram
| Description | Status |
| ----------- | ------ |
| NVIDIA | [![nv-torch110-p40](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-torch110-p40.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-torch110-p40.yml) [![nv-torch110-v100](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-torch110-v100.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-torch110-v100.yml) [![nv-torch-latest-v100](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml) [![nv-h100](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-h100.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-h100.yml) [![nv-inference](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-inference.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-inference.yml) [![nv-nightly](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-nightly.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-nightly.yml) |
| NVIDIA | [![nv-torch-latest-v100](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml) [![nv-inference](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-inference.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-inference.yml) [![nv-nightly](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-nightly.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-nightly.yml) |
| AMD | [![amd-mi200](https://github.com/deepspeedai/DeepSpeed/actions/workflows/amd-mi200.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/amd-mi200.yml) |
| CPU | [![torch-latest-cpu](https://github.com/deepspeedai/DeepSpeed/actions/workflows/cpu-torch-latest.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/cpu-torch-latest.yml) [![cpu-inference](https://github.com/deepspeedai/DeepSpeed/actions/workflows/cpu-inference.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/cpu-inference.yml) |
| CPU | [![torch-latest-cpu](https://github.com/deepspeedai/DeepSpeed/actions/workflows/cpu-torch-latest.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/cpu-torch-latest.yml) |
| Intel Gaudi | [![hpu-gaudi2](https://github.com/deepspeedai/DeepSpeed/actions/workflows/hpu-gaudi2.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/hpu-gaudi2.yml) |
| Intel XPU | [![xpu-max1100](https://github.com/deepspeedai/DeepSpeed/actions/workflows/xpu-max1100.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/xpu-max1100.yml) |
| PyTorch Nightly | [![nv-torch-nightly-v100](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml) |
@ -280,8 +272,7 @@ Conduct](https://opensource.microsoft.com/codeofconduct/). For more information
31. Haojun Xia, Zhen Zheng, Xiaoxia Wu, Shiyang Chen, Zhewei Yao, Stephen Youn, Arash Bakhtiari, Michael Wyatt, Donglin Zhuang, Zhongzhu Zhou, Olatunji Ruwase, Yuxiong He, Shuaiwen Leon Song. (2024) FP6-LLM: Efficiently Serving Large Language Models Through FP6-Centric Algorithm-System Co-Design [arXiv:2401.14112](https://arxiv.org/abs/2401.14112)
32. Sam Ade Jacobs, Masahiro Tanaka, Chengming Zhang, Minjia Zhang, Reza Yazdani Aminadabi, Shuaiwen Leon Song, Samyam Rajbhandari, Yuxiong He. (2024) [System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models](https://dl.acm.org/doi/10.1145/3662158.3662806)
33. Xinyu Lian, Sam Ade Jacobs, Lev Kurilenko, Masahiro Tanaka, Stas Bekman, Olatunji Ruwase, Minjia Zhang. (2024) Universal Checkpointing: Efficient and Flexible Checkpointing for Large Scale Distributed Training [arXiv:2406.18820](https://arxiv.org/abs/2406.18820)
34. Stas Bekman, Samyam Rajbhandari, Michael Wyatt, Jeff Rasley, Tunji Ruwase, Zhewei Yao, Aurick Qiao, Yuxiong He. (2025) Arctic Long Sequence Training: Scalable And Efficient Training For Multi-Million Token Sequences [arXiv:2506.13996](https://arxiv.org/abs/2506.13996)
# Videos

View File

@ -229,10 +229,17 @@ class CPU_Accelerator(DeepSpeedAccelerator):
return True
def is_fp16_supported(self):
return False
try:
if torch.ops.mkldnn._is_mkldnn_fp16_supported():
return True
except:
return False
def supported_dtypes(self):
return [torch.float, torch.bfloat16]
supported_dtypes = [torch.float, torch.bfloat16]
if self.is_fp16_supported():
supported_dtypes.append(torch.float16)
return supported_dtypes
# Graph operations
def create_graph(self):
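For context, the effect of this change can be checked through the accelerator interface already exercised earlier in this diff (`get_accelerator()` and `device_name()` appear in the cpu-inference environment check); only the dtype list and the fp16 probe are new here. A minimal check, assuming a DeepSpeed install with this patch:

```python
from deepspeed.accelerator import get_accelerator

acc = get_accelerator()
# On the CPU accelerator, torch.float16 is now reported only when
# torch.ops.mkldnn._is_mkldnn_fp16_supported() succeeds.
print(acc.device_name(), acc.is_fp16_supported(), acc.supported_dtypes())
```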

View File

@ -60,8 +60,11 @@ class CUDA_Accelerator(DeepSpeedAccelerator):
return 'cuda'
return 'cuda:{}'.format(device_index)
def communication_backend_version(self):
return torch.cuda.nccl.version()
def device(self, device_index=None):
return torch.cuda.device(device_index)
return torch.device('cuda', device_index)
def set_device(self, device_index):
torch.cuda.set_device(device_index)

View File

@ -23,6 +23,8 @@ class HPU_Accelerator(DeepSpeedAccelerator):
import habana_frameworks.torch.hpu as hpu
self.hpu = hpu
torch.use_deterministic_algorithms(True)
# TODO: remove this WA when memory mapping break is resolved.
torch.utils.deterministic.fill_uninitialized_memory = False
except ImportError as e:
raise ValueError(
f"HPU_Accelerator requires habana_frameworks.torch.hpu, which is not installed on this system.")

View File

@ -67,7 +67,7 @@ def get_accelerator():
f"XPU_Accelerator requires intel_extension_for_pytorch, which is not installed on this system.")
elif accelerator_name == "xpu.external":
try:
import intel_extension_for_deepspeed # noqa: F401 # type: ignore
from intel_extension_for_deepspeed import XPU_Accelerator # noqa: F401 # type: ignore
except ImportError as e:
raise ValueError(
f"XPU_Accelerator external requires intel_extension_for_deepspeed, which is not installed on this system."
@ -136,6 +136,21 @@ def get_accelerator():
accelerator_name = "xpu"
except ImportError as e:
pass
if accelerator_name is None:
try:
import torch
# torch.xpu will be supported in upstream pytorch-2.8.
# Currently we can run on xpu device only using pytorch,
# also reserve the old path using ipex when the torch version is old.
if hasattr(torch, 'xpu'):
if torch.cuda.device_count() == 0: #ignore-cuda
if torch.xpu.device_count() > 0 and torch.xpu.is_available():
accelerator_name = "xpu"
else:
pass
except ImportError as e:
pass
if accelerator_name is None:
try:
import torch_npu # noqa: F401,F811 # type: ignore
@ -209,6 +224,12 @@ def get_accelerator():
ds_accelerator = CPU_Accelerator()
elif accelerator_name == "xpu.external":
# XPU_Accelerator is already imported in detection stage
try:
from intel_extension_for_deepspeed import XPU_Accelerator # noqa: F811
except ImportError as e:
raise ValueError(
f"XPU_Accelerator external requires intel_extension_for_deepspeed, which is not installed on this system."
)
ds_accelerator = XPU_Accelerator()
elif accelerator_name == "xpu":
from .xpu_accelerator import XPU_Accelerator
@ -243,7 +264,7 @@ def get_accelerator():
def set_accelerator(accel_obj):
global ds_accelerator
_validate_accelerator(accel_obj)
if accel_logger is not None:
if accel_logger is not None and accel_obj is not None:
accel_logger.info(f"Setting ds_accelerator to {accel_obj._name} (model specified)")
ds_accelerator = accel_obj

View File

@ -5,19 +5,32 @@
import torch
from deepspeed.accelerator.abstract_accelerator import DeepSpeedAccelerator
import intel_extension_for_pytorch as ipex # noqa: F401 # type: ignore
import oneccl_bindings_for_pytorch # noqa: F401 # type: ignore
import functools
import importlib
import inspect
try:
import oneccl_bindings_for_pytorch # noqa: F401 # type: ignore
oneccl_imported_p = True
except ImportError as e:
oneccl_imported_p = False
try:
import intel_extension_for_pytorch as ipex # noqa: F401 # type: ignore
ipex_imported_p = True
except ImportError as e:
ipex_imported_p = False
class XPU_Accelerator(DeepSpeedAccelerator):
def __init__(self):
self._name = 'xpu'
self._communication_backend_name = 'ccl'
if oneccl_imported_p:
self._communication_backend_name = 'ccl'
else:
# changed to xccl if not using torch-CCL on XPU device
self._communication_backend_name = 'xccl'
self._compile_backend = "inductor"
self.aligned_tensors = []
self.class_dict = None
@ -26,11 +39,14 @@ class XPU_Accelerator(DeepSpeedAccelerator):
return False
def use_host_timers(self):
# WA XPU event will be consolidated in 2.6
if ipex.__version__ < '2.6':
return True
else:
if not ipex_imported_p:
return self.is_synchronized_device()
else:
# WA XPU event will be consolidated in 2.6
if ipex.__version__ < '2.6':
return True
else:
return self.is_synchronized_device()
def resolves_data_dependency(self):
return self.is_synchronized_device()
@ -290,10 +306,13 @@ class XPU_Accelerator(DeepSpeedAccelerator):
return self.class_dict['NotImplementedBuilder']
def build_extension(self):
try:
from intel_extension_for_pytorch.xpu.cpp_extension import DpcppBuildExtension
except ImportError:
from intel_extension_for_pytorch.xpu.utils import DpcppBuildExtension
if ipex_imported_p:
try:
from intel_extension_for_pytorch.xpu.cpp_extension import DpcppBuildExtension
except ImportError:
from intel_extension_for_pytorch.xpu.utils import DpcppBuildExtension
else:
from torch.utils.cpp_extension import DpcppBuildExtension
return DpcppBuildExtension
def export_envs(self):

174
blogs/deepcompile/README.md Normal file
View File

@ -0,0 +1,174 @@
<div align="center">
# DeepCompile: Unlocking Compiler Optimization for Distributed Training
</div>
# Introduction
<div align="center">
<img src="media/perf_summary.png" width="1000">
</div>
Distributed training has become essential for scaling today's massive deep learning models. While deep learning compilers like the PyTorch compiler have dramatically improved single-GPU training performance through optimizations like kernel fusion and operator scheduling, they fall short when it comes to distributed workloads.
Existing distributed training frameworks such as DeepSpeed and FSDP have made large-scale model training feasible through advanced parallelization strategies. While powerful, their optimizations are implemented at the PyTorch framework level, which limits the ability to apply compiler-style techniques like dependency analysis or operator scheduling.
DeepCompile addresses this gap by enabling compiler-level optimizations for distributed training. It takes a standard single-GPU model implementation and transforms it into an optimized multi-GPU training graph without requiring changes to the model code. Unlike existing approaches, DeepCompile automatically applies parameter sharding, communication scheduling, and memory-aware execution at the compiler IR level, enabling global analysis and optimization that are difficult to express in traditional frameworks. Furthermore, during training, DeepCompile employs profile-guided optimization techniques to dynamically tune these parallelization strategies and improve training performance.
Our evaluation demonstrates that DeepCompile improves training performance over ZeRO-3 baselines, achieving up to 1.5x speedup when sufficient GPU resources are available, and up to 7x speedup in GPU-constrained settings that require offloading. DeepCompile is available in DeepSpeed versions >= [0.16.6](https://github.com/deepspeedai/DeepSpeed/releases/tag/v0.16.6). As it is under active development, we recommend using the latest version of DeepSpeed or installing from source to access the most recent updates and bug fixes.
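As a rough sketch of what adoption looks like in user code: the `compile.deepcompile` config key and the `engine.compile()` call below reflect the DeepSpeedExamples benchmark setup linked in the Appendix, and the exact names should be treated as assumptions to verify against that example code.

```python
import torch
import deepspeed

# Standard single-GPU model; no distributed logic is written into the model code.
model = torch.nn.Linear(1024, 1024)

ds_config = {
    "train_micro_batch_size_per_gpu": 1,
    "bf16": {"enabled": True},
    "optimizer": {"type": "Adam", "params": {"lr": 1e-5}},
    "zero_optimization": {"stage": 3},
    "compile": {"deepcompile": True},   # assumed flag that enables DeepCompile passes
}

engine, _, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config=ds_config,
)
engine.compile()  # assumed entry point that captures the graph and applies the passes
```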
# Design Overview
DeepCompile extends the capabilities of deep learning compilers to support distributed training. It starts from a standard single-GPU model implementation, such as those available on the Hugging Face model hub, and automatically transforms it by inserting necessary distributed training operations such as parameter sharding and communication primitives. Users are not required to embed any distributed logic into the model code.
The process begins by compiling the model into an intermediate representation (IR), which forms a computation graph. DeepCompile then applies a sequence of *optimization passes*, each responsible for a specific transformation of the computation graph or a targeted performance improvement, to incrementally introduce distributed behavior and optimize the graph. These include operations such as all-gather for sharded parameters or offloading of optimizer states, all while preserving the original computation semantics (Fig. 1).
<div align="center">
<img src="media/workflow.png" width="400">
*Figure 1: Workflow of compilation and optimization with DeepCompile.*
</div>
At its core, DeepCompile builds on two key capabilities:
- **Automatic parallelization**: DeepCompile allows optimization passes to rewrite the single-GPU computation graph into a distributed multi-GPU version, incorporating strategies such as ZeRO, FSDP, and more. This eliminates the need for manual implementation of distributed training logic, drastically reducing engineering effort.
- **Profile-guided performance tuning**: At runtime, DeepCompile collects profiling data such as operator-level memory usage and execution latency. It uses this information to dynamically schedule computation and communication operators. This enables effects such as an improved overlap between communication and computation, and an avoidance of memory bottlenecks. Fine-grained tuning through these optimization passes often leads to better performance than even manually engineered implementations.
Figure 2 illustrates the optimization cycle employed by DeepCompile. After the initial computation graph is generated by the compiler, DeepCompile profiles its behavior by measuring operator execution time, communication overhead, and memory usage throughout the forward and backward passes.
<div align="center">
<img src="media/opt_loop.png" width="600">
*Figure 2. Optimization cycle.*
</div>
Based on the collected profiling data, DeepCompile applies a sequence of optimization passes. These passes modify the computation graph by inserting, removing, or reordering operators to improve overall efficiency. The modified graph is then re-profiled, and this cycle of profiling and optimization is repeated.
Once a stable set of optimizations has been applied, the graph is deployed for the remaining training iterations. During execution, memory usage and other runtime characteristics may change. In such cases, DeepCompile can resume the profiling and optimization cycle according to the predefined schedule of passes, allowing the graph to adapt and maintain high performance.
# Optimizations
DeepCompile is designed as a general compiler framework for applying and optimizing a wide range of parallelization strategies. In the following, we describe several optimizations that have been implemented as optimization passes within DeepCompile.
## ZeRO3
As an initial step, we have used DeepCompile to implement and enhance ZeRO-3-style optimizations at the compiler level. ZeRO-3 partitions model parameters, gradients, and optimizer states across devices, reducing memory usage and enabling large-scale training.
In conventional ZeRO-3 implementations, operations such as all-gather, reduce-scatter, and buffer release are typically inserted using Python hooks at runtime. DeepCompile replaces this approach by injecting these operations directly into the computation graph during compilation. This allows the compiler to determine their placement precisely, guided by both the static structure of the graph and runtime profiling information.
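To make the idea of graph-level insertion concrete, here is a toy, self-contained sketch (not DeepCompile's actual pass API) of how a pass can rewrite a traced graph so that every parameter access goes through a gather operation; it uses plain `torch.fx` and a stand-in gather function.

```python
import torch
import torch.fx as fx

def gather_param(p):
    # Stand-in for a real all-gather of a ZeRO-3 sharded parameter.
    return p

class Tiny(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.w = torch.nn.Parameter(torch.randn(4, 4))

    def forward(self, x):
        return x @ self.w

gm = fx.symbolic_trace(Tiny())
for node in list(gm.graph.nodes):
    if node.op == "get_attr":  # parameter access in the traced graph
        with gm.graph.inserting_after(node):
            gathered = gm.graph.call_function(gather_param, (node,))
        # Route all consumers of the raw parameter through the gather node,
        # except the gather node itself, which must keep the raw parameter as input.
        node.replace_all_uses_with(gathered, delete_user_cb=lambda u: u is not gathered)
gm.recompile()
print(gm.code)                          # the rewritten forward now calls gather_param(self.w)
print(gm(torch.randn(2, 4)).shape)      # semantics are preserved
```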
One of the key optimizations is **proactive prefetching**, which launches all-gather operations earlier in the computation based on memory usage profiling. This reordering increases the overlap between communication and computation, thereby improving throughput while avoiding OOMs. In addition, small communication operations are often fused to reduce launch latency and improve efficiency.
Another optimization is **selective unsharding**, which keeps certain parameters in an unsharded form during the forward and backward passes when memory conditions permit. This reduces the frequency of all-gather operations and avoids redundant communication, particularly in scenarios where gradient accumulation is enabled.
## Offloading
DeepCompile also supports **adaptive offloading**, which offloads optimizer states to reduce GPU memory pressure. Unlike approaches that offload all the optimizer states, adaptive offloading identifies only the portions that exceed the memory limit—such as momentum and variance used by the Adam optimizer—and schedules data transfers to overlap with computation. This selective and asynchronous strategy minimizes overhead and enables efficient training even in memory-constrained environments.
## ZeRO1
ZeRO-1 differs from ZeRO-3 in that it shards only the optimizer states across devices, while keeping parameters and gradients fully replicated. This approach reduces memory usage with minimal changes to computation flow, making it a lightweight alternative for certain training scenarios.
DeepCompile implements ZeRO-1-style optimization by inserting reduce-scatter operations directly into the computation graph. By avoiding Python-level hooks, this graph-level integration reduces overhead and improves execution efficiency.
# Performance Improvements
## ZeRO-3
We evaluated DeepCompile on Llama-3-70B and Mixtral 8x7B using parameter sharding on top of Hugging Face model implementations.
Figure 3 shows training throughput (TFLOPs/GPU) across different gradient accumulation steps, using 32 H100 GPUs with a sequence length of 1024.
We compare DeepCompile against two DeepSpeed ZeRO-3 baselines: (i) an eager-mode version without compiler support (labelled ZeRO3+Eager), and (ii) a compiled version using PyTorch compiler (labelled ZeRO3+Compile). For DeepCompile, we enabled both proactive prefetching and selective unsharding to demonstrate the combined effect of these optimization passes.
<div align="center"> <img src="media/perf_zero3.png" width="800">
*Figure 3. Achieved throughputs for ZeRO3 training of Llama-3 70B and Mixtral 8x7B models.*
</div>
Across both models, DeepCompile consistently delivers higher throughput. The benefit becomes more pronounced at higher accumulation steps, where the reduced frequency of parameter updates makes selective unsharding more effective. DeepCompile with proactive prefetching and selective unsharding achieves up to 1.28× speedup over ZeRO-3 on Llama-3-70B and 1.54× on Mixtral 8x7B.
Meanwhile, enabling the PyTorch compiler with ZeRO-3, i.e., ZeRO3+Compile introduces minor overheads in some settings. This is because ZeRO-3 includes many conditional branches for runtime features such as prefetching. When the compiler encounters branches that cannot be statically resolved, it splits the computation into multiple graph segments. These fragmented segments can reduce optimization opportunities and introduce additional overheads during execution.
## Offloading
Training models as large as Llama-3 70B with ZeRO-3 typically requires 32 GPUs with 80GB of memory.
DeepSpeed addresses this challenge by offering offloading capabilities, which transfer optimizer states and optionally model parameters to CPU memory to reduce GPU memory usage. DeepCompile also supports offloading through a dedicated optimization pass, but with a few key differences in design.
Unlike the traditional approach of offloading both optimizer computation and memory, DeepCompile offloads only optimizer memory (e.g., momentum, variance, and master weights of Adam optimizer) while the optimizer computation remains on GPU. DeepCompile profiles memory usage during both forward and backward passes to identify when offloading is necessary, and transfers only the required data. This fine-grained approach avoids unnecessary overhead and helps maintain high computational throughput.
Furthermore, DeepCompile overlaps data transfers with computation whenever possible, dynamically adjusting the timing based on observed memory usage patterns. This asynchronous behavior is a crucial aspect of DeepCompile's offloading strategy, allowing it to reduce GPU memory pressure without stalling execution.
We evaluated DeepCompile's offloading using Llama-3 70B on 16xH100-80GB (half the required GPU count) and present the results in Figure 4.
<div align="center">
<img src="media/perf_offload.png" width="400">
*Figure 4. Achieved throughput of optimizer offloading for Llama-3 70B on 16x80GB GPUs*
</div>
We compare against two ZeRO-3 offloading baselines: (i) an eager-mode version without compiler support (ZeRO3+Eager), and (ii) a compiled version using PyTorch compiler (ZeRO3+Compile). As shown by the results, DeepCompile significantly improves offloading efficiency and provides up to 7× speedup over ZeRO3+Eager. In contrast, we see that ZeRO3+Compile achieves similar performance as ZeRO3+Eager.
## ZeRO-1
We also evaluated DeepCompile with ZeRO-1 using the Llama-3-8B model. We compare DeepCompile against two ZeRO-1 baselines: (i) an eager-mode version without compiler support (ZeRO1+Eager), and (ii) a compiled version using PyTorch compiler (ZeRO1+Compile). In our experiment with 8 GPUs and a batch size of 2, DeepCompile achieved consistent throughput improvements across different sequence lengths, as shown in Figure 5.
<div align="center">
<img src="media/perf_zero1.png" width="800">
*Figure 5. Achieved throughput of ZeRO-1 training of Llama-3 8B*
</div>
The most significant speedup was observed with batch size 1 and sequence length 512, where DeepCompile outperformed ZeRO1+Eager by up to 1.9×, and ZeRO1+Compile by up to 2.5×.
While compiler-based approaches can be effective for large batch sizes and long sequences by replacing suboptimal operations with more efficient kernels, they may also introduce overheads in ZeRO-1-style training in the form of *graph breaks* around the communication operations. These overheads become more pronounced with smaller batch sizes and sequence lengths, thus hurting performance compared to the non-compiled execution. In contrast, DeepCompile inserts communication operators directly into the computation graph during compilation, avoiding graph fragmentation and minimizing associated overhead. This makes DeepCompile more robust to small-scale workloads, while still benefiting from compiler-level optimizations.
## Additional Results and Analysis
Please refer to our [arXiv paper](https://arxiv.org/abs/2504.09983) for additional results, such as detailed comparisons across different batch sizes, sequence lengths, and memory usage.
# Looking Ahead
DeepCompile brings the power of compiler-based optimizations to distributed deep learning. By transforming computation graphs and applying profile-guided optimization passes, it enables more efficient training without requiring changes to model code.
This release is just the beginning. We're actively working on expanding the set of optimization passes and improving integration with a broader range of distributed training strategies. Future directions include automated parallelization (sequence/tensor parallelism), smarter memory management, and dynamic adaptation to runtime behavior.
We invite the community to try DeepCompile, explore its capabilities, and contribute to its evolution. Let's build the next generation of scalable deep learning together.
# Acknowledgments
We would like to thank everyone who supported this project.
This project would not have been possible without the PyTorch Compiler—a platform that is not only powerful and flexible, but also a pleasure to work with. We are especially grateful to the developers and researchers behind PyTorch Compiler for making such an excellent foundation available to the community.
# Contributors
This project is the result of a close collaboration between Microsoft and the University of Virginia. The contributors are Masahiro Tanaka, Du Li, Umesh Chand, and Olatunji Ruwase (Microsoft); and Ali Zafar and Haiying Shen (University of Virginia).
# Appendix
## Examples and Benchmarks
Our DeepSpeedExamples repository provides [example code](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/benchmarks/deepcompile) to enable DeepCompile.
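As a quick orientation, the sketch below shows how enabling DeepCompile through the DeepSpeed config might look. The `"compile"`/`"deepcompile"` keys and the `engine.compile()` call are assumptions here, so please treat the linked example code as the authoritative reference.
```python
import torch
import deepspeed

model = torch.nn.Linear(1024, 1024)

ds_config = {
    "train_micro_batch_size_per_gpu": 1,
    "zero_optimization": {"stage": 3},
    "compile": {"deepcompile": True},  # assumed key names; see the linked examples
}

# Standard DeepSpeed initialization; DeepCompile is applied on top of the engine.
engine, _, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config=ds_config,
)
engine.compile()  # assumed API that captures the graph and runs the optimization passes
```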
## Optimization Passes
The following optimization passes are currently available in DeepCompile:
- All-gather & reduce-scatter insertion (ZeRO3)
- Proactive prefetching (ZeRO3)
- Selective unsharding (ZeRO3)
- Reduce-scatter insertion (ZeRO1)
- Adaptive offloading
We used the following combinations of passes in the experiments presented above:
- Improved communication scheduling for ZeRO-3: All-gather & reduce-scatter insertion → Proactive prefetching → Selective unsharding
- Offloading optimizer states for ZeRO-3: All-gather & reduce-scatter insertion → Adaptive offloading
- Reduced overhead and improved overlap for ZeRO-1: Reduce-scatter insertion


View File

@ -0,0 +1,137 @@
<div align="center">
# DeepNVMe: Affordable I/O scaling for Deep Learning Applications.
</div>
# Introduction
We introduced [DeepNVMe](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepnvme/08-2024/README.md) in summer 2024 as a suite of optimizations for tackling I/O bottlenecks in Deep Learning (DL). DeepNVMe delivers significant speedups for I/O bound DL workloads by leveraging storage innovations including local NVMe SSDs, NVIDIA Magnum IO<sup>TM</sup> GPUDirect® Storage (GDS), and Linux Asynchronous I/O (AIO).
In this update, we are delighted to announce DeepNVMe improvements on multiple fronts: (i) expanding application coverage to FastPersist model checkpointing and SGLang inference, (ii) I/O performance scaling by upgrading from PCIe Gen4 to Gen5 NVMe SSDs, and (iii) expanding usability to CPU-only environments, offset-based I/O operations, and tensor data type casting. The results reported in this blog are available in DeepSpeed versions >= [0.17.1](https://github.com/deepspeedai/DeepSpeed/releases/tag/v0.17.1).
# Evaluation environments
Our experiments are conducted on an Azure [ND-H200-v5](https://learn.microsoft.com/en-us/azure/virtual-machines/sizes/gpu-accelerated/nd-h200-v5-series?tabs=sizebasic) VM. The key software configurations are summarized in the following table.
|Software | Version |
|---|---|
|Ubuntu | 24.04.2 |
|PyTorch | 2.6.0 |
|CUDA | 12.6 |
|SGLang | 0.4.4.post4 |
# Addressing I/O Bottlenecks of Deep Learning
We used DeepNVMe to develop FastPersist and ZeRO-Inference to target I/O bottlenecks in DL training and inference respectively. Our experiments are conducted using a single VM, in which we combine the available NVMe SSDs into a single RAID-0 (i.e., disk striping) volume to leverage aggregate read and write bandwidths. Since DeepNVMe can offload tensors using CPU bounce buffers (a.k.a., AIO), or NVIDIA GPUDirect Storage (a.k.a., GDS), we report results for both modes.
## FastPersist: Faster Model Checkpoint Creation
Although saving model checkpoints to persistent storage is critical in model training, it is also a major bottleneck due to the inefficiencies of existing approaches. We developed [FastPersist](https://arxiv.org/abs/2406.13768) to address the performance challenges of checkpointing. FastPersist makes checkpointing overheads negligible during training through three key techniques: (i) DeepNVMe, (ii) data parallelism, and (iii) overlapping I/O and computation.
Our goal here is to demonstrate the impact of DeepNVMe in FastPersist using single-process micro-benchmarks (available [here](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/deepnvme/model_checkpoint)) which serialize a model checkpoint state from HBM to local NVMe. We use the popular PyTorch `torch.save()` as the baseline in our experiments, and integrate FastPersist into `torch.save()` to simplify adoption and performance comparisons.
### Faster Saving of PyTorch Models to local NVMe Storage
We measure the throughput of serializing Phi-3-Mini checkpoint state from HBM to local NVMe storage. The results are summarized in the Figure below. We observe significantly faster checkpointing with FastPersist compared to the baseline. We see speedups of over 20X in the 8xGen5 NVMe settings. We also observe FastPersist scaling with increased NVMe bandwidth of 8xGen5 compared with 4xGen5.
<img src="./media/fastpersist_phi3_mini.png">
<div align="center">
FastPersist provides significantly faster model checkpointing to local NVMe.
</div>
## ZeRO-Inference: Democratizing Generative AI
[ZeRO-Inference](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/inference/huggingface/zero_inference/README.md) is a technology that democratizes access to state-of-the-art models by reducing the GPU costs of model inference. ZeRO-Inference enables inference computations of massive models (hundreds-of-billions of parameters) on as few as one GPU by offloading the model weights to DRAM and NVMe storage. ZeRO-Inference is designed for offline or throughput-oriented inference scenarios. In this blog, we share two updates on ZeRO-Inference. First, we have integrated ZeRO-Inference into SGLang, a state-of-the-art model serving framework. Second, we observed ZeRO-Inference performance scales with the faster NVMe SSDs in the latest Azure SKUs.
### Democratizing SGLang through ZeRO-Inference integration
[SGLang](https://docs.sglang.ai/) is a state-of-the-art serving framework for large language models (LLMs) and vision language models (VLMs). Our integration of ZeRO-Inference into SGLang makes SGLang available to budget-constrained users, and offers a cost-reduction option to existing SGLang users. We used SGLang's [offline benchmarking tool](https://github.com/sgl-project/sglang/blob/main/python/sglang/bench_offline_throughput.py) to measure the generation throughput of LLAMA3-70B on a single H200 with NVMe offloading (LLAMA3-70B cannot fit in the 141GB VRAM without offloading). The experiment is configured with prompt length of 512, generation length of 32, and batch size of 128. We summarize the results in the figure below for both AIO and GDS offloading.
<img src="./media/sg_zinf_llama_70b.png">
<div align="center">
ZeRO-Inference improves SGLang inference with NVMe offloading to reduce hardware costs.
</div>
### Scaling HF Transformer Generation with Faster NVMe SSDs
ZeRO-Inference enhances HF Transformer inference with efficient model offloading to DRAM or NVMe. We previously [evaluated](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepnvme/08-2024/README.md#high-performance-offloading-via-nvme-scaling) LLAMA-3-70B generation performance with NVMe offloading on a single GPU and four Gen4 NVMes in an Azure [NC_A100_v4](https://learn.microsoft.com/en-us/azure/virtual-machines/sizes/gpu-accelerated/nca100v4-series?tabs=sizebasic) VM. We measured the generation speed for a prompt of 512 tokens, output of 32 tokens, and batch size 96. Since NVMe bandwidth was the main bottleneck, we repeat the experiments on Azure ND-H200-v5, which offers Gen5 NVMes. The results summarized in the Figure below show that ZeRO-Inference uses the increased NVMe bandwidths to improve generation speeds. For example, with GDS, generation speed improves from 7 tokens/sec with four Gen4 NVMes to 17 tokens/sec with four Gen5 NVMes, and further to 26 tokens/sec with eight Gen5 NVMes. We observe similar improvements without GDS. These results show that ZeRO-Inference performance can be improved in a cost-effective manner by increasing NVMe bandwidths.
<img src="./media/hf_zinf_llama_70b.png">
<div align="center">
ZeRO-Inference leverages available NVMe bandwidth to scale LLAMA-3-70B generation.
</div>
# I/O performance scaling
We used our `ds_io` benchmarking tool to demonstrate DeepNVMe proportionally scaling I/O performance with available NVMe bandwidths. This empowers users to accelerate I/O bound DL applications at modest cost using more or faster NVMe SSDs. In our experiments, we measure the achieved read and write bandwidths of 1GB data transfers between HBM and NVMes. We evaluate scaling up NVMes from PCIe Gen4 to Gen5, and scaling out from 4 to 8 SSDs. The SSDs are combined into a single RAID-0 (disk striping) volume. We summarize the results in the Figure below which show that DeepNVMe scales I/O performance on both dimensions. Scaling up from 4xGen4 SSDs to 4xGen5 SSDs improves reads from 10GB/sec to 27GB/sec, and writes from 5GB/sec to 11GB/sec. Scaling out from 4xGen5 to 8xGen5 further improves reads to 48GB/sec, and writes to 26GB/sec.
<img src="./media/dnvme_scaling.png">
<div align="center">
Microbenchmark shows DeepNVMe scales I/O performance with available NVMe bandwidth
</div>
# Broadening usability
We have broadened the usage scenarios of DeepNVMe by removing restrictions on hardware environments and I/O operations, as explained below.
## CPU-Only environments
Although GPUs (and similar accelerators) dominate DL, CPUs are still used in important machine learning (ML) workloads such as recommendation systems. However, DeepNVMe was previously unusable in CPU-only environments because it relied on `torch.Tensor.pin_memory()` for page-locked CPU tensors, which does not work in CPU-only builds of `torch`, as illustrated below.
```bash
>>> import torch
>>> torch.__version__
'2.6.0+cpu'
>>> x = torch.empty(1024).pin_memory()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
RuntimeError: Cannot access accelerator device when none is available.
>>>
```
We have made DeepNVMe usable in CPU environments by adding mechanisms for allocating (`new_cpu_locked_tensor()`) and releasing (`free_cpu_locked_tensor()`) page-locked CPU tensors. The snippet below illustrates allocating a pinned CPU tensor (`x`).
```bash
>>> import torch
>>> torch.__version__
'2.6.0+cpu'
>>> from deepspeed.ops.op_builder import AsyncIOBuilder
>>> h = AsyncIOBuilder().load().aio_handle()
>>> x = h.new_cpu_locked_tensor(1024, torch.Tensor())
>>> x.shape
torch.Size([1024])
>>> x.dtype
torch.float32
```
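For completeness, the sketch below pairs the allocation with the corresponding release call, `free_cpu_locked_tensor()`, mentioned above; the buffer size and usage are illustrative.
```python
import torch
from deepspeed.ops.op_builder import AsyncIOBuilder

aio = AsyncIOBuilder().load().aio_handle()
pinned = aio.new_cpu_locked_tensor(1024, torch.Tensor())  # 1024 fp32 elements, page-locked
# ... use `pinned` as a bounce buffer for DeepNVMe reads/writes ...
aio.free_cpu_locked_tensor(pinned)  # return the page-locked memory when done
```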
## Offset-based I/O operations
Previously, DeepNVMe functionality was restricted to reading or writing the entire contents of a file. We have now improved DeepNVMe to read or write a user-specified portion of file content from a given offset. In particular, we have extended the existing read/write APIs to accept a user-specified `file offset` argument (with default value 0) such as below:
```bash
>>> from deepspeed.ops.op_builder import AsyncIOBuilder
>>> help(AsyncIOBuilder().load().aio_handle().pread)
Help on method pread in module async_io:
pread(...) method of async_io.aio_handle instance
pread(self: async_io.aio_handle, buffer: torch.Tensor, filename: str, validate: bool, async: bool, file_offset: int = 0) -> int
```
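For example, the sketch below reads 4 KiB starting at byte offset 8192 into a pinned buffer. The file path is hypothetical, and the positional arguments follow the `pread` signature shown above (`validate`, `async`, `file_offset`), since `async` cannot be passed as a Python keyword.
```python
import torch
from deepspeed.ops.op_builder import AsyncIOBuilder

aio = AsyncIOBuilder().load().aio_handle()
buffer = aio.new_cpu_locked_tensor(1024, torch.Tensor())  # 1024 fp32 elements = 4 KiB

# Synchronous read of buffer.nbytes() bytes starting at offset 8192 of a (hypothetical) file.
completed = aio.pread(buffer, "/tmp/data.bin", False, False, 8192)

aio.free_cpu_locked_tensor(buffer)
```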
## Tensor data type casting
While developing FastPersist, we needed to manipulate model tensors, typically of floating point data types, in byte format for both performance and convenience of I/O operations. However, we could not find a zero-copy mechanism for casting tensors from arbitrary data types to a byte data type (i.e., torch.uint8), so we decided to create one. This functionality is available via the `UtilsBuilder` op as demonstrated in the example below. In the example, we cast a `torch.bfloat16` tensor into `torch.uint8`. Note that due to the zero-copy nature of the functionality, `bf16_tensor` and `byte_tensor` are aliases.
```
>>> import torch
>>> from deepspeed.ops.op_builder import UtilsBuilder
>>> util_ops = UtilsBuilder().load()
>>> bf16_tensor = torch.zeros(1024, dtype=torch.bfloat16, device='cuda')
>>> bf16_tensor
tensor([0., 0., 0., ..., 0., 0., 0.], device='cuda:0', dtype=torch.bfloat16)
>>> byte_tensor = util_ops.cast_to_byte_tensor(bf16_tensor)
>>> byte_tensor
tensor([0, 0, 0, ..., 0, 0, 0], device='cuda:0', dtype=torch.uint8)
>>> bf16_tensor += 1.0
>>> bf16_tensor
tensor([1., 1., 1., ..., 1., 1., 1.], device='cuda:0', dtype=torch.bfloat16)
>>> byte_tensor
tensor([128, 63, 128, ..., 63, 128, 63], device='cuda:0',
dtype=torch.uint8)
```
# Summary
This blog post has provided updates on our continued development of DeepNVMe, an I/O optimization technology for accelerating DL applications. We have announced DeepNVMe improvements on multiple aspects, including application coverage, I/O performance scaling, and usability.
# Acknowledgements
This blog describes work done by Joe Mayer, Logan Adams, and Olatunji Ruwase of the DeepSpeed team at Microsoft.


View File

@ -47,7 +47,7 @@ We used three benchmarking tools for our evaluations. The first is fio, the popu
## High-Performance I/O with CPU Buffers via NVMe Scaling
Our first set of microbenchmark evaluations used fio and ds\_io to measure the performance of transferring 1GB data between NVMe and CPU memory. We configure fio to use the libaio backend for these experiments1. The results are summarized in Figure 1, from which we make two observations. First, DeepNVMe demonstrates high performance as it roughly matches fio, despite being more representative of DL applications. Second, DeepNVMe scales I/O performance almost linearly with available NVMe bandwidth, achieving rates of 10GB/sec reads and 5GB/sec writes.
Our first set of microbenchmark evaluations used fio and ds\_io to measure the performance of transferring 1GB data between NVMe and CPU memory. We configure fio to use the libaio backend for these experiments. The results are summarized in Figure 1, from which we make two observations. First, DeepNVMe demonstrates high performance as it roughly matches fio, despite being more representative of DL applications. Second, DeepNVMe scales I/O performance almost linearly with available NVMe bandwidth, achieving rates of 10GB/sec reads and 5GB/sec writes.
<img src="./media/figure1.png" style="width:6.5in;height:3.42153in" />
@ -85,4 +85,4 @@ In this blog post, we introduced DeepNVMe, an I/O optimization technology create
# Acknowledgements
This work is the result of a deep collaboration between Microsoft and NVIDIA. The contributors include Joe Mayer, Martin Cai, and Olatunji Ruwase from Microsoft; Kiran Modukuri, Vahid Noormofidi, Sourab Gupta, and Sandeep Joshi from Nivida.
This work is the result of a deep collaboration between Microsoft and NVIDIA. The contributors include Joe Mayer, Martin Cai, and Olatunji Ruwase from Microsoft; Kiran Modukuri, Vahid Noormofidi, Sourab Gupta, and Sandeep Joshi from Nvidia.


View File

@ -0,0 +1,183 @@
<p align="center">
<img height="250" src="./images/zenflow-logo.png" alt="zenflow logo"/>
</p>
<div align="center">
# ZenFlow: Stall-Free Offloading Engine for LLM Training
<div align="center">
<img src="./images/zenflow-overview.png" alt="" width="1200" />
<div align="left">
*Figure 1: ZenFlow is DeepSpeed's stall-free offloading engine for LLM training. It decouples GPU and CPU updates by prioritizing important gradients for immediate GPU updates and deferring the rest for asynchronous CPU-side accumulation. By fully overlapping CPU work and PCIe transfers with GPU computation, ZenFlow eliminates stalls and achieves high hardware utilization across both single-GPU and multi-GPU settings.*
## Table of Contents
- [ZenFlow: Stall-Free Offloading Engine for LLM Training](#zenflow-stall-free-offloading-engine-for-llm-training)
- [Table of Contents](#table-of-contents)
- [Introduction](#introduction)
- [ZenFlow at a Glance](#zenflow-at-a-glance)
- [ZenFlow Highlights](#zenflow-highlights)
- [Design Motivation](#design-motivation)
- [ZenFlow Design](#zenflow-design)
- [Getting Started: Try out DeepSpeed-ZenFlow](#getting-started-try-out-deepspeed-zenflow)
- [Citation](#citation)
- [Acknowledgements](#acknowledgements)
---
## Introduction
<div align="center">
<img src="./images/zero-offload-stall.png" alt="" width="600" />
<div align="left">
*Figure 2: ZeRO-Offload causes repeated GPU stalls due to blocking CPU updates and PCIe transfers, leading to >60% idle time per step when training Llama 2-7B on 4× A100s.*
Offloading has become a standard approach to scale fine-tuning of large language models (LLMs) beyond GPU memory limits. Frameworks like ZeRO-Offload reduce GPU memory usage by pushing gradients and optimizer states to the CPU. However, they also create a new bottleneck: expensive GPUs often sit idle, waiting on slow CPU updates and PCIe data transfers. In practice, enabling offloading when training Llama 2-7B on 4× A100 GPUs can inflate each step from 0.5s to over 7s—a 14× slowdown.
<div align="center">
<img src="./images/zenflow-example.png" alt="" width="1200" />
<div align="left">
*Figure 3: In ZeRO-Offload, CPU-side optimizer updates and PCIe transfers dominate iteration time, leaving the GPU idle for over 5 seconds.*
**ZenFlow** addresses this bottleneck with a stall-free training pipeline. It prioritizes high-impact gradients for immediate GPU updates, while offloading the rest to the CPU and applying them asynchronously. These deferred CPU updates are fully overlapped with GPU compute, eliminating stalls and significantly improving throughput. Best of all, ZenFlow maintains the same model accuracy and integrates seamlessly with DeepSpeed.
---
## ZenFlow at a Glance
- **Zero GPU stalls:** Top-k important gradients are updated immediately on GPU; low-priority gradients are asynchronously processed on CPU—no GPU wait time.
- **Asynchronous and bounded:** ZenFlow decouples CPU and GPU execution with a bounded-staleness strategy that preserves convergence.
- **Auto-tuned:** ZenFlow adapts update intervals at runtime based on gradient dynamics—no need to tune manually.
---
## ZenFlow Highlights
ZenFlow is the **first offloading framework** to offer a **bounded-asynchronous** update scheme that preserves convergence while delivering **up to 5× end-to-end speed-up** over ZeRO-Offload.
### Performance
| Feature | Benefit |
|--------|---------|
| Up to **5×** end-to-end speed-up over ZeRO-Offload and **6.3×** over ZeRO-Infinity | Faster time-to-convergence |
| **> 85% reduction in GPU stalls** on A100 / H100 nodes | Keeps GPUs busy, higher utilization |
| **≈ 2× lower PCIe traffic** (1.13× model size per step vs. 2× in ZeRO) | Less bandwidth pressure on clusters |
| **Maintains or improves accuracy** on GLUE (OPT-350M → Llama-13B) | No accuracy loss |
| **Lightweight gradient selection** (6000× cheaper than full AllGather) | Scales to multi-GPU settings without memory footprint spikes |
| **Auto-tuning (Zen-auto)** automatically adapts update interval on-the-fly | No manual knob tuning |
For more detailed performance results, please refer to our [arXiv paper](https://arxiv.org/abs/2505.12242).
---
## Design Motivation
Training large models with offloading can save GPU memory, but often at the cost of *performance*. In this section, we briefly discuss three topics. **First**, we explain why coupling CPU-side optimizer updates with GPU compute leads to severe GPU stalls during LLM fine-tuning. **Next**, we quantify how full-gradient offloading saturates the limited PCIe bandwidth on A100/H100 servers, inflating iteration time. **Finally**, we reveal the highly skewed importance distribution of gradients, showing that uniformly updating all parameters in GPUs at the same time is wasteful and unnecessary.
### Offloading-Induced GPU Stalls
<div align="center">
<img src="./images/zenflow-no-overlap.png" alt="" width="1200" />
<div align="left">
*Figure 4: CPU updates dominate step time, causing >60% GPU idle due to poor overlap with compute.*
Synchronous offloading frameworks (e.g., ZeRO-Offload) keep the GPU idle while the CPU performs a full optimizer step and transfers updated parameters back to GPU. For Llama-2-7B with 4× A100, the CPU path can take **longer than 4s** while the backward pass takes **approximately 2s**, so **over 60% of each iteration is pure GPU wait time**. Eliminating this serialization is essential for achieving high GPU utilization.
### Bandwidth Bottlenecks
A single training step moves a full copy of the model gradients from GPU to CPU and a full copy of the model parameters back, i.e., **2× model size of PCIe traffic per step**. Even on PCIe 4.0 (≈ 32 GB/s), Llama-2-13B pushes ~40 GB per iteration, adding **> 1s** of transfer latency.
### Unequal Gradient Importance
Not all gradients matter equally. Our analysis shows that **the top 1% of gradient channels contribute over 90% of the ℓ²-norm energy** during fine-tuning. In other words, most updates have little impact on model learning, yet still incur disproportionately high compute and I/O costs in traditional offloading pipelines.
This skew in gradient importance opens the door to a better design: update critical gradients on GPU right away, and defer the rest for asynchronously batched, lower-priority updates on CPU. ZenFlow turns this idea into a principled, efficient training engine.
<div align="center">
<img src="./images/zenflow-gradients.png" alt="" width="1200" />
<div align="left">
*Figure 5: Top 1% of gradients may contribute over 85% of gradient norms.*
---
## ZenFlow Design
ZenFlow is designed around three key ideas that separate critical and non-critical gradient updates while minimizing communication bottlenecks. Here's how we break the tight coupling between GPU and CPU computation to create a **stall-free** pipeline.
### Idea 1: Importance-Aware Top-k Gradient Update
Not all gradients are equally impactful for training. ZenFlow introduces an **importance-aware** design that prioritizes updates for the top-k most significant gradients. These gradients are updated directly on the GPU, using its high compute bandwidth. This approach allows us to **reduce the size of the per-step gradient update** by nearly **50%**, cutting down the communication load by around 2×.
For the rest of the gradients, which contribute less to the model's learning, ZenFlow batches them and performs asynchronous updates on the CPU. These updates are **deferred** until they are sufficiently accumulated, thereby reducing the impact on training speed.
### Idea 2: Bounded-Asynchronous CPU Accumulation
ZenFlow's **asynchronous accumulation** allows the CPU to stay busy while the GPU performs other computations. We apply an **accumulation window** for the non-critical gradients, allowing them to accumulate over several iterations before updating. This gives ZenFlow the ability to process **multiple rounds of gradient updates** concurrently, eliminating idle time typically spent waiting for the CPU optimizer.
By carefully coordinating CPU updates with GPU execution, ZenFlow **fully hides CPU execution** behind GPU computation—ensuring that GPUs remain actively utilized, avoiding stalls, and **maximizing hardware efficiency**.
### Idea 3: Lightweight Gradient Selection
A key challenge in distributed training is **selecting important gradients** without introducing prohibitive communication and GPU memory costs. Traditional systems rely on global synchronization (via `AllGather`) to gather full gradients, which can become a major bottleneck in multi-GPU settings.
ZenFlow solves this with a **lightweight gradient proxy**: instead of transferring full gradients, ZenFlow uses a **per-column gradient norm** to approximate the importance of each gradient. By computing a compact summary of per-column gradients (e.g., squared norms), ZenFlow reduces communication volume by more than **4,000×**—with nearly no loss in accuracy.
This approach allows ZenFlow to **scale efficiently across GPUs**, without high memory or communication overhead, and it supports **dynamic gradient selection** as the model evolves.
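To make the proxy concrete, here is a small, purely illustrative sketch (not ZenFlow's actual implementation) that scores the columns of a weight gradient by their squared norms and selects the top-k columns for immediate GPU update.
```python
import torch

def topk_columns(grad: torch.Tensor, k_ratio: float = 0.01) -> torch.Tensor:
    """Return indices of the top-k columns of `grad`, ranked by squared column norm."""
    col_scores = grad.pow(2).sum(dim=0)        # one scalar per column: the compact proxy
    k = max(1, int(k_ratio * grad.shape[1]))
    return torch.topk(col_scores, k).indices   # columns to update on the GPU right away

grad = torch.randn(4096, 11008)                # e.g., an MLP weight gradient
important_cols = topk_columns(grad)
# In a multi-GPU setting, only `col_scores` (one value per column) would need to be
# reduced across ranks, which is far cheaper than all-gathering the full gradient.
```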
### Putting It All Together: ZenFlow's Zero-Stall Pipeline
<div align="center">
<img src="./images/zenflow-workflow.png" alt="" width="1200" />
<div align="left">
*Figure 6: ZenFlow's stall-free pipeline overlaps CPU updates and transfers with multi-step GPU compute.*
1. **Forward/Backward Pass on GPU:** ZenFlow processes the forward and backward passes on the GPU, immediately updating the **top-k gradients** on the GPU without waiting for the CPU.
2. **Gradient Transfer to CPU:** While the GPU is busy, gradients from the current iteration (or previous ones) are transferred to the CPU over a dedicated PCIe stream. This is done in parallel with GPU computation, without causing any GPU wait time.
3. **CPU Update:** Once a batch of non-critical gradients has accumulated, the CPU performs the update asynchronously. This update typically spans multiple GPU iterations, but is hidden behind GPU work, making it virtually invisible to the overall pipeline.
4. **Double Buffering:** ZenFlow uses **double buffering** to manage the newly updated gradients. When the CPU update is complete, the new parameters are transferred back to the GPU. The swap is as fast as a pointer flip—no need to reload the entire model or re-launch the kernel.
By constantly **overlapping GPU computation with CPU-side work**, ZenFlow transforms the traditional compute → wait → update cycle into a continuous, **stall-free pipeline**.
---
## Getting Started: Try out DeepSpeed-ZenFlow
To try out DeepSpeed-ZenFlow, please refer to the [ZenFlow tutorial](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/training/DeepSpeed-ZenFlow/README.md) in our DeepSpeedExamples repo.
---
## Citation
```bibtex
@article{lan2025zenflow,
title = {ZenFlow: Enabling Stall-Free Offloading Training via Asynchronous Updates},
author = {Tingfeng Lan and Yusen Wu and Bin Ma and Zhaoyuan Su and Rui Yang and Tekin Bicer and Masahiro Tanaka and Olatunji Ruwase and Dong Li and Yue Cheng},
journal = {arXiv preprint arXiv:2505.12242},
year = {2025}
}
```
---
## Acknowledgements
This work is the result of a close collaboration between University of Virginia (UVA), University of California, Merced (UC Merced), Argonne National Laboratory (ANL) and DeepSpeed team.
The contributors include [Tingfeng Lan](https://antlera.github.io/), [Yusen Wu](https://joshwoo2003.github.io/), [Zhaoyuan Su](https://alexsssu.github.io/), [Rui Yang](https://ruiyang00.github.io/), and [Yue Cheng](https://tddg.github.io/) from UVA; [Bin Ma](https://www.linkedin.com/in/bin-ma-ba665b182/) and [Dong Li](https://faculty.ucmerced.edu/dong-li/) from UC Merced; [Tekin Bicer](https://www.anl.gov/profile/tekin-bicer) from ANL; [Olatunji Ruwase](https://www.linkedin.com/in/tunji-ruwase-088952/) and [Masahiro Tanaka](https://www.linkedin.com/in/masahiro-tanaka-77482926/) from the DeepSpeed team. We especially thank [Olatunji Ruwase](https://www.linkedin.com/in/tunji-ruwase-088952/) and [Masahiro Tanaka](https://www.linkedin.com/in/masahiro-tanaka-77482926/) for their early feedback and insightful discussions and also for open-source community support.


View File

@ -0,0 +1,242 @@
<div align="center">
# Automatic Tensor Parallel (AutoTP) Training of Hugging Face models
</div>
# Introduction
Tensor parallelism (TP) is an important memory optimization for training large-scale deep learning models. Despite the popularity of training Hugging Face (HF) [models](https://huggingface.co/models), the model scaling options for the **[HF trainer](https://huggingface.co/docs/transformers/main_classes/trainer)** were previously limited to sharded data parallelism through [ZeRO](https://huggingface.co/docs/accelerate/usage_guides/deepspeed)/[FSDP](https://huggingface.co/docs/accelerate/usage_guides/fsdp). While ZeRO3 offers superior memory efficiency, it incurs significant communication costs. ZeRO (1/2) has lower communication overhead, but in the case of very large models, it cannot be used directly due to memory limitations. Therefore, combining TP with ZeRO (1/2) offers more balanced options for memory and performance. Moreover, through TP, we can alleviate the batch scaling limitations imposed by ZeRO/FSDP.
We are pleased to announce that DeepSpeed now provides native automatic tensor parallel training for Hugging Face (HF) transformers. This new feature builds on DeepSpeed's [AutoTP](https://www.deepspeed.ai/tutorials/automatic-tensor-parallelism/) mechanism, which was previously restricted to inference. AutoTP training can be combined with ZeRO to unlock unprecedented efficiency benefits for HF model post-training, including:
**1**. Model scaling with lower communication costs than FSDP/ZeRO3 (e.g., use AutoTP + ZeRO1 to achieve ZeRO3 memory savings).
**2**. Batch size scaling for faster training and increased throughput.
**3**. Context length scaling to enable new application scenarios.
We have integrated AutoTP training with ZeRO1 & ZeRO2, with ZeRO3 integration on the way. AutoTP training is available in DeepSpeed versions >= 0.16.4.
# Batch Scaling with AutoTP Training + ZeRO
The following is a batch scaling experiment of Llama3 8B training conducted on [Gaudi2 Accelerator](https://www.intel.com/content/www/us/en/products/details/processors/ai-accelerators/gaudi.html).
<div align="center">
<img src="media/batchscale.png">
*Figure 1. Batch scaling experiment on Gaudi2, showing throughput improvements from 2 to 4 cards by combining AutoTP and ZeRO. The micro-batch size (mbs) used is the maximum possible value for the given configuration. A higher speedup indicates better performance.*
</div>
<div align="center">
<img src="media/flowchart.png">
*Figure 2. Model training with AutoTP + ZeRO*
</div>
Figure 2 illustrates the basic flowchart. The division between TP and ZeRO is implemented through the AutoTP parser and the ZeRO Wrapper in [Accelerate](https://github.com/huggingface/accelerate.git). In addition, the TP-based dataloader and save mechanism are supported in both DeepSpeed and Accelerate.
# Usage
Although we evaluated AutoTP training with Llama2 & Llama3 models in this blog, we expect compatibility with other Hugging Face models, especially [those](https://www.deepspeed.ai/tutorials/automatic-tensor-parallelism/) previously validated with AutoTP inference.
**Requirements**
- `deepspeed >= 0.16.4`
- `transformers >= 4.50.1`
- `accelerate >= 1.6.0`
**Enable TP training**
Similar to ZeRO, AutoTP training is enabled using the [deepspeed configuration file](https://www.deepspeed.ai/docs/config-json/) by specifying ```[tensor_parallel][autotp_size]```.
```
"ZeRO_optimization": {
"stage": 1,
"gather_16bit_weights_on_model_save": true,
...
},
"tensor_parallel":{
"autotp_size": 4
},
```
The parallel configuration follows this logic:
```
tp_size = autotp_size
dp_size = num_gpus / tp_size
```
Note that the global_batch_size (gbs) changes with different TP settings:
```
gbs (only dp) = per_device_batch_size * n_gpus * gradient_accumulation_steps
gbs (dp with tp) = per_device_batch_size * n_gpus / tp_size * gradient_accumulation_steps
```
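For example, with 8 GPUs, `per_device_batch_size=2`, and `gradient_accumulation_steps=4`, the global batch size is 64 with pure data parallelism but drops to 16 when `tp_size=4` (DP2 × TP4), so the per-device batch size or accumulation steps may need to be increased to keep gbs constant.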
**Save Model**
Saving checkpoints and model files is fully compatible with HF transformers. The [trainer.save_model()](https://huggingface.co/docs/transformers/v4.49.0/en/main_classes/trainer#transformers.Trainer.save_model) method saves the original model. Ensure ```gather_16bit_weights_on_model_save``` is set to ```true``` in the [deepspeed configuration file](https://www.deepspeed.ai/docs/config-json/).
```
"zero_optimization": {
...
"gather_16bit_weights_on_model_save": true,
},
```
```
trainer.save_model(your_saved_path)
```
Models saved this way can be directly used for HF format inference without intermediate transformations.
**Saving Checkpoints and Resuming**
Saving checkpoints remains compatible with HF transformers. Use [trainer.save_state()](https://huggingface.co/docs/transformers/v4.49.0/en/main_classes/trainer#transformers.Trainer.save_state) or set the save interval for automatic saving, which can be used to resume training.
```
trainer.train(resume_from_checkpoint="your_saved_path/checkpoint-1200")
```
# Example
We validated AutoTP training using a supervised finetuning (SFT) task: [stanford_alpaca](https://github.com/tatsu-lab/stanford_alpaca). The original benchmark model used in this project is Llama2-7B. The example code is also available [here](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/training/tensor_parallel).
**Training Loss curve**
The following loss curves depict SFT training, where gbs is uniformly set to 32 and other configurations match the default experiment settings from [stanford_alpaca](https://github.com/tatsu-lab/stanford_alpaca). The loss curves are largely consistent across the following setups:
- ZeRO3
- TP + disable ZeRO
- ZeRO1 and ZeRO1 + AutoTP
- ZeRO2 and ZeRO2 + AutoTP
<div align="center">
<img src="media/zero3.png">
*Figure 3. Loss curve of ZeRO3 stage training (gbs=32, dp=8)*
</div>
<div align="center">
<img src="media/tp8.png">
*Figure 4. Loss curve of AutoTP training (gbs=32, tp=8)*
</div>
<div align="center">
<img src="media/tpzero1.png">
*Figure 5. Loss curve of AutoTP + ZeRO1 training (gbs=32, dp=2, tp=4)*
</div>
<div align="center">
<img src="media/tpzero2.png">
*Figure 6. Loss curve of AutoTP + ZeRO2 training (gbs=32, dp=2, tp=4)*
</div>
**Resuming Training**
We tested recovery training curves from step 1200 in AutoTP + ZeRO1 and AutoTP + ZeRO2, which align with the original training curves.
<div align="center">
<img src="media/zero1tpload.png">
*Figure 7. AutoTP + ZeRO1 resuming training*
<img src="media/zero2tpload.png">
*Figure 8. AutoTP + ZeRO2 resuming training*
</div>
**Model Evaluation**
We conducted inference evaluations for the [MMLU task](https://github.com/EleutherAI/lm-evaluation-harness).
In MMLU, the scores for AutoTP + ZeRO1 and ZeRO1, as well as AutoTP + ZeRO2 and ZeRO2, match closely, and all show a clear improvement over the pre-trained model before SFT.
<div align="center">
| Groups | Version | Filter | n-shot | Metric | Model before SFT | ZeRO1 DP8 training | ZeRO1 TP4 DP2 training | ZeRO2 DP8 training | ZeRO2 TP4 DP2 training |
|--------|---------|--------|--------|--------|-----------------------|--------------------|------------------------|--------------------|------------------------|
| mmlu | 2 | none | | acc | 0.4185 ± 0.0041 | 0.4472 ± 0.0041 | 0.4444 ± 0.0041 | 0.4543 ± 0.0041 | 0.4529 ± 0.0041 |
| - humanities | 2 | none | | acc | 0.3979 ± 0.0069 | 0.4185 ± 0.0070 | 0.4145 ± 0.0069 | 0.4274 ± 0.0070 | 0.4272 ± 0.0070 |
| - other | 2 | none | | acc | 0.4712 ± 0.0089 | 0.5249 ± 0.0087 | 0.5182 ± 0.0088 | 0.5282 ± 0.0087 | 0.5269 ± 0.0087 |
| - social sciences | 2 | none | | acc | 0.4742 ± 0.0089 | 0.5070 ± 0.0089 | 0.5083 ± 0.0088 | 0.5151 ± 0.0088 | 0.5115 ± 0.0089 |
| - stem | 2 | none | | acc | 0.3428 ± 0.0084 | 0.3549 ± 0.0084 | 0.3539 ± 0.0084 | 0.3622 ± 0.0084 | 0.3609 ± 0.0084 |
*Table 1. MMLU score with Llama2-7B inference*
</div>
# Miscellaneous
Users who define their own dataloader must ensure data consistency within ```deepspeed.utils.groups.get_tensor_model_parallel_group()```; DeepSpeed provides basic validation functions to assist with this, and a simple cross-rank check is sketched below.
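As one example (assuming a typical `torch.distributed` setup and that the batch tensor lives on the device expected by the active communication backend), the ranks in a TP group can compare a cheap checksum of their batch:
```python
import torch
import torch.distributed as dist
from deepspeed.utils import groups

def assert_batch_consistent_across_tp(batch: torch.Tensor) -> None:
    """Cheap sanity check: all ranks in the same TP group should see the same batch."""
    tp_group = groups.get_tensor_model_parallel_group()
    checksum = batch.detach().double().sum()
    low, high = checksum.clone(), checksum.clone()
    dist.all_reduce(low, op=dist.ReduceOp.MIN, group=tp_group)
    dist.all_reduce(high, op=dist.ReduceOp.MAX, group=tp_group)
    assert torch.allclose(low, high), "Ranks in the same TP group received different data!"
```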
Furthermore, users who are not using the transformers library can replace the ```TensorParallel_Layer``` layer and its subclasses as needed. See the ```prepare_tp_model``` function in ```unit/model_parallelism/test_autotp_training.py```. Users can also define different shard and gather behaviors for subclasses of ```TensorParallel_Layer```.
# Ongoing Work
- **Optimization**: Communication/Activation optimization.
- **Usability**: Support the [Transformers TP plan](https://github.com/huggingface/transformers/blob/336dc69d63d56f232a183a3e7f52790429b871ef/src/transformers/models/llama/configuration_llama.py#L145), decouple the AutoTP parser, and expand model testing.
Theoretically, features supported by ZeRO should also be supported, though extensive testing is pending.
Bug reports, enhancements, and additional model training examples are welcome.
# Contributors
This work was made possible through a deep collaboration between Intel and Microsoft. The contributors include Mingzhi Liu, Guokai Ma, Kiefer Kuah, Yejing Lai, Kurt Chen, Yejun Guo, Guangxin Xu, Xiaofei Feng, and Yang Wang from Intel; Guanhua Wang and Olatunji Ruwase from Microsoft.


View File

@ -97,7 +97,7 @@ DeepSpeed can be installed on Windows in two ways. The simpler
# Summary
Enabling DeepSpeed, a popular deep learning framework, to run natively on Windows, the most popular operating system, is an important step toward letting everyone and every organization benefit from the current AI revolution. In this blog, we shared the early results we have achieved toward this goal. Although DeepSpeed support for Windows is still under active development, we hope the above results are already useful and encouraging to our users. Our next steps cover multi-GPU support, weight quantization, and performance optimization.
Enabling DeepSpeed, a popular deep learning framework, to run natively on Windows, the most popular operating system, is a critical step toward letting every person and every organization benefit from the ongoing AI revolution. In this blog, we shared the early results we have achieved toward this goal. Although DeepSpeed support for Windows is still under active development, we hope the above results are already useful and encouraging to our users. Our next steps cover multi-GPU support, weight quantization, and performance optimization.
# Acknowledgements
The completion of this project was made possible by the strong collaboration of current and former DeepSpeed members, including Costin Eseanu, Logan Adams, Elton Zheng, Reza Yazdani Aminabadi, Martin Cai, and Olatunji Ruwase. We also thank the DeepSpeed users who promptly raised this need, provided critical workarounds, partial solutions, and constructive feedback, and most importantly, walked alongside us throughout.

View File

@ -10,6 +10,7 @@ set DS_BUILD_FP_QUANTIZER=0
set DS_BUILD_GDS=0
set DS_BUILD_RAGGED_DEVICE_OPS=0
set DS_BUILD_SPARSE_ATTN=0
set DS_BUILD_DEEP_COMPILE=0
python -m build --wheel --no-isolation

4
ci/__init__.py Normal file
View File

@ -0,0 +1,4 @@
# Copyright (c) DeepSpeed Team.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team

43
ci/accelerate.py Normal file
View File

@ -0,0 +1,43 @@
# Copyright (c) Snowflake.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
from pathlib import Path
import modal
ROOT_PATH = Path(__file__).parents[1]
# yapf: disable
image = (modal.Image
.from_registry("pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel", add_python="3.10")
.run_commands("apt update && apt install -y libaio-dev")
.apt_install("git")
.run_commands("uv pip install --system --compile-bytecode datasets==3.6.0")
.run_commands(
"git clone https://github.com/huggingface/accelerate && \
uv pip install --system --compile-bytecode ./accelerate[testing]"
)
.pip_install_from_requirements(ROOT_PATH / "requirements/requirements.txt", gpu="any")
.pip_install_from_requirements(ROOT_PATH / "requirements/requirements-dev.txt", gpu="any")
.add_local_dir(ROOT_PATH , remote_path="/root/", copy=True)
.run_commands("pip install /root")
.add_local_dir(ROOT_PATH / "accelerator", remote_path="/root/deepspeed/accelerator")
.add_local_dir(ROOT_PATH / "csrc", remote_path="/root/deepspeed/ops/csrc")
.add_local_dir(ROOT_PATH / "op_builder", remote_path="/root/deepspeed/ops/op_builder")
)
app = modal.App("deepspeedai-accelerate-ci", image=image)
@app.function(
gpu="l40s:1",
timeout=1800,
)
def pytest():
import subprocess
subprocess.run(
"pytest /accelerate/tests/deepspeed".split(),
check=True,
cwd=ROOT_PATH / ".",
)

39
ci/torch_latest.py Normal file
View File

@ -0,0 +1,39 @@
# Copyright (c) Snowflake.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
from pathlib import Path
import modal
ROOT_PATH = Path(__file__).parents[1]
# yapf: disable
image = (modal.Image
.from_registry("pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel", add_python="3.10")
.run_commands("apt update && apt install -y libaio-dev")
.pip_install_from_requirements(ROOT_PATH / "requirements/requirements.txt", gpu="any")
.pip_install_from_requirements(ROOT_PATH / "requirements/requirements-dev.txt", gpu="any")
.add_local_dir(ROOT_PATH , remote_path="/root/", copy=True)
.run_commands("pip install /root")
.add_local_dir(ROOT_PATH / "accelerator", remote_path="/root/deepspeed/accelerator")
.add_local_dir(ROOT_PATH / "csrc", remote_path="/root/deepspeed/ops/csrc")
.add_local_dir(ROOT_PATH / "op_builder", remote_path="/root/deepspeed/ops/op_builder")
)
app = modal.App("deepspeedai-torch-latest-ci", image=image)
@app.function(
gpu="l40s:2",
timeout=1800,
)
def pytest():
import subprocess
subprocess.run(
"pytest -n 4 --verbose tests/unit/runtime/zero/test_zero.py tests/unit/runtime/half_precision/test_bf16.py --torch_ver=2.6 --cuda_ver=12.4".split(),
check=True,
cwd=ROOT_PATH / ".",
)

View File

@ -101,7 +101,7 @@ int io_prep_generator::prep_iocbs(const int n_iocbs, std::vector<struct iocb*>*
return actual_n_iocbs;
}
int get_file_size(const char* filename, int64_t& size)
int64_t get_file_size(const char* filename, int64_t& size)
{
struct stat st;
if (stat(filename, &st) == -1) { return -1; }
@ -109,6 +109,14 @@ int get_file_size(const char* filename, int64_t& size)
return 0;
}
int64_t get_fd_file_size(const int fd, int64_t& size)
{
struct stat st;
if (fstat(fd, &st) == -1) { return -1; }
size = st.st_size;
return 0;
}
void* ds_page_aligned_alloc(const int64_t size, const bool lock)
{
void* ptr;

View File

@ -78,4 +78,5 @@ struct io_prep_generator {
void* ds_page_aligned_alloc(const int64_t size, const bool lock = false);
int get_file_size(const char* filename, int64_t& size);
int64_t get_file_size(const char* filename, int64_t& size);
int64_t get_fd_file_size(const int fd, int64_t& size);

View File

@ -11,20 +11,19 @@ io_op_desc_t::io_op_desc_t(const bool read_op,
const torch::Tensor& buffer,
const int fd,
const char* filename,
const int64_t file_num_bytes,
const int intra_op_parallelism,
const bool validate,
const int64_t file_offset)
: _read_op(read_op),
_buffer(buffer),
_fd(fd),
_filename(filename),
_file_num_bytes(file_num_bytes),
_filename((filename == nullptr) ? std::string() : filename),
_file_offset(file_offset),
_intra_op_parallelism(intra_op_parallelism),
_num_bytes_per_thread(static_cast<int64_t>(buffer.nbytes()) / intra_op_parallelism),
_validate(validate)
{
if (validate) { assert(nullptr != filename); }
}
char* io_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); }

View File

@ -13,8 +13,7 @@ struct io_op_desc_t {
const bool _read_op;
torch::Tensor _buffer;
int _fd;
const std::string _filename;
const int64_t _file_num_bytes;
std::string _filename;
const int _intra_op_parallelism;
const int64_t _num_bytes_per_thread;
torch::Tensor _contiguous_buffer;
@ -25,7 +24,6 @@ struct io_op_desc_t {
const torch::Tensor& buffer,
const int fd,
const char* filename,
const int64_t file_num_bytes,
const int intra_op_parallelism,
const bool validate,
const int64_t file_offset);

View File

@ -9,23 +9,15 @@
using namespace std;
cpu_op_desc_t::cpu_op_desc_t(
const std::unique_ptr<struct deepspeed_pin_tensor_t>& pinned_tensor_mgr,
const bool read_op,
const torch::Tensor& buffer,
const std::unique_ptr<struct deepspeed_pin_tensor_t>& pinned_tensor_mgr,
const int fd,
const char* filename,
const int64_t file_num_bytes,
const int intra_op_parallelism,
const bool validate,
const int64_t file_offset)
: io_op_desc_t(read_op,
buffer,
fd,
filename,
file_num_bytes,
intra_op_parallelism,
validate,
file_offset),
: io_op_desc_t(read_op, buffer, fd, filename, intra_op_parallelism, validate, file_offset),
_cpu_buffer(buffer),
_pinned_tensor_mgr(pinned_tensor_mgr),
_is_managed_bounce_buffer(false)
@ -66,7 +58,8 @@ void cpu_op_desc_t::finish()
void cpu_op_desc_t::validate()
{
validate_aio_operation(_read_op, _filename.c_str(), data_ptr(), _file_num_bytes);
const auto num_io_bytes = static_cast<int64_t>(_contiguous_buffer.nbytes());
validate_aio_operation(_read_op, _filename.c_str(), data_ptr(), num_io_bytes);
}
void cpu_op_desc_t::run(const int tid,

View File

@ -13,12 +13,11 @@ struct cpu_op_desc_t : io_op_desc_t {
bool _is_managed_bounce_buffer;
const std::unique_ptr<struct deepspeed_pin_tensor_t>& _pinned_tensor_mgr;
cpu_op_desc_t(const bool read_op,
cpu_op_desc_t(const std::unique_ptr<struct deepspeed_pin_tensor_t>& pinned_tensor_mgr,
const bool read_op,
const torch::Tensor& buffer,
const std::unique_ptr<struct deepspeed_pin_tensor_t>& pinned_tensor_mgr,
const int fd,
const char* filename,
const int64_t file_num_bytes,
const int intra_op_parallelism,
const bool validate,
const int64_t file_offset);

View File

@ -6,7 +6,6 @@
/*
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <condition_variable>
#include <memory>
#include "deepspeed_py_io_handle.h"

View File

@ -10,10 +10,30 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
#include "deepspeed_py_io_handle.h"
#include <cstdlib>
#define O_DIRECT_ALIGNMENT 512
using namespace std;
static void _start_aio_thread(std::shared_ptr<struct deepspeed_aio_thread_t> ctxt) { ctxt->run(); }
static bool is_valid_bytes_to_read(const char* filename,
const int64_t file_offset,
const int64_t num_bytes_to_read)
{
int64_t num_file_bytes;
if (-1 == get_file_size(filename, num_file_bytes)) {
const auto error_code = errno;
report_file_error(filename, " fstat for read", error_code);
return false;
}
if ((file_offset + num_bytes_to_read) > num_file_bytes) {
std::cout << filename << ": file_offset + buffer nbytes > file bytes "
<< (file_offset + num_bytes_to_read) << " > " << num_file_bytes << std::endl;
}
assert((file_offset + num_bytes_to_read) <= num_file_bytes);
return true;
}
deepspeed_io_handle_t::deepspeed_io_handle_t(const int block_size,
const int queue_depth,
const bool single_submit,
@ -58,6 +78,11 @@ const bool deepspeed_io_handle_t::get_overlap_events() const { return _overlap_e
const int deepspeed_io_handle_t::get_intra_op_parallelism() const { return _intra_op_parallelism; }
const int deepspeed_io_handle_t::get_alignment() const
{
return _intra_op_parallelism * O_DIRECT_ALIGNMENT;
}
int deepspeed_io_handle_t::read(torch::Tensor& buffer,
const char* filename,
const bool validate,
@ -185,7 +210,7 @@ int deepspeed_io_handle_t::wait()
completed_op->finish();
close(completed_op->_fd);
if (!completed_op->_filename.empty()) { close(completed_op->_fd); }
--_num_pending_ops;
++num_completed_ops;
@ -199,7 +224,8 @@ bool deepspeed_io_handle_t::_is_valid_parallel_aio_op(const bool read_op, const
const auto op_string = read_op ? "Read" : "Write";
if (num_bytes % get_intra_op_parallelism()) {
std::cout << "deepspeed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes
<< " not divisible by thread count = " << get_intra_op_parallelism() << std::endl;
<< " not divisible by intra op parallelism = " << get_intra_op_parallelism()
<< std::endl;
return false;
}
@ -211,45 +237,61 @@ std::shared_ptr<struct io_op_desc_t> deepspeed_io_handle_t::_create_io_op_desc(
const torch::Tensor& buffer,
const int fd,
const char* filename,
const int64_t file_num_bytes,
const bool validate,
const int64_t file_offset)
{
return std::make_shared<cpu_op_desc_t>(read_op,
return std::make_shared<cpu_op_desc_t>(_pinned_tensor_mgr,
read_op,
buffer,
_pinned_tensor_mgr,
fd,
filename,
file_num_bytes,
_intra_op_parallelism,
validate,
file_offset);
}
int deepspeed_io_handle_t::_pread(const torch::Tensor& buffer,
const int fd,
const char* filename,
const bool validate,
const bool async,
const int64_t file_offset)
{
auto scheduled_op = _create_io_op_desc(true, buffer, fd, filename, validate, file_offset);
_schedule_aio_work(scheduled_op);
if (async) { return 0; }
return wait();
}
int deepspeed_io_handle_t::pread(const torch::Tensor& buffer,
const char* filename,
const bool validate,
const bool async,
const int64_t file_offset)
{
int64_t num_file_bytes;
if (-1 == get_file_size(filename, num_file_bytes)) {
const auto error_code = errno;
report_file_error(filename, " fstat for read", error_code);
return -1;
}
// buffer can exceed file size to enable 4k alignment
const auto buffer_bytes = static_cast<int64_t>(buffer.nbytes());
assert((num_file_bytes % _intra_op_parallelism) == 0);
if (!is_valid_bytes_to_read(filename, file_offset, buffer_bytes)) { return -1; }
if (!_is_valid_parallel_aio_op(true, buffer_bytes)) { return -1; }
const auto fd = open_file(filename, true);
if (fd == -1) { return -1; }
auto scheduled_op =
_create_io_op_desc(true, buffer, fd, filename, num_file_bytes, validate, file_offset);
return _pread(buffer, fd, filename, validate, async, file_offset);
}
int deepspeed_io_handle_t::_pwrite(const torch::Tensor& buffer,
const int fd,
const char* filename,
const bool validate,
const bool async,
const int64_t file_offset)
{
auto scheduled_op = _create_io_op_desc(false, buffer, fd, filename, validate, file_offset);
_schedule_aio_work(scheduled_op);
@ -265,21 +307,13 @@ int deepspeed_io_handle_t::pwrite(const torch::Tensor& buffer,
const int64_t file_offset)
{
const auto num_write_bytes = static_cast<int64_t>(buffer.nbytes());
assert((num_write_bytes % _intra_op_parallelism) == 0);
if (!_is_valid_parallel_aio_op(false, num_write_bytes)) { return -1; }
const auto fd = open_file(filename, false);
if (fd == -1) { return -1; }
auto scheduled_op =
_create_io_op_desc(false, buffer, fd, filename, num_write_bytes, validate, file_offset);
_schedule_aio_work(scheduled_op);
if (async) { return 0; }
return wait();
return _pwrite(buffer, fd, filename, validate, async, file_offset);
}
int deepspeed_io_handle_t::sync_pread(torch::Tensor& buffer,
@ -310,6 +344,16 @@ int deepspeed_io_handle_t::async_pwrite(const torch::Tensor& buffer,
return pwrite(buffer, filename, false, true, file_offset);
}
int deepspeed_io_handle_t::async_pwrite(const torch::Tensor& buffer,
const int fd,
const int64_t file_offset = 0)
{
const auto num_write_bytes = static_cast<int64_t>(buffer.nbytes());
if (!_is_valid_parallel_aio_op(false, num_write_bytes)) { return -1; }
return _pwrite(buffer, fd, nullptr, false, true, file_offset);
}
at::Tensor deepspeed_io_handle_t::new_cpu_locked_tensor(const int64_t num_elem,
const torch::Tensor& example_tensor)
{

View File

@ -37,6 +37,7 @@ struct deepspeed_io_handle_t {
const bool get_single_submit() const;
const bool get_overlap_events() const;
const int get_intra_op_parallelism() const;
const int get_alignment() const;
int read(torch::Tensor& buffer,
const char* filename,
@ -67,6 +68,7 @@ struct deepspeed_io_handle_t {
int async_pread(torch::Tensor& buffer, const char* filename, const int64_t file_offset);
int async_pwrite(const torch::Tensor& buffer, const char* filename, const int64_t file_offset);
int async_pwrite(const torch::Tensor& buffer, const int fd, const int64_t file_offset);
// TODO: Make API's args to be shape and dtype.
torch::Tensor new_cpu_locked_tensor(const int64_t num_elem,
@ -84,11 +86,24 @@ struct deepspeed_io_handle_t {
bool _is_valid_parallel_aio_op(const bool read_op, const int64_t num_bytes);
int _pread(const torch::Tensor& buffer,
const int fd,
const char* filename,
const bool validate,
const bool async,
const int64_t file_offset);
int _pwrite(const torch::Tensor& buffer,
const int fd,
const char* filename,
const bool validate,
const bool async,
const int64_t file_offset);
virtual std::shared_ptr<struct io_op_desc_t> _create_io_op_desc(const bool read_op,
const torch::Tensor& buffer,
const int fd,
const char* filename,
const int64_t file_num_bytes,
const bool validate,
const int64_t file_offset);
};

View File

@ -6,7 +6,6 @@
/*
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <torch/extension.h>
#include "deepspeed_py_aio_handle.h"
#include "deepspeed_py_copy.h"
@ -34,6 +33,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
.def("get_single_submit", &deepspeed_aio_handle_t::get_single_submit)
.def("get_overlap_events", &deepspeed_aio_handle_t::get_overlap_events)
.def("get_intra_op_parallelism", &deepspeed_aio_handle_t::get_intra_op_parallelism)
.def("get_alignment", &deepspeed_aio_handle_t::get_alignment)
.def("read",
&deepspeed_aio_handle_t::read,
@ -53,7 +53,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
.def("pread",
&deepspeed_aio_handle_t::pread,
"Parallel file read with option of parallelism. Returns count of completed read ops",
"Parallel file read with option of asynchronous completion. If synchronous, returns "
"count of completed read ops",
"buffer"_a,
"filename"_a,
"validate"_a,
@ -62,7 +63,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
.def("pwrite",
&deepspeed_aio_handle_t::pwrite,
"Parallel file write with option of parallelism. Returns count of completed write ops",
"Parallel file write with option of asynchronous completion. If synchronous, returns "
"count of completed write ops",
"buffer"_a,
"filename"_a,
"validate"_a,
@ -71,7 +73,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
.def("sync_pread",
&deepspeed_aio_handle_t::sync_pread,
"Synchrononous parallel file read. Returns count of completed read ops",
"Synchronous parallel file read. Returns count of completed read ops",
"buffer"_a,
"filename"_a,
"file_offset"_a = 0)
@ -86,17 +88,27 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
.def("async_pread",
&deepspeed_aio_handle_t::async_pread,
"Asynchronous parallel file read. Returns 0 on success. Returns 0 on success, and "
"following wait() returns count of completed ops.",
"subsequent wait() returns count of completed ops.",
"buffer"_a,
"filename"_a,
"file_offset"_a = 0)
.def(
"async_pwrite",
py::overload_cast<const torch::Tensor&, const char*, const int64_t>(
&deepspeed_aio_handle_t::async_pwrite),
"Asynchronous parallel file write. Returns 0 on success, and subsequent wait() returns "
"count of completed ops.",
"buffer"_a,
"filename"_a,
"file_offset"_a = 0)
.def("async_pwrite",
&deepspeed_aio_handle_t::async_pwrite,
"Asynchronous parallel file write. Returns 0 on success, and following wait() returns "
"count of completed ops.",
py::overload_cast<const torch::Tensor&, const int, const int64_t>(
&deepspeed_aio_handle_t::async_pwrite),
"Asynchronous parallel file write using opened python file object.",
"buffer"_a,
"filename"_a,
"fd"_a,
"file_offset"_a = 0)
.def("new_cpu_locked_tensor",

View File

@ -17,7 +17,7 @@ from perf_sweep_utils import READ_OP_DESC, WRITE_OP_DESC, BENCH_LOG_DIR, \
READ_LOG_DIR, WRITE_LOG_DIR
from deepspeed.ops.op_builder import AsyncIOBuilder
OTHER_OPTIONS = '--handle'
OTHER_OPTIONS = '--engine aio_handle'
PERF_SCRIPT = 'test_ds_aio.py'
DEFAULT_SWEEP_CONFIG = {
"block_size": ["128K", "1M"],
@ -109,6 +109,20 @@ def get_sweep_config_dict(sweep_config_json):
return sweep_config
QUEUE_DEPTH = "--queue_depth"
BLOCK_SIZE = "--block_size"
SINGLE_SUBMIT = "--single_submit"
SEQUENTIAL_REQUESTS = "--sequential_requests"
THREAD_COUNT = "--threads"
IO_PARALLEL = "--io_parallel"
DEPRECATED_KEYS = {THREAD_COUNT: "multi_process"}
def _handle_key_deprecation(key):
return DEPRECATED_KEYS.get(f'--{key}', key)
def get_sweep_cmd_lines(sweep_config_dict):
def flatten_options(key, value_list):
@ -123,7 +137,7 @@ def get_sweep_cmd_lines(sweep_config_dict):
return flat_list
flat_list = [flatten_options(key, value) for key, value in sweep_config_dict.items()]
flat_list = [flatten_options(_handle_key_deprecation(key), value) for key, value in sweep_config_dict.items()]
cmd_list = list(itertools.product(*flat_list))
cmd_list = [list(cmd) for cmd in cmd_list]
#dump_cmd_lines(cmd_list)
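A standalone restatement of the key-deprecation shim added above, with a worked example: legacy sweep-config keys such as "threads" are rewritten to "multi_process", while every other key passes through unchanged.

# Self-contained sketch mirroring the shim in aio_bench_perf_sweep.py.
DEPRECATED_KEYS = {"--threads": "multi_process"}

def _handle_key_deprecation(key):
    return DEPRECATED_KEYS.get(f'--{key}', key)

assert _handle_key_deprecation('threads') == 'multi_process'    # legacy configs keep working
assert _handle_key_deprecation('queue_depth') == 'queue_depth'  # everything else is untouched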

View File

@ -0,0 +1,21 @@
python test_ds_aio.py \
--read \
--handle --io_size 400M \
--loops 3 \
--folder_to_device_mapping \
/mnt/nvme23/aio:0 \
/mnt/nvme23/aio:1 \
/mnt/nvme23/aio:2 \
/mnt/nvme23/aio:3 \
/mnt/nvme45/aio:4 \
/mnt/nvme45/aio:5 \
/mnt/nvme45/aio:6 \
/mnt/nvme45/aio:7 \
/mnt/nvme67/aio:8 \
/mnt/nvme67/aio:9 \
/mnt/nvme67/aio:10 \
/mnt/nvme67/aio:11 \
/mnt/nvme89/aio:12 \
/mnt/nvme89/aio:13 \
/mnt/nvme89/aio:14 \
/mnt/nvme89/aio:15 \

View File

@ -0,0 +1,20 @@
python test_ds_aio.py \
--handle --io_size 400M \
--loops 3 \
--folder_to_device_mapping \
/mnt/nvme23/aio:0 \
/mnt/nvme23/aio:1 \
/mnt/nvme23/aio:2 \
/mnt/nvme23/aio:3 \
/mnt/nvme45/aio:4 \
/mnt/nvme45/aio:5 \
/mnt/nvme45/aio:6 \
/mnt/nvme45/aio:7 \
/mnt/nvme67/aio:8 \
/mnt/nvme67/aio:9 \
/mnt/nvme67/aio:10 \
/mnt/nvme67/aio:11 \
/mnt/nvme89/aio:12 \
/mnt/nvme89/aio:13 \
/mnt/nvme89/aio:14 \
/mnt/nvme89/aio:15 \

View File

@ -0,0 +1,6 @@
python test_ds_aio.py \
--read \
--handle --io_size 400M \
--loops 3 \
--folder /mnt/nvme23/aio \
--multi_process 16

View File

@ -0,0 +1,5 @@
python test_ds_aio.py \
--handle --io_size 400M \
--loops 3 \
--folder /mnt/nvme23/aio \
--multi_process 16

View File

@ -9,6 +9,7 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
import argparse
import os
from test_ds_aio_utils import refine_integer_value
from ds_aio_constants import AIO_HANDLE, AIO_BASIC, TORCH_FAST_IO, TORCH_IO, VALID_ENGINES
from deepspeed.accelerator import get_accelerator
MAPPING_DELIMITER = ':'
@ -21,6 +22,9 @@ def refine_args(args):
if args.block_size and type(args.block_size) == str:
args.block_size = refine_integer_value(args.block_size)
if args.fast_io_size and type(args.fast_io_size) == str:
args.fast_io_size = refine_integer_value(args.fast_io_size)
return args
@ -83,6 +87,19 @@ def validate_args(args):
no_error = no_error and no_mapping_error
error_messages += mapping_error_messages
# Validate --engine
if args.engine not in VALID_ENGINES:
no_error = False
error_messages.append(f'Invalid engine {args.engine}. Valid options = {VALID_ENGINES}')
# Validate --engine=torch_io
if args.engine == TORCH_IO:
if args.read:
no_error = False
error_messages.append(f'Read not currently supported for --engine={TORCH_IO}')
if not no_error:
print(f'Found {len(error_messages)} validation error(s)')
# Validate --gpu, --use_gds
if args.use_gds and not args.gpu:
error_messages.append(f'--gpu must be set to transfer with --use_gds')
@ -111,6 +128,8 @@ def parse_arguments():
parser.add_argument('--io_size', type=str, default=None, required=True, help='Number of bytes to read or write.')
parser.add_argument('--fast_io_size', type=str, default='64M', help='Size of fast_io pinned buffer (bytes).')
parser.add_argument('--read', action='store_true', help='Perform read I/O (default is write)')
parser.add_argument('--multi_process',
@ -138,7 +157,13 @@ def parse_arguments():
parser.add_argument('--validate', action='store_true', help='Perform validation of I/O transfer in library.')
parser.add_argument('--handle', action='store_true', help='Use AIO handle.')
parser.add_argument(
'--engine',
type=str,
default=AIO_HANDLE,
help=
f'Engine to perform I/O. Options are [{AIO_HANDLE}, {AIO_BASIC}, {TORCH_IO}, {TORCH_FAST_IO}]. Default is aio_handle'
)
parser.add_argument('--loops', type=int, default=3, help='Count of operation repetitions')
@ -152,6 +177,20 @@ def parse_arguments():
action='store_true',
help='For GPU memory transfers, measure impact of bounce buffer pinning on critical path.')
parser.add_argument('--torch_legacy_save', action='store_true', help='Use torch legacy save approach')
parser.add_argument('--use_accelerator_pin_memory',
action='store_true',
help='Obtain pinned (CPU page-locked) tensors from accelerator')
parser.add_argument('--warmup_loops', type=int, default=1, help='Count of operation warmup repetitions')
parser.add_argument('--include_warmup_time', action='store_true', help='Include warmup latency in results')
parser.add_argument('--different_file_each_iteration',
action='store_true',
help='Read/write a different file on each iteration.')
args = parser.parse_args()
print(f'args = {args}')
return args
@ -163,7 +202,7 @@ def get_validated_args():
if not validate_args(args):
quit()
print(f'Successful validation of command line arguments')
args.total_loops = args.warmup_loops + args.loops
peer_tag = 'gpu' if args.gpu else 'process'
args.mapping_dict = _get_mapping_dict(args)
args.mapping_list = [(device_id, folder) for device_id, folder in args.mapping_dict.items()]
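A short sketch of the new engine validation paths added above: an unknown engine name is rejected, and --engine=torch_io combined with --read is flagged because reads are not yet supported there. The helper function is a condensed restatement for illustration, not the parser's actual code path.

# Condensed restatement of the --engine checks in validate_args().
VALID_ENGINES = ['aio_handle', 'aio_basic', 'torch_io', 'torch_fastio']

def _engine_errors(engine, read):
    errors = []
    if engine not in VALID_ENGINES:
        errors.append(f'Invalid engine {engine}. Valid options = {VALID_ENGINES}')
    if engine == 'torch_io' and read:
        errors.append('Read not currently supported for --engine=torch_io')
    return errors

assert _engine_errors('torch_fastio', read=False) == []
assert len(_engine_errors('torch_io', read=True)) == 1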

View File

@ -6,129 +6,59 @@
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import torch
import os
import time
from deepspeed.ops.aio import AsyncIOBuilder
from multiprocessing import Pool, Barrier
from test_ds_aio_utils import report_results, task_log, task_barrier
from test_ds_aio_utils import task_log, create_filename, create_file, create_page_locked_tensor
from ds_aio_constants import *
def pre_basic(args, tid, read_op):
io_string = "Read" if read_op else "Write"
num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size
file = args.read_file if read_op else f'{args.write_file}.{tid}'
class AIOBasic_Engine(object):
task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory()
task_log(tid, f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}')
def __init__(self, args, tid, read_op):
self.ctxt = self._create_context(args, tid, read_op)
ctxt = {}
ctxt['file'] = file
ctxt['num_bytes'] = num_bytes
ctxt['buffer'] = buffer
ctxt['elapsed_sec'] = 0
def fini(self):
self.ctxt[BUFFER].detach()
self.ctxt[BUFFER] = None
return ctxt
def pre_basic_read(pool_params):
args, tid = pool_params
ctxt = pre_basic(args, tid, True)
return ctxt
def pre_basic_write(pool_params):
args, tid = pool_params
ctxt = pre_basic(args, tid, False)
return ctxt
def post_basic(pool_params):
_, _, ctxt = pool_params
ctxt["buffer"].detach()
ctxt["buffer"] = None
return ctxt
def main_basic_read(pool_params):
args, tid, ctxt = pool_params
start_time = time.time()
AsyncIOBuilder().load().aio_read(ctxt['buffer'], ctxt['file'], args.block_size, args.queue_depth,
args.single_submit, not args.sequential_requests, args.validate)
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
return ctxt
def main_basic_write(pool_params):
args, tid, ctxt = pool_params
start_time = time.time()
AsyncIOBuilder().load().aio_write(ctxt['buffer'], ctxt['file'], args.block_size, args.queue_depth,
args.single_submit, not args.sequential_requests, args.validate)
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
return ctxt
def get_schedule(args, read_op):
schedule = {}
if read_op:
schedule['pre'] = pre_basic_read
schedule['post'] = post_basic
schedule['main'] = main_basic_read
else:
schedule['pre'] = pre_basic_write
schedule['post'] = post_basic
schedule['main'] = main_basic_write
return schedule
def _aio_handle_tasklet(pool_params):
args, tid, read_op = pool_params
num_processes = len(args.mapping_dict)
# Create schedule
schedule = get_schedule(args, read_op)
task_log(tid, f'schedule = {schedule}')
task_barrier(aio_barrier, num_processes)
# Run pre task
task_log(tid, f'running pre-task')
ctxt = schedule["pre"]((args, tid))
task_barrier(aio_barrier, num_processes)
# Run main tasks in a loop
ctxt["main_task_sec"] = 0
for i in range(args.loops):
task_log(tid, f'running main task {i}')
def read(self, args, tid, loop_id):
start_time = time.time()
ctxt = schedule["main"]((args, tid, ctxt))
task_barrier(aio_barrier, num_processes)
stop_time = time.time()
ctxt["main_task_sec"] += stop_time - start_time
AsyncIOBuilder().load().aio_read(self.ctxt[BUFFER], self.ctxt[FILE], args.block_size, args.queue_depth,
args.single_submit, not args.sequential_requests, args.validate)
end_time = time.time()
self.ctxt[ELAPSED_SEC] += end_time - start_time
# Run post task
task_log(tid, f'running post-task')
ctxt = schedule["post"]((args, tid, ctxt))
task_barrier(aio_barrier, num_processes)
def write(self, args, tid, loop_id):
# Avoid overwriting existing files as it could be artificially faster
if os.path.isfile(self.ctxt[FILE]):
os.remove(self.ctxt[FILE])
return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops
start_time = time.time()
AsyncIOBuilder().load().aio_write(self.ctxt[BUFFER], self.ctxt[FILE], args.block_size, args.queue_depth,
args.single_submit, not args.sequential_requests, args.validate)
end_time = time.time()
self.ctxt[ELAPSED_SEC] += end_time - start_time
def _create_context(self, args, tid, read_op):
io_string = "Read" if read_op else "Write"
device_id, folder = args.mapping_list[tid]
filename = create_filename(folder, args.read, args.io_size, tid)
if args.read and not (os.path.isfile(filename) and os.path.getsize(filename) == args.io_size):
create_file(filename, args.io_size)
def _init_tasklet(b):
global aio_barrier
aio_barrier = b
task_log(tid, f'Allocate tensor of size {args.io_size} bytes')
buffer = create_page_locked_tensor(args.io_size, True)
def aio_basic_multiprocessing(args, read_op):
num_processes = len(args.mapping_dict)
b = Barrier(num_processes)
pool_params = [(args, p, read_op) for p in range(num_processes)]
with Pool(processes=num_processes, initializer=_init_tasklet, initargs=(b, )) as p:
pool_results = p.map(_aio_handle_tasklet, pool_params)
task_log(tid,
f'{io_string} file {filename} of size {args.io_size} bytes from buffer on device {buffer.device}')
report_results(args, read_op, pool_results)
task_log(tid, f'created deepspeed aio basic engine')
ctxt = {}
ctxt[FILE] = filename
ctxt[NUM_BYTES] = args.io_size
ctxt[BUFFER] = buffer
ctxt[ELAPSED_SEC] = 0
return ctxt
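A minimal sketch of the one-shot aio_read/aio_write calls that AIOBasic_Engine wraps, assuming the async_io extension is available; the file path, block size, and queue depth below are illustrative.

import torch
from deepspeed.ops.aio import AsyncIOBuilder

aio = AsyncIOBuilder().load()
buffer = torch.empty(1024 * 1024, dtype=torch.uint8, device='cpu').pin_memory()

# (buffer, filename, block_size, queue_depth, single_submit, overlap_events, validate)
aio.aio_write(buffer, '/tmp/ds_aio_basic_example.bin', 1024 * 1024, 32, False, True, False)
aio.aio_read(buffer, '/tmp/ds_aio_basic_example.bin', 1024 * 1024, 32, False, True, False)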

View File

@ -0,0 +1,19 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
AIO_HANDLE = 'aio_handle'
AIO_BASIC = 'aio_basic'
TORCH_IO = 'torch_io'
TORCH_FAST_IO = 'torch_fastio'
VALID_ENGINES = [AIO_HANDLE, AIO_BASIC, TORCH_IO, TORCH_FAST_IO]
BUFFER = 'buffer'
BOUNCE_BUFFER = 'bounce_buffer'
NUM_BYTES = 'num_bytes'
FILE = 'file'
HANDLE = 'handle'
ELAPSED_SEC = 'elapsed_sec'
FAST_IO_BUFFER = 'fast_io_buffer'
USE_CPU_LOCKED_TENSOR = 'cpu_locked_tensor'

View File

@ -2,221 +2,105 @@
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
"""
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import torch
import os
import time
from multiprocessing import Pool, Barrier
from deepspeed.ops.aio import AsyncIOBuilder
from deepspeed.ops.op_builder import GDSBuilder
from test_ds_aio_utils import report_results, task_log, task_barrier, create_filename, create_file
from deepspeed.accelerator import get_accelerator
BUFFER = 'buffer'
BOUNCE_BUFFER = 'bounce_buffer'
from test_ds_aio_utils import task_log, create_filename, create_file, create_page_locked_tensor
from ds_aio_constants import *
def pre_handle(args, tid, read_op):
io_string = "Read" if read_op else "Write"
gds = True if args.use_gds else False
device_id, folder = args.mapping_list[tid]
filename = create_filename(folder, args.read, args.io_size, tid)
if args.read and not (os.path.isfile(filename) and os.path.getsize(filename) == args.io_size):
create_file(filename, args.io_size)
class AIOHandle_Engine(object):
task_log(tid, f'Allocate tensor of size {args.io_size} bytes')
bounce_buffer = None
if args.gpu:
device_name = get_accelerator().device_name(device_id)
buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, device=device_name)
if not (args.slow_bounce_buffer or gds):
bounce_buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8,
device='cpu').pin_memory()
else:
buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, device='cpu').pin_memory()
task_log(tid,
f'{io_string} file {filename} of size {args.io_size} bytes from buffer on device {buffer.device}',
force=True)
def __init__(self, args, tid, read_op):
self.ctxt = self._create_context(args, tid, read_op)
io_parallel = args.io_parallel if args.io_parallel else 1
if gds:
handle = GDSBuilder().load().gds_handle(args.block_size, args.queue_depth, args.single_submit,
not args.sequential_requests, io_parallel)
handle.pin_device_tensor(buffer)
else:
def fini(self):
for buf in [BUFFER, BOUNCE_BUFFER]:
if self.ctxt[buf] is not None:
if self.ctxt[USE_CPU_LOCKED_TENSOR]:
self.ctxt[HANDLE].free_cpu_locked_tensor(self.ctxt[buf])
self.ctxt[buf].detach()
self.ctxt[buf] = None
def read(self, args, tid, loop_id):
handle = self.ctxt[HANDLE]
start_time = time.time()
dest_buffer = BOUNCE_BUFFER if self.ctxt[BOUNCE_BUFFER] is not None else BUFFER
ret = handle.pread(self.ctxt[dest_buffer], self.ctxt[FILE][loop_id], args.validate, True)
assert ret != -1
handle.wait()
if dest_buffer == BOUNCE_BUFFER:
self.ctxt[BUFFER].data.copy_(self.ctxt[BOUNCE_BUFFER].data)
end_time = time.time()
self.ctxt[ELAPSED_SEC].append(end_time - start_time)
def write(self, args, tid, loop_id):
handle = self.ctxt[HANDLE]
start_time = time.time()
if self.ctxt[BOUNCE_BUFFER] is not None:
source_buffer = BOUNCE_BUFFER
self.ctxt[BOUNCE_BUFFER].data.copy_(self.ctxt[BUFFER].data)
else:
source_buffer = BUFFER
ret = handle.pwrite(self.ctxt[source_buffer], self.ctxt[FILE][loop_id], args.validate, True)
assert ret != -1
handle.wait()
end_time = time.time()
self.ctxt[ELAPSED_SEC].append(end_time - start_time)
def _create_files(self, args, folder, tid):
if args.different_file_each_iteration:
filenames = [
create_filename(folder, args.read, args.io_size, f'{tid}_{l}') for l in range(args.total_loops)
]
else:
filenames = [
create_filename(folder, args.read, args.io_size, f'{tid}_{0}') for _ in range(args.total_loops)
]
if args.read:
for f in filenames:
if not (os.path.isfile(f) and os.path.getsize(f) == args.io_size):
create_file(f, args.io_size)
else:
for f in filenames:
if os.path.isfile(f):
os.remove(f)
return filenames
def _create_context(self, args, tid, read_op):
io_string = "Read" if read_op else "Write"
device_id, folder = args.mapping_list[tid]
filenames = self._create_files(args, folder, tid)
io_parallel = args.io_parallel if args.io_parallel else 1
handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit,
not args.sequential_requests, io_parallel)
task_log(tid, f'created deepspeed aio handle')
task_log(tid, f'created deepspeed aio handle engine')
ctxt = {}
ctxt['file'] = filename
ctxt['num_bytes'] = args.io_size
ctxt['handle'] = handle
ctxt['gds'] = gds
ctxt[BUFFER] = buffer
ctxt[BOUNCE_BUFFER] = bounce_buffer
ctxt['elapsed_sec'] = 0
bounce_buffer = None
if args.gpu:
buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, device=f'cuda:{device_id}')
bounce_buffer = create_page_locked_tensor(args.io_size, args.use_accelerator_pin_memory, handle)
else:
buffer = create_page_locked_tensor(args.io_size, args.use_accelerator_pin_memory, handle)
task_log(tid, f'Allocate tensor of size {args.io_size} bytes')
return ctxt
ctxt = {}
ctxt[FILE] = filenames
ctxt[NUM_BYTES] = args.io_size
ctxt[HANDLE] = handle
ctxt[BUFFER] = buffer
ctxt[BOUNCE_BUFFER] = bounce_buffer
ctxt[ELAPSED_SEC] = []
ctxt[USE_CPU_LOCKED_TENSOR] = not args.use_accelerator_pin_memory
task_log(tid,
f'{io_string} file {filenames} of size {args.io_size} bytes from buffer on device {buffer.device}',
force=True)
def pre_handle_read(pool_params):
args, tid = pool_params
ctxt = pre_handle(args, tid, True)
return ctxt
def pre_handle_write(pool_params):
args, tid = pool_params
ctxt = pre_handle(args, tid, False)
return ctxt
def post_handle(pool_params):
_, _, ctxt = pool_params
for buf in [BUFFER, BOUNCE_BUFFER]:
if ctxt[buf] is not None:
if ctxt['gds']:
ctxt['handle'].unpin_device_tensor(ctxt[buf])
ctxt[buf].detach()
ctxt[buf] = None
return ctxt
def main_parallel_read(pool_params):
args, tid, ctxt = pool_params
handle = ctxt['handle']
start_time = time.time()
dest_buffer = BOUNCE_BUFFER if ctxt[BOUNCE_BUFFER] is not None else BUFFER
ret = handle.pread(ctxt[dest_buffer], ctxt['file'], args.validate, 0, True)
assert ret != -1
handle.wait()
if dest_buffer == BOUNCE_BUFFER:
ctxt[BUFFER].data.copy_(ctxt[BOUNCE_BUFFER].data)
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
return ctxt
def main_parallel_write(pool_params):
args, tid, ctxt = pool_params
# Avoid overwriting existing files as it could be artificially faster
if os.path.isfile(ctxt['file']):
os.remove(ctxt['file'])
handle = ctxt['handle']
start_time = time.time()
if ctxt[BOUNCE_BUFFER] is not None:
source_buffer = BOUNCE_BUFFER
ctxt[BOUNCE_BUFFER].data.copy_(ctxt[BUFFER].data)
else:
source_buffer = BUFFER
ret = handle.pwrite(ctxt[source_buffer], ctxt['file'], args.validate, True)
assert ret != -1
handle.wait()
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
return ctxt
def main_handle_read(pool_parms):
args, tid, ctxt = pool_parms
handle = ctxt['handle']
start_time = time.time()
dest_buffer = BOUNCE_BUFFER if ctxt[BOUNCE_BUFFER] is not None else BUFFER
ret = handle.read(ctxt[dest_buffer], ctxt['file'], args.validate)
assert ret != -1
if dest_buffer == BOUNCE_BUFFER:
ctxt[BUFFER].data.copy_(ctxt[BOUNCE_BUFFER].data)
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
return ctxt
def main_handle_write(pool_parms):
args, tid, ctxt = pool_parms
# Avoid overwriting existing files as it could be artificially faster
if os.path.isfile(ctxt['file']):
os.remove(ctxt['file'])
handle = ctxt['handle']
start_time = time.time()
if ctxt[BOUNCE_BUFFER] is not None:
source_buffer = BOUNCE_BUFFER
ctxt[BOUNCE_BUFFER].data.copy_(ctxt[BUFFER].data)
else:
source_buffer = BUFFER
ret = handle.write(ctxt[source_buffer], ctxt['file'], args.validate)
assert ret != -1
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
return ctxt
def get_schedule(args, read_op):
schedule = {}
if read_op:
schedule['pre'] = pre_handle_read
schedule['post'] = post_handle
schedule['main'] = main_parallel_read
else:
schedule['pre'] = pre_handle_write
schedule['post'] = post_handle
schedule['main'] = main_parallel_write
return schedule
def _aio_handle_tasklet(pool_params):
args, tid, read_op = pool_params
num_processes = len(args.mapping_dict)
# Create schedule
schedule = get_schedule(args, read_op)
task_log(tid, f'schedule = {schedule}')
task_barrier(aio_barrier, num_processes)
# Run pre task
task_log(tid, f'running pre-task')
ctxt = schedule["pre"]((args, tid))
task_barrier(aio_barrier, num_processes)
# Run main tasks in a loop
ctxt["main_task_sec"] = 0
for i in range(args.loops):
task_log(tid, f'running main task {i}')
start_time = time.time()
ctxt = schedule["main"]((args, tid, ctxt))
task_barrier(aio_barrier, num_processes)
stop_time = time.time()
ctxt["main_task_sec"] += stop_time - start_time
# Run post task
task_log(tid, f'running post-task')
ctxt = schedule["post"]((args, tid, ctxt))
task_barrier(aio_barrier, num_processes)
return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops
def _init_tasklet(b):
global aio_barrier
aio_barrier = b
def aio_handle_multiprocessing(args, read_op):
num_processes = len(args.mapping_dict)
b = Barrier(num_processes)
pool_params = [(args, p, read_op) for p in range(num_processes)]
with Pool(processes=num_processes, initializer=_init_tasklet, initargs=(b, )) as p:
pool_results = p.map(_aio_handle_tasklet, pool_params)
report_results(args, read_op, pool_results)
return ctxt
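A sketch of the bounce-buffer write path that AIOHandle_Engine uses for GPU tensors: stage the device tensor into a pinned CPU buffer, then hand that buffer to the async handle and wait for completion. It assumes a CUDA device and the async_io extension; the path and sizes are hypothetical.

import torch
from deepspeed.ops.aio import AsyncIOBuilder

io_size = 4 * 1024 * 1024
handle = AsyncIOBuilder().load().aio_handle(1024 * 1024, 32, False, True, 1)

gpu_buffer = torch.randint(high=128, size=(io_size, ), dtype=torch.uint8, device='cuda:0')
bounce = handle.new_cpu_locked_tensor(io_size, torch.empty(0, dtype=torch.uint8))

bounce.data.copy_(gpu_buffer.data)    # device -> pinned host staging copy
# (buffer, filename, validate, async)
assert handle.pwrite(bounce, '/tmp/ds_aio_handle_example.bin', False, True) != -1
handle.wait()                         # block until the asynchronous write completes

handle.free_cpu_locked_tensor(bounce)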

View File

@ -0,0 +1,126 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import time
from multiprocessing import Pool, Barrier
from ds_aio_constants import AIO_BASIC, TORCH_FAST_IO, TORCH_IO
from test_ds_aio_utils import report_results, task_log, task_barrier
from ds_aio_handle import AIOHandle_Engine
from ds_aio_basic import AIOBasic_Engine
from torch_io import TorchIO_Engine
from torch_fastio_engine import Torch_FastIO_Engine
def prepare_operation(args, tid, read_op):
if args.engine == TORCH_IO:
io_engine = TorchIO_Engine(args, tid, read_op)
elif args.engine == AIO_BASIC:
io_engine = AIOBasic_Engine(args, tid, read_op)
elif args.engine == TORCH_FAST_IO:
io_engine = Torch_FastIO_Engine(args, tid, read_op)
else:
io_engine = AIOHandle_Engine(args, tid, read_op)
return io_engine
def prepare_read(pool_params):
args, tid = pool_params
return prepare_operation(args, tid, True)
def prepare_write(pool_params):
args, tid = pool_params
return prepare_operation(args, tid, False)
def post_operation(pool_params):
_, _, io_engine = pool_params
io_engine.fini()
def read_operation(pool_params):
args, tid, loop_id, io_engine = pool_params
return io_engine.read(args, tid, loop_id)
def write_operation(pool_params):
args, tid, loop_id, io_engine = pool_params
return io_engine.write(args, tid, loop_id)
def get_schedule(args, read_op):
schedule = {}
if read_op:
schedule['pre'] = prepare_read
schedule['post'] = post_operation
schedule['main'] = read_operation
else:
schedule['pre'] = prepare_write
schedule['post'] = post_operation
schedule['main'] = write_operation
return schedule
def io_engine_tasklet(pool_params):
args, tid, read_op = pool_params
num_processes = len(args.mapping_dict)
# Create schedule
schedule = get_schedule(args, read_op)
task_log(tid, f'schedule = {schedule}')
task_barrier(aio_barrier, num_processes)
# Run pre task
task_log(tid, f'running pre-task')
io_engine = schedule["pre"]((args, tid))
task_barrier(aio_barrier, num_processes)
# Run main tasks in a loop
io_engine.ctxt["main_task_sec"] = []
for i in range(args.total_loops):
task_log(tid, f'running main task {i}')
start_time = time.time()
schedule["main"]((args, tid, i, io_engine))
task_barrier(aio_barrier, num_processes)
stop_time = time.time()
io_engine.ctxt["main_task_sec"].append(stop_time - start_time)
# Run post task
task_log(tid, f'running post-task')
schedule["post"]((args, tid, io_engine))
task_barrier(aio_barrier, num_processes)
ctxt = io_engine.ctxt
# return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops
if args.include_warmup_time:
e2e_latency_sec = sum(ctxt["main_task_sec"])
task_latency_sec = sum(ctxt["elapsed_sec"])
actual_loops = args.total_loops
else:
e2e_latency_sec = sum(ctxt["main_task_sec"][args.warmup_loops:])
task_latency_sec = sum(ctxt["elapsed_sec"][args.warmup_loops:])
actual_loops = args.loops
l = ctxt["elapsed_sec"]
task_log(tid, f'task_latency_sec = {l}')
return e2e_latency_sec, task_latency_sec, ctxt["num_bytes"] * actual_loops
def _init_tasklet(b):
global aio_barrier
aio_barrier = b
def io_engine_multiprocessing(args, read_op):
num_processes = len(args.mapping_dict)
b = Barrier(num_processes)
pool_params = [(args, p, read_op) for p in range(num_processes)]
with Pool(processes=num_processes, initializer=_init_tasklet, initargs=(b, )) as p:
pool_results = p.map(io_engine_tasklet, pool_params)
report_results(args, read_op, pool_results)
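A worked example (illustrative numbers) of how io_engine_tasklet reports results: with --loops 3 and --warmup_loops 1, four iterations are executed, but unless --include_warmup_time is set the first iteration is excluded from both the latency sums and the byte total.

main_task_sec = [0.50, 0.40, 0.41, 0.39]   # total_loops = warmup_loops + loops = 4
elapsed_sec   = [0.45, 0.36, 0.37, 0.35]
warmup_loops, loops, num_bytes = 1, 3, 400 * (1024**2)

e2e_latency_sec = sum(main_task_sec[warmup_loops:])    # 1.20
task_latency_sec = sum(elapsed_sec[warmup_loops:])     # 1.08
reported = (e2e_latency_sec, task_latency_sec, num_bytes * loops)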

View File

@ -79,9 +79,9 @@ for xtype in cpu gpu gds; do
gpu_opt="--gpu"
gds_opt="--use_gds"
fi
for sub in single block; do
if [[ $sub == "single" ]]; then
sub_opt="--single_submit"
for ov in overlap sequential; do
if [[ $ov == "sequential" ]]; then
ov_opt="--sequential_requests"
else
sub_opt=""
fi

View File

@ -25,11 +25,42 @@ function validate_environment()
validate_environment
IO_SIZE=$1
LOG_DIR=$2/aio_perf_sweep
MAP_DIR=$2/aio
GPU_MEM=$3
USE_GDS=$4
if [[ $# -ne 3 ]]; then
echo "Usage: $0 <write size in [K,M,G]> <write dir ><output log dir>"
exit 1
fi
SIZE=$1
WRITE_DIR=$2
LOG_DIR=$3/aio_perf_sweep
WRITE_OPT="--folder ${WRITE_DIR} --io_size ${SIZE} --loops 3"
IO_ENGINE="torch_fastio"
ENGINE_OPTS=""
if [[ $IO_ENGINE == "aio_handle" ]]; then
IO_PARALLEL="1" # "1 2 4 8"
QUEUE_DEPTH="8 16 32 64 128"
BLOCK_SIZE="128K 256K 512K 1M 2M 4M 8M 16M"
SUBMIT="block"
OVERLAP="overlap"
elif [[ $IO_ENGINE == "torch_fastio" ]]; then
IO_PARALLEL="1" # "1 2 4 8"
QUEUE_DEPTH="8 16 32 64 128"
BLOCK_SIZE="128K 256K 512K 1M 2M 4M 8M 16M"
SUBMIT="block"
OVERLAP="overlap"
ENGINE_OPTS="--torch_legacy --fast_io_size ${SIZE}"
else
IO_PARALLEL="1"
QUEUE_DEPTH="8"
BLOCK_SIZE="128K"
SUBMIT="single"
OVERLAP="sequential"
fi
prep_folder ${WRITE_DIR}
prep_folder ${LOG_DIR}
RUN_SCRIPT=./test_ds_aio.py
OUTPUT_FILE=${MAP_DIR}/ds_aio_write_${SIZE}B.pt
@ -54,24 +85,24 @@ fi
DISABLE_CACHE="sync; bash -c 'echo 1 > /proc/sys/vm/drop_caches' "
SYNC="sync"
for sub in single block; do
for sub in ${SUBMIT}; do
if [[ $sub == "single" ]]; then
sub_opt="--single_submit"
else
sub_opt=""
fi
for ov in overlap sequential; do
for ov in ${OVERLAP}; do
if [[ $ov == "sequential" ]]; then
ov_opt="--sequential_requests"
else
ov_opt=""
fi
for p in 1 2 4 8; do
for t in 1 2 4 8; do
for d in 32 64 128; do
for bs in 256K 512K 1M; do
SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder ${MAP_DIR}"
OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --multi_process ${p} --io_parallel ${t}"
for p in 1; do
for t in ${IO_PARALLEL}; do
for d in ${QUEUE_DEPTH}; do
for bs in ${BLOCK_SIZE}; do
SCHED_OPTS="${sub_opt} ${ov_opt} --engine ${IO_ENGINE} --io_parallel ${t} ${ENGINE_OPTS}"
OPTS="--multi_process ${p} --queue_depth ${d} --block_size ${bs}"
LOG="${LOG_DIR}/write_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
cmd="python ${RUN_SCRIPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
echo ${DISABLE_CACHE}

View File

@ -2,12 +2,17 @@
"block_size": [
"128K",
"256K",
"1M"
"1M",
"2M",
"4M",
"8M",
"16M"
],
"queue_depth": [
4,
8,
16,
32
32,
64
],
"io_parallel": [
1,
@ -19,7 +24,7 @@
true,
false
],
"overlap_events": [
"sequential_requests": [
true,
false
],

View File

@ -7,17 +7,16 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import multiprocessing as mp
from ds_aio_basic import aio_basic_multiprocessing
from ds_aio_handle import aio_handle_multiprocessing
from ds_aio_args import get_validated_args
from io_engine import io_engine_multiprocessing
def main():
print(f'Testing deepspeed_aio python frontend')
args = get_validated_args()
mp.set_start_method('spawn')
multiprocess_function = aio_handle_multiprocessing if args.handle else aio_basic_multiprocessing
mp.set_start_method('spawn', force=True)
multiprocess_function = io_engine_multiprocessing
multiprocess_function(args, args.read)

View File

@ -8,6 +8,8 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
import os
from ds_aio_job import Job, run_job
import torch
from deepspeed.accelerator import get_accelerator
BYTES_PER_GB = 1024**3
BYTES_PER_MB = 1024**2
@ -79,3 +81,11 @@ def create_file(filename, num_bytes):
print(f'[Start] Create {filename} of {num_bytes} bytes by running {dd_job.cmd()} ....')
run_job(dd_job)
print(f'[Done] Create read file of {num_bytes} bytes by running {dd_job.cmd()} ....')
def create_page_locked_tensor(num_elem, use_accelerator, aio_handle=None):
if use_accelerator:
return get_accelerator().pin_memory(torch.randint(high=128, size=(num_elem, ), dtype=torch.uint8,
device='cpu'))
else:
return aio_handle.new_cpu_locked_tensor(num_elem, torch.empty(0, dtype=torch.uint8))
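Example use of the new create_page_locked_tensor helper, assuming the benchmark directory is importable: pin the buffer through the accelerator, or let an aio handle supply the page-locked tensor. Handle construction arguments are illustrative.

from deepspeed.ops.aio import AsyncIOBuilder
from test_ds_aio_utils import create_page_locked_tensor

handle = AsyncIOBuilder().load().aio_handle(1024 * 1024, 32, False, True, 1)

accel_pinned = create_page_locked_tensor(1024 * 1024, use_accelerator=True)
aio_pinned = create_page_locked_tensor(1024 * 1024, use_accelerator=False, aio_handle=handle)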

View File

@ -0,0 +1,87 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import torch
import os
import time
from deepspeed.ops.aio import AsyncIOBuilder
from test_ds_aio_utils import task_log, create_filename, create_file, create_page_locked_tensor
from ds_aio_constants import *
from deepspeed.io import FastFileWriter
class Torch_FastIO_Engine(object):
def __init__(self, args, tid, read_op):
assert read_op is False, f'Read operation is not currently supported'
self.ctxt = self._create_context(args, tid, read_op)
self.zipfile_serialization = not args.torch_legacy_save
def fini(self):
if self.ctxt[USE_CPU_LOCKED_TENSOR]:
for buf in [BUFFER, FAST_IO_BUFFER]:
self.ctxt[HANDLE].free_cpu_locked_tensor(self.ctxt[buf])
self.ctxt[BUFFER].detach()
self.ctxt[BUFFER] = None
def read(self, args, tid):
start_time = time.time()
torch.load(f=self.ctxt[FILE], map_location=self.ctxt[BUFFER].device)
end_time = time.time()
self.ctxt[ELAPSED_SEC] += end_time - start_time
def write(self, args, tid):
# Avoid overwriting existing files as it could be artificially faster
if os.path.isfile(self.ctxt[FILE]):
os.remove(self.ctxt[FILE])
ds_file_writer = FastFileWriter(file_path=self.ctxt[FILE],
aio_handle=self.ctxt[HANDLE],
pinned_tensor=self.ctxt[FAST_IO_BUFFER])
start_time = time.time()
torch.save(obj=self.ctxt[BUFFER], f=ds_file_writer, _use_new_zipfile_serialization=self.zipfile_serialization)
ds_file_writer.close() # Force flush to storage
end_time = time.time()
self.ctxt[ELAPSED_SEC] += end_time - start_time
ds_file_writer._dump_state()
def _create_context(self, args, tid, read_op):
io_string = "Read" if read_op else "Write"
device_id, folder = args.mapping_list[tid]
filename = create_filename(folder, args.read, args.io_size, tid)
if args.read and not (os.path.isfile(filename) and os.path.getsize(filename) == args.io_size):
create_file(filename, args.io_size)
io_parallel = args.io_parallel if args.io_parallel else 1
aio_handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit,
not args.sequential_requests, io_parallel)
if args.gpu:
buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, device=f'cuda:{device_id}')
else:
buffer = create_page_locked_tensor(args.io_size, args.use_accelerator_pin_memory, aio_handle)
task_log(tid, f'Allocate tensor of size {args.io_size} bytes')
fast_io_buffer = create_page_locked_tensor(args.fast_io_size, args.use_accelerator_pin_memory, aio_handle)
task_log(tid, f'created torch_fastio engine')
ctxt = {}
ctxt[FILE] = filename
ctxt[NUM_BYTES] = args.io_size
ctxt[BUFFER] = buffer
ctxt[HANDLE] = aio_handle
ctxt[FAST_IO_BUFFER] = fast_io_buffer
ctxt[ELAPSED_SEC] = 0
ctxt[USE_CPU_LOCKED_TENSOR] = not args.use_accelerator_pin_memory
task_log(tid,
f'{io_string} file {filename} of size {args.io_size} bytes from buffer on device {buffer.device}',
force=True)
return ctxt
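A condensed sketch of the torch_fastio write path shown above: route torch.save through deepspeed.io.FastFileWriter, backed by an aio handle and a pinned staging buffer. The path and sizes are illustrative, and the _dump_state() call simply mirrors what the engine above does after closing the writer.

import torch
from deepspeed.ops.aio import AsyncIOBuilder
from deepspeed.io import FastFileWriter

handle = AsyncIOBuilder().load().aio_handle(1024 * 1024, 32, False, True, 1)
pinned = handle.new_cpu_locked_tensor(64 * 1024 * 1024, torch.empty(0, dtype=torch.uint8))
payload = torch.randint(high=128, size=(16 * 1024 * 1024, ), dtype=torch.uint8)

writer = FastFileWriter(file_path='/tmp/ds_fastio_example.pt', aio_handle=handle, pinned_tensor=pinned)
torch.save(obj=payload, f=writer)
writer.close()          # force buffered data out to storage
writer._dump_state()    # print writer statistics, as the engine above does

handle.free_cpu_locked_tensor(pinned)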

View File

@ -0,0 +1,64 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import torch
import os
import time
from test_ds_aio_utils import task_log, create_filename, create_file, create_page_locked_tensor
from ds_aio_constants import *
class TorchIO_Engine(object):
def __init__(self, args, tid, read_op):
self.ctxt = self._create_context(args, tid, read_op)
self.zipfile_serialization = not args.torch_legacy_save
def fini(self):
self.ctxt[BUFFER].detach()
self.ctxt[BUFFER] = None
def read(self, args, tid):
start_time = time.time()
torch.load(f=self.ctxt[FILE], map_location=self.ctxt[BUFFER].device)
end_time = time.time()
self.ctxt[ELAPSED_SEC] += end_time - start_time
def write(self, args, tid):
# Avoid overwriting existing files as it could be artificially faster
if os.path.isfile(self.ctxt[FILE]):
os.remove(self.ctxt[FILE])
start_time = time.time()
torch.save(obj=self.ctxt[BUFFER], f=self.ctxt[FILE], _use_new_zipfile_serialization=self.zipfile_serialization)
end_time = time.time()
self.ctxt[ELAPSED_SEC] += end_time - start_time
def _create_context(self, args, tid, read_op):
io_string = "Read" if read_op else "Write"
device_id, folder = args.mapping_list[tid]
filename = create_filename(folder, args.read, args.io_size, tid)
if args.read and not (os.path.isfile(filename) and os.path.getsize(filename) == args.io_size):
create_file(filename, args.io_size)
task_log(tid, f'Allocate tensor of size {args.io_size} bytes')
if args.gpu:
buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, device=f'cuda:{device_id}')
else:
buffer = create_page_locked_tensor(args.io_size, True)
task_log(tid,
f'{io_string} file {filename} of size {args.io_size} bytes from buffer on device {buffer.device}',
force=True)
task_log(tid, f'created torch_io engine')
ctxt = {}
ctxt[FILE] = filename
ctxt[NUM_BYTES] = args.io_size
ctxt[BUFFER] = buffer
ctxt[ELAPSED_SEC] = 0
return ctxt

View File

@ -0,0 +1,15 @@
#!/bin/bash
MOUNT_CMD="sudo mount -v -o data=ordered"
for dir in nvme23 nvme45 nvme67 nvme89; do
mnt_point=/mnt/${dir}
sudo mkdir -p ${mnt_point}
sudo chmod -R a+rw ${mnt_point}
done
${MOUNT_CMD} /dev/md127 /mnt/nvme23
${MOUNT_CMD} /dev/md126 /mnt/nvme45
${MOUNT_CMD} /dev/md125 /mnt/nvme67
${MOUNT_CMD} /dev/md124 /mnt/nvme89
lsblk -f

View File

@ -0,0 +1,10 @@
#!/bin/bash
UMOUNT_CMD="sudo umount -v"
for md in md127 md126 md125 md124; do
mnt_device=/dev/${md}
${UMOUNT_CMD} ${mnt_device}
done
lsblk -f

View File

@ -0,0 +1,188 @@
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
#include "deepcompile.h"
#define USE_C10D_NCCL
namespace dc {
std::shared_ptr<DSParamRegistry> param_registry;
std::unordered_map<long, std::shared_ptr<CustomOpExecutor>> executors;
std::shared_ptr<DoubleBufferedReduceBucket> reduce_buckets = nullptr;
c10::intrusive_ptr<c10d::ProcessGroup> process_group = nullptr;
c10::intrusive_ptr<c10d::symmetric_memory::SymmetricMemory> symm_mem = nullptr;
ncclComm_t nccl_comm;
bool use_symm_mem;
bool clone_custom_op_output;
bool profile = false;
bool pre_div_reduce = true;
bool sync_before_reduce; // for debugging
bool sync_after_reduce; // for debugging
bool sync_before_allgather; // for debugging
bool sync_after_allgather; // for debugging
std::vector<int64_t> sizes_to_int_vector(at::IntArrayRef sizes)
{
std::vector<int64_t> result;
for (int i = 0; i < sizes.size(); i++) { result.push_back(sizes[i]); }
return result;
}
void enable_profiling(bool enable) { profile = enable; }
bool is_profiling() { return profile; }
c10::intrusive_ptr<c10d::symmetric_memory::SymmetricMemory> getSymmMemWorkspace(int64_t size)
{
c10::Device device = c10::Device(c10::kCUDA, c10::cuda::current_device());
std::vector<int64_t> sizes = {size};
std::vector<int64_t> strides = {1};
at::Tensor sym_mem_ws = c10d::symmetric_memory::empty_strided_p2p(
{size}, {1}, c10::ScalarType::Byte, device, process_group->getGroupName(), std::nullopt);
return c10d::symmetric_memory::rendezvous(sym_mem_ws);
}
void lazy_init_symm_memory()
{
if (use_symm_mem && !symm_mem) {
int64_t max_param_size = 0;
for (const auto& it : param_registry->getParams()) {
int64_t size = it.second.getDSTensor().numel() * it.second.getDSTensor().element_size();
if (size > max_param_size) { max_param_size = size; }
}
symm_mem = getSymmMemWorkspace(max_param_size);
}
}
ncclDataType_t get_nccl_data_type(at::ScalarType scalar_type)
{
switch (scalar_type) {
case at::kFloat: return ncclFloat;
case at::kHalf: return ncclHalf;
case at::kDouble: return ncclDouble;
case at::kBFloat16: return ncclBfloat16;
case at::kLong: return ncclInt64;
case at::kInt: return ncclInt;
case at::kChar: return ncclInt8;
default: throw std::runtime_error("Unsupported scalar type");
}
}
void reset()
{
executors.clear();
// We keep the buckets for memory estimation
// reduce_buckets->clear();
}
void cleanup()
{
reset();
ncclCommDestroy(nccl_comm);
process_group = nullptr;
symm_mem = nullptr;
}
at::Tensor reduce_grad(at::Tensor grad_tensor, long graph_id, long ds_id)
{
if (sync_before_reduce) { c10::cuda::device_synchronize(); }
assert(hasKey(executors, graph_id));
if (!profile) { executors[graph_id]->reduceGrad(grad_tensor, ds_id); }
if (sync_after_reduce) { c10::cuda::device_synchronize(); }
return at::Tensor();
}
at::Tensor reduce_grad_meta(at::Tensor grad_tensor, long graph_id, long ds_id)
{
return at::Tensor();
}
void free_tensors(std::vector<at::Tensor> tensors)
{
int64_t THRESHOLD = 10 * 1024 * 1024;
if (!profile) {
for (auto& tensor : tensors) {
if (tensor.is_cuda() && tensor.numel() > THRESHOLD) {
tensor.record_stream(at::cuda::getCurrentCUDAStream());
tensor.set_data(torch::empty({0}, tensor.options()));
}
}
}
}
void free_tensors_meta(std::vector<at::Tensor> tensors) {}
void init(c10::intrusive_ptr<c10d::ProcessGroup> pg,
int64_t initial_reduce_bucket_size,
bool enable_double_buffer,
bool _use_symm_mem,
bool _clone_custom_op_output,
bool _sync_before_reduce,
bool _sync_after_reduce,
bool _sync_before_allgather,
bool _sync_after_allgather)
{
process_group = pg;
ncclUniqueId ncclID;
ncclGetUniqueId(&ncclID);
// ProcessGroup doesn't have an API to get the CUDA stream for comm calls.
// So we create a NCCL communicator and call NCCL APIs directly.
auto vec = std::vector<uint8_t>(reinterpret_cast<uint8_t*>(&ncclID),
reinterpret_cast<uint8_t*>(&ncclID) + NCCL_UNIQUE_ID_BYTES);
auto device = torch::Device(torch::kCUDA);
at::Tensor tensor = torch::from_blob(vec.data(), {static_cast<long>(vec.size())}, torch::kUInt8)
.to(torch::Device(torch::kCUDA));
std::vector<at::Tensor> bcast_input = {tensor};
process_group->broadcast(bcast_input, c10d::BroadcastOptions())->wait();
// create a new nccl communicator
std::memcpy(&ncclID, tensor.to(torch::Device(torch::kCPU)).data_ptr(), NCCL_UNIQUE_ID_BYTES);
ncclCommInitRank(&nccl_comm, process_group->getSize(), ncclID, process_group->getRank());
param_registry = std::make_shared<DSParamRegistry>();
reduce_buckets = std::make_shared<DoubleBufferedReduceBucket>(initial_reduce_bucket_size,
enable_double_buffer);
use_symm_mem = _use_symm_mem;
clone_custom_op_output = _clone_custom_op_output;
sync_before_reduce = _sync_before_reduce;
sync_after_reduce = _sync_after_reduce;
sync_before_allgather = _sync_before_allgather;
sync_after_allgather = _sync_after_allgather;
}
void start_forward()
{
lazy_init_symm_memory();
for (auto& it : executors) { it.second->startForward(); }
}
void end_forward()
{
for (auto& it : executors) { it.second->endForward(); }
}
void start_backward(bool update)
{
for (auto& it : executors) { it.second->startBackward(update); }
}
// We don't call this
// void end_backward(bool update)
// {
// }
} // namespace dc

Some files were not shown because too many files have changed in this diff.