Merge branch 'master' into loadams/reenable-py311-312

Olatunji Ruwase
2025-08-11 19:53:51 -04:00
committed by GitHub
333 changed files with 18655 additions and 3116 deletions

View File

@ -1,103 +0,0 @@
name: cpu-inference
on:
workflow_dispatch:
pull_request:
paths:
- '.github/workflows/cpu-inference.yml'
- 'requirements/**'
- 'deepspeed/__init__.py'
- 'deepspeed/inference/**'
- '!deepspeed/inference/v2/**' # exclude v2 dir
- 'tests/unit/inference/**'
- '!tests/unit/inference/v2/**' # exclude v2 tests dir
merge_group:
branches: [ master ]
schedule:
- cron: "0 0 * * 0"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
unit-tests:
runs-on: [self-hosted, cpu]
env: {ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true} # Allow using Node16 actions
steps:
- uses: actions/checkout@v4
- id: setup-venv
uses: ./.github/workflows/setup-venv
- name: Install gcc-9
run: |
sudo add-apt-repository -u ppa:ubuntu-toolchain-r/test
sudo apt install -y gcc-9 g++-9
# set gcc-9 and g++9 to default
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 99
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 99
- name: Check gcc version
run: |
# Get gcc version
gcc --version
g++ --version
- name: Detect instruction sets on instance
run: |
lscpu
- name: Install numactl
run: |
sudo apt-get install -y numactl
- name: Install dependencies
run: |
pip install torch
# check installed version
pip list |grep \\\<torch\\\>
- name: Install oneCCL
run: |
pip install cmake
git clone https://github.com/oneapi-src/oneCCL
cd oneCCL
mkdir build
cd build
cmake ..
make -j install
- name: Install transformers
run: |
git clone https://github.com/huggingface/transformers
cd transformers
git rev-parse --short HEAD
pip install .
- name: Install deepspeed
run: |
# check why the host does not have AVX2 support
pip install .[dev,1bit,autotuning,inf]
ds_report
- name: Python environment check
run: |
pip list
source oneCCL/build/_install/env/setvars.sh
export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6
# check whether the environment is properly setup
python -c "import deepspeed;from deepspeed.accelerator import get_accelerator;print(get_accelerator().device_name());print(get_accelerator().is_available())"
- name: Unit tests
run: |
# prep oneCCL for CCLBackend comm ops building
source oneCCL/build/_install/env/setvars.sh
export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
# LOCAL_SIZE=2 enforce CPU to report 2 devices, this helps run the test on github default runner
LOCAL_SIZE=2 COLUMNS=240 HF_HOME=~/tmp/hf_home/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' unit/
LOCAL_SIZE=2 COLUMNS=240 HF_HOME=~/tmp/hf_home/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference_ops' -m 'inference' unit/

View File

@ -33,7 +33,7 @@ jobs:
- name: Install pytorch
run: |
pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu
pip install torch==2.7.1 torchvision==0.22.1 --index-url https://download.pytorch.org/whl/cpu
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@ -42,7 +42,7 @@ jobs:
git clone https://github.com/huggingface/transformers
cd transformers
# if needed switch to the last known good SHA until transformers@master is fixed
git checkout 981c276
# git checkout 981c276
git rev-parse --short HEAD
pip install .
@ -59,5 +59,5 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.6"
HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.6"
HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.7.1+cpu"
HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.7.1+cpu"

View File

@ -21,7 +21,7 @@ jobs:
# The type of runner that the job will run on
runs-on: [self-hosted, intel, gaudi2]
container:
image: vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
image: vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
ports:
- 80
options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice
@ -45,6 +45,8 @@ jobs:
test_zero_leaf_module.py
test_zero_offloadpp.py
test_zero_tiled.py
test_autotp_training.py
test_ulysses.py
# Steps represent a sequence of tasks that will be executed as part of the job
steps:

View File

@ -39,7 +39,7 @@ jobs:
# The type of runner that the job will run on
runs-on: [self-hosted, intel, gaudi2]
container:
image: vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
image: vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
ports:
- 80
options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice
@ -94,6 +94,8 @@ jobs:
test_zero_nesting_init.py
test_zeropp.py
(test_zero.py and (TestZero3ParamPartitioningLargeParam or TestZero3ParamPartitioningLargeParam))
(test_linear.py and (TestLoRALinear or TestBasicLinear))
(test_ctx.py and TestEngine)
# Steps represent a sequence of tasks that will be executed as part of the job
steps:
@ -112,7 +114,7 @@ jobs:
git clone https://github.com/huggingface/transformers
cd transformers
# if needed switch to the last known good SHA until transformers@master is fixed
git checkout 981c276
# git checkout 981c276
git rev-parse --short HEAD
pip install .

99
.github/workflows/modal-accelerate.yml vendored Normal file
View File

@ -0,0 +1,99 @@
name: modal-accelerate
# This CI is running on modal.com's GPUs.
#
# It's set up here on github actions and then the cloned repo is sent to modal and everything
# happens on their hw - see deepspeed/modal_ci/accelerate.py for where the actual vm is loaded, updated and the tests are
# run.
#
# Both files are annotated to what's important and how one might change or update things if needed.
#
# Note that since this is a Required job we can't use `on.push.path` file filter - we are using
# collect-tests job to do the filtering for us so that the job can be skipped and satisfy the
# Required status for PRs to pass.
#
on:
workflow_dispatch:
push:
branches:
- master
pull_request:
paths-ignore:
- 'docs/**'
- 'blogs/**'
- 'deepspeed/inference/v2/**'
- 'tests/unit/inference/v2/**'
types: [draft, opened, ready_for_review, synchronize]
branches:
- master
concurrency:
group: ${{ github.workflow }}-${{ github.ref || github.run_id }}
cancel-in-progress: true
jobs:
collect-tests:
name: Collect tests to run
runs-on: ubuntu-latest
permissions:
contents: read
pull-requests: read
outputs:
deepspeed: ${{ steps.filter.outputs.deepspeed }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
lfs: true
- name: Filter changed files
uses: dorny/paths-filter@v2
id: filter
with:
token: ${{ secrets.GITHUB_TOKEN }}
filters: |
deepspeed:
- 'deepspeed/**'
- '.github/workflows/modal*.yml'
- 'ci/**'
- 'tests/unit/**'
- 'csrc/**'
deploy:
name: DeepSpeedAI CI
runs-on: ubuntu-latest
needs: collect-tests
env:
# these are created at https://modal.com/settings/deepspeedai/tokens
# they are then added to the repo's secrets at https://github.com/deepspeedai/deepspeed/settings/secrets/actions
MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
# this one comes from https://huggingface.co/settings/profile of the bot user
# and it too is then updated at https://github.com/deepspeedai/deepspeed/settings/secrets/actions
HF_TOKEN: ${{ secrets.HF_TOKEN }}
if: needs.collect-tests.outputs.deepspeed == 'true'
steps:
- name: Checkout Repository
uses: actions/checkout@v4
with:
lfs: true
- name: Install Python
uses: actions/setup-python@v5
with:
python-version: "3.10"
cache: 'pip' # caching pip dependencies
- name: Install build dependencies
run: |
pip install uv # much faster than pip
uv pip install --system modal
- name: Run tests
run: |
modal run -m ci.accelerate
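For readers unfamiliar with Modal, here is a minimal sketch of what an entry module like `ci.accelerate` could look like. The app name, image contents, GPU type, secret name, and test command below are illustrative assumptions, not the repository's actual file; only the Modal primitives themselves (`modal.App`, `modal.Image.debian_slim`, `@app.function`, `@app.local_entrypoint`) are real API.

```python
# Hypothetical sketch of a Modal CI entry module, invoked via `modal run -m ci.accelerate`.
import modal

app = modal.App("deepspeed-accelerate-ci")  # assumed app name

# Test image; a real setup would also copy or mount the checked-out repository.
image = (
    modal.Image.debian_slim(python_version="3.10")
    .pip_install("torch", "deepspeed", "accelerate[testing]", "pytest")
)

@app.function(
    image=image,
    gpu="A10G",                                    # assumed GPU type
    timeout=3600,
    secrets=[modal.Secret.from_name("hf-token")],  # assumed secret carrying HF_TOKEN
)
def run_tests():
    import subprocess
    # Assumed test target; the real module selects the DeepSpeed-specific accelerate tests.
    subprocess.run(["python", "-m", "pytest", "tests/deepspeed"], check=True)

@app.local_entrypoint()
def main():
    run_tests.remote()
```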

View File

@ -0,0 +1,99 @@
name: modal-torch-latest
# This CI is running on modal.com's GPUs.
#
# It's set up here on github actions and then the cloned repo is sent to modal and everything
# happens on their hw - see deepspeed/modal_ci/torch_latest.py for where the actual vm is loaded, updated and the tests are
# run.
#
# Both files are annotated to what's important and how one might change or update things if needed.
#
# Note that since this is a Required job we can't use `on.push.path` file filter - we are using
# collect-tests job to do the filtering for us so that the job can be skipped and satisfy the
# Required status for PRs to pass.
#
on:
workflow_dispatch:
push:
branches:
- master
pull_request:
paths-ignore:
- 'docs/**'
- 'blogs/**'
- 'deepspeed/inference/v2/**'
- 'tests/unit/inference/v2/**'
types: [draft, opened, ready_for_review, synchronize]
branches:
- master
concurrency:
group: ${{ github.workflow }}-${{ github.ref || github.run_id }}
cancel-in-progress: true
jobs:
collect-tests:
name: Collect tests to run
runs-on: ubuntu-latest
permissions:
contents: read
pull-requests: read
outputs:
deepspeed: ${{ steps.filter.outputs.deepspeed }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
lfs: true
- name: Filter changed files
uses: dorny/paths-filter@v2
id: filter
with:
token: ${{ secrets.GITHUB_TOKEN }}
filters: |
deepspeed:
- 'deepspeed/**'
- '.github/workflows/modal*.yml'
- 'ci/**'
- 'tests/unit/**'
- 'csrc/**'
deploy:
name: DeepSpeedAI CI
runs-on: ubuntu-latest
needs: collect-tests
env:
# these are created at https://modal.com/settings/deepspeedai/tokens
# they are then added to the repo's secrets at https://github.com/deepspeedai/deepspeed/settings/secrets/actions
MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
# this one comes from https://huggingface.co/settings/profile of the bot user
# and it too is then updated at https://github.com/deepspeedai/deepspeed/settings/secrets/actions
HF_TOKEN: ${{ secrets.HF_TOKEN }}
if: needs.collect-tests.outputs.deepspeed == 'true'
steps:
- name: Checkout Repository
uses: actions/checkout@v4
with:
lfs: true
- name: Install Python
uses: actions/setup-python@v5
with:
python-version: "3.10"
cache: 'pip' # caching pip dependencies
- name: Install build dependencies
run: |
pip install uv # much faster than pip
uv pip install --system modal
- name: Run tests
run: |
modal run -m ci.torch_latest

View File

@ -23,7 +23,7 @@ jobs:
unit-tests:
runs-on: [self-hosted, nvidia, a6000]
container:
image: nvcr.io/nvidia/pytorch:24.09-py3
image: nvcr.io/nvidia/pytorch:25.01-py3
ports:
- 80
options: --gpus all --shm-size "8G"
@ -43,7 +43,7 @@ jobs:
git clone https://github.com/huggingface/transformers
cd transformers
# if you need to use an older transformers version temporarily in case of breakage
git checkout 981c276
# git checkout 981c276
git rev-parse --short HEAD
python -m pip install .
- name: Install deepspeed
@ -58,8 +58,8 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.5" --cuda_ver="12"
python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.5" --cuda_ver="12"
python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.6" --cuda_ver="12"
python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.6" --cuda_ver="12"
- name: MII unit tests
run: |
BRANCH="main"

View File

@ -48,6 +48,10 @@ jobs:
git clone https://github.com/huggingface/accelerate
cd accelerate
git rev-parse --short HEAD
# temp workaround until this is resolved https://github.com/huggingface/accelerate/issues/3676
pip install datasets==3.6.0
# installing dependencies
pip install .[testing]
# force protobuf version due to issues

View File

@ -43,8 +43,8 @@ jobs:
- name: Install deepspeed
run: |
pip install transformers==4.48.3
pip install .[dev]
pip install transformers==4.48.3
ds_report
- name: Install deepspeed-chat

View File

@ -18,7 +18,7 @@ jobs:
unit-tests:
runs-on: [self-hosted, nvidia, a6000]
container:
image: nvcr.io/nvidia/pytorch:24.09-py3
image: nvcr.io/nvidia/pytorch:24.12-py3
ports:
- 80
options: --gpus all --shm-size "8G"
@ -33,16 +33,19 @@ jobs:
nvidia-smi
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
- name: Install transformers
run: |
git clone --depth=1 https://github.com/huggingface/transformers
cd transformers
git rev-parse --short HEAD
python -m pip install .
- name: Install deepspeed
run: |
python -m pip install .[dev]
ds_report
# install transformers after deepspeed so that the right version of transformers is installed
- name: Install transformers
run: |
python -m pip install transformers==4.50.0
- name: Install FlashAttention
run: |
python -m pip install flash-attn
@ -53,7 +56,7 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
python -m pytest --color=yes --durations=0 --verbose -rF unit/sequence_parallelism/test_ulysses.py --torch_ver="2.5" --cuda_ver="12"
python -m pytest --color=yes --durations=0 --verbose -rF unit/sequence_parallelism/test_ulysses.py --torch_ver="2.6" --cuda_ver="12"
- name: Open GitHub issue if nightly CI fails
if: ${{ failure() && (github.event_name == 'schedule') }}
uses: JasonEtco/create-an-issue@v2

View File

@ -1,65 +0,0 @@
name: nv-h100
on:
workflow_dispatch:
schedule:
- cron: "0 0 * * *"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
permissions:
contents: read
issues: write
jobs:
unit-tests:
runs-on: [self-hosted, nvidia, h100]
container:
image: nvcr.io/nvidia/pytorch:23.03-py3
ports:
- 80
options: --gpus all --shm-size "8G"
steps:
- uses: actions/checkout@v4
- name: Check container state
run: |
nvidia-smi
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
- name: Install transformers
run: |
git clone https://github.com/huggingface/transformers
cd transformers
git rev-parse --short HEAD
python -m pip install .
- name: Install deepspeed
run: |
python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja
python -m pip install .[dev,1bit,autotuning]
ds_report
- name: Python environment
run: |
python -m pip list
- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
python -m pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.0" --cuda_ver="12"
python -m pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.0" --cuda_ver="12"
- name: Open GitHub issue if nightly CI fails
if: ${{ failure() && (github.event_name == 'schedule') }}
uses: JasonEtco/create-an-issue@v2
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
update_existing: true

View File

@ -1,53 +0,0 @@
name: nv-human-eval
on:
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
unit-tests:
runs-on: [self-hosted, nvidia, a6000]
container:
image: nvcr.io/nvidia/pytorch:24.09-py3
ports:
- 80
options: --gpus all --shm-size "8G"
steps:
- uses: actions/checkout@v4
- name: Check container state
run: |
ldd --version
nvcc --version
nvidia-smi
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
- name: Install transformers
run: |
git clone --depth=1 https://github.com/huggingface/transformers
cd transformers
git rev-parse --short HEAD
python -m pip install .
- name: Clone Human Eval
run: |
git clone --depth=1 https://github.com/openai/human-eval.git
sed -i '/exec(check_program, exec_globals)/ s/^# //' human-eval/human_eval/execution.py
cd human-eval
git rev-parse --short HEAD
python -m pip install .
- name: Install deepspeed
run: |
python -m pip install .[dev,1bit,autotuning]
ds_report
- name: Python environment
run: |
python -m pip list
- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.5" --cuda_ver="12"

View File

@ -41,6 +41,12 @@ jobs:
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
- name: Install deepspeed
run: |
pip install .[dev]
ds_report
# install transformers after deepspeed so that the right version of transformers is installed
- name: Install transformers
run: |
git clone https://github.com/huggingface/transformers
@ -50,11 +56,6 @@ jobs:
git rev-parse --short HEAD
pip install .
- name: Install deepspeed
run: |
pip install .[dev]
ds_report
- name: Python environment
run: |
pip list

View File

@ -36,7 +36,7 @@ jobs:
#python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
- name: Compile DeepSpeed Ops
run: |
DS_ACCELERATOR=cuda DS_ENABLE_NINJA=1 TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_FP_QUANTIZER=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_GDS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 pip3 install .
DS_ACCELERATOR=cuda DS_ENABLE_NINJA=1 TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_FP_QUANTIZER=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_GDS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 DS_BUILD_DEEP_COMPILE=0 pip3 install .
- name: DS Report
run: |
ds_report

View File

@ -44,7 +44,8 @@ jobs:
- name: Install deepspeed
run: |
pip install .[dev,1bit,autotuning]
pip install .[dev,1bit,autotuning,deepcompile]
pip install pytest-timeout pytest-instafail
ds_report
- name: Python environment
@ -55,5 +56,5 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
pytest $PYTEST_OPTS --forked -n 8 unit/ --torch_ver="2.6" --cuda_ver="12.4"
pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.6" --cuda_ver="12.4"
pytest -x $PYTEST_OPTS --instafail --timeout 600 --forked -n 8 unit/ --torch_ver="2.6" --cuda_ver="12.4"
pytest $PYTEST_OPTS --instafail --timeout 600 --forked -m 'sequential' unit/ --torch_ver="2.6" --cuda_ver="12.4"

View File

@ -37,7 +37,7 @@ jobs:
git clone https://github.com/huggingface/transformers
cd transformers
# if needed switch to the last known good SHA until transformers@master is fixed
git checkout 981c276
# git checkout 981c276
git rev-parse --short HEAD
pip install .

View File

@ -6,7 +6,9 @@ runs:
- id: update-env
run: |
sudo apt-get update
sudo apt-get install -y libaio-dev
# Temporary disable nvme UTs
# sudo apt-get install -y libaio-dev
sudo apt remove -y libaio-dev
python -m pip install --user --upgrade pip
python -m pip install --user --upgrade virtualenv
shell: bash
@ -28,7 +30,7 @@ runs:
echo HF_DATASETS_CACHE=/blob/datasets_cache/ >> $GITHUB_ENV
echo MEGATRON_CKPT_DIR=/blob/megatron_ckpt/ >> $GITHUB_ENV
echo CRITIC_CKPT_DIR=/blob/step2_opt_125m_ckpt/ >> $GITHUB_ENV
echo PYTEST_OPTS="--color=yes --durations=0 --verbose -rF" >> $GITHUB_ENV
echo PYTEST_OPTS="--maxfail=100 --color=yes --durations=0 --verbose -rF" >> $GITHUB_ENV
shell: bash
- id: print-env
run: |

View File

@ -36,7 +36,7 @@ jobs:
unit-tests:
runs-on: [self-hosted, intel, xpu]
container:
image: intel/oneapi-basekit:2025.0.1-0-devel-ubuntu24.04
image: intel/oneapi-basekit:2025.0.2-0-devel-ubuntu22.04
ports:
- 80
options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL
@ -47,20 +47,16 @@ jobs:
shell: bash
run: |
apt-get update
apt-get install clinfo libaio-dev python3-pip python3.12-venv -y
python3 -m venv ~/ds_env
source ~/ds_env/bin/activate
pip install torch==2.5.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torch/
pip install intel-extension-for-pytorch==2.5.10+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/intel-extension-for-pytorch/
pip install oneccl_bind_pt==2.5.0+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/oneccl-bind-pt/
pip install torchvision==0.20.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torchvision/
pip install py-cpuinfo numpy
apt-get install -y python3.11 python3.11-dev python3-pip clinfo libaio-dev
pip install --upgrade pip
pip install py-cpuinfo
pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/xpu
pip install intel-extension-for-pytorch==2.7.10+xpu oneccl_bind_pt==2.7.0+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us
pip install .[dev,autotuning]
- name: Check container state
shell: bash
run: |
source ~/ds_env/bin/activate
ldd --version
ds_report
python3 -c "import torch; print('torch:', torch.__version__, torch)"
@ -71,8 +67,9 @@ jobs:
- name: Unit tests
shell: bash
run: |
source ~/ds_env/bin/activate
cd tests/unit
export FI_PROVIDER="tcp"
export I_MPI_SHM=off
pytest --verbose accelerator/*
pytest --verbose autotuning/*
pytest --verbose checkpoint/test_reshape_checkpoint.py

View File

@ -9,3 +9,4 @@
| Minjia Zhang | [minjiazhang](https://github.com/minjiazhang) | UIUC |
| Ashwin Aji | [ashwinma](https://github.com/ashwinma) | AMD |
| Sam Foreman | [saforem2](https://github.com/saforem2) | Argonne National Laboratory |
| Zhipeng Wang | [PKUWZP](https://github.com/PKUWZP) | LinkedIn |

View File

@ -19,6 +19,12 @@ If a formatting test fails, it will fix the modified code in place and abort
the `git commit`. After looking over the changes, you can `git add <modified files>`
and then repeat the previous `git commit` command.
You can also run:
```
make format
```
which does the same as above and also automatically builds a `venv` Python environment if you
don't already have one, isolating this project's requirements from those of other projects.
## Testing
DeepSpeed tracks two types of tests: unit tests and more costly model convergence tests.
@ -38,6 +44,11 @@ You can also provide the `-v` flag to `pytest` to see additional information abo
tests. Note that [pytest-forked](https://github.com/pytest-dev/pytest-forked) and the
`--forked` flag are required to test CUDA functionality in distributed tests.
You can also run:
```
make test
```
### Model Tests
To execute model tests, first [install DeepSpeed](#installation). The
[DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples/) repository is cloned
@ -48,16 +59,15 @@ pytest run_sanity_check.py
```
Note that the `--forked` flag is not necessary for the model tests.
## Contributor License Agreement
This project welcomes contributions and suggestions. Most contributions require you to
agree to a Contributor License Agreement (CLA) declaring that you have the right to, and
actually do, grant us the rights to use your contribution. For details, visit
https://cla.opensource.microsoft.com.
## Developer Certificate of Origin
This project welcomes contributions and suggestions. All contributions to deepspeedai projects
require commits to be signed off with a [Developer Certificate of Origin](https://en.wikipedia.org/wiki/Developer_Certificate_of_Origin)
(DCO) declaring that you have the right to, and actually do, grant us the rights to use your contribution.
When you submit a pull request, a CLA bot will automatically determine whether you need
to provide a CLA and decorate the PR appropriately (e.g., status check, comment). Simply
follow the instructions provided by the bot. You will only need to do this once across
all repos using our CLA.
When you submit a pull request, the DCO app will check for the presence of signed commits.
Information about how this check works is here: https://github.com/dcoapp/app?tab=readme-ov-file#how-it-works
To sign commits, you will need to include `-s` when running `git commit`. For example, `git commit -s -m "Commit message"`. Note that creating PRs via the GitHub web interface does not appear to include this option. If you forget this, clicking on the failing check in your PR will point you to commands you can run to rebase and sign previous commits.
## Code of Conduct
This project has adopted the [Microsoft Open Source Code of

23
Makefile Normal file
View File

@ -0,0 +1,23 @@
# usage: make help
.PHONY: help test format
.DEFAULT_GOAL := help
help: ## this help
@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m<target>\033[0m\n"} /^[0-9a-zA-Z_-]+:.*?##/ { printf " \033[36m%-22s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
echo $(MAKEFILE_LIST)
test: ## run tests
pytest --forked tests/unit/
format: ## fix formatting
@if [ ! -d "venv" ]; then \
python -m venv venv; \
. venv/bin/activate; \
pip install pre-commit -U; \
pre-commit clean; \
pre-commit uninstall; \
pre-commit install; \
deactivate; \
fi
. venv/bin/activate && pre-commit run --files $$(git diff --name-only master) && deactivate

View File

@ -6,6 +6,7 @@
[![Twitter](https://img.shields.io/twitter/follow/DeepSpeedAI)](https://twitter.com/intent/follow?screen_name=DeepSpeedAI)
[![Japanese Twitter](https://img.shields.io/badge/%E6%97%A5%E6%9C%AC%E8%AA%9ETwitter-%40DeepSpeedAI_JP-blue)](https://twitter.com/DeepSpeedAI_JP)
[![Chinese Zhihu](https://img.shields.io/badge/%E7%9F%A5%E4%B9%8E-%E5%BE%AE%E8%BD%AFDeepSpeed-blue)](https://www.zhihu.com/people/deepspeed)
[![Slack](https://img.shields.io/badge/Slack-4A154B?style=for-the-badge&logo=slack&logoColor=white)](https://join.slack.com/t/deepspeedworkspace/shared_invite/zt-3a8pjd8dd-PCj2hMvR4Y2syPwVnjEoww)
<div align="center">
@ -15,32 +16,23 @@
## Latest News
<b> <span style="color:orange" > DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-chat)</span>.</b>
* [2025/06] [Arctic Long Sequence Training (ALST) with DeepSpeed: Scalable And Efficient Training For Multi-Million Token Sequences](https://www.snowflake.com/en/engineering-blog/arctic-long-sequence-training-multi-million-token-ai/)
* [2025/04] [DeepCompile: Unlocking Compiler Optimization for Distributed Training](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepcompile/README.md)
* [2025/03] [DeepSpeed-AutoTP: Automatic Tensor Parallel Training of Hugging Face models](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/huggingface-tp/README.md)
* [2024/12] [Ulysses-Offload: Democratizing Long Context LLM Training ](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/ulysses-offload/README.md)
* [2024/12] [DeepSpeed-Domino: Communication-Free LLM Training Engine](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-domino/README.md)
* [2024/08] [DeepSpeed on Windows](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/windows/08-2024/README.md) [[日本語](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/windows/08-2024/japanese/README.md)] [[中文](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/windows/08-2024/chinese/README.md)]
* [2024/08] [DeepNVMe: Improving DL Applications through I/O Optimizations](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-gds/README.md) [[日本語](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-gds/japanese/README.md)]
* [2024/07] [DeepSpeed Universal Checkpointing: Efficient and Flexible Checkpointing for Large Scale Distributed Training](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-ucp/README.md) [[中文](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-ucp/chinese/README.md)] [[日本語](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-ucp/japanese/README.md)]
* [2024/03] [DeepSpeed-FP6:The power of FP6-Centric Serving for Large Language Models](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024) [[English](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README.md)] [[中文](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README-Chinese.md)]
* [2024/01] [DeepSpeed-FastGen: Introducing Mixtral, Phi-2, and Falcon support with major performance and feature enhancements.](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19)
* [2023/11] [Llama 2 Inference on 4th Gen Intel® Xeon® Scalable Processor with DeepSpeed](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/intel-inference) [[Intel version]](https://www.intel.com/content/www/us/en/developer/articles/technical/xllama-2-on-xeon-scalable-processor-with-deepspeed.html)
* [2023/11] [DeepSpeed ZeRO-Offload++: 6x Higher Training Throughput via Collaborative CPU/GPU Twin-Flow](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-offloadpp)
* [2023/11] [DeepSpeed-FastGen: High-throughput Text Generation for LLMs via MII and DeepSpeed-Inference](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen) [[English](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen)] [[中文](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen/chinese/README.md)] [[日本語](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen/japanese/README.md)]
* [2023/10] [DeepSpeed-VisualChat: Improve Your Chat Experience with Multi-Round Multi-Image Inputs](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md) [[English](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md)] [[中文](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md)] [[日本語](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md)]
* [2023/09] Announcing the DeepSpeed4Science Initiative: Enabling large-scale scientific discovery through sophisticated AI system technologies [[Tutorials](https://www.deepspeed.ai/deepspeed4science/)] [[White paper](https://arxiv.org/abs/2310.04610)] [[Blog](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/)] [[中文](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed4science/chinese/README.md)] [[日本語](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed4science/japanese/README.md)]
<!-- NOTE: we must use html for news items otherwise links will be broken in the 'more news' section -->
<details>
<summary>More news</summary>
<ul>
<li>[2023/08] <a href="https://github.com/deepspeedai/DeepSpeedExamples/blob/master/inference/huggingface/zero_inference/README.md">DeepSpeed ZeRO-Inference: 20x faster inference through weight quantization and KV cache offloading</a></li>
<li> [2024/08] <a href="https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-gds/README.md"> DeepNVMe: Improving DL Applications through I/O Optimizations</a> [<a href="https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-gds/japanese/README.md"> 日本語 </a>] [<a href="https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-gds/japanese/README.md"> 中文 </a>]</li>
<li>[2023/08] <a href="https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-chat/ds-chat-release-8-31/README.md">DeepSpeed-Chat: Llama/Llama-2 system support, efficiency boost, and training stability improvements</a></li>
<li> [2024/07] <a href="https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/README.md"> DeepSpeed Universal Checkpointing: Efficient and Flexible Checkpointing for Large Scale Distributed Training</a> [<a href="https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/japanese/README.md"> 日本語 </a>] </li>
<li>[2023/08] <a href="https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-ulysses">DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models</a> [<a href="https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-ulysses/chinese/README.md">中文</a>] [<a href="https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-ulysses/japanese/README.md">日本語</a>]</li>
<li> [2024/03] <a href="https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README.md"> DeepSpeed-FP6: The Power of FP6-Centric Serving for Large Language Models</a> [<a href="https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README-Chinese.md"> 中文 </a>] </li>
<li>[2023/06] <a href="https://www.microsoft.com/en-us/research/blog/deepspeed-zero-a-leap-in-speed-for-llm-and-chat-model-training-with-4x-less-communication/">ZeRO++: A leap in speed for LLM and chat model training with 4X less communication</a> [<a href="https://www.microsoft.com/en-us/research/blog/deepspeed-zero-a-leap-in-speed-for-llm-and-chat-model-training-with-4x-less-communication/">English</a>] [<a href="https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/zeropp/chinese/README.md">中文</a>] [<a href="https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/zeropp/japanese/README.md">日本語</a>]</li>
</ul>
</details>
@ -135,9 +127,9 @@ DeepSpeed has been integrated with several different popular open-source DL fram
| Description | Status |
| ----------- | ------ |
| NVIDIA | [![nv-torch110-p40](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-torch110-p40.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-torch110-p40.yml) [![nv-torch110-v100](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-torch110-v100.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-torch110-v100.yml) [![nv-torch-latest-v100](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml) [![nv-h100](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-h100.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-h100.yml) [![nv-inference](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-inference.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-inference.yml) [![nv-nightly](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-nightly.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-nightly.yml) |
| NVIDIA | [![nv-torch-latest-v100](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml) [![nv-inference](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-inference.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-inference.yml) [![nv-nightly](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-nightly.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-nightly.yml) |
| AMD | [![amd-mi200](https://github.com/deepspeedai/DeepSpeed/actions/workflows/amd-mi200.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/amd-mi200.yml) |
| CPU | [![torch-latest-cpu](https://github.com/deepspeedai/DeepSpeed/actions/workflows/cpu-torch-latest.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/cpu-torch-latest.yml) [![cpu-inference](https://github.com/deepspeedai/DeepSpeed/actions/workflows/cpu-inference.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/cpu-inference.yml) |
| CPU | [![torch-latest-cpu](https://github.com/deepspeedai/DeepSpeed/actions/workflows/cpu-torch-latest.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/cpu-torch-latest.yml) |
| Intel Gaudi | [![hpu-gaudi2](https://github.com/deepspeedai/DeepSpeed/actions/workflows/hpu-gaudi2.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/hpu-gaudi2.yml) |
| Intel XPU | [![xpu-max1100](https://github.com/deepspeedai/DeepSpeed/actions/workflows/xpu-max1100.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/xpu-max1100.yml) |
| PyTorch Nightly | [![nv-torch-nightly-v100](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml) |
@ -280,8 +272,7 @@ Conduct](https://opensource.microsoft.com/codeofconduct/). For more information
31. Haojun Xia, Zhen Zheng, Xiaoxia Wu, Shiyang Chen, Zhewei Yao, Stephen Youn, Arash Bakhtiari, Michael Wyatt, Donglin Zhuang, Zhongzhu Zhou, Olatunji Ruwase, Yuxiong He, Shuaiwen Leon Song. (2024) FP6-LLM: Efficiently Serving Large Language Models Through FP6-Centric Algorithm-System Co-Design [arXiv:2401.14112](https://arxiv.org/abs/2401.14112)
32. Sam Ade Jacobs, Masahiro Tanaka, Chengming Zhang, Minjia Zhang, Reza Yazdani Aminadabi, Shuaiwen Leon Song, Samyam Rajbhandari, Yuxiong He. (2024) [System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models](https://dl.acm.org/doi/10.1145/3662158.3662806)
33. Xinyu Lian, Sam Ade Jacobs, Lev Kurilenko, Masahiro Tanaka, Stas Bekman, Olatunji Ruwase, Minjia Zhang. (2024) Universal Checkpointing: Efficient and Flexible Checkpointing for Large Scale Distributed Training [arXiv:2406.18820](https://arxiv.org/abs/2406.18820)
34. Stas Bekman, Samyam Rajbhandari, Michael Wyatt, Jeff Rasley, Tunji Ruwase, Zhewei Yao, Aurick Qiao, Yuxiong He. (2025) Arctic Long Sequence Training: Scalable And Efficient Training For Multi-Million Token Sequences [arXiv:2506.13996](https://arxiv.org/abs/2506.13996)
# Videos

View File

@ -229,10 +229,17 @@ class CPU_Accelerator(DeepSpeedAccelerator):
return True
def is_fp16_supported(self):
return False
try:
if torch.ops.mkldnn._is_mkldnn_fp16_supported():
return True
except:
return False
def supported_dtypes(self):
return [torch.float, torch.bfloat16]
supported_dtypes = [torch.float, torch.bfloat16]
if self.is_fp16_supported():
supported_dtypes.append(torch.float16)
return supported_dtypes
# Graph operations
def create_graph(self):
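For context, the effect of this change can be checked through the accelerator interface already exercised earlier in this diff (`get_accelerator()` and `device_name()` appear in the cpu-inference environment check); only the dtype list and the fp16 probe are new here. A minimal check, assuming a DeepSpeed install with this patch:

```python
from deepspeed.accelerator import get_accelerator

acc = get_accelerator()
# On the CPU accelerator, torch.float16 is now reported only when
# torch.ops.mkldnn._is_mkldnn_fp16_supported() succeeds.
print(acc.device_name(), acc.is_fp16_supported(), acc.supported_dtypes())
```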

View File

@ -60,8 +60,11 @@ class CUDA_Accelerator(DeepSpeedAccelerator):
return 'cuda'
return 'cuda:{}'.format(device_index)
def communication_backend_version(self):
return torch.cuda.nccl.version()
def device(self, device_index=None):
return torch.cuda.device(device_index)
return torch.device('cuda', device_index)
def set_device(self, device_index):
torch.cuda.set_device(device_index)

View File

@ -23,6 +23,8 @@ class HPU_Accelerator(DeepSpeedAccelerator):
import habana_frameworks.torch.hpu as hpu
self.hpu = hpu
torch.use_deterministic_algorithms(True)
# TODO: remove this WA when memory mapping break is resolved.
torch.utils.deterministic.fill_uninitialized_memory = False
except ImportError as e:
raise ValueError(
f"HPU_Accelerator requires habana_frameworks.torch.hpu, which is not installed on this system.")

View File

@ -67,7 +67,7 @@ def get_accelerator():
f"XPU_Accelerator requires intel_extension_for_pytorch, which is not installed on this system.")
elif accelerator_name == "xpu.external":
try:
import intel_extension_for_deepspeed # noqa: F401 # type: ignore
from intel_extension_for_deepspeed import XPU_Accelerator # noqa: F401 # type: ignore
except ImportError as e:
raise ValueError(
f"XPU_Accelerator external requires intel_extension_for_deepspeed, which is not installed on this system."
@ -136,6 +136,21 @@ def get_accelerator():
accelerator_name = "xpu"
except ImportError as e:
pass
if accelerator_name is None:
try:
import torch
# torch.xpu will be supported in upstream pytorch-2.8.
# Currently we can run on xpu device only using pytorch,
# also reserve the old path using ipex when the torch version is old.
if hasattr(torch, 'xpu'):
if torch.cuda.device_count() == 0: #ignore-cuda
if torch.xpu.device_count() > 0 and torch.xpu.is_available():
accelerator_name = "xpu"
else:
pass
except ImportError as e:
pass
if accelerator_name is None:
try:
import torch_npu # noqa: F401,F811 # type: ignore
@ -209,6 +224,12 @@ def get_accelerator():
ds_accelerator = CPU_Accelerator()
elif accelerator_name == "xpu.external":
# XPU_Accelerator is already imported in detection stage
try:
from intel_extension_for_deepspeed import XPU_Accelerator # noqa: F811
except ImportError as e:
raise ValueError(
f"XPU_Accelerator external requires intel_extension_for_deepspeed, which is not installed on this system."
)
ds_accelerator = XPU_Accelerator()
elif accelerator_name == "xpu":
from .xpu_accelerator import XPU_Accelerator
@ -243,7 +264,7 @@ def get_accelerator():
def set_accelerator(accel_obj):
global ds_accelerator
_validate_accelerator(accel_obj)
if accel_logger is not None:
if accel_logger is not None and accel_obj is not None:
accel_logger.info(f"Setting ds_accelerator to {accel_obj._name} (model specified)")
ds_accelerator = accel_obj

View File

@ -5,19 +5,32 @@
import torch
from deepspeed.accelerator.abstract_accelerator import DeepSpeedAccelerator
import intel_extension_for_pytorch as ipex # noqa: F401 # type: ignore
import oneccl_bindings_for_pytorch # noqa: F401 # type: ignore
import functools
import importlib
import inspect
try:
import oneccl_bindings_for_pytorch # noqa: F401 # type: ignore
oneccl_imported_p = True
except ImportError as e:
oneccl_imported_p = False
try:
import intel_extension_for_pytorch as ipex # noqa: F401 # type: ignore
ipex_imported_p = True
except ImportError as e:
ipex_imported_p = False
class XPU_Accelerator(DeepSpeedAccelerator):
def __init__(self):
self._name = 'xpu'
self._communication_backend_name = 'ccl'
if oneccl_imported_p:
self._communication_backend_name = 'ccl'
else:
# changed to xccl if not using torch-CCL on XPU device
self._communication_backend_name = 'xccl'
self._compile_backend = "inductor"
self.aligned_tensors = []
self.class_dict = None
@ -26,11 +39,14 @@ class XPU_Accelerator(DeepSpeedAccelerator):
return False
def use_host_timers(self):
# WA XPU event will be consolidated in 2.6
if ipex.__version__ < '2.6':
return True
else:
if not ipex_imported_p:
return self.is_synchronized_device()
else:
# WA XPU event will be consolidated in 2.6
if ipex.__version__ < '2.6':
return True
else:
return self.is_synchronized_device()
def resolves_data_dependency(self):
return self.is_synchronized_device()
@ -290,10 +306,13 @@ class XPU_Accelerator(DeepSpeedAccelerator):
return self.class_dict['NotImplementedBuilder']
def build_extension(self):
try:
from intel_extension_for_pytorch.xpu.cpp_extension import DpcppBuildExtension
except ImportError:
from intel_extension_for_pytorch.xpu.utils import DpcppBuildExtension
if ipex_imported_p:
try:
from intel_extension_for_pytorch.xpu.cpp_extension import DpcppBuildExtension
except ImportError:
from intel_extension_for_pytorch.xpu.utils import DpcppBuildExtension
else:
from torch.utils.cpp_extension import DpcppBuildExtension
return DpcppBuildExtension
def export_envs(self):

174
blogs/deepcompile/README.md Normal file
View File

@ -0,0 +1,174 @@
<div align="center">
# DeepCompile: Unlocking Compiler Optimization for Distributed Training
</div>
# Introduction
<div align="center">
<img src="media/perf_summary.png" width="1000">
</div>
Distributed training has become essential for scaling today's massive deep learning models. While deep learning compilers like the PyTorch compiler have dramatically improved single-GPU training performance through optimizations like kernel fusion and operator scheduling, they fall short when it comes to distributed workloads.
Existing distributed training frameworks such as DeepSpeed and FSDP have made large-scale model training feasible through advanced parallelization strategies. While powerful, their optimizations are implemented at the PyTorch framework level, which limits the ability to apply compiler-style techniques like dependency analysis or operator scheduling.
DeepCompile addresses this gap by enabling compiler-level optimizations for distributed training. It takes a standard single-GPU model implementation and transforms it into an optimized multi-GPU training graph without requiring changes to the model code. Unlike existing approaches, DeepCompile automatically applies parameter sharding, communication scheduling, and memory-aware execution at the compiler IR level, enabling global analysis and optimization that are difficult to express in traditional frameworks. Furthermore, during training, DeepCompile employs profile-guided optimization techniques to dynamically tune these parallelization strategies and improve training performance.
Our evaluation demonstrates that DeepCompile improves training performance over ZeRO-3 baselines, achieving up to 1.5x speedup when sufficient GPU resources are available, and up to 7x speedup in GPU-constrained settings that require offloading. DeepCompile is available in DeepSpeed versions >= [0.16.6](https://github.com/deepspeedai/DeepSpeed/releases/tag/v0.16.6). As it is under active development, we recommend using the latest version of DeepSpeed or installing from source to access the most recent updates and bug fixes.
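As a rough sketch of what adoption looks like in user code: the `compile.deepcompile` config key and the `engine.compile()` call below reflect the DeepSpeedExamples benchmark setup linked in the Appendix, and the exact names should be treated as assumptions to verify against that example code.

```python
import torch
import deepspeed

# Standard single-GPU model; no distributed logic is written into the model code.
model = torch.nn.Linear(1024, 1024)

ds_config = {
    "train_micro_batch_size_per_gpu": 1,
    "bf16": {"enabled": True},
    "optimizer": {"type": "Adam", "params": {"lr": 1e-5}},
    "zero_optimization": {"stage": 3},
    "compile": {"deepcompile": True},   # assumed flag that enables DeepCompile passes
}

engine, _, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config=ds_config,
)
engine.compile()  # assumed entry point that captures the graph and applies the passes
```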
# Design Overview
DeepCompile extends the capabilities of deep learning compilers to support distributed training. It starts from a standard single-GPU model implementation, such as those available on the Hugging Face model hub, and automatically transforms it by inserting necessary distributed training operations such as parameter sharding and communication primitives. Users are not required to embed any distributed logic into the model code.
The process begins by compiling the model into an intermediate representation (IR), which forms a computation graph. DeepCompile then applies a sequence of *optimization passes*, each responsible for a specific transformation of the computation graph or a targeted performance improvement, to incrementally introduce distributed behavior and optimize the graph. These include operations such as all-gather for sharded parameters or offloading of optimizer states, all while preserving the original computation semantics (Fig. 1).
<div align="center">
<img src="media/workflow.png" width="400">
*Figure 1: Workflow of compilation and optimization with DeepCompile.*
</div>
At its core, DeepCompile builds on two key capabilities:
- **Automatic parallelization**: DeepCompile allows optimization passes to rewrite the single-GPU computation graph into a distributed multi-GPU version, incorporating strategies such as ZeRO, FSDP, and more. This eliminates the need for manual implementation of distributed training logic, drastically reducing engineering effort.
- **Profile-guided performance tuning**: At runtime, DeepCompile collects profiling data such as operator-level memory usage and execution latency. It uses this information to dynamically schedule computation and communication operators. This enables effects such as an improved overlap between communication and computation, and an avoidance of memory bottlenecks. Fine-grained tuning through these optimization passes often leads to better performance than even manually engineered implementations.
Figure 2 illustrates the optimization cycle employed by DeepCompile. After the initial computation graph is generated by the compiler, DeepCompile profiles its behavior by measuring operator execution time, communication overhead, and memory usage throughout the forward and backward passes.
<div align="center">
<img src="media/opt_loop.png" width="600">
*Figure 2. Optimization cycle.*
</div>
Based on the collected profiling data, DeepCompile applies a sequence of optimization passes. These passes modify the computation graph by inserting, removing, or reordering operators to improve overall efficiency. The modified graph is then re-profiled, and this cycle of profiling and optimization is repeated.
Once a stable set of optimizations has been applied, the graph is deployed for the remaining training iterations. During execution, memory usage and other runtime characteristics may change. In such cases, DeepCompile can resume the profiling and optimization cycle according to the predefined schedule of passes, allowing the graph to adapt and maintain high performance.
# Optimizations
DeepCompile is designed as a general compiler framework for applying and optimizing a wide range of parallelization strategies. In the following, we describe several optimizations that have been implemented as optimization passes within DeepCompile.
## ZeRO3
As an initial step, we have used DeepCompile to implement and enhance ZeRO-3-style optimizations at the compiler level. ZeRO-3 partitions model parameters, gradients, and optimizer states across devices, reducing memory usage and enabling large-scale training.
In conventional ZeRO-3 implementations, operations such as all-gather, reduce-scatter, and buffer release are typically inserted using Python hooks at runtime. DeepCompile replaces this approach by injecting these operations directly into the computation graph during compilation. This allows the compiler to determine their placement precisely, guided by both the static structure of the graph and runtime profiling information.
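To make the idea of graph-level insertion concrete, here is a toy, self-contained sketch (not DeepCompile's actual pass API) of how a pass can rewrite a traced graph so that every parameter access goes through a gather operation; it uses plain `torch.fx` and a stand-in gather function.

```python
import torch
import torch.fx as fx

def gather_param(p):
    # Stand-in for a real all-gather of a ZeRO-3 sharded parameter.
    return p

class Tiny(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.w = torch.nn.Parameter(torch.randn(4, 4))

    def forward(self, x):
        return x @ self.w

gm = fx.symbolic_trace(Tiny())
for node in list(gm.graph.nodes):
    if node.op == "get_attr":  # parameter access in the traced graph
        with gm.graph.inserting_after(node):
            gathered = gm.graph.call_function(gather_param, (node,))
        # Route all consumers of the raw parameter through the gather node,
        # except the gather node itself, which must keep the raw parameter as input.
        node.replace_all_uses_with(gathered, delete_user_cb=lambda u: u is not gathered)
gm.recompile()
print(gm.code)                          # the rewritten forward now calls gather_param(self.w)
print(gm(torch.randn(2, 4)).shape)      # semantics are preserved
```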
One of the key optimizations is **proactive prefetching**, which launches all-gather operations earlier in the computation based on memory usage profiling. This reordering increases the overlap between communication and computation, thereby improving throughput while avoiding OOMs. In addition, small communication operations are often fused to reduce launch latency and improve efficiency.
Another optimization is **selective unsharding**, which keeps certain parameters in an unsharded form during the forward and backward passes when memory conditions permit. This reduces the frequency of all-gather operations and avoids redundant communication, particularly in scenarios where gradient accumulation is enabled.
## Offloading
DeepCompile also supports **adaptive offloading**, which offloads optimizer states to reduce GPU memory pressure. Unlike approaches that offload all the optimizer states, adaptive offloading identifies only the portions that exceed the memory limit—such as momentum and variance used by the Adam optimizer—and schedules data transfers to overlap with computation. This selective and asynchronous strategy minimizes overhead and enables efficient training even in memory-constrained environments.
## ZeRO1
ZeRO-1 differs from ZeRO-3 in that it shards only the optimizer states across devices, while keeping parameters and gradients fully replicated. This approach reduces memory usage with minimal changes to computation flow, making it a lightweight alternative for certain training scenarios.
DeepCompile implements ZeRO-1-style optimization by inserting reduce-scatter operations directly into the computation graph. By avoiding Python-level hooks, this graph-level integration reduces overhead and improves execution efficiency.
# Performance Improvements
## ZeRO-3
We evaluated DeepCompile on Llama-3-70B and Mixtral 8x7B using parameter sharding on top of Hugging Face model implementations.
Figure 3 shows training throughput (TFLOPs/GPU) across different gradient accumulation steps, using 32 H100 GPUs with a sequence length of 1024.
We compare DeepCompile against two DeepSpeed ZeRO-3 baselines: (i) an eager-mode version without compiler support (labelled ZeRO3+Eager), and (ii) a compiled version using PyTorch compiler (labelled ZeRO3+Compile). For DeepCompile, we enabled both proactive prefetching and selective unsharding to demonstrate the combined effect of these optimization passes.
<div align="center"> <img src="media/perf_zero3.png" width="800">
*Figure 3. Achieved throughputs for ZeRO3 training of Llama-3 70B and Mixtral 8x7B models.*
</div>
Across both models, DeepCompile consistently delivers higher throughput. The benefit becomes more pronounced at higher accumulation steps, where the reduced frequency of parameter updates makes selective unsharding more effective. DeepCompile with proactive prefetching and selective unsharding achieves up to 1.28× speedup over ZeRO-3 on Llama-3-70B and 1.54× on Mixtral 8x7B.
Meanwhile, enabling the PyTorch compiler with ZeRO-3, i.e., ZeRO3+Compile introduces minor overheads in some settings. This is because ZeRO-3 includes many conditional branches for runtime features such as prefetching. When the compiler encounters branches that cannot be statically resolved, it splits the computation into multiple graph segments. These fragmented segments can reduce optimization opportunities and introduce additional overheads during execution.
## Offloading
Training models as large as Llama-3 70B with ZeRO-3 typically requires 32 GPUs with 80GB of memory.
DeepSpeed addresses this challenge by offering offloading capabilities, which transfer optimizer states and optionally model parameters to CPU memory to reduce GPU memory usage. DeepCompile also supports offloading through a dedicated optimization pass, but with a few key differences in design.
Unlike the traditional approach of offloading both optimizer computation and memory, DeepCompile offloads only optimizer memory (e.g., momentum, variance, and master weights of Adam optimizer) while the optimizer computation remains on GPU. DeepCompile profiles memory usage during both forward and backward passes to identify when offloading is necessary, and transfers only the required data. This fine-grained approach avoids unnecessary overhead and helps maintain high computational throughput.
Furthermore, DeepCompile overlaps data transfers with computation whenever possible, dynamically adjusting the timing based on observed memory usage patterns. This asynchronous behavior is a crucial aspect of DeepCompile's offloading strategy, allowing it to reduce GPU memory pressure without stalling execution.
We evaluated DeepCompile's offloading using Llama-3 70B on 16xH100-80GB (half the required GPU count) and present the results in Figure 4.
<div align="center">
<img src="media/perf_offload.png" width="400">
*Figure 4. Achieved throughput of optimizer offloading for Llama-3 70B on 16x80GB GPUs*
</div>
We compare against two ZeRO-3 offloading baselines: (i) an eager-mode version without compiler support (ZeRO3+Eager), and (ii) a compiled version using PyTorch compiler (ZeRO3+Compile). As shown by the results, DeepCompile significantly improves offloading efficiency and provides up to 7× speedup over ZeRO3+Eager. In contrast, we see that ZeRO3+Compile achieves similar performance as ZeRO3+Eager.
## ZeRO-1
We also evaluated DeepCompile with ZeRO-1 using the Llama-3-8B model. We compare DeepCompile against two ZeRO-1 baselines: (i) an eager-mode version without compiler support (ZeRO1+Eager), and (ii) a compiled version using PyTorch compiler (ZeRO1+Compile). In our experiment with 8 GPUs and a batch size of 2, DeepCompile achieved consistent throughput improvements across different sequence lengths, as shown in Figure 5.
<div align="center">
<img src="media/perf_zero1.png" width="800">
*Figure 5. Achieved throughput of ZeRO-1 training of Llama-3 8B*
</div>
The most significant speedup was observed with batch size 1 and sequence length 512, where DeepCompile outperformed ZeRO1+Eager by up to 1.9×, and ZeRO1+Compile by up to 2.5×.
While compiler-based approaches can be effective for large batch sizes and long sequences by replacing suboptimal operations with more efficient kernels, they may also introduce overheads in ZeRO-1-style training in the form of *graph breaks* around the communication operations. These overheads become more pronounced with smaller batch sizes and sequence lengths, thus hurting performance compared to the non-compiled execution. In contrast, DeepCompile inserts communication operators directly into the computation graph during compilation, avoiding graph fragmentation and minimizing associated overhead. This makes DeepCompile more robust to small-scale workloads, while still benefiting from compiler-level optimizations.
## Additional Results and Analysis
Please refer to our [arXiv paper](https://arxiv.org/abs/2504.09983) for additional results, such as detailed comparisons across different batch sizes, sequence lengths, and memory usage.
# Looking Ahead
DeepCompile brings the power of compiler-based optimizations to distributed deep learning. By transforming computation graphs and applying profile-guided optimization passes, it enables more efficient training without requiring changes to model code.
This release is just the beginning. We're actively working on expanding the set of optimization passes and improving integration with a broader range of distributed training strategies. Future directions include automated parallelization (sequence/tensor parallelism), smarter memory management, and dynamic adaptation to runtime behavior.
We invite the community to try DeepCompile, explore its capabilities, and contribute to its evolution. Let's build the next generation of scalable deep learning together.
# Acknowledgments
We would like to thank everyone who supported this project.
This project would not have been possible without the PyTorch Compiler—a platform that is not only powerful and flexible, but also a pleasure to work with. We are especially grateful to the developers and researchers behind PyTorch Compiler for making such an excellent foundation available to the community.
# Contributors
This project is the result of a close collaboration between Microsoft and the University of Virginia. The contributors are Masahiro Tanaka, Du Li, Umesh Chand, and Olatunji Ruwase (Microsoft); and Ali Zafar and Haiying Shen (University of Virginia).
# Appendix
## Examples and Benchmarks
Our DeepSpeedExamples repository provides [example code](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/benchmarks/deepcompile) to enable DeepCompile.
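As a quick orientation, the sketch below shows how enabling DeepCompile through the DeepSpeed config might look. The `"compile"`/`"deepcompile"` keys and the `engine.compile()` call are assumptions here, so please treat the linked example code as the authoritative reference.
```python
import torch
import deepspeed

model = torch.nn.Linear(1024, 1024)

ds_config = {
    "train_micro_batch_size_per_gpu": 1,
    "zero_optimization": {"stage": 3},
    "compile": {"deepcompile": True},  # assumed key names; see the linked examples
}

# Standard DeepSpeed initialization; DeepCompile is applied on top of the engine.
engine, _, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config=ds_config,
)
engine.compile()  # assumed API that captures the graph and runs the optimization passes
```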
## Optimization Passes
The following optimization passes are currently available in DeepCompile:
- All-gather & reduce-scatter insertion (ZeRO3)
- Proactive prefetching (ZeRO3)
- Selective unsharding (ZeRO3)
- Reduce-scatter insertion (ZeRO1)
- Adaptive offloading
We used the following combinations of passes in the experiments presented above:
- Improved communication scheduling for ZeRO-3: All-gather & reduce-scatter insertion → Proactive prefetching → Selective unsharding
- Offloading optimizer states for ZeRO-3: All-gather & reduce-scatter insertion → Adaptive offloading
- Reduced overhead and improved overlap for ZeRO-1: Reduce-scatter insertion


View File

@ -0,0 +1,137 @@
<div align="center">
# DeepNVMe: Affordable I/O scaling for Deep Learning Applications.
</div>
# Introduction
We introduced [DeepNVMe](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepnvme/08-2024/README.md) in summer 2024 as a suite of optimizations for tackling I/O bottlenecks in Deep Learning (DL). DeepNVMe delivers significant speedups for I/O bound DL workloads by leveraging storage innovations including local NVMe SSDs, NVIDIA Magnum IO<sup>TM</sup> GPUDirect® Storage (GDS), and Linux Asynchronous I/O (AIO).
In this update, we are delighted to announce DeepNVMe improvements on multiple fronts: (i) expanding application coverage to FastPersist model checkpointing and SGLang inference, (ii) I/O performance scaling by upgrading from PCIe Gen4 to Gen5 NVMe SSDs, and (iii) expanding usability to CPU-only environments, offset-based I/O operations, and tensor data type casting. The results reported in this blog are available in DeepSpeed versions >= [0.17.1](https://github.com/deepspeedai/DeepSpeed/releases/tag/v0.17.1).
# Evaluation environments
Our experiments are conducted on an Azure [ND-H200-v5](https://learn.microsoft.com/en-us/azure/virtual-machines/sizes/gpu-accelerated/nd-h200-v5-series?tabs=sizebasic) VM. The key software configurations are summarized in the following table.
|Software | Version |
|---|---|
|Ubuntu | 24.04.2 |
|PyTorch | 2.6.0 |
|CUDA | 12.6 |
|SGLang | 0.4.4.post4 |
# Addressing I/O Bottlenecks of Deep Learning
We used DeepNVMe to develop FastPersist and ZeRO-Inference to target I/O bottlenecks in DL training and inference respectively. Our experiments are conducted using a single VM, in which we combine the available NVMe SSDs into a single RAID-0 (i.e., disk striping) volume to leverage aggregate read and write bandwidths. Since DeepNVMe can offload tensors using CPU bounce buffers (a.k.a., AIO), or NVIDIA GPUDirect Storage (a.k.a., GDS), we report results for both modes.
## FastPersist: Faster Model Checkpoint Creation
Although saving model checkpoints to persistent storage is critical in model training, it is also a major bottleneck due to the inefficiencies of existing approaches. We developed [FastPersist](https://arxiv.org/abs/2406.13768) to address the performance challenges of checkpointing. FastPersist makes checkpointing overheads negligible during training through three key techniques: (i) DeepNVMe, (ii) data parallelism, and (iii) overlapping I/O and computation.
Our goal here is to demonstrate the impact of DeepNVMe in FastPersist using single-process micro-benchmarks (available [here](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/deepnvme/model_checkpoint)) which serialize a model checkpoint state from HBM to local NVMe. We use the popular PyTorch `torch.save()` as the baseline in our experiments, and integrate FastPersist into `torch.save()` to simplify adoption and performance comparisons.
### Faster Saving of PyTorch Models to local NVMe Storage
We measure the throughput of serializing Phi-3-Mini checkpoint state from HBM to local NVMe storage. The results are summarized in the Figure below. We observe significantly faster checkpointing with FastPersist compared to the baseline. We see speedups of over 20X in the 8xGen5 NVMe settings. We also observe FastPersist scaling with increased NVMe bandwidth of 8xGen5 compared with 4xGen5.
<img src="./media/fastpersist_phi3_mini.png">
<div align="center">
FastPersist provides significantly faster model checkpointing to local NVMe.
</div>
## ZeRO-Inference: Democratizing Generative AI
[ZeRO-Inference](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/inference/huggingface/zero_inference/README.md) is a technology that democratizes access to state-of-the-art models by reducing the GPU costs of model inference. ZeRO-Inference enables inference computations of massive models (hundreds-of-billions of parameters) on as few as one GPU by offloading the model weights to DRAM and NVMe storage. ZeRO-Inference is designed for offline or throughput-oriented inference scenarios. In this blog, we share two updates on ZeRO-Inference. First, we have integrated ZeRO-Inference into SGLang, a state-of-the-art model serving framework. Second, we observed ZeRO-Inference performance scales with the faster NVMe SSDs in the latest Azure SKUs.
### Democratizing SGLang through ZeRO-Inference integration
[SGLang](https://docs.sglang.ai/) is a state-of-the-art serving framework for large language models (LLMs) and vision language models (VLMs). Our integration of ZeRO-Inference into SGLang makes SGLang available to budget-constrained users, and offers a cost-reduction option to existing SGLang users. We used SGLang's [offline benchmarking tool](https://github.com/sgl-project/sglang/blob/main/python/sglang/bench_offline_throughput.py) to measure the generation throughput of LLAMA3-70B on a single H200 with NVMe offloading (LLAMA3-70B cannot fit in the 141GB VRAM without offloading). The experiment is configured with prompt length of 512, generation length of 32, and batch size of 128. We summarize the results in the figure below for both AIO and GDS offloading.
<img src="./media/sg_zinf_llama_70b.png">
<div align="center">
ZeRO-Inference improves SGLang inference with NVMe offloading to reduce hardware costs.
</div>
### Scaling HF Transformer Generation with Faster NVMe SSDs
ZeRO-Inference enhances HF Transformer inference with efficient model offloading to DRAM or NVMe. We previously [evaluated](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepnvme/08-2024/README.md#high-performance-offloading-via-nvme-scaling) LLAMA-3-70B generation performance with NVMe offloading on a single GPU and four Gen4 NVMes in an Azure [NC_A100_v4](https://learn.microsoft.com/en-us/azure/virtual-machines/sizes/gpu-accelerated/nca100v4-series?tabs=sizebasic) VM. We measured the generation speed for a prompt of 512 tokens, output of 32 tokens, and batch size 96. Since NVMe bandwidth was the main bottleneck, we repeat the experiments on Azure ND-H200-v5, which offers Gen5 NVMes. The results summarized in the Figure below show that ZeRO-Inference uses the increased NVMe bandwidths to improve generation speeds. For example, with GDS, generation speed improves from 7 tokens/sec with four Gen4 NVMes to 17 tokens/sec with four Gen5 NVMes, and further to 26 tokens/sec with eight Gen5 NVMes. We observe similar improvements without GDS. These results show that ZeRO-Inference performance can be improved in a cost-effective manner by increasing NVMe bandwidths.
<img src="./media/hf_zinf_llama_70b.png">
<div align="center">
ZeRO-Inference leverages available NVMe bandwidth to scale LLAMA-3-70B generation.
</div>
# I/O performance scaling
We used our `ds_io` benchmarking tool to demonstrate DeepNVMe proportionally scaling I/O performance with available NVMe bandwidths. This empowers users to accelerate I/O bound DL applications at modest cost using more or faster NVMe SSDs. In our experiments, we measure the achieved read and write bandwidths of 1GB data transfers between HBM and NVMes. We evaluate scaling up NVMes from PCIe Gen4 to Gen5, and scaling out from 4 to 8 SSDs. The SSDs are combined into a single RAID-0 (disk striping) volume. We summarize the results in the Figure below which show that DeepNVMe scales I/O performance on both dimensions. Scaling up from 4xGen4 SSDs to 4xGen5 SSDs improves reads from 10GB/sec to 27GB/sec, and writes from 5GB/sec to 11GB/sec. Scaling out from 4xGen5 to 8xGen5 further improves reads to 48GB/sec, and writes to 26GB/sec.
<img src="./media/dnvme_scaling.png">
<div align="center">
Microbenchmark shows DeepNVMe scales I/O performance with available NVMe bandwidth
</div>
# Broadening usability
We have broadened the usage scenarios of DeepNVMe by removing restrictions on hardware environments and I/O operations, as explained below.
## CPU-Only environments
Although GPUs (and similar accelerators) dominate DL, CPUs are still used in important machine learning (ML) workloads such as recommendation systems. However, DeepNVMe was previously unusable in CPU-only environments because it relied on `torch.Tensor.pin_memory()` for page-locked CPU tensors, which does not work in CPU-only builds of `torch`, as illustrated below.
```bash
>>> import torch
>>> torch.__version__
'2.6.0+cpu'
>>> x = torch.empty(1024).pin_memory()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
RuntimeError: Cannot access accelerator device when none is available.
>>>
```
We have made DeepNVMe usable in CPU environments by adding mechanisms for allocating (`new_cpu_locked_tensor()`) and releasing (`free_cpu_locked_tensor()`) page-locked CPU tensors. The snippet below illustrates allocating a pinned CPU tensor (`x`).
```bash
>>> import torch
>>> torch.__version__
'2.6.0+cpu'
>>> from deepspeed.ops.op_builder import AsyncIOBuilder
>>> h = AsyncIOBuilder().load().aio_handle()
>>> x = h.new_cpu_locked_tensor(1024, torch.Tensor())
>>> x.shape
torch.Size([1024])
>>> x.dtype
torch.float32
```
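For completeness, the sketch below pairs the allocation with the corresponding release call, `free_cpu_locked_tensor()`, mentioned above; the buffer size and usage are illustrative.
```python
import torch
from deepspeed.ops.op_builder import AsyncIOBuilder

aio = AsyncIOBuilder().load().aio_handle()
pinned = aio.new_cpu_locked_tensor(1024, torch.Tensor())  # 1024 fp32 elements, page-locked
# ... use `pinned` as a bounce buffer for DeepNVMe reads/writes ...
aio.free_cpu_locked_tensor(pinned)  # return the page-locked memory when done
```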
## Offset-based I/O operations
Previously, DeepNVMe functionality was restricted to reading or writing the entire contents of a file. We have now improved DeepNVMe to read or write a user-specified portion of file content from a given offset. In particular, we have extended the existing read/write APIs to accept a user-specified `file offset` argument (with default value 0) such as below:
```bash
>>> from deepspeed.ops.op_builder import AsyncIOBuilder
>>> help(AsyncIOBuilder().load().aio_handle().pread)
Help on method pread in module async_io:
pread(...) method of async_io.aio_handle instance
pread(self: async_io.aio_handle, buffer: torch.Tensor, filename: str, validate: bool, async: bool, file_offset: int = 0) -> int
```
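For example, the sketch below reads 4 KiB starting at byte offset 8192 into a pinned buffer. The file path is hypothetical, and the positional arguments follow the `pread` signature shown above (`validate`, `async`, `file_offset`), since `async` cannot be passed as a Python keyword.
```python
import torch
from deepspeed.ops.op_builder import AsyncIOBuilder

aio = AsyncIOBuilder().load().aio_handle()
buffer = aio.new_cpu_locked_tensor(1024, torch.Tensor())  # 1024 fp32 elements = 4 KiB

# Synchronous read of buffer.nbytes() bytes starting at offset 8192 of a (hypothetical) file.
completed = aio.pread(buffer, "/tmp/data.bin", False, False, 8192)

aio.free_cpu_locked_tensor(buffer)
```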
## Tensor data type casting
While developing FastPersist, we needed to manipulate model tensors, typically of floating point data types, in byte format for both performance and convenience of I/O operations. However, we could not find a zero-copy mechanism for casting tensors from arbitrary data types to a byte data type (i.e., torch.uint8), so we decided to create one. This functionality is available via the `UtilsBuilder` op as demonstrated in the example below. In the example, we cast a `torch.bfloat16` tensor into `torch.uint8`. Note that due to the zero-copy nature of the functionality, `bf16_tensor` and `byte_tensor` are aliases.
```
>>> import torch
>>> from deepspeed.ops.op_builder import UtilsBuilder
>>> util_ops = UtilsBuilder().load()
>>> bf16_tensor = torch.zeros(1024, dtype=torch.bfloat16, device='cuda')
>>> bf16_tensor
tensor([0., 0., 0., ..., 0., 0., 0.], device='cuda:0', dtype=torch.bfloat16)
>>> byte_tensor = util_ops.cast_to_byte_tensor(bf16_tensor)
>>> byte_tensor
tensor([0, 0, 0, ..., 0, 0, 0], device='cuda:0', dtype=torch.uint8)
>>> bf16_tensor += 1.0
>>> bf16_tensor
tensor([1., 1., 1., ..., 1., 1., 1.], device='cuda:0', dtype=torch.bfloat16)
>>> byte_tensor
tensor([128, 63, 128, ..., 63, 128, 63], device='cuda:0',
dtype=torch.uint8)
```
# Summary
This blog post has provided updates on our continued development of DeepNVMe, an I/O optimization technology for accelerating DL applications. We have announced DeepNVMe improvements on multiple aspects, including application coverage, I/O performance scaling, and usability.
# Acknowledgements
This blog describes work done by Joe Mayer, Logan Adams, and Olatunji Ruwase of the DeepSpeed team at Microsoft.


View File

@ -47,7 +47,7 @@ We used three benchmarking tools for our evaluations. The first is fio, the popu
## High-Performance I/O with CPU Buffers via NVMe Scaling
Our first set of microbenchmark evaluations used fio and ds\_io to measure the performance of transferring 1GB data between NVMe and CPU memory. We configure fio to use the libaio backend for these experiments1. The results are summarized in Figure 1, from which we make two observations. First, DeepNVMe demonstrates high performance as it roughly matches fio, despite being more representative of DL applications. Second, DeepNVMe scales I/O performance almost linearly with available NVMe bandwidth, achieving rates of 10GB/sec reads and 5GB/sec writes.
Our first set of microbenchmark evaluations used fio and ds\_io to measure the performance of transferring 1GB data between NVMe and CPU memory. We configure fio to use the libaio backend for these experiments. The results are summarized in Figure 1, from which we make two observations. First, DeepNVMe demonstrates high performance as it roughly matches fio, despite being more representative of DL applications. Second, DeepNVMe scales I/O performance almost linearly with available NVMe bandwidth, achieving rates of 10GB/sec reads and 5GB/sec writes.
<img src="./media/figure1.png" style="width:6.5in;height:3.42153in" />
@ -85,4 +85,4 @@ In this blog post, we introduced DeepNVMe, an I/O optimization technology create
# Acknowledgements
This work is the result of a deep collaboration between Microsoft and NVIDIA. The contributors include Joe Mayer, Martin Cai, and Olatunji Ruwase from Microsoft; Kiran Modukuri, Vahid Noormofidi, Sourab Gupta, and Sandeep Joshi from Nivida.
This work is the result of a deep collaboration between Microsoft and NVIDIA. The contributors include Joe Mayer, Martin Cai, and Olatunji Ruwase from Microsoft; Kiran Modukuri, Vahid Noormofidi, Sourab Gupta, and Sandeep Joshi from Nvidia.


View File

@ -0,0 +1,183 @@
<p align="center">
<img height="250" src="./images/zenflow-logo.png" alt="zenflow logo"/>
</p>
<div align="center">
# ZenFlow: Stall-Free Offloading Engine for LLM Training
<div align="center">
<img src="./images/zenflow-overview.png" alt="" width="1200" />
<div align="left">
*Figure 1: ZenFlow is DeepSpeed's stall-free offloading engine for LLM training. It decouples GPU and CPU updates by prioritizing important gradients for immediate GPU updates and deferring the rest for asynchronous CPU-side accumulation. By fully overlapping CPU work and PCIe transfers with GPU computation, ZenFlow eliminates stalls and achieves high hardware utilization across both single-GPU and multi-GPU settings.*
## Table of Contents
- [ZenFlow: Stall-Free Offloading Engine for LLM Training](#zenflow-stall-free-offloading-engine-for-llm-training)
- [Table of Contents](#table-of-contents)
- [Introduction](#introduction)
- [ZenFlow at a Glance](#zenflow-at-a-glance)
- [ZenFlow Highlights](#zenflow-highlights)
- [Design Motivation](#design-motivation)
- [ZenFlow Design](#zenflow-design)
- [Getting Started: Try out DeepSpeed-ZenFlow](#getting-started-try-out-deepspeed-zenflow)
- [Citation](#citation)
- [Acknowledgements](#acknowledgements)
---
## Introduction
<div align="center">
<img src="./images/zero-offload-stall.png" alt="" width="600" />
<div align="left">
*Figure 2: ZeRO-Offload causes repeated GPU stalls due to blocking CPU updates and PCIe transfers, leading to >60% idle time per step when training Llama 2-7B on 4× A100s.*
Offloading has become a standard approach to scale fine-tuning of large language models (LLMs) beyond GPU memory limits. Frameworks like ZeRO-Offload reduce GPU memory usage by pushing gradients and optimizer states to the CPU. However, they also create a new bottleneck: expensive GPUs often sit idle, waiting on slow CPU updates and PCIe data transfers. In practice, enabling offloading when training Llama 2-7B on 4× A100 GPUs can inflate each step from 0.5s to over 7s—a 14× slowdown.
<div align="center">
<img src="./images/zenflow-example.png" alt="" width="1200" />
<div align="left">
*Figure 3: In ZeRO-Offload, CPU-side optimizer updates and PCIe transfers dominate iteration time, leaving the GPU idle for over 5 seconds.*
**ZenFlow** addresses this bottleneck with a stall-free training pipeline. It prioritizes high-impact gradients for immediate GPU updates, while offloading the rest to the CPU and applying them asynchronously. These deferred CPU updates are fully overlapped with GPU compute, eliminating stalls and significantly improving throughput. Best of all, ZenFlow maintains the same model accuracy and integrates seamlessly with DeepSpeed.
---
## ZenFlow at a Glance
- **Zero GPU stalls:** Top-k important gradients are updated immediately on GPU; low-priority gradients are asynchronously processed on CPU—no GPU wait time.
- **Asynchronous and bounded:** ZenFlow decouples CPU and GPU execution with a bounded-staleness strategy that preserves convergence.
- **Auto-tuned:** ZenFlow adapts update intervals at runtime based on gradient dynamics—no need to tune manually.
---
## ZenFlow Highlights
ZenFlow is the **first offloading framework** to offer a **bounded-asynchronous** update scheme that preserves convergence while delivering **up to 5× end-to-end speed-up** over ZeRO-Offload.
### Performance
| Feature | Benefit |
|--------|---------|
| Up to **5×** end-to-end speed-up over ZeRO-Offload and **6.3×** over ZeRO-Infinity | Faster time-to-convergence |
| **> 85% reduction in GPU stalls** on A100 / H100 nodes | Keeps GPUs busy, higher utilization |
| **≈ 2× lower PCIe traffic** (1.13× model size per step vs. 2× in ZeRO) | Less bandwidth pressure on clusters |
| **Maintains or improves accuracy** on GLUE (OPT-350M → Llama-13B) | No accuracy loss |
| **Lightweight gradient selection** (6000× cheaper than full AllGather) | Scales to multi-GPU settings without memory footprint spikes |
| **Auto-tuning (Zen-auto)** automatically adapts update interval on-the-fly | No manual knob tuning |
For more detailed performance results, please refer to our [arXiv paper](https://arxiv.org/abs/2505.12242).
---
## Design Motivation
Training large models with offloading can save GPU memory, but often at the cost of *performance*. In this section, we briefly discuss three topics. **First**, we explain why coupling CPU-side optimizer updates with GPU compute leads to severe GPU stalls during LLM fine-tuning. **Next**, we quantify how full-gradient offloading saturates the limited PCIe bandwidth on A100/H100 servers, inflating iteration time. **Finally**, we reveal the highly skewed importance distribution of gradients, showing that uniformly updating all parameters in GPUs at the same time is wasteful and unnecessary.
### Offloading-Induced GPU Stalls
<div align="center">
<img src="./images/zenflow-no-overlap.png" alt="" width="1200" />
<div align="left">
*Figure 4: CPU updates dominate step time, causing >60% GPU idle due to poor overlap with compute.*
Synchronous offloading frameworks (e.g., ZeRO-Offload) keep the GPU idle while the CPU performs a full optimizer step and transfers updated parameters back to GPU. For Llama-2-7B with 4× A100, the CPU path can take **longer than 4s** while the backward pass takes **approximately 2s**, so **over 60% of each iteration is pure GPU wait time**. Eliminating this serialization is essential for achieving high GPU utilization.
### Bandwidth Bottlenecks
A single training step moves a full copy of the model gradients from GPU to CPU and a full copy of the model parameters back, i.e., **2× model size of PCIe traffic per step**. Even on PCIe 4.0 (≈ 32 GB/s), Llama-2-13B pushes ~40 GB per iteration, adding **> 1s** of transfer latency.
### Unequal Gradient Importance
Not all gradients matter equally. Our analysis shows that **the top 1% of gradient channels contribute over 90% of the ℓ²-norm energy** during fine-tuning. In other words, most updates have little impact on model learning, yet still incur disproportionately high compute and I/O costs in traditional offloading pipelines.
This skew in gradient importance opens the door to a better design: update critical gradients on GPU right away, and defer the rest for asynchronously batched, lower-priority updates on CPU. ZenFlow turns this idea into a principled, efficient training engine.
<div align="center">
<img src="./images/zenflow-gradients.png" alt="" width="1200" />
<div align="left">
*Figure 5: Top 1% of gradients may contribute over 85% of gradient norms.*
---
## ZenFlow Design
ZenFlow is designed around three key ideas that separate critical and non-critical gradient updates while minimizing communication bottlenecks. Here's how we break the tight coupling between GPU and CPU computation to create a **stall-free** pipeline.
### Idea 1: Importance-Aware Top-k Gradient Update
Not all gradients are equally impactful for training. ZenFlow introduces an **importance-aware** design that prioritizes updates for the top-k most significant gradients. These gradients are updated directly on the GPU, using its high compute bandwidth. This approach allows us to **reduce the size of the per-step gradient update** by nearly **50%**, cutting down the communication load by around 2×.
For the rest of the gradients, which contribute less to the model's learning, ZenFlow batches them and performs asynchronous updates on the CPU. These updates are **deferred** until they are sufficiently accumulated, thereby reducing the impact on training speed.
### Idea 2: Bounded-Asynchronous CPU Accumulation
ZenFlow's **asynchronous accumulation** allows the CPU to stay busy while the GPU performs other computations. We apply an **accumulation window** for the non-critical gradients, allowing them to accumulate over several iterations before updating. This gives ZenFlow the ability to process **multiple rounds of gradient updates** concurrently, eliminating idle time typically spent waiting for the CPU optimizer.
By carefully coordinating CPU updates with GPU execution, ZenFlow **fully hides CPU execution** behind GPU computation—ensuring that GPUs remain actively utilized, avoiding stalls, and **maximizing hardware efficiency**.
### Idea 3: Lightweight Gradient Selection
A key challenge in distributed training is **selecting important gradients** without introducing prohibitive communication and GPU memory costs. Traditional systems rely on global synchronization (via `AllGather`) to gather full gradients, which can become a major bottleneck in multi-GPU settings.
ZenFlow solves this with a **lightweight gradient proxy**: instead of transferring full gradients, ZenFlow uses a **per-column gradient norm** to approximate the importance of each gradient. By computing a compact summary of per-column gradients (e.g., squared norms), ZenFlow reduces communication volume by more than **4,000×**—with nearly no loss in accuracy.
This approach allows ZenFlow to **scale efficiently across GPUs**, without high memory or communication overhead, and it supports **dynamic gradient selection** as the model evolves.
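To make the proxy concrete, here is a small, purely illustrative sketch (not ZenFlow's actual implementation) that scores the columns of a weight gradient by their squared norms and selects the top-k columns for immediate GPU update.
```python
import torch

def topk_columns(grad: torch.Tensor, k_ratio: float = 0.01) -> torch.Tensor:
    """Return indices of the top-k columns of `grad`, ranked by squared column norm."""
    col_scores = grad.pow(2).sum(dim=0)        # one scalar per column: the compact proxy
    k = max(1, int(k_ratio * grad.shape[1]))
    return torch.topk(col_scores, k).indices   # columns to update on the GPU right away

grad = torch.randn(4096, 11008)                # e.g., an MLP weight gradient
important_cols = topk_columns(grad)
# In a multi-GPU setting, only `col_scores` (one value per column) would need to be
# reduced across ranks, which is far cheaper than all-gathering the full gradient.
```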
### Putting It All Together: ZenFlow's Zero-Stall Pipeline
<div align="center">
<img src="./images/zenflow-workflow.png" alt="" width="1200" />
<div align="left">
*Figure 6: ZenFlow's stall-free pipeline overlaps CPU updates and transfers with multi-step GPU compute.*
1. **Forward/Backward Pass on GPU:** ZenFlow processes the forward and backward passes on the GPU, immediately updating the **top-k gradients** on the GPU without waiting for the CPU.
2. **Gradient Transfer to CPU:** While the GPU is busy, gradients from the current iteration (or previous ones) are transferred to the CPU over a dedicated PCIe stream. This is done in parallel with GPU computation, without causing any GPU wait time.
3. **CPU Update:** Once a batch of non-critical gradients has accumulated, the CPU performs the update asynchronously. This update typically spans multiple GPU iterations, but is hidden behind GPU work, making it virtually invisible to the overall pipeline.
4. **Double Buffering:** ZenFlow uses **double buffering** to manage the newly updated gradients. When the CPU update is complete, the new parameters are transferred back to the GPU. The swap is as fast as a pointer flip—no need to reload the entire model or re-launch the kernel.
By constantly **overlapping GPU computation with CPU-side work**, ZenFlow transforms the traditional compute → wait → update cycle into a continuous, **stall-free pipeline**.
---
## Getting Started: Try out DeepSpeed-ZenFlow
To try out DeepSpeed-ZenFlow, please refer to the [ZenFlow tutorial](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/training/DeepSpeed-ZenFlow/README.md) in our DeepSpeedExamples repo.
---
## Citation
```bibtex
@article{lan2025zenflow,
title = {ZenFlow: Enabling Stall-Free Offloading Training via Asynchronous Updates},
author = {Tingfeng Lan and Yusen Wu and Bin Ma and Zhaoyuan Su and Rui Yang and Tekin Bicer and Masahiro Tanaka and Olatunji Ruwase and Dong Li and Yue Cheng},
journal = {arXiv preprint arXiv:2505.12242},
year = {2025}
}
```
---
## Acknowledgements
This work is the result of a close collaboration between University of Virginia (UVA), University of California, Merced (UC Merced), Argonne National Laboratory (ANL) and DeepSpeed team.
The contributors include [Tingfeng Lan](https://antlera.github.io/), [Yusen Wu](https://joshwoo2003.github.io/), [Zhaoyuan Su](https://alexsssu.github.io/), [Rui Yang](https://ruiyang00.github.io/), and [Yue Cheng](https://tddg.github.io/) from UVA; [Bin Ma](https://www.linkedin.com/in/bin-ma-ba665b182/) and [Dong Li](https://faculty.ucmerced.edu/dong-li/) from UC Merced; [Tekin Bicer](https://www.anl.gov/profile/tekin-bicer) from ANL; [Olatunji Ruwase](https://www.linkedin.com/in/tunji-ruwase-088952/) and [Masahiro Tanaka](https://www.linkedin.com/in/masahiro-tanaka-77482926/) from the DeepSpeed team. We especially thank [Olatunji Ruwase](https://www.linkedin.com/in/tunji-ruwase-088952/) and [Masahiro Tanaka](https://www.linkedin.com/in/masahiro-tanaka-77482926/) for their early feedback and insightful discussions and also for open-source community support.


View File

@ -0,0 +1,242 @@
<div align="center">
# Automatic Tensor Parallel (AutoTP) Training of Hugging Face models
</div>
# Introduction
Tensor parallelism (TP) is an important memory optimization for training large-scale deep learning models. Despite the popularity of training Hugging Face (HF) [models](https://huggingface.co/models), the model scaling options for the **[HF trainer](https://huggingface.co/docs/transformers/main_classes/trainer)** were previously limited to sharded data parallelism through [ZeRO](https://huggingface.co/docs/accelerate/usage_guides/deepspeed)/[FSDP](https://huggingface.co/docs/accelerate/usage_guides/fsdp). While ZeRO3 offers superior memory efficiency, it incurs significant communication costs. ZeRO (1/2) has lower communication overhead, but in the case of very large models, it cannot be used directly due to memory limitations. Therefore, combining TP with ZeRO (1/2) offers more balanced options for memory and performance. Moreover, through TP, we can alleviate the batch scaling limitations imposed by ZeRO/FSDP.
We are pleased to announce that DeepSpeed now provides native automatic tensor parallel training for Hugging Face (HF) transformers. This new feature builds on DeepSpeed's [AutoTP](https://www.deepspeed.ai/tutorials/automatic-tensor-parallelism/) mechanism, which was previously restricted to inference. AutoTP training can be combined with ZeRO to unlock unprecedented efficiency benefits for HF model post-training, including:
**1**. Model scaling with lower communication costs than FSDP/ZeRO3 (e.g., use AutoTP + ZeRO1 to achieve ZeRO3 memory savings).
**2**. Batch size scaling for faster training and increased throughput.
**3**. Context length scaling to enable new application scenarios.
We have integrated AutoTP training with ZeRO1 & ZeRO2, with ZeRO3 integration on the way. AutoTP training is available in DeepSpeed versions >= 0.16.4.
# Batch Scaling with AutoTP Training + ZeRO
The following is a batch scaling experiment of Llama3 8B training conducted on [Gaudi2 Accelerator](https://www.intel.com/content/www/us/en/products/details/processors/ai-accelerators/gaudi.html).
<div align="center">
<img src="media/batchscale.png">
*Figure 1. Batch scaling experiment on Gaudi2, showing throughput improvements from 2 to 4 cards by combining AutoTP and ZeRO. The micro-batch size (mbs) used is the maximum possible value for the given configuration. A higher speedup indicates better performance.*
</div>
<div align="center">
<img src="media/flowchart.png">
*Figure 2. Model training with AutoTP + ZeRO*
</div>
Figure 2 illustrates the basic flowchart. The division between TP and ZeRO is implemented through the AutoTP parser and the ZeRO Wrapper in [Accelerate](https://github.com/huggingface/accelerate.git). In addition, the TP-based dataloader and save mechanism are supported in both DeepSpeed and Accelerate.
# Usage
Although we evaluated AutoTP training with Llama2 & Llama3 models in this blog, we expect compatibility with other Hugging Face models, especially [those](https://www.deepspeed.ai/tutorials/automatic-tensor-parallelism/) previously validated with AutoTP inference.
**Requirements**
- `deepspeed >= 0.16.4`
- `transformers >= 4.50.1`
- `accelerate >= 1.6.0`
**Enable TP training**
Similar to ZeRO, AutoTP training is enabled using the [deepspeed configuration file](https://www.deepspeed.ai/docs/config-json/) by specifying ```[tensor_parallel][autotp_size]```.
```
"ZeRO_optimization": {
"stage": 1,
"gather_16bit_weights_on_model_save": true,
...
},
"tensor_parallel":{
"autotp_size": 4
},
```
The parallel configuration follows this logic:
```
tp_size = autotp_size
dp_size = num_gpus / tp_size
```
Note that the global_batch_size (gbs) changes with different TP settings:
```
gbs (only dp) = per_device_batch_size * n_gpus * gradient_accumulation_steps
gbs (dp with tp) = per_device_batch_size * n_gpus / tp_size * gradient_accumulation_steps
```
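For example, with 8 GPUs, `per_device_batch_size=2`, and `gradient_accumulation_steps=4`, the global batch size is 64 with pure data parallelism but drops to 16 when `tp_size=4` (DP2 × TP4), so the per-device batch size or accumulation steps may need to be increased to keep gbs constant.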
**Save Model**
Saving checkpoints and model files is fully compatible with HF transformers. The [trainer.save_model()](https://huggingface.co/docs/transformers/v4.49.0/en/main_classes/trainer#transformers.Trainer.save_model) method saves the original model. Ensure ```gather_16bit_weights_on_model_save``` is set to ```true``` in the [deepspeed configuration file](https://www.deepspeed.ai/docs/config-json/).
```
"zero_optimization": {
...
"gather_16bit_weights_on_model_save": true,
},
```
```
trainer.save_model(your_saved_path)
```
Models saved this way can be directly used for HF format inference without intermediate transformations.
**Saving Checkpoints and Resuming**
Saving checkpoints remains compatible with HF transformers. Use [trainer.save_state()](https://huggingface.co/docs/transformers/v4.49.0/en/main_classes/trainer#transformers.Trainer.save_state) or set the save interval for automatic saving, which can be used to resume training.
```
trainer.train(resume_from_checkpoint="your_saved_path/checkpoint-1200")
```
# Example
We validated AutoTP training using a supervised finetuning (SFT) task: [stanford_alpaca](https://github.com/tatsu-lab/stanford_alpaca). The original benchmark model used in this project is Llama2-7B. The example code is also available [here](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/training/tensor_parallel).
**Training Loss curve**
The following loss curves depict SFT training, where gbs is uniformly set to 32 and other configurations match the default experiment settings from [stanford_alpaca](https://github.com/tatsu-lab/stanford_alpaca). The loss curves are largely consistent across the following setups:
- ZeRO3
- TP + disable ZeRO
- ZeRO1 and ZeRO1 + AutoTP
- ZeRO2 and ZeRO2 + AutoTP
<div align="center">
<img src="media/zero3.png">
*Figure 3. Loss curve of ZeRO3 stage training (gbs=32, dp=8)*
</div>
<div align="center">
<img src="media/tp8.png">
*Figure 4. Loss curve of AutoTP training (gbs=32, tp=8)*
</div>
<div align="center">
<img src="media/tpzero1.png">
*Figure 5. Loss curve of AutoTP + ZeRO1 training (gbs=32, dp=2, tp=4)*
</div>
<div align="center">
<img src="media/tpzero2.png">
*Figure 6. Loss curve of AutoTP + ZeRO2 training (gbs=32, dp=2, tp=4)*
</div>
**Resuming Training**
We tested recovery training curves from step 1200 in AutoTP + ZeRO1 and AutoTP + ZeRO2, which align with the original training curves.
<div align="center">
<img src="media/zero1tpload.png">
*Figure 7. AutoTP + ZeRO1 resuming training*
<img src="media/zero2tpload.png">
*Figure 8. AutoTP + ZeRO2 resuming training*
</div>
**Model Evaluation**
We conducted inference evaluations for the [MMLU task](https://github.com/EleutherAI/lm-evaluation-harness).
In MMLU, the scores for AutoTP + ZeRO1 and ZeRO1, as well as AutoTP + ZeRO2 and ZeRO2, match closely, and all show a clear improvement over the pre-trained model before SFT.
<div align="center">
| Groups | Version | Filter | n-shot | Metric | Model before SFT | ZeRO1 DP8 training | ZeRO1 TP4 DP2 training | ZeRO2 DP8 training | ZeRO2 TP4 DP2 training |
|--------|---------|--------|--------|--------|-----------------------|--------------------|------------------------|--------------------|------------------------|
| mmlu | 2 | none | | acc | 0.4185 ± 0.0041 | 0.4472 ± 0.0041 | 0.4444 ± 0.0041 | 0.4543 ± 0.0041 | 0.4529 ± 0.0041 |
| - humanities | 2 | none | | acc | 0.3979 ± 0.0069 | 0.4185 ± 0.0070 | 0.4145 ± 0.0069 | 0.4274 ± 0.0070 | 0.4272 ± 0.0070 |
| - other | 2 | none | | acc | 0.4712 ± 0.0089 | 0.5249 ± 0.0087 | 0.5182 ± 0.0088 | 0.5282 ± 0.0087 | 0.5269 ± 0.0087 |
| - social sciences | 2 | none | | acc | 0.4742 ± 0.0089 | 0.5070 ± 0.0089 | 0.5083 ± 0.0088 | 0.5151 ± 0.0088 | 0.5115 ± 0.0089 |
| - stem | 2 | none | | acc | 0.3428 ± 0.0084 | 0.3549 ± 0.0084 | 0.3539 ± 0.0084 | 0.3622 ± 0.0084 | 0.3609 ± 0.0084 |
*Table 1. MMLU score with Llama2-7B inference*
</div>
# Miscellaneous
Users who define their own dataloader must ensure data consistency within ```deepspeed.utils.groups.get_tensor_model_parallel_group()```; DeepSpeed provides basic validation functions to assist with this, and a simple cross-rank check is sketched below.
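As one example (assuming a typical `torch.distributed` setup and that the batch tensor lives on the device expected by the active communication backend), the ranks in a TP group can compare a cheap checksum of their batch:
```python
import torch
import torch.distributed as dist
from deepspeed.utils import groups

def assert_batch_consistent_across_tp(batch: torch.Tensor) -> None:
    """Cheap sanity check: all ranks in the same TP group should see the same batch."""
    tp_group = groups.get_tensor_model_parallel_group()
    checksum = batch.detach().double().sum()
    low, high = checksum.clone(), checksum.clone()
    dist.all_reduce(low, op=dist.ReduceOp.MIN, group=tp_group)
    dist.all_reduce(high, op=dist.ReduceOp.MAX, group=tp_group)
    assert torch.allclose(low, high), "Ranks in the same TP group received different data!"
```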
Furthermore, users who are not using the transformers library can replace the ```TensorParallel_Layer``` layer and its subclasses as needed. See the ```prepare_tp_model``` function in ```unit/model_parallelism/test_autotp_training.py```. Users can also define different shard and gather behaviors for subclasses of ```TensorParallel_Layer```.
# Ongoing Work
- **Optimization**: Communication/Activation optimization.
- **Usability**: Support the [Transformers TP plan](https://github.com/huggingface/transformers/blob/336dc69d63d56f232a183a3e7f52790429b871ef/src/transformers/models/llama/configuration_llama.py#L145), decouple the AutoTP parser, and expand model testing.
Theoretically, features supported by ZeRO should also be supported, though extensive testing is pending.
Bug reports, enhancements, and additional model training examples are welcome.
# Contributors
This work was made possible through a deep collaboration between Intel and Microsoft. The contributors include Mingzhi Liu, Guokai Ma, Kiefer Kuah, Yejing Lai, Kurt Chen, Yejun Guo, Guangxin Xu, Xiaofei Feng, and Yang Wang from Intel; Guanhua Wang and Olatunji Ruwase from Microsoft.


View File

@ -97,7 +97,7 @@ DeepSpeed can be installed on Windows in two ways. The simpler
# Summary
Enabling DeepSpeed, a popular deep learning framework, to run natively on Windows, the most popular operating system, is an important step toward letting everyone and every organization benefit from the current AI revolution. In this blog, we shared the early results we have achieved toward this goal. Although DeepSpeed support for Windows is still under active development, we hope the above results are already useful and encouraging to our users. Our next steps cover multi-GPU support, weight quantization, and performance optimization.
Enabling DeepSpeed, a popular deep learning framework, to run natively on Windows, the most popular operating system, is a critical step toward letting every person and every organization benefit from the ongoing AI revolution. In this blog, we shared the early results we have achieved toward this goal. Although DeepSpeed support for Windows is still under active development, we hope the above results are already useful and encouraging to our users. Our next steps cover multi-GPU support, weight quantization, and performance optimization.
# Acknowledgements
The completion of this project was made possible by the strong collaboration of current and former DeepSpeed members, including Costin Eseanu, Logan Adams, Elton Zheng, Reza Yazdani Aminabadi, Martin Cai, and Olatunji Ruwase. We also thank the DeepSpeed users who promptly raised this need, provided critical workarounds, partial solutions, and constructive feedback, and most importantly, walked alongside us throughout.

View File

@ -10,6 +10,7 @@ set DS_BUILD_FP_QUANTIZER=0
set DS_BUILD_GDS=0
set DS_BUILD_RAGGED_DEVICE_OPS=0
set DS_BUILD_SPARSE_ATTN=0
set DS_BUILD_DEEP_COMPILE=0
python -m build --wheel --no-isolation

4
ci/__init__.py Normal file
View File

@ -0,0 +1,4 @@
# Copyright (c) DeepSpeed Team.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team

43
ci/accelerate.py Normal file
View File

@ -0,0 +1,43 @@
# Copyright (c) Snowflake.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
from pathlib import Path
import modal
ROOT_PATH = Path(__file__).parents[1]
# yapf: disable
image = (modal.Image
.from_registry("pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel", add_python="3.10")
.run_commands("apt update && apt install -y libaio-dev")
.apt_install("git")
.run_commands("uv pip install --system --compile-bytecode datasets==3.6.0")
.run_commands(
"git clone https://github.com/huggingface/accelerate && \
uv pip install --system --compile-bytecode ./accelerate[testing]"
)
.pip_install_from_requirements(ROOT_PATH / "requirements/requirements.txt", gpu="any")
.pip_install_from_requirements(ROOT_PATH / "requirements/requirements-dev.txt", gpu="any")
.add_local_dir(ROOT_PATH , remote_path="/root/", copy=True)
.run_commands("pip install /root")
.add_local_dir(ROOT_PATH / "accelerator", remote_path="/root/deepspeed/accelerator")
.add_local_dir(ROOT_PATH / "csrc", remote_path="/root/deepspeed/ops/csrc")
.add_local_dir(ROOT_PATH / "op_builder", remote_path="/root/deepspeed/ops/op_builder")
)
app = modal.App("deepspeedai-accelerate-ci", image=image)
@app.function(
gpu="l40s:1",
timeout=1800,
)
def pytest():
import subprocess
subprocess.run(
"pytest /accelerate/tests/deepspeed".split(),
check=True,
cwd=ROOT_PATH / ".",
)

39
ci/torch_latest.py Normal file
View File

@ -0,0 +1,39 @@
# Copyright (c) Snowflake.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
from pathlib import Path
import modal
ROOT_PATH = Path(__file__).parents[1]
# yapf: disable
image = (modal.Image
.from_registry("pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel", add_python="3.10")
.run_commands("apt update && apt install -y libaio-dev")
.pip_install_from_requirements(ROOT_PATH / "requirements/requirements.txt", gpu="any")
.pip_install_from_requirements(ROOT_PATH / "requirements/requirements-dev.txt", gpu="any")
.add_local_dir(ROOT_PATH , remote_path="/root/", copy=True)
.run_commands("pip install /root")
.add_local_dir(ROOT_PATH / "accelerator", remote_path="/root/deepspeed/accelerator")
.add_local_dir(ROOT_PATH / "csrc", remote_path="/root/deepspeed/ops/csrc")
.add_local_dir(ROOT_PATH / "op_builder", remote_path="/root/deepspeed/ops/op_builder")
)
app = modal.App("deepspeedai-torch-latest-ci", image=image)
@app.function(
gpu="l40s:2",
timeout=1800,
)
def pytest():
import subprocess
subprocess.run(
"pytest -n 4 --verbose tests/unit/runtime/zero/test_zero.py tests/unit/runtime/half_precision/test_bf16.py --torch_ver=2.6 --cuda_ver=12.4".split(),
check=True,
cwd=ROOT_PATH / ".",
)

View File

@ -101,7 +101,7 @@ int io_prep_generator::prep_iocbs(const int n_iocbs, std::vector<struct iocb*>*
return actual_n_iocbs;
}
int get_file_size(const char* filename, int64_t& size)
int64_t get_file_size(const char* filename, int64_t& size)
{
struct stat st;
if (stat(filename, &st) == -1) { return -1; }
@ -109,6 +109,14 @@ int get_file_size(const char* filename, int64_t& size)
return 0;
}
int64_t get_fd_file_size(const int fd, int64_t& size)
{
struct stat st;
if (fstat(fd, &st) == -1) { return -1; }
size = st.st_size;
return 0;
}
void* ds_page_aligned_alloc(const int64_t size, const bool lock)
{
void* ptr;

View File

@ -78,4 +78,5 @@ struct io_prep_generator {
void* ds_page_aligned_alloc(const int64_t size, const bool lock = false);
int get_file_size(const char* filename, int64_t& size);
int64_t get_file_size(const char* filename, int64_t& size);
int64_t get_fd_file_size(const int fd, int64_t& size);

View File

@ -11,20 +11,19 @@ io_op_desc_t::io_op_desc_t(const bool read_op,
const torch::Tensor& buffer,
const int fd,
const char* filename,
const int64_t file_num_bytes,
const int intra_op_parallelism,
const bool validate,
const int64_t file_offset)
: _read_op(read_op),
_buffer(buffer),
_fd(fd),
_filename(filename),
_file_num_bytes(file_num_bytes),
_filename((filename == nullptr) ? std::string() : filename),
_file_offset(file_offset),
_intra_op_parallelism(intra_op_parallelism),
_num_bytes_per_thread(static_cast<int64_t>(buffer.nbytes()) / intra_op_parallelism),
_validate(validate)
{
if (validate) { assert(nullptr != filename); }
}
char* io_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); }

View File

@ -13,8 +13,7 @@ struct io_op_desc_t {
const bool _read_op;
torch::Tensor _buffer;
int _fd;
const std::string _filename;
const int64_t _file_num_bytes;
std::string _filename;
const int _intra_op_parallelism;
const int64_t _num_bytes_per_thread;
torch::Tensor _contiguous_buffer;
@ -25,7 +24,6 @@ struct io_op_desc_t {
const torch::Tensor& buffer,
const int fd,
const char* filename,
const int64_t file_num_bytes,
const int intra_op_parallelism,
const bool validate,
const int64_t file_offset);

View File

@ -9,23 +9,15 @@
using namespace std;
cpu_op_desc_t::cpu_op_desc_t(
const std::unique_ptr<struct deepspeed_pin_tensor_t>& pinned_tensor_mgr,
const bool read_op,
const torch::Tensor& buffer,
const std::unique_ptr<struct deepspeed_pin_tensor_t>& pinned_tensor_mgr,
const int fd,
const char* filename,
const int64_t file_num_bytes,
const int intra_op_parallelism,
const bool validate,
const int64_t file_offset)
: io_op_desc_t(read_op,
buffer,
fd,
filename,
file_num_bytes,
intra_op_parallelism,
validate,
file_offset),
: io_op_desc_t(read_op, buffer, fd, filename, intra_op_parallelism, validate, file_offset),
_cpu_buffer(buffer),
_pinned_tensor_mgr(pinned_tensor_mgr),
_is_managed_bounce_buffer(false)
@ -66,7 +58,8 @@ void cpu_op_desc_t::finish()
void cpu_op_desc_t::validate()
{
validate_aio_operation(_read_op, _filename.c_str(), data_ptr(), _file_num_bytes);
const auto num_io_bytes = static_cast<int64_t>(_contiguous_buffer.nbytes());
validate_aio_operation(_read_op, _filename.c_str(), data_ptr(), num_io_bytes);
}
void cpu_op_desc_t::run(const int tid,

View File

@ -13,12 +13,11 @@ struct cpu_op_desc_t : io_op_desc_t {
bool _is_managed_bounce_buffer;
const std::unique_ptr<struct deepspeed_pin_tensor_t>& _pinned_tensor_mgr;
cpu_op_desc_t(const bool read_op,
cpu_op_desc_t(const std::unique_ptr<struct deepspeed_pin_tensor_t>& pinned_tensor_mgr,
const bool read_op,
const torch::Tensor& buffer,
const std::unique_ptr<struct deepspeed_pin_tensor_t>& pinned_tensor_mgr,
const int fd,
const char* filename,
const int64_t file_num_bytes,
const int intra_op_parallelism,
const bool validate,
const int64_t file_offset);

View File

@ -6,7 +6,6 @@
/*
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <condition_variable>
#include <memory>
#include "deepspeed_py_io_handle.h"

View File

@ -10,10 +10,30 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
#include "deepspeed_py_io_handle.h"
#include <cstdlib>
#define O_DIRECT_ALIGNMENT 512
using namespace std;
static void _start_aio_thread(std::shared_ptr<struct deepspeed_aio_thread_t> ctxt) { ctxt->run(); }
static bool is_valid_bytes_to_read(const char* filename,
const int64_t file_offset,
const int64_t num_bytes_to_read)
{
int64_t num_file_bytes;
if (-1 == get_file_size(filename, num_file_bytes)) {
const auto error_code = errno;
report_file_error(filename, " fstat for read", error_code);
return false;
}
if ((file_offset + num_bytes_to_read) > num_file_bytes) {
std::cout << filename << ": file_offset + buffer nbytes > file bytes "
<< (file_offset + num_bytes_to_read) << " > " << num_file_bytes << std::endl;
}
assert((file_offset + num_bytes_to_read) <= num_file_bytes);
return true;
}
deepspeed_io_handle_t::deepspeed_io_handle_t(const int block_size,
const int queue_depth,
const bool single_submit,
@ -58,6 +78,11 @@ const bool deepspeed_io_handle_t::get_overlap_events() const { return _overlap_e
const int deepspeed_io_handle_t::get_intra_op_parallelism() const { return _intra_op_parallelism; }
const int deepspeed_io_handle_t::get_alignment() const
{
return _intra_op_parallelism * O_DIRECT_ALIGNMENT;
}
int deepspeed_io_handle_t::read(torch::Tensor& buffer,
const char* filename,
const bool validate,
@ -185,7 +210,7 @@ int deepspeed_io_handle_t::wait()
completed_op->finish();
close(completed_op->_fd);
if (!completed_op->_filename.empty()) { close(completed_op->_fd); }
--_num_pending_ops;
++num_completed_ops;
@ -199,7 +224,8 @@ bool deepspeed_io_handle_t::_is_valid_parallel_aio_op(const bool read_op, const
const auto op_string = read_op ? "Read" : "Write";
if (num_bytes % get_intra_op_parallelism()) {
std::cout << "deepspeed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes
<< " not divisible by thread count = " << get_intra_op_parallelism() << std::endl;
<< " not divisible by intra op parallelism = " << get_intra_op_parallelism()
<< std::endl;
return false;
}
@ -211,45 +237,61 @@ std::shared_ptr<struct io_op_desc_t> deepspeed_io_handle_t::_create_io_op_desc(
const torch::Tensor& buffer,
const int fd,
const char* filename,
const int64_t file_num_bytes,
const bool validate,
const int64_t file_offset)
{
return std::make_shared<cpu_op_desc_t>(read_op,
return std::make_shared<cpu_op_desc_t>(_pinned_tensor_mgr,
read_op,
buffer,
_pinned_tensor_mgr,
fd,
filename,
file_num_bytes,
_intra_op_parallelism,
validate,
file_offset);
}
int deepspeed_io_handle_t::_pread(const torch::Tensor& buffer,
const int fd,
const char* filename,
const bool validate,
const bool async,
const int64_t file_offset)
{
auto scheduled_op = _create_io_op_desc(true, buffer, fd, filename, validate, file_offset);
_schedule_aio_work(scheduled_op);
if (async) { return 0; }
return wait();
}
int deepspeed_io_handle_t::pread(const torch::Tensor& buffer,
const char* filename,
const bool validate,
const bool async,
const int64_t file_offset)
{
int64_t num_file_bytes;
if (-1 == get_file_size(filename, num_file_bytes)) {
const auto error_code = errno;
report_file_error(filename, " fstat for read", error_code);
return -1;
}
// buffer can exceed file size to enable 4k alignment
const auto buffer_bytes = static_cast<int64_t>(buffer.nbytes());
assert((num_file_bytes % _intra_op_parallelism) == 0);
if (!is_valid_bytes_to_read(filename, file_offset, buffer_bytes)) { return -1; }
if (!_is_valid_parallel_aio_op(true, buffer_bytes)) { return -1; }
const auto fd = open_file(filename, true);
if (fd == -1) { return -1; }
auto scheduled_op =
_create_io_op_desc(true, buffer, fd, filename, num_file_bytes, validate, file_offset);
return _pread(buffer, fd, filename, validate, async, file_offset);
}
int deepspeed_io_handle_t::_pwrite(const torch::Tensor& buffer,
const int fd,
const char* filename,
const bool validate,
const bool async,
const int64_t file_offset)
{
auto scheduled_op = _create_io_op_desc(false, buffer, fd, filename, validate, file_offset);
_schedule_aio_work(scheduled_op);
@ -265,21 +307,13 @@ int deepspeed_io_handle_t::pwrite(const torch::Tensor& buffer,
const int64_t file_offset)
{
const auto num_write_bytes = static_cast<int64_t>(buffer.nbytes());
assert((num_write_bytes % _intra_op_parallelism) == 0);
if (!_is_valid_parallel_aio_op(false, num_write_bytes)) { return -1; }
const auto fd = open_file(filename, false);
if (fd == -1) { return -1; }
auto scheduled_op =
_create_io_op_desc(false, buffer, fd, filename, num_write_bytes, validate, file_offset);
_schedule_aio_work(scheduled_op);
if (async) { return 0; }
return wait();
return _pwrite(buffer, fd, filename, validate, async, file_offset);
}
int deepspeed_io_handle_t::sync_pread(torch::Tensor& buffer,
@ -310,6 +344,16 @@ int deepspeed_io_handle_t::async_pwrite(const torch::Tensor& buffer,
return pwrite(buffer, filename, false, true, file_offset);
}
int deepspeed_io_handle_t::async_pwrite(const torch::Tensor& buffer,
const int fd,
const int64_t file_offset = 0)
{
const auto num_write_bytes = static_cast<int64_t>(buffer.nbytes());
if (!_is_valid_parallel_aio_op(false, num_write_bytes)) { return -1; }
return _pwrite(buffer, fd, nullptr, false, true, file_offset);
}
at::Tensor deepspeed_io_handle_t::new_cpu_locked_tensor(const int64_t num_elem,
const torch::Tensor& example_tensor)
{

View File

@ -37,6 +37,7 @@ struct deepspeed_io_handle_t {
const bool get_single_submit() const;
const bool get_overlap_events() const;
const int get_intra_op_parallelism() const;
const int get_alignment() const;
int read(torch::Tensor& buffer,
const char* filename,
@ -67,6 +68,7 @@ struct deepspeed_io_handle_t {
int async_pread(torch::Tensor& buffer, const char* filename, const int64_t file_offset);
int async_pwrite(const torch::Tensor& buffer, const char* filename, const int64_t file_offset);
int async_pwrite(const torch::Tensor& buffer, const int fd, const int64_t file_offset);
// TODO: Make API's args to be shape and dtype.
torch::Tensor new_cpu_locked_tensor(const int64_t num_elem,
@ -84,11 +86,24 @@ struct deepspeed_io_handle_t {
bool _is_valid_parallel_aio_op(const bool read_op, const int64_t num_bytes);
int _pread(const torch::Tensor& buffer,
const int fd,
const char* filename,
const bool validate,
const bool async,
const int64_t file_offset);
int _pwrite(const torch::Tensor& buffer,
const int fd,
const char* filename,
const bool validate,
const bool async,
const int64_t file_offset);
virtual std::shared_ptr<struct io_op_desc_t> _create_io_op_desc(const bool read_op,
const torch::Tensor& buffer,
const int fd,
const char* filename,
const int64_t file_num_bytes,
const bool validate,
const int64_t file_offset);
};

View File

@ -6,7 +6,6 @@
/*
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <torch/extension.h>
#include "deepspeed_py_aio_handle.h"
#include "deepspeed_py_copy.h"
@ -34,6 +33,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
.def("get_single_submit", &deepspeed_aio_handle_t::get_single_submit)
.def("get_overlap_events", &deepspeed_aio_handle_t::get_overlap_events)
.def("get_intra_op_parallelism", &deepspeed_aio_handle_t::get_intra_op_parallelism)
.def("get_alignment", &deepspeed_aio_handle_t::get_alignment)
.def("read",
&deepspeed_aio_handle_t::read,
@ -53,7 +53,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
.def("pread",
&deepspeed_aio_handle_t::pread,
"Parallel file read with option of parallelism. Returns count of completed read ops",
"Parallel file read with option of asynchronous completion. If synchronous, returns "
"count of completed read ops",
"buffer"_a,
"filename"_a,
"validate"_a,
@ -62,7 +63,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
.def("pwrite",
&deepspeed_aio_handle_t::pwrite,
"Parallel file write with option of parallelism. Returns count of completed write ops",
"Parallel file write with option of asynchronous completion. If synchronous, returns "
"count of completed write ops",
"buffer"_a,
"filename"_a,
"validate"_a,
@ -71,7 +73,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
.def("sync_pread",
&deepspeed_aio_handle_t::sync_pread,
"Synchrononous parallel file read. Returns count of completed read ops",
"Synchronous parallel file read. Returns count of completed read ops",
"buffer"_a,
"filename"_a,
"file_offset"_a = 0)
@ -86,17 +88,27 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
.def("async_pread",
&deepspeed_aio_handle_t::async_pread,
"Asynchronous parallel file read. Returns 0 on success. Returns 0 on success, and "
"following wait() returns count of completed ops.",
"subsequent wait() returns count of completed ops.",
"buffer"_a,
"filename"_a,
"file_offset"_a = 0)
.def(
"async_pwrite",
py::overload_cast<const torch::Tensor&, const char*, const int64_t>(
&deepspeed_aio_handle_t::async_pwrite),
"Asynchronous parallel file write. Returns 0 on success, and subsequent wait() returns "
"count of completed ops.",
"buffer"_a,
"filename"_a,
"file_offset"_a = 0)
.def("async_pwrite",
&deepspeed_aio_handle_t::async_pwrite,
"Asynchronous parallel file write. Returns 0 on success, and following wait() returns "
"count of completed ops.",
py::overload_cast<const torch::Tensor&, const int, const int64_t>(
&deepspeed_aio_handle_t::async_pwrite),
"Asynchronous parallel file write using opened python file object.",
"buffer"_a,
"filename"_a,
"fd"_a,
"file_offset"_a = 0)
.def("new_cpu_locked_tensor",

View File

@ -17,7 +17,7 @@ from perf_sweep_utils import READ_OP_DESC, WRITE_OP_DESC, BENCH_LOG_DIR, \
READ_LOG_DIR, WRITE_LOG_DIR
from deepspeed.ops.op_builder import AsyncIOBuilder
OTHER_OPTIONS = '--handle'
OTHER_OPTIONS = '--engine aio_handle'
PERF_SCRIPT = 'test_ds_aio.py'
DEFAULT_SWEEP_CONFIG = {
"block_size": ["128K", "1M"],
@ -109,6 +109,20 @@ def get_sweep_config_dict(sweep_config_json):
return sweep_config
QUEUE_DEPTH = "--queue_depth"
BLOCK_SIZE = "--block_size"
SINGLE_SUBMIT = "--single_submit"
SEQUENTIAL_REQUESTS = "--sequential_requests"
THREAD_COUNT = "--threads"
IO_PARALLEL = "--io_parallel"
DEPRECATED_KEYS = {THREAD_COUNT: "multi_process"}
def _handle_key_deprecation(key):
return DEPRECATED_KEYS.get(f'--{key}', key)
def get_sweep_cmd_lines(sweep_config_dict):
def flatten_options(key, value_list):
@ -123,7 +137,7 @@ def get_sweep_cmd_lines(sweep_config_dict):
return flat_list
flat_list = [flatten_options(key, value) for key, value in sweep_config_dict.items()]
flat_list = [flatten_options(_handle_key_deprecation(key), value) for key, value in sweep_config_dict.items()]
cmd_list = list(itertools.product(*flat_list))
cmd_list = [list(cmd) for cmd in cmd_list]
#dump_cmd_lines(cmd_list)
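A standalone restatement of the key-deprecation shim added above, with a worked example: legacy sweep-config keys such as "threads" are rewritten to "multi_process", while every other key passes through unchanged.

# Self-contained sketch mirroring the shim in aio_bench_perf_sweep.py.
DEPRECATED_KEYS = {"--threads": "multi_process"}

def _handle_key_deprecation(key):
    return DEPRECATED_KEYS.get(f'--{key}', key)

assert _handle_key_deprecation('threads') == 'multi_process'    # legacy configs keep working
assert _handle_key_deprecation('queue_depth') == 'queue_depth'  # everything else is untouched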

View File

@ -0,0 +1,21 @@
python test_ds_aio.py \
--read \
--handle --io_size 400M \
--loops 3 \
--folder_to_device_mapping \
/mnt/nvme23/aio:0 \
/mnt/nvme23/aio:1 \
/mnt/nvme23/aio:2 \
/mnt/nvme23/aio:3 \
/mnt/nvme45/aio:4 \
/mnt/nvme45/aio:5 \
/mnt/nvme45/aio:6 \
/mnt/nvme45/aio:7 \
/mnt/nvme67/aio:8 \
/mnt/nvme67/aio:9 \
/mnt/nvme67/aio:10 \
/mnt/nvme67/aio:11 \
/mnt/nvme89/aio:12 \
/mnt/nvme89/aio:13 \
/mnt/nvme89/aio:14 \
/mnt/nvme89/aio:15 \

View File

@ -0,0 +1,20 @@
python test_ds_aio.py \
--handle --io_size 400M \
--loops 3 \
--folder_to_device_mapping \
/mnt/nvme23/aio:0 \
/mnt/nvme23/aio:1 \
/mnt/nvme23/aio:2 \
/mnt/nvme23/aio:3 \
/mnt/nvme45/aio:4 \
/mnt/nvme45/aio:5 \
/mnt/nvme45/aio:6 \
/mnt/nvme45/aio:7 \
/mnt/nvme67/aio:8 \
/mnt/nvme67/aio:9 \
/mnt/nvme67/aio:10 \
/mnt/nvme67/aio:11 \
/mnt/nvme89/aio:12 \
/mnt/nvme89/aio:13 \
/mnt/nvme89/aio:14 \
/mnt/nvme89/aio:15 \

View File

@ -0,0 +1,6 @@
python test_ds_aio.py \
--read \
--handle --io_size 400M \
--loops 3 \
--folder /mnt/nvme23/aio \
--multi_process 16

View File

@ -0,0 +1,5 @@
python test_ds_aio.py \
--handle --io_size 400M \
--loops 3 \
--folder /mnt/nvme23/aio \
--multi_process 16

View File

@ -9,6 +9,7 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
import argparse
import os
from test_ds_aio_utils import refine_integer_value
from ds_aio_constants import AIO_HANDLE, AIO_BASIC, TORCH_FAST_IO, TORCH_IO, VALID_ENGINES
from deepspeed.accelerator import get_accelerator
MAPPING_DELIMITER = ':'
@ -21,6 +22,9 @@ def refine_args(args):
if args.block_size and type(args.block_size) == str:
args.block_size = refine_integer_value(args.block_size)
if args.fast_io_size and type(args.fast_io_size) == str:
args.fast_io_size = refine_integer_value(args.fast_io_size)
return args
@ -83,6 +87,19 @@ def validate_args(args):
no_error = no_error and no_mapping_error
error_messages += mapping_error_messages
# Validate --engine
if args.engine not in VALID_ENGINES:
no_error = False
error_messages.append(f'Invalid engine {args.engine}. Valid options = {VALID_ENGINES}')
# Validate --engine=torch_io
if args.engine == TORCH_IO:
if args.read:
no_error = False
error_messages.append(f'Read not currently supported for --engine={TORCH_IO}')
if not no_error:
print(f'Found {len(error_messages)} validation error(s)')
# Validate --gpu, --use_gds
if args.use_gds and not args.gpu:
error_messages.append(f'--gpu must be set to transfer with --use_gds')
@ -111,6 +128,8 @@ def parse_arguments():
parser.add_argument('--io_size', type=str, default=None, required=True, help='Number of bytes to read or write.')
parser.add_argument('--fast_io_size', type=str, default='64M', help='Size of fast_io pinned buffer (bytes).')
parser.add_argument('--read', action='store_true', help='Perform read I/O (default is write)')
parser.add_argument('--multi_process',
@ -138,7 +157,13 @@ def parse_arguments():
parser.add_argument('--validate', action='store_true', help='Perform validation of I/O transfer in library.')
parser.add_argument('--handle', action='store_true', help='Use AIO handle.')
parser.add_argument(
'--engine',
type=str,
default=AIO_HANDLE,
help=
f'Engine to perform I/O. Options are [{AIO_HANDLE}, {AIO_BASIC}, {TORCH_IO}, {TORCH_FAST_IO}]. Default is aio_handle'
)
parser.add_argument('--loops', type=int, default=3, help='Count of operation repetitions')
@ -152,6 +177,20 @@ def parse_arguments():
action='store_true',
help='For GPU memory transfers, measure impact of bounce buffer pinning on critical path.')
parser.add_argument('--torch_legacy_save', action='store_true', help='Use torch legacy save approach')
parser.add_argument('--use_accelerator_pin_memory',
action='store_true',
help='Obtain pinned (CPU page-locked) tensors from accelerator')
parser.add_argument('--warmup_loops', type=int, default=1, help='Count of operation warmup repetitions')
parser.add_argument('--include_warmup_time', action='store_true', help='Include warmup latency in results')
parser.add_argument('--different_file_each_iteration',
action='store_true',
help='Read/write a different file on each iteration.')
args = parser.parse_args()
print(f'args = {args}')
return args
@ -163,7 +202,7 @@ def get_validated_args():
if not validate_args(args):
quit()
print(f'Successful validation of command line arguments')
args.total_loops = args.warmup_loops + args.loops
peer_tag = 'gpu' if args.gpu else 'process'
args.mapping_dict = _get_mapping_dict(args)
args.mapping_list = [(device_id, folder) for device_id, folder in args.mapping_dict.items()]
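A short sketch of the new engine validation paths added above: an unknown engine name is rejected, and --engine=torch_io combined with --read is flagged because reads are not yet supported there. The helper function is a condensed restatement for illustration, not the parser's actual code path.

# Condensed restatement of the --engine checks in validate_args().
VALID_ENGINES = ['aio_handle', 'aio_basic', 'torch_io', 'torch_fastio']

def _engine_errors(engine, read):
    errors = []
    if engine not in VALID_ENGINES:
        errors.append(f'Invalid engine {engine}. Valid options = {VALID_ENGINES}')
    if engine == 'torch_io' and read:
        errors.append('Read not currently supported for --engine=torch_io')
    return errors

assert _engine_errors('torch_fastio', read=False) == []
assert len(_engine_errors('torch_io', read=True)) == 1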

View File

@ -6,129 +6,59 @@
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import torch
import os
import time
from deepspeed.ops.aio import AsyncIOBuilder
from multiprocessing import Pool, Barrier
from test_ds_aio_utils import report_results, task_log, task_barrier
from test_ds_aio_utils import task_log, create_filename, create_file, create_page_locked_tensor
from ds_aio_constants import *
def pre_basic(args, tid, read_op):
io_string = "Read" if read_op else "Write"
num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size
file = args.read_file if read_op else f'{args.write_file}.{tid}'
class AIOBasic_Engine(object):
task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory()
task_log(tid, f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}')
def __init__(self, args, tid, read_op):
self.ctxt = self._create_context(args, tid, read_op)
ctxt = {}
ctxt['file'] = file
ctxt['num_bytes'] = num_bytes
ctxt['buffer'] = buffer
ctxt['elapsed_sec'] = 0
def fini(self):
self.ctxt[BUFFER].detach()
self.ctxt[BUFFER] = None
return ctxt
def pre_basic_read(pool_params):
args, tid = pool_params
ctxt = pre_basic(args, tid, True)
return ctxt
def pre_basic_write(pool_params):
args, tid = pool_params
ctxt = pre_basic(args, tid, False)
return ctxt
def post_basic(pool_params):
_, _, ctxt = pool_params
ctxt["buffer"].detach()
ctxt["buffer"] = None
return ctxt
def main_basic_read(pool_params):
args, tid, ctxt = pool_params
start_time = time.time()
AsyncIOBuilder().load().aio_read(ctxt['buffer'], ctxt['file'], args.block_size, args.queue_depth,
args.single_submit, not args.sequential_requests, args.validate)
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
return ctxt
def main_basic_write(pool_params):
args, tid, ctxt = pool_params
start_time = time.time()
AsyncIOBuilder().load().aio_write(ctxt['buffer'], ctxt['file'], args.block_size, args.queue_depth,
args.single_submit, not args.sequential_requests, args.validate)
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
return ctxt
def get_schedule(args, read_op):
schedule = {}
if read_op:
schedule['pre'] = pre_basic_read
schedule['post'] = post_basic
schedule['main'] = main_basic_read
else:
schedule['pre'] = pre_basic_write
schedule['post'] = post_basic
schedule['main'] = main_basic_write
return schedule
def _aio_handle_tasklet(pool_params):
args, tid, read_op = pool_params
num_processes = len(args.mapping_dict)
# Create schedule
schedule = get_schedule(args, read_op)
task_log(tid, f'schedule = {schedule}')
task_barrier(aio_barrier, num_processes)
# Run pre task
task_log(tid, f'running pre-task')
ctxt = schedule["pre"]((args, tid))
task_barrier(aio_barrier, num_processes)
# Run main tasks in a loop
ctxt["main_task_sec"] = 0
for i in range(args.loops):
task_log(tid, f'running main task {i}')
def read(self, args, tid, loop_id):
start_time = time.time()
ctxt = schedule["main"]((args, tid, ctxt))
task_barrier(aio_barrier, num_processes)
stop_time = time.time()
ctxt["main_task_sec"] += stop_time - start_time
AsyncIOBuilder().load().aio_read(self.ctxt[BUFFER], self.ctxt[FILE], args.block_size, args.queue_depth,
args.single_submit, not args.sequential_requests, args.validate)
end_time = time.time()
self.ctxt[ELAPSED_SEC] += end_time - start_time
# Run post task
task_log(tid, f'running post-task')
ctxt = schedule["post"]((args, tid, ctxt))
task_barrier(aio_barrier, num_processes)
def write(self, args, tid, loop_id):
# Avoid overwriting existing files as it could be artificially faster
if os.path.isfile(self.ctxt[FILE]):
os.remove(self.ctxt[FILE])
return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops
start_time = time.time()
AsyncIOBuilder().load().aio_write(self.ctxt[BUFFER], self.ctxt[FILE], args.block_size, args.queue_depth,
args.single_submit, not args.sequential_requests, args.validate)
end_time = time.time()
self.ctxt[ELAPSED_SEC] += end_time - start_time
def _create_context(self, args, tid, read_op):
io_string = "Read" if read_op else "Write"
device_id, folder = args.mapping_list[tid]
filename = create_filename(folder, args.read, args.io_size, tid)
if args.read and not (os.path.isfile(filename) and os.path.getsize(filename) == args.io_size):
create_file(filename, args.io_size)
def _init_tasklet(b):
global aio_barrier
aio_barrier = b
task_log(tid, f'Allocate tensor of size {args.io_size} bytes')
buffer = create_page_locked_tensor(args.io_size, True)
def aio_basic_multiprocessing(args, read_op):
num_processes = len(args.mapping_dict)
b = Barrier(num_processes)
pool_params = [(args, p, read_op) for p in range(num_processes)]
with Pool(processes=num_processes, initializer=_init_tasklet, initargs=(b, )) as p:
pool_results = p.map(_aio_handle_tasklet, pool_params)
task_log(tid,
f'{io_string} file {filename} of size {args.io_size} bytes from buffer on device {buffer.device}')
report_results(args, read_op, pool_results)
task_log(tid, f'created deepspeed aio basic engine')
ctxt = {}
ctxt[FILE] = filename
ctxt[NUM_BYTES] = args.io_size
ctxt[BUFFER] = buffer
ctxt[ELAPSED_SEC] = 0
return ctxt
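A minimal sketch of the one-shot aio_read/aio_write calls that AIOBasic_Engine wraps, assuming the async_io extension is available; the file path, block size, and queue depth below are illustrative.

import torch
from deepspeed.ops.aio import AsyncIOBuilder

aio = AsyncIOBuilder().load()
buffer = torch.empty(1024 * 1024, dtype=torch.uint8, device='cpu').pin_memory()

# (buffer, filename, block_size, queue_depth, single_submit, overlap_events, validate)
aio.aio_write(buffer, '/tmp/ds_aio_basic_example.bin', 1024 * 1024, 32, False, True, False)
aio.aio_read(buffer, '/tmp/ds_aio_basic_example.bin', 1024 * 1024, 32, False, True, False)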

View File

@ -0,0 +1,19 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
AIO_HANDLE = 'aio_handle'
AIO_BASIC = 'aio_basic'
TORCH_IO = 'torch_io'
TORCH_FAST_IO = 'torch_fastio'
VALID_ENGINES = [AIO_HANDLE, AIO_BASIC, TORCH_IO, TORCH_FAST_IO]
BUFFER = 'buffer'
BOUNCE_BUFFER = 'bounce_buffer'
NUM_BYTES = 'num_bytes'
FILE = 'file'
HANDLE = 'handle'
ELAPSED_SEC = 'elapsed_sec'
FAST_IO_BUFFER = 'fast_io_buffer'
USE_CPU_LOCKED_TENSOR = 'cpu_locked_tensor'

View File

@ -2,221 +2,105 @@
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
"""
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import torch
import os
import time
from multiprocessing import Pool, Barrier
from deepspeed.ops.aio import AsyncIOBuilder
from deepspeed.ops.op_builder import GDSBuilder
from test_ds_aio_utils import report_results, task_log, task_barrier, create_filename, create_file
from deepspeed.accelerator import get_accelerator
BUFFER = 'buffer'
BOUNCE_BUFFER = 'bounce_buffer'
from test_ds_aio_utils import task_log, create_filename, create_file, create_page_locked_tensor
from ds_aio_constants import *
def pre_handle(args, tid, read_op):
io_string = "Read" if read_op else "Write"
gds = True if args.use_gds else False
device_id, folder = args.mapping_list[tid]
filename = create_filename(folder, args.read, args.io_size, tid)
if args.read and not (os.path.isfile(filename) and os.path.getsize(filename) == args.io_size):
create_file(filename, args.io_size)
class AIOHandle_Engine(object):
task_log(tid, f'Allocate tensor of size {args.io_size} bytes')
bounce_buffer = None
if args.gpu:
device_name = get_accelerator().device_name(device_id)
buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, device=device_name)
if not (args.slow_bounce_buffer or gds):
bounce_buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8,
device='cpu').pin_memory()
else:
buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, device='cpu').pin_memory()
task_log(tid,
f'{io_string} file {filename} of size {args.io_size} bytes from buffer on device {buffer.device}',
force=True)
def __init__(self, args, tid, read_op):
self.ctxt = self._create_context(args, tid, read_op)
io_parallel = args.io_parallel if args.io_parallel else 1
if gds:
handle = GDSBuilder().load().gds_handle(args.block_size, args.queue_depth, args.single_submit,
not args.sequential_requests, io_parallel)
handle.pin_device_tensor(buffer)
else:
def fini(self):
for buf in [BUFFER, BOUNCE_BUFFER]:
if self.ctxt[buf] is not None:
if self.ctxt[USE_CPU_LOCKED_TENSOR]:
self.ctxt[HANDLE].free_cpu_locked_tensor(self.ctxt[buf])
self.ctxt[buf].detach()
self.ctxt[buf] = None
def read(self, args, tid, loop_id):
handle = self.ctxt[HANDLE]
start_time = time.time()
dest_buffer = BOUNCE_BUFFER if self.ctxt[BOUNCE_BUFFER] is not None else BUFFER
ret = handle.pread(self.ctxt[dest_buffer], self.ctxt[FILE][loop_id], args.validate, True)
assert ret != -1
handle.wait()
if dest_buffer == BOUNCE_BUFFER:
self.ctxt[BUFFER].data.copy_(self.ctxt[BOUNCE_BUFFER].data)
end_time = time.time()
self.ctxt[ELAPSED_SEC].append(end_time - start_time)
def write(self, args, tid, loop_id):
handle = self.ctxt[HANDLE]
start_time = time.time()
if self.ctxt[BOUNCE_BUFFER] is not None:
source_buffer = BOUNCE_BUFFER
self.ctxt[BOUNCE_BUFFER].data.copy_(self.ctxt[BUFFER].data)
else:
source_buffer = BUFFER
ret = handle.pwrite(self.ctxt[source_buffer], self.ctxt[FILE][loop_id], args.validate, True)
assert ret != -1
handle.wait()
end_time = time.time()
self.ctxt[ELAPSED_SEC].append(end_time - start_time)
def _create_files(self, args, folder, tid):
if args.different_file_each_iteration:
filenames = [
create_filename(folder, args.read, args.io_size, f'{tid}_{l}') for l in range(args.total_loops)
]
else:
filenames = [
create_filename(folder, args.read, args.io_size, f'{tid}_{0}') for _ in range(args.total_loops)
]
if args.read:
for f in filenames:
if not (os.path.isfile(f) and os.path.getsize(f) == args.io_size):
create_file(f, args.io_size)
else:
for f in filenames:
if os.path.isfile(f):
os.remove(f)
return filenames
def _create_context(self, args, tid, read_op):
io_string = "Read" if read_op else "Write"
device_id, folder = args.mapping_list[tid]
filenames = self._create_files(args, folder, tid)
io_parallel = args.io_parallel if args.io_parallel else 1
handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit,
not args.sequential_requests, io_parallel)
task_log(tid, f'created deepspeed aio handle')
task_log(tid, f'created deepspeed aio handle engine')
ctxt = {}
ctxt['file'] = filename
ctxt['num_bytes'] = args.io_size
ctxt['handle'] = handle
ctxt['gds'] = gds
ctxt[BUFFER] = buffer
ctxt[BOUNCE_BUFFER] = bounce_buffer
ctxt['elapsed_sec'] = 0
bounce_buffer = None
if args.gpu:
buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, device=f'cuda:{device_id}')
bounce_buffer = create_page_locked_tensor(args.io_size, args.use_accelerator_pin_memory, handle)
else:
buffer = create_page_locked_tensor(args.io_size, args.use_accelerator_pin_memory, handle)
task_log(tid, f'Allocate tensor of size {args.io_size} bytes')
return ctxt
ctxt = {}
ctxt[FILE] = filenames
ctxt[NUM_BYTES] = args.io_size
ctxt[HANDLE] = handle
ctxt[BUFFER] = buffer
ctxt[BOUNCE_BUFFER] = bounce_buffer
ctxt[ELAPSED_SEC] = []
ctxt[USE_CPU_LOCKED_TENSOR] = not args.use_accelerator_pin_memory
task_log(tid,
f'{io_string} file {filenames} of size {args.io_size} bytes from buffer on device {buffer.device}',
force=True)
def pre_handle_read(pool_params):
args, tid = pool_params
ctxt = pre_handle(args, tid, True)
return ctxt
def pre_handle_write(pool_params):
args, tid = pool_params
ctxt = pre_handle(args, tid, False)
return ctxt
def post_handle(pool_params):
_, _, ctxt = pool_params
for buf in [BUFFER, BOUNCE_BUFFER]:
if ctxt[buf] is not None:
if ctxt['gds']:
ctxt['handle'].unpin_device_tensor(ctxt[buf])
ctxt[buf].detach()
ctxt[buf] = None
return ctxt
def main_parallel_read(pool_params):
args, tid, ctxt = pool_params
handle = ctxt['handle']
start_time = time.time()
dest_buffer = BOUNCE_BUFFER if ctxt[BOUNCE_BUFFER] is not None else BUFFER
ret = handle.pread(ctxt[dest_buffer], ctxt['file'], args.validate, 0, True)
assert ret != -1
handle.wait()
if dest_buffer == BOUNCE_BUFFER:
ctxt[BUFFER].data.copy_(ctxt[BOUNCE_BUFFER].data)
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
return ctxt
def main_parallel_write(pool_params):
args, tid, ctxt = pool_params
# Avoid overwriting existing files as it could be artificially faster
if os.path.isfile(ctxt['file']):
os.remove(ctxt['file'])
handle = ctxt['handle']
start_time = time.time()
if ctxt[BOUNCE_BUFFER] is not None:
source_buffer = BOUNCE_BUFFER
ctxt[BOUNCE_BUFFER].data.copy_(ctxt[BUFFER].data)
else:
source_buffer = BUFFER
ret = handle.pwrite(ctxt[source_buffer], ctxt['file'], args.validate, True)
assert ret != -1
handle.wait()
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
return ctxt
def main_handle_read(pool_parms):
args, tid, ctxt = pool_parms
handle = ctxt['handle']
start_time = time.time()
dest_buffer = BOUNCE_BUFFER if ctxt[BOUNCE_BUFFER] is not None else BUFFER
ret = handle.read(ctxt[dest_buffer], ctxt['file'], args.validate)
assert ret != -1
if dest_buffer == BOUNCE_BUFFER:
ctxt[BUFFER].data.copy_(ctxt[BOUNCE_BUFFER].data)
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
return ctxt
def main_handle_write(pool_parms):
args, tid, ctxt = pool_parms
# Avoid overwriting existing files as it could be artificially faster
if os.path.isfile(ctxt['file']):
os.remove(ctxt['file'])
handle = ctxt['handle']
start_time = time.time()
if ctxt[BOUNCE_BUFFER] is not None:
source_buffer = BOUNCE_BUFFER
ctxt[BOUNCE_BUFFER].data.copy_(ctxt[BUFFER].data)
else:
source_buffer = BUFFER
ret = handle.write(ctxt[source_buffer], ctxt['file'], args.validate)
assert ret != -1
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
return ctxt
def get_schedule(args, read_op):
schedule = {}
if read_op:
schedule['pre'] = pre_handle_read
schedule['post'] = post_handle
schedule['main'] = main_parallel_read
else:
schedule['pre'] = pre_handle_write
schedule['post'] = post_handle
schedule['main'] = main_parallel_write
return schedule
def _aio_handle_tasklet(pool_params):
args, tid, read_op = pool_params
num_processes = len(args.mapping_dict)
# Create schedule
schedule = get_schedule(args, read_op)
task_log(tid, f'schedule = {schedule}')
task_barrier(aio_barrier, num_processes)
# Run pre task
task_log(tid, f'running pre-task')
ctxt = schedule["pre"]((args, tid))
task_barrier(aio_barrier, num_processes)
# Run main tasks in a loop
ctxt["main_task_sec"] = 0
for i in range(args.loops):
task_log(tid, f'running main task {i}')
start_time = time.time()
ctxt = schedule["main"]((args, tid, ctxt))
task_barrier(aio_barrier, num_processes)
stop_time = time.time()
ctxt["main_task_sec"] += stop_time - start_time
# Run post task
task_log(tid, f'running post-task')
ctxt = schedule["post"]((args, tid, ctxt))
task_barrier(aio_barrier, num_processes)
return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops
def _init_tasklet(b):
global aio_barrier
aio_barrier = b
def aio_handle_multiprocessing(args, read_op):
num_processes = len(args.mapping_dict)
b = Barrier(num_processes)
pool_params = [(args, p, read_op) for p in range(num_processes)]
with Pool(processes=num_processes, initializer=_init_tasklet, initargs=(b, )) as p:
pool_results = p.map(_aio_handle_tasklet, pool_params)
report_results(args, read_op, pool_results)
return ctxt
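A sketch of the bounce-buffer write path that AIOHandle_Engine uses for GPU tensors: stage the device tensor into a pinned CPU buffer, then hand that buffer to the async handle and wait for completion. It assumes a CUDA device and the async_io extension; the path and sizes are hypothetical.

import torch
from deepspeed.ops.aio import AsyncIOBuilder

io_size = 4 * 1024 * 1024
handle = AsyncIOBuilder().load().aio_handle(1024 * 1024, 32, False, True, 1)

gpu_buffer = torch.randint(high=128, size=(io_size, ), dtype=torch.uint8, device='cuda:0')
bounce = handle.new_cpu_locked_tensor(io_size, torch.empty(0, dtype=torch.uint8))

bounce.data.copy_(gpu_buffer.data)    # device -> pinned host staging copy
# (buffer, filename, validate, async)
assert handle.pwrite(bounce, '/tmp/ds_aio_handle_example.bin', False, True) != -1
handle.wait()                         # block until the asynchronous write completes

handle.free_cpu_locked_tensor(bounce)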

View File

@ -0,0 +1,126 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import time
from multiprocessing import Pool, Barrier
from ds_aio_constants import AIO_BASIC, TORCH_FAST_IO, TORCH_IO
from test_ds_aio_utils import report_results, task_log, task_barrier
from ds_aio_handle import AIOHandle_Engine
from ds_aio_basic import AIOBasic_Engine
from torch_io import TorchIO_Engine
from torch_fastio_engine import Torch_FastIO_Engine
def prepare_operation(args, tid, read_op):
if args.engine == TORCH_IO:
io_engine = TorchIO_Engine(args, tid, read_op)
elif args.engine == AIO_BASIC:
io_engine = AIOBasic_Engine(args, tid, read_op)
elif args.engine == TORCH_FAST_IO:
io_engine = Torch_FastIO_Engine(args, tid, read_op)
else:
io_engine = AIOHandle_Engine(args, tid, read_op)
return io_engine
def prepare_read(pool_params):
args, tid = pool_params
return prepare_operation(args, tid, True)
def prepare_write(pool_params):
args, tid = pool_params
return prepare_operation(args, tid, False)
def post_operation(pool_params):
_, _, io_engine = pool_params
io_engine.fini()
def read_operation(pool_params):
args, tid, loop_id, io_engine = pool_params
return io_engine.read(args, tid, loop_id)
def write_operation(pool_params):
args, tid, loop_id, io_engine = pool_params
return io_engine.write(args, tid, loop_id)
def get_schedule(args, read_op):
schedule = {}
if read_op:
schedule['pre'] = prepare_read
schedule['post'] = post_operation
schedule['main'] = read_operation
else:
schedule['pre'] = prepare_write
schedule['post'] = post_operation
schedule['main'] = write_operation
return schedule
def io_engine_tasklet(pool_params):
args, tid, read_op = pool_params
num_processes = len(args.mapping_dict)
# Create schedule
schedule = get_schedule(args, read_op)
task_log(tid, f'schedule = {schedule}')
task_barrier(aio_barrier, num_processes)
# Run pre task
task_log(tid, f'running pre-task')
io_engine = schedule["pre"]((args, tid))
task_barrier(aio_barrier, num_processes)
# Run main tasks in a loop
io_engine.ctxt["main_task_sec"] = []
for i in range(args.total_loops):
task_log(tid, f'running main task {i}')
start_time = time.time()
schedule["main"]((args, tid, i, io_engine))
task_barrier(aio_barrier, num_processes)
stop_time = time.time()
io_engine.ctxt["main_task_sec"].append(stop_time - start_time)
# Run post task
task_log(tid, f'running post-task')
schedule["post"]((args, tid, io_engine))
task_barrier(aio_barrier, num_processes)
ctxt = io_engine.ctxt
# return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops
if args.include_warmup_time:
e2e_latency_sec = sum(ctxt["main_task_sec"])
task_latency_sec = sum(ctxt["elapsed_sec"])
actual_loops = args.total_loops
else:
e2e_latency_sec = sum(ctxt["main_task_sec"][args.warmup_loops:])
task_latency_sec = sum(ctxt["elapsed_sec"][args.warmup_loops:])
actual_loops = args.loops
l = ctxt["elapsed_sec"]
task_log(tid, f'task_latency_sec = {l}')
return e2e_latency_sec, task_latency_sec, ctxt["num_bytes"] * actual_loops
def _init_tasklet(b):
global aio_barrier
aio_barrier = b
def io_engine_multiprocessing(args, read_op):
num_processes = len(args.mapping_dict)
b = Barrier(num_processes)
pool_params = [(args, p, read_op) for p in range(num_processes)]
with Pool(processes=num_processes, initializer=_init_tasklet, initargs=(b, )) as p:
pool_results = p.map(io_engine_tasklet, pool_params)
report_results(args, read_op, pool_results)
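A worked example (illustrative numbers) of how io_engine_tasklet reports results: with --loops 3 and --warmup_loops 1, four iterations are executed, but unless --include_warmup_time is set the first iteration is excluded from both the latency sums and the byte total.

main_task_sec = [0.50, 0.40, 0.41, 0.39]   # total_loops = warmup_loops + loops = 4
elapsed_sec   = [0.45, 0.36, 0.37, 0.35]
warmup_loops, loops, num_bytes = 1, 3, 400 * (1024**2)

e2e_latency_sec = sum(main_task_sec[warmup_loops:])    # 1.20
task_latency_sec = sum(elapsed_sec[warmup_loops:])     # 1.08
reported = (e2e_latency_sec, task_latency_sec, num_bytes * loops)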

View File

@ -79,9 +79,9 @@ for xtype in cpu gpu gds; do
gpu_opt="--gpu"
gds_opt="--use_gds"
fi
for sub in single block; do
if [[ $sub == "single" ]]; then
sub_opt="--single_submit"
for ov in overlap sequential; do
if [[ $ov == "sequential" ]]; then
ov_opt="--sequential_requests"
else
sub_opt=""
fi

View File

@ -25,11 +25,42 @@ function validate_environment()
validate_environment
IO_SIZE=$1
LOG_DIR=$2/aio_perf_sweep
MAP_DIR=$2/aio
GPU_MEM=$3
USE_GDS=$4
if [[ $# -ne 3 ]]; then
echo "Usage: $0 <write size in [K,M,G]> <write dir ><output log dir>"
exit 1
fi
SIZE=$1
WRITE_DIR=$2
LOG_DIR=$3/aio_perf_sweep
WRITE_OPT="--folder ${WRITE_DIR} --io_size ${SIZE} --loops 3"
IO_ENGINE="torch_fastio"
ENGINE_OPTS=""
if [[ $IO_ENGINE == "aio_handle" ]]; then
IO_PARALLEL="1" # "1 2 4 8"
QUEUE_DEPTH="8 16 32 64 128"
BLOCK_SIZE="128K 256K 512K 1M 2M 4M 8M 16M"
SUBMIT="block"
OVERLAP="overlap"
elif [[ $IO_ENGINE == "torch_fastio" ]]; then
IO_PARALLEL="1" # "1 2 4 8"
QUEUE_DEPTH="8 16 32 64 128"
BLOCK_SIZE="128K 256K 512K 1M 2M 4M 8M 16M"
SUBMIT="block"
OVERLAP="overlap"
ENGINE_OPTS="--torch_legacy --fast_io_size ${SIZE}"
else
IO_PARALLEL="1"
QUEUE_DEPTH="8"
BLOCK_SIZE="128K"
SUBMIT="single"
OVERLAP="sequential"
fi
prep_folder ${WRITE_DIR}
prep_folder ${LOG_DIR}
RUN_SCRIPT=./test_ds_aio.py
OUTPUT_FILE=${MAP_DIR}/ds_aio_write_${SIZE}B.pt
@ -54,24 +85,24 @@ fi
DISABLE_CACHE="sync; bash -c 'echo 1 > /proc/sys/vm/drop_caches' "
SYNC="sync"
for sub in single block; do
for sub in ${SUBMIT}; do
if [[ $sub == "single" ]]; then
sub_opt="--single_submit"
else
sub_opt=""
fi
for ov in overlap sequential; do
for ov in ${OVERLAP}; do
if [[ $ov == "sequential" ]]; then
ov_opt="--sequential_requests"
else
ov_opt=""
fi
for p in 1 2 4 8; do
for t in 1 2 4 8; do
for d in 32 64 128; do
for bs in 256K 512K 1M; do
SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder ${MAP_DIR}"
OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --multi_process ${p} --io_parallel ${t}"
for p in 1; do
for t in ${IO_PARALLEL}; do
for d in ${QUEUE_DEPTH}; do
for bs in ${BLOCK_SIZE}; do
SCHED_OPTS="${sub_opt} ${ov_opt} --engine ${IO_ENGINE} --io_parallel ${t} ${ENGINE_OPTS}"
OPTS="--multi_process ${p} --queue_depth ${d} --block_size ${bs}"
LOG="${LOG_DIR}/write_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
cmd="python ${RUN_SCRIPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
echo ${DISABLE_CACHE}

View File

@ -2,12 +2,17 @@
"block_size": [
"128K",
"256K",
"1M"
"1M",
"2M",
"4M",
"8M",
"16M"
],
"queue_depth": [
4,
8,
16,
32
32,
64
],
"io_parallel": [
1,
@ -19,7 +24,7 @@
true,
false
],
"overlap_events": [
"sequential_requests": [
true,
false
],

View File

@ -7,17 +7,16 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import multiprocessing as mp
from ds_aio_basic import aio_basic_multiprocessing
from ds_aio_handle import aio_handle_multiprocessing
from ds_aio_args import get_validated_args
from io_engine import io_engine_multiprocessing
def main():
print(f'Testing deepspeed_aio python frontend')
args = get_validated_args()
mp.set_start_method('spawn')
multiprocess_function = aio_handle_multiprocessing if args.handle else aio_basic_multiprocessing
mp.set_start_method('spawn', force=True)
multiprocess_function = io_engine_multiprocessing
multiprocess_function(args, args.read)

View File

@ -8,6 +8,8 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
import os
from ds_aio_job import Job, run_job
import torch
from deepspeed.accelerator import get_accelerator
BYTES_PER_GB = 1024**3
BYTES_PER_MB = 1024**2
@ -79,3 +81,11 @@ def create_file(filename, num_bytes):
print(f'[Start] Create {filename} of {num_bytes} bytes by running {dd_job.cmd()} ....')
run_job(dd_job)
print(f'[Done] Create read file of {num_bytes} bytes by running {dd_job.cmd()} ....')
def create_page_locked_tensor(num_elem, use_accelerator, aio_handle=None):
if use_accelerator:
return get_accelerator().pin_memory(torch.randint(high=128, size=(num_elem, ), dtype=torch.uint8,
device='cpu'))
else:
return aio_handle.new_cpu_locked_tensor(num_elem, torch.empty(0, dtype=torch.uint8))
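Example use of the new create_page_locked_tensor helper, assuming the benchmark directory is importable: pin the buffer through the accelerator, or let an aio handle supply the page-locked tensor. Handle construction arguments are illustrative.

from deepspeed.ops.aio import AsyncIOBuilder
from test_ds_aio_utils import create_page_locked_tensor

handle = AsyncIOBuilder().load().aio_handle(1024 * 1024, 32, False, True, 1)

accel_pinned = create_page_locked_tensor(1024 * 1024, use_accelerator=True)
aio_pinned = create_page_locked_tensor(1024 * 1024, use_accelerator=False, aio_handle=handle)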

View File

@ -0,0 +1,87 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import torch
import os
import time
from deepspeed.ops.aio import AsyncIOBuilder
from test_ds_aio_utils import task_log, create_filename, create_file, create_page_locked_tensor
from ds_aio_constants import *
from deepspeed.io import FastFileWriter
class Torch_FastIO_Engine(object):
def __init__(self, args, tid, read_op):
assert read_op is False, f'Read operation is not currently supported'
self.ctxt = self._create_context(args, tid, read_op)
self.zipfile_serialization = not args.torch_legacy_save
def fini(self):
if self.ctxt[USE_CPU_LOCKED_TENSOR]:
for buf in [BUFFER, FAST_IO_BUFFER]:
self.ctxt[HANDLE].free_cpu_locked_tensor(self.ctxt[buf])
self.ctxt[BUFFER].detach()
self.ctxt[BUFFER] = None
def read(self, args, tid):
start_time = time.time()
torch.load(f=self.ctxt[FILE], map_location=self.ctxt[BUFFER].device)
end_time = time.time()
self.ctxt[ELAPSED_SEC] += end_time - start_time
def write(self, args, tid):
# Avoid overwriting existing files as it could be artificially faster
if os.path.isfile(self.ctxt[FILE]):
os.remove(self.ctxt[FILE])
ds_file_writer = FastFileWriter(file_path=self.ctxt[FILE],
aio_handle=self.ctxt[HANDLE],
pinned_tensor=self.ctxt[FAST_IO_BUFFER])
start_time = time.time()
torch.save(obj=self.ctxt[BUFFER], f=ds_file_writer, _use_new_zipfile_serialization=self.zipfile_serialization)
ds_file_writer.close() # Force flush to storage
end_time = time.time()
self.ctxt[ELAPSED_SEC] += end_time - start_time
ds_file_writer._dump_state()
def _create_context(self, args, tid, read_op):
io_string = "Read" if read_op else "Write"
device_id, folder = args.mapping_list[tid]
filename = create_filename(folder, args.read, args.io_size, tid)
if args.read and not (os.path.isfile(filename) and os.path.getsize(filename) == args.io_size):
create_file(filename, args.io_size)
io_parallel = args.io_parallel if args.io_parallel else 1
aio_handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit,
not args.sequential_requests, io_parallel)
if args.gpu:
buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, device=f'cuda:{device_id}')
else:
buffer = create_page_locked_tensor(args.io_size, args.use_accelerator_pin_memory, aio_handle)
task_log(tid, f'Allocate tensor of size {args.io_size} bytes')
fast_io_buffer = create_page_locked_tensor(args.fast_io_size, args.use_accelerator_pin_memory, aio_handle)
task_log(tid, f'created torch_fastio engine')
ctxt = {}
ctxt[FILE] = filename
ctxt[NUM_BYTES] = args.io_size
ctxt[BUFFER] = buffer
ctxt[HANDLE] = aio_handle
ctxt[FAST_IO_BUFFER] = fast_io_buffer
ctxt[ELAPSED_SEC] = 0
ctxt[USE_CPU_LOCKED_TENSOR] = not args.use_accelerator_pin_memory
task_log(tid,
f'{io_string} file {filename} of size {args.io_size} bytes from buffer on device {buffer.device}',
force=True)
return ctxt
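A condensed sketch of the torch_fastio write path shown above: route torch.save through deepspeed.io.FastFileWriter, backed by an aio handle and a pinned staging buffer. The path and sizes are illustrative, and the _dump_state() call simply mirrors what the engine above does after closing the writer.

import torch
from deepspeed.ops.aio import AsyncIOBuilder
from deepspeed.io import FastFileWriter

handle = AsyncIOBuilder().load().aio_handle(1024 * 1024, 32, False, True, 1)
pinned = handle.new_cpu_locked_tensor(64 * 1024 * 1024, torch.empty(0, dtype=torch.uint8))
payload = torch.randint(high=128, size=(16 * 1024 * 1024, ), dtype=torch.uint8)

writer = FastFileWriter(file_path='/tmp/ds_fastio_example.pt', aio_handle=handle, pinned_tensor=pinned)
torch.save(obj=payload, f=writer)
writer.close()          # force buffered data out to storage
writer._dump_state()    # print writer statistics, as the engine above does

handle.free_cpu_locked_tensor(pinned)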

View File

@ -0,0 +1,64 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import torch
import os
import time
from test_ds_aio_utils import task_log, create_filename, create_file, create_page_locked_tensor
from ds_aio_constants import *
class TorchIO_Engine(object):
def __init__(self, args, tid, read_op):
self.ctxt = self._create_context(args, tid, read_op)
self.zipfile_serialization = not args.torch_legacy_save
def fini(self):
self.ctxt[BUFFER].detach()
self.ctxt[BUFFER] = None
def read(self, args, tid):
start_time = time.time()
torch.load(f=self.ctxt[FILE], map_location=self.ctxt[BUFFER].device)
end_time = time.time()
self.ctxt[ELAPSED_SEC] += end_time - start_time
def write(self, args, tid):
# Avoid overwriting existing files as it could be artificially faster
if os.path.isfile(self.ctxt[FILE]):
os.remove(self.ctxt[FILE])
start_time = time.time()
torch.save(obj=self.ctxt[BUFFER], f=self.ctxt[FILE], _use_new_zipfile_serialization=self.zipfile_serialization)
end_time = time.time()
self.ctxt[ELAPSED_SEC] += end_time - start_time
def _create_context(self, args, tid, read_op):
io_string = "Read" if read_op else "Write"
device_id, folder = args.mapping_list[tid]
filename = create_filename(folder, args.read, args.io_size, tid)
if args.read and not (os.path.isfile(filename) and os.path.getsize(filename) == args.io_size):
create_file(filename, args.io_size)
task_log(tid, f'Allocate tensor of size {args.io_size} bytes')
if args.gpu:
buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, device=f'cuda:{device_id}')
else:
buffer = create_page_locked_tensor(args.io_size, True)
task_log(tid,
f'{io_string} file {filename} of size {args.io_size} bytes from buffer on device {buffer.device}',
force=True)
task_log(tid, f'created torch_io engine')
ctxt = {}
ctxt[FILE] = filename
ctxt[NUM_BYTES] = args.io_size
ctxt[BUFFER] = buffer
ctxt[ELAPSED_SEC] = 0
return ctxt

View File

@ -0,0 +1,15 @@
#!/bin/bash
MOUNT_CMD="sudo mount -v -o data=ordered"
for dir in nvme23 nvme45 nvme67 nvme89; do
mnt_point=/mnt/${dir}
sudo mkdir -p ${mnt_point}
sudo chmod -R a+rw ${mnt_point}
done
${MOUNT_CMD} /dev/md127 /mnt/nvme23
${MOUNT_CMD} /dev/md126 /mnt/nvme45
${MOUNT_CMD} /dev/md125 /mnt/nvme67
${MOUNT_CMD} /dev/md124 /mnt/nvme89
lsblk -f

View File

@ -0,0 +1,10 @@
#!/bin/bash
UMOUNT_CMD="sudo umount -v"
for md in md127 md126 md125 md124; do
mnt_device=/dev/${md}
${UMOUNT_CMD} ${mnt_device}
done
lsblk -f

View File

@ -0,0 +1,188 @@
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
#include "deepcompile.h"
#define USE_C10D_NCCL
namespace dc {
std::shared_ptr<DSParamRegistry> param_registry;
std::unordered_map<long, std::shared_ptr<CustomOpExecutor>> executors;
std::shared_ptr<DoubleBufferedReduceBucket> reduce_buckets = nullptr;
c10::intrusive_ptr<c10d::ProcessGroup> process_group = nullptr;
c10::intrusive_ptr<c10d::symmetric_memory::SymmetricMemory> symm_mem = nullptr;
ncclComm_t nccl_comm;
bool use_symm_mem;
bool clone_custom_op_output;
bool profile = false;
bool pre_div_reduce = true;
bool sync_before_reduce; // for debugging
bool sync_after_reduce; // for debugging
bool sync_before_allgather; // for debugging
bool sync_after_allgather; // for debugging
std::vector<int64_t> sizes_to_int_vector(at::IntArrayRef sizes)
{
std::vector<int64_t> result;
for (int i = 0; i < sizes.size(); i++) { result.push_back(sizes[i]); }
return result;
}
void enable_profiling(bool enable) { profile = enable; }
bool is_profiling() { return profile; }
c10::intrusive_ptr<c10d::symmetric_memory::SymmetricMemory> getSymmMemWorkspace(int64_t size)
{
c10::Device device = c10::Device(c10::kCUDA, c10::cuda::current_device());
std::vector<int64_t> sizes = {size};
std::vector<int64_t> strides = {1};
at::Tensor sym_mem_ws = c10d::symmetric_memory::empty_strided_p2p(
{size}, {1}, c10::ScalarType::Byte, device, process_group->getGroupName(), std::nullopt);
return c10d::symmetric_memory::rendezvous(sym_mem_ws);
}
void lazy_init_symm_memory()
{
if (use_symm_mem && !symm_mem) {
int64_t max_param_size = 0;
for (const auto& it : param_registry->getParams()) {
int64_t size = it.second.getDSTensor().numel() * it.second.getDSTensor().element_size();
if (size > max_param_size) { max_param_size = size; }
}
symm_mem = getSymmMemWorkspace(max_param_size);
}
}
ncclDataType_t get_nccl_data_type(at::ScalarType scalar_type)
{
switch (scalar_type) {
case at::kFloat: return ncclFloat;
case at::kHalf: return ncclHalf;
case at::kDouble: return ncclDouble;
case at::kBFloat16: return ncclBfloat16;
case at::kLong: return ncclInt64;
case at::kInt: return ncclInt;
case at::kChar: return ncclInt8;
default: throw std::runtime_error("Unsupported scalar type");
}
}
void reset()
{
executors.clear();
// We keep the buckets for memory estimation
// reduce_buckets->clear();
}
void cleanup()
{
reset();
ncclCommDestroy(nccl_comm);
process_group = nullptr;
symm_mem = nullptr;
}
at::Tensor reduce_grad(at::Tensor grad_tensor, long graph_id, long ds_id)
{
if (sync_before_reduce) { c10::cuda::device_synchronize(); }
assert(hasKey(executors, graph_id));
if (!profile) { executors[graph_id]->reduceGrad(grad_tensor, ds_id); }
if (sync_after_reduce) { c10::cuda::device_synchronize(); }
return at::Tensor();
}
at::Tensor reduce_grad_meta(at::Tensor grad_tensor, long graph_id, long ds_id)
{
return at::Tensor();
}
void free_tensors(std::vector<at::Tensor> tensors)
{
int64_t THRESHOLD = 10 * 1024 * 1024;
if (!profile) {
for (auto& tensor : tensors) {
if (tensor.is_cuda() && tensor.numel() > THRESHOLD) {
tensor.record_stream(at::cuda::getCurrentCUDAStream());
tensor.set_data(torch::empty({0}, tensor.options()));
}
}
}
}
void free_tensors_meta(std::vector<at::Tensor> tensors) {}
void init(c10::intrusive_ptr<c10d::ProcessGroup> pg,
int64_t initial_reduce_bucket_size,
bool enable_double_buffer,
bool _use_symm_mem,
bool _clone_custom_op_output,
bool _sync_before_reduce,
bool _sync_after_reduce,
bool _sync_before_allgather,
bool _sync_after_allgather)
{
process_group = pg;
ncclUniqueId ncclID;
ncclGetUniqueId(&ncclID);
// ProcessGroup doesn't have an API to get the CUDA stream for comm calls.
// So we create a NCCL communicator and call NCCL APIs directly.
auto vec = std::vector<uint8_t>(reinterpret_cast<uint8_t*>(&ncclID),
reinterpret_cast<uint8_t*>(&ncclID) + NCCL_UNIQUE_ID_BYTES);
auto device = torch::Device(torch::kCUDA);
at::Tensor tensor = torch::from_blob(vec.data(), {static_cast<long>(vec.size())}, torch::kUInt8)
.to(torch::Device(torch::kCUDA));
std::vector<at::Tensor> bcast_input = {tensor};
process_group->broadcast(bcast_input, c10d::BroadcastOptions())->wait();
// create a new nccl communicator
std::memcpy(&ncclID, tensor.to(torch::Device(torch::kCPU)).data_ptr(), NCCL_UNIQUE_ID_BYTES);
ncclCommInitRank(&nccl_comm, process_group->getSize(), ncclID, process_group->getRank());
param_registry = std::make_shared<DSParamRegistry>();
reduce_buckets = std::make_shared<DoubleBufferedReduceBucket>(initial_reduce_bucket_size,
enable_double_buffer);
use_symm_mem = _use_symm_mem;
clone_custom_op_output = _clone_custom_op_output;
sync_before_reduce = _sync_before_reduce;
sync_after_reduce = _sync_after_reduce;
sync_before_allgather = _sync_before_allgather;
sync_after_allgather = _sync_after_allgather;
}
void start_forward()
{
lazy_init_symm_memory();
for (auto& it : executors) { it.second->startForward(); }
}
void end_forward()
{
for (auto& it : executors) { it.second->endForward(); }
}
void start_backward(bool update)
{
for (auto& it : executors) { it.second->startBackward(update); }
}
// We don't call this
// void end_backward(bool update)
// {
// }
} // namespace dc

Some files were not shown because too many files have changed in this diff.