This is an initial effort to migrate CI onto Modal infra. This PR
creates two new workflows that run on Modal
1. modal-torch-latest: a subset of nv-torch-latest-v100 that includes
`tests/unit/runtime/zero/test_zero.py`.
2. modal-accelerate: a full copy of nv-accelerate-v100. 

Follow up PRs will selectively migrate relevant workflows onto Modal.

---------

Signed-off-by: Olatunji Ruwase <tunji.ruwase@snowflake.com>
Signed-off-by: Olatunji Ruwase <tjruwase@gmail.com>
Signed-off-by: Tunji Ruwase <tunji.ruwase@snowflake.com>
Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>
Co-authored-by: Logan Adams <loadams@microsoft.com>
Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
Co-authored-by: Olatunji Ruwase <tjruwase@gmail.com>
Co-authored-by: Stas Bekman <stas.bekman@snowflake.com>
This commit is contained in:
Olatunji Ruwase
2025-08-11 16:13:39 -04:00
committed by GitHub
parent 8e02992332
commit a12de38db6
6 changed files with 287 additions and 3 deletions

99
.github/workflows/modal-accelerate.yml vendored Normal file
View File

@ -0,0 +1,99 @@
name: modal-accelerate

# This CI is running on modal.com's GPUs.
#
# It's set up here on GitHub Actions and then the cloned repo is sent to Modal and everything
# happens on their hardware - see ci/accelerate.py for where the container image is built
# and the tests are run.
#
# Both files are annotated as to what's important and how one might change or update things if needed.
#
# Note that since this is a Required job we can't use the `on.push.paths` file filter - we are using
# the collect-tests job to do the filtering for us so that the job can be skipped and still satisfy
# the Required status for PRs to pass.
#

on:
  workflow_dispatch:
  push:
    branches:
      - master
  pull_request:
    paths-ignore:
      - 'docs/**'
      - 'blogs/**'
      - 'deepspeed/inference/v2/**'
      - 'tests/unit/inference/v2/**'
    # NOTE(review): `draft` is not a documented pull_request activity type
    # (`converted_to_draft` is) - confirm this entry is intentional.
    types: [draft, opened, ready_for_review, synchronize]
    branches:
      - master

# Cancel any in-flight run of this workflow for the same ref (e.g. on a force-push to a PR).
concurrency:
  group: ${{ github.workflow }}-${{ github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  # Cheap filter job: decides whether the expensive Modal job needs to run at all.
  collect-tests:
    name: Collect tests to run
    runs-on: ubuntu-latest
    permissions:
      contents: read
      pull-requests: read
    outputs:
      # 'true' when any DeepSpeed-relevant path changed (see the filters below).
      deepspeed: ${{ steps.filter.outputs.deepspeed }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          lfs: true

      - name: Filter changed files
        uses: dorny/paths-filter@v2
        id: filter
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          filters: |
            deepspeed:
              - 'deepspeed/**'
              - '.github/workflows/modal*.yml'
              - 'ci/**'
              - 'tests/unit/**'
              - 'csrc/**'

  deploy:
    name: DeepSpeedAI CI
    runs-on: ubuntu-latest
    needs: collect-tests
    env:
      # these are created at https://modal.com/settings/deepspeedai/tokens
      # they are then added to the repo's secrets at https://github.com/deepspeedai/deepspeed/settings/secrets/actions
      MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
      MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
      # this one comes from https://huggingface.co/settings/profile of the bot user
      # and it too is then updated at https://github.com/deepspeedai/deepspeed/settings/secrets/actions
      HF_TOKEN: ${{ secrets.HF_TOKEN }}
    # Only run when the filter job saw relevant changes; a skipped job still
    # satisfies the Required status check.
    if: needs.collect-tests.outputs.deepspeed == 'true'
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4
        with:
          lfs: true

      - name: Install Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"
          cache: 'pip' # caching pip dependencies

      - name: Install build dependencies
        run: |
          pip install uv # much faster than pip
          uv pip install --system modal

      - name: Run tests
        run: |
          modal run -m ci.accelerate

View File

@ -0,0 +1,99 @@
name: modal-torch-latest

# This CI is running on modal.com's GPUs.
#
# It's set up here on GitHub Actions and then the cloned repo is sent to Modal and everything
# happens on their hardware - see ci/torch_latest.py for where the container image is built
# and the tests are run.
#
# Both files are annotated as to what's important and how one might change or update things if needed.
#
# Note that since this is a Required job we can't use the `on.push.paths` file filter - we are using
# the collect-tests job to do the filtering for us so that the job can be skipped and still satisfy
# the Required status for PRs to pass.
#

on:
  workflow_dispatch:
  push:
    branches:
      - master
  pull_request:
    paths-ignore:
      - 'docs/**'
      - 'blogs/**'
      - 'deepspeed/inference/v2/**'
      - 'tests/unit/inference/v2/**'
    # NOTE(review): `draft` is not a documented pull_request activity type
    # (`converted_to_draft` is) - confirm this entry is intentional.
    types: [draft, opened, ready_for_review, synchronize]
    branches:
      - master

# Cancel any in-flight run of this workflow for the same ref (e.g. on a force-push to a PR).
concurrency:
  group: ${{ github.workflow }}-${{ github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  # Cheap filter job: decides whether the expensive Modal job needs to run at all.
  collect-tests:
    name: Collect tests to run
    runs-on: ubuntu-latest
    permissions:
      contents: read
      pull-requests: read
    outputs:
      # 'true' when any DeepSpeed-relevant path changed (see the filters below).
      deepspeed: ${{ steps.filter.outputs.deepspeed }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          lfs: true

      - name: Filter changed files
        uses: dorny/paths-filter@v2
        id: filter
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          filters: |
            deepspeed:
              - 'deepspeed/**'
              - '.github/workflows/modal*.yml'
              - 'ci/**'
              - 'tests/unit/**'
              - 'csrc/**'

  deploy:
    name: DeepSpeedAI CI
    runs-on: ubuntu-latest
    needs: collect-tests
    env:
      # these are created at https://modal.com/settings/deepspeedai/tokens
      # they are then added to the repo's secrets at https://github.com/deepspeedai/deepspeed/settings/secrets/actions
      MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
      MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
      # this one comes from https://huggingface.co/settings/profile of the bot user
      # and it too is then updated at https://github.com/deepspeedai/deepspeed/settings/secrets/actions
      HF_TOKEN: ${{ secrets.HF_TOKEN }}
    # Only run when the filter job saw relevant changes; a skipped job still
    # satisfies the Required status check.
    if: needs.collect-tests.outputs.deepspeed == 'true'
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4
        with:
          lfs: true

      - name: Install Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"
          cache: 'pip' # caching pip dependencies

      - name: Install build dependencies
        run: |
          pip install uv # much faster than pip
          uv pip install --system modal

      - name: Run tests
        run: |
          modal run -m ci.torch_latest

4
ci/__init__.py Normal file
View File

@ -0,0 +1,4 @@
# Copyright (c) DeepSpeed Team.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team

43
ci/accelerate.py Normal file
View File

@ -0,0 +1,43 @@
# Copyright (c) Snowflake.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team
"""Modal CI entry point: run HuggingFace Accelerate's DeepSpeed integration tests.

Invoked from .github/workflows/modal-accelerate.yml via ``modal run -m ci.accelerate``.
"""

from pathlib import Path

import modal

# Repository root -- this file lives in <root>/ci/.
ROOT_PATH = Path(__file__).parents[1]

# yapf: disable
# Build the container image step by step; every Modal Image method returns a
# new layered image, so sequential reassignment is equivalent to one chain.
image = modal.Image.from_registry("pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel", add_python="3.10")
image = image.run_commands("apt update && apt install -y libaio-dev")
image = image.apt_install("git")
image = image.run_commands("uv pip install --system --compile-bytecode datasets==3.6.0")
# Clone accelerate and install it with its testing extras; the clone stays in
# the image so its test suite can be run later.
image = image.run_commands(
    "git clone https://github.com/huggingface/accelerate && \
uv pip install --system --compile-bytecode ./accelerate[testing]"
)
image = image.pip_install_from_requirements(ROOT_PATH / "requirements/requirements.txt", gpu="any")
image = image.pip_install_from_requirements(ROOT_PATH / "requirements/requirements-dev.txt", gpu="any")
# Copy the repo into the image and install it, then overlay the source
# directories so local edits are picked up without a full rebuild.
image = image.add_local_dir(ROOT_PATH, remote_path="/root/", copy=True)
image = image.run_commands("pip install /root")
image = image.add_local_dir(ROOT_PATH / "accelerator", remote_path="/root/deepspeed/accelerator")
image = image.add_local_dir(ROOT_PATH / "csrc", remote_path="/root/deepspeed/ops/csrc")
image = image.add_local_dir(ROOT_PATH / "op_builder", remote_path="/root/deepspeed/ops/op_builder")

app = modal.App("deepspeedai-accelerate-ci", image=image)


@app.function(
    gpu="l40s:1",
    timeout=1800,
)
def pytest():
    """Run accelerate's DeepSpeed test suite on a single L40S GPU."""
    import subprocess

    # NOTE(review): assumes the accelerate clone from the image build lands at
    # /accelerate -- confirm the build-time working directory.
    subprocess.run(
        "pytest /accelerate/tests/deepspeed".split(),
        check=True,
        cwd=ROOT_PATH / ".",
    )

39
ci/torch_latest.py Normal file
View File

@ -0,0 +1,39 @@
# Copyright (c) Snowflake.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team
"""Modal CI entry point: run a subset of the DeepSpeed unit tests on torch-latest.

Invoked from .github/workflows/modal-torch-latest.yml via ``modal run -m ci.torch_latest``.
"""

from pathlib import Path

import modal

# Repository root -- this file lives in <root>/ci/.
ROOT_PATH = Path(__file__).parents[1]

# yapf: disable
# Build the container image step by step; every Modal Image method returns a
# new layered image, so sequential reassignment is equivalent to one chain.
image = modal.Image.from_registry("pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel", add_python="3.10")
image = image.run_commands("apt update && apt install -y libaio-dev")
image = image.pip_install_from_requirements(ROOT_PATH / "requirements/requirements.txt", gpu="any")
image = image.pip_install_from_requirements(ROOT_PATH / "requirements/requirements-dev.txt", gpu="any")
# Copy the repo into the image and install it, then overlay the source
# directories so local edits are picked up without a full rebuild.
image = image.add_local_dir(ROOT_PATH, remote_path="/root/", copy=True)
image = image.run_commands("pip install /root")
image = image.add_local_dir(ROOT_PATH / "accelerator", remote_path="/root/deepspeed/accelerator")
image = image.add_local_dir(ROOT_PATH / "csrc", remote_path="/root/deepspeed/ops/csrc")
image = image.add_local_dir(ROOT_PATH / "op_builder", remote_path="/root/deepspeed/ops/op_builder")

app = modal.App("deepspeedai-torch-latest-ci", image=image)


@app.function(
    gpu="l40s:2",
    timeout=1800,
)
def pytest():
    """Run the selected zero/bf16 unit tests on two L40S GPUs."""
    import subprocess

    test_cmd = "pytest -n 4 --verbose tests/unit/runtime/zero/test_zero.py tests/unit/runtime/half_precision/test_bf16.py --torch_ver=2.6 --cuda_ver=12.4"
    subprocess.run(
        test_cmd.split(),
        check=True,
        cwd=ROOT_PATH / ".",
    )

View File

@ -394,7 +394,7 @@ class TestIncorectAllgatherBucketSize(DistributedTest):
class TestPartitionNcclAlignment(DistributedTest):
world_size = 4
world_size = 2
def test(self, zero_stage=2):
config_dict = {
@ -835,7 +835,7 @@ class TestZero3ParamPartitioningBase(DistributedTest):
@pytest.mark.parametrize("init_context_manager", [True, False])
@pytest.mark.parametrize("reduce_scatter", [True, False])
class TestZero3ParamPartitioningLargeParam(DistributedTest):
world_size = 4
world_size = 2
def test(self, init_context_manager: bool, reduce_scatter: bool, param_sz: int = 8100) -> None:
@ -997,7 +997,7 @@ class TestZero3ParamPartitioningManyParams(DistributedTest):
class TestZero3InitForParentWeightInitialization(DistributedTest):
world_size = 4
world_size = 2
def test(self):