mirror of
https://github.com/deepspeedai/DeepSpeed.git
synced 2025-10-20 15:33:51 +08:00
Modal CI (#7289)
This is an initial effort to migrate CI onto Modal infra. This PR creates two new workflows that run on Modal: 1. modal-torch-latest: a subset of nv-torch-latest-v100 that includes `tests/unit/runtime/zero/test_zero.py`. 2. modal-accelerate: a full copy of nv-accelerate-v100. Follow-up PRs will selectively migrate relevant workflows onto Modal. --------- Signed-off-by: Olatunji Ruwase <tunji.ruwase@snowflake.com> Signed-off-by: Olatunji Ruwase <tjruwase@gmail.com> Signed-off-by: Tunji Ruwase <tunji.ruwase@snowflake.com> Co-authored-by: Stas Bekman <stas00@users.noreply.github.com> Co-authored-by: Logan Adams <loadams@microsoft.com> Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Olatunji Ruwase <tjruwase@gmail.com> Co-authored-by: Stas Bekman <stas.bekman@snowflake.com>
This commit is contained in:
99
.github/workflows/modal-accelerate.yml
vendored
Normal file
99
.github/workflows/modal-accelerate.yml
vendored
Normal file
@ -0,0 +1,99 @@
|
||||
name: modal-accelerate
|
||||
|
||||
# This CI is running on modal.com's GPUs.
|
||||
#
|
||||
# It's set up here on github actions and then the cloned repo is sent to modal and everything
|
||||
# happens on their hw - see ci/accelerate.py for where the actual vm is loaded, updated and the tests are
|
||||
# run.
|
||||
#
|
||||
# Both files are annotated to what's important and how one might change or update things if needed.
|
||||
#
|
||||
# Note that since this is a Required job we can't use `on.push.paths` file filter - we are using
|
||||
# collect-tests job to do the filtering for us so that the job can be skipped and satisfy the
|
||||
# Required status for PRs to pass.
|
||||
#
|
||||
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
|
||||
pull_request:
|
||||
paths-ignore:
|
||||
- 'docs/**'
|
||||
- 'blogs/**'
|
||||
- 'deepspeed/inference/v2/**'
|
||||
- 'tests/unit/inference/v2/**'
|
||||
types: [draft, opened, ready_for_review, synchronize]
|
||||
branches:
|
||||
- master
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
collect-tests:
|
||||
name: Collect tests to run
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: read
|
||||
outputs:
|
||||
deepspeed: ${{ steps.filter.outputs.deepspeed }}
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
lfs: true
|
||||
|
||||
- name: Filter changed files
|
||||
uses: dorny/paths-filter@v2
|
||||
id: filter
|
||||
with:
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
filters: |
|
||||
deepspeed:
|
||||
- 'deepspeed/**'
|
||||
- '.github/workflows/modal*.yml'
|
||||
- 'ci/**'
|
||||
- 'tests/unit/**'
|
||||
- 'csrc/**'
|
||||
|
||||
deploy:
|
||||
name: DeepSpeedAI CI
|
||||
runs-on: ubuntu-latest
|
||||
needs: collect-tests
|
||||
env:
|
||||
# these are created at https://modal.com/settings/deepspeedai/tokens
|
||||
# they are then added to the repo's secrets at https://github.com/deepspeedai/deepspeed/settings/secrets/actions
|
||||
MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
|
||||
MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
|
||||
# this one comes from https://huggingface.co/settings/profile of the bot user
|
||||
# and it too is then updated at https://github.com/deepspeedai/deepspeed/settings/secrets/actions
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
|
||||
if: needs.collect-tests.outputs.deepspeed == 'true'
|
||||
steps:
|
||||
- name: Checkout Repository
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
lfs: true
|
||||
|
||||
- name: Install Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
cache: 'pip' # caching pip dependencies
|
||||
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
pip install uv # much faster than pip
|
||||
uv pip install --system modal
|
||||
|
||||
- name: Run tests
|
||||
run: |
|
||||
modal run -m ci.accelerate
|
99
.github/workflows/modal-torch-latest.yml
vendored
Normal file
99
.github/workflows/modal-torch-latest.yml
vendored
Normal file
@ -0,0 +1,99 @@
|
||||
name: modal-torch-latest
|
||||
|
||||
# This CI is running on modal.com's GPUs.
|
||||
#
|
||||
# It's set up here on github actions and then the cloned repo is sent to modal and everything
|
||||
# happens on their hw - see ci/torch_latest.py for where the actual vm is loaded, updated and the tests are
|
||||
# run.
|
||||
#
|
||||
# Both files are annotated to what's important and how one might change or update things if needed.
|
||||
#
|
||||
# Note that since this is a Required job we can't use `on.push.paths` file filter - we are using
|
||||
# collect-tests job to do the filtering for us so that the job can be skipped and satisfy the
|
||||
# Required status for PRs to pass.
|
||||
#
|
||||
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
|
||||
pull_request:
|
||||
paths-ignore:
|
||||
- 'docs/**'
|
||||
- 'blogs/**'
|
||||
- 'deepspeed/inference/v2/**'
|
||||
- 'tests/unit/inference/v2/**'
|
||||
types: [draft, opened, ready_for_review, synchronize]
|
||||
branches:
|
||||
- master
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
collect-tests:
|
||||
name: Collect tests to run
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: read
|
||||
outputs:
|
||||
deepspeed: ${{ steps.filter.outputs.deepspeed }}
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
lfs: true
|
||||
|
||||
- name: Filter changed files
|
||||
uses: dorny/paths-filter@v2
|
||||
id: filter
|
||||
with:
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
filters: |
|
||||
deepspeed:
|
||||
- 'deepspeed/**'
|
||||
- '.github/workflows/modal*.yml'
|
||||
- 'ci/**'
|
||||
- 'tests/unit/**'
|
||||
- 'csrc/**'
|
||||
|
||||
deploy:
|
||||
name: DeepSpeedAI CI
|
||||
runs-on: ubuntu-latest
|
||||
needs: collect-tests
|
||||
env:
|
||||
# these are created at https://modal.com/settings/deepspeedai/tokens
|
||||
# they are then added to the repo's secrets at https://github.com/deepspeedai/deepspeed/settings/secrets/actions
|
||||
MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
|
||||
MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
|
||||
# this one comes from https://huggingface.co/settings/profile of the bot user
|
||||
# and it too is then updated at https://github.com/deepspeedai/deepspeed/settings/secrets/actions
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
|
||||
if: needs.collect-tests.outputs.deepspeed == 'true'
|
||||
steps:
|
||||
- name: Checkout Repository
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
lfs: true
|
||||
|
||||
- name: Install Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
cache: 'pip' # caching pip dependencies
|
||||
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
pip install uv # much faster than pip
|
||||
uv pip install --system modal
|
||||
|
||||
- name: Run tests
|
||||
run: |
|
||||
modal run -m ci.torch_latest
|
4
ci/__init__.py
Normal file
4
ci/__init__.py
Normal file
@ -0,0 +1,4 @@
|
||||
# Copyright (c) DeepSpeed Team.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
# DeepSpeed Team
|
43
ci/accelerate.py
Normal file
43
ci/accelerate.py
Normal file
@ -0,0 +1,43 @@
|
||||
# Copyright (c) Snowflake.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
# DeepSpeed Team
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import modal
|
||||
|
||||
ROOT_PATH = Path(__file__).parents[1]
|
||||
|
||||
# yapf: disable
|
||||
# Container image for the Accelerate CI run, declared as a single chained recipe:
#   * base: pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel from the registry, with Python 3.10 added;
#   * apt: libaio-dev and git (NOTE(review): libaio is presumably needed by DeepSpeed's async I/O ops - confirm);
#   * installs datasets==3.6.0, then clones huggingface/accelerate (no branch or commit is given in the
#     clone command) and installs it with its [testing] extra;
#   * installs this repo's requirements/requirements.txt and requirements/requirements-dev.txt;
#   * copies the repo root to /root/ with copy=True and runs `pip install /root`
#     (NOTE(review): copy=True presumably makes the files available to that build step - confirm against Modal docs);
#   * adds accelerator/ -> /root/deepspeed/accelerator, csrc/ -> /root/deepspeed/ops/csrc and
#     op_builder/ -> /root/deepspeed/ops/op_builder.
image = (modal.Image
|
||||
.from_registry("pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel", add_python="3.10")
|
||||
.run_commands("apt update && apt install -y libaio-dev")
|
||||
.apt_install("git")
|
||||
.run_commands("uv pip install --system --compile-bytecode datasets==3.6.0")
|
||||
.run_commands(
|
||||
"git clone https://github.com/huggingface/accelerate && \
|
||||
uv pip install --system --compile-bytecode ./accelerate[testing]"
|
||||
)
|
||||
.pip_install_from_requirements(ROOT_PATH / "requirements/requirements.txt", gpu="any")
|
||||
.pip_install_from_requirements(ROOT_PATH / "requirements/requirements-dev.txt", gpu="any")
|
||||
.add_local_dir(ROOT_PATH , remote_path="/root/", copy=True)
|
||||
.run_commands("pip install /root")
|
||||
.add_local_dir(ROOT_PATH / "accelerator", remote_path="/root/deepspeed/accelerator")
|
||||
.add_local_dir(ROOT_PATH / "csrc", remote_path="/root/deepspeed/ops/csrc")
|
||||
.add_local_dir(ROOT_PATH / "op_builder", remote_path="/root/deepspeed/ops/op_builder")
|
||||
)
|
||||
|
||||
app = modal.App("deepspeedai-accelerate-ci", image=image)
|
||||
|
||||
@app.function(gpu="l40s:1", timeout=1800)
def pytest():
    """Run the Accelerate DeepSpeed test suite with pytest.

    Launches ``pytest /accelerate/tests/deepspeed`` as a child process, using
    ``ROOT_PATH`` as its working directory.

    Raises:
        subprocess.CalledProcessError: If pytest exits with a non-zero status
            (the child is run with ``check=True``).
    """
    # Imported inside the function, as in the original, so the import happens
    # only when the function body actually runs.
    import subprocess

    command = "pytest /accelerate/tests/deepspeed".split()
    working_dir = ROOT_PATH / "."
    subprocess.run(command, check=True, cwd=working_dir)
|
39
ci/torch_latest.py
Normal file
39
ci/torch_latest.py
Normal file
@ -0,0 +1,39 @@
|
||||
# Copyright (c) Snowflake.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
# DeepSpeed Team
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import modal
|
||||
|
||||
ROOT_PATH = Path(__file__).parents[1]
|
||||
|
||||
# yapf: disable
|
||||
# Container image for the torch-latest CI run, declared as a single chained recipe:
#   * base: pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel from the registry, with Python 3.10 added;
#   * apt: libaio-dev (NOTE(review): presumably needed by DeepSpeed's async I/O ops - confirm);
#   * installs this repo's requirements/requirements.txt and requirements/requirements-dev.txt;
#   * copies the repo root to /root/ with copy=True and runs `pip install /root`
#     (NOTE(review): copy=True presumably makes the files available to that build step - confirm against Modal docs);
#   * adds accelerator/ -> /root/deepspeed/accelerator, csrc/ -> /root/deepspeed/ops/csrc and
#     op_builder/ -> /root/deepspeed/ops/op_builder.
image = (modal.Image
|
||||
.from_registry("pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel", add_python="3.10")
|
||||
.run_commands("apt update && apt install -y libaio-dev")
|
||||
.pip_install_from_requirements(ROOT_PATH / "requirements/requirements.txt", gpu="any")
|
||||
.pip_install_from_requirements(ROOT_PATH / "requirements/requirements-dev.txt", gpu="any")
|
||||
.add_local_dir(ROOT_PATH , remote_path="/root/", copy=True)
|
||||
.run_commands("pip install /root")
|
||||
.add_local_dir(ROOT_PATH / "accelerator", remote_path="/root/deepspeed/accelerator")
|
||||
.add_local_dir(ROOT_PATH / "csrc", remote_path="/root/deepspeed/ops/csrc")
|
||||
.add_local_dir(ROOT_PATH / "op_builder", remote_path="/root/deepspeed/ops/op_builder")
|
||||
)
|
||||
|
||||
|
||||
app = modal.App("deepspeedai-torch-latest-ci", image=image)
|
||||
|
||||
|
||||
@app.function(gpu="l40s:2", timeout=1800)
def pytest():
    """Run the selected DeepSpeed unit tests with pytest.

    Launches pytest with ``-n 4 --verbose`` on
    ``tests/unit/runtime/zero/test_zero.py`` and
    ``tests/unit/runtime/half_precision/test_bf16.py``, passing
    ``--torch_ver=2.6 --cuda_ver=12.4``, with ``ROOT_PATH`` as the working
    directory so the relative test paths resolve against it.

    Raises:
        subprocess.CalledProcessError: If pytest exits with a non-zero status
            (the child is run with ``check=True``).
    """
    # Imported inside the function, as in the original, so the import happens
    # only when the function body actually runs.
    import subprocess

    command = "pytest -n 4 --verbose tests/unit/runtime/zero/test_zero.py tests/unit/runtime/half_precision/test_bf16.py --torch_ver=2.6 --cuda_ver=12.4".split()
    working_dir = ROOT_PATH / "."
    subprocess.run(command, check=True, cwd=working_dir)
|
@ -394,7 +394,7 @@ class TestIncorectAllgatherBucketSize(DistributedTest):
|
||||
|
||||
|
||||
class TestPartitionNcclAlignment(DistributedTest):
|
||||
world_size = 4
|
||||
world_size = 2
|
||||
|
||||
def test(self, zero_stage=2):
|
||||
config_dict = {
|
||||
@ -835,7 +835,7 @@ class TestZero3ParamPartitioningBase(DistributedTest):
|
||||
@pytest.mark.parametrize("init_context_manager", [True, False])
|
||||
@pytest.mark.parametrize("reduce_scatter", [True, False])
|
||||
class TestZero3ParamPartitioningLargeParam(DistributedTest):
|
||||
world_size = 4
|
||||
world_size = 2
|
||||
|
||||
def test(self, init_context_manager: bool, reduce_scatter: bool, param_sz: int = 8100) -> None:
|
||||
|
||||
@ -997,7 +997,7 @@ class TestZero3ParamPartitioningManyParams(DistributedTest):
|
||||
|
||||
|
||||
class TestZero3InitForParentWeightInitialization(DistributedTest):
|
||||
world_size = 4
|
||||
world_size = 2
|
||||
|
||||
def test(self):
|
||||
|
||||
|
Reference in New Issue
Block a user