mirror of
https://github.com/deepspeedai/DeepSpeed.git
synced 2025-10-20 15:33:51 +08:00
Update workflows to cuda 12.4 (#7000)
- Update existing workflows that use cu121 to use cu124. Note that wherever a workflow installs the latest torch, it will now get torch 2.6 rather than torch 2.5, which was the latest release provided for CUDA 12.1.
- Note: nv-nightly is currently failing on master due to unrelated errors, so its failure can be ignored for this PR (nv-nightly was tested locally, where it passes with both CUDA 12.1 and CUDA 12.4).
---------
Signed-off-by: Fabien Dupont <fdupont@redhat.com>
Signed-off-by: Logan Adams <loadams@microsoft.com>
Signed-off-by: Olatunji Ruwase <olruwase@microsoft.com>
Signed-off-by: inkcherry <mingzhi.liu@intel.com>
Signed-off-by: Omar Elayan <oelayan@habana.ai>
Co-authored-by: Fabien Dupont <fabiendupont@fabiendupont.fr>
Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com>
Co-authored-by: Liangliang Ma <1906710196@qq.com>
Co-authored-by: inkcherry <mingzhi.liu@intel.com>
Co-authored-by: Omar Elayan <142979319+oelayan7@users.noreply.github.com>
This commit is contained in:
4
.github/workflows/nv-accelerate-v100.yml
vendored
4
.github/workflows/nv-accelerate-v100.yml
vendored
@ -19,7 +19,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
unit-tests:
|
||||
runs-on: [self-hosted, nvidia, cu121, v100]
|
||||
runs-on: [self-hosted, nvidia, cu124, v100]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
@ -29,7 +29,7 @@ jobs:
|
||||
|
||||
- name: Install pytorch
|
||||
run: |
|
||||
pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121
|
||||
pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124
|
||||
python -c "import torch; print('torch:', torch.__version__, torch)"
|
||||
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
|
||||
|
||||
|
4
.github/workflows/nv-ds-chat.yml
vendored
4
.github/workflows/nv-ds-chat.yml
vendored
@ -27,7 +27,7 @@ permissions:
|
||||
|
||||
jobs:
|
||||
unit-tests:
|
||||
runs-on: [self-hosted, nvidia, cu121, v100]
|
||||
runs-on: [self-hosted, nvidia, cu124, v100]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
@ -37,7 +37,7 @@ jobs:
|
||||
|
||||
- name: Install pytorch
|
||||
run: |
|
||||
pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121
|
||||
pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124
|
||||
python -c "import torch; print('torch:', torch.__version__, torch)"
|
||||
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
|
||||
|
||||
|
10
.github/workflows/nv-inference.yml
vendored
10
.github/workflows/nv-inference.yml
vendored
@ -22,7 +22,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
unit-tests:
|
||||
runs-on: [self-hosted, nvidia, cu121, v100]
|
||||
runs-on: [self-hosted, nvidia, cu124, v100]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
@ -32,7 +32,7 @@ jobs:
|
||||
|
||||
- name: Install pytorch
|
||||
run: |
|
||||
pip install -U --cache-dir $TORCH_CACHE torch==2.1.2 torchvision==0.16.2 --index-url https://download.pytorch.org/whl/cu121
|
||||
pip install -U --cache-dir $TORCH_CACHE torch==2.1.2 torchvision==0.16.2 --index-url https://download.pytorch.org/whl/cu124
|
||||
python -c "import torch; print('torch:', torch.__version__, torch)"
|
||||
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
|
||||
|
||||
@ -58,8 +58,8 @@ jobs:
|
||||
run: |
|
||||
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
|
||||
cd tests
|
||||
#pytest $PYTEST_OPTS -m 'seq_inference' unit/ --torch_ver="2.1" --cuda_ver="12.1"
|
||||
pytest $PYTEST_OPTS -m 'inference_ops' unit/ --torch_ver="2.1" --cuda_ver="12.1"
|
||||
pytest $PYTEST_OPTS --forked -n 4 -m 'inference' unit/ --torch_ver="2.1" --cuda_ver="12.1"
|
||||
#pytest $PYTEST_OPTS -m 'seq_inference' unit/ --torch_ver="2.1" --cuda_ver="12.4"
|
||||
pytest $PYTEST_OPTS -m 'inference_ops' unit/ --torch_ver="2.1" --cuda_ver="12.4"
|
||||
pytest $PYTEST_OPTS --forked -n 4 -m 'inference' unit/ --torch_ver="2.1" --cuda_ver="12.4"
|
||||
# run ds_report again to check updated op list
|
||||
ds_report
|
||||
|
4
.github/workflows/nv-lightning-v100.yml
vendored
4
.github/workflows/nv-lightning-v100.yml
vendored
@ -19,7 +19,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
unit-tests:
|
||||
runs-on: [self-hosted, nvidia, cu121, v100]
|
||||
runs-on: [self-hosted, nvidia, cu124, v100]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
@ -29,7 +29,7 @@ jobs:
|
||||
|
||||
- name: Install pytorch
|
||||
run: |
|
||||
pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121
|
||||
pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124
|
||||
python -c "import torch; print('torch:', torch.__version__, torch)"
|
||||
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
|
||||
|
||||
|
4
.github/workflows/nv-mii.yml
vendored
4
.github/workflows/nv-mii.yml
vendored
@ -27,7 +27,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
unit-tests:
|
||||
runs-on: [self-hosted, nvidia, cu121, v100]
|
||||
runs-on: [self-hosted, nvidia, cu124, v100]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
@ -37,7 +37,7 @@ jobs:
|
||||
|
||||
- name: Install pytorch
|
||||
run: |
|
||||
pip3 install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121
|
||||
pip3 install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124
|
||||
python -c "import torch; print('torch:', torch.__version__, torch)"
|
||||
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
|
||||
|
||||
|
6
.github/workflows/nv-nightly.yml
vendored
6
.github/workflows/nv-nightly.yml
vendored
@ -18,7 +18,7 @@ permissions:
|
||||
|
||||
jobs:
|
||||
unit-tests:
|
||||
runs-on: [self-hosted, nvidia, cu121, v100]
|
||||
runs-on: [self-hosted, nvidia, cu124, v100]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
@ -28,7 +28,7 @@ jobs:
|
||||
|
||||
- name: Install pytorch
|
||||
run: |
|
||||
pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121
|
||||
pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124
|
||||
python -c "import torch; print('torch:', torch.__version__, torch)"
|
||||
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
|
||||
|
||||
@ -58,7 +58,7 @@ jobs:
|
||||
run: |
|
||||
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
|
||||
cd tests
|
||||
pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="2.5" --cuda_ver="12.1"
|
||||
pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="2.6" --cuda_ver="12.4"
|
||||
|
||||
- name: Open GitHub issue if nightly CI fails
|
||||
if: ${{ failure() && (github.event_name == 'schedule') }}
|
||||
|
8
.github/workflows/nv-torch-latest-v100.yml
vendored
8
.github/workflows/nv-torch-latest-v100.yml
vendored
@ -19,7 +19,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
unit-tests:
|
||||
runs-on: [self-hosted, nvidia, cu121, v100]
|
||||
runs-on: [self-hosted, nvidia, cu124, v100]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
@ -29,7 +29,7 @@ jobs:
|
||||
|
||||
- name: Install pytorch
|
||||
run: |
|
||||
pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121
|
||||
pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124
|
||||
python -c "import torch; print('torch:', torch.__version__, torch)"
|
||||
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
|
||||
|
||||
@ -55,5 +55,5 @@ jobs:
|
||||
run: |
|
||||
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
|
||||
cd tests
|
||||
pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.5" --cuda_ver="12.1"
|
||||
pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.5" --cuda_ver="12.1"
|
||||
pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.6" --cuda_ver="12.4"
|
||||
pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.6" --cuda_ver="12.4"
|
||||
|
4
.github/workflows/nv-torch-nightly-v100.yml
vendored
4
.github/workflows/nv-torch-nightly-v100.yml
vendored
@ -18,7 +18,7 @@ permissions:
|
||||
|
||||
jobs:
|
||||
unit-tests:
|
||||
runs-on: [self-hosted, nvidia, cu121, v100]
|
||||
runs-on: [self-hosted, nvidia, cu124, v100]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
@ -28,7 +28,7 @@ jobs:
|
||||
|
||||
- name: Install pytorch
|
||||
run: |
|
||||
pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu121
|
||||
pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu124
|
||||
python -c "import torch; print('torch:', torch.__version__, torch)"
|
||||
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
|
||||
|
||||
|
4
.github/workflows/nv-transformers-v100.yml
vendored
4
.github/workflows/nv-transformers-v100.yml
vendored
@ -18,7 +18,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
unit-tests:
|
||||
runs-on: [self-hosted, nvidia, cu121, v100]
|
||||
runs-on: [self-hosted, nvidia, cu124, v100]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
@ -29,7 +29,7 @@ jobs:
|
||||
- name: Install pytorch
|
||||
run: |
|
||||
# use the same pytorch version as transformers CI
|
||||
pip install -U --cache-dir $TORCH_CACHE torch==2.0.1+cu121 --index-url https://download.pytorch.org/whl/cu121
|
||||
pip install -U --cache-dir $TORCH_CACHE torch==2.0.1+cu124 --index-url https://download.pytorch.org/whl/cu124
|
||||
python -c "import torch; print('torch:', torch.__version__, torch)"
|
||||
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
|
||||
|
||||
|
Reference in New Issue
Block a user