mirror of
https://github.com/deepspeedai/DeepSpeed.git
synced 2025-10-20 15:33:51 +08:00
Update A6000 workflows to use newer docker container - 24.09 vs 24.03 (#6967)
- The nv-sd workflow is not updated here because of issues with its update; a subsequent PR will follow up on it
This commit is contained in:
6
.github/workflows/nv-a6000.yml
vendored
6
.github/workflows/nv-a6000.yml
vendored
@@ -23,7 +23,7 @@ jobs:
|
||||
unit-tests:
|
||||
runs-on: [self-hosted, nvidia, a6000]
|
||||
container:
|
||||
image: nvcr.io/nvidia/pytorch:24.03-py3
|
||||
image: nvcr.io/nvidia/pytorch:24.09-py3
|
||||
ports:
|
||||
- 80
|
||||
options: --gpus all --shm-size "8G"
|
||||
@@ -57,8 +57,8 @@ jobs:
|
||||
run: |
|
||||
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
|
||||
cd tests
|
||||
python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.3" --cuda_ver="12"
|
||||
python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.3" --cuda_ver="12"
|
||||
python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.5" --cuda_ver="12"
|
||||
python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.5" --cuda_ver="12"
|
||||
- name: MII unit tests
|
||||
run: |
|
||||
BRANCH="main"
|
||||
|
4
.github/workflows/nv-flash-attn.yml
vendored
4
.github/workflows/nv-flash-attn.yml
vendored
@@ -18,7 +18,7 @@ jobs:
|
||||
unit-tests:
|
||||
runs-on: [self-hosted, nvidia, a6000]
|
||||
container:
|
||||
image: nvcr.io/nvidia/pytorch:24.03-py3
|
||||
image: nvcr.io/nvidia/pytorch:24.09-py3
|
||||
ports:
|
||||
- 80
|
||||
options: --gpus all --shm-size "8G"
|
||||
@@ -53,7 +53,7 @@ jobs:
|
||||
run: |
|
||||
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
|
||||
cd tests
|
||||
python -m pytest --color=yes --durations=0 --verbose -rF unit/sequence_parallelism/test_ulysses.py --torch_ver="2.3" --cuda_ver="12"
|
||||
python -m pytest --color=yes --durations=0 --verbose -rF unit/sequence_parallelism/test_ulysses.py --torch_ver="2.5" --cuda_ver="12"
|
||||
- name: Open GitHub issue if nightly CI fails
|
||||
if: ${{ failure() && (github.event_name == 'schedule') }}
|
||||
uses: JasonEtco/create-an-issue@v2
|
||||
|
4
.github/workflows/nv-human-eval.yml
vendored
4
.github/workflows/nv-human-eval.yml
vendored
@@ -11,7 +11,7 @@ jobs:
|
||||
unit-tests:
|
||||
runs-on: [self-hosted, nvidia, a6000]
|
||||
container:
|
||||
image: nvcr.io/nvidia/pytorch:24.03-py3
|
||||
image: nvcr.io/nvidia/pytorch:24.09-py3
|
||||
ports:
|
||||
- 80
|
||||
options: --gpus all --shm-size "8G"
|
||||
@@ -50,4 +50,4 @@ jobs:
|
||||
run: |
|
||||
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
|
||||
cd tests
|
||||
python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.3" --cuda_ver="12"
|
||||
python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.5" --cuda_ver="12"
|
||||
|
Reference in New Issue
Block a user