mirror of
https://github.com/deepspeedai/DeepSpeed.git
synced 2025-10-20 15:33:51 +08:00
pytest 8.4.0 seems to break a number of our tests. Rather than pinning it individually in each place, we should just put the pin in the requirements file until we resolve the issue. --------- Co-authored-by: Olatunji Ruwase <tjruwase@gmail.com>
139 lines
4.8 KiB
YAML
139 lines
4.8 KiB
YAML
# Workflow identity and the events that start it.
name: hpu-gaudi2

on:
  # Manual runs from the Actions tab or the API.
  workflow_dispatch:
  # Once a day at minute 0 of hour 0.
  schedule:
    - cron: "0 0 * * *"
  # Pull requests start the workflow only when they touch one of these paths.
  pull_request:
    paths:
      - ".github/workflows/hpu-gaudi2.yml"
      - "accelerator/hpu_accelerator.py"
      - "op_builder/hpu/**"
      - "deepspeed/runtime/engine.py"
      - "deepspeed/runtime/bf16_optimizer.py"
      - "deepspeed/runtime/zero/stage_1_and_2.py"
      - "deepspeed/runtime/zero/stage3.py"
      - "deepspeed/runtime/zero/partition_parameters.py"
      - "deepspeed/runtime/zero/partitioned_param_coordinator.py"
      - "deepspeed/runtime/zero/parameter_offload.py"
      - "deepspeed/runtime/pipe/engine.py"
      - "deepspeed/runtime/utils.py"
      - "deepspeed/inference/engine.py"
      - "deepspeed/module_inject/auto_tp.py"
      - "deepspeed/module_inject/replace_module.py"
      - "deepspeed/module_inject/load_checkpoint.py"
      - "deepspeed/module_inject/inject.py"
      - "deepspeed/ops/transformer/**"
      - "deepspeed/ops/adam/**"

# Runs are grouped by workflow name and git ref; a newly queued run cancels
# the one still in progress for the same group.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

# Token scopes for this workflow: read access to repository contents,
# write access to issues.
permissions:
  contents: read
  issues: write

jobs:
  unit-tests:
    # The type of runner that the job will run on
    runs-on: [self-hosted, intel, gaudi2]
    container:
      image: vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
      ports:
        - 80
      options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice

    env:
      # Quoted so the values are unambiguously strings in the environment.
      PT_HPU_LAZY_MODE: "0"
      TORCHINDUCTOR_COMPILE_THREADS: "1"
      # One pytest `-k` term per line; the "Unit tests" step joins the
      # non-empty lines with " or " to build the final selection expression.
      # NOTE(review): the test_zero.py entry previously OR-ed
      # TestZero3ParamPartitioningLargeParam with itself. The redundant term is
      # dropped (same selection); if a second class was intended, add it here.
      TEST_LIST: |
        test_accelerator.py
        test_autotuning.py
        test_compression.py
        test_dist.py
        test_elastic.py
        test_ds_arguments.py
        test_run.py
        test_multinode_runner.py
        test_moe_tp.py
        test_monitor.py
        (test_zero_optimizer.py and (TestSaveTensorClone or TestZeRONonDistributed))
        (test_latest_checkpoint.py and test_missing_latest)
        test_reshape_checkpoint.py
        test_shared_weights.py
        test_sparse.py
        test_tag_validation.py
        test_pipe_module.py
        (test_flops_profiler.py and test_flops_profiler_in_inference)
        test_get_optim_files.py
        test_groups.py
        test_partition_balanced.py
        (test_adamw.py and TestAdamConfigs)
        test_coalesced_collectives.py
        test_activation_checkpointing_non_reentrant.py
        test_activation_checkpointing.py
        test_data.py
        (test_ds_config_dict.py and (TestBasicConfig or TestBatchConfig))
        test_ds_config_model.py
        test_mup_optimizers.py
        (test_pld.py and test_pld_schedule)
        test_runtime_utils.py
        test_pipe_schedule.py
        test_topology.py
        (test_ds_initialize.py and (TestClientOptimizer or TestClientLrScheduler))
        test_csr.py
        (test_fp16.py and (TestZeroEmptyGrad or TestZeroAllowUntestedOptimizer))
        (test_bf16.py and TestZeroDtypeCocktail)
        test_partition.py
        test_ignore_unused_parameters.py
        test_zero_config.py
        test_zero_context_ancestry.py
        (test_zero_context.py and not TestSerialContext)
        test_zero_dynamic_class.py
        test_zero_nesting_init.py
        test_zeropp.py
        (test_zero.py and TestZero3ParamPartitioningLargeParam)
        (test_linear.py and (TestLoRALinear or TestBasicLinear))
        (test_ctx.py and TestEngine)

    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
      - uses: actions/checkout@v4

      # Print toolchain, device and torch details for debugging the container.
      - name: Check container state
        run: |
          ldd --version
          hl-smi -L
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      - name: Install transformers
        run: |
          git clone https://github.com/huggingface/transformers
          cd transformers
          # if needed switch to the last known good SHA until transformers@master is fixed
          # git checkout 981c276
          git rev-parse --short HEAD
          pip install .

      - name: Install deepspeed
        run: |
          # Quote the extras so the shell never treats [...] as a glob pattern.
          pip install ".[dev,autotuning]"
          ds_report

      - name: Python environment
        run: |
          pip list

      - name: Unit tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          cd tests
          export PT_HPU_LAZY_MODE=${PT_HPU_LAZY_MODE}
          export TORCHINDUCTOR_COMPILE_THREADS=${TORCHINDUCTOR_COMPILE_THREADS}
          # Join the non-empty TEST_LIST lines with " or ". The separator is keyed
          # on the count of lines already emitted (not NR), so a blank first line
          # cannot produce a leading " or ".
          TEST_LIST=$(echo "$TEST_LIST" | awk 'NF{printf "%s%s", (n++ ? " or " : ""), $0} END{if (n) print ""}')
          echo "TEST_LIST ${TEST_LIST}"
          pytest --verbose unit/ -k "${TEST_LIST}"