name: hpu-gaudi2

on:
  workflow_dispatch:
  schedule:
    - cron: "0 0 * * *"
  pull_request:
    paths:
      - ".github/workflows/hpu-gaudi2.yml"
      - "accelerator/hpu_accelerator.py"
      - "op_builder/hpu/**"
      - "deepspeed/runtime/engine.py"
      - "deepspeed/runtime/bf16_optimizer.py"
      - "deepspeed/runtime/zero/stage_1_and_2.py"
      - "deepspeed/runtime/zero/stage3.py"
      - "deepspeed/runtime/zero/partition_parameters.py"
      - "deepspeed/runtime/zero/partitioned_param_coordinator.py"
      - "deepspeed/runtime/zero/parameter_offload.py"
      - "deepspeed/runtime/pipe/engine.py"
      - "deepspeed/runtime/utils.py"
      - "deepspeed/inference/engine.py"
      - "deepspeed/module_inject/auto_tp.py"
      - "deepspeed/module_inject/replace_module.py"
      - "deepspeed/module_inject/load_checkpoint.py"
      - "deepspeed/module_inject/inject.py"
      - "deepspeed/ops/transformer/**"
      - "deepspeed/ops/adam/**"

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

permissions:
  contents: read
  issues: write

jobs:
  unit-tests:
    # The type of runner that the job will run on
    runs-on: [self-hosted, intel, gaudi2]
    container:
      image: vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
      ports:
        - 80
      options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice

    env:
      PT_HPU_LAZY_MODE: 0
      TORCHINDUCTOR_COMPILE_THREADS: 1
      TEST_LIST: |
        test_accelerator.py
        test_autotuning.py
        test_compression.py
        test_dist.py
        test_elastic.py
        test_ds_arguments.py
        test_run.py
        test_multinode_runner.py
        test_moe_tp.py
        test_monitor.py
        (test_zero_optimizer.py and (TestSaveTensorClone or TestZeRONonDistributed))
        (test_latest_checkpoint.py and test_missing_latest)
        test_reshape_checkpoint.py
        test_shared_weights.py
        test_sparse.py
        test_tag_validation.py
        test_pipe_module.py
        (test_flops_profiler.py and test_flops_profiler_in_inference)
        test_get_optim_files.py
        test_groups.py
        test_partition_balanced.py
        (test_adamw.py and TestAdamConfigs)
        test_coalesced_collectives.py
        test_activation_checkpointing_non_reentrant.py
        test_activation_checkpointing.py
        test_data.py
        (test_ds_config_dict.py and (TestBasicConfig or TestBatchConfig))
        test_ds_config_model.py
        test_mup_optimizers.py
        (test_pld.py and test_pld_schedule)
        test_runtime_utils.py
        test_pipe_schedule.py
        test_topology.py
        (test_ds_initialize.py and (TestClientOptimizer or TestClientLrScheduler))
        test_csr.py
        (test_fp16.py and (TestZeroEmptyGrad or TestZeroAllowUntestedOptimizer))
        (test_bf16.py and TestZeroDtypeCocktail)
        test_partition.py
        test_ignore_unused_parameters.py
        test_zero_config.py
        test_zero_context_ancestry.py
        (test_zero_context.py and not TestSerialContext)
        test_zero_dynamic_class.py
        test_zero_nesting_init.py
        test_zeropp.py
        (test_zero.py and TestZero3ParamPartitioningLargeParam)
        (test_linear.py and (TestLoRALinear or TestBasicLinear))
        (test_ctx.py and TestEngine)
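
    # Each TEST_LIST line is a pytest -k expression fragment: a bare file name
    # selects every test collected from that file, while a parenthesized entry
    # such as (test_linear.py and (TestLoRALinear or TestBasicLinear)) restricts
    # the file to the named test classes. The "Unit tests" step below joins the
    # fragments with " or " into a single -k filter.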
    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
      - uses: actions/checkout@v4

      - name: Check container state
        run: |
          ldd --version
          hl-smi -L
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      - name: Install transformers
        run: |
          git clone https://github.com/huggingface/transformers
          cd transformers
          # if needed switch to the last known good SHA until transformers@master is fixed
          # git checkout 981c276
          git rev-parse --short HEAD
          pip install .

      - name: Install deepspeed
        run: |
          pip install .[dev,autotuning]
          ds_report

      - name: Python environment
        run: |
          pip list

      - name: Unit tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          cd tests
          export PT_HPU_LAZY_MODE=${PT_HPU_LAZY_MODE}
          export TORCHINDUCTOR_COMPILE_THREADS=${TORCHINDUCTOR_COMPILE_THREADS}
          TEST_LIST=$(echo "$TEST_LIST" | awk 'NF{printf "%s%s", (NR>1 ? " or " : ""), $0} END{if (NR>1) print ""}')
          echo "TEST_LIST ${TEST_LIST}"
          pytest --verbose unit/ -k "${TEST_LIST}"
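          # Illustration only (assumed sample input, not output of this job):
          # the awk join above turns non-empty TEST_LIST lines such as
          #   test_accelerator.py
          #   (test_ctx.py and TestEngine)
          # into the single -k expression
          #   test_accelerator.py or (test_ctx.py and TestEngine)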