pytorch/.github/actions/linux-test/action.yml

name: linux-test

inputs:
  build-environment:
    required: true
    type: string
    description: Top-level label for what's being built/tested.
  test-matrix:
    required: true
    type: string
    description: JSON description of what test configs to run.
  docker-image:
    required: true
    type: string
    description: Docker image to run in.
  sync-tag:
    required: false
    type: string
    default: ""
    description: |
      If this is set, our linter will use this to make sure that every other
      job with the same `sync-tag` is identical.
  use-gha:
    required: false
    type: string
    default: ""
    description: If set to any value, upload to GHA. Otherwise upload to S3.
  dashboard-tag:
    required: false
    type: string
    default: ""
  s3-bucket:
    description: S3 bucket to download artifact
    required: false
    type: string
    default: "gha-artifacts"
  aws-role-to-assume:
    description: role to assume for downloading artifacts
    required: false
    type: string
    default: ""
  HUGGING_FACE_HUB_TOKEN:
    description: |
      HF Auth token to avoid rate limits when downloading models or datasets from hub
    required: false
    default: ""
  GITHUB_TOKEN:
    description: GitHub token
    required: true
  disable-monitor:
    description: |
      [Experimental] Disable utilization monitoring for tests.
      Currently, by default we disable the monitor job and only look for specific tests,
      since we are investigating the behaviour of the monitor script with different tests.
    required: false
    type: boolean
    default: true
#env:
#  GIT_DEFAULT_BRANCH: ${{ inputs.default_branch }}

runs:
  using: composite
  steps:
    - name: Setup Linux
      uses: ./.github/actions/setup-linux

    - name: configure aws credentials
      if : ${{ inputs.aws-role-to-assume != '' }}
      uses: aws-actions/configure-aws-credentials@v4
      with:
        role-to-assume: ${{ inputs.aws-role-to-assume }}
        role-session-name: gha-linux-test
        aws-region: us-east-1

    - name: Calculate docker image
      id: calculate-docker-image
      uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
      with:
        docker-image-name: ${{ inputs.docker-image }}

    - name: Use following to pull public copy of the image
      id: print-ghcr-mirror
      env:
        ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
      shell: bash
      run: |
        tag=${ECR_DOCKER_IMAGE##*/}
        echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"

    - name: Pull docker image
      uses: pytorch/test-infra/.github/actions/pull-docker-image@main
      with:
        docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

    - name: Check if in a container runner
      shell: bash
      id: check_container_runner
      run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"

    - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
      id: install-nvidia-driver
      uses: pytorch/test-infra/.github/actions/setup-nvidia@main
      if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}

    - name: Setup GPU_FLAG for docker run
      id: setup-gpu-flag
      run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
      if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}

    - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
      id: setup-sscache-port-flag
      run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
      if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}

    - name: Lock NVIDIA A100 40GB Frequency
      shell: bash
      run: |
        sudo nvidia-smi -pm 1
        sudo nvidia-smi -ac 1215,1410
        nvidia-smi
      if: ${{ contains(matrix.runner, 'a100') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}

    - name: Start monitoring script
      id: monitor-script
      if: ${{ !inputs.disable-monitor }}
      shell: bash
      continue-on-error: true
      run: |
        python3 -m pip install psutil==5.9.8 nvidia-ml-py==11.525.84
        python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
        echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"

    - name: Download build artifacts
      uses: ./.github/actions/download-build-artifacts
      with:
        name: ${{ inputs.build-environment }}
        s3-bucket: ${{ inputs.s3-bucket }}

    - name: Download TD artifacts
      continue-on-error: true
      uses: ./.github/actions/download-td-artifacts

    - name: Parse ref
      id: parse-ref
      shell: bash
      run: .github/scripts/parse_ref.py

    - name: Get workflow job id
      id: get-job-id
      uses: ./.github/actions/get-workflow-job-id
      if: always()
      with:
        github-token: ${{ inputs.GITHUB_TOKEN }}

    - name: Check for keep-going label and re-enabled test issues
      # This uses the filter-test-configs action because it conveniently
      # checks for labels and re-enabled test issues.  It does not actually do
      # any filtering.  All filtering is done in the build step.
      id: keep-going
      uses: ./.github/actions/filter-test-configs
      with:
        github-token: ${{ inputs.GITHUB_TOKEN }}
        test-matrix: ${{ inputs.test-matrix }}
        job-name: ${{ steps.get-job-id.outputs.job-name }}

    - name: Test
      id: test
      env:
        BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
        PR_NUMBER: ${{ github.event.pull_request.number }}
        GITHUB_REPOSITORY: ${{ github.repository }}
        GITHUB_WORKFLOW: ${{ github.workflow }}
        GITHUB_JOB: ${{ github.job }}
        GITHUB_RUN_ID: ${{ github.run_id }}
        GITHUB_RUN_NUMBER: ${{ github.run_number }}
        GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
        JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
        JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
        BRANCH: ${{ steps.parse-ref.outputs.branch }}
        SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
        BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
        TEST_CONFIG: ${{ matrix.config }}
        SHARD_NUMBER: ${{ matrix.shard }}
        NUM_TEST_SHARDS: ${{ matrix.num_shards }}
        REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
        CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
        VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
        TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}
        NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
        NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
        TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
        SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
        SCCACHE_REGION: us-east-1
        SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
        SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
        DOCKER_IMAGE: ${{ inputs.docker-image }}
        XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
        XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
        PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
        PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
        DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
        HUGGING_FACE_HUB_TOKEN: ${{ inputs.HUGGING_FACE_HUB_TOKEN }}
        SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
        IS_A100_RUNNER: ${{ contains(matrix.runner, 'a100') && '1' || '0' }}

      shell: bash
      run: |
        set -x

        if [[ $TEST_CONFIG == 'multigpu' ]]; then
          TEST_COMMAND=.ci/pytorch/multigpu-test.sh
        elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
          TEST_COMMAND=.ci/onnx/test.sh
        else
          TEST_COMMAND=.ci/pytorch/test.sh
        fi

        # detached container should get cleaned up by teardown_ec2_linux
        # TODO: Stop building test binaries as part of the build phase
        # Used for GPU_FLAG since that doesn't play nice
        # shellcheck disable=SC2086,SC2090
        container_name=$(docker run \
          ${GPU_FLAG:-} \
          ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \
          -e BUILD_ENVIRONMENT \
          -e PR_NUMBER \
          -e GITHUB_ACTIONS \
          -e GITHUB_REPOSITORY \
          -e GITHUB_WORKFLOW \
          -e GITHUB_JOB \
          -e GITHUB_RUN_ID \
          -e GITHUB_RUN_NUMBER \
          -e GITHUB_RUN_ATTEMPT \
          -e JOB_ID \
          -e JOB_NAME \
          -e BASE_SHA \
          -e BRANCH \
          -e SHA1 \
          -e AWS_DEFAULT_REGION \
          -e IN_WHEEL_TEST \
          -e SHARD_NUMBER \
          -e TEST_CONFIG \
          -e NUM_TEST_SHARDS \
          -e REENABLED_ISSUES \
          -e CONTINUE_THROUGH_ERROR \
          -e VERBOSE_TEST_LOGS \
          -e NO_TEST_TIMEOUT \
          -e NO_TD \
          -e TD_DISTRIBUTED \
          -e PR_LABELS \
          -e MAX_JOBS="$(nproc --ignore=2)" \
          -e SCCACHE_BUCKET \
          -e SCCACHE_REGION \
          -e SCCACHE_S3_KEY_PREFIX \
          -e XLA_CUDA \
          -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
          -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
          -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
          -e SKIP_SCCACHE_INITIALIZATION=1 \
          -e HUGGING_FACE_HUB_TOKEN \
          -e SCRIBE_GRAPHQL_ACCESS_TOKEN \
          -e DASHBOARD_TAG \
          -e IS_A100_RUNNER \
          --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
          --security-opt seccomp=unconfined \
          --cap-add=SYS_PTRACE \
          --ipc=host \
          --shm-size="${SHM_SIZE}" \
          --tty \
          --detach \
          --name="${container_name}" \
          --user jenkins \
          -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
          -w /var/lib/jenkins/workspace \
          "${DOCKER_IMAGE}"
        )
        echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
        docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"

    - name: Upload pytest cache if tests failed
      uses: ./.github/actions/pytest-cache-upload
      continue-on-error: true
      if: failure() && steps.test.conclusion && steps.test.conclusion == 'failure'
      with:
        cache_dir: .pytest_cache
        shard: ${{ matrix.shard }}
        sha: ${{ github.event.pull_request.head.sha || github.sha }}
        test_config: ${{ matrix.config }}
        job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}

    - name: Print remaining test logs
      shell: bash
      if: always() && steps.test.conclusion
      run: |
        cat test/**/*_toprint.log || true

    - name: Stop monitoring script
      if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
      shell: bash
      continue-on-error: true
      env:
        MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
      run: |
        kill "$MONITOR_SCRIPT_PID"

    - name: Upload test artifacts
      uses: ./.github/actions/upload-test-artifacts
      if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped'
      with:
        file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
        use-gha: ${{ inputs.use-gha }}
        s3-bucket: ${{ inputs.s3-bucket }}

    - name: Collect backtraces from coredumps (if any)
      if: always()
      shell: bash
      run: |
        # shellcheck disable=SC2156
        find . -iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;

    - name: Store Core dumps on S3
      uses: seemethere/upload-artifact-s3@v5
      if: failure()
      with:
        name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}
        retention-days: 14
        if-no-files-found: ignore
        path: ./**/core.[1-9]*

    - name: Teardown Linux
      uses: pytorch/test-infra/.github/actions/teardown-linux@main
      if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false'

    # NB: We are currently having an intermittent GPU-related issue on G5 runners with
    # A10G GPU. Once this happens, trying to reset the GPU as done in setup-nvidia does
    # not seem to help. Here are some symptoms:
    #   * Calling nvidia-smi timeouts after 60 second
    #   * Fail to run nvidia-smi with an unable to determine the device handle for GPU
    #     unknown error
    #   * Test fails with a missing CUDA GPU error when initializing CUDA in PyTorch
    #   * Run docker --gpus all fails with error response from daemon
    #
    # As both the root cause and recovery path are unclear, let's take the runner out of
    # service so that it doesn't get any more jobs
    - name: Check NVIDIA driver installation step
      if: failure() && steps.install-nvidia-driver.outcome && steps.install-nvidia-driver.outcome != 'skipped'
      shell: bash
      env:
        RUNNER_WORKSPACE: ${{ runner.workspace }}
      run: |
        set +e
        set -x

        nvidia-smi
        # NB: Surprisingly, nvidia-smi command returns successfully with return code 0 even in
        # the case where the driver has already crashed as it still can get the driver version
        # and some basic information like the bus ID.  However, the rest of the information
        # would be missing (ERR!), for example:
        #
        # +-----------------------------------------------------------------------------+
        # | NVIDIA-SMI 525.89.02    Driver Version: 525.89.02    CUDA Version: 12.0     |
        # |-------------------------------+----------------------+----------------------+
        # | GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
        # | Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
        # |                               |                      |               MIG M. |
        # |===============================+======================+======================|
        # |   0  ERR!                Off  | 00000000:00:1E.0 Off |                 ERR! |
        # |ERR!  ERR! ERR!    ERR! / ERR! |   4184MiB / 23028MiB |    ERR!      Default |
        # |                               |                      |                 ERR! |
        # +-------------------------------+----------------------+----------------------+
        #
        # +-----------------------------------------------------------------------------+
        # | Processes:                                                                  |
        # |  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
        # |        ID   ID                                                   Usage      |
        # |=============================================================================|
        # +-----------------------------------------------------------------------------+
        #
        # This should be reported as a failure instead as it will guarantee to fail when
        # Docker tries to run with --gpus all
        #
        # So, the correct check here is to query one of the missing piece of info like
        # GPU name, so that the command can fail accordingly
        nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0
        NVIDIA_SMI_STATUS=$?

        # These are acceptable return code from nvidia-smi as copied from setup-nvidia GitHub action
        if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
          echo "NVIDIA driver installation has failed, shutting down the runner..."
          .github/scripts/stop_runner_service.sh
        fi

        # For runner with multiple GPUs, we also want to confirm that the number of GPUs are the
        # power of 2, i.e. 1, 2, 4, or 8. This is to avoid flaky test issue when one GPU fails
        # https://github.com/pytorch/test-infra/issues/4000
        GPU_COUNT=$(nvidia-smi --list-gpus | wc -l)
        NVIDIA_SMI_STATUS=$?

        # These are acceptable return code from nvidia-smi as copied from setup-nvidia GitHub action
        if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
          echo "NVIDIA driver installation has failed, shutting down the runner..."
          .github/scripts/stop_runner_service.sh
        fi

        # Check the GPU count to be a power of 2
        if [ "$GPU_COUNT" -le 8 ] && [ "$GPU_COUNT" -ne 1 ] && [ "$GPU_COUNT" -ne 2 ] && [ "$GPU_COUNT" -ne 4 ] && [ "$GPU_COUNT" -ne 8 ]; then
          echo "NVIDIA driver detects $GPU_COUNT GPUs. The runner has a broken GPU, shutting it down..."
          .github/scripts/stop_runner_service.sh
        fi