# Overview

Currently monitor.py produces error-only results, so this PR introduces a disable-monitor option to all *-test.yml workflows. We would also like to explore how the monitor code affects benchmark results.

# Next steps

- fix monitor.py
- enable non-benchmark tests with the monitor
- investigate benchmark test behavior with the monitor background job

Pull Request resolved: https://github.com/pytorch/pytorch/pull/140857
Approved by: https://github.com/huydhn
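For context, a calling workflow opts back into monitoring by passing `disable-monitor: false` when it invokes this reusable workflow. Below is a minimal sketch of such a caller; the job name, build environment, docker image, and test matrix are illustrative placeholders, and the path assumes this file lives at `.github/workflows/_linux-test.yml` in the same repository.

```yaml
jobs:
  # Hypothetical caller job; all values below are placeholders, not taken from a real workflow.
  example-linux-test:
    uses: ./.github/workflows/_linux-test.yml
    with:
      build-environment: linux-jammy-py3.9-gcc11      # placeholder build label
      docker-image: ghcr.io/pytorch/ci-image:example  # placeholder image
      test-matrix: |
        { "include": [
          { "config": "default", "shard": 1, "num_shards": 1, "runner": "linux.4xlarge" }
        ]}
      disable-monitor: false  # opt back into the experimental utilization monitor
    secrets: inherit
```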
name: linux-test

on:
  workflow_call:
    inputs:
      build-environment:
        required: true
        type: string
        description: Top-level label for what's being built/tested.
      test-matrix:
        required: true
        type: string
        description: JSON description of what test configs to run.
      docker-image:
        required: true
        type: string
        description: Docker image to run in.
      sync-tag:
        required: false
        type: string
        default: ""
        description: |
          If this is set, our linter will use this to make sure that every other
          job with the same `sync-tag` is identical.
      timeout-minutes:
        required: false
        type: number
        default: 240
        description: |
          Set the maximum time (in minutes) the workflow should take to finish
      use-gha:
        required: false
        type: string
        default: ""
        description: If set to any value, upload to GHA. Otherwise upload to S3.
      dashboard-tag:
        required: false
        type: string
        default: ""
      s3-bucket:
        description: S3 bucket to download artifact
        required: false
        type: string
        default: "gha-artifacts"
      aws-role-to-assume:
        description: Role to assume for downloading artifacts
        required: false
        type: string
        default: ""
      disable-monitor:
        description: |
          [Experimental] Disable utilization monitoring for tests.
          Currently, by default we disable the monitor job and only enable it for specific tests,
          since we are investigating the behaviour of the monitor script with different tests.
        required: false
        type: boolean
        default: true
    secrets:
      HUGGING_FACE_HUB_TOKEN:
        required: false
        description: |
          HF Auth token to avoid rate limits when downloading models or datasets from hub
      SCRIBE_GRAPHQL_ACCESS_TOKEN:
        required: false
        description: |
          FB app token to write to scribe endpoint

env:
  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}

jobs:
  test:
    # Don't run on forked repos or empty test matrix
    if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]'
    strategy:
      matrix: ${{ fromJSON(inputs.test-matrix) }}
      fail-fast: false
    environment: ${{ github.ref == 'refs/heads/main' && 'scribe-protected' || startsWith(github.ref, 'refs/heads/release/') && 'scribe-protected' || contains(github.event.pull_request.labels.*.name, 'ci-scribe') && 'scribe-pr' || '' }}
    runs-on: ${{ matrix.runner }}
    timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
    steps:
      - name: Setup SSH (Click me for login details)
        uses: pytorch/test-infra/.github/actions/setup-ssh@main
        if: ${{ !contains(matrix.runner, 'gcp.a100') }}
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
          instructions: |
            All testing is done inside the container, to start an interactive session run:
              docker exec -it $(docker container ps --format '{{.ID}}') bash

      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
        with:
          no-sudo: true

      - name: Setup Linux
        uses: ./.github/actions/setup-linux

      - name: configure aws credentials
        if: ${{ inputs.aws-role-to-assume != '' }}
        uses: aws-actions/configure-aws-credentials@v3
        with:
          role-to-assume: ${{ inputs.aws-role-to-assume }}
          role-session-name: gha-linux-test
          aws-region: us-east-1

      - name: Calculate docker image
        id: calculate-docker-image
        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
        with:
          docker-image-name: ${{ inputs.docker-image }}

      - name: Use following to pull public copy of the image
        id: print-ghcr-mirror
        env:
          ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
        shell: bash
        run: |
          tag=${ECR_DOCKER_IMAGE##*/}
          echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"

      - name: Pull docker image
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

      - name: Check if in a container runner
        shell: bash
        id: check_container_runner
        run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"

      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        id: install-nvidia-driver
        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
        if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}

      - name: Setup GPU_FLAG for docker run
        id: setup-gpu-flag
        run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
        if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}

      - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
        id: setup-sscache-port-flag
        run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
        if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}

      - name: Lock NVIDIA A100 40GB Frequency
        run: |
          sudo nvidia-smi -pm 1
          sudo nvidia-smi -ac 1215,1410
          nvidia-smi
        if: ${{ contains(matrix.runner, 'a100') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}

      - name: Start monitoring script
        id: monitor-script
        if: ${{ !inputs.disable-monitor }}
        shell: bash
        continue-on-error: true
        run: |
          python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84
          python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
          echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"

      - name: Download build artifacts
        uses: ./.github/actions/download-build-artifacts
        with:
          name: ${{ inputs.build-environment }}
          s3-bucket: ${{ inputs.s3-bucket }}

      - name: Download TD artifacts
        continue-on-error: true
        uses: ./.github/actions/download-td-artifacts

      - name: Parse ref
        id: parse-ref
        run: .github/scripts/parse_ref.py

      - name: Get workflow job id
        id: get-job-id
        uses: ./.github/actions/get-workflow-job-id
        if: always()
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}

      - name: Check for keep-going label and re-enabled test issues
        # This uses the filter-test-configs action because it conveniently
        # checks for labels and re-enabled test issues. It does not actually do
        # any filtering. All filtering is done in the build step.
        id: keep-going
        uses: ./.github/actions/filter-test-configs
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          test-matrix: ${{ inputs.test-matrix }}
          job-name: ${{ steps.get-job-id.outputs.job-name }}

      - name: Set Test step time
        id: test-timeout
        shell: bash
        env:
          JOB_TIMEOUT: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
        run: |
          echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"

      - name: Test
        id: test
        timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
        env:
          BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_WORKFLOW: ${{ github.workflow }}
          GITHUB_JOB: ${{ github.job }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_RUN_NUMBER: ${{ github.run_number }}
          GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
          JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
          JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
          BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
          TEST_CONFIG: ${{ matrix.config }}
          SHARD_NUMBER: ${{ matrix.shard }}
          NUM_TEST_SHARDS: ${{ matrix.num_shards }}
          REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
          CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
          VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
          TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}
          NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
          NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
          TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
          SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
          SCCACHE_REGION: us-east-1
          SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
          SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
          DOCKER_IMAGE: ${{ inputs.docker-image }}
          XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
          XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
          PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
          DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
          IS_A100_RUNNER: ${{ contains(matrix.runner, 'a100') && '1' || '0' }}
          ARTIFACTS_FILE_SUFFIX: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
        run: |
          set -x

          if [[ $TEST_CONFIG == 'multigpu' ]]; then
            TEST_COMMAND=.ci/pytorch/multigpu-test.sh
          elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
            TEST_COMMAND=.ci/onnx/test.sh
          else
            TEST_COMMAND=.ci/pytorch/test.sh
          fi

          # detached container should get cleaned up by teardown_ec2_linux
          # TODO: Stop building test binaries as part of the build phase
          # Used for GPU_FLAG since that doesn't play nice
          # shellcheck disable=SC2086,SC2090
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \
            -e BUILD_ENVIRONMENT \
            -e PR_NUMBER \
            -e GITHUB_ACTIONS \
            -e GITHUB_REPOSITORY \
            -e GITHUB_WORKFLOW \
            -e GITHUB_JOB \
            -e GITHUB_RUN_ID \
            -e GITHUB_RUN_NUMBER \
            -e GITHUB_RUN_ATTEMPT \
            -e JOB_ID \
            -e JOB_NAME \
            -e BASE_SHA \
            -e BRANCH \
            -e SHA1 \
            -e AWS_DEFAULT_REGION \
            -e IN_WHEEL_TEST \
            -e SHARD_NUMBER \
            -e TEST_CONFIG \
            -e NUM_TEST_SHARDS \
            -e REENABLED_ISSUES \
            -e CONTINUE_THROUGH_ERROR \
            -e VERBOSE_TEST_LOGS \
            -e TEST_SHOWLOCALS \
            -e NO_TEST_TIMEOUT \
            -e NO_TD \
            -e TD_DISTRIBUTED \
            -e PR_LABELS \
            -e MAX_JOBS="$(nproc --ignore=2)" \
            -e SCCACHE_BUCKET \
            -e SCCACHE_REGION \
            -e SCCACHE_S3_KEY_PREFIX \
            -e XLA_CUDA \
            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
            -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
            -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
            -e SKIP_SCCACHE_INITIALIZATION=1 \
            -e HUGGING_FACE_HUB_TOKEN \
            -e SCRIBE_GRAPHQL_ACCESS_TOKEN \
            -e DASHBOARD_TAG \
            -e IS_A100_RUNNER \
            -e ARTIFACTS_FILE_SUFFIX \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --ipc=host \
            --shm-size="${SHM_SIZE}" \
            --tty \
            --detach \
            --name="${container_name}" \
            --user jenkins \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          # Propagate download.pytorch.org IP to container
          grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts"
          echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
          docker exec -t "${container_name}" sh -c "python3 -m pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"

      - name: Upload pytest cache if tests failed
        uses: ./.github/actions/pytest-cache-upload
        continue-on-error: true
        if: failure() && steps.test.conclusion && steps.test.conclusion == 'failure'
        with:
          cache_dir: .pytest_cache
          shard: ${{ matrix.shard }}
          sha: ${{ github.event.pull_request.head.sha || github.sha }}
          test_config: ${{ matrix.config }}
          job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}

      - name: Print remaining test logs
        shell: bash
        if: always() && steps.test.conclusion
        run: |
          cat test/**/*_toprint.log || true

      - name: Stop monitoring script
        if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
        shell: bash
        continue-on-error: true
        env:
          MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
        run: |
          kill "$MONITOR_SCRIPT_PID"

      - name: Upload test artifacts
        uses: ./.github/actions/upload-test-artifacts
        if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped'
        with:
          file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
          use-gha: ${{ inputs.use-gha }}
          s3-bucket: ${{ inputs.s3-bucket }}

      - name: Collect backtraces from coredumps (if any)
        if: always()
        run: |
          # shellcheck disable=SC2156
          find . -iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;

      - name: Store Core dumps on S3
        uses: seemethere/upload-artifact-s3@v5
        if: failure()
        with:
          name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}
          retention-days: 14
          if-no-files-found: ignore
          path: ./**/core.[1-9]*

      - name: Teardown Linux
        uses: pytorch/test-infra/.github/actions/teardown-linux@main
        if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false'

      # NB: We are currently having an intermittent GPU-related issue on G5 runners with
      # A10G GPU. Once this happens, trying to reset the GPU as done in setup-nvidia does
      # not seem to help. Here are some symptoms:
      #   * Calling nvidia-smi times out after 60 seconds
      #   * nvidia-smi fails with an "unable to determine the device handle for GPU: unknown error"
      #   * Tests fail with a missing CUDA GPU error when initializing CUDA in PyTorch
      #   * Running docker with --gpus all fails with an error response from the daemon
      #
      # As both the root cause and recovery path are unclear, let's take the runner out of
      # service so that it doesn't get any more jobs
      - name: Check NVIDIA driver installation step
        if: failure() && steps.install-nvidia-driver.outcome && steps.install-nvidia-driver.outcome != 'skipped'
        shell: bash
        env:
          RUNNER_WORKSPACE: ${{ runner.workspace }}
        run: |
          set +e
          set -x

          nvidia-smi
          # NB: Surprisingly, the nvidia-smi command returns successfully with return code 0 even in
          # the case where the driver has already crashed, as it can still get the driver version
          # and some basic information like the bus ID. However, the rest of the information
          # would be missing (ERR!), for example:
          #
          # +-----------------------------------------------------------------------------+
          # | NVIDIA-SMI 525.89.02    Driver Version: 525.89.02    CUDA Version: 12.0     |
          # |-------------------------------+----------------------+----------------------+
          # | GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
          # | Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
          # |                               |                      |               MIG M. |
          # |===============================+======================+======================|
          # |   0  ERR!                Off  | 00000000:00:1E.0 Off |                 ERR! |
          # |ERR!  ERR! ERR!    ERR! / ERR! |   4184MiB / 23028MiB |    ERR!      Default |
          # |                               |                      |                 ERR! |
          # +-------------------------------+----------------------+----------------------+
          #
          # +-----------------------------------------------------------------------------+
          # | Processes:                                                                  |
          # |  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
          # |        ID   ID                                                   Usage      |
          # |=============================================================================|
          # +-----------------------------------------------------------------------------+
          #
          # This should be reported as a failure instead, as it is guaranteed to fail when
          # Docker tries to run with --gpus all
          #
          # So, the correct check here is to query one of the missing pieces of info, like the
          # GPU name, so that the command can fail accordingly
          nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0
          NVIDIA_SMI_STATUS=$?

          # These are acceptable return codes from nvidia-smi, as copied from the setup-nvidia GitHub action
          if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
            echo "NVIDIA driver installation has failed, shutting down the runner..."
            .github/scripts/stop_runner_service.sh
          fi

          # For a runner with multiple GPUs, we also want to confirm that the number of GPUs is a
          # power of 2, i.e. 1, 2, 4, or 8. This is to avoid flaky test issues when one GPU fails
          # https://github.com/pytorch/test-infra/issues/4000
          GPU_COUNT=$(nvidia-smi --list-gpus | wc -l)
          NVIDIA_SMI_STATUS=$?

          # These are acceptable return codes from nvidia-smi, as copied from the setup-nvidia GitHub action
          if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
            echo "NVIDIA driver installation has failed, shutting down the runner..."
            .github/scripts/stop_runner_service.sh
          fi

          # Check that the GPU count is a power of 2
          if [ "$GPU_COUNT" -le 8 ] && [ "$GPU_COUNT" -ne 1 ] && [ "$GPU_COUNT" -ne 2 ] && [ "$GPU_COUNT" -ne 4 ] && [ "$GPU_COUNT" -ne 8 ]; then
            echo "NVIDIA driver detects $GPU_COUNT GPUs. The runner has a broken GPU, shutting it down..."
            .github/scripts/stop_runner_service.sh
          fi