Compare commits


1 Commit

Author SHA1 Message Date
9781f3142a tc 2025-10-15 10:58:45 -07:00
2 changed files with 411 additions and 411 deletions


@@ -9,7 +9,7 @@ install_ubuntu() {
# Instead use lib and headers from OpenSSL1.1 installed in `install_openssl.sh`
apt-get install -y cargo
echo "Checking out sccache repo"
git clone https://github.com/mozilla/sccache -b v0.10.0
git clone https://github.com/mozilla/sccache -b v0.11.0
cd sccache
echo "Building sccache"
cargo build --release
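The only change in this file is the sccache pin bump from v0.10.0 to v0.11.0. As a minimal sanity-check sketch (assuming only that the v0.11.0 tag exists on mozilla/sccache), the pinned tag can be verified right after cloning:

    git clone https://github.com/mozilla/sccache -b v0.11.0
    git -C sccache describe --tags    # expect: v0.11.0 (detached HEAD at the tag)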


@@ -107,457 +107,457 @@ jobs:
All testing is done inside the container; to start an interactive session, run:
docker exec -it $(docker container ps --format '{{.ID}}') bash
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
no-sudo: true
# - name: Checkout PyTorch
# uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
# with:
# no-sudo: true
- name: Setup Python
if: contains(matrix.runner, 'b200')
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3.12'
cache: pip
# - name: Setup Python
# if: contains(matrix.runner, 'b200')
# uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
# with:
# python-version: '3.12'
# cache: pip
- name: Setup Linux
uses: ./.github/actions/setup-linux
if: inputs.build-environment != 'linux-s390x-binary-manywheel' && !contains(matrix.runner, 'b200')
# - name: Setup Linux
# uses: ./.github/actions/setup-linux
# if: inputs.build-environment != 'linux-s390x-binary-manywheel' && !contains(matrix.runner, 'b200')
- name: configure aws credentials
if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
role-to-assume: ${{ inputs.aws-role-to-assume }}
role-session-name: gha-linux-test
aws-region: us-east-1
# - name: configure aws credentials
# if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
# uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
# with:
# role-to-assume: ${{ inputs.aws-role-to-assume }}
# role-session-name: gha-linux-test
# aws-region: us-east-1
- name: Login to Amazon ECR
if: ${{ inputs.aws-role-to-assume != '' && contains(matrix.runner, 'b200') }}
id: login-ecr
continue-on-error: true
uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
# - name: Login to Amazon ECR
# if: ${{ inputs.aws-role-to-assume != '' && contains(matrix.runner, 'b200') }}
# id: login-ecr
# continue-on-error: true
# uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
with:
docker-image-name: ${{ inputs.docker-image }}
# - name: Calculate docker image
# id: calculate-docker-image
# uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
# if: inputs.build-environment != 'linux-s390x-binary-manywheel'
# with:
# docker-image-name: ${{ inputs.docker-image }}
- name: Use the following to pull a public copy of the image
id: print-ghcr-mirror
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
env:
ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
shell: bash
run: |
tag=${ECR_DOCKER_IMAGE##*:}
echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"
# - name: Use the following to pull a public copy of the image
# id: print-ghcr-mirror
# if: inputs.build-environment != 'linux-s390x-binary-manywheel'
# env:
# ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
# shell: bash
# run: |
# tag=${ECR_DOCKER_IMAGE##*:}
# echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"
- name: Pull docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
# - name: Pull docker image
# uses: pytorch/test-infra/.github/actions/pull-docker-image@main
# if: inputs.build-environment != 'linux-s390x-binary-manywheel'
# with:
# docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Check if in a container runner
shell: bash
id: check_container_runner
run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"
# - name: Check if in a container runner
# shell: bash
# id: check_container_runner
# run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
id: install-nvidia-driver
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
with:
driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '580.82.07' }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }}
# - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
# id: install-nvidia-driver
# uses: pytorch/test-infra/.github/actions/setup-nvidia@main
# with:
# driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '580.82.07' }}
# if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }}
- name: Setup GPU_FLAG for docker run
id: setup-gpu-flag
run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && (steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' || contains(matrix.runner, 'b200')) }}
# - name: Setup GPU_FLAG for docker run
# id: setup-gpu-flag
# run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
# if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && (steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' || contains(matrix.runner, 'b200')) }}
- name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
id: setup-sscache-port-flag
run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' && !contains(matrix.runner, 'b200') }}
# - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
# id: setup-sscache-port-flag
# run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
# if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' && !contains(matrix.runner, 'b200') }}
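The arithmetic above gives each runner user its own sccache server port so that concurrent jobs on a shared container host don't collide. A minimal sketch with an illustrative UID:

    RUNNER_UID=1001
    echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))"
    # -> SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=5227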
- name: Lock NVIDIA A100 40GB Frequency
run: |
sudo nvidia-smi -pm 1
sudo nvidia-smi -ac 1215,1410
nvidia-smi
if: ${{ contains(matrix.runner, 'a100') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
# - name: Lock NVIDIA A100 40GB Frequency
# run: |
# sudo nvidia-smi -pm 1
# sudo nvidia-smi -ac 1215,1410
# nvidia-smi
# if: ${{ contains(matrix.runner, 'a100') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
- name: Get workflow job id
id: get-job-id
uses: ./.github/actions/get-workflow-job-id
if: always()
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
# - name: Get workflow job id
# id: get-job-id
# uses: ./.github/actions/get-workflow-job-id
# if: always()
# with:
# github-token: ${{ secrets.GITHUB_TOKEN }}
- name: Start monitoring script
id: monitor-script
if: ${{ !inputs.disable-monitor }}
shell: bash
continue-on-error: true
env:
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
WORKFLOW_NAME: ${{ github.workflow }}
WORKFLOW_RUN_ID: ${{github.run_id}}
MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
run: |
python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84
python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
# - name: Start monitoring script
# id: monitor-script
# if: ${{ !inputs.disable-monitor }}
# shell: bash
# continue-on-error: true
# env:
# JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
# JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
# WORKFLOW_NAME: ${{ github.workflow }}
# WORKFLOW_RUN_ID: ${{github.run_id}}
# MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
# MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
# run: |
# python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84
# python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
# echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
- name: Download build artifacts
uses: ./.github/actions/download-build-artifacts
with:
name: ${{ inputs.build-environment }}
s3-bucket: ${{ inputs.s3-bucket }}
use-gha: ${{ inputs.use-gha }}
# - name: Download build artifacts
# uses: ./.github/actions/download-build-artifacts
# with:
# name: ${{ inputs.build-environment }}
# s3-bucket: ${{ inputs.s3-bucket }}
# use-gha: ${{ inputs.use-gha }}
- name: Download TD artifacts
continue-on-error: true
uses: ./.github/actions/download-td-artifacts
# - name: Download TD artifacts
# continue-on-error: true
# uses: ./.github/actions/download-td-artifacts
- name: Parse ref
id: parse-ref
run: .github/scripts/parse_ref.py
# - name: Parse ref
# id: parse-ref
# run: .github/scripts/parse_ref.py
- name: Check for keep-going label and re-enabled test issues
# This uses the filter-test-configs action because it conveniently
# checks for labels and re-enabled test issues. It does not actually do
# any filtering. All filtering is done in the build step.
id: keep-going
uses: ./.github/actions/filter-test-configs
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
test-matrix: ${{ inputs.test-matrix }}
job-name: ${{ steps.get-job-id.outputs.job-name }}
# - name: Check for keep-going label and re-enabled test issues
# # This uses the filter-test-configs action because it conveniently
# # checks for labels and re-enabled test issues. It does not actually do
# # any filtering. All filtering is done in the build step.
# id: keep-going
# uses: ./.github/actions/filter-test-configs
# with:
# github-token: ${{ secrets.GITHUB_TOKEN }}
# test-matrix: ${{ inputs.test-matrix }}
# job-name: ${{ steps.get-job-id.outputs.job-name }}
- name: Set Test step time
id: test-timeout
shell: bash
env:
JOB_TIMEOUT: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
run: |
echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"
# - name: Set Test step time
# id: test-timeout
# shell: bash
# env:
# JOB_TIMEOUT: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
# run: |
# echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"
- name: Preserve github env variables for use in docker
shell: bash
run: |
env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
# - name: Preserve github env variables for use in docker
# shell: bash
# run: |
# env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
# env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test
id: test
timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
env:
BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
PR_NUMBER: ${{ github.event.pull_request.number }}
GITHUB_REPOSITORY: ${{ github.repository }}
GITHUB_WORKFLOW: ${{ github.workflow }}
GITHUB_JOB: ${{ github.job }}
GITHUB_RUN_ID: ${{ github.run_id }}
GITHUB_RUN_NUMBER: ${{ github.run_number }}
GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
BRANCH: ${{ steps.parse-ref.outputs.branch }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
EXTRA_FLAGS: ${{ matrix.extra_flags || '' }}
OP_BENCHMARK_TESTS: ${{ matrix.op_benchmark_tests }}
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}
NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
# Do not set SCCACHE_S3_KEY_PREFIX to share the cache between all build jobs
SCCACHE_BUCKET: ${{ !contains(matrix.runner, 'b200') && 'ossci-compiler-cache-circleci-v2' || '' }}
SCCACHE_REGION: ${{ !contains(matrix.runner, 'b200') && 'us-east-1' || '' }}
SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
DOCKER_IMAGE: ${{ inputs.docker-image }}
XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
VLLM_TEST_HUGGING_FACE_TOKEN: ${{ secrets.VLLM_TEST_HUGGING_FACE_TOKEN }}
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
ARTIFACTS_FILE_SUFFIX: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
run: |
set -x
# - name: Test
# id: test
# timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
# env:
# BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
# PR_NUMBER: ${{ github.event.pull_request.number }}
# GITHUB_REPOSITORY: ${{ github.repository }}
# GITHUB_WORKFLOW: ${{ github.workflow }}
# GITHUB_JOB: ${{ github.job }}
# GITHUB_RUN_ID: ${{ github.run_id }}
# GITHUB_RUN_NUMBER: ${{ github.run_number }}
# GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
# JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
# JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
# BRANCH: ${{ steps.parse-ref.outputs.branch }}
# SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
# BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
# TEST_CONFIG: ${{ matrix.config }}
# SHARD_NUMBER: ${{ matrix.shard }}
# NUM_TEST_SHARDS: ${{ matrix.num_shards }}
# EXTRA_FLAGS: ${{ matrix.extra_flags || '' }}
# OP_BENCHMARK_TESTS: ${{ matrix.op_benchmark_tests }}
# REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
# CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
# VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
# TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}
# NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
# NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
# TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
# # Do not set SCCACHE_S3_KEY_PREFIX to share the cache between all build jobs
# SCCACHE_BUCKET: ${{ !contains(matrix.runner, 'b200') && 'ossci-compiler-cache-circleci-v2' || '' }}
# SCCACHE_REGION: ${{ !contains(matrix.runner, 'b200') && 'us-east-1' || '' }}
# SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
# DOCKER_IMAGE: ${{ inputs.docker-image }}
# XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
# XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
# PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
# PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
# DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
# VLLM_TEST_HUGGING_FACE_TOKEN: ${{ secrets.VLLM_TEST_HUGGING_FACE_TOKEN }}
# HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
# SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
# ARTIFACTS_FILE_SUFFIX: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
# run: |
# set -x
if [[ $TEST_CONFIG == 'multigpu' ]]; then
TEST_COMMAND=.ci/pytorch/multigpu-test.sh
elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
TEST_COMMAND=.ci/onnx/test.sh
else
TEST_COMMAND=.ci/pytorch/test.sh
fi
# if [[ $TEST_CONFIG == 'multigpu' ]]; then
# TEST_COMMAND=.ci/pytorch/multigpu-test.sh
# elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
# TEST_COMMAND=.ci/onnx/test.sh
# else
# TEST_COMMAND=.ci/pytorch/test.sh
# fi
# Leaving 1GB for the runner and other things
TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo)
# https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap
# comes from https://github.com/pytorch/test-infra/pull/6058
TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3))
# # Leaving 1GB for the runner and other things
# TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo)
# # https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap
# # comes from https://github.com/pytorch/test-infra/pull/6058
# TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3))
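The `${VAR%.*}` expansion above truncates awk's fractional output to an integer by dropping everything from the last dot onward, and the +3 adds the 3GB swap headroom. A standalone sketch with an illustrative value:

    TOTAL_AVAILABLE_MEMORY_IN_GB="30.123"
    echo "${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}"             # -> 30, used for --memory (GB)
    echo $(( ${TOTAL_AVAILABLE_MEMORY_IN_GB%.*} + 3 ))    # -> 33, used for --memory-swap (GB)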
if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
SHM_OPTS=
JENKINS_USER=
# ensure that the docker container exits cleanly within 12 hours
# in case the cleanup action doesn't stop the container
# when the job is cancelled
DOCKER_SHELL_CMD="sleep 12h"
else
SHM_OPTS="--shm-size=${SHM_SIZE}"
JENKINS_USER="--user jenkins"
DOCKER_SHELL_CMD=
fi
# if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
# SHM_OPTS=
# JENKINS_USER=
# # ensure that the docker container exits cleanly within 12 hours
# # in case the cleanup action doesn't stop the container
# # when the job is cancelled
# DOCKER_SHELL_CMD="sleep 12h"
# else
# SHM_OPTS="--shm-size=${SHM_SIZE}"
# JENKINS_USER="--user jenkins"
# DOCKER_SHELL_CMD=
# fi
# detached container should get cleaned up by teardown_ec2_linux
# TODO: Stop building test binaries as part of the build phase
# Used for GPU_FLAG, SHM_OPTS, JENKINS_USER and DOCKER_SHELL_CMD since that doesn't play nice
# shellcheck disable=SC2086,SC2090
container_name=$(docker run \
${GPU_FLAG:-} \
${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \
-e BUILD_ENVIRONMENT \
-e PR_NUMBER \
-e GITHUB_ACTIONS \
-e GITHUB_REPOSITORY \
-e GITHUB_WORKFLOW \
-e GITHUB_JOB \
-e GITHUB_RUN_ID \
-e GITHUB_RUN_NUMBER \
-e GITHUB_RUN_ATTEMPT \
-e JOB_ID \
-e JOB_NAME \
-e BASE_SHA \
-e BRANCH \
-e SHA1 \
-e AWS_DEFAULT_REGION \
-e IN_WHEEL_TEST \
-e SHARD_NUMBER \
-e TEST_CONFIG \
-e NUM_TEST_SHARDS \
-e REENABLED_ISSUES \
-e CONTINUE_THROUGH_ERROR \
-e VERBOSE_TEST_LOGS \
-e TEST_SHOWLOCALS \
-e NO_TEST_TIMEOUT \
-e NO_TD \
-e TD_DISTRIBUTED \
-e PR_LABELS \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e SCCACHE_BUCKET \
-e SCCACHE_REGION \
-e XLA_CUDA \
-e XLA_CLANG_CACHE_S3_BUCKET_NAME \
-e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
-e PYTORCH_TEST_RERUN_DISABLED_TESTS \
-e SKIP_SCCACHE_INITIALIZATION=1 \
-e HUGGING_FACE_HUB_TOKEN \
-e VLLM_TEST_HUGGING_FACE_TOKEN \
-e SCRIBE_GRAPHQL_ACCESS_TOKEN \
-e DASHBOARD_TAG \
-e ARTIFACTS_FILE_SUFFIX \
--memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \
--memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--ipc=host \
${SHM_OPTS} \
--tty \
--detach \
--name="${container_name}" \
${JENKINS_USER} \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
${DOCKER_SHELL_CMD}
)
echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
# # detached container should get cleaned up by teardown_ec2_linux
# # TODO: Stop building test binaries as part of the build phase
# # Used for GPU_FLAG, SHM_OPTS, JENKINS_USER and DOCKER_SHELL_CMD since that doesn't play nice
# # shellcheck disable=SC2086,SC2090
# container_name=$(docker run \
# ${GPU_FLAG:-} \
# ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \
# -e BUILD_ENVIRONMENT \
# -e PR_NUMBER \
# -e GITHUB_ACTIONS \
# -e GITHUB_REPOSITORY \
# -e GITHUB_WORKFLOW \
# -e GITHUB_JOB \
# -e GITHUB_RUN_ID \
# -e GITHUB_RUN_NUMBER \
# -e GITHUB_RUN_ATTEMPT \
# -e JOB_ID \
# -e JOB_NAME \
# -e BASE_SHA \
# -e BRANCH \
# -e SHA1 \
# -e AWS_DEFAULT_REGION \
# -e IN_WHEEL_TEST \
# -e SHARD_NUMBER \
# -e TEST_CONFIG \
# -e NUM_TEST_SHARDS \
# -e REENABLED_ISSUES \
# -e CONTINUE_THROUGH_ERROR \
# -e VERBOSE_TEST_LOGS \
# -e TEST_SHOWLOCALS \
# -e NO_TEST_TIMEOUT \
# -e NO_TD \
# -e TD_DISTRIBUTED \
# -e PR_LABELS \
# -e MAX_JOBS="$(nproc --ignore=2)" \
# -e SCCACHE_BUCKET \
# -e SCCACHE_REGION \
# -e XLA_CUDA \
# -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
# -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
# -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
# -e SKIP_SCCACHE_INITIALIZATION=1 \
# -e HUGGING_FACE_HUB_TOKEN \
# -e VLLM_TEST_HUGGING_FACE_TOKEN \
# -e SCRIBE_GRAPHQL_ACCESS_TOKEN \
# -e DASHBOARD_TAG \
# -e ARTIFACTS_FILE_SUFFIX \
# --memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \
# --memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \
# --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
# --security-opt seccomp=unconfined \
# --cap-add=SYS_PTRACE \
# --ipc=host \
# ${SHM_OPTS} \
# --tty \
# --detach \
# --name="${container_name}" \
# ${JENKINS_USER} \
# -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
# -w /var/lib/jenkins/workspace \
# "${DOCKER_IMAGE}" \
# ${DOCKER_SHELL_CMD}
# )
# echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
docker exec -t "${container_name}" sh -c "python3 -m pip install -r .ci/docker/requirements-ci.txt"
fi
# if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
# docker exec -t "${container_name}" sh -c "python3 -m pip install -r .ci/docker/requirements-ci.txt"
# fi
docker exec -t "${container_name}" sh -c "python3 -m pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"
# docker exec -t "${container_name}" sh -c "python3 -m pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"
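A note on the install line above: pip cannot expand a shell glob that has an extras suffix attached, so `$(echo dist/*.whl)` expands the glob to the built wheel's path first and `[opt-einsum]` is appended afterwards. A minimal sketch, with an illustrative wheel name:

    whl=$(echo dist/*.whl)                        # e.g. dist/torch-2.9.0-cp312-...whl (illustrative)
    python3 -m pip install "${whl}[opt-einsum]"   # install the wheel plus its opt-einsum extra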
- name: Upload pytest cache if tests failed
uses: ./.github/actions/pytest-cache-upload
continue-on-error: true
if: failure() && steps.test.conclusion && steps.test.conclusion == 'failure' && inputs.build-environment != 'linux-s390x-binary-manywheel'
with:
cache_dir: .pytest_cache
shard: ${{ matrix.shard }}
sha: ${{ github.event.pull_request.head.sha || github.sha }}
test_config: ${{ matrix.config }}
job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
# - name: Upload pytest cache if tests failed
# uses: ./.github/actions/pytest-cache-upload
# continue-on-error: true
# if: failure() && steps.test.conclusion && steps.test.conclusion == 'failure' && inputs.build-environment != 'linux-s390x-binary-manywheel'
# with:
# cache_dir: .pytest_cache
# shard: ${{ matrix.shard }}
# sha: ${{ github.event.pull_request.head.sha || github.sha }}
# test_config: ${{ matrix.config }}
# job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
- name: Authenticate with AWS
if: ${{ always() && contains(matrix.runner, 'b200') }}
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
# The max duration enforced by the server side
role-duration-seconds: 18000
aws-region: us-east-1
# - name: Authenticate with AWS
# if: ${{ always() && contains(matrix.runner, 'b200') }}
# uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
# with:
# role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
# # The max duration enforced by the server side
# role-duration-seconds: 18000
# aws-region: us-east-1
- name: Upload the benchmark results
uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
with:
benchmark-results-dir: test/test-reports
dry-run: false
schema-version: v3
github-token: ${{ secrets.GITHUB_TOKEN }}
# - name: Upload the benchmark results
# uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
# if: inputs.build-environment != 'linux-s390x-binary-manywheel'
# with:
# benchmark-results-dir: test/test-reports
# dry-run: false
# schema-version: v3
# github-token: ${{ secrets.GITHUB_TOKEN }}
- name: Print remaining test logs
shell: bash
if: always() && steps.test.conclusion
run: |
cat test/**/*_toprint.log || true
# - name: Print remaining test logs
# shell: bash
# if: always() && steps.test.conclusion
# run: |
# cat test/**/*_toprint.log || true
- name: Stop monitoring script
if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
shell: bash
continue-on-error: true
env:
MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
run: |
kill "$MONITOR_SCRIPT_PID"
# - name: Stop monitoring script
# if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
# shell: bash
# continue-on-error: true
# env:
# MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
# run: |
# kill "$MONITOR_SCRIPT_PID"
- name: Upload test artifacts
uses: ./.github/actions/upload-test-artifacts
if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped'
with:
file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
use-gha: ${{ inputs.use-gha }}
s3-bucket: ${{ inputs.s3-bucket }}
# - name: Upload test artifacts
# uses: ./.github/actions/upload-test-artifacts
# if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped'
# with:
# file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
# use-gha: ${{ inputs.use-gha }}
# s3-bucket: ${{ inputs.s3-bucket }}
- name: Collect backtraces from coredumps (if any)
if: always()
run: |
# shellcheck disable=SC2156
find . -iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;
# - name: Collect backtraces from coredumps (if any)
# if: always()
# run: |
# # shellcheck disable=SC2156
# find . -iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;
- name: Store Core dumps on S3
uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0
if: failure()
with:
name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}
retention-days: 14
if-no-files-found: ignore
path: ./**/core.[1-9]*
# - name: Store Core dumps on S3
# uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0
# if: failure()
# with:
# name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}
# retention-days: 14
# if-no-files-found: ignore
# path: ./**/core.[1-9]*
- name: Upload utilization stats
if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
continue-on-error: true
uses: ./.github/actions/upload-utilization-stats
with:
job_id: ${{ steps.get-job-id.outputs.job-id }}
job_name: ${{ steps.get-job-id.outputs.job-name }}
workflow_name: ${{ github.workflow }}
workflow_run_id: ${{github.run_id}}
workflow_attempt: ${{github.run_attempt}}
# - name: Upload utilization stats
# if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
# continue-on-error: true
# uses: ./.github/actions/upload-utilization-stats
# with:
# job_id: ${{ steps.get-job-id.outputs.job-id }}
# job_name: ${{ steps.get-job-id.outputs.job-name }}
# workflow_name: ${{ github.workflow }}
# workflow_run_id: ${{github.run_id}}
# workflow_attempt: ${{github.run_attempt}}
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false'
# - name: Teardown Linux
# uses: pytorch/test-infra/.github/actions/teardown-linux@main
# if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false'
# NB: We are currently seeing an intermittent GPU-related issue on G5 runners with
# A10G GPUs. Once this happens, trying to reset the GPU as done in setup-nvidia does
# not seem to help. Here are some symptoms:
# * Calling nvidia-smi times out after 60 seconds
# * Running nvidia-smi fails with an "unable to determine the device handle for GPU"
# unknown error
# * Tests fail with a missing CUDA GPU error when initializing CUDA in PyTorch
# * Running docker --gpus all fails with an error response from the daemon
#
# As both the root cause and the recovery path are unclear, let's take the runner out of
# service so that it doesn't get any more jobs
- name: Check NVIDIA driver installation step
if: failure() && steps.install-nvidia-driver.outcome && steps.install-nvidia-driver.outcome != 'skipped'
shell: bash
run: |
set +e
set -x
# # NB: We are currently seeing an intermittent GPU-related issue on G5 runners with
# # A10G GPUs. Once this happens, trying to reset the GPU as done in setup-nvidia does
# # not seem to help. Here are some symptoms:
# # * Calling nvidia-smi times out after 60 seconds
# # * Running nvidia-smi fails with an "unable to determine the device handle for GPU"
# # unknown error
# # * Tests fail with a missing CUDA GPU error when initializing CUDA in PyTorch
# # * Running docker --gpus all fails with an error response from the daemon
# #
# # As both the root cause and the recovery path are unclear, let's take the runner out of
# # service so that it doesn't get any more jobs
# - name: Check NVIDIA driver installation step
# if: failure() && steps.install-nvidia-driver.outcome && steps.install-nvidia-driver.outcome != 'skipped'
# shell: bash
# run: |
# set +e
# set -x
nvidia-smi
# NB: Surprisingly, the nvidia-smi command returns successfully with return code 0 even
# when the driver has already crashed, as it can still get the driver version
# and some basic information like the bus ID. However, the rest of the information
# would be missing (ERR!), for example:
#
# +-----------------------------------------------------------------------------+
# | NVIDIA-SMI 525.89.02 Driver Version: 525.89.02 CUDA Version: 12.0 |
# |-------------------------------+----------------------+----------------------+
# | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
# | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
# | | | MIG M. |
# |===============================+======================+======================|
# | 0 ERR! Off | 00000000:00:1E.0 Off | ERR! |
# |ERR! ERR! ERR! ERR! / ERR! | 4184MiB / 23028MiB | ERR! Default |
# | | | ERR! |
# +-------------------------------+----------------------+----------------------+
#
# +-----------------------------------------------------------------------------+
# | Processes: |
# | GPU GI CI PID Type Process name GPU Memory |
# | ID ID Usage |
# |=============================================================================|
# +-----------------------------------------------------------------------------+
#
# This should be reported as a failure instead, as it is guaranteed to fail when
# Docker tries to run with --gpus all
#
# So, the correct check here is to query one of the missing pieces of info, like the
# GPU name, so that the command fails accordingly
nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0
NVIDIA_SMI_STATUS=$?
# nvidia-smi
# # NB: Surprisingly, the nvidia-smi command returns successfully with return code 0 even
# # when the driver has already crashed, as it can still get the driver version
# # and some basic information like the bus ID. However, the rest of the information
# # would be missing (ERR!), for example:
# #
# # +-----------------------------------------------------------------------------+
# # | NVIDIA-SMI 525.89.02 Driver Version: 525.89.02 CUDA Version: 12.0 |
# # |-------------------------------+----------------------+----------------------+
# # | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
# # | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
# # | | | MIG M. |
# # |===============================+======================+======================|
# # | 0 ERR! Off | 00000000:00:1E.0 Off | ERR! |
# # |ERR! ERR! ERR! ERR! / ERR! | 4184MiB / 23028MiB | ERR! Default |
# # | | | ERR! |
# # +-------------------------------+----------------------+----------------------+
# #
# # +-----------------------------------------------------------------------------+
# # | Processes: |
# # | GPU GI CI PID Type Process name GPU Memory |
# # | ID ID Usage |
# # |=============================================================================|
# # +-----------------------------------------------------------------------------+
# #
# # This should be reported as a failure instead, as it is guaranteed to fail when
# # Docker tries to run with --gpus all
# #
# # So, the correct check here is to query one of the missing pieces of info, like the
# # GPU name, so that the command fails accordingly
# nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0
# NVIDIA_SMI_STATUS=$?
# These are acceptable return codes from nvidia-smi, as copied from the setup-nvidia GitHub action
if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
echo "NVIDIA driver installation has failed, shutting down the runner..."
.github/scripts/stop_runner_service.sh
fi
# # These are acceptable return codes from nvidia-smi, as copied from the setup-nvidia GitHub action
# if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
# echo "NVIDIA driver installation has failed, shutting down the runner..."
# .github/scripts/stop_runner_service.sh
# fi
# For runners with multiple GPUs, we also want to confirm that the number of GPUs is a
# power of 2, i.e. 1, 2, 4, or 8. This is to avoid flaky test issues when one GPU fails
# https://github.com/pytorch/test-infra/issues/4000
GPU_COUNT=$(nvidia-smi --list-gpus | wc -l)
NVIDIA_SMI_STATUS=$?
# # For runners with multiple GPUs, we also want to confirm that the number of GPUs is a
# # power of 2, i.e. 1, 2, 4, or 8. This is to avoid flaky test issues when one GPU fails
# # https://github.com/pytorch/test-infra/issues/4000
# GPU_COUNT=$(nvidia-smi --list-gpus | wc -l)
# NVIDIA_SMI_STATUS=$?
# These are acceptable return codes from nvidia-smi, as copied from the setup-nvidia GitHub action
if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
echo "NVIDIA driver installation has failed, shutting down the runner..."
.github/scripts/stop_runner_service.sh
fi
# # These are acceptable return codes from nvidia-smi, as copied from the setup-nvidia GitHub action
# if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
# echo "NVIDIA driver installation has failed, shutting down the runner..."
# .github/scripts/stop_runner_service.sh
# fi
# Check that the GPU count is a power of 2
if [ "$GPU_COUNT" -le 8 ] && [ "$GPU_COUNT" -ne 1 ] && [ "$GPU_COUNT" -ne 2 ] && [ "$GPU_COUNT" -ne 4 ] && [ "$GPU_COUNT" -ne 8 ]; then
echo "NVIDIA driver detects $GPU_COUNT GPUs. The runner has a broken GPU, shutting it down..."
.github/scripts/stop_runner_service.sh
fi
# # Check that the GPU count is a power of 2
# if [ "$GPU_COUNT" -le 8 ] && [ "$GPU_COUNT" -ne 1 ] && [ "$GPU_COUNT" -ne 2 ] && [ "$GPU_COUNT" -ne 4 ] && [ "$GPU_COUNT" -ne 8 ]; then
# echo "NVIDIA driver detects $GPU_COUNT GPUs. The runner has a broken GPU, shutting it down..."
# .github/scripts/stop_runner_service.sh
# fi
- name: Cleanup docker
if: always() && inputs.build-environment == 'linux-s390x-binary-manywheel'
shell: bash
run: |
# on s390x, stop the container so the worker stops cleanly
docker stop -a || true
docker kill -a || true
# - name: Cleanup docker
# if: always() && inputs.build-environment == 'linux-s390x-binary-manywheel'
# shell: bash
# run: |
# # on s390x, stop the container so the worker stops cleanly
# docker stop -a || true
# docker kill -a || true