# Mirrored from https://github.com/pytorch/pytorch.git (synced 2025-10-20 21:14:14 +08:00)
# Passing CI: https://github.com/pytorch/pytorch/actions/runs/18141589975/job/51635340255?pr=163782
# Pull Request resolved: https://github.com/pytorch/pytorch/pull/163782
# Approved by: https://github.com/huydhn, https://github.com/mikaylagawarecki
# The point of this workflow is to test that a FA3 wheel that was built based off the
# stable ABI as of torch nightly 20250830 can still run on the newer torch.
#
# This workflow is very similar to the _linux-test.yml workflow, with the following
# differences:
# 1. It is simpler (there is no test matrix)
# 2. It pulls flash-attention as a secondary repository in order to access the tests.
#    Note that it does not BUILD anything from flash-attention, as we have a prebuilt
#    wheel. We pull flash-attention only to run a few tests.
# 3. It runs only FA3 tests. No PyTorch tests are run.
name: linux-test-stable-fa3
# Reusable-workflow trigger: callers supply the build label, docker image,
# an optional timeout, and the S3 bucket holding the build artifacts.
on:
  workflow_call:
    inputs:
      build-environment:
        required: true
        type: string
        description: Top-level label for what's being built/tested.
      docker-image:
        required: true
        type: string
        description: Docker image to run in.
      timeout-minutes:
        required: false
        type: number
        default: 30
        description: |
          Set the maximum (in minutes) how long the workflow should take to finish
      s3-bucket:
        description: S3 bucket to download artifact
        required: false
        type: string
        default: "gha-artifacts"
    secrets:
      HUGGING_FACE_HUB_TOKEN:
        required: false
        description: |
          HF Auth token to avoid rate limits when downloading models or datasets from hub
      VLLM_TEST_HUGGING_FACE_TOKEN:
        required: false
        description: |
          HF Auth token to test vllm
      SCRIBE_GRAPHQL_ACCESS_TOKEN:
        required: false
        description: |
          FB app token to write to scribe endpoint
# Workflow-level environment, visible to every job.
env:
  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
jobs:
  test:
    # Don't run on forked repos
    if: github.repository_owner == 'pytorch'
    runs-on: linux.aws.h100
    timeout-minutes: ${{ inputs.timeout-minutes || 30 }}
    permissions:
      id-token: write
      contents: read
    steps:
      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
        with:
          no-sudo: true

      - name: Checkout flash-attention as a secondary repository
        uses: actions/checkout@v4
        with:
          repository: Dao-AILab/flash-attention
          path: flash-attention

      - name: Setup Linux
        uses: ./.github/actions/setup-linux

      - name: Calculate docker image
        id: calculate-docker-image
        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
        with:
          docker-image-name: ${{ inputs.docker-image }}

      - name: Use following to pull public copy of the image
        id: print-ghcr-mirror
        env:
          ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
        shell: bash
        run: |
          tag=${ECR_DOCKER_IMAGE##*:}
          echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"

      - name: Pull docker image
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

      - name: Check if in a container runner
        shell: bash
        id: check_container_runner
        run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"

      - name: Setup GPU_FLAG for docker run
        id: setup-gpu-flag
        run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"

      - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
        id: setup-sscache-port-flag
        run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
        if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}

      - name: Get workflow job id
        id: get-job-id
        uses: ./.github/actions/get-workflow-job-id
        if: always()
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}

      - name: Download build artifacts
        uses: ./.github/actions/download-build-artifacts
        with:
          name: ${{ inputs.build-environment }}
          s3-bucket: ${{ inputs.s3-bucket }}

      - name: Parse ref
        id: parse-ref
        run: .github/scripts/parse_ref.py

      - name: Set Test step time
        id: test-timeout
        shell: bash
        env:
          JOB_TIMEOUT: ${{ inputs.timeout-minutes }}
        run: |
          echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"

      - name: Preserve github env variables for use in docker
        shell: bash
        run: |
          env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
          env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"

      - name: Test
        id: test
        timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
        env:
          BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_WORKFLOW: ${{ github.workflow }}
          GITHUB_JOB: ${{ github.job }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_RUN_NUMBER: ${{ github.run_number }}
          GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
          JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
          JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
          BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
          SHM_SIZE: '2g'
          DOCKER_IMAGE: ${{ inputs.docker-image }}
          VLLM_TEST_HUGGING_FACE_TOKEN: ${{ secrets.VLLM_TEST_HUGGING_FACE_TOKEN }}
          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
          ARTIFACTS_FILE_SUFFIX: ${{ github.job }}-${{ steps.get-job-id.outputs.job-id }}
        run: |
          set -x

          TEST_COMMAND=.ci/pytorch/test_fa3_abi_stable.sh

          # Leaving 1GB for the runner and other things
          TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo)
          # https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap
          # comes from https://github.com/pytorch/test-infra/pull/6058
          TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3))

          SHM_OPTS="--shm-size=${SHM_SIZE}"
          JENKINS_USER="--user jenkins"
          DOCKER_SHELL_CMD=

          # detached container should get cleaned up by teardown_ec2_linux
          # TODO: Stop building test binaries as part of the build phase
          # Used for GPU_FLAG, SHM_OPTS, JENKINS_USER and DOCKER_SHELL_CMD since that doesn't play nice
          # shellcheck disable=SC2086,SC2090
          # NOTE(review): container_name is still unset while this command substitution
          # runs, so --name="${container_name}" appears to expand to --name="" and
          # docker auto-generates the name — confirm this is intended.
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \
            -e BUILD_ENVIRONMENT \
            -e PR_NUMBER \
            -e GITHUB_ACTIONS \
            -e GITHUB_REPOSITORY \
            -e GITHUB_WORKFLOW \
            -e GITHUB_JOB \
            -e GITHUB_RUN_ID \
            -e GITHUB_RUN_NUMBER \
            -e GITHUB_RUN_ATTEMPT \
            -e JOB_ID \
            -e JOB_NAME \
            -e BASE_SHA \
            -e BRANCH \
            -e SHA1 \
            -e MAX_JOBS="$(nproc --ignore=2)" \
            -e HUGGING_FACE_HUB_TOKEN \
            -e VLLM_TEST_HUGGING_FACE_TOKEN \
            -e SCRIBE_GRAPHQL_ACCESS_TOKEN \
            -e ARTIFACTS_FILE_SUFFIX \
            --memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \
            --memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --ipc=host \
            ${SHM_OPTS} \
            --tty \
            --detach \
            --name="${container_name}" \
            ${JENKINS_USER} \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}" \
            ${DOCKER_SHELL_CMD}
          )

          echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"

          docker exec -t "${container_name}" sh -c "python3 -m pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"

      - name: Collect backtraces from coredumps (if any)
        if: always()
        run: |
          # shellcheck disable=SC2156
          find . -iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;

      - name: Store Core dumps on S3
        uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0
        if: failure()
        with:
          name: coredumps-fa3-stable-abi-smoke-tests
          retention-days: 14
          if-no-files-found: ignore
          path: ./**/core.[1-9]*

      - name: Upload utilization stats
        if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' }}
        continue-on-error: true
        uses: ./.github/actions/upload-utilization-stats
        with:
          job_id: ${{ steps.get-job-id.outputs.job-id }}
          job_name: ${{ steps.get-job-id.outputs.job-name }}
          workflow_name: ${{ github.workflow }}
          workflow_run_id: ${{github.run_id}}
          workflow_attempt: ${{github.run_attempt}}

      - name: Teardown Linux
        uses: pytorch/test-infra/.github/actions/teardown-linux@main
        if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false'