From 5b1c39f5a12c4f0c64e84f4a8bc5a40507020529 Mon Sep 17 00:00:00 2001
From: Jane Xu
Date: Tue, 30 Sep 2025 12:44:39 -0700
Subject: [PATCH] Add smoke tests to verify that stable ABI FA3 wheel runs w/
 newer torch (#163782)

Passing CI: https://github.com/pytorch/pytorch/actions/runs/18141589975/job/51635340255?pr=163782

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163782
Approved by: https://github.com/huydhn, https://github.com/mikaylagawarecki
---
 .ci/pytorch/test_fa3_abi_stable.sh           |  32 +++
 .github/workflows/_linux-test-stable-fa3.yml | 255 +++++++++++++++++++
 .github/workflows/test-h100.yml              |  12 +
 3 files changed, 299 insertions(+)
 create mode 100755 .ci/pytorch/test_fa3_abi_stable.sh
 create mode 100644 .github/workflows/_linux-test-stable-fa3.yml

diff --git a/.ci/pytorch/test_fa3_abi_stable.sh b/.ci/pytorch/test_fa3_abi_stable.sh
new file mode 100755
index 000000000000..ff71e9887293
--- /dev/null
+++ b/.ci/pytorch/test_fa3_abi_stable.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+set -ex -o pipefail
+
+# Suppress ANSI color escape sequences
+export TERM=vt100
+
+# shellcheck source=./common.sh
+source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
+# shellcheck source=./common-build.sh
+source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"
+
+echo "Environment variables"
+env
+
+echo "Testing FA3 stable wheel still works with currently built torch"
+
+echo "Installing ABI Stable FA3 wheel"
+# The wheel was built on https://github.com/Dao-AILab/flash-attention/commit/b3846b059bf6b143d1cd56879933be30a9f78c81
+# on torch nightly torch==2.9.0.dev20250830+cu129
+$MAYBE_SUDO pip -q install https://s3.amazonaws.com/ossci-linux/wheels/flash_attn_3-3.0.0b1-cp39-abi3-linux_x86_64.whl
+
+pushd flash-attention/hopper
+export PYTHONPATH=$PWD
+pytest -v -s \
+    "test_flash_attn.py::test_flash_attn_output[1-1-192-False-False-False-0.0-False-False-mha-dtype0]" \
+    "test_flash_attn.py::test_flash_attn_varlen_output[511-1-64-True-False-False-0.0-False-False-gqa-dtype2]" \
+    "test_flash_attn.py::test_flash_attn_kvcache[1-128-128-False-False-True-None-0.0-False-False-True-False-True-False-gqa-dtype0]" \
+    "test_flash_attn.py::test_flash_attn_race_condition[97-97-192-True-dtype0]" \
+    "test_flash_attn.py::test_flash_attn_combine[2-3-64-dtype1]" \
+    "test_flash_attn.py::test_flash3_bw_compatibility"
+popd
diff --git a/.github/workflows/_linux-test-stable-fa3.yml b/.github/workflows/_linux-test-stable-fa3.yml
new file mode 100644
index 000000000000..63a9e7359ed2
--- /dev/null
+++ b/.github/workflows/_linux-test-stable-fa3.yml
@@ -0,0 +1,255 @@
+# The point of this workflow is to test that a FA3 wheel that was built based off the
+# stable ABI as of torch nightly 20250830 can still run on the newer torch.
+#
+# This workflow is very similar to the _linux-test.yml workflow, with the following
+# differences:
+# 1. It is simpler (there is no test matrix)
+# 2. It pulls flash-attention as a secondary repository in order to access the tests.
+#    Note that it does not BUILD anything from flash-attention, as we have a prebuilt
+#    wheel. We pull flash-attention only to run a few tests.
+# 3. It runs only FA3 tests. No PyTorch tests are run.
+name: linux-test-stable-fa3
+
+on:
+  workflow_call:
+    inputs:
+      build-environment:
+        required: true
+        type: string
+        description: Top-level label for what's being built/tested.
+      docker-image:
+        required: true
+        type: string
+        description: Docker image to run in.
+      timeout-minutes:
+        required: false
+        type: number
+        default: 30
+        description: |
+          Set the maximum (in minutes) how long the workflow should take to finish
+      s3-bucket:
+        description: S3 bucket to download artifact
+        required: false
+        type: string
+        default: "gha-artifacts"
+    secrets:
+      HUGGING_FACE_HUB_TOKEN:
+        required: false
+        description: |
+          HF Auth token to avoid rate limits when downloading models or datasets from hub
+      VLLM_TEST_HUGGING_FACE_TOKEN:
+        required: false
+        description: |
+          HF Auth token to test vllm
+      SCRIBE_GRAPHQL_ACCESS_TOKEN:
+        required: false
+        description: |
+          FB app token to write to scribe endpoint
+
+env:
+  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
+
+jobs:
+  test:
+    # Don't run on forked repos
+    if: github.repository_owner == 'pytorch'
+    runs-on: linux.aws.h100
+    timeout-minutes: ${{ inputs.timeout-minutes || 30 }}
+    permissions:
+      id-token: write
+      contents: read
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          no-sudo: true
+
+      - name: Checkout flash-attention as a secondary repository
+        uses: actions/checkout@v4
+        with:
+          repository: Dao-AILab/flash-attention
+          path: flash-attention
+
+      - name: Setup Linux
+        uses: ./.github/actions/setup-linux
+
+      - name: Calculate docker image
+        id: calculate-docker-image
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: ${{ inputs.docker-image }}
+
+      - name: Use following to pull public copy of the image
+        id: print-ghcr-mirror
+        env:
+          ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
+        shell: bash
+        run: |
+          tag=${ECR_DOCKER_IMAGE##*:}
+          echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"
+
+      - name: Pull docker image
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        with:
+          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
+
+      - name: Check if in a container runner
+        shell: bash
+        id: check_container_runner
+        run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"
+
+      - name: Setup GPU_FLAG for docker run
+        id: setup-gpu-flag
+        run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
+
+      - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
+        id: setup-sscache-port-flag
+        run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
+        if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}
+
+      - name: Get workflow job id
+        id: get-job-id
+        uses: ./.github/actions/get-workflow-job-id
+        if: always()
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Download build artifacts
+        uses: ./.github/actions/download-build-artifacts
+        with:
+          name: ${{ inputs.build-environment }}
+          s3-bucket: ${{ inputs.s3-bucket }}
+
+      - name: Parse ref
+        id: parse-ref
+        run: .github/scripts/parse_ref.py
+
+      - name: Set Test step time
+        id: test-timeout
+        shell: bash
+        env:
+          JOB_TIMEOUT: ${{ inputs.timeout-minutes }}
+        run: |
+          echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"
+
+      - name: Preserve github env variables for use in docker
+        shell: bash
+        run: |
+          env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
+          env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
+
+      - name: Test
+        id: test
+        timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
+        env:
+          BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          GITHUB_REPOSITORY: ${{ github.repository }}
+          GITHUB_WORKFLOW: ${{ github.workflow }}
+          GITHUB_JOB: ${{ github.job }}
+          GITHUB_RUN_ID: ${{ github.run_id }}
+          GITHUB_RUN_NUMBER: ${{ github.run_number }}
+          GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
+          JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
+          JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
+          BRANCH: ${{ steps.parse-ref.outputs.branch }}
+          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+          BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
+          SHM_SIZE: '2g'
+          DOCKER_IMAGE: ${{ inputs.docker-image }}
+          VLLM_TEST_HUGGING_FACE_TOKEN: ${{ secrets.VLLM_TEST_HUGGING_FACE_TOKEN }}
+          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
+          ARTIFACTS_FILE_SUFFIX: ${{ github.job }}-${{ steps.get-job-id.outputs.job-id }}
+        run: |
+          set -x
+
+          TEST_COMMAND=.ci/pytorch/test_fa3_abi_stable.sh
+
+          # Leaving 1GB for the runner and other things
+          TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo)
+          # https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap
+          # comes from https://github.com/pytorch/test-infra/pull/6058
+          TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3))
+
+
+          SHM_OPTS="--shm-size=${SHM_SIZE}"
+          JENKINS_USER="--user jenkins"
+          DOCKER_SHELL_CMD=
+
+          # detached container should get cleaned up by teardown_ec2_linux
+          # TODO: Stop building test binaries as part of the build phase
+          # Used for GPU_FLAG, SHM_OPTS, JENKINS_USER and DOCKER_SHELL_CMD since that doesn't play nice
+          # shellcheck disable=SC2086,SC2090
+          container_name=$(docker run \
+            ${GPU_FLAG:-} \
+            ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \
+            -e BUILD_ENVIRONMENT \
+            -e PR_NUMBER \
+            -e GITHUB_ACTIONS \
+            -e GITHUB_REPOSITORY \
+            -e GITHUB_WORKFLOW \
+            -e GITHUB_JOB \
+            -e GITHUB_RUN_ID \
+            -e GITHUB_RUN_NUMBER \
+            -e GITHUB_RUN_ATTEMPT \
+            -e JOB_ID \
+            -e JOB_NAME \
+            -e BASE_SHA \
+            -e BRANCH \
+            -e SHA1 \
+            -e MAX_JOBS="$(nproc --ignore=2)" \
+            -e HUGGING_FACE_HUB_TOKEN \
+            -e VLLM_TEST_HUGGING_FACE_TOKEN \
+            -e SCRIBE_GRAPHQL_ACCESS_TOKEN \
+            -e ARTIFACTS_FILE_SUFFIX \
+            --memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \
+            --memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \
+            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
+            --security-opt seccomp=unconfined \
+            --cap-add=SYS_PTRACE \
+            --ipc=host \
+            ${SHM_OPTS} \
+            --tty \
+            --detach \
+            --name="${container_name}" \
+            ${JENKINS_USER} \
+            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
+            -w /var/lib/jenkins/workspace \
+            "${DOCKER_IMAGE}" \
+            ${DOCKER_SHELL_CMD}
+          )
+
+          echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
+
+          docker exec -t "${container_name}" sh -c "python3 -m pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"
+
+      - name: Collect backtraces from coredumps (if any)
+        if: always()
+        run: |
+          # shellcheck disable=SC2156
+          find . -iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;
+
+      - name: Store Core dumps on S3
+        uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0
+        if: failure()
+        with:
+          name: coredumps-fa3-stable-abi-smoke-tests
+          retention-days: 14
+          if-no-files-found: ignore
+          path: ./**/core.[1-9]*
+
+      - name: Upload utilization stats
+        if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' }}
+        continue-on-error: true
+        uses: ./.github/actions/upload-utilization-stats
+        with:
+          job_id: ${{ steps.get-job-id.outputs.job-id }}
+          job_name: ${{ steps.get-job-id.outputs.job-name }}
+          workflow_name: ${{ github.workflow }}
+          workflow_run_id: ${{github.run_id}}
+          workflow_attempt: ${{github.run_attempt}}
+
+      - name: Teardown Linux
+        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false'
diff --git a/.github/workflows/test-h100.yml b/.github/workflows/test-h100.yml
index 1e83c7b9d98c..ec99f4473bb0 100644
--- a/.github/workflows/test-h100.yml
+++ b/.github/workflows/test-h100.yml
@@ -61,3 +61,15 @@ jobs:
       docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build.outputs.docker-image }}
       test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build.outputs.test-matrix }}
     secrets: inherit
+
+  linux-jammy-cuda12_8-py3_10-gcc11-sm90-FA3-ABI-stable-test:
+    name: linux-jammy-cuda12_8-py3_10-gcc11-sm90-FA3-ABI-stable-test
+    uses: ./.github/workflows/_linux-test-stable-fa3.yml
+    needs:
+      - linux-jammy-cuda12_8-py3_10-gcc11-sm90-build
+    with:
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
+      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build.outputs.docker-image }}
+      timeout-minutes: 30
+      s3-bucket: gha-artifacts
+    secrets: inherit