mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Use linux.c7i.2xlarge as the default runner for the _linux-build.yml workflow. In testing we found that switching from c5 to c7i gives 15-20% faster build times, even though c7i costs 5% more. This should reduce the cost of jobs using _linux-build.yml. Relates to pytorch/test-infra#7175. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164747 Approved by: https://github.com/atalman
470 lines
19 KiB
YAML
470 lines
19 KiB
YAML
name: linux-build

# Reusable workflow: it only runs when invoked by another workflow through
# `workflow_call`. Every input below uses the same key order
# (description, required, type, default) to keep the interface easy to scan.
on:
  workflow_call:
    inputs:
      build-environment:
        description: Top-level label for what's being built/tested.
        required: true
        type: string
      docker-image-name:
        description: Name of the base docker image to build with.
        required: true
        type: string
      build-generates-artifacts:
        description: If set, upload generated build artifacts.
        required: false
        type: boolean
        default: true
      sync-tag:
        description: |
          If this is set, our linter will use this to make sure that every other
          job with the same `sync-tag` is identical.
        required: false
        type: string
        default: ""
      cuda-arch-list:
        description: |
          List of CUDA architectures CI build should target.
        required: false
        type: string
        default: "5.2"
      runner_prefix:
        description: Prefix for runner label
        required: false
        type: string
        default: ""
      runner:
        description: |
          Label of the runner this job should run on.
        required: false
        type: string
        default: "linux.c7i.2xlarge"
      test-matrix:
        description: |
          An optional JSON description of what test configs to run later on. This
          is moved here from the Linux test workflow so that we can apply filter
          logic using test-config labels earlier and skip unnecessary builds
        required: false
        type: string
      selected-test-configs:
        description: |
          A comma-separated list of test configurations from the test matrix to keep.
          The empty list means we are going to keep every configuration by default
        required: false
        type: string
        default: ""
      s3-bucket:
        description: S3 bucket to download artifact
        required: false
        type: string
        default: "gha-artifacts"
      aws-role-to-assume:
        description: Role to assume for downloading artifacts
        required: false
        type: string
        default: ""
      disable-monitor:
        description: |
          Disable utilization monitoring for build job
        required: false
        type: boolean
        default: false
      monitor-log-interval:
        description: |
          Set the interval for the monitor script to log utilization.
        required: false
        type: number
        default: 5
      monitor-data-collect-interval:
        description: |
          Set the interval for the monitor script to collect data.
        required: false
        type: number
        default: 1
      allow-reuse-old-whl:
        description: |
          If set, the build tries to pull an old wheel from s3 that was built on a
          commit with no cpp changes from this commit
        required: false
        type: boolean
        default: true
      build-additional-packages:
        description: |
          If set, the build job will also build these packages and save their
          wheels as artifacts
        required: false
        type: string
        default: ""
      build-external-packages:
        description: |
          If set, build the external packages and save their wheels as artifacts.
          Use a comma-separated list of packages to build, e.g. 'vllm,transformers'.
        required: false
        type: string
        default: ""

    secrets:
      HUGGING_FACE_HUB_TOKEN:
        description: |
          HF Auth token to avoid rate limits when downloading models or datasets from hub
        required: false
      SCRIBE_GRAPHQL_ACCESS_TOKEN:
        description: |
          FB app token to write to scribe endpoint
        required: false

    # Values re-exported from the `build` job so callers can chain test jobs.
    outputs:
      docker-image:
        description: The docker image containing the built PyTorch.
        value: ${{ jobs.build.outputs.docker-image }}
      test-matrix:
        description: An optional JSON description of what test configs to run later on.
        value: ${{ jobs.build.outputs.test-matrix }}
|
|
|
|
jobs:
  build:
    # Don't run on forked repos
    if: github.repository_owner == 'pytorch'
    # Deployment environment chosen from the triggering ref:
    #   - main and release/* branches -> 'scribe-protected'
    #   - pull requests labelled 'ci-scribe' -> 'scribe-pr'
    #   - anything else -> '' (no environment)
    environment: ${{ github.ref == 'refs/heads/main' && 'scribe-protected' || startsWith(github.ref, 'refs/heads/release/') && 'scribe-protected' || contains(github.event.pull_request.labels.*.name, 'ci-scribe') && 'scribe-pr' || '' }}
    # The runner label is the optional prefix concatenated with the runner input.
    runs-on: ${{ inputs.runner_prefix}}${{ inputs.runner }}
    timeout-minutes: 480
    outputs:
      docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
      test-matrix: ${{ steps.filter.outputs.test-matrix }}
    steps:
|
|
# Skipped for the s390x manywheel build environment.
- name: Setup SSH (Click me for login details)
  if: inputs.build-environment != 'linux-s390x-binary-manywheel'
  uses: pytorch/test-infra/.github/actions/setup-ssh@main
  with:
    github-secret: ${{ secrets.GITHUB_TOKEN }}
    instructions: |
      Build is done inside the container, to start an interactive session run:
      docker exec -it $(docker container ps --format '{{.ID}}') bash
|
|
|
|
# [pytorch repo ref]
# Use a pytorch/pytorch reference instead of a reference to the local
# checkout because when we run this action we don't *have* a local
# checkout. In other cases you should prefer a local checkout.
- name: Checkout PyTorch
  uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
  with:
    no-sudo: true
|
|
|
|
# Local composite action; not run for the s390x manywheel build environment.
- name: Setup Linux
  if: inputs.build-environment != 'linux-s390x-binary-manywheel'
  uses: ./.github/actions/setup-linux
|
|
|
|
# Only runs when the caller supplied a role to assume (and never on the
# s390x manywheel build environment).
- name: configure aws credentials
  if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
  uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
  with:
    aws-region: us-east-1
    role-to-assume: ${{ inputs.aws-role-to-assume }}
    role-session-name: gha-linux-build
|
|
|
|
# Runs unconditionally (`always()`); its `job-id` / `job-name` outputs are
# consumed by several later steps.
- name: Get workflow job id
  id: get-job-id
  if: always()
  uses: ./.github/actions/get-workflow-job-id
  with:
    github-token: ${{ secrets.GITHUB_TOKEN }}
|
|
|
|
# Gated on the `allow-reuse-old-whl` input. Later steps read this step's
# `reuse` output and skip the real build when it equals 'true'.
- name: Check if can use old whl build
  id: use-old-whl
  if: ${{ inputs.allow-reuse-old-whl }}
  uses: ./.github/actions/reuse-old-whl
  with:
    build-environment: ${{ inputs.build-environment }}
    github-token: ${{ secrets.GITHUB_TOKEN }}
    run-id: ${{ github.run_id }}
    job-id: ${{ steps.get-job-id.outputs.job-id }}
    job-name: ${{ steps.get-job-id.outputs.job-name }}
|
|
|
|
# Produces the `docker-image` output that the job re-exports and that the
# pull and build steps consume. Skipped for the s390x manywheel environment.
- name: Calculate docker image
  id: calculate-docker-image
  if: inputs.build-environment != 'linux-s390x-binary-manywheel'
  uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
  with:
    docker-image-name: ${{ inputs.docker-image-name }}
|
|
|
|
# Informational only: prints a `docker pull` command for the ghcr.io copy of
# the CI image, using the tag taken from the calculated image reference.
- name: Use following to pull public copy of the image
  id: print-ghcr-mirror
  if: inputs.build-environment != 'linux-s390x-binary-manywheel' && steps.use-old-whl.outputs.reuse != 'true'
  shell: bash
  env:
    ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
  run: |
    # Keep only what follows the last ':' of the image reference.
    tag=${ECR_DOCKER_IMAGE##*:}
    echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"
|
|
|
|
# Not needed when an old wheel is being reused or on the s390x manywheel
# build environment.
- name: Pull docker image
  if: inputs.build-environment != 'linux-s390x-binary-manywheel' && steps.use-old-whl.outputs.reuse != 'true'
  uses: pytorch/test-infra/.github/actions/pull-docker-image@main
  with:
    docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
|
|
|
# The Build step reads this step's `branch` output.
- name: Parse ref
  id: parse-ref
  run: .github/scripts/parse_ref.py
|
|
|
|
# Apply the filter logic to the build step too if the test-config label is already there.
# The `test-matrix` and `is-test-matrix-empty` outputs of this step are used
# by the job outputs and by the Build step's condition respectively.
- name: Select all requested test configurations (if the test matrix is available)
  id: filter
  uses: ./.github/actions/filter-test-configs
  with:
    github-token: ${{ secrets.GITHUB_TOKEN }}
    job-name: ${{ steps.get-job-id.outputs.job-name }}
    test-matrix: ${{ inputs.test-matrix }}
    selected-test-configs: ${{ inputs.selected-test-configs }}
|
|
|
|
# Best-effort (continue-on-error): starts `tools.stats.monitor` in the
# background and publishes its PID as the `monitor-script-pid` step output so
# the "Stop monitoring script" step can terminate it.
- name: Start monitoring script
  id: monitor-script
  if: ${{ !inputs.disable-monitor }}
  continue-on-error: true
  shell: bash
  env:
    JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
    JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
    WORKFLOW_NAME: ${{ github.workflow }}
    WORKFLOW_RUN_ID: ${{github.run_id}}
    MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
    MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
  run: |
    # Logs live two directories above the working directory.
    mkdir -p ../../usage_logs
    python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7
    # Run in the background; stdout and stderr both go to the per-job log file.
    python3 -m tools.stats.monitor \
      --log-interval "$MONITOR_LOG_INTERVAL" \
      --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" \
      > "../../usage_logs/usage_log_build_${JOB_ID}.txt" 2>&1 &
    # $! is the PID of the background process just launched.
    echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
|
|
|
|
# Best-effort cache download; a failure here does not fail the job.
- name: Download pytest cache
  if: inputs.build-environment != 'linux-s390x-binary-manywheel' && steps.use-old-whl.outputs.reuse != 'true'
  continue-on-error: true
  uses: ./.github/actions/pytest-cache-download
  with:
    cache_dir: .pytest_cache
    job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
    s3_bucket: ${{ inputs.s3-bucket }}
|
|
|
|
# Main build step. Runs when there is something to test (non-empty filtered
# matrix, or no matrix supplied at all) and no old wheel is being reused.
# It starts a detached container from the CI image, runs
# `.ci/pytorch/build.sh` inside it, and reports the elapsed seconds as the
# `build_time` step output.
- name: Build
  id: build
  if: (steps.filter.outputs.is-test-matrix-empty == 'False' || inputs.test-matrix == '') && steps.use-old-whl.outputs.reuse != 'true'
  env:
    BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
    BRANCH: ${{ steps.parse-ref.outputs.branch }}
    PR_NUMBER: ${{ github.event.pull_request.number }}
    SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
    # Do not set SCCACHE_S3_KEY_PREFIX to share the cache between all build jobs
    SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
    SCCACHE_REGION: us-east-1
    XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
    PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
    TORCH_CUDA_ARCH_LIST: ${{ inputs.cuda-arch-list }}
    DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
    DOCKER_IMAGE_S390X: ${{ inputs.docker-image-name }}
    XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
    OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
    HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
    SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
    BUILD_ADDITIONAL_PACKAGES: ${{ inputs.build-additional-packages }}
    RUNNER: ${{ inputs.runner }}
  run: |
    START_TIME=$(date +%s)
    if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
      JENKINS_USER=
      USED_IMAGE="${DOCKER_IMAGE_S390X}"
      # ensure that docker container cleanly exits in 12 hours
      # if for some reason cleanup action doesn't stop container
      # when job is cancelled
      DOCKER_SHELL_CMD="sleep 12h"

      # since some steps are skipped on s390x, if they are necessary, run them here
      env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
      env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
    else
      JENKINS_USER="--user jenkins"
      USED_IMAGE="${DOCKER_IMAGE}"
      DOCKER_SHELL_CMD=
    fi

    # Leaving 1GB for the runner and other things
    TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo)
    # https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap
    # comes from https://github.com/pytorch/test-infra/pull/6058
    TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3))

    if [[ ${BUILD_ENVIRONMENT} == *"riscv64"* ]]; then
      # EC2 specific setup for RISC-V emulation
      # Ensure binfmt_misc is available
      echo "Mounting binfmt_misc filesystem"
      sudo mount binfmt_misc -t binfmt_misc /proc/sys/fs/binfmt_misc 2>/dev/null || true

      echo "QEMU registration: multiarch/qemu-user-static"
      docker run --rm --privileged multiarch/qemu-user-static --reset -p yes || true

      # Final verification
      echo "Checking binfmt_misc status:"
      ls -la /proc/sys/fs/binfmt_misc/ 2>/dev/null || echo "Cannot access binfmt_misc directory"

      if [ -f /proc/sys/fs/binfmt_misc/qemu-riscv64 ]; then
        echo "qemu-riscv64 registration successful"
      else
        echo "qemu-riscv64 registration failed - proceeding without emulation"
        echo "This may cause RISC-V builds to fail"
      fi

      RISCV_DOCKER_ARGS="--privileged"
    else
      RISCV_DOCKER_ARGS=
    fi

    # detached container should get cleaned up by teardown_ec2_linux
    # Used for JENKINS_USER and DOCKER_SHELL_CMD, which can be empty
    # shellcheck disable=SC2086
    container_name=$(docker run \
      ${RISCV_DOCKER_ARGS} \
      -e BUILD_ENVIRONMENT \
      -e MAX_JOBS="$(nproc --ignore=2)" \
      -e PR_NUMBER \
      -e SHA1 \
      -e BRANCH \
      -e SCCACHE_BUCKET \
      -e SCCACHE_REGION \
      -e XLA_CUDA \
      -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
      -e SKIP_SCCACHE_INITIALIZATION=1 \
      -e TORCH_CUDA_ARCH_LIST \
      -e PR_LABELS \
      -e OUR_GITHUB_JOB_ID \
      -e HUGGING_FACE_HUB_TOKEN \
      -e SCRIBE_GRAPHQL_ACCESS_TOKEN \
      -e BUILD_ADDITIONAL_PACKAGES \
      -e RUNNER \
      --memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \
      --memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \
      --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
      --security-opt seccomp=unconfined \
      --cap-add=SYS_PTRACE \
      --tty \
      --detach \
      ${JENKINS_USER} \
      -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
      -w /var/lib/jenkins/workspace \
      "${USED_IMAGE}" \
      ${DOCKER_SHELL_CMD}
    )

    if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
      docker exec -t "${container_name}" sh -c "python3 -m pip install -r requirements.txt"
    fi

    docker exec -t "${container_name}" sh -c '.ci/pytorch/build.sh'

    END_TIME=$(date +%s)
    echo "build_time=$((END_TIME - START_TIME))" >> "$GITHUB_OUTPUT"
|
|
|
|
# Only runs when the caller asked for external packages and the main Build
# step was not skipped. Its `output_dir` output is read by the next step.
- name: Build external packages
  id: build-external-packages
  if: inputs.build-external-packages != '' && steps.build.outcome != 'skipped'
  uses: ./.github/actions/build-external-packages
  with:
    build-targets: ${{ inputs.build-external-packages }}
    cuda-arch-list: ${{ inputs.cuda-arch-list }}
    docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
    output-dir: external
|
|
|
|
# Relocates the external-package output directory under dist/ (keeping its
# parent path) so it is picked up when dist/ is archived.
- name: Move external packages to dist
  if: steps.build-external-packages.outputs.output_dir != '' && steps.build-external-packages.outcome != 'skipped'
  shell: bash
  env:
    # Pass the step output through the environment instead of expanding a
    # GitHub expression inside the script text, so the value is always
    # treated as data and can never be parsed as shell code.
    EXTERNAL_OUTPUT_DIR: ${{ steps.build-external-packages.outputs.output_dir }}
  run: |
    src="${EXTERNAL_OUTPUT_DIR}"
    # Nothing to do if the action reported a directory that does not exist.
    if [ -d "$src" ]; then
      mkdir -p "dist/$(dirname "$src")"
      mv "$src" "dist/$(dirname "$src")/"
    fi
|
|
|
|
# Runs even after failures, but only if the monitor step recorded a PID.
# Best-effort: a failed `kill` does not fail the job.
- name: Stop monitoring script
  if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
  continue-on-error: true
  shell: bash
  env:
    MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
  run: |
    kill "$MONITOR_SCRIPT_PID"
|
|
|
|
# Packs the build outputs into artifacts.zip for the upload steps below.
# Skipped when the build did not run or an old wheel was reused.
- name: Archive artifacts into zip
  if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && steps.use-old-whl.outputs.reuse != 'true'
  run: |
    zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .additional_ci_files
|
|
|
|
# S3 upload path, used for every build environment except the s390x
# manywheel one. Also runs when an old wheel was reused.
- name: Store PyTorch Build Artifacts on S3
  if: inputs.build-generates-artifacts && (steps.build.outcome != 'skipped' || steps.use-old-whl.outputs.reuse == 'true') && inputs.build-environment != 'linux-s390x-binary-manywheel'
  uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0
  with:
    name: ${{ inputs.build-environment }}
    path: artifacts.zip
    s3-bucket: ${{ inputs.s3-bucket }}
    retention-days: 14
    if-no-files-found: error
|
|
|
|
# Counterpart of the S3 upload for the s390x manywheel build environment,
# which uses GitHub's own artifact storage instead.
- name: Store PyTorch Build Artifacts for s390x
  if: inputs.build-generates-artifacts && (steps.build.outcome != 'skipped' || steps.use-old-whl.outputs.reuse == 'true') && inputs.build-environment == 'linux-s390x-binary-manywheel'
  uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
  with:
    name: ${{ inputs.build-environment }}
    path: artifacts.zip
    retention-days: 14
    if-no-files-found: error
|
|
|
|
# Copies the monitor's usage logs from two directories up into the workspace
# so the upload step can find them. Best-effort (continue-on-error).
- name: copy logs
  if: ${{ always() && steps.build.outcome != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel'}}
  continue-on-error: true
  shell: bash
  run: |
    # usage_logs is a directory, so -r is required: plain `rm -f` errors out
    # on a leftover directory, which would abort this step before the copy.
    rm -rf ./usage_logs
    mkdir -p ./usage_logs
    cp ../../usage_logs/usage_log_build_*.txt ./usage_logs/
|
|
|
|
# Uploads the copied usage logs under a per-run / per-attempt S3 prefix.
# A missing log only produces a warning.
- name: Upload raw usage log to s3
  if: ${{ always() && steps.build.outcome != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel'}}
  # Pinned to the same commit as the build-artifact upload step in this
  # workflow, rather than the mutable `v5` tag, so both uses of the action
  # resolve to identical, immutable code.
  uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0
  with:
    s3-prefix: |
      ${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
    retention-days: 14
    if-no-files-found: warn
    path: usage_logs/usage_log_build_*.txt
|
|
|
|
# Forwards the `build_time` output measured by the Build step.
- name: Upload sccache stats
  if: steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel'
  uses: ./.github/actions/upload-sccache-stats
  with:
    build-time: ${{ steps.build.outputs.build_time }}
    github-token: ${{ secrets.GITHUB_TOKEN }}
|
|
|
|
# Best-effort upload of utilization data; `artifact_prefix` matches the
# file name the monitor step writes its log to.
- name: Upload utilization stats
  if: ${{ always() && steps.build.outcome != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
  continue-on-error: true
  uses: ./.github/actions/upload-utilization-stats
  with:
    job_id: ${{ steps.get-job-id.outputs.job-id }}
    job_name: ${{ steps.get-job-id.outputs.job-name }}
    workflow_name: ${{ github.workflow }}
    workflow_run_id: ${{github.run_id}}
    workflow_attempt: ${{github.run_attempt}}
    artifact_prefix: usage_log_build_${{ steps.get-job-id.outputs.job-id }}
|
|
|
|
# Always runs, except on the s390x manywheel build environment, which has
# its own cleanup step.
- name: Teardown Linux
  if: always() && inputs.build-environment != 'linux-s390x-binary-manywheel'
  uses: pytorch/test-infra/.github/actions/teardown-linux@main
|
|
|
|
# s390x-only replacement for the teardown action: stop whatever containers
# are still running so the worker can shut down cleanly.
- name: Cleanup docker
  if: always() && inputs.build-environment == 'linux-s390x-binary-manywheel'
  shell: bash
  run: |
    # on s390x stop the container for clean worker stop
    # `docker stop` and `docker kill` take container IDs and have no `-a`
    # flag, so the IDs of running containers are listed explicitly.
    # Everything here stays best-effort (`|| true`).
    mapfile -t running < <(docker ps -q)
    if (( ${#running[@]} )); then
      docker stop "${running[@]}" || true
    fi

    # Force-kill anything that did not stop gracefully.
    mapfile -t running < <(docker ps -q)
    if (( ${#running[@]} )); then
      docker kill "${running[@]}" || true
    fi
|