pytorch/.github/workflows/_linux-build.yml

name: linux-build

on:
  workflow_call:
    inputs:
      build-environment:
        required: true
        type: string
        description: Top-level label for what's being built/tested.
      docker-image-name:
        required: true
        type: string
        description: Name of the base docker image to build with.
      build-generates-artifacts:
        required: false
        type: boolean
        default: true
        description: If set, upload generated build artifacts.
      sync-tag:
        required: false
        type: string
        default: ""
        description: |
          If this is set, our linter will use this to make sure that every other
          job with the same `sync-tag` is identical.
      cuda-arch-list:
        required: false
        type: string
        default: "5.2"
        description: |
          List of CUDA architectures CI build should target.
      runner_prefix:
        required: false
        default: ""
        type: string
        description: Prefix for runner label
      runner:
        required: false
        type: string
        default: "linux.c7i.2xlarge"
        description: |
          Label of the runner this job should run on.
      test-matrix:
        required: false
        type: string
        description: |
          An option JSON description of what test configs to run later on. This
          is moved here from the Linux test workflow so that we can apply filter
          logic using test-config labels earlier and skip unnecessary builds
      selected-test-configs:
        description: |
          A comma-separated list of test configurations from the test matrix to keep,
          The empty list means we are going to keep every configurations by defaults
        required: false
        type: string
        default: ""
      s3-bucket:
        description: S3 bucket to download artifact
        required: false
        type: string
        default: "gha-artifacts"
      aws-role-to-assume:
        description: Role to assume for downloading artifacts
        required: false
        type: string
        default: ""
      disable-monitor:
        description: |
          Disable utilization monitoring for build job
        required: false
        type: boolean
        default: false
      monitor-log-interval:
        description: |
          Set the interval for the monitor script to log utilization.
        required: false
        type: number
        default: 5
      monitor-data-collect-interval:
        description: |
          Set the interval for the monitor script to collect data.
        required: false
        type: number
        default: 1
      allow-reuse-old-whl:
        description: |
          If set, the build try to pull an old wheel from s3 that was built on a
          commit with no cpp changes from this commit
        required: false
        type: boolean
        default: true
      build-additional-packages:
        description: |
          If set, the build job will also builds these packages and saves their
          wheels as artifacts
        required: false
        type: string
        default: ""
      build-external-packages:
        description: |
          If set, the build external packages and saves their wheels as artifacts
          use command separated list of packages to build ex: 'vllm,transformers'.
        required: false
        type: string
        default: ""

    secrets:
      HUGGING_FACE_HUB_TOKEN:
        required: false
        description: |
          HF Auth token to avoid rate limits when downloading models or datasets from hub
      SCRIBE_GRAPHQL_ACCESS_TOKEN:
        required: false
        description: |
          FB app token to write to scribe endpoint

    outputs:
      docker-image:
        value: ${{ jobs.build.outputs.docker-image }}
        description: The docker image containing the built PyTorch.
      test-matrix:
        value: ${{ jobs.build.outputs.test-matrix }}
        description: An optional JSON description of what test configs to run later on.

jobs:
  build:
    environment: ${{ github.ref == 'refs/heads/main' && 'scribe-protected' || startsWith(github.ref, 'refs/heads/release/') && 'scribe-protected' || contains(github.event.pull_request.labels.*.name, 'ci-scribe') && 'scribe-pr' || '' }}
    # Don't run on forked repos
    if: github.repository_owner == 'pytorch'
    runs-on: ${{ inputs.runner_prefix}}${{ inputs.runner }}
    timeout-minutes: 480
    outputs:
      docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
      test-matrix: ${{ steps.filter.outputs.test-matrix }}
    steps:
      - name: Setup SSH (Click me for login details)
        uses: pytorch/test-infra/.github/actions/setup-ssh@main
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
          instructions: |
            Build is done inside the container, to start an interactive session run:
              docker exec -it $(docker container ps --format '{{.ID}}') bash

      # [pytorch repo ref]
      # Use a pytorch/pytorch reference instead of a reference to the local
      # checkout because when we run this action we don't *have* a local
      # checkout. In other cases you should prefer a local checkout.
      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
        with:
          no-sudo: true

      - name: Setup Linux
        uses: ./.github/actions/setup-linux
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'

      - name: configure aws credentials
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
        with:
          role-to-assume: ${{ inputs.aws-role-to-assume }}
          role-session-name: gha-linux-build
          aws-region: us-east-1

      - name: Get workflow job id
        id: get-job-id
        uses: ./.github/actions/get-workflow-job-id
        if: always()
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}

      - name: Check if can use old whl build
        id: use-old-whl
        uses: ./.github/actions/reuse-old-whl
        if: ${{ inputs.allow-reuse-old-whl }}
        with:
          build-environment: ${{ inputs.build-environment }}
          run-id: ${{ github.run_id }}
          github-token: ${{ secrets.GITHUB_TOKEN }}
          job-id: ${{ steps.get-job-id.outputs.job-id }}
          job-name: ${{ steps.get-job-id.outputs.job-name }}

      - name: Calculate docker image
        id: calculate-docker-image
        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        with:
          docker-image-name: ${{ inputs.docker-image-name }}

      - name: Use following to pull public copy of the image
        id: print-ghcr-mirror
        if: inputs.build-environment != 'linux-s390x-binary-manywheel' && steps.use-old-whl.outputs.reuse != 'true'
        env:
          ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
        shell: bash
        run: |
          tag=${ECR_DOCKER_IMAGE##*:}
          echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"

      - name: Pull docker image
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
        if: inputs.build-environment != 'linux-s390x-binary-manywheel' && steps.use-old-whl.outputs.reuse != 'true'
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

      - name: Parse ref
        id: parse-ref
        run: .github/scripts/parse_ref.py

      # Apply the filter logic to the build step too if the test-config label is already there
      - name: Select all requested test configurations (if the test matrix is available)
        id: filter
        uses: ./.github/actions/filter-test-configs
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          test-matrix: ${{ inputs.test-matrix }}
          selected-test-configs: ${{ inputs.selected-test-configs }}
          job-name: ${{ steps.get-job-id.outputs.job-name }}

      - name: Start monitoring script
        id: monitor-script
        if: ${{ !inputs.disable-monitor }}
        shell: bash
        continue-on-error: true
        env:
          JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
          JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
          WORKFLOW_NAME: ${{ github.workflow }}
          WORKFLOW_RUN_ID: ${{github.run_id}}
          MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
          MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
        run: |
          mkdir -p ../../usage_logs
          python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7
          python3 -m tools.stats.monitor \
          --log-interval "$MONITOR_LOG_INTERVAL" \
          --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" \
          > "../../usage_logs/usage_log_build_${JOB_ID}.txt" 2>&1 &
          echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"

      - name: Download pytest cache
        uses: ./.github/actions/pytest-cache-download
        continue-on-error: true
        if: inputs.build-environment != 'linux-s390x-binary-manywheel' && steps.use-old-whl.outputs.reuse != 'true'
        with:
          cache_dir: .pytest_cache
          job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
          s3_bucket: ${{ inputs.s3-bucket }}

      - name: Build
        if: (steps.filter.outputs.is-test-matrix-empty == 'False' || inputs.test-matrix == '') && steps.use-old-whl.outputs.reuse != 'true'
        id: build
        env:
          BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
          # Do not set SCCACHE_S3_KEY_PREFIX to share the cache between all build jobs
          SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
          SCCACHE_REGION: us-east-1
          XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
          PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
          TORCH_CUDA_ARCH_LIST: ${{ inputs.cuda-arch-list }}
          DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
          DOCKER_IMAGE_S390X: ${{ inputs.docker-image-name }}
          XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
          OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
          BUILD_ADDITIONAL_PACKAGES: ${{ inputs.build-additional-packages }}
          RUNNER: ${{ inputs.runner }}
        run: |
          START_TIME=$(date +%s)
          if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
            JENKINS_USER=
            USED_IMAGE="${DOCKER_IMAGE_S390X}"
            # ensure that docker container cleanly exits in 12 hours
            # if for some reason cleanup action doesn't stop container
            # when job is cancelled
            DOCKER_SHELL_CMD="sleep 12h"

            # since some steps are skipped on s390x, if they are necessary, run them here
            env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
            env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
          else
            JENKINS_USER="--user jenkins"
            USED_IMAGE="${DOCKER_IMAGE}"
            DOCKER_SHELL_CMD=
          fi

          # Leaving 1GB for the runner and other things
          TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo)
          # https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap
          # comes from https://github.com/pytorch/test-infra/pull/6058
          TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3))

          if [[ ${BUILD_ENVIRONMENT} == *"riscv64"* ]]; then
            # EC2 specific setup for RISC-V emulation
            # Ensure binfmt_misc is available
            echo "Mounting binfmt_misc filesystem"
            sudo mount binfmt_misc -t binfmt_misc /proc/sys/fs/binfmt_misc 2>/dev/null || true

            echo "QEMU registration: multiarch/qemu-user-static"
            docker run --rm --privileged multiarch/qemu-user-static --reset -p yes || true

            # Final verification
            echo "Checking binfmt_misc status:"
            ls -la /proc/sys/fs/binfmt_misc/ 2>/dev/null || echo "Cannot access binfmt_misc directory"

            if [ -f /proc/sys/fs/binfmt_misc/qemu-riscv64 ]; then
              echo "qemu-riscv64 registration successful"
            else
              echo "qemu-riscv64 registration failed - proceeding without emulation"
              echo "This may cause RISC-V builds to fail"
            fi

            RISCV_DOCKER_ARGS="--privileged"
          else
            RISCV_DOCKER_ARGS=
          fi

          # detached container should get cleaned up by teardown_ec2_linux
          # Used for JENKINS_USER and DOCKER_SHELL_CMD, which can be empty
          # shellcheck disable=SC2086
          container_name=$(docker run \
            ${RISCV_DOCKER_ARGS} \
            -e BUILD_ENVIRONMENT \
            -e MAX_JOBS="$(nproc --ignore=2)" \
            -e PR_NUMBER \
            -e SHA1 \
            -e BRANCH \
            -e SCCACHE_BUCKET \
            -e SCCACHE_REGION \
            -e XLA_CUDA \
            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
            -e SKIP_SCCACHE_INITIALIZATION=1 \
            -e TORCH_CUDA_ARCH_LIST \
            -e PR_LABELS \
            -e OUR_GITHUB_JOB_ID \
            -e HUGGING_FACE_HUB_TOKEN \
            -e SCRIBE_GRAPHQL_ACCESS_TOKEN \
            -e BUILD_ADDITIONAL_PACKAGES \
            -e RUNNER \
            --memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \
            --memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --tty \
            --detach \
            ${JENKINS_USER} \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${USED_IMAGE}" \
            ${DOCKER_SHELL_CMD}
          )

          if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
            docker exec -t "${container_name}" sh -c "python3 -m pip install -r requirements.txt"
          fi

          docker exec -t "${container_name}" sh -c '.ci/pytorch/build.sh'

          END_TIME=$(date +%s)
          echo "build_time=$((END_TIME - START_TIME))" >> "$GITHUB_OUTPUT"

      - name: Build external packages
        id: build-external-packages
        if: inputs.build-external-packages != '' &&  steps.build.outcome != 'skipped'
        uses: ./.github/actions/build-external-packages
        with:
          build-targets: ${{ inputs.build-external-packages }}
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
          cuda-arch-list: ${{ inputs.cuda-arch-list }}
          output-dir: external

      - name: Move external packages to dist
        if: steps.build-external-packages.outputs.output_dir != '' && steps.build-external-packages.outcome != 'skipped'
        shell: bash
        run: |
          src="${{ steps.build-external-packages.outputs.output_dir }}"
          if [ -d "$src" ]; then
            mkdir -p "dist/$(dirname "$src")"
            mv "$src" "dist/$(dirname "$src")/"
          fi

      - name: Stop monitoring script
        if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
        shell: bash
        continue-on-error: true
        env:
          MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
        run: |
          kill "$MONITOR_SCRIPT_PID"

      - name: Archive artifacts into zip
        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && steps.use-old-whl.outputs.reuse != 'true'
        run: |
          zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .additional_ci_files

      - name: Store PyTorch Build Artifacts on S3
        uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0
        if: inputs.build-generates-artifacts && (steps.build.outcome != 'skipped' || steps.use-old-whl.outputs.reuse == 'true') && inputs.build-environment != 'linux-s390x-binary-manywheel'
        with:
          name: ${{ inputs.build-environment }}
          retention-days: 14
          if-no-files-found: error
          path: artifacts.zip
          s3-bucket: ${{ inputs.s3-bucket }}

      - name: Store PyTorch Build Artifacts for s390x
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        if: inputs.build-generates-artifacts && (steps.build.outcome != 'skipped' || steps.use-old-whl.outputs.reuse == 'true') && inputs.build-environment == 'linux-s390x-binary-manywheel'
        with:
          name: ${{ inputs.build-environment }}
          retention-days: 14
          if-no-files-found: error
          path: artifacts.zip

      - name: copy logs
        shell: bash
        if: ${{ always() && steps.build.outcome != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel'}}
        continue-on-error: true
        run: |
          rm -f ./usage_logs
          mkdir -p ./usage_logs
          cp ../../usage_logs/usage_log_build_*.txt ./usage_logs/

      - name: Upload raw usage log to s3
        if: ${{ always() && steps.build.outcome != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel'}}
        uses: seemethere/upload-artifact-s3@v5
        with:
          s3-prefix: |
            ${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
          retention-days: 14
          if-no-files-found: warn
          path: usage_logs/usage_log_build_*.txt

      - name: Upload sccache stats
        if: steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel'
        uses: ./.github/actions/upload-sccache-stats
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          build-time: ${{ steps.build.outputs.build_time }}

      - name: Upload utilization stats
        if: ${{ always() && steps.build.outcome != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
        continue-on-error: true
        uses: ./.github/actions/upload-utilization-stats
        with:
          job_id: ${{ steps.get-job-id.outputs.job-id }}
          job_name: ${{ steps.get-job-id.outputs.job-name }}
          workflow_name: ${{ github.workflow }}
          workflow_run_id: ${{github.run_id}}
          workflow_attempt: ${{github.run_attempt}}
          artifact_prefix: usage_log_build_${{ steps.get-job-id.outputs.job-id }}

      - name: Teardown Linux
        uses: pytorch/test-infra/.github/actions/teardown-linux@main
        if: always() && inputs.build-environment != 'linux-s390x-binary-manywheel'

      - name: Cleanup docker
        if: always() && inputs.build-environment == 'linux-s390x-binary-manywheel'
        shell: bash
        run: |
          # on s390x stop the container for clean worker stop
          docker stop -a || true
          docker kill -a || true