Files
pytorch/.github/templates/linux_ci_workflow.yml.j2
Nikita Shulga d6048ecd6b Enable bazel builds on ciflow/default (#62649)
Summary:
Add `regenerate.sh` convenience script
Remove "TODO: Reenable on PR" label from workflows which are enabled on PRs

Pull Request resolved: https://github.com/pytorch/pytorch/pull/62649

Reviewed By: seemethere

Differential Revision: D30071905

Pulled By: malfet

fbshipit-source-id: c82134cb676b273d23e225be21166588996a31d3
2021-08-03 11:05:41 -07:00

575 lines
23 KiB
Django/Jinja

{# squid_proxy is an private ELB that only available for GHA custom runners #}
{%- set squid_proxy = "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -%}
{# squid_no_proxy is a list of common set of fixed domains or IPs that we don't need to proxy. See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/http_proxy_config.html#windows-proxy #}
{%- set squid_no_proxy = "localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" -%}
{%- block name -%}
# Template is at: .github/templates/linux_ci_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: !{{ build_environment }}
{%- endblock %}
on:
{%- if on_pull_request %}
pull_request:
{%- if ciflow_config.enabled %}
{%- if ciflow_config.trigger_action_only %}
types: [!{{ ciflow_config.trigger_action }}]
{%- else %}
types: [opened, synchronize, reopened, !{{ ciflow_config.trigger_action }}]
{%- endif %}
{%- endif %}
{%- else %}
# TODO: Enable pull_request builds when we can verify capacity can be met by auto-scalers
{%- endif %}
{%- if is_scheduled %}
schedule:
- cron: !{{ is_scheduled }}
{%- else %}
push:
branches:
- master
- release/*
{%- endif %}
workflow_dispatch:
env:
BUILD_ENVIRONMENT: !{{ build_environment }}
DOCKER_IMAGE_BASE: !{{ docker_image_base }}
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
TORCH_CUDA_ARCH_LIST: 5.2
IN_CI: 1
# This is used for the phase of adding wheel tests only, will be removed once completed
IN_WHEEL_TEST: 1
# Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh
CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
concurrency:
group: !{{ build_environment }}-${{ github.event.pull_request.number || github.sha }}
cancel-in-progress: true
jobs:
{%- if ciflow_config.enabled %}
!{{ ciflow_config.root_job_name }}:
runs-on: ubuntu-18.04
if: ${{ !{{ ciflow_config.root_job_condition }} }}
steps:
- name: noop
run: echo running !{{ ciflow_config.root_job_name }}
{%- endif %}
calculate-docker-image:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: linux.2xlarge
{%- if ciflow_config.enabled %}
needs: [!{{ ciflow_config.root_job_name }}]
{%- endif %}
env:
DOCKER_BUILDKIT: 1
timeout-minutes: 90
outputs:
docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
steps:
- name: Log in to ECR
run: |
aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
bash /tmp/ecr-login.sh
rm /tmp/ecr-login.sh
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- name: Clean workspace
run: |
rm -rf "${GITHUB_WORKSPACE:?}/*"
- name: Checkout PyTorch
uses: actions/checkout@v2
with:
# deep clone, to allow use of git merge-base
fetch-depth: 0
- name: Calculate docker image tag
id: calculate-tag
run: |
DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
echo "::set-output name=docker_tag::${DOCKER_TAG}"
echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"
- name: Check if image should be built
id: check
env:
DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker_tag }}
BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }}
run: |
set -x
# Check if image already exists, if it does then skip building it
if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then
exit 0
fi
if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then
# if we're on the base branch then use the parent commit
MERGE_BASE=$(git rev-parse HEAD~)
else
# otherwise we're on a PR, so use the most recent base commit
MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION")
fi
# Covers the case where a previous tag doesn't exist for the tree
# this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly
if ! git rev-parse "$MERGE_BASE:.circleci/docker"; then
echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit"
exit 1
fi
PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker")
# If no image exists but the hash is the same as the previous hash then we should error out here
if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then
echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch"
echo " contact the PyTorch team to restore the original images"
exit 1
fi
echo ::set-output name=rebuild::yes
- name: Build and push docker image
if: ${{ steps.check.outputs.rebuild }}
env:
DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker_tag }}
DOCKER_SKIP_S3_UPLOAD: 1
run: |
export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/}
cd .circleci/docker && ./build_docker.sh
{% block build +%}
build:
runs-on: linux.2xlarge
needs: [calculate-docker-image, !{{ ciflow_config.root_job_name }}]
env:
DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
JOB_BASE_NAME: !{{ build_environment }}-build
steps:
- name: Log in to ECR
run: |
aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
bash /tmp/ecr-login.sh
rm /tmp/ecr-login.sh
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- name: Clean workspace
run: |
rm -rf "${GITHUB_WORKSPACE:?}/*"
- name: Checkout PyTorch
uses: actions/checkout@v2
with:
fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
submodules: recursive
- name: Pull docker image
run: |
docker pull "${DOCKER_IMAGE}"
- name: Preserve github env variables for use in docker
run: |
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Build PyTorch
run: |
docker run \
-e BUILD_ENVIRONMENT \
-e JOB_BASE_NAME \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e SCCACHE_BUCKET \
-e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
-e SKIP_SCCACHE_INITIALIZATION=1 \
-e TORCH_CUDA_ARCH_LIST \
-e http_proxy="!{{squid_proxy}}" -e https_proxy="!{{squid_proxy}}" -e no_proxy="!{{squid_no_proxy}}" \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--tty \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh'
- name: Parse ref
id: parse-ref
run: .github/scripts/parse_ref.py
- name: Display and upload binary build size statistics (Click Me)
# temporary hack: set CIRCLE_* vars, until we update
# tools/stats/print_test_stats.py to natively support GitHub Actions
env:
AWS_DEFAULT_REGION: us-east-1
SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
run: |
COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0)
export COMMIT_TIME
pip3 install requests
python3 -m tools.stats.upload_binary_size_to_scuba || exit 0
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
{%- if not is_libtorch %}
- name: Archive artifacts into zip
run: |
zip -r artifacts.zip dist/ build/ .pytorch-test-times.json
# Upload to github so that people can click and download artifacts
- uses: actions/upload-artifact@v2
# Don't fail on upload to GH since it's only for user convenience
continue-on-error: true
name: Store PyTorch Build Artifacts on Github
with:
name: ${{ env.BUILD_ENVIRONMENT }}
retention-days: 14
if-no-files-found: error
path:
artifacts.zip
- uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
name: Store PyTorch Build Artifacts on S3
with:
name: ${{ env.BUILD_ENVIRONMENT }}
retention-days: 14
if-no-files-found: error
path:
artifacts.zip
{%- endif %}
- name: Clean up docker images
if: always()
run: |
# Prune all of the docker images
docker system prune -af
{%- endblock %}
{%- if not exclude_test %}
{% block test +%}
generate-test-matrix:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: ubuntu-18.04
{%- if ciflow_config.enabled %}
needs: [!{{ ciflow_config.root_job_name }}]
{%- endif %}
env:
TEST_RUNNER_TYPE: !{{ test_runner_type }}
ENABLE_JIT_LEGACY_TEST: !{{ enable_jit_legacy_test }}
ENABLE_MULTIGPU_TEST: !{{ enable_multigpu_test }}
ENABLE_NOGPU_NO_AVX_TEST: !{{ enable_nogpu_no_avx_test }}
ENABLE_NOGPU_NO_AVX2_TEST: !{{ enable_nogpu_no_avx2_test }}
ENABLE_SLOW_TEST: !{{ enable_slow_test }}
NUM_TEST_SHARDS: !{{ num_test_shards }}
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
render-matrix: ${{ steps.set-matrix.outputs.render-matrix }}
container:
image: python:3.9
steps:
- name: Install dependencies
run: pip install typing-extensions
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Generating test matrix
id: set-matrix
run: .github/scripts/generate_pytorch_test_matrix.py
test:
needs: [calculate-docker-image, build, generate-test-matrix, !{{ ciflow_config.root_job_name }}]
strategy:
matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }}
fail-fast: false
runs-on: ${{ matrix.runner }}
env:
DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
JOB_BASE_NAME: !{{ build_environment }}-test
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
steps:
- name: Log in to ECR
run: |
aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
bash /tmp/ecr-login.sh
rm /tmp/ecr-login.sh
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- name: Clean workspace
run: |
rm -rf "${GITHUB_WORKSPACE:?}/*"
- name: Checkout PyTorch
uses: actions/checkout@v2
with:
submodules: recursive
- name: Pull docker image
run: |
docker pull "${DOCKER_IMAGE}"
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }}
run: |
bash .github/scripts/install_nvidia_utils_linux.sh
echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
- name: Determine shm-size
run: |
shm_size="1g"
case "${BUILD_ENVIRONMENT}" in
*cuda*)
shm_size="2g"
;;
*rocm*)
shm_size="8g"
;;
esac
echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
- uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
name: Download PyTorch Build Artifacts
with:
name: ${{ env.BUILD_ENVIRONMENT }}
- name: Unzip artifacts
run: |
unzip -o artifacts.zip
- name: Output disk space left
run: |
sudo df -H
- name: Preserve github env variables for use in docker
run: |
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test PyTorch
run: |
if [[ $TEST_CONFIG == 'multigpu' ]]; then
TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
else
TEST_COMMAND=.jenkins/pytorch/test.sh
fi
if [[ $NUM_TEST_SHARDS -ne 2 ]]; then
export SHARD_NUMBER=0
fi
# TODO: Stop building test binaries as part of the build phase
# Used for GPU_FLAG since that doesn't play nice
# shellcheck disable=SC2086
docker run \
${GPU_FLAG:-} \
-e BUILD_ENVIRONMENT \
-e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
-e GITHUB_ACTIONS \
-e IN_CI \
-e IN_WHEEL_TEST \
-e SHARD_NUMBER \
-e JOB_BASE_NAME \
-e TEST_CONFIG \
-e NUM_TEST_SHARDS \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e SCCACHE_BUCKET \
-e http_proxy="!{{squid_proxy}}" -e https_proxy="!{{squid_proxy}}" -e no_proxy="!{{squid_no_proxy}}" \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--shm-size="${SHM_SIZE}" \
--tty \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
sh -c 'sudo chown -R jenkins . && pip install dist/*.whl && '$TEST_COMMAND
- name: Chown workspace
if: always()
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- name: Zip test reports for upload
if: always()
env:
COMMIT_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
run: |
# Remove any previous test reports if they exist
rm -f test-reports-*.zip
zip -r "test-reports-${COMMIT_SHA1}-${WORKFLOW_ID}.zip" test -i '*.xml'
- uses: actions/upload-artifact@v2
name: Store PyTorch Test Reports
if: always()
with:
name: test-reports-${{ matrix.config }}
retention-days: 14
if-no-files-found: error
path:
test-reports-*.zip
- uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
name: Store PyTorch Test Reports on S3
if: always()
with:
name: test-reports-${{ matrix.config }}
retention-days: 14
if-no-files-found: error
path:
test-reports-*.zip
- name: Clean up docker images
if: always()
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
# Prune all of the docker images
docker system prune -af
{% endblock %}
{%- endif -%}
{%- if not is_libtorch %}
{% block render_test_results +%}
# this is a separate step from test because the log files from test are too
# long: basically, GitHub tries to render all of the log files when you click
# through an action causing extreme slowdown on actions that contain too many
# logs (like test); we can always move it back to the other one, but it
# doesn't create the best experience
render_test_results:
if: always()
needs: [generate-test-matrix, test, !{{ ciflow_config.root_job_name }}]
runs-on: linux.2xlarge
strategy:
matrix: ${{ fromJson(needs.generate-test-matrix.outputs.render-matrix) }}
fail-fast: false
steps:
- name: Log in to ECR
run: |
aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
bash /tmp/ecr-login.sh
rm /tmp/ecr-login.sh
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- name: Clean workspace
run: |
rm -rf "${GITHUB_WORKSPACE:?}/*"
- name: Checkout PyTorch
uses: actions/checkout@v2
with:
# deep clone, to allow tools/stats/print_test_stats.py to use Git commands
fetch-depth: 0
- uses: actions/download-artifact@v2
name: Download PyTorch Test Reports
with:
name: test-reports-${{ matrix.config }}
path: .
- name: Unzip test reports
run: |
# Should preserve paths so reports should still be in test/test-reports
unzip -o 'test-reports-*.zip'
- name: Install dependencies
# boto3 version copied from .circleci/docker/common/install_conda.sh
run: |
pip3 install -r requirements.txt
pip3 install boto3==1.16.34 junitparser rich
- name: Output Test Results (Click Me)
run: |
python3 tools/render_junit.py test
- name: Parse ref
id: parse-ref
run: .github/scripts/parse_ref.py
- name: Display and upload test statistics (Click Me)
# temporary hack: set CIRCLE_* vars, until we update
# tools/stats/print_test_stats.py to natively support GitHub Actions
env:
AWS_DEFAULT_REGION: us-east-1
CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
JOB_BASE_NAME: !{{ build_environment }}-test
CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
run: |
python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
{%- endblock %}
{%- endif -%}
{%- if enable_doc_jobs %}
pytorch_python_doc_build:
runs-on: linux.2xlarge
needs: [calculate-docker-image, build, !{{ ciflow_config.root_job_name }}]
env:
DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
steps:
- name: Log in to ECR
run: |
aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
bash /tmp/ecr-login.sh
rm /tmp/ecr-login.sh
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- name: Clean workspace
run: |
rm -rf "${GITHUB_WORKSPACE:?}/*"
- name: Checkout PyTorch
uses: actions/checkout@v2
with:
fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
submodules: recursive
- name: Pull docker image
run: |
docker pull "${DOCKER_IMAGE}"
- name: Preserve github env variables for use in docker
run: |
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
name: Download PyTorch Build Artifacts
with:
name: ${{ env.BUILD_ENVIRONMENT }}
- name: Unzip artifacts
run: |
unzip -o artifacts.zip
- name: Build Python Doc in Docker
run: |
set -ex
time docker pull "${DOCKER_IMAGE}" > /dev/null
echo "${GITHUB_REF}"
ref=${GITHUB_REF##*/}
target=${ref//v}
docker run \
-e BUILD_ENVIRONMENT \
-e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
-e IN_CI \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e CIRCLE_SHA1="$GITHUB_SHA" \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--name="$GITHUB_SHA" \
--tty \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
bash -c "sudo chown -R jenkins . && pip install dist/*.whl && ./.circleci/scripts/python_doc_push_script.sh docs/$target $target site"
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- uses: driazati/upload-artifact-s3@21c31d0a7bcb056ca50bd6ce197ba6507c26a1be
if: ${{ github.event_name == 'pull_request' }}
name: Upload Docs Preview
with:
name: deploy
retention-days: 14
if-no-files-found: error
path: pytorch.github.io/docs/merge
- name: Show Docs Preview URL (Click Me)
if: ${{ github.event_name == 'pull_request' }}
env:
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
echo "See rendered docs at https://docs-preview.pytorch.org/$PR_NUMBER/"
- name: Archive artifacts into zip
run: |
zip -r pytorch_github_io.zip "${GITHUB_WORKSPACE}/pytorch.github.io"
- uses: actions/upload-artifact@v2
name: Store PyTorch Build Artifacts
with:
name: pytorch_github_io
if-no-files-found: error
path: pytorch_github_io.zip
- name: Clean up docker images
if: always()
run: |
# Prune all of the docker images
docker system prune -af
{%- endif -%}