.github: Add initial linux CI workflow (#55176)

Summary:
This PR commandeers (takes over) https://github.com/pytorch/pytorch/issues/54091.

TODO:

- [x] understand why the build is [failing](https://github.com/pytorch/pytorch/pull/55176/checks?check_run_id=2254742265) here when it was [succeeding](https://github.com/pytorch/pytorch/pull/54091/checks?check_run_id=2177844748) on https://github.com/pytorch/pytorch/issues/54091
- [x] fix the build failure
- [x] fix the test failure(s)
- [x] add CI check to generate YAML workflows from templates, similar to https://github.com/pytorch/pytorch/issues/55171
- [ ] uncomment the rest of the matrix

Pull Request resolved: https://github.com/pytorch/pytorch/pull/55176

Reviewed By: walterddr

Differential Revision: D27803529

Pulled By: seemethere

fbshipit-source-id: 52a65ec8f7a83b929fed47f0bbdca544210ec9c2
This commit is contained in:
Sam Estep
2021-04-15 16:52:12 -07:00
committed by Facebook GitHub Bot
parent 400398006f
commit 7d410bc3c8
13 changed files with 677 additions and 49 deletions

164
.github/scripts/generate_linux_ci_workflows.py vendored Executable file
View File

@ -0,0 +1,164 @@
#!/usr/bin/env python
from pathlib import Path
import jinja2
# Registry host that is prefixed onto every workflow's docker_image_base below.
DOCKER_REGISTRY = "308535385114.dkr.ecr.us-east-1.amazonaws.com"
# Two levels up from this script: the directory that holds `templates/`
# (read by the Jinja loader) and `workflows/` (where output files are written).
GITHUB_DIR = Path(__file__).parent.parent
# Runner label used for the test job by default.
CPU_TEST_RUNNER = "linux.2xlarge"
# Runner label used for the test job when the build environment name contains "cuda".
CUDA_TEST_RUNNER = "linux.8xlarge.nvidia.gpu"
class PyTorchLinuxWorkflow:
    """A single Linux CI workflow: a build environment and the docker image base it uses.

    Attributes:
        build_environment: Name of the build environment; also the stem of the
            generated workflow file name.
        docker_image_base: Docker image reference (without tag) passed to the template.
        test_runner_type: Runner label for the test job; the CUDA runner when
            "cuda" appears in ``build_environment``, otherwise the CPU runner.
    """

    def __init__(self, build_environment: str, docker_image_base: str):
        self.build_environment = build_environment
        self.docker_image_base = docker_image_base
        self.test_runner_type = (
            CUDA_TEST_RUNNER if "cuda" in build_environment else CPU_TEST_RUNNER
        )

    def generate_workflow_file(
        self, workflow_template: jinja2.Template, jinja_env: jinja2.Environment
    ) -> Path:
        """Render ``workflow_template`` for this workflow and write it under ``workflows/``.

        Args:
            workflow_template: Template rendered with this workflow's
                ``build_environment``, ``docker_image_base`` and ``test_runner_type``.
            jinja_env: Accepted for the caller's convenience; not used by this method.

        Returns:
            Path of the file that was written,
            ``GITHUB_DIR/workflows/<build_environment>.yml``.
        """
        destination = GITHUB_DIR / f"workflows/{self.build_environment}.yml"
        with open(destination, "w") as handle:
            rendered = workflow_template.render(
                build_environment=self.build_environment,
                docker_image_base=self.docker_image_base,
                test_runner_type=self.test_runner_type,
            )
            handle.write(rendered)
            # Terminate the file with a newline after the rendered text.
            handle.write('\n')
        return destination
# Workflows to generate, one output file per entry. Only the first entry is
# active; the commented-out entries are the rest of the planned matrix and are
# kept here so they can be enabled one at a time. Each build_environment must be
# unique, because it is used as the generated workflow's file name.
WORKFLOWS = [
    PyTorchLinuxWorkflow(
        build_environment="pytorch-linux-xenial-py3.6-gcc5.4",
        docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
    ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-paralleltbb-linux-xenial-py3.6-gcc5.4",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-parallelnative-linux-xenial-py3.6-gcc5.4",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-pure_torch-linux-xenial-py3.6-gcc5.4",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-xenial-py3.6-gcc7",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc7",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-xenial-py3-clang5-asan",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-asan",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-xenial-py3-clang7-onnx",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang7-onnx",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-libtorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-bionic-py3.6-clang9-noarch",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.6-clang9",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-xla-linux-bionic-py3.6-clang9",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.6-clang9",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-vulkan-linux-bionic-py3.6-clang9",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.6-clang9",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-bionic-py3.8-gcc9-coverage",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.8-gcc9",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-bionic-rocm3.9-py3.6",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-rocm3.9-py3.6",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_32",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_64",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v7a",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v8a",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-xenial-py3-clang5-mobile",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-asan",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-xenial-py3-clang5-mobile-custom-dynamic",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-xenial-py3-clang5-mobile-custom-static",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-xenial-py3-clang5-mobile-code-analysis",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
    # ),
]
if __name__ == "__main__":
    # Template variables are written as `!{{ name }}`; with the start marker
    # changed from Jinja's default, the `${{ ... }}` expressions in the template
    # appear unchanged in the generated workflow.
    template_dir = GITHUB_DIR.joinpath("templates")
    jinja_env = jinja2.Environment(
        variable_start_string="!{{",
        loader=jinja2.FileSystemLoader(str(template_dir)),
    )
    workflow_template = jinja_env.get_template("linux_ci_workflow.yml.in")
    # Write one workflow file per entry and print the path of each file written.
    for workflow in WORKFLOWS:
        generated_path = workflow.generate_workflow_file(
            workflow_template=workflow_template,
            jinja_env=jinja_env,
        )
        print(generated_path)

43
.github/scripts/install_nvidia_utils_linux.sh vendored Executable file
View File

@ -0,0 +1,43 @@
#!/usr/bin/env bash
# Abort on the first failing command (-e), on expansion of an unset variable (-u),
# and when any stage of a pipeline fails (-o pipefail).
set -euo pipefail

# Distribution identifier: the ID and VERSION_ID fields of /etc/os-release joined
# together. The file is sourced inside the command substitution, so the variables
# it defines do not leak into this script.
DISTRIBUTION="$(. /etc/os-release; echo "${ID}${VERSION_ID}")"
# File name of the driver installer fetched by install_nvidia_driver.
DRIVER_FN="NVIDIA-Linux-x86_64-460.39.run"
# yum repository definition passed to yum-config-manager for this distribution.
YUM_REPO_URL="https://nvidia.github.io/nvidia-docker/${DISTRIBUTION}/nvidia-docker.repo"
# Adds the yum repository at YUM_REPO_URL, installs the nvidia-docker2 package
# from it, and restarts docker. Used for distributions matching `amzn*`.
install_nvidia_docker2_amzn2() {
    (
        # Trace each command; the surrounding subshell keeps `set -x` from
        # staying enabled in the caller.
        set -x
        # Needed for yum-config-manager
        sudo yum install -y yum-utils
        sudo yum-config-manager --add-repo "${YUM_REPO_URL}"
        sudo yum install -y nvidia-docker2
        sudo systemctl restart docker
    )
}
# Downloads the NVIDIA driver installer named by DRIVER_FN, runs it silently,
# and finishes with `nvidia-smi` as a smoke test. If the installer fails, its
# log is printed before the failure is propagated.
#
# The installer is downloaded to a temporary file that is removed when the
# subshell exits, so nothing is left behind in the current working directory
# (which is the checked-out repository when this runs from the CI workflow).
install_nvidia_driver() {
    (
        set -x
        sudo yum groupinstall -y "Development Tools"
        installer="$(mktemp /tmp/nvidia_driver.XXXXXX)"
        # Fires when this subshell exits, on success or failure.
        trap 'rm -f "${installer}"' EXIT
        curl -fsL -o "${installer}" "https://s3.amazonaws.com/ossci-linux/nvidia_driver/${DRIVER_FN}"
        sudo /bin/bash "${installer}" -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false)
        nvidia-smi
    )
}
# Pick the container-toolkit installer for this distribution; only Amazon Linux
# style identifiers (amzn*) are supported, anything else aborts the script.
echo "== Installing nvidia container toolkit for ${DISTRIBUTION} =="
if [[ "${DISTRIBUTION}" == amzn* ]]; then
    install_nvidia_docker2_amzn2
else
    echo "ERROR: Unknown distribution ${DISTRIBUTION}"
    exit 1
fi

# The driver itself is installed the same way regardless of distribution.
echo "== Installing nvidia driver ${DRIVER_FN} =="
install_nvidia_driver

5
.github/scripts/report_git_status.sh vendored Executable file
View File

@ -0,0 +1,5 @@
#!/usr/bin/env bash
# Fails (non-zero exit) when the working tree has any modified, staged or
# untracked files; succeeds when it is clean. The porcelain status and the diff
# are printed first so the CI log shows exactly what changed.
#
# -e makes a failure of `git status` or `git diff` itself abort the script;
# without it a failed `git status` would leave CHANGES empty and the check
# would pass. -u catches misspelled variable names.
set -eu
CHANGES=$(git status --porcelain)
echo "$CHANGES"
git diff
# Exit status of this test is the script's exit status.
[ -z "$CHANGES" ]

View File

@ -0,0 +1,174 @@
# @generated by .github/scripts/generate_linux_ci_workflows.py, Do not update manually
#
# Template is at: .github/templates/linux_ci_workflow.yml
# Generation script: .github/scripts/generate_linux_ci_workflows.py
name: Linux CI (!{{ build_environment }})
on:
# TODO: Enable pull_request builds when we can verify capacity can be met by auto-scalers
# pull_request:
push:
branches:
- master
- release/*
workflow_dispatch:
env:
BUILD_ENVIRONMENT: !{{ build_environment }}
DOCKER_IMAGE_BASE: !{{ docker_image_base }}
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
TORCH_CUDA_ARCH_LIST: 5.2
IN_CI: 1
jobs:
calculate-docker-image:
runs-on: ubuntu-18.04
outputs:
docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
steps:
- name: Checkout PyTorch
uses: actions/checkout@v2
- name: Calculate docker image tag
id: calculate-tag
run: |
DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"
build:
runs-on: linux.2xlarge
needs: calculate-docker-image
env:
DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
steps:
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
- name: Checkout PyTorch
uses: actions/checkout@v2
with:
fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
submodules: recursive
- name: Log in to ECR
run: |
aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
bash /tmp/ecr-login.sh
rm /tmp/ecr-login.sh
- name: Pull docker image
run: |
docker pull "${DOCKER_IMAGE}"
- name: Build PyTorch
run: |
SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
MEMORY_LIMIT_MAX_JOBS=8 # our "linux.2xlarge" runner has 16 vCPUs, if we use all of them we'll OOM
export MAX_JOBS=$(( SCCACHE_MAX_JOBS > MEMORY_LIMIT_MAX_JOBS ? MEMORY_LIMIT_MAX_JOBS : SCCACHE_MAX_JOBS ))
docker run \
-e BUILD_ENVIRONMENT \
-e MAX_JOBS \
-e SCCACHE_BUCKET \
-e SKIP_SCCACHE_INITIALIZATION=1 \
-e TORCH_CUDA_ARCH_LIST \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--tty \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh'
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
- name: Archive artifacts into zip
run: |
zip -q -r artifacts.zip dist build
- uses: actions/upload-artifact@v2
name: Store PyTorch Build Artifacts
with:
name: ${{ env.BUILD_ENVIRONMENT }}
retention-days: 30
if-no-files-found: error
path:
artifacts.zip
- name: Clean up docker images
if: always()
run: |
# Prune all of the docker images
docker system prune -af
test:
runs-on: !{{ test_runner_type }}
needs:
- calculate-docker-image
- build
env:
DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
steps:
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
- name: Checkout PyTorch
uses: actions/checkout@v2
- name: Log in to ECR
run: |
aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
bash /tmp/ecr-login.sh
rm /tmp/ecr-login.sh
- name: Pull docker image
run: |
docker pull "${DOCKER_IMAGE}"
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') }}
run: |
bash .github/scripts/install_nvidia_utils_linux.sh
echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
- name: Determine shm-size
run: |
shm_size="1g"
case "${BUILD_ENVIRONMENT}" in
*cuda*)
shm_size="2g"
;;
*rocm*)
shm_size="8g"
;;
esac
echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
- uses: actions/download-artifact@v2
name: Download PyTorch Build Artifacts
with:
name: ${{ env.BUILD_ENVIRONMENT }}
- name: Unzip artifacts
run: |
unzip -q artifacts.zip
- name: Output disk space left
run: |
sudo df -H
- name: Test PyTorch
run: |
SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
MEMORY_LIMIT_MAX_JOBS=8 # our "linux.2xlarge" runner has 16 vCPUs, if we use all of them we'll OOM
export MAX_JOBS=$(( SCCACHE_MAX_JOBS > MEMORY_LIMIT_MAX_JOBS ? MEMORY_LIMIT_MAX_JOBS : SCCACHE_MAX_JOBS ))
# Used for GPU_FLAG since that doesn't play nice
# shellcheck disable=SC2086
docker run \
${GPU_FLAG:-} \
-e BUILD_ENVIRONMENT \
-e IN_CI \
-e MAX_JOBS \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--shm-size="${SHM_SIZE}" \
--tty \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
sh -c 'sudo chown -R jenkins . && pip install dist/*.whl && .jenkins/pytorch/test.sh'
- name: Clean up docker images
if: always()
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
# Prune all of the docker images
docker system prune -af

View File

@ -91,6 +91,23 @@ jobs:
run: |
python2 setup.py | grep "Python 2 has reached end-of-life and is no longer supported by PyTorch."
# Regenerates the Linux CI workflow files with the generator script and fails
# if the regenerated output differs from what is committed.
templates:
  runs-on: ubuntu-18.04
  steps:
    - name: Setup Python
      uses: actions/setup-python@v2
      with:
        python-version: 3.x
        architecture: x64
    # The generator script imports jinja2.
    - name: Install Jinja2
      run: pip install Jinja2
    - name: Checkout PyTorch
      uses: actions/checkout@v2
    - name: Regenerate workflows
      run: .github/scripts/generate_linux_ci_workflows.py
    # Exits non-zero when `git status --porcelain` reports any change.
    - name: Assert that regenerating the workflows didn't change them
      run: .github/scripts/report_git_status.sh
toc:
runs-on: ubuntu-18.04
# https://github.com/actions/virtual-environments/issues/599#issuecomment-602754687
@ -111,12 +128,7 @@ jobs:
markdown-toc --bullets='-' -i "$FILE"
done
- name: Assert that regenerating the ToCs didn't change them
run: |
set -eux
CHANGES=$(git status --porcelain)
echo "$CHANGES"
git diff
[ -z "$CHANGES" ]
run: .github/scripts/report_git_status.sh
flake8-py3:
runs-on: ubuntu-18.04

View File

@ -0,0 +1,174 @@
# @generated by .github/scripts/generate_linux_ci_workflows.py, Do not update manually
#
# Template is at: .github/templates/linux_ci_workflow.yml
# Generation script: .github/scripts/generate_linux_ci_workflows.py
name: Linux CI (pytorch-linux-xenial-py3.6-gcc5.4)
on:
# TODO: Enable pull_request builds when we can verify capacity can be met by auto-scalers
# pull_request:
push:
branches:
- master
- release/*
workflow_dispatch:
env:
BUILD_ENVIRONMENT: pytorch-linux-xenial-py3.6-gcc5.4
DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
TORCH_CUDA_ARCH_LIST: 5.2
IN_CI: 1
jobs:
calculate-docker-image:
runs-on: ubuntu-18.04
outputs:
docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
steps:
- name: Checkout PyTorch
uses: actions/checkout@v2
- name: Calculate docker image tag
id: calculate-tag
run: |
DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"
build:
runs-on: linux.2xlarge
needs: calculate-docker-image
env:
DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
steps:
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
- name: Checkout PyTorch
uses: actions/checkout@v2
with:
fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
submodules: recursive
- name: Log in to ECR
run: |
aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
bash /tmp/ecr-login.sh
rm /tmp/ecr-login.sh
- name: Pull docker image
run: |
docker pull "${DOCKER_IMAGE}"
- name: Build PyTorch
run: |
SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
MEMORY_LIMIT_MAX_JOBS=8 # our "linux.2xlarge" runner has 16 vCPUs, if we use all of them we'll OOM
export MAX_JOBS=$(( SCCACHE_MAX_JOBS > MEMORY_LIMIT_MAX_JOBS ? MEMORY_LIMIT_MAX_JOBS : SCCACHE_MAX_JOBS ))
docker run \
-e BUILD_ENVIRONMENT \
-e MAX_JOBS \
-e SCCACHE_BUCKET \
-e SKIP_SCCACHE_INITIALIZATION=1 \
-e TORCH_CUDA_ARCH_LIST \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--tty \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh'
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
- name: Archive artifacts into zip
run: |
zip -q -r artifacts.zip dist build
- uses: actions/upload-artifact@v2
name: Store PyTorch Build Artifacts
with:
name: ${{ env.BUILD_ENVIRONMENT }}
retention-days: 30
if-no-files-found: error
path:
artifacts.zip
- name: Clean up docker images
if: always()
run: |
# Prune all of the docker images
docker system prune -af
test:
runs-on: linux.2xlarge
needs:
- calculate-docker-image
- build
env:
DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
steps:
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
- name: Checkout PyTorch
uses: actions/checkout@v2
- name: Log in to ECR
run: |
aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
bash /tmp/ecr-login.sh
rm /tmp/ecr-login.sh
- name: Pull docker image
run: |
docker pull "${DOCKER_IMAGE}"
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') }}
run: |
bash .github/scripts/install_nvidia_utils_linux.sh
echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
- name: Determine shm-size
run: |
shm_size="1g"
case "${BUILD_ENVIRONMENT}" in
*cuda*)
shm_size="2g"
;;
*rocm*)
shm_size="8g"
;;
esac
echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
- uses: actions/download-artifact@v2
name: Download PyTorch Build Artifacts
with:
name: ${{ env.BUILD_ENVIRONMENT }}
- name: Unzip artifacts
run: |
unzip -q artifacts.zip
- name: Output disk space left
run: |
sudo df -H
- name: Test PyTorch
run: |
SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
MEMORY_LIMIT_MAX_JOBS=8 # our "linux.2xlarge" runner has 16 vCPUs, if we use all of them we'll OOM
export MAX_JOBS=$(( SCCACHE_MAX_JOBS > MEMORY_LIMIT_MAX_JOBS ? MEMORY_LIMIT_MAX_JOBS : SCCACHE_MAX_JOBS ))
# Used for GPU_FLAG since that doesn't play nice
# shellcheck disable=SC2086
docker run \
${GPU_FLAG:-} \
-e BUILD_ENVIRONMENT \
-e IN_CI \
-e MAX_JOBS \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--shm-size="${SHM_SIZE}" \
--tty \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
sh -c 'sudo chown -R jenkins . && pip install dist/*.whl && .jenkins/pytorch/test.sh'
- name: Clean up docker images
if: always()
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
# Prune all of the docker images
docker system prune -af

9
.gitignore vendored
View File

@ -288,3 +288,12 @@ TAGS
# bazel symlinks
bazel-*
# generated shellcheck directories
.shellcheck_generated*/
# zip archives
*.zip
# core dump files (core.<pid>); the suffix must start with a digit so that
# source files such as core.py or core.cpp are not ignored
core.[0-9]*

View File

@ -59,13 +59,20 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
export BUILD_SPLIT_CUDA=ON
fi
if [[ ${BUILD_ENVIRONMENT} == *"pure_torch"* ]]; then
export BUILD_CAFFE2=OFF
fi
if [[ ${BUILD_ENVIRONMENT} == *"paralleltbb"* ]]; then
export ATEN_THREADING=TBB
export USE_TBB=1
elif [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
export ATEN_THREADING=NATIVE
fi
# TODO: Don't run this...
pip_install -r requirements.txt || true
# Enable LLVM dependency for TensorExpr testing
export USE_LLVM=/opt/llvm
export LLVM_DIR=/opt/llvm/lib/cmake/llvm
# TODO: Don't install this here
if ! which conda; then
# In ROCm CIs, we are doing cross compilation on build machines with
@ -229,40 +236,6 @@ else
cp build/.ninja_log dist
fi
# Build custom operator tests.
CUSTOM_OP_BUILD="$PWD/../custom-op-build"
CUSTOM_OP_TEST="$PWD/test/custom_operator"
python --version
SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
mkdir "$CUSTOM_OP_BUILD"
pushd "$CUSTOM_OP_BUILD"
cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)"
make VERBOSE=1
popd
assert_git_not_dirty
# Build jit hook tests
JIT_HOOK_BUILD="$PWD/../jit-hook-build"
JIT_HOOK_TEST="$PWD/test/jit_hooks"
python --version
SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
mkdir "$JIT_HOOK_BUILD"
pushd "$JIT_HOOK_BUILD"
cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)"
make VERBOSE=1
popd
assert_git_not_dirty
# Build custom backend tests.
CUSTOM_BACKEND_BUILD="$PWD/../custom-backend-build"
CUSTOM_BACKEND_TEST="$PWD/test/custom_backend"
python --version
mkdir "$CUSTOM_BACKEND_BUILD"
pushd "$CUSTOM_BACKEND_BUILD"
cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)"
make VERBOSE=1
popd
assert_git_not_dirty
else
# Test standalone c10 build
if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda10.1-cudnn7-py3* ]]; then

View File

@ -72,7 +72,16 @@ if [[ "$BUILD_ENVIRONMENT" != *pytorch-win-* ]]; then
# Save sccache logs to file
sccache --stop-server || true
rm ~/sccache_error.log || true
if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
if [[ -n "${SKIP_SCCACHE_INITIALIZATION:-}" ]]; then
# sccache --start-server seems to hang forever on self hosted runners for GHA
# so let's just go ahead and skip the --start-server altogether since it seems
# as though sccache still gets used even when the sccache server isn't started
# explicitly
echo "Skipping sccache server initialization, setting environment variables"
export SCCACHE_IDLE_TIMEOUT=1200
export SCCACHE_ERROR_LOG=~/sccache_error.log
export RUST_LOG=sccache::server=error
elif [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
SCCACHE_ERROR_LOG=~/sccache_error.log SCCACHE_IDLE_TIMEOUT=0 sccache --start-server
else
# increasing SCCACHE_IDLE_TIMEOUT so that extension_backend_test.cpp can build after this PR:
@ -147,3 +156,7 @@ fi
retry () {
"$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
}
# Enable LLVM dependency for TensorExpr testing
export USE_LLVM=/opt/llvm
export LLVM_DIR=/opt/llvm/lib/cmake/llvm

View File

@ -51,7 +51,11 @@ test_python_all() {
export GLOO_SOCKET_IFNAME=lo0
echo "Ninja version: $(ninja --version)"
if [ -n "$CIRCLE_PULL_REQUEST" ]; then
# Try to pull value from CIRCLE_PULL_REQUEST first then GITHUB_HEAD_REF second
# CIRCLE_PULL_REQUEST comes from CircleCI
# GITHUB_HEAD_REF comes from Github Actions
IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-${GITHUB_HEAD_REF:-}}
if [ -n "$IN_PULL_REQUEST" ]; then
DETERMINE_FROM=$(mktemp)
file_diff_from_base "$DETERMINE_FROM"
fi

View File

@ -115,7 +115,11 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-NO_AVX2-* ]]; then
export ATEN_CPU_CAPABILITY=avx
fi
if [ -n "$CIRCLE_PULL_REQUEST" ] && [[ "$BUILD_ENVIRONMENT" != *coverage* ]]; then
# Try to pull value from CIRCLE_PULL_REQUEST first then GITHUB_HEAD_REF second
# CIRCLE_PULL_REQUEST comes from CircleCI
# GITHUB_HEAD_REF comes from Github Actions
IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-${GITHUB_HEAD_REF:-}}
if [ -n "$IN_PULL_REQUEST" ] && [[ "$BUILD_ENVIRONMENT" != *coverage* ]]; then
DETERMINE_FROM=$(mktemp)
file_diff_from_base "$DETERMINE_FROM"
fi
@ -257,6 +261,18 @@ test_rpc() {
test_custom_backend() {
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]] && [[ "$BUILD_ENVIRONMENT" != *asan* ]] ; then
echo "Building custom backends tests"
# Build custom backend tests.
CUSTOM_BACKEND_BUILD="$PWD/../custom-backend-build"
CUSTOM_BACKEND_TEST="$PWD/test/custom_backend"
python --version
mkdir "$CUSTOM_BACKEND_BUILD"
pushd "$CUSTOM_BACKEND_BUILD"
cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)"
make VERBOSE=1
popd
assert_git_not_dirty
echo "Testing custom backends"
CUSTOM_BACKEND_BUILD="$PWD/../custom-backend-build"
pushd test/custom_backend
@ -274,6 +290,19 @@ test_custom_backend() {
test_custom_script_ops() {
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]] && [[ "$BUILD_ENVIRONMENT" != *asan* ]] ; then
# Build custom operator tests.
echo "Building custom script operators tests"
CUSTOM_OP_BUILD="$PWD/../custom-op-build"
CUSTOM_OP_TEST="$PWD/test/custom_operator"
python --version
SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
mkdir "$CUSTOM_OP_BUILD"
pushd "$CUSTOM_OP_BUILD"
cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)"
make VERBOSE=1
popd
assert_git_not_dirty
echo "Testing custom script operators"
CUSTOM_OP_BUILD="$PWD/../custom-op-build"
pushd test/custom_operator
@ -290,6 +319,19 @@ test_custom_script_ops() {
test_jit_hooks() {
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]] && [[ "$BUILD_ENVIRONMENT" != *asan* ]] ; then
echo "Building jit hooks in cpp tests"
# Build jit hook tests
JIT_HOOK_BUILD="$PWD/../jit-hook-build"
JIT_HOOK_TEST="$PWD/test/jit_hooks"
python --version
SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
mkdir "$JIT_HOOK_BUILD"
pushd "$JIT_HOOK_BUILD"
cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)"
make VERBOSE=1
popd
assert_git_not_dirty
echo "Testing jit hooks in cpp"
HOOK_BUILD="$PWD/../jit-hook-build"
pushd test/jit_hooks

View File

@ -42,12 +42,16 @@ fi
export SCRIPT_HELPERS_DIR=$SCRIPT_PARENT_DIR/win-test-helpers
if [ -n "$CIRCLE_PULL_REQUEST" ]; then
# Try to pull value from CIRCLE_PULL_REQUEST first then GITHUB_HEAD_REF second
# CIRCLE_PULL_REQUEST comes from CircleCI
# GITHUB_HEAD_REF comes from Github Actions
IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-${GITHUB_HEAD_REF:-}}
if [ -n "$IN_PULL_REQUEST" ]; then
DETERMINE_FROM="${TMP_DIR}/determine_from"
file_diff_from_base "$DETERMINE_FROM"
fi
if [[ "${CIRCLE_JOB}" == *11* ]]; then
if [[ "${BUILD_ENVIRONMENT}" == *cuda11* ]]; then
export BUILD_SPLIT_CUDA=ON
fi

View File

@ -14,8 +14,19 @@ ios:
# Remove every build*/ directory and the folder generated by `shellcheck-gha`.
# $(RM) is `rm -f`, so a tree with nothing to clean is not an error and the
# second removal always runs.
clean: # This will remove ALL build folders.
	@$(RM) -r build*/
	@$(RM) -r $(SHELLCHECK_GHA_GENERATED_FOLDER)
# Count lines of code under caffe2 with cloc; print an install hint if cloc fails.
# The two hint lines are grouped with { ...; } because `a || b && c` parses as
# `(a || b) && c`, which would print the second line even when cloc succeeds.
linecount:
	@cloc --read-lang-def=caffe.cloc caffe2 || { \
		echo "Cloc is not available on the machine. You can install cloc with "; \
		echo " sudo apt-get install cloc"; \
	}
# Scratch folder that receives the scripts written by tools/extract_scripts.py.
SHELLCHECK_GHA_GENERATED_FOLDER := .shellcheck_generated_gha

# Recreate the scratch folder from scratch, extract scripts into it, and run
# shellcheck over the result. Declared phony: it names a command, not a file.
.PHONY: shellcheck-gha
shellcheck-gha:
	@$(RM) -r $(SHELLCHECK_GHA_GENERATED_FOLDER)
	tools/extract_scripts.py --out=$(SHELLCHECK_GHA_GENERATED_FOLDER)
	tools/run_shellcheck.sh $(SHELLCHECK_GHA_GENERATED_FOLDER)
# Regenerate the GitHub Actions workflow files, then shellcheck them.
# shellcheck-gha is run through $(MAKE) as a recipe step rather than listed as
# a prerequisite, because it must run AFTER the workflows are regenerated.
.PHONY: generate-gha-workflows
generate-gha-workflows:
	./.github/scripts/generate_linux_ci_workflows.py
	$(MAKE) shellcheck-gha