.github: Add initial Linux CI for CUDA (#56494)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/56494

Signed-off-by: Eli Uriegas <eliuriegas@fb.com>

Test Plan: Imported from OSS

Reviewed By: malfet

Differential Revision: D27953781

Pulled By: seemethere

fbshipit-source-id: bce9298dc40d035bfbb5057e48b99d15c13733bc
This commit is contained in:
Eli Uriegas
2021-04-23 18:07:41 -07:00
committed by Facebook GitHub Bot
parent 060e4c96ee
commit b2b9efb33a
7 changed files with 271 additions and 29 deletions

View File

@ -13,10 +13,16 @@ CUDA_TEST_RUNNER = "linux.8xlarge.nvidia.gpu"
class PyTorchLinuxWorkflow:
def __init__(self, build_environment: str, docker_image_base: str):
def __init__(
self,
build_environment: str,
docker_image_base: str,
on_pull_request: bool = False
):
self.build_environment = build_environment
self.docker_image_base = docker_image_base
self.test_runner_type = CPU_TEST_RUNNER
self.on_pull_request = on_pull_request
if "cuda" in build_environment:
self.test_runner_type = CUDA_TEST_RUNNER
@ -31,7 +37,11 @@ class PyTorchLinuxWorkflow:
workflow_template.render(
build_environment=self.build_environment,
docker_image_base=self.docker_image_base,
test_runner_type=self.test_runner_type
test_runner_type=self.test_runner_type,
# two leading spaces are necessary to match the YAML indent of the template's `on:` block
on_pull_request=(
" pull_request:" if self.on_pull_request else ""
)
)
)
output_file.write('\n')
@ -67,10 +77,10 @@ WORKFLOWS = [
# build_environment="pytorch-linux-xenial-py3-clang7-onnx",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang7-onnx",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",
# ),
PyTorchLinuxWorkflow(
build_environment="pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",
),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",

View File

@ -17,12 +17,16 @@ install_nvidia_docker2_amzn2() {
)
}
# Install the NVIDIA driver named by $DRIVER_FN using yum-based tooling.
# The installer is downloaded from the ossci-linux S3 bucket and run with
# "-s --no-drm". The body runs in a subshell, and nvidia-smi is its last
# command, so the function's exit status is that of nvidia-smi.
install_nvidia_driver() {
install_nvidia_driver_amzn2() {
(
# trace every command so the CI log shows exactly what was run
set -x
sudo yum groupinstall -y "Development Tools"
curl -fsL -o nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"
sudo /bin/bash nvidia_driver -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false)
# ensure the installed kernel-devel package matches the running kernel (uname -r),
# groupinstall "Development Tools" has a habit of mismatching kernel headers
sudo yum install -y "kernel-devel-uname-r == $(uname -r)"
# curl: -f fail on HTTP errors, -s silent, -L follow redirects
sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"
# on installer failure, dump the installer log; "&& false" keeps the failing status
sudo /bin/bash /tmp/nvidia_driver -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false)
# remove the downloaded installer once it has been run
sudo rm -fv /tmp/nvidia_driver
# presumably a smoke test that the freshly installed driver is usable
nvidia-smi
)
}
@ -40,4 +44,12 @@ case "${DISTRIBUTION}" in
esac
# Announce which driver file is about to be installed, then pick the
# installer based on ${DISTRIBUTION}.
echo "== Installing nvidia driver ${DRIVER_FN} =="
install_nvidia_driver
# Only distributions matching amzn* have an installer here; anything else
# is reported as an error and the script exits with status 1.
case "${DISTRIBUTION}" in
amzn*)
install_nvidia_driver_amzn2
;;
*)
echo "ERROR: Unknown distribution ${DISTRIBUTION}"
exit 1
;;
esac

View File

@ -6,7 +6,7 @@ name: Linux CI (!{{ build_environment }})
on:
# TODO: Enable pull_request builds when we can verify capacity can be met by auto-scalers
# pull_request:
!{{ on_pull_request }}
push:
branches:
- master
@ -61,6 +61,9 @@ jobs:
mkdir -pv ../custom-op-build
mkdir -pv ../custom-backend-build
mkdir -pv ../jit-hook-build
- name: Preserve github env variables for use in docker
run: |
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Build PyTorch
run: |
SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
@ -76,14 +79,15 @@ jobs:
-e SCCACHE_BUCKET \
-e SKIP_SCCACHE_INITIALIZATION=1 \
-e TORCH_CUDA_ARCH_LIST \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--tty \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-v "${GITHUB_WORKSPACE}../custom-op-build:/var/lib/jenkins/custom-op-build" \
-v "${GITHUB_WORKSPACE}../custom-backend-build:/var/lib/jenkins/custom-backend-build" \
-v "${GITHUB_WORKSPACE}../jit-hook-build:/var/lib/jenkins/jit-hook-build" \
-v "${GITHUB_WORKSPACE}/../custom-op-build:/var/lib/jenkins/custom-op-build" \
-v "${GITHUB_WORKSPACE}/../custom-backend-build:/var/lib/jenkins/custom-backend-build" \
-v "${GITHUB_WORKSPACE}/../jit-hook-build:/var/lib/jenkins/jit-hook-build" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
sh -c 'sudo chown -R jenkins ../ && .jenkins/pytorch/build.sh'
@ -156,6 +160,9 @@ jobs:
- name: Output disk space left
run: |
sudo df -H
- name: Preserve github env variables for use in docker
run: |
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test PyTorch
run: |
SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
@ -172,15 +179,16 @@ jobs:
-e BUILD_ENVIRONMENT \
-e IN_CI \
-e MAX_JOBS \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--shm-size="${SHM_SIZE}" \
--tty \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-v "${GITHUB_WORKSPACE}../custom-op-build:/var/lib/jenkins/custom-op-build" \
-v "${GITHUB_WORKSPACE}../custom-backend-build:/var/lib/jenkins/custom-backend-build" \
-v "${GITHUB_WORKSPACE}../jit-hook-build:/var/lib/jenkins/jit-hook-build" \
-v "${GITHUB_WORKSPACE}/../custom-op-build:/var/lib/jenkins/custom-op-build" \
-v "${GITHUB_WORKSPACE}/../custom-backend-build:/var/lib/jenkins/custom-backend-build" \
-v "${GITHUB_WORKSPACE}/../jit-hook-build:/var/lib/jenkins/jit-hook-build" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
sh -c 'sudo chown -R jenkins ../ && pip install dist/*.whl && .jenkins/pytorch/test.sh'

View File

@ -0,0 +1,201 @@
# @generated by .github/scripts/generate_linux_ci_workflows.py, Do not update manually
#
# Template is at: .github/templates/linux_ci_workflow.yml
# Generation script: .github/scripts/generate_linux_ci_workflows.py
name: Linux CI (pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7)
on:
# TODO: Enable pull_request builds when we can verify capacity can be met by auto-scalers
push:
branches:
- master
- release/*
workflow_dispatch:
env:
BUILD_ENVIRONMENT: pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7
DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
TORCH_CUDA_ARCH_LIST: 5.2
IN_CI: 1
jobs:
calculate-docker-image:
runs-on: ubuntu-18.04
outputs:
docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
steps:
- name: Checkout PyTorch
uses: actions/checkout@v2
- name: Calculate docker image tag
id: calculate-tag
run: |
DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"
build:
runs-on: linux.2xlarge
needs: calculate-docker-image
env:
DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
steps:
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
- name: Checkout PyTorch
uses: actions/checkout@v2
with:
fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
submodules: recursive
- name: Log in to ECR
run: |
aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
bash /tmp/ecr-login.sh
rm /tmp/ecr-login.sh
- name: Pull docker image
run: |
docker pull "${DOCKER_IMAGE}"
- name: Create test binary build directories
run: |
mkdir -pv ../custom-op-build
mkdir -pv ../custom-backend-build
mkdir -pv ../jit-hook-build
- name: Preserve github env variables for use in docker
run: |
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Build PyTorch
run: |
SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
MEMORY_LIMIT_MAX_JOBS=8 # our "linux.2xlarge" runner has 16 vCPUs, if we use all of them we'll OOM
export MAX_JOBS=$(( SCCACHE_MAX_JOBS > MEMORY_LIMIT_MAX_JOBS ? MEMORY_LIMIT_MAX_JOBS : SCCACHE_MAX_JOBS ))
# Why the three volume mounts here? So test binaries are put in the correct spot
# NOTE: You cannot volume mount ${GITHUB_WORKSPACE}/../:/var/lib/jenkins since the sccache connection will hang
# See CUSTOM_OP_BUILD, JIT_HOOK_BUILD, CUSTOM_BACKEND_BUILD
# TODO: Stop building test binaries as part of the build phase
docker run \
-e BUILD_ENVIRONMENT \
-e MAX_JOBS \
-e SCCACHE_BUCKET \
-e SKIP_SCCACHE_INITIALIZATION=1 \
-e TORCH_CUDA_ARCH_LIST \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--tty \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-v "${GITHUB_WORKSPACE}/../custom-op-build:/var/lib/jenkins/custom-op-build" \
-v "${GITHUB_WORKSPACE}/../custom-backend-build:/var/lib/jenkins/custom-backend-build" \
-v "${GITHUB_WORKSPACE}/../jit-hook-build:/var/lib/jenkins/jit-hook-build" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
sh -c 'sudo chown -R jenkins ../ && .jenkins/pytorch/build.sh'
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)/../":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
- name: Archive artifacts into zip
run: |
(cd "${GITHUB_WORKSPACE}/../" && zip -r pytorch/artifacts.zip pytorch/dist pytorch/build custom-op-build/ custom-backend-build/ jit-hook-build/)
- uses: actions/upload-artifact@v2
name: Store PyTorch Build Artifacts
with:
name: ${{ env.BUILD_ENVIRONMENT }}
retention-days: 30
if-no-files-found: error
path:
artifacts.zip
- name: Clean up docker images
if: always()
run: |
# Prune all of the docker images
docker system prune -af
test:
runs-on: linux.8xlarge.nvidia.gpu
needs:
- calculate-docker-image
- build
env:
DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
steps:
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
- name: Checkout PyTorch
uses: actions/checkout@v2
- name: Log in to ECR
run: |
aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
bash /tmp/ecr-login.sh
rm /tmp/ecr-login.sh
- name: Pull docker image
run: |
docker pull "${DOCKER_IMAGE}"
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') }}
run: |
bash .github/scripts/install_nvidia_utils_linux.sh
echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
- name: Determine shm-size
run: |
shm_size="1g"
case "${BUILD_ENVIRONMENT}" in
*cuda*)
shm_size="2g"
;;
*rocm*)
shm_size="8g"
;;
esac
echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
- uses: actions/download-artifact@v2
name: Download PyTorch Build Artifacts
with:
name: ${{ env.BUILD_ENVIRONMENT }}
- name: Unzip artifacts
run: |
(cd "${GITHUB_WORKSPACE}/../" && unzip -q pytorch/artifacts.zip)
- name: Output disk space left
run: |
sudo df -H
- name: Preserve github env variables for use in docker
run: |
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test PyTorch
run: |
SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
MEMORY_LIMIT_MAX_JOBS=8 # our "linux.2xlarge" runner has 16 vCPUs, if we use all of them we'll OOM
export MAX_JOBS=$(( SCCACHE_MAX_JOBS > MEMORY_LIMIT_MAX_JOBS ? MEMORY_LIMIT_MAX_JOBS : SCCACHE_MAX_JOBS ))
# Why the three volume mounts here? So test binaries are put in the correct spot
# NOTE: You cannot volume mount ${GITHUB_WORKSPACE}/../:/var/lib/jenkins since the sccache connection will hang
# See CUSTOM_OP_BUILD, JIT_HOOK_BUILD, CUSTOM_BACKEND_BUILD
# TODO: Stop building test binaries as part of the build phase
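# GPU_FLAG is expanded unquoted on purpose: it must word-split into separate
# arguments (or expand to nothing when unset), which shellcheck would otherwise flag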
# shellcheck disable=SC2086
docker run \
${GPU_FLAG:-} \
-e BUILD_ENVIRONMENT \
-e IN_CI \
-e MAX_JOBS \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--shm-size="${SHM_SIZE}" \
--tty \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-v "${GITHUB_WORKSPACE}/../custom-op-build:/var/lib/jenkins/custom-op-build" \
-v "${GITHUB_WORKSPACE}/../custom-backend-build:/var/lib/jenkins/custom-backend-build" \
-v "${GITHUB_WORKSPACE}/../jit-hook-build:/var/lib/jenkins/jit-hook-build" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
sh -c 'sudo chown -R jenkins ../ && pip install dist/*.whl && .jenkins/pytorch/test.sh'
- name: Clean up docker images
if: always()
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
# Prune all of the docker images
docker system prune -af

View File

@ -6,7 +6,7 @@ name: Linux CI (pytorch-linux-xenial-py3.6-gcc5.4)
on:
# TODO: Enable pull_request builds when we can verify capacity can be met by auto-scalers
# pull_request:
push:
branches:
- master
@ -61,6 +61,9 @@ jobs:
mkdir -pv ../custom-op-build
mkdir -pv ../custom-backend-build
mkdir -pv ../jit-hook-build
- name: Preserve github env variables for use in docker
run: |
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Build PyTorch
run: |
SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
@ -76,14 +79,15 @@ jobs:
-e SCCACHE_BUCKET \
-e SKIP_SCCACHE_INITIALIZATION=1 \
-e TORCH_CUDA_ARCH_LIST \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--tty \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-v "${GITHUB_WORKSPACE}../custom-op-build:/var/lib/jenkins/custom-op-build" \
-v "${GITHUB_WORKSPACE}../custom-backend-build:/var/lib/jenkins/custom-backend-build" \
-v "${GITHUB_WORKSPACE}../jit-hook-build:/var/lib/jenkins/jit-hook-build" \
-v "${GITHUB_WORKSPACE}/../custom-op-build:/var/lib/jenkins/custom-op-build" \
-v "${GITHUB_WORKSPACE}/../custom-backend-build:/var/lib/jenkins/custom-backend-build" \
-v "${GITHUB_WORKSPACE}/../jit-hook-build:/var/lib/jenkins/jit-hook-build" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
sh -c 'sudo chown -R jenkins ../ && .jenkins/pytorch/build.sh'
@ -156,6 +160,9 @@ jobs:
- name: Output disk space left
run: |
sudo df -H
- name: Preserve github env variables for use in docker
run: |
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test PyTorch
run: |
SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
@ -172,15 +179,16 @@ jobs:
-e BUILD_ENVIRONMENT \
-e IN_CI \
-e MAX_JOBS \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--shm-size="${SHM_SIZE}" \
--tty \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-v "${GITHUB_WORKSPACE}../custom-op-build:/var/lib/jenkins/custom-op-build" \
-v "${GITHUB_WORKSPACE}../custom-backend-build:/var/lib/jenkins/custom-backend-build" \
-v "${GITHUB_WORKSPACE}../jit-hook-build:/var/lib/jenkins/jit-hook-build" \
-v "${GITHUB_WORKSPACE}/../custom-op-build:/var/lib/jenkins/custom-op-build" \
-v "${GITHUB_WORKSPACE}/../custom-backend-build:/var/lib/jenkins/custom-backend-build" \
-v "${GITHUB_WORKSPACE}/../jit-hook-build:/var/lib/jenkins/jit-hook-build" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
sh -c 'sudo chown -R jenkins ../ && pip install dist/*.whl && .jenkins/pytorch/test.sh'

View File

@ -50,7 +50,7 @@ function get_exit_code() {
}
function file_diff_from_base() {
# The fetch may fail on Docker hosts, but it's not always necessary.
# The fetch may fail on Docker hosts, but it is necessary for GHA
set +e
git fetch origin master --quiet
set -e

View File

@ -124,7 +124,7 @@ fi
# Try to pull value from CIRCLE_PULL_REQUEST first then GITHUB_HEAD_REF second
# CIRCLE_PULL_REQUEST comes from CircleCI
# GITHUB_HEAD_REF comes from Github Actions
IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-${GITHUB_HEAD_REF:-}}
IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-}
if [ -n "$IN_PULL_REQUEST" ] && [[ "$BUILD_ENVIRONMENT" != *coverage* ]]; then
DETERMINE_FROM=$(mktemp)
file_diff_from_base "$DETERMINE_FROM"
@ -382,11 +382,14 @@ test_benchmarks() {
BENCHMARK_DATA="benchmarks/.data"
mkdir -p ${BENCHMARK_DATA}
pytest benchmarks/fastrnns/test_bench.py --benchmark-sort=Name --benchmark-json=${BENCHMARK_DATA}/fastrnns_default.json --fuser=default --executor=default
python benchmarks/upload_scribe.py --pytest_bench_json ${BENCHMARK_DATA}/fastrnns_default.json
pytest benchmarks/fastrnns/test_bench.py --benchmark-sort=Name --benchmark-json=${BENCHMARK_DATA}/fastrnns_legacy_old.json --fuser=old --executor=legacy
python benchmarks/upload_scribe.py --pytest_bench_json ${BENCHMARK_DATA}/fastrnns_legacy_old.json
pytest benchmarks/fastrnns/test_bench.py --benchmark-sort=Name --benchmark-json=${BENCHMARK_DATA}/fastrnns_profiling_te.json --fuser=te --executor=profiling
python benchmarks/upload_scribe.py --pytest_bench_json ${BENCHMARK_DATA}/fastrnns_profiling_te.json
# TODO: Enable these for GHA once we have credentials for forked pull requests
if [[ -z "${GITHUB_ACTIONS}" ]]; then
python benchmarks/upload_scribe.py --pytest_bench_json ${BENCHMARK_DATA}/fastrnns_default.json
python benchmarks/upload_scribe.py --pytest_bench_json ${BENCHMARK_DATA}/fastrnns_legacy_old.json
python benchmarks/upload_scribe.py --pytest_bench_json ${BENCHMARK_DATA}/fastrnns_profiling_te.json
fi
assert_git_not_dirty
fi
}