test reporting (#29658)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/29658

This PR makes our test scripts output artifacts that CircleCI can
understand. This has a few benefits:
1. We can actually see failed tests and their output in the job screen
(instead of having to scroll through logs).
2. We can use the CircleCI test metadata API to track failed tests
programmatically (a hypothetical API query is sketched below).
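Concretely, "artifacts that CircleCI can understand" means JUnit-style XML
reports written under test-reports/ and uploaded with a store_test_results
step. Below is a minimal sketch of the pattern the test scripts use; the
example test case is made up for illustration, and xmlrunner comes from the
unittest-xml-reporting package the CI scripts now pip-install:

    # Sketch: emit JUnit-style XML that CircleCI's store_test_results step
    # can parse. xmlrunner is provided by the unittest-xml-reporting package.
    import os
    import unittest

    import xmlrunner


    class TestExample(unittest.TestCase):  # illustrative test case only
        def test_passes(self):
            self.assertTrue(True)


    if __name__ == '__main__':
        report_dir = os.path.join('test-reports', 'python-unittest')
        os.makedirs(report_dir, exist_ok=True)  # exist_ok requires Python 3
        # One XML report per test ends up under test-reports/, which the
        # CircleCI job later uploads via store_test_results.
        unittest.main(testRunner=xmlrunner.XMLTestRunner(output=report_dir))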

In CircleCI, the failed-test view looks like this (old UI):
https://circleci.com/gh/pytorch/pytorch/3546584?pipelines-ui-opt-out
or like this (new UI):
https://app.circleci.com/jobs/github/pytorch/pytorch/3546584/tests
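For benefit 2, here is a hedged sketch of what tracking failed tests
programmatically could look like against CircleCI's v1.1 test-metadata
endpoint. The endpoint, the requests dependency, and the token handling are
assumptions for illustration; none of this is added by the PR:

    # Hypothetical example only: query CircleCI's v1.1 test metadata API for
    # one job and list its failed tests. Not part of this PR.
    import os

    import requests  # assumed available in the environment

    CIRCLE_TOKEN = os.environ['CIRCLE_TOKEN']  # hypothetical personal API token
    BUILD_NUM = 3546584  # the example job linked above

    url = ('https://circleci.com/api/v1.1/project/github/pytorch/pytorch/'
           '{}/tests'.format(BUILD_NUM))
    resp = requests.get(url, params={'circle-token': CIRCLE_TOKEN})
    resp.raise_for_status()

    # Each entry carries the test name, classname, result, and run time.
    for t in resp.json().get('tests', []):
        if t.get('result') == 'failure':
            print('{}.{}'.format(t.get('classname'), t.get('name')))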

Test Plan: Imported from OSS

Differential Revision: D18597261

Pulled By: suo

fbshipit-source-id: 07fc7d26bbb834e13cc4cc0e48178645ae6579f5
Author: Michael Suo
Date: 2019-11-19 11:11:09 -08:00
Committed by: Facebook Github Bot
parent 1dbc84ab6d
commit 4b0a6d299c
10 changed files with 75 additions and 16 deletions


@@ -409,12 +409,23 @@ jobs:
else
export id=$(docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE})
fi
retrieve_test_reports() {
echo "retrieving test reports"
docker cp $id:/var/lib/jenkins/workspace/test/test-reports ./ || echo 'No test reports found!'
}
trap "retrieve_test_reports" ERR
if [[ ${BUILD_ENVIRONMENT} == *"multigpu"* ]]; then
export COMMAND='((echo "export BUILD_ENVIRONMENT=${BUILD_ENVIRONMENT}" && echo "${PARALLEL_FLAGS}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && .jenkins/pytorch/multigpu-test.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
else
export COMMAND='((echo "export BUILD_ENVIRONMENT=${BUILD_ENVIRONMENT}" && echo "${PARALLEL_FLAGS}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && .jenkins/pytorch/test.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
fi
echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
retrieve_test_reports
- store_test_results:
path: test-reports
caffe2_linux_build:
<<: *caffe2_params
machine:
@@ -1069,6 +1080,8 @@ jobs:
chmod a+x .jenkins/pytorch/macos-test.sh
unbuffer .jenkins/pytorch/macos-test.sh 2>&1 | ts
- store_test_results:
path: test/test-reports
pytorch_macos_10_13_cuda9_2_cudnn7_py3_build:
environment:
@@ -1299,7 +1312,7 @@ jobs:
name: cert install
no_output_timeout: "1h"
command: |
set -e
PROJ_ROOT=/Users/distiller/project
cd ${PROJ_ROOT}/ios/TestApp
# install fastlane
@@ -1360,7 +1373,7 @@ jobs:
if ! [ -x "$(command -v xcodebuild)" ]; then
echo 'Error: xcodebuild is not installed.'
exit 1
fi
echo ${IOS_DEV_TEAM_ID}
ruby ${PROJ_ROOT}/scripts/xcode_build.rb -i ${PROJ_ROOT}/build_ios/install -x ${PROJ_ROOT}/ios/TestApp/TestApp.xcodeproj -p ${IOS_PLATFORM} -c ${PROFILE} -t ${IOS_DEV_TEAM_ID}
if ! [ "$?" -eq "0" ]; then


@@ -151,6 +151,8 @@
chmod a+x .jenkins/pytorch/macos-test.sh
unbuffer .jenkins/pytorch/macos-test.sh 2>&1 | ts
- store_test_results:
path: test/test-reports
pytorch_macos_10_13_cuda9_2_cudnn7_py3_build:
environment:
@@ -381,7 +383,7 @@
name: cert install
no_output_timeout: "1h"
command: |
set -e
PROJ_ROOT=/Users/distiller/project
cd ${PROJ_ROOT}/ios/TestApp
# install fastlane
@@ -442,7 +444,7 @@
if ! [ -x "$(command -v xcodebuild)" ]; then
echo 'Error: xcodebuild is not installed.'
exit 1
fi
echo ${IOS_DEV_TEAM_ID}
ruby ${PROJ_ROOT}/scripts/xcode_build.rb -i ${PROJ_ROOT}/build_ios/install -x ${PROJ_ROOT}/ios/TestApp/TestApp.xcodeproj -p ${IOS_PLATFORM} -c ${PROFILE} -t ${IOS_DEV_TEAM_ID}
if ! [ "$?" -eq "0" ]; then


@@ -121,9 +121,20 @@ jobs:
else
export id=$(docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE})
fi
retrieve_test_reports() {
echo "retrieving test reports"
docker cp $id:/var/lib/jenkins/workspace/test/test-reports ./ || echo 'No test reports found!'
}
trap "retrieve_test_reports" ERR
if [[ ${BUILD_ENVIRONMENT} == *"multigpu"* ]]; then
export COMMAND='((echo "export BUILD_ENVIRONMENT=${BUILD_ENVIRONMENT}" && echo "${PARALLEL_FLAGS}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && .jenkins/pytorch/multigpu-test.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
else
export COMMAND='((echo "export BUILD_ENVIRONMENT=${BUILD_ENVIRONMENT}" && echo "${PARALLEL_FLAGS}" && echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && .jenkins/pytorch/test.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
fi
echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
retrieve_test_reports
- store_test_results:
path: test-reports

.gitignore

@@ -42,6 +42,7 @@ dropout_model.pt
test/generated_type_hints_smoketest.py
test/htmlcov
test/cpp_extensions/install/
test/test-reports/
third_party/build/
tools/shared/_utils_internal.py
torch.egg-info/


@@ -6,6 +6,9 @@ source "$(dirname "${BASH_SOURCE[0]}")/macos-common.sh"
conda install -y six
pip install -q hypothesis "librosa>=0.6.2" psutil
# TODO move this to docker
pip install unittest-xml-reporting
# faulthandler become built-in since 3.3
if [[ ! $(python -c "import sys; print(int(sys.version_info >= (3, 3)))") == "1" ]]; then
pip install -q faulthandler


@@ -10,8 +10,10 @@ COMPACT_JOB_NAME="${BUILD_ENVIRONMENT}"
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
echo "Testing pytorch (distributed only)"
if [ -n "${IN_CIRCLECI}" ]; then
# TODO move this to docker
pip_install unittest-xml-reporting
if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda9-* ]]; then
# TODO: move this to Docker
sudo apt-get update


@@ -12,6 +12,9 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
echo "Testing pytorch"
if [ -n "${IN_CIRCLECI}" ]; then
# TODO move this to docker
pip_install unittest-xml-reporting
if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda9-* ]]; then
# TODO: move this to Docker
sudo apt-get -qq update


@@ -155,6 +155,13 @@ def repeat_test_for_types(dtypes):
return call_helper
return repeat_helper
# Environment variable `IS_PYTORCH_CI` is set in `.jenkins/common.sh`.
IS_PYTORCH_CI = bool(os.environ.get('IS_PYTORCH_CI'))
IN_CIRCLECI = bool(os.environ.get('IN_CIRCLECI'))
TEST_REPORT_SOURCE_OVERRIDE = os.environ.get('TEST_REPORT_SOURCE_OVERRIDE')
PY3 = sys.version_info > (3, 0)
PY34 = sys.version_info >= (3, 4)
def run_tests(argv=UNITTEST_ARGS):
if TEST_IN_SUBPROCESS:
@@ -179,18 +186,32 @@ def run_tests(argv=UNITTEST_ARGS):
assert len(failed_tests) == 0, "{} unit test(s) failed:\n\t{}".format(
len(failed_tests), '\n\t'.join(failed_tests))
else:
unittest.main(argv=argv)
if IN_CIRCLECI:
# import here so that non-CI doesn't need xmlrunner installed
import xmlrunner
# allow users to override the test file location. We need this
# because the distributed tests run the same test file multiple
# times with different configurations.
if TEST_REPORT_SOURCE_OVERRIDE is not None:
test_source = TEST_REPORT_SOURCE_OVERRIDE
else:
test_source = 'python-unittest'
PY3 = sys.version_info > (3, 0)
PY34 = sys.version_info >= (3, 4)
test_report_path = os.path.join('test-reports', test_source)
if PY3:
os.makedirs(test_report_path, exist_ok=True)
else:
if not os.path.exists(test_report_path):
os.makedirs(test_report_path)
unittest.main(argv=argv, testRunner=xmlrunner.XMLTestRunner(output=test_report_path))
else:
unittest.main(argv=argv)
IS_WINDOWS = sys.platform == "win32"
IS_MACOS = sys.platform == "darwin"
IS_PPC = platform.machine() == "ppc64le"
# Environment variable `IS_PYTORCH_CI` is set in `.jenkins/common.sh`.
IS_PYTORCH_CI = bool(os.environ.get('IS_PYTORCH_CI', 0))
if IS_WINDOWS:
@contextmanager
def TemporaryFileName():


@@ -102,15 +102,18 @@ DISTRIBUTED_TESTS_CONFIG = {}
if dist.is_available():
if dist.is_mpi_available():
DISTRIBUTED_TESTS_CONFIG['mpi'] = {
'WORLD_SIZE': '3'
'WORLD_SIZE': '3',
'TEST_REPORT_SOURCE_OVERRIDE': 'dist-mpi'
}
if dist.is_nccl_available():
DISTRIBUTED_TESTS_CONFIG['nccl'] = {
'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3'
'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3',
'TEST_REPORT_SOURCE_OVERRIDE': 'dist-nccl'
}
if dist.is_gloo_available():
DISTRIBUTED_TESTS_CONFIG['gloo'] = {
'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3'
'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3',
'TEST_REPORT_SOURCE_OVERRIDE': 'dist-gloo'
}
# https://stackoverflow.com/questions/2549939/get-signal-names-from-numbers-in-python


@@ -283,11 +283,11 @@ class TestConcatDataset(TestCase):
# takes in dummy var so this can also be used as a `worker_init_fn`
def set_faulthander_if_available(_=None):
if HAS_FAULTHANDLER:
faulthandler.enable()
faulthandler.enable(sys.__stderr__)
if not IS_WINDOWS:
# windows does not have faulthandler.register
# chain=False prevents the default behavior of killing the process
faulthandler.register(signal.SIGUSR1, chain=False)
faulthandler.register(signal.SIGUSR1, file=sys.__stderr__, chain=False)
set_faulthander_if_available()