mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Make test_torch.py pass cuda-memcheck (#29243)
Summary: Make the following changes: - When there are more than 10k errors, cuda-memcheck only shows 10k errors, in this case we shouldn't raise an Exception - Add UNDER_CUDA_MEMCHECK environment to allow disabling `pin_memory` tests when running cuda-memcheck. - Add a `--ci` command option, when turned on, then this script would run output to stdout instead of writing a file, and exit with an error if cuda-memcheck fails - Add a `--nohang` command option. When turned on, then hang would be treated as pass instead of error - Do simple filtering on the test to run: if `'cpu'` in the test name but not `'cuda'` is not in the test name - Add `--split` and `--rank` to allowing splitting the work (NVIDIA CI has a limitation of 3 hours, we have to split the work to satisfy this limitation) - The error summary could be `ERROR SUMMARY: 1 error`, or `ERROR SUMMARY: 2 errors`, the tail could be `error` or `errors`, it is not of the same length. The script is fixed to handle this case. - Ignore errors from `cufft` Pull Request resolved: https://github.com/pytorch/pytorch/pull/29243 Differential Revision: D18941701 Pulled By: mruberry fbshipit-source-id: 2048428f32b66ef50c67444c03ce4dd9491179d2
This commit is contained in:
committed by
Facebook Github Bot
parent
701e05dcbb
commit
ffe0c1ae4d
@ -9,15 +9,20 @@ class ParseError(Exception):
|
||||
class Report:
|
||||
"""A report is a container of errors, and a summary on how many errors are found"""
|
||||
|
||||
HEAD = 'ERROR SUMMARY: '
|
||||
TAIL = ' errors'
|
||||
|
||||
def __init__(self, text, errors):
|
||||
# text is something like
|
||||
# ERROR SUMMARY: 1 error
|
||||
# or
|
||||
# ERROR SUMMARY: 2 errors
|
||||
self.text = text
|
||||
self.num_errors = int(text[len(self.HEAD):len(text) - len(self.TAIL)])
|
||||
self.num_errors = int(text.strip().split()[2])
|
||||
self.errors = errors
|
||||
if len(errors) != self.num_errors:
|
||||
raise ParseError("Number of errors does not match")
|
||||
if len(errors) == 10000 and self.num_errors > 10000:
|
||||
# When there are more than 10k errors, cuda-memcheck only display 10k
|
||||
self.num_errors = 10000
|
||||
else:
|
||||
raise ParseError("Number of errors does not match")
|
||||
|
||||
|
||||
class Error:
|
||||
|
@ -18,7 +18,8 @@ import multiprocessing
|
||||
import argparse
|
||||
import subprocess
|
||||
import tqdm
|
||||
import re
|
||||
import os
|
||||
import sys
|
||||
import cuda_memcheck_common as cmc
|
||||
|
||||
ALL_TESTS = []
|
||||
@ -35,6 +36,13 @@ parser.add_argument('--nproc', type=int, default=multiprocessing.cpu_count(),
|
||||
help='Number of processes running tests, default to number of cores in the system')
|
||||
parser.add_argument('--gpus', default='all',
|
||||
help='GPU assignments for each process, it could be "all", or : separated list like "1,2:3,4:5,6"')
|
||||
parser.add_argument('--ci', action='store_true',
|
||||
help='Whether this script is executed in CI. When executed inside a CI, this script fails when '
|
||||
'an error is detected. Also, it will not show tqdm progress bar, but directly print the error'
|
||||
'to stdout instead.')
|
||||
parser.add_argument('--nohang', action='store_true', help='Treat timeout as success')
|
||||
parser.add_argument('--split', type=int, default=1, help='Split the job into pieces')
|
||||
parser.add_argument('--rank', type=int, default=0, help='Which piece this process should pick')
|
||||
args = parser.parse_args()
|
||||
|
||||
# Filters that ignores cublas/cudnn errors
|
||||
@ -48,10 +56,13 @@ def is_ignored_only(output):
|
||||
return False
|
||||
count_ignored_errors = 0
|
||||
for e in report.errors:
|
||||
if 'libcublas' in ''.join(e.stack) or 'libcudnn' in ''.join(e.stack):
|
||||
if 'libcublas' in ''.join(e.stack) or 'libcudnn' in ''.join(e.stack) or 'libcufft' in ''.join(e.stack):
|
||||
count_ignored_errors += 1
|
||||
return count_ignored_errors == report.num_errors
|
||||
|
||||
# Set environment PYTORCH_CUDA_MEMCHECK=1 to allow skipping some tests
|
||||
os.environ['PYTORCH_CUDA_MEMCHECK'] = '1'
|
||||
|
||||
# Discover tests:
|
||||
# To get a list of tests, run:
|
||||
# pytest --setup-only test/test_torch.py
|
||||
@ -66,6 +77,21 @@ for line in lines:
|
||||
line = line.replace('::', '.')
|
||||
ALL_TESTS.append(line)
|
||||
|
||||
# Do a simple filtering:
|
||||
# if 'cpu' or 'CPU' is in the name and 'cuda' or 'CUDA' is not in the name, then skip it
|
||||
def is_cpu_only(name):
|
||||
name = name.lower()
|
||||
return ('cpu' in name) and not ('cuda' in name)
|
||||
|
||||
ALL_TESTS = [x for x in ALL_TESTS if not is_cpu_only(x)]
|
||||
|
||||
# Split all tests into chunks, and only on the selected chunk
|
||||
ALL_TESTS.sort()
|
||||
chunk_size = (len(ALL_TESTS) + args.split - 1) // args.split
|
||||
start = chunk_size * args.rank
|
||||
end = chunk_size * (args.rank + 1)
|
||||
ALL_TESTS = ALL_TESTS[start:end]
|
||||
|
||||
# Run tests:
|
||||
# Since running cuda-memcheck on PyTorch unit tests is very slow, these tests must be run in parallel.
|
||||
# This is done by using the coroutine feature in new Python versions. A number of coroutines are created;
|
||||
@ -74,8 +100,17 @@ for line in lines:
|
||||
# These subprocesses are balanced across different GPUs on the system by assigning one devices per process,
|
||||
# or as specified by the user
|
||||
progress = 0
|
||||
logfile = open('result.log', 'w')
|
||||
progressbar = tqdm.tqdm(total=len(ALL_TESTS))
|
||||
if not args.ci:
|
||||
logfile = open('result.log', 'w')
|
||||
progressbar = tqdm.tqdm(total=len(ALL_TESTS))
|
||||
else:
|
||||
logfile = sys.stdout
|
||||
|
||||
# create a fake progress bar that does not display anything
|
||||
class ProgressbarStub:
|
||||
def update(*args):
|
||||
return
|
||||
progressbar = ProgressbarStub()
|
||||
|
||||
async def run1(coroutine_id):
|
||||
global progress
|
||||
@ -97,6 +132,8 @@ async def run1(coroutine_id):
|
||||
except asyncio.TimeoutError:
|
||||
print('Timeout:', test, file=logfile)
|
||||
proc.kill()
|
||||
if args.ci and not args.nohang:
|
||||
sys.exit("Hang detected on cuda-memcheck")
|
||||
else:
|
||||
if proc.returncode == 0:
|
||||
print('Success:', test, file=logfile)
|
||||
@ -108,13 +145,15 @@ async def run1(coroutine_id):
|
||||
print('Fail:', test, file=logfile)
|
||||
print(stdout, file=logfile)
|
||||
print(stderr, file=logfile)
|
||||
if args.ci:
|
||||
sys.exit("Failure detected on cuda-memcheck")
|
||||
else:
|
||||
print('Ignored:', test, file=logfile)
|
||||
del proc
|
||||
progressbar.update(1)
|
||||
|
||||
async def main():
|
||||
tasks = [asyncio.create_task(run1(i)) for i in range(args.nproc)]
|
||||
tasks = [asyncio.ensure_future(run1(i)) for i in range(args.nproc)]
|
||||
for t in tasks:
|
||||
await t
|
||||
|
||||
|
Reference in New Issue
Block a user