mirror of
				https://github.com/pytorch/pytorch.git
				synced 2025-10-20 21:14:14 +08:00 
			
		
		
		
	Pull Request resolved: https://github.com/pytorch/pytorch/pull/144556 Approved by: https://github.com/ezyang
		
			
				
	
	
		
			207 lines
		
	
	
		
			6.7 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			207 lines
		
	
	
		
			6.7 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
| #!/usr/bin/env python3
 | |
| 
 | |
| """This script runs cuda-memcheck on the specified unit test. Each test case
 | |
| is run in its isolated process with a timeout so that:
 | |
| 1) different test cases won't influence each other, and
 | |
| 2) in case of hang, the script would still finish in a finite amount of time.
 | |
| The output will be written to a log file result.log
 | |
| 
 | |
| Example usage:
 | |
|     python run_cuda_memcheck.py ../test_torch.py 600
 | |
| 
 | |
| Note that running cuda-memcheck could be very slow.
 | |
| """
 | |
| 
 | |
| import argparse
 | |
| import asyncio
 | |
| import multiprocessing
 | |
| import os
 | |
| import subprocess
 | |
| import sys
 | |
| 
 | |
| import cuda_memcheck_common as cmc
 | |
| import tqdm
 | |
| 
 | |
| import torch
 | |
| 
 | |
| 
 | |
# Names of the test cases to run; filled in by the discovery step below.
ALL_TESTS = []
# Number of CUDA devices reported by torch; run1 uses it to spread workers
# across GPUs when --gpus is "all".
GPUS = torch.cuda.device_count()

# parse arguments
parser = argparse.ArgumentParser(description="Run isolated cuda-memcheck on unit tests")
parser.add_argument(
    "filename", help="the python file for a test, such as test_torch.py"
)
parser.add_argument(
    "timeout",
    type=int,
    help="kill the test if it does not terminate in a certain amount of seconds",
)
parser.add_argument(
    "--strict",
    action="store_true",
    # NOTE: adjacent string literals are concatenated verbatim, so each piece
    # must carry its own trailing space.
    help="Whether to show cublas/cudnn errors. These errors are ignored by default because "
    "cublas/cudnn does not run error-free under cuda-memcheck.",
)
parser.add_argument(
    "--nproc",
    type=int,
    default=multiprocessing.cpu_count(),
    help="Number of processes running tests, default to number of cores in the system",
)
parser.add_argument(
    "--gpus",
    default="all",
    help='GPU assignments for each process, it could be "all", or : separated list like "1,2:3,4:5,6"',
)
parser.add_argument(
    "--ci",
    action="store_true",
    help="Whether this script is executed in CI. When executed inside a CI, this script fails when "
    "an error is detected. Also, it will not show tqdm progress bar, but directly print the error "
    "to stdout instead.",
)
parser.add_argument("--nohang", action="store_true", help="Treat timeout as success")
parser.add_argument("--split", type=int, default=1, help="Split the job into pieces")
parser.add_argument(
    "--rank", type=int, default=0, help="Which piece this process should pick"
)
args = parser.parse_args()

# Reject shard settings that cannot select a valid chunk: --split 0 would
# divide by zero in the chunk-size computation, and a rank outside
# [0, split) does not name one of the pieces.
if args.split < 1:
    parser.error("--split must be a positive integer")
if not 0 <= args.rank < args.split:
    parser.error("--rank must satisfy 0 <= rank < split")
 | |
| 
 | |
| 
 | |
| # Filter that ignores cublas/cudnn/cufft errors
 | |
| # TODO (@zasdfgbnm): When can we remove this? Will cublas/cudnn run error-free under cuda-memcheck?
 | |
def is_ignored_only(output):
    """Tell whether a cuda-memcheck report contains only ignorable errors.

    The report is obtained from ``cmc.parse(output)``. An error counts as
    ignorable when the concatenation of its ``stack`` entries mentions
    libcublas, libcudnn or libcufft.

    Returns:
        True when the number of ignorable errors equals ``report.num_errors``;
        False otherwise, including when the output cannot be parsed.
    """
    try:
        report = cmc.parse(output)
    except cmc.ParseError:
        # An output the simple parser cannot understand is never ignored.
        return False

    ignorable_libs = ("libcublas", "libcudnn", "libcufft")
    num_ignorable = 0
    for error in report.errors:
        frames = "".join(error.stack)
        if any(lib in frames for lib in ignorable_libs):
            num_ignorable += 1
    return num_ignorable == report.num_errors
 | |
| 
 | |
| 
 | |
# Set environment PYTORCH_CUDA_MEMCHECK=1 to allow skipping some tests
os.environ["PYTORCH_CUDA_MEMCHECK"] = "1"

# Discover tests:
# To get a list of tests, run:
# pytest --setup-only test/test_torch.py
# and then parse the output
proc = subprocess.run(
    ["pytest", "--setup-only", args.filename],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    check=False,
)
for line in proc.stdout.decode().strip().splitlines():
    # Only lines carrying the "(fixtures used:" marker name a test case.
    if "(fixtures used:" not in line:
        continue
    # First whitespace-separated token is the test id, e.g.
    # "path/to/file.py::Class::test"; drop everything up to the first "::"
    # and turn the remaining "::" separators into ".".
    line = line.strip().split()[0]
    line = line[line.find("::") + 2 :]
    line = line.replace("::", ".")
    ALL_TESTS.append(line)

# Discovery problems used to be invisible because pytest's stderr and exit
# code were dropped; report them without changing what the script does next.
if not ALL_TESTS:
    print(
        f"Warning: no tests discovered in {args.filename} "
        f"(pytest exit code {proc.returncode})",
        file=sys.stderr,
    )
    sys.stderr.write(proc.stderr.decode(errors="replace"))
 | |
| 
 | |
| 
 | |
| # Do a simple filtering:
 | |
| # if 'cpu' or 'CPU' is in the name and 'cuda' or 'CUDA' is not in the name, then skip it
 | |
def is_cpu_only(name):
    """Return True if *name* mentions "cpu" but not "cuda" (case-insensitive)."""
    lowered = name.lower()
    if "cuda" in lowered:
        return False
    return "cpu" in lowered
 | |
| 
 | |
| 
 | |
# Drop the test cases that is_cpu_only() classifies as CPU-only, then sort
# the remaining names.
ALL_TESTS = sorted(name for name in ALL_TESTS if not is_cpu_only(name))

# Split the sorted tests into args.split contiguous chunks (chunk size rounded
# up) and keep only the chunk selected by args.rank.
num_tests = len(ALL_TESTS)
chunk_size = (num_tests + args.split - 1) // args.split
start = args.rank * chunk_size
end = start + chunk_size
ALL_TESTS = ALL_TESTS[start:end]
 | |
| 
 | |
| # Run tests:
 | |
| # Since running cuda-memcheck on PyTorch unit tests is very slow, these tests must be run in parallel.
 | |
| # This is done by using the coroutine feature in new Python versions.  A number of coroutines are created;
 | |
| # they create subprocesses and await them to finish. The number of running subprocesses can be
 | |
| # specified by the user and by default is the same as the number of CPUs in the machine.
 | |
| # These subprocesses are balanced across different GPUs on the system by assigning one device per process,
 | |
| # or as specified by the user
 | |
# Index into ALL_TESTS of the next test to hand out; advanced by run1.
progress = 0
if args.ci:
    # In CI the results are printed to stdout and no progress bar is shown.
    logfile = sys.stdout

    class ProgressbarStub:
        """Stand-in for the tqdm progress bar whose update() does nothing."""

        def update(self, *_ignored):
            return None

    progressbar = ProgressbarStub()
else:
    logfile = open("result.log", "w")
    progressbar = tqdm.tqdm(total=len(ALL_TESTS))
 | |
| 
 | |
| 
 | |
async def run1(coroutine_id):
    """Worker coroutine: run tests from ALL_TESTS one at a time under cuda-memcheck.

    Each worker repeatedly claims the next unclaimed test by advancing the
    module-level ``progress`` counter, runs it in its own subprocess with a
    timeout of ``args.timeout`` seconds, and writes the outcome ("Success",
    "Fail", "Ignored" or "Timeout") to ``logfile``.

    Args:
        coroutine_id: Index of this worker. With ``--gpus all`` the worker uses
            GPU ``coroutine_id % GPUS``; otherwise it uses the
            ``coroutine_id``-th entry of the ":"-separated ``--gpus`` value.

    Raises:
        SystemExit: In CI mode, on the first displayed failure, or on the
            first timeout unless ``--nohang`` was given.
    """
    global progress

    if args.gpus == "all":
        gpuid = coroutine_id % GPUS
    else:
        gpu_assignments = args.gpus.split(":")
        assert args.nproc == len(gpu_assignments), (
            "Please specify GPU assignment for each process, separated by :"
        )
        gpuid = gpu_assignments[coroutine_id]

    # Hand the GPU assignment to the child through its environment rather
    # than through a "VAR=value command" shell prefix.
    child_env = dict(os.environ, CUDA_VISIBLE_DEVICES=str(gpuid))

    while progress < len(ALL_TESTS):
        test = ALL_TESTS[progress]
        progress += 1
        # Launch cuda-memcheck directly with an argument list: the file name
        # and test id are passed verbatim (no shell quoting or globbing), and
        # kill() below signals cuda-memcheck itself, not a shell wrapper.
        proc = await asyncio.create_subprocess_exec(
            "cuda-memcheck",
            "--error-exitcode",
            "1",
            "python",
            args.filename,
            test,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            env=child_env,
        )
        try:
            stdout, stderr = await asyncio.wait_for(proc.communicate(), args.timeout)
        except asyncio.TimeoutError:
            print("Timeout:", test, file=logfile)
            try:
                proc.kill()
            except ProcessLookupError:
                # The process exited on its own right after the timeout fired.
                pass
            # Reap the killed child so it does not linger as a zombie.
            await proc.wait()
            if args.ci and not args.nohang:
                sys.exit("Hang detected on cuda-memcheck")
        else:
            if proc.returncode == 0:
                print("Success:", test, file=logfile)
            else:
                # Tool output is not guaranteed to be valid UTF-8; never let a
                # stray byte abort the whole run.
                stdout = stdout.decode(errors="replace")
                stderr = stderr.decode(errors="replace")
                should_display = args.strict or not is_ignored_only(stdout)
                if should_display:
                    print("Fail:", test, file=logfile)
                    print(stdout, file=logfile)
                    print(stderr, file=logfile)
                    if args.ci:
                        sys.exit("Failure detected on cuda-memcheck")
                else:
                    print("Ignored:", test, file=logfile)
        progressbar.update(1)
 | |
| 
 | |
| 
 | |
async def main():
    """Schedule args.nproc run1 workers, then wait for each one in turn."""
    workers = tuple(
        asyncio.ensure_future(run1(worker_id)) for worker_id in range(args.nproc)
    )
    for worker in workers:
        await worker
 | |
| 
 | |
| 
 | |
if __name__ == "__main__":
    try:
        # asyncio.run() creates and closes its own event loop; calling
        # asyncio.get_event_loop() with no running loop is deprecated in
        # recent Python versions.
        asyncio.run(main())
    finally:
        # Flush and close result.log even when a worker exits early via
        # sys.exit(); in CI mode logfile is sys.stdout and is left open.
        if logfile is not sys.stdout:
            logfile.close()
 |