Add ability to save TORCH_COMPILE_DEBUG logs for CI failures (#124408)

Summary: The intent is that we can whitelist certain benchmarks to a) enable TORCH_COMPILE_DEBUG=1, and b) save the generated artifacts under test/debug/ in case of a failure. Via the rules in action.yml, we can then upload test/debug/ to S3 whenever it exists.

I chose to introduce a new directory (test/debug/) rather than reuse an existing one (e.g., test/test-reports/) because these artifacts aren't really test reports, and a dedicated directory leaves room for other debug-related artifacts later if we find them useful; for example, we might later explore including the inductor cache artifacts.
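
For example, opting a benchmark into this flow looks roughly like the following (a sketch of the CI_PRESERVE_COMPILE_DEBUG dict added below; the model name and status string are placeholders taken from the commented-out example):

    # Hypothetical whitelist entry in the benchmark runner's module-level config:
    CI_PRESERVE_COMPILE_DEBUG = {
        # Capture and upload TORCH_COMPILE_DEBUG output for mnasnet1_0 runs
        # that finish with an accuracy failure.
        "mnasnet1_0": ["fail_accuracy"],
    }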

Test Plan:
See artifacts generated when I force a failure: https://hud.pytorch.org/pr/124234
Specifically: https://gha-artifacts.s3.amazonaws.com/pytorch/pytorch/8729891826/1/artifact/debug-test-inductor_torchbench-2-2-linux.g5.4xlarge.nvidia.gpu_23953679574.zip
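
When a debug artifact is produced, the debug-*.zip is uploaded under the same S3 prefix as the other test artifacts, and its entries all sit under test/debug/ because action.yml zips that directory. A downloaded copy can be inspected with a small sketch like this (the filename is the example artifact above; adjust to whatever the run produced):

    import zipfile

    # Hypothetical local copy of an uploaded debug artifact.
    name = "debug-test-inductor_torchbench-2-2-linux.g5.4xlarge.nvidia.gpu_23953679574.zip"
    with zipfile.ZipFile(name) as zf:
        # Entries are expected under test/debug/torch_compile_debug/<run dir>/...
        for entry in zf.namelist():
            print(entry)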

Pull Request resolved: https://github.com/pytorch/pytorch/pull/124408
Approved by: https://github.com/desertfire
Author: Sam Larsen
Date: 2024-04-18 09:36:51 -07:00
Committed by: PyTorch MergeBot
Parent: 889e3eeed3
Commit: 290e3e7abb

3 changed files with 61 additions and 2 deletions

@@ -46,7 +46,7 @@ runs:
       env:
         FILE_SUFFIX: ${{ inputs.file-suffix }}
       run: |
-        # Remove any previous test reports if they exist
+        # Remove any previous usage logs if they exist
         rm -f logs-*.zip
         # this workflow is also run in bazel build test, but we dont generate usage reports for it
         # so check to see if the file exists first
@@ -57,6 +57,18 @@ runs:
           zip -r "logs-${FILE_SUFFIX}.zip" test -i '*.log'
         fi
 
+    - name: Zip debugging artifacts for upload
+      if: runner.os != 'Windows' && !inputs.use-gha
+      shell: bash
+      env:
+        FILE_SUFFIX: ${{ inputs.file-suffix }}
+      run: |
+        # Remove any previous debugging artifacts if they exist
+        rm -f debug-*.zip
+        if [ -d 'test/debug' ]; then
+          zip -r "debug-${FILE_SUFFIX}.zip" test/debug
+        fi
+
     # Windows zip
     - name: Zip JSONs for upload
       if: runner.os == 'Windows' && !inputs.use-gha
@@ -121,6 +133,18 @@ runs:
         if-no-files-found: ignore
         path: logs-*.zip
 
+    - name: Store Debug Artifacts on S3
+      uses: seemethere/upload-artifact-s3@v5
+      if: ${{ !inputs.use-gha }}
+      continue-on-error: true
+      with:
+        s3-bucket: ${{ inputs.s3-bucket }}
+        s3-prefix: |
+          ${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
+        retention-days: 14
+        if-no-files-found: ignore
+        path: debug-*.zip
+
     # GHA upload
     - name: Store Test Downloaded JSONs on Github
       uses: actions/upload-artifact@v3

.gitignore

@@ -54,6 +54,7 @@ test/.coverage
 test/.hypothesis/
 test/cpp/api/mnist
 test/custom_operator/model.pt
+test/debug/
 test/jit_hooks/*.pt
 test/data/legacy_modules.t7
 test/data/*.pt

@@ -258,6 +258,15 @@ CI_USE_SGD = {
 
 DO_NOT_CAST_INPUTS = {"stable_diffusion"}
 
+# Maps a benchmark model name to a list of status codes. For any listed entry, we'll
+# capture TORCH_COMPILE_DEBUG logs in CI runs and preserve them (i.e., for upload) if
+# the result status matches one listed.
+CI_PRESERVE_COMPILE_DEBUG = {
+    # For example:
+    # "mnasnet1_0": ["fail_accuracy"],
+}
+
+
 def model_specified_by_path(path_and_class_str):
     return ":" in path_and_class_str
@@ -2854,6 +2863,24 @@ class BenchmarkRunner:
                 repro_dir,
             )
 
+    def maybe_preserve_compile_debug(self, name, status):
+        if (
+            name in CI_PRESERVE_COMPILE_DEBUG
+            and status in CI_PRESERVE_COMPILE_DEBUG[name]
+        ):
+            src_dir = torch._dynamo.utils.get_debug_dir()
+            if os.path.isdir(src_dir):
+                dbg_dir = os.path.join(
+                    os.getcwd(), "test", "debug", "torch_compile_debug"
+                )
+                dst_dir = os.path.join(dbg_dir, os.path.basename(src_dir))
+                try:
+                    os.makedirs(dbg_dir, exist_ok=True)
+                    os.rename(src_dir, dst_dir)
+                    log.warning("Moved %s to %s", src_dir, dst_dir)
+                except OSError:
+                    log.exception("Failed to preserve %s", src_dir)
+
     def run_one_model(
         self,
         name,
@@ -2891,6 +2918,8 @@ class BenchmarkRunner:
             print(status)
             torch.cuda.empty_cache()
 
+        self.maybe_preserve_compile_debug(name, status)
+
         if self.args.timing:
             from torch._dynamo.utils import op_count, print_time_report
             from torch.utils._stats import simple_call_counter
@@ -4068,8 +4097,13 @@ def run(runner, args, original_dir=None):
             timeout = args.timeout
             if should_diff_branch(args):
                 timeout *= 2
+            env = os.environ.copy()
+            if args.ci and name in CI_PRESERVE_COMPILE_DEBUG:
+                env["TORCH_COMPILE_DEBUG"] = "1"
             subprocess.check_call(
-                [sys.executable] + sys.argv + [f"--only={name}"], timeout=timeout
+                [sys.executable] + sys.argv + [f"--only={name}"],
+                timeout=timeout,
+                env=env,
             )
         except subprocess.TimeoutExpired:
             write_csv_when_exception(args, name, "timeout")
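
For reference, the directory shuffle performed by maybe_preserve_compile_debug can be exercised in isolation with a throwaway sketch like the one below (the temp directories and dummy file name are made up; it does not touch a real torch_compile_debug run):

    import os
    import tempfile

    # Stand-in for the directory torch._dynamo.utils.get_debug_dir() would return.
    work = tempfile.mkdtemp()
    src_dir = os.path.join(work, "torch_compile_debug", "run_2024_04_18_00_00_00")
    os.makedirs(src_dir)
    open(os.path.join(src_dir, "dummy.log"), "w").close()  # fake debug artifact

    # Mirror the move into test/debug/torch_compile_debug/<run dir>, as done in CI.
    dbg_dir = os.path.join(work, "test", "debug", "torch_compile_debug")
    dst_dir = os.path.join(dbg_dir, os.path.basename(src_dir))
    os.makedirs(dbg_dir, exist_ok=True)
    os.rename(src_dir, dst_dir)

    assert os.path.isfile(os.path.join(dst_dir, "dummy.log"))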