mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 12:54:11 +08:00
Add ability to save TORCH_COMPILE_DEBUG logs for CI failures (#124408)
Summary: The intent is that we can whitelist certain benchmarks to a) enable TORCH_COMPILE_DEBUG=1, and b) save the generated artifacts in test/debug in case of a failure. Via the rules in action.yml, we can then upload test/debug/ to S3 whenever it exists. I chose to introduce a new directory (test/debug/) rather than using an existing one (e.g., test/test-reports/), because these don't seem like test reports and we can later add other debug-related artifacts if we find it useful. For example, we might want to later explore including the inductor cache artifacts. Test Plan: See artifacts generated when I force a failure: https://hud.pytorch.org/pr/124234 Specifically: https://gha-artifacts.s3.amazonaws.com/pytorch/pytorch/8729891826/1/artifact/debug-test-inductor_torchbench-2-2-linux.g5.4xlarge.nvidia.gpu_23953679574.zip Pull Request resolved: https://github.com/pytorch/pytorch/pull/124408 Approved by: https://github.com/desertfire
This commit is contained in:
committed by
PyTorch MergeBot
parent
889e3eeed3
commit
290e3e7abb
26
.github/actions/upload-test-artifacts/action.yml
vendored
26
.github/actions/upload-test-artifacts/action.yml
vendored
@ -46,7 +46,7 @@ runs:
|
||||
env:
|
||||
FILE_SUFFIX: ${{ inputs.file-suffix }}
|
||||
run: |
|
||||
# Remove any previous test reports if they exist
|
||||
# Remove any previous usage logs if they exist
|
||||
rm -f logs-*.zip
|
||||
# this workflow is also run in bazel build test, but we don't generate usage reports for it
|
||||
# so check to see if the file exists first
|
||||
@ -57,6 +57,18 @@ runs:
|
||||
zip -r "logs-${FILE_SUFFIX}.zip" test -i '*.log'
|
||||
fi
|
||||
|
||||
- name: Zip debugging artifacts for upload
|
||||
if: runner.os != 'Windows' && !inputs.use-gha
|
||||
shell: bash
|
||||
env:
|
||||
FILE_SUFFIX: ${{ inputs.file-suffix }}
|
||||
run: |
|
||||
# Remove any previous debugging artifacts if they exist
|
||||
rm -f debug-*.zip
|
||||
if [ -d 'test/debug' ]; then
|
||||
zip -r "debug-${FILE_SUFFIX}.zip" test/debug
|
||||
fi
|
||||
|
||||
# Windows zip
|
||||
- name: Zip JSONs for upload
|
||||
if: runner.os == 'Windows' && !inputs.use-gha
|
||||
@ -121,6 +133,18 @@ runs:
|
||||
if-no-files-found: ignore
|
||||
path: logs-*.zip
|
||||
|
||||
- name: Store Debug Artifacts on S3
|
||||
uses: seemethere/upload-artifact-s3@v5
|
||||
if: ${{ !inputs.use-gha }}
|
||||
continue-on-error: true
|
||||
with:
|
||||
s3-bucket: ${{ inputs.s3-bucket }}
|
||||
s3-prefix: |
|
||||
${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
|
||||
retention-days: 14
|
||||
if-no-files-found: ignore
|
||||
path: debug-*.zip
|
||||
|
||||
# GHA upload
|
||||
- name: Store Test Downloaded JSONs on Github
|
||||
uses: actions/upload-artifact@v3
|
||||
|
1
.gitignore
vendored
1
.gitignore
vendored
@ -54,6 +54,7 @@ test/.coverage
|
||||
test/.hypothesis/
|
||||
test/cpp/api/mnist
|
||||
test/custom_operator/model.pt
|
||||
test/debug/
|
||||
test/jit_hooks/*.pt
|
||||
test/data/legacy_modules.t7
|
||||
test/data/*.pt
|
||||
|
@ -258,6 +258,15 @@ CI_USE_SGD = {
|
||||
DO_NOT_CAST_INPUTS = {"stable_diffusion"}
|
||||
|
||||
|
||||
# Maps a benchmark model name to a list of status codes. For any listed entry, we'll
|
||||
# capture TORCH_COMPILE_DEBUG logs in CI runs and preserve them (i.e., for upload) if
|
||||
# the result status matches one listed.
|
||||
CI_PRESERVE_COMPILE_DEBUG = {
|
||||
# For example:
|
||||
# "mnasnet1_0": ["fail_accuracy"],
|
||||
}
|
||||
|
||||
|
||||
def model_specified_by_path(path_and_class_str):
    """Return True when the model spec contains a ``:`` separator.

    Args:
        path_and_class_str: Model specifier string given by the caller.

    Returns:
        ``True`` if ``":"`` occurs anywhere in the string, else ``False``.
    """
    return ":" in path_and_class_str
|
||||
|
||||
@ -2854,6 +2863,24 @@ class BenchmarkRunner:
|
||||
repro_dir,
|
||||
)
|
||||
|
||||
def maybe_preserve_compile_debug(self, name, status):
    """Move the compile-debug directory aside so it can be uploaded.

    Nothing happens unless ``name`` is a key of ``CI_PRESERVE_COMPILE_DEBUG``
    and ``status`` is one of the statuses listed for it. In that case the
    directory returned by ``torch._dynamo.utils.get_debug_dir()`` (if it
    exists) is moved to
    ``<cwd>/test/debug/torch_compile_debug/<basename of that directory>``.

    Any ``OSError`` raised while creating the destination or moving the
    directory is logged and swallowed; this is best-effort and must not
    fail the benchmark run.

    Args:
        name: Benchmark model name.
        status: Result status string of the run (e.g. ``"fail_accuracy"``).
    """
    import errno
    import shutil

    if status not in CI_PRESERVE_COMPILE_DEBUG.get(name, ()):
        return

    src_dir = torch._dynamo.utils.get_debug_dir()
    if not os.path.isdir(src_dir):
        return

    dbg_dir = os.path.join(os.getcwd(), "test", "debug", "torch_compile_debug")
    dst_dir = os.path.join(dbg_dir, os.path.basename(src_dir))
    try:
        os.makedirs(dbg_dir, exist_ok=True)
        try:
            os.rename(src_dir, dst_dir)
        except OSError as e:
            # os.rename cannot cross filesystem boundaries; only in that
            # case fall back to a copy-and-delete move. Every other error
            # keeps its original handling below.
            if e.errno != errno.EXDEV:
                raise
            shutil.move(src_dir, dst_dir)
        log.warning("Moved %s to %s", src_dir, dst_dir)
    except OSError:
        # shutil.Error is an OSError subclass, so fallback failures land here too.
        log.exception("Failed to preserve %s", src_dir)
|
||||
|
||||
def run_one_model(
|
||||
self,
|
||||
name,
|
||||
@ -2891,6 +2918,8 @@ class BenchmarkRunner:
|
||||
print(status)
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
self.maybe_preserve_compile_debug(name, status)
|
||||
|
||||
if self.args.timing:
|
||||
from torch._dynamo.utils import op_count, print_time_report
|
||||
from torch.utils._stats import simple_call_counter
|
||||
@ -4068,8 +4097,13 @@ def run(runner, args, original_dir=None):
|
||||
timeout = args.timeout
|
||||
if should_diff_branch(args):
|
||||
timeout *= 2
|
||||
env = os.environ.copy()
|
||||
if args.ci and name in CI_PRESERVE_COMPILE_DEBUG:
|
||||
env["TORCH_COMPILE_DEBUG"] = "1"
|
||||
subprocess.check_call(
|
||||
[sys.executable] + sys.argv + [f"--only={name}"], timeout=timeout
|
||||
[sys.executable] + sys.argv + [f"--only={name}"],
|
||||
timeout=timeout,
|
||||
env=env,
|
||||
)
|
||||
except subprocess.TimeoutExpired:
|
||||
write_csv_when_exception(args, name, "timeout")
|
||||
|
Reference in New Issue
Block a user