mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Add ability to save TORCH_COMPILE_DEBUG logs for CI failures (#124408)
Summary: The intent is that we can whitelist certain benchmarks to a) enable TORCH_COMPILE_DEBUG=1, and b) save the generated artifacts in test/debug in case of a failure. Via the rules in action.yml, we can then upload test/debug/ to S3 whenever it exists. I chose to introduce a new directory (test/debug/) rather than using an existing one (e.g., test/test-reports/), because these don't seem like test reports and we can later add other debug-related artifacts if we find it useful. For example, we might want to later explore including the inductor cache artifacts. Test Plan: See artifacts generated when I force a failure: https://hud.pytorch.org/pr/124234 Specifically: https://gha-artifacts.s3.amazonaws.com/pytorch/pytorch/8729891826/1/artifact/debug-test-inductor_torchbench-2-2-linux.g5.4xlarge.nvidia.gpu_23953679574.zip Pull Request resolved: https://github.com/pytorch/pytorch/pull/124408 Approved by: https://github.com/desertfire
This commit is contained in:
committed by
PyTorch MergeBot
parent
889e3eeed3
commit
290e3e7abb
26
.github/actions/upload-test-artifacts/action.yml
vendored
26
.github/actions/upload-test-artifacts/action.yml
vendored
@ -46,7 +46,7 @@ runs:
|
|||||||
env:
|
env:
|
||||||
FILE_SUFFIX: ${{ inputs.file-suffix }}
|
FILE_SUFFIX: ${{ inputs.file-suffix }}
|
||||||
run: |
|
run: |
|
||||||
# Remove any previous test reports if they exist
|
# Remove any previous usage logs if they exist
|
||||||
rm -f logs-*.zip
|
rm -f logs-*.zip
|
||||||
# this workflow is also run in bazel build test, but we don't generate usage reports for it
|
# this workflow is also run in bazel build test, but we don't generate usage reports for it
|
||||||
# so check to see if the file exists first
|
# so check to see if the file exists first
|
||||||
@ -57,6 +57,18 @@ runs:
|
|||||||
zip -r "logs-${FILE_SUFFIX}.zip" test -i '*.log'
|
zip -r "logs-${FILE_SUFFIX}.zip" test -i '*.log'
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
- name: Zip debugging artifacts for upload
|
||||||
|
if: runner.os != 'Windows' && !inputs.use-gha
|
||||||
|
shell: bash
|
||||||
|
env:
|
||||||
|
FILE_SUFFIX: ${{ inputs.file-suffix }}
|
||||||
|
run: |
|
||||||
|
# Remove any previous debugging artifacts if they exist
|
||||||
|
rm -f debug-*.zip
|
||||||
|
if [ -d 'test/debug' ]; then
|
||||||
|
zip -r "debug-${FILE_SUFFIX}.zip" test/debug
|
||||||
|
fi
|
||||||
|
|
||||||
# Windows zip
|
# Windows zip
|
||||||
- name: Zip JSONs for upload
|
- name: Zip JSONs for upload
|
||||||
if: runner.os == 'Windows' && !inputs.use-gha
|
if: runner.os == 'Windows' && !inputs.use-gha
|
||||||
@ -121,6 +133,18 @@ runs:
|
|||||||
if-no-files-found: ignore
|
if-no-files-found: ignore
|
||||||
path: logs-*.zip
|
path: logs-*.zip
|
||||||
|
|
||||||
|
- name: Store Debug Artifacts on S3
|
||||||
|
uses: seemethere/upload-artifact-s3@v5
|
||||||
|
if: ${{ !inputs.use-gha }}
|
||||||
|
continue-on-error: true
|
||||||
|
with:
|
||||||
|
s3-bucket: ${{ inputs.s3-bucket }}
|
||||||
|
s3-prefix: |
|
||||||
|
${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
|
||||||
|
retention-days: 14
|
||||||
|
if-no-files-found: ignore
|
||||||
|
path: debug-*.zip
|
||||||
|
|
||||||
# GHA upload
|
# GHA upload
|
||||||
- name: Store Test Downloaded JSONs on Github
|
- name: Store Test Downloaded JSONs on Github
|
||||||
uses: actions/upload-artifact@v3
|
uses: actions/upload-artifact@v3
|
||||||
|
1
.gitignore
vendored
1
.gitignore
vendored
@ -54,6 +54,7 @@ test/.coverage
|
|||||||
test/.hypothesis/
|
test/.hypothesis/
|
||||||
test/cpp/api/mnist
|
test/cpp/api/mnist
|
||||||
test/custom_operator/model.pt
|
test/custom_operator/model.pt
|
||||||
|
test/debug/
|
||||||
test/jit_hooks/*.pt
|
test/jit_hooks/*.pt
|
||||||
test/data/legacy_modules.t7
|
test/data/legacy_modules.t7
|
||||||
test/data/*.pt
|
test/data/*.pt
|
||||||
|
@ -258,6 +258,15 @@ CI_USE_SGD = {
|
|||||||
DO_NOT_CAST_INPUTS = {"stable_diffusion"}
|
DO_NOT_CAST_INPUTS = {"stable_diffusion"}
|
||||||
|
|
||||||
|
|
||||||
|
# Maps a benchmark model name to a list of status codes. For any listed entry, we'll
|
||||||
|
# capture TORCH_COMPILE_DEBUG logs in CI runs and preserve them (i.e., for upload) if
|
||||||
|
# the result status matches one listed.
|
||||||
|
CI_PRESERVE_COMPILE_DEBUG = {
|
||||||
|
# For example:
|
||||||
|
# "mnasnet1_0": ["fail_accuracy"],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def model_specified_by_path(path_and_class_str):
|
def model_specified_by_path(path_and_class_str):
|
||||||
return ":" in path_and_class_str
|
return ":" in path_and_class_str
|
||||||
|
|
||||||
@ -2854,6 +2863,24 @@ class BenchmarkRunner:
|
|||||||
repro_dir,
|
repro_dir,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def maybe_preserve_compile_debug(self, name, status):
    """Move the current compile-debug directory under ``test/debug/`` if requested.

    The move only happens when ``name`` is a key of
    ``CI_PRESERVE_COMPILE_DEBUG`` and ``status`` is one of the statuses
    listed for that key.  The directory reported by
    ``torch._dynamo.utils.get_debug_dir()`` is relocated to
    ``<cwd>/test/debug/torch_compile_debug/<basename of that directory>``.

    This is best-effort: any ``OSError`` raised while creating the
    destination or moving the directory is logged and swallowed so that a
    failure to preserve artifacts never changes the benchmark outcome.

    Args:
        name: Benchmark model name, looked up in ``CI_PRESERVE_COMPILE_DEBUG``.
        status: Result status string for the run that just finished.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import shutil

    if name not in CI_PRESERVE_COMPILE_DEBUG:
        return
    if status not in CI_PRESERVE_COMPILE_DEBUG[name]:
        return

    src_dir = torch._dynamo.utils.get_debug_dir()
    if not os.path.isdir(src_dir):
        # Nothing was written for this run, so there is nothing to keep.
        return

    dbg_dir = os.path.join(os.getcwd(), "test", "debug", "torch_compile_debug")
    dst_dir = os.path.join(dbg_dir, os.path.basename(src_dir))
    try:
        os.makedirs(dbg_dir, exist_ok=True)
        # shutil.move uses os.rename when it can and falls back to
        # copy-and-delete when source and destination are on different
        # filesystems, where a bare os.rename raises OSError (EXDEV).
        # NOTE(review): if dst_dir already exists as a directory, shutil.move
        # nests src_dir inside it rather than replacing it; this assumes debug
        # directory names are unique per run, confirm against get_debug_dir().
        shutil.move(src_dir, dst_dir)
        log.warning("Moved %s to %s", src_dir, dst_dir)
    except OSError:
        # shutil.Error is a subclass of OSError, so it is covered here too.
        log.exception("Failed to preserve %s", src_dir)
|
||||||
|
|
||||||
def run_one_model(
|
def run_one_model(
|
||||||
self,
|
self,
|
||||||
name,
|
name,
|
||||||
@ -2891,6 +2918,8 @@ class BenchmarkRunner:
|
|||||||
print(status)
|
print(status)
|
||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
|
|
||||||
|
self.maybe_preserve_compile_debug(name, status)
|
||||||
|
|
||||||
if self.args.timing:
|
if self.args.timing:
|
||||||
from torch._dynamo.utils import op_count, print_time_report
|
from torch._dynamo.utils import op_count, print_time_report
|
||||||
from torch.utils._stats import simple_call_counter
|
from torch.utils._stats import simple_call_counter
|
||||||
@ -4068,8 +4097,13 @@ def run(runner, args, original_dir=None):
|
|||||||
timeout = args.timeout
|
timeout = args.timeout
|
||||||
if should_diff_branch(args):
|
if should_diff_branch(args):
|
||||||
timeout *= 2
|
timeout *= 2
|
||||||
|
env = os.environ.copy()
|
||||||
|
if args.ci and name in CI_PRESERVE_COMPILE_DEBUG:
|
||||||
|
env["TORCH_COMPILE_DEBUG"] = "1"
|
||||||
subprocess.check_call(
|
subprocess.check_call(
|
||||||
[sys.executable] + sys.argv + [f"--only={name}"], timeout=timeout
|
[sys.executable] + sys.argv + [f"--only={name}"],
|
||||||
|
timeout=timeout,
|
||||||
|
env=env,
|
||||||
)
|
)
|
||||||
except subprocess.TimeoutExpired:
|
except subprocess.TimeoutExpired:
|
||||||
write_csv_when_exception(args, name, "timeout")
|
write_csv_when_exception(args, name, "timeout")
|
||||||
|
Reference in New Issue
Block a user