Add ability to save TORCH_COMPILE_DEBUG logs for CI failures (#124408)

Summary: The intent is that we can whitelist certain benchmarks to a) enable TORCH_COMPILE_DEBUG=1, and b) save the generated artifacts in test/debug in case of a failure. Via the rules in action.yml, we can then upload test/debug/ to S3 whenever it exists. I chose to introduce a new directory (test/debug/) rather than using an existing one (e.g., test/test-reports/), because these don't seem like test reports and we can later add other debug-related artifacts if we find it useful. For example, we might want to later explore including the inductor cache artifacts. Test Plan: See artifacts generated when I force a failure: https://hud.pytorch.org/pr/124234 Specifically: https://gha-artifacts.s3.amazonaws.com/pytorch/pytorch/8729891826/1/artifact/debug-test-inductor_torchbench-2-2-linux.g5.4xlarge.nvidia.gpu_23953679574.zip Pull Request resolved: https://github.com/pytorch/pytorch/pull/124408 Approved by: https://github.com/desertfire
2025-10-20 12:54:11 +08:00 · 2024-04-18 09:36:51 -07:00
parent 889e3eeed3
commit 290e3e7abb
3 changed files with 61 additions and 2 deletions
--- a/.github/actions/upload-test-artifacts/action.yml
+++ b/.github/actions/upload-test-artifacts/action.yml
@ -46,7 +46,7 @@ runs:
      env:
        FILE_SUFFIX: ${{ inputs.file-suffix }}
      run: |
-        # Remove any previous test reports if they exist
+        # Remove any previous usage logs if they exist
        rm -f logs-*.zip
        # this workflow is also run in bazel build test, but we don't generate usage reports for it
        # so check to see if the file exists first
@ -57,6 +57,18 @@ runs:
            zip -r "logs-${FILE_SUFFIX}.zip" test -i '*.log'
        fi

+    - name: Zip debugging artifacts for upload
+      if: runner.os != 'Windows' && !inputs.use-gha
+      shell: bash
+      env:
+        FILE_SUFFIX: ${{ inputs.file-suffix }}
+      run: |
+        # Remove any previous debugging artifacts if they exist
+        rm -f debug-*.zip
+        if [ -d 'test/debug' ]; then
+          zip -r "debug-${FILE_SUFFIX}.zip" test/debug
+        fi
+
    # Windows zip
    - name: Zip JSONs for upload
      if: runner.os == 'Windows' && !inputs.use-gha
@ -121,6 +133,18 @@ runs:
        if-no-files-found: ignore
        path: logs-*.zip

+    - name: Store Debug Artifacts on S3
+      uses: seemethere/upload-artifact-s3@v5
+      if: ${{ !inputs.use-gha }}
+      continue-on-error: true
+      with:
+        s3-bucket: ${{ inputs.s3-bucket }}
+        s3-prefix: |
+          ${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
+        retention-days: 14
+        if-no-files-found: ignore
+        path: debug-*.zip
+
    # GHA upload
    - name: Store Test Downloaded JSONs on Github
      uses: actions/upload-artifact@v3
--- a/.gitignore
+++ b/.gitignore
@ -54,6 +54,7 @@ test/.coverage
 test/.hypothesis/
 test/cpp/api/mnist
 test/custom_operator/model.pt
+test/debug/
 test/jit_hooks/*.pt
 test/data/legacy_modules.t7
 test/data/*.pt
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@ -258,6 +258,15 @@ CI_USE_SGD = {
 DO_NOT_CAST_INPUTS = {"stable_diffusion"}


+# Maps a benchmark model name to a list of status codes. For any listed entry, we'll
+# capture TORCH_COMPILE_DEBUG logs in CI runs and preserve them (i.e., for upload) if
+# the result status matches one listed.
+CI_PRESERVE_COMPILE_DEBUG = {
+    # For example:
+    # "mnasnet1_0": ["fail_accuracy"],
+}
+
+
 def model_specified_by_path(path_and_class_str):
    return ":" in path_and_class_str

@ -2854,6 +2863,24 @@ class BenchmarkRunner:
                repro_dir,
            )

+    def maybe_preserve_compile_debug(self, name, status):
+        if (
+            name in CI_PRESERVE_COMPILE_DEBUG
+            and status in CI_PRESERVE_COMPILE_DEBUG[name]
+        ):
+            src_dir = torch._dynamo.utils.get_debug_dir()
+            if os.path.isdir(src_dir):
+                dbg_dir = os.path.join(
+                    os.getcwd(), "test", "debug", "torch_compile_debug"
+                )
+                dst_dir = os.path.join(dbg_dir, os.path.basename(src_dir))
+                try:
+                    os.makedirs(dbg_dir, exist_ok=True)
+                    os.rename(src_dir, dst_dir)
+                    log.warning("Moved %s to %s", src_dir, dst_dir)
+                except OSError:
+                    log.exception("Failed to preserve %s", src_dir)
+
    def run_one_model(
        self,
        name,
@ -2891,6 +2918,8 @@ class BenchmarkRunner:
            print(status)
        torch.cuda.empty_cache()

+        self.maybe_preserve_compile_debug(name, status)
+
        if self.args.timing:
            from torch._dynamo.utils import op_count, print_time_report
            from torch.utils._stats import simple_call_counter
@ -4068,8 +4097,13 @@ def run(runner, args, original_dir=None):
                timeout = args.timeout
                if should_diff_branch(args):
                    timeout *= 2
+                env = os.environ.copy()
+                if args.ci and name in CI_PRESERVE_COMPILE_DEBUG:
+                    env["TORCH_COMPILE_DEBUG"] = "1"
                subprocess.check_call(
-                    [sys.executable] + sys.argv + [f"--only={name}"], timeout=timeout
+                    [sys.executable] + sys.argv + [f"--only={name}"],
+                    timeout=timeout,
+                    env=env,
                )
            except subprocess.TimeoutExpired:
                write_csv_when_exception(args, name, "timeout")