[Utilization Monitor] input to disable utilization monitor (#140857)

# Overview
Currently monitor.py produces error-only results, so this PR introduces a `disable-monitor` option to all *-test.yml workflows. We would also like to explore how the monitor code affects benchmark results.

# Next steps
- fix monitor.py
- enable non-benchmark tests with the monitor
- investigate benchmark test behavior with the monitor background job

Pull Request resolved: https://github.com/pytorch/pytorch/pull/140857
Approved by: https://github.com/huydhn
2025-10-20 12:54:11 +08:00 · 2024-11-18 23:26:01 +00:00
parent 48a276c5a0
commit 175ba9fed6
6 changed files with 60 additions and 7 deletions
--- a/.github/actions/linux-test/action.yml
+++ b/.github/actions/linux-test/action.yml
@ -47,7 +47,14 @@ inputs:
  GITHUB_TOKEN:
    description: GitHub token
    required: true
-
+  disable-monitor:
+    description: |
+      [Experimental] Disable utilization monitoring for tests.
+      Currently, by default we disable the monitor job and only look for specific tests,
+      since we are investigating the behaviour of the monitor script with different tests.
+    required: false
+    type: boolean
+    default: true
 #env:
 #  GIT_DEFAULT_BRANCH: ${{ inputs.default_branch }}

@ -115,6 +122,7 @@ runs:

    - name: Start monitoring script
      id: monitor-script
+      if: ${{ !inputs.disable-monitor }}
      shell: bash
      continue-on-error: true
      run: |
@ -289,7 +297,7 @@ runs:
        cat test/**/*_toprint.log || true

    - name: Stop monitoring script
-      if: always() && steps.monitor-script.outputs.monitor-script-pid
+      if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
      shell: bash
      continue-on-error: true
      env:
--- a/.github/workflows/_linux-test.yml
+++ b/.github/workflows/_linux-test.yml
@ -47,6 +47,14 @@ on:
        required: false
        type: string
        default: ""
+      disable-monitor:
+        description: |
+          [Experimental] Disable utilization monitoring for tests.
+          Currently, by default we disable the monitor job and only look for specific tests,
+          since we are investigating the behaviour of the monitor script with different tests.
+        required: false
+        type: boolean
+        default: true
    secrets:
      HUGGING_FACE_HUB_TOKEN:
        required: false
@ -145,6 +153,7 @@ jobs:

      - name: Start monitoring script
        id: monitor-script
+        if: ${{ !inputs.disable-monitor }}
        shell: bash
        continue-on-error: true
        run: |
@ -328,7 +337,7 @@ jobs:
          cat test/**/*_toprint.log || true

      - name: Stop monitoring script
-        if: always() && steps.monitor-script.outputs.monitor-script-pid
+        if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
        shell: bash
        continue-on-error: true
        env:
--- a/.github/workflows/_mac-test.yml
+++ b/.github/workflows/_mac-test.yml
@ -30,6 +30,14 @@ on:
        default: 270
        description: |
          Set the maximum (in minutes) how long the workflow should take to finish
+      disable-monitor:
+        description: |
+          [Experimental] Disable utilization monitoring for tests.
+          Currently, by default we disable the monitor job and only look for specific tests,
+          since we are investigating the behaviour of the monitor script with different tests.
+        required: false
+        type: boolean
+        default: true

 jobs:
  test:
@ -101,6 +109,7 @@ jobs:

      - name: Start monitoring script
        id: monitor-script
+        if: ${{ !inputs.disable-monitor }}
        continue-on-error: true
        run: |
          ${CONDA_RUN} python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
@ -200,7 +209,7 @@ jobs:
          cat test/**/*_toprint.log || true

      - name: Stop monitoring script
-        if: always() && ${{ steps.monitor-script.outputs.monitor-script-pid }}
+        if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
        continue-on-error: true
        env:
          MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
--- a/.github/workflows/_rocm-test.yml
+++ b/.github/workflows/_rocm-test.yml
@ -38,6 +38,14 @@ on:
        default: ""
        description: |
          List of tests to include (empty string implies default list)
+      disable-monitor:
+        description: |
+          [Experimental] Disable utilization monitoring for tests.
+          Currently, by default we disable the monitor job and only look for specific tests,
+          since we are investigating the behaviour of the monitor script with different tests.
+        required: false
+        type: boolean
+        default: true

 env:
  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
@ -91,6 +99,7 @@ jobs:

      - name: Start monitoring script
        id: monitor-script
+        if: ${{ !inputs.disable-monitor }}
        shell: bash
        continue-on-error: true
        run: |
@ -247,7 +256,7 @@ jobs:
          cat test/**/*_toprint.log || true

      - name: Stop monitoring script
-        if: always() && steps.monitor-script.outputs.monitor-script-pid
+        if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
        shell: bash
        continue-on-error: true
        env:
--- a/.github/workflows/_win-test.yml
+++ b/.github/workflows/_win-test.yml
@ -28,6 +28,14 @@ on:
        default: 240
        description: |
          Set the maximum (in minutes) how long the workflow should take to finish
+      disable-monitor:
+        description: |
+          [Experimental] Disable utilization monitoring for tests.
+          Currently, by default we disable the monitor job and only look for specific tests,
+          since we are investigating the behaviour of the monitor script with different tests.
+        required: false
+        type: boolean
+        default: true

 env:
  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
@ -101,6 +109,7 @@ jobs:
      - name: Start monitoring script
        id: monitor-script
        shell: bash
+        if: ${{ !inputs.disable-monitor }}
        continue-on-error: true
        run: |
          # Windows conda doesn't have python3 binary, only python, but it's python3
@ -213,7 +222,7 @@ jobs:
          cat test/**/*_toprint.log || true

      - name: Stop monitoring script
-        if: always() && steps.monitor-script.outputs.monitor-script-pid
+        if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
        shell: bash
        continue-on-error: true
        env:
--- a/.github/workflows/_xpu-test.yml
+++ b/.github/workflows/_xpu-test.yml
@ -38,6 +38,14 @@ on:
        default: ""
        description: |
          List of tests to include (empty string implies default list)
+      disable-monitor:
+        description: |
+          [Experimental] Disable utilization monitoring for tests.
+          Currently, by default we disable the monitor job and only look for specific tests,
+          since we are investigating the behaviour of the monitor script with different tests.
+        required: false
+        type: boolean
+        default: true

 env:
  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
@ -83,6 +91,7 @@ jobs:

      - name: Start monitoring script
        id: monitor-script
+        if: ${{ !inputs.disable-monitor }}
        shell: bash
        continue-on-error: true
        run: |
@ -242,7 +251,7 @@ jobs:
          cat test/**/*_toprint.log || true

      - name: Stop monitoring script
-        if: always() && steps.monitor-script.outputs.monitor-script-pid
+        if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
        shell: bash
        continue-on-error: true
        env: