[Monitoring] Add util for linux build (#153456)

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/153456
Approved by: https://github.com/huydhn
Authored by Yang Wang on 2025-05-19 17:28:13 +00:00; committed by PyTorch MergeBot
parent be36bacdaa
commit c54b9f2969
5 changed files with 145 additions and 7 deletions


@@ -23,6 +23,12 @@ inputs:
type: string
description: 'the job name of the test'
required: True
artifact_prefix:
type: string
description: |
The prefix of the raw utilization data; for data stored in a zip file, this is the prefix of the parent zip file.
default: ""
required: False
runs:
using: composite
@@ -35,6 +41,7 @@ runs:
echo "workflow_Name: ${{inputs.workflow_name}}"
echo "job_id: ${{inputs.job_id}}"
echo "job_name: ${{inputs.job_name}}"
echo "artifact_prefix: ${{inputs.artifact_prefix}}"
- uses: nick-fields/retry@v3.0.0
name: Setup dependencies
with:
@@ -53,4 +60,5 @@ runs:
--workflow-name "${{inputs.workflow_name}}" \
--workflow-run-attempt "${{inputs.workflow_attempt}}" \
--job-id "${{inputs.job_id}}" \
--job-name "${{inputs.job_name}}"
--job-name "${{inputs.job_name}}" \
--artifact-prefix "${{inputs.artifact_prefix}}"
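For reference, here is a minimal sketch (not part of the diff) of how a prefix like the new artifact_prefix input is expected to select the raw utilization artifact on the uploader side; the helper name and example file names are hypothetical:

from pathlib import Path
from typing import Optional

def pick_usage_artifact(paths: list[Path], artifact_prefix: str) -> Optional[Path]:
    # Keep only artifacts whose file name starts with the requested prefix,
    # e.g. "usage_log_build_12345" for a build job, "logs-test" for a test job.
    matches = [p for p in paths if p.name.startswith(artifact_prefix)]
    return matches[0] if matches else None

candidates = [Path("logs-test_12344.zip"), Path("usage_log_build_12345.txt")]
print(pick_usage_artifact(candidates, "usage_log_build_12345"))  # usage_log_build_12345.txt
print(pick_usage_artifact(candidates, "logs-test"))              # logs-test_12344.zip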


@@ -74,6 +74,24 @@ on:
Overwrite the number of jobs to use for the build
required: false
type: string
disable-monitor:
description: |
Disable utilization monitoring for build job
required: false
type: boolean
default: false
monitor-log-interval:
description: |
Set the interval for the monitor script to log utilization.
required: false
type: number
default: 5
monitor-data-collect-interval:
description: |
Set the interval for the monitor script to collect data.
required: false
type: number
default: 1
secrets:
HUGGING_FACE_HUB_TOKEN:
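For context, the two intervals serve different purposes: monitor-data-collect-interval controls how often raw samples are taken, while monitor-log-interval controls how often an aggregated record is written. A rough sketch of that relationship (this is illustrative, not the actual tools/stats/monitor.py implementation):

import time
import psutil  # the build workflow installs psutil for the monitor

def sample_loop(log_interval: float = 5.0, collect_interval: float = 1.0, duration: float = 15.0) -> None:
    samples: list[float] = []
    start = last_log = time.time()
    while time.time() - start < duration:
        # every collect_interval seconds: take one raw sample
        samples.append(psutil.cpu_percent(interval=None))
        now = time.time()
        if samples and now - last_log >= log_interval:
            # every log_interval seconds: fold the samples into one logged record
            print(f"avg cpu over {len(samples)} samples: {sum(samples) / len(samples):.1f}%")
            samples.clear()
            last_log = now
        time.sleep(collect_interval)

sample_loop()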
@@ -176,6 +194,27 @@ jobs:
selected-test-configs: ${{ inputs.selected-test-configs }}
job-name: ${{ steps.get-job-id.outputs.job-name }}
- name: Start monitoring script
id: monitor-script
if: ${{ !inputs.disable-monitor }}
shell: bash
continue-on-error: true
env:
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
WORKFLOW_NAME: ${{ github.workflow }}
WORKFLOW_RUN_ID: ${{github.run_id}}
MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
run: |
mkdir -p ../../usage_logs
python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7
python3 -m tools.stats.monitor \
--log-interval "$MONITOR_LOG_INTERVAL" \
--data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" \
> "../../usage_logs/usage_log_build_${JOB_ID}.txt" 2>&1 &
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
- name: Download pytest cache
uses: ./.github/actions/pytest-cache-download
continue-on-error: true
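In plain terms, the new step launches tools.stats.monitor in the background, redirects its output to usage_logs/usage_log_build_<job id>.txt, and records the PID so a later step can stop it. A simplified Python equivalent of the bash step and its matching stop step (paths and GITHUB_OUTPUT handling are simplified; illustrative only):

import os
import signal
import subprocess
from pathlib import Path

def start_monitor(job_id: str, log_interval: str = "5", collect_interval: str = "1") -> int:
    log_dir = Path("usage_logs")  # the workflow step uses ../../usage_logs
    log_dir.mkdir(parents=True, exist_ok=True)
    log_file = open(log_dir / f"usage_log_build_{job_id}.txt", "w")
    # Background process, mirroring the trailing `&` in the workflow step.
    proc = subprocess.Popen(
        ["python3", "-m", "tools.stats.monitor",
         "--log-interval", log_interval,
         "--data-collect-interval", collect_interval],
        stdout=log_file,
        stderr=subprocess.STDOUT,  # mirrors `2>&1`
    )
    return proc.pid  # the workflow writes this to GITHUB_OUTPUT as monitor-script-pid

def stop_monitor(pid: int) -> None:
    # The later "Stop monitoring script" step simply kills the recorded PID.
    os.kill(pid, signal.SIGTERM)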
@@ -280,6 +319,15 @@ jobs:
END_TIME=$(date +%s)
echo "build_time=$((END_TIME - START_TIME))" >> "$GITHUB_OUTPUT"
- name: Stop monitoring script
if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
shell: bash
continue-on-error: true
env:
MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
run: |
kill "$MONITOR_SCRIPT_PID"
- name: Archive artifacts into zip
if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped'
run: |
@@ -304,6 +352,25 @@ jobs:
if-no-files-found: error
path: artifacts.zip
- name: copy logs
shell: bash
if: ${{ always() && steps.build.outcome != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel'}}
continue-on-error: true
run: |
rm -rf ./usage_logs
mkdir -p ./usage_logs
cp ../../usage_logs/usage_log_build_*.txt ./usage_logs/
- name: Upload raw usage log to s3
if: ${{ always() && steps.build.outcome != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel'}}
uses: seemethere/upload-artifact-s3@v5
with:
s3-prefix: |
${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
retention-days: 14
if-no-files-found: warn
path: usage_logs/usage_log_build_*.txt
- name: Upload sccache stats
if: steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel'
uses: ./.github/actions/upload-sccache-stats
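Assuming the upload-artifact-s3 action joins its s3-prefix with the relative path of each uploaded file (an assumption, not taken from that action's source), the raw build log is expected to land under a key like the one sketched below:

def raw_log_s3_key(repo: str, run_id: int, run_attempt: int, job_id: int) -> str:
    # Mirrors the s3-prefix "<repo>/<run_id>/<run_attempt>/artifact" from the step
    # above plus the usage_logs/usage_log_build_<job_id>.txt path it uploads.
    return f"{repo}/{run_id}/{run_attempt}/artifact/usage_logs/usage_log_build_{job_id}.txt"

# Hypothetical values for illustration:
print(raw_log_s3_key("pytorch/pytorch", 15100000000, 1, 42424242))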
@@ -311,6 +378,18 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
build-time: ${{ steps.build.outputs.build_time }}
- name: Upload utilization stats
if: ${{ always() && steps.build.outcome != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
continue-on-error: true
uses: ./.github/actions/upload-utilization-stats
with:
job_id: ${{ steps.get-job-id.outputs.job-id }}
job_name: ${{ steps.get-job-id.outputs.job-name }}
workflow_name: ${{ github.workflow }}
workflow_run_id: ${{github.run_id}}
workflow_attempt: ${{github.run_attempt}}
artifact_prefix: usage_log_build_${{ steps.get-job-id.outputs.job-id }}
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always() && inputs.build-environment != 'linux-s390x-binary-manywheel'

.gitignore

@@ -47,6 +47,7 @@ docs/source/generated/
docs/source/compile/generated/
log
usage_log.txt
usage_log*
test-reports/
test/*.bak
test/**/*.bak


@@ -415,10 +415,14 @@ class UsageLogger:
self._num_of_cpus = psutil.cpu_count(logical=True)
# update summary info
self._metadata.gpu_type = self._gpu_lib_detected
self._metadata.gpu_count = len(self._gpu_handles)
self._metadata.cpu_count = self._num_of_cpus
if self._has_pynvml or self._has_amdsmi:
if len(self._gpu_handles) == 0:
self._metadata.gpu_type = ""
else:
self._metadata.gpu_type = self._gpu_lib_detected
except Exception as e:
self._metadata.error = str(e)
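Condensed, the updated metadata logic records GPU and CPU counts and only fills in gpu_type when the detected GPU library actually exposes at least one device. A stand-alone sketch of that behavior (field names are simplified stand-ins for the UsageLogger attributes, not the real class):

from typing import Optional

def summarize_gpu_metadata(gpu_lib_detected: Optional[str], gpu_handles: list, cpu_count: int) -> dict:
    meta = {
        "gpu_count": len(gpu_handles),
        "cpu_count": cpu_count,
        "gpu_type": None,
    }
    if gpu_lib_detected:  # stands in for self._has_pynvml or self._has_amdsmi
        # A detected GPU library with zero visible devices now yields an empty
        # gpu_type instead of the library name.
        meta["gpu_type"] = gpu_lib_detected if gpu_handles else ""
    return meta

print(summarize_gpu_metadata("pynvml", [], 16))      # gpu_type == ""
print(summarize_gpu_metadata("pynvml", ["h0"], 16))  # gpu_type == "pynvml"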


@@ -31,10 +31,11 @@ from tools.stats.utilization_stats_lib import (
)
USAGE_LOG_FILENAME = "usage_log.txt"
TEST_USAGE_LOG_FILENAME = "usage_log.txt"
CMD_PYTHON_LEVEL = "CMD_PYTHON"
UTILIZATION_BUCKET = "ossci-utilization"
PYTORCH_REPO = "pytorch/pytorch"
JOB_TEST_ARTIFACT_PREFIX = "logs-test"
class SegmentGenerator:
@@ -190,10 +191,12 @@ class UploadUtilizationData:
def __init__(
self,
artifact_prefix: str,
info: WorkflowInfo,
dry_run: bool = False,
debug: bool = False,
):
self.artifact_prefix = artifact_prefix
self.info = info
self.segment_generator = SegmentGenerator()
self.debug_mode = debug
@@ -201,7 +204,10 @@ class UploadUtilizationData:
def start(self) -> None:
metadata, valid_records, _ = self.get_log_data(
self.info.workflow_run_id, self.info.job_id, self.info.run_attempt
self.info.workflow_run_id,
self.info.job_id,
self.info.run_attempt,
self.artifact_prefix,
)
if not metadata:
@@ -271,12 +277,16 @@ class UploadUtilizationData:
upload_to_s3(bucket_name, key, docs)
def get_log_data(
self, workflow_run_id: int, job_id: int, workflow_run_attempt: int
self,
workflow_run_id: int,
job_id: int,
workflow_run_attempt: int,
artifact_prefix: str = JOB_TEST_ARTIFACT_PREFIX,
) -> tuple[
Optional[UtilizationMetadata], list[UtilizationRecord], list[UtilizationRecord]
]:
artifact_paths = download_s3_artifacts(
"logs-test", workflow_run_id, workflow_run_attempt, job_id
artifact_prefix, workflow_run_id, workflow_run_attempt, job_id
)
if len(artifact_paths) == 0:
print(
@@ -290,7 +300,10 @@ class UploadUtilizationData:
return None, [], []
p = artifact_paths[0]
test_log_content = unzip_file(p, USAGE_LOG_FILENAME)
test_log_content = handle_file(p)
if not test_log_content:
return None, [], []
metadata, records, error_records = self.convert_to_log_models(test_log_content)
if metadata is None:
@@ -354,6 +367,26 @@ class UploadUtilizationData:
return metadata, result_logs, error_logs
def handle_file(file_path: Path) -> str:
if file_path.match("*.zip"):
print(f"extracting {TEST_USAGE_LOG_FILENAME} from zip file {file_path}")
return unzip_file(file_path, TEST_USAGE_LOG_FILENAME)
elif file_path.match("*.txt"):
print(f"extracting {file_path}")
return read_file(file_path)
print(f"{file_path} is not a supported file type")
return ""
def read_file(file_path: Path) -> str:
try:
with open(file_path) as f:
return f.read()
except Exception as e:
print(f"::warning trying to download test log {object} failed by: {e}")
return ""
def unzip_file(path: Path, file_name: str) -> str:
try:
with zipfile.ZipFile(path) as zip_file:
@@ -412,6 +445,13 @@ def parse_args() -> argparse.Namespace:
parser.add_argument("--dry-run", action="store_true", help="Enable dry-run mode")
parser.add_argument(
"--artifact-prefix",
type=str,
required=False,
help="artifact prefix to download raw utilizarion data from s3",
)
return parser.parse_args()
@ -435,9 +475,15 @@ if __name__ == "__main__":
repo=repo,
)
artifact_prefix = JOB_TEST_ARTIFACT_PREFIX
if args.artifact_prefix:
artifact_prefix = args.artifact_prefix
print(f"args.artifact_prefix: {args.artifact_prefix}")
print(f"artifact_prefix: {artifact_prefix}")
ud = UploadUtilizationData(
info=workflow_info,
dry_run=args.dry_run,
debug=args.debug,
artifact_prefix=artifact_prefix,
)
ud.start()
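Putting the pieces together, the prefix resolution above amounts to: use the CLI value when provided, otherwise fall back to the historical test-job prefix. A small sketch, plus a hypothetical build-job invocation (the module path and the omitted flags are assumptions):

from typing import Optional

JOB_TEST_ARTIFACT_PREFIX = "logs-test"

def resolve_artifact_prefix(cli_value: Optional[str]) -> str:
    # Build jobs pass something like "usage_log_build_<job_id>"; anything falsy
    # falls back to the pre-existing test-job prefix.
    return cli_value if cli_value else JOB_TEST_ARTIFACT_PREFIX

assert resolve_artifact_prefix(None) == "logs-test"
assert resolve_artifact_prefix("usage_log_build_42") == "usage_log_build_42"

# Hypothetical invocation for a build job (other required flags omitted):
#   python3 -m tools.stats.upload_utilization_stats \
#       --workflow-name pull --workflow-run-attempt 1 \
#       --job-id 42 --job-name "linux-build / build" \
#       --artifact-prefix usage_log_build_42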