[Monitoring] Add util for linux build (#153456)

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/153456
Approved by: https://github.com/huydhn
Authored by Yang Wang on 2025-05-19 17:28:13 +00:00; committed by PyTorch MergeBot
parent be36bacdaa
commit c54b9f2969
5 changed files with 145 additions and 7 deletions


@@ -23,6 +23,12 @@ inputs:
type: string
description: 'the job name of the test'
required: True
artifact_prefix:
type: string
description: |
The prefix of the raw utilization data; for data stored in a zip file, this is the prefix of the parent zip file.
default: ""
required: False
runs:
using: composite
@@ -35,6 +41,7 @@ runs:
echo "workflow_Name: ${{inputs.workflow_name}}"
echo "job_id: ${{inputs.job_id}}"
echo "job_name: ${{inputs.job_name}}"
echo "artifact_prefix: ${{inputs.artifact_prefix}}"
- uses: nick-fields/retry@v3.0.0
name: Setup dependencies
with:
@@ -53,4 +60,5 @@ runs:
--workflow-name "${{inputs.workflow_name}}" \
--workflow-run-attempt "${{inputs.workflow_attempt}}" \
--job-id "${{inputs.job_id}}" \
--job-name "${{inputs.job_name}}"
--job-name "${{inputs.job_name}}" \
--artifact-prefix "${{inputs.artifact_prefix}}"
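For reference, here is a minimal sketch (not part of the diff) of how a prefix like the new artifact_prefix input is expected to select the raw utilization artifact on the uploader side; the helper name and example file names are hypothetical:

from pathlib import Path
from typing import Optional

def pick_usage_artifact(paths: list[Path], artifact_prefix: str) -> Optional[Path]:
    # Keep only artifacts whose file name starts with the requested prefix,
    # e.g. "usage_log_build_12345" for a build job, "logs-test" for a test job.
    matches = [p for p in paths if p.name.startswith(artifact_prefix)]
    return matches[0] if matches else None

candidates = [Path("logs-test_12344.zip"), Path("usage_log_build_12345.txt")]
print(pick_usage_artifact(candidates, "usage_log_build_12345"))  # usage_log_build_12345.txt
print(pick_usage_artifact(candidates, "logs-test"))              # logs-test_12344.zip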


@@ -74,6 +74,24 @@ on:
Overwrite the number of jobs to use for the build
required: false
type: string
disable-monitor:
description: |
Disable utilization monitoring for build job
required: false
type: boolean
default: false
monitor-log-interval:
description: |
Set the interval for the monitor script to log utilization.
required: false
type: number
default: 5
monitor-data-collect-interval:
description: |
Set the interval for the monitor script to collect data.
required: false
type: number
default: 1
secrets:
HUGGING_FACE_HUB_TOKEN:
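For context, the two intervals serve different purposes: monitor-data-collect-interval controls how often raw samples are taken, while monitor-log-interval controls how often an aggregated record is written. A rough sketch of that relationship (this is illustrative, not the actual tools/stats/monitor.py implementation):

import time
import psutil  # the build workflow installs psutil for the monitor

def sample_loop(log_interval: float = 5.0, collect_interval: float = 1.0, duration: float = 15.0) -> None:
    samples: list[float] = []
    start = last_log = time.time()
    while time.time() - start < duration:
        # every collect_interval seconds: take one raw sample
        samples.append(psutil.cpu_percent(interval=None))
        now = time.time()
        if samples and now - last_log >= log_interval:
            # every log_interval seconds: fold the samples into one logged record
            print(f"avg cpu over {len(samples)} samples: {sum(samples) / len(samples):.1f}%")
            samples.clear()
            last_log = now
        time.sleep(collect_interval)

sample_loop()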
@@ -176,6 +194,27 @@ jobs:
selected-test-configs: ${{ inputs.selected-test-configs }}
job-name: ${{ steps.get-job-id.outputs.job-name }}
- name: Start monitoring script
id: monitor-script
if: ${{ !inputs.disable-monitor }}
shell: bash
continue-on-error: true
env:
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
WORKFLOW_NAME: ${{ github.workflow }}
WORKFLOW_RUN_ID: ${{github.run_id}}
MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
run: |
mkdir -p ../../usage_logs
python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7
python3 -m tools.stats.monitor \
--log-interval "$MONITOR_LOG_INTERVAL" \
--data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" \
> "../../usage_logs/usage_log_build_${JOB_ID}.txt" 2>&1 &
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
- name: Download pytest cache
uses: ./.github/actions/pytest-cache-download
continue-on-error: true
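In plain terms, the new step launches tools.stats.monitor in the background, redirects its output to usage_logs/usage_log_build_<job id>.txt, and records the PID so a later step can stop it. A simplified Python equivalent of the bash step and its matching stop step (paths and GITHUB_OUTPUT handling are simplified; illustrative only):

import os
import signal
import subprocess
from pathlib import Path

def start_monitor(job_id: str, log_interval: str = "5", collect_interval: str = "1") -> int:
    log_dir = Path("usage_logs")  # the workflow step uses ../../usage_logs
    log_dir.mkdir(parents=True, exist_ok=True)
    log_file = open(log_dir / f"usage_log_build_{job_id}.txt", "w")
    # Background process, mirroring the trailing `&` in the workflow step.
    proc = subprocess.Popen(
        ["python3", "-m", "tools.stats.monitor",
         "--log-interval", log_interval,
         "--data-collect-interval", collect_interval],
        stdout=log_file,
        stderr=subprocess.STDOUT,  # mirrors `2>&1`
    )
    return proc.pid  # the workflow writes this to GITHUB_OUTPUT as monitor-script-pid

def stop_monitor(pid: int) -> None:
    # The later "Stop monitoring script" step simply kills the recorded PID.
    os.kill(pid, signal.SIGTERM)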
@@ -280,6 +319,15 @@ jobs:
END_TIME=$(date +%s)
echo "build_time=$((END_TIME - START_TIME))" >> "$GITHUB_OUTPUT"
- name: Stop monitoring script
if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
shell: bash
continue-on-error: true
env:
MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
run: |
kill "$MONITOR_SCRIPT_PID"
- name: Archive artifacts into zip
if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped'
run: |
@@ -304,6 +352,25 @@ jobs:
if-no-files-found: error
path: artifacts.zip
- name: copy logs
shell: bash
if: ${{ always() && steps.build.outcome != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel'}}
continue-on-error: true
run: |
rm -rf ./usage_logs
mkdir -p ./usage_logs
cp ../../usage_logs/usage_log_build_*.txt ./usage_logs/
- name: Upload raw usage log to s3
if: ${{ always() && steps.build.outcome != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel'}}
uses: seemethere/upload-artifact-s3@v5
with:
s3-prefix: |
${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
retention-days: 14
if-no-files-found: warn
path: usage_logs/usage_log_build_*.txt
- name: Upload sccache stats
if: steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel'
uses: ./.github/actions/upload-sccache-stats
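Assuming the upload-artifact-s3 action joins its s3-prefix with the relative path of each uploaded file (an assumption, not taken from that action's source), the raw build log is expected to land under a key like the one sketched below:

def raw_log_s3_key(repo: str, run_id: int, run_attempt: int, job_id: int) -> str:
    # Mirrors the s3-prefix "<repo>/<run_id>/<run_attempt>/artifact" from the step
    # above plus the usage_logs/usage_log_build_<job_id>.txt path it uploads.
    return f"{repo}/{run_id}/{run_attempt}/artifact/usage_logs/usage_log_build_{job_id}.txt"

# Hypothetical values for illustration:
print(raw_log_s3_key("pytorch/pytorch", 15100000000, 1, 42424242))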
@@ -311,6 +378,18 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
build-time: ${{ steps.build.outputs.build_time }}
- name: Upload utilization stats
if: ${{ always() && steps.build.outcome != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
continue-on-error: true
uses: ./.github/actions/upload-utilization-stats
with:
job_id: ${{ steps.get-job-id.outputs.job-id }}
job_name: ${{ steps.get-job-id.outputs.job-name }}
workflow_name: ${{ github.workflow }}
workflow_run_id: ${{github.run_id}}
workflow_attempt: ${{github.run_attempt}}
artifact_prefix: usage_log_build_${{ steps.get-job-id.outputs.job-id }}
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always() && inputs.build-environment != 'linux-s390x-binary-manywheel'

.gitignore

@@ -47,6 +47,7 @@ docs/source/generated/
docs/source/compile/generated/
log
usage_log.txt
usage_log*
test-reports/
test/*.bak
test/**/*.bak


@@ -415,10 +415,14 @@ class UsageLogger:
self._num_of_cpus = psutil.cpu_count(logical=True)
# update summary info
self._metadata.gpu_type = self._gpu_lib_detected
self._metadata.gpu_count = len(self._gpu_handles)
self._metadata.cpu_count = self._num_of_cpus
if self._has_pynvml or self._has_amdsmi:
if len(self._gpu_handles) == 0:
self._metadata.gpu_type = ""
else:
self._metadata.gpu_type = self._gpu_lib_detected
except Exception as e:
self._metadata.error = str(e)
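Condensed, the updated metadata logic records GPU and CPU counts and only fills in gpu_type when the detected GPU library actually exposes at least one device. A stand-alone sketch of that behavior (field names are simplified stand-ins for the UsageLogger attributes, not the real class):

from typing import Optional

def summarize_gpu_metadata(gpu_lib_detected: Optional[str], gpu_handles: list, cpu_count: int) -> dict:
    meta = {
        "gpu_count": len(gpu_handles),
        "cpu_count": cpu_count,
        "gpu_type": None,
    }
    if gpu_lib_detected:  # stands in for self._has_pynvml or self._has_amdsmi
        # A detected GPU library with zero visible devices now yields an empty
        # gpu_type instead of the library name.
        meta["gpu_type"] = gpu_lib_detected if gpu_handles else ""
    return meta

print(summarize_gpu_metadata("pynvml", [], 16))      # gpu_type == ""
print(summarize_gpu_metadata("pynvml", ["h0"], 16))  # gpu_type == "pynvml"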


@@ -31,10 +31,11 @@ from tools.stats.utilization_stats_lib import (
)
USAGE_LOG_FILENAME = "usage_log.txt"
TEST_USAGE_LOG_FILENAME = "usage_log.txt"
CMD_PYTHON_LEVEL = "CMD_PYTHON"
UTILIZATION_BUCKET = "ossci-utilization"
PYTORCH_REPO = "pytorch/pytorch"
JOB_TEST_ARTIFACT_PREFIX = "logs-test"
class SegmentGenerator:
@@ -190,10 +191,12 @@ class UploadUtilizationData:
def __init__(
self,
artifact_prefix: str,
info: WorkflowInfo,
dry_run: bool = False,
debug: bool = False,
):
self.artifact_prefix = artifact_prefix
self.info = info
self.segment_generator = SegmentGenerator()
self.debug_mode = debug
@@ -201,7 +204,10 @@ class UploadUtilizationData:
def start(self) -> None:
metadata, valid_records, _ = self.get_log_data(
self.info.workflow_run_id, self.info.job_id, self.info.run_attempt
self.info.workflow_run_id,
self.info.job_id,
self.info.run_attempt,
self.artifact_prefix,
)
if not metadata:
@@ -271,12 +277,16 @@ class UploadUtilizationData:
upload_to_s3(bucket_name, key, docs)
def get_log_data(
self, workflow_run_id: int, job_id: int, workflow_run_attempt: int
self,
workflow_run_id: int,
job_id: int,
workflow_run_attempt: int,
artifact_prefix: str = JOB_TEST_ARTIFACT_PREFIX,
) -> tuple[
Optional[UtilizationMetadata], list[UtilizationRecord], list[UtilizationRecord]
]:
artifact_paths = download_s3_artifacts(
"logs-test", workflow_run_id, workflow_run_attempt, job_id
artifact_prefix, workflow_run_id, workflow_run_attempt, job_id
)
if len(artifact_paths) == 0:
print(
@@ -290,7 +300,10 @@ class UploadUtilizationData:
return None, [], []
p = artifact_paths[0]
test_log_content = unzip_file(p, USAGE_LOG_FILENAME)
test_log_content = handle_file(p)
if not test_log_content:
return None, [], []
metadata, records, error_records = self.convert_to_log_models(test_log_content)
if metadata is None:
@@ -354,6 +367,26 @@ class UploadUtilizationData:
return metadata, result_logs, error_logs
def handle_file(file_path: Path) -> str:
if file_path.match("*.zip"):
print(f"extracting {TEST_USAGE_LOG_FILENAME} from zip file {file_path}")
return unzip_file(file_path, TEST_USAGE_LOG_FILENAME)
elif file_path.match("*.txt"):
print(f"extracting {file_path}")
return read_file(file_path)
print(f"{file_path} is not a supported file type")
return ""
def read_file(file_path: Path) -> str:
try:
with open(file_path) as f:
return f.read()
except Exception as e:
print(f"::warning trying to download test log {object} failed by: {e}")
return ""
def unzip_file(path: Path, file_name: str) -> str:
try:
with zipfile.ZipFile(path) as zip_file:
@@ -412,6 +445,13 @@ def parse_args() -> argparse.Namespace:
parser.add_argument("--dry-run", action="store_true", help="Enable dry-run mode")
parser.add_argument(
"--artifact-prefix",
type=str,
required=False,
help="artifact prefix to download raw utilizarion data from s3",
)
return parser.parse_args()
@ -435,9 +475,15 @@ if __name__ == "__main__":
repo=repo,
)
artifact_prefix = JOB_TEST_ARTIFACT_PREFIX
if args.artifact_prefix:
artifact_prefix = args.artifact_prefix
print(f"args.artifact_prefix: {args.artifact_prefix}")
print(f"artifact_prefix: {artifact_prefix}")
ud = UploadUtilizationData(
info=workflow_info,
dry_run=args.dry_run,
debug=args.debug,
artifact_prefix=artifact_prefix,
)
ud.start()
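Putting the pieces together, the prefix resolution above amounts to: use the CLI value when provided, otherwise fall back to the historical test-job prefix. A small sketch, plus a hypothetical build-job invocation (the module path and the omitted flags are assumptions):

from typing import Optional

JOB_TEST_ARTIFACT_PREFIX = "logs-test"

def resolve_artifact_prefix(cli_value: Optional[str]) -> str:
    # Build jobs pass something like "usage_log_build_<job_id>"; anything falsy
    # falls back to the pre-existing test-job prefix.
    return cli_value if cli_value else JOB_TEST_ARTIFACT_PREFIX

assert resolve_artifact_prefix(None) == "logs-test"
assert resolve_artifact_prefix("usage_log_build_42") == "usage_log_build_42"

# Hypothetical invocation for a build job (other required flags omitted):
#   python3 -m tools.stats.upload_utilization_stats \
#       --workflow-name pull --workflow-run-attempt 1 \
#       --job-id 42 --job-name "linux-build / build" \
#       --artifact-prefix usage_log_build_42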