Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 21:14:14 +08:00)
[Monitoring] Add util for linux build (#153456)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/153456
Approved by: https://github.com/huydhn
Committed by: PyTorch MergeBot
Parent: be36bacdaa
Commit: c54b9f2969
@@ -23,6 +23,12 @@ inputs:
     type: string
     description: 'the job name of the test'
     required: True
+  artifact_prefix:
+    type: string
+    description: |
+      'the prefix of the raw utilization data, for data stored in zip file, this is the prefix of the parent zip file'
+    default: ""
+    required: False
 
 runs:
   using: composite
@@ -35,6 +41,7 @@ runs:
       echo "workflow_Name: ${{inputs.workflow_name}}"
       echo "job_id: ${{inputs.job_id}}"
       echo "job_name: ${{inputs.job_name}}"
+      echo "artifact_prefix: ${{inputs.artifact_prefix}}"
   - uses: nick-fields/retry@v3.0.0
     name: Setup dependencies
     with:
@@ -53,4 +60,5 @@ runs:
         --workflow-name "${{inputs.workflow_name}}" \
         --workflow-run-attempt "${{inputs.workflow_attempt}}" \
         --job-id "${{inputs.job_id}}" \
-        --job-name "${{inputs.job_name}}"
+        --job-name "${{inputs.job_name}}" \
+        --artifact-prefix "${{inputs.artifact_prefix}}"
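For orientation: the flag this action now forwards ends up in the uploader's argparse namespace (see the parse_args hunk later in this diff). A minimal, self-contained sketch of that handoff; the flag names come from the diff, while every value below is invented for illustration:

import argparse

# Parser mirroring the flags the composite action passes through.
parser = argparse.ArgumentParser()
parser.add_argument("--workflow-name", type=str)
parser.add_argument("--workflow-run-attempt", type=str)
parser.add_argument("--job-id", type=str)
parser.add_argument("--job-name", type=str)
parser.add_argument("--artifact-prefix", type=str, required=False)

args = parser.parse_args(
    [
        "--workflow-name", "pull",
        "--workflow-run-attempt", "1",
        "--job-id", "12345",
        "--job-name", "linux-build",
        "--artifact-prefix", "usage_log_build_12345",
    ]
)
print(args.artifact_prefix)  # -> usage_log_build_12345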
.github/workflows/_linux-build.yml (79 changes, vendored)
@@ -74,6 +74,24 @@ on:
           Overwrite the number of jobs to use for the build
         required: false
         type: string
+      disable-monitor:
+        description: |
+          Disable utilization monitoring for build job
+        required: false
+        type: boolean
+        default: false
+      monitor-log-interval:
+        description: |
+          Set the interval for the monitor script to log utilization.
+        required: false
+        type: number
+        default: 5
+      monitor-data-collect-interval:
+        description: |
+          Set the interval for the monitor script to collect data.
+        required: false
+        type: number
+        default: 1
 
   secrets:
     HUGGING_FACE_HUB_TOKEN:
@@ -176,6 +194,27 @@ jobs:
           selected-test-configs: ${{ inputs.selected-test-configs }}
           job-name: ${{ steps.get-job-id.outputs.job-name }}
 
+      - name: Start monitoring script
+        id: monitor-script
+        if: ${{ !inputs.disable-monitor }}
+        shell: bash
+        continue-on-error: true
+        env:
+          JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
+          JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
+          WORKFLOW_NAME: ${{ github.workflow }}
+          WORKFLOW_RUN_ID: ${{github.run_id}}
+          MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
+          MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
+        run: |
+          mkdir -p ../../usage_logs
+          python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7
+          python3 -m tools.stats.monitor \
+            --log-interval "$MONITOR_LOG_INTERVAL" \
+            --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" \
+            > "../../usage_logs/usage_log_build_${JOB_ID}.txt" 2>&1 &
+          echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
+
       - name: Download pytest cache
         uses: ./.github/actions/pytest-cache-download
         continue-on-error: true
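A rough mental model of what that background process does (a minimal sketch, not the actual tools/stats/monitor implementation): sample utilization every data-collect interval, emit an aggregated record every log interval, and run until killed, so the redirected stdout becomes usage_log_build_<job id>.txt.

import json
import time

import psutil  # the workflow step pins psutil==5.9.1

LOG_INTERVAL = 5      # mirrors monitor-log-interval
COLLECT_INTERVAL = 1  # mirrors monitor-data-collect-interval

samples = []
last_log = time.time()
while True:  # runs until the "Stop monitoring script" step kills the pid
    samples.append(psutil.cpu_percent(interval=None))
    if time.time() - last_log >= LOG_INTERVAL:
        print(json.dumps({"avg_cpu_percent": sum(samples) / len(samples)}), flush=True)
        samples, last_log = [], time.time()
    time.sleep(COLLECT_INTERVAL)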
@@ -280,6 +319,15 @@ jobs:
           END_TIME=$(date +%s)
           echo "build_time=$((END_TIME - START_TIME))" >> "$GITHUB_OUTPUT"
 
+      - name: Stop monitoring script
+        if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
+        shell: bash
+        continue-on-error: true
+        env:
+          MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
+        run: |
+          kill "$MONITOR_SCRIPT_PID"
+
       - name: Archive artifacts into zip
         if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped'
         run: |
@@ -304,6 +352,25 @@ jobs:
           if-no-files-found: error
           path: artifacts.zip
 
+      - name: copy logs
+        shell: bash
+        if: ${{ always() && steps.build.outcome != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel'}}
+        continue-on-error: true
+        run: |
+          rm -f ./usage_logs
+          mkdir -p ./usage_logs
+          cp ../../usage_logs/usage_log_build_*.txt ./usage_logs/
+
+      - name: Upload raw usage log to s3
+        if: ${{ always() && steps.build.outcome != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel'}}
+        uses: seemethere/upload-artifact-s3@v5
+        with:
+          s3-prefix: |
+            ${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
+          retention-days: 14
+          if-no-files-found: warn
+          path: usage_logs/usage_log_build_*.txt
+
       - name: Upload sccache stats
         if: steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel'
         uses: ./.github/actions/upload-sccache-stats
@@ -311,6 +378,18 @@ jobs:
           github-token: ${{ secrets.GITHUB_TOKEN }}
           build-time: ${{ steps.build.outputs.build_time }}
 
+      - name: Upload utilization stats
+        if: ${{ always() && steps.build.outcome != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
+        continue-on-error: true
+        uses: ./.github/actions/upload-utilization-stats
+        with:
+          job_id: ${{ steps.get-job-id.outputs.job-id }}
+          job_name: ${{ steps.get-job-id.outputs.job-name }}
+          workflow_name: ${{ github.workflow }}
+          workflow_run_id: ${{github.run_id}}
+          workflow_attempt: ${{github.run_attempt}}
+          artifact_prefix: usage_log_build_${{ steps.get-job-id.outputs.job-id }}
+
       - name: Teardown Linux
         uses: pytorch/test-infra/.github/actions/teardown-linux@main
         if: always() && inputs.build-environment != 'linux-s390x-binary-manywheel'
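Worth noting how the names line up across steps: the monitor writes usage_log_build_<job id>.txt, the copy/upload steps ship it to S3, and artifact_prefix hands the same stem to upload-utilization-stats so the uploader can locate the raw log. A toy check of that naming handshake (the job id is invented):

job_id = 12345  # placeholder value for illustration
log_file = f"usage_log_build_{job_id}.txt"     # written by the monitor step
artifact_prefix = f"usage_log_build_{job_id}"  # passed to upload-utilization-stats
assert log_file.startswith(artifact_prefix)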
.gitignore (1 change, vendored)
@@ -47,6 +47,7 @@ docs/source/generated/
 docs/source/compile/generated/
 log
 usage_log.txt
+usage_log*
 test-reports/
 test/*.bak
 test/**/*.bak
@@ -415,10 +415,14 @@ class UsageLogger:
 
             self._num_of_cpus = psutil.cpu_count(logical=True)
             # update summary info
-            self._metadata.gpu_type = self._gpu_lib_detected
             self._metadata.gpu_count = len(self._gpu_handles)
             self._metadata.cpu_count = self._num_of_cpus
 
+            if self._has_pynvml or self._has_amdsmi:
+                if len(self._gpu_handles) == 0:
+                    self._metadata.gpu_type = ""
+                else:
+                    self._metadata.gpu_type = self._gpu_lib_detected
         except Exception as e:
             self._metadata.error = str(e)
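The restructured block only reports a gpu_type when the detected GPU library actually returned device handles; a GPU library can import cleanly on a CPU-only build machine. Restated as a standalone snippet (the function name and signature are illustrative, not from the source):

def resolve_gpu_type(gpu_lib_detected: str, num_gpu_handles: int) -> str:
    # Mirrors the new UsageLogger behavior: zero device handles means
    # "no GPU" rather than the name of the detected library.
    if num_gpu_handles == 0:
        return ""
    return gpu_lib_detected

assert resolve_gpu_type("pynvml", 0) == ""
assert resolve_gpu_type("pynvml", 8) == "pynvml"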
@@ -31,10 +31,11 @@ from tools.stats.utilization_stats_lib import (
 )
 
 
-USAGE_LOG_FILENAME = "usage_log.txt"
+TEST_USAGE_LOG_FILENAME = "usage_log.txt"
 CMD_PYTHON_LEVEL = "CMD_PYTHON"
 UTILIZATION_BUCKET = "ossci-utilization"
 PYTORCH_REPO = "pytorch/pytorch"
+JOB_TEST_ARTIFACT_PREFIX = "logs-test"
 
 
 class SegmentGenerator:
@@ -190,10 +191,12 @@ class UploadUtilizationData:
 
     def __init__(
         self,
+        artifact_prefix: str,
         info: WorkflowInfo,
         dry_run: bool = False,
         debug: bool = False,
     ):
+        self.artifact_prefix = artifact_prefix
         self.info = info
         self.segment_generator = SegmentGenerator()
         self.debug_mode = debug
@@ -201,7 +204,10 @@ class UploadUtilizationData:
 
     def start(self) -> None:
         metadata, valid_records, _ = self.get_log_data(
-            self.info.workflow_run_id, self.info.job_id, self.info.run_attempt
+            self.info.workflow_run_id,
+            self.info.job_id,
+            self.info.run_attempt,
+            self.artifact_prefix,
         )
 
         if not metadata:
@@ -271,12 +277,16 @@ class UploadUtilizationData:
         upload_to_s3(bucket_name, key, docs)
 
     def get_log_data(
-        self, workflow_run_id: int, job_id: int, workflow_run_attempt: int
+        self,
+        workflow_run_id: int,
+        job_id: int,
+        workflow_run_attempt: int,
+        artifact_prefix: str = JOB_TEST_ARTIFACT_PREFIX,
     ) -> tuple[
         Optional[UtilizationMetadata], list[UtilizationRecord], list[UtilizationRecord]
     ]:
         artifact_paths = download_s3_artifacts(
-            "logs-test", workflow_run_id, workflow_run_attempt, job_id
+            artifact_prefix, workflow_run_id, workflow_run_attempt, job_id
         )
         if len(artifact_paths) == 0:
             print(
@@ -290,7 +300,10 @@ class UploadUtilizationData:
             return None, [], []
 
         p = artifact_paths[0]
-        test_log_content = unzip_file(p, USAGE_LOG_FILENAME)
+
+        test_log_content = handle_file(p)
+        if not test_log_content:
+            return None, [], []
 
         metadata, records, error_records = self.convert_to_log_models(test_log_content)
         if metadata is None:
@@ -354,6 +367,26 @@
         return metadata, result_logs, error_logs
 
 
+def handle_file(file_path: Path) -> str:
+    if file_path.match("*.zip"):
+        print(f"extracting {TEST_USAGE_LOG_FILENAME} from zip file {file_path}")
+        return unzip_file(file_path, TEST_USAGE_LOG_FILENAME)
+    elif file_path.match("*.txt"):
+        print(f"extracting {file_path}")
+        return read_file(file_path)
+    print(f"{file_path} is not a supported file type")
+    return ""
+
+
+def read_file(file_path: Path) -> str:
+    try:
+        with open(file_path) as f:
+            return f.read()
+    except Exception as e:
+        print(f"::warning trying to download test log {object} failed by: {e}")
+        return ""
+
+
 def unzip_file(path: Path, file_name: str) -> str:
     try:
         with zipfile.ZipFile(path) as zip_file:
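handle_file dispatches on the artifact's suffix: zipped test logs keep the old unzip_file path, while the raw .txt logs the build workflow now uploads are read directly. A quick, self-contained illustration of the Path.match checks it relies on (file names invented):

from pathlib import Path

for name in ("usage_log_build_12345.zip", "usage_log_build_12345.txt", "notes.md"):
    p = Path(name)
    if p.match("*.zip"):
        kind = "unzip_file path"
    elif p.match("*.txt"):
        kind = "read_file path"
    else:
        kind = "unsupported"
    print(name, "->", kind)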
@@ -412,6 +445,13 @@ def parse_args() -> argparse.Namespace:
 
     parser.add_argument("--dry-run", action="store_true", help="Enable dry-run mode")
 
+    parser.add_argument(
+        "--artifact-prefix",
+        type=str,
+        required=False,
+        help="artifact prefix to download raw utilizarion data from s3",
+    )
+
     return parser.parse_args()
@@ -435,9 +475,15 @@ if __name__ == "__main__":
         repo=repo,
     )
 
+    artifact_prefix = JOB_TEST_ARTIFACT_PREFIX
+    if args.artifact_prefix:
+        artifact_prefix = args.artifact_prefix
+        print(f"args.artifact_prefix: {args.artifact_prefix}")
+    print(f"artifact_prefix: {artifact_prefix}")
     ud = UploadUtilizationData(
         info=workflow_info,
         dry_run=args.dry_run,
         debug=args.debug,
+        artifact_prefix=artifact_prefix,
     )
     ud.start()
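The prefix resolution above keeps existing callers working: without --artifact-prefix the uploader still targets the logs-test artifacts, so only the build workflow opts into the raw-log path. The rule, restated as a runnable snippet (the helper name is illustrative):

from typing import Optional

JOB_TEST_ARTIFACT_PREFIX = "logs-test"  # default defined earlier in the module

def resolve_artifact_prefix(cli_value: Optional[str]) -> str:
    # An explicit --artifact-prefix wins; otherwise fall back to the
    # test-log default so pre-existing test uploads are unaffected.
    return cli_value if cli_value else JOB_TEST_ARTIFACT_PREFIX

assert resolve_artifact_prefix(None) == "logs-test"
assert resolve_artifact_prefix("usage_log_build_12345") == "usage_log_build_12345"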