mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 12:54:11 +08:00
Add torch._logging.scribe (#135224)
See https://github.com/pytorch/pytorch/pull/135138 for a usage example. Meta only, see https://docs.google.com/document/d/1JpbAQvRhTmuxjnKKjT7qq57dsnV84nxSLpWJo1abJuE/edit#heading=h.9wi46k7np6xw for context. fbscribelogger is a library that allows us to write to scribe, which is Meta's logging infrastructure, when you have an appropriate access token (this token is available for jobs running on main, as well as authorized jobs with the ci-scribe label). The resulting data is accessible via Scuba (a real-time in-memory database) and Hive (a more traditional SQL persisted database). Here's the motivating use case. Suppose there is somewhere in PyTorch's codebase where you'd like to log an event, and then you'd like to find all the situations where this log is called. If PyTorch is rolled out to our internal users, we have some FB-oriented APIs (like torch._utils_internal.signpost_event) with which you can do this. But you have to actually land your PR to main, wait for it to be ingested to fbcode, and then wait for us to actually roll out this version, before you get any data. But what if you want the results within the next few hours? Instead, you can use torch._logging.scribe to directly write to our logging infrastructure *from inside CI jobs.* The most convenient approach is to log unstructured JSON blobs to `open_source_signpost` (added in this PR; you can also add your own dedicated table as described in the GDoc above). After adding logging code to your code, you can push your PR to CI, add the 'ci-scribe' label, and in a few hours view the results in Scuba, e.g., (Meta-only) https://fburl.com/scuba/torch_open_source_signpost/z2mq8o4l If you want continuous logging on all commits on master, you can land your PR and it will continuously get logging for all CI runs that happen on main.
Eventually, if your dataset is important enough, you can consider collaborating with PyTorch Dev Infra to get the data collected in our public AWS cloud so that OSS users can view it without access to Meta's internal tools. But this facility is really good for prototyping / one-off experiments. It's entirely self-serve: just add your logging, run your PR CI with ci-scribe, get results, do analysis in Scuba. Signed-off-by: Edward Z. Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/135224 Approved by: https://github.com/Skylion007
This commit is contained in:
committed by
PyTorch MergeBot
parent
3c8f71ff93
commit
3825607144
@ -36,6 +36,11 @@ expecttest==0.2.1
|
||||
#Pinned versions: 0.2.1
|
||||
#test that import:
|
||||
|
||||
fbscribelogger==0.1.6
|
||||
#Description: write to scribe from authenticated jobs on CI
|
||||
#Pinned versions: 0.1.6
|
||||
#test that import:
|
||||
|
||||
flatbuffers==2.0
|
||||
#Description: cross platform serialization library
|
||||
#Pinned versions: 2.0
|
||||
|
@ -1,6 +1,7 @@
|
||||
boto3==1.19.12
|
||||
hypothesis==6.56.4
|
||||
expecttest==0.2.1
|
||||
fbscribelogger==0.1.6
|
||||
librosa>=0.6.2
|
||||
mpmath==1.3.0
|
||||
networkx==2.8.7
|
||||
|
2
.github/workflows/lint.yml
vendored
2
.github/workflows/lint.yml
vendored
@ -223,7 +223,7 @@ jobs:
|
||||
cache: pip
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip install pytest-rerunfailures==11.1.* pytest-flakefinder==1.1.* pytest-xdist==3.3.* expecttest==0.2.* numpy==1.24.*
|
||||
pip install pytest-rerunfailures==11.1.* pytest-flakefinder==1.1.* pytest-xdist==3.3.* expecttest==0.2.* fbscribelogger==0.1.* numpy==1.24.*
|
||||
pip install torch --pre --index-url https://download.pytorch.org/whl/nightly/cpu/
|
||||
- name: Run run_test.py (nonretryable)
|
||||
run: |
|
||||
|
61
torch/_logging/scribe.py
Normal file
61
torch/_logging/scribe.py
Normal file
@ -0,0 +1,61 @@
|
||||
from typing import Callable, List, Union
|
||||
from typing_extensions import TypeAlias
|
||||
|
||||
|
||||
try:
|
||||
from fbscribelogger import make_scribe_logger # type: ignore[import-untyped]
|
||||
except ImportError:
|
||||
TAtom: TypeAlias = Union[int, float, bool, str]
|
||||
TField: TypeAlias = Union[TAtom, List[TAtom]]
|
||||
TLazyField: TypeAlias = Union[TField, Callable[[], TField]]
|
||||
|
||||
def make_scribe_logger(name: str, thrift_src: str) -> Callable[..., None]:
|
||||
def inner(**kwargs: TLazyField) -> None:
|
||||
pass
|
||||
|
||||
return inner
|
||||
|
||||
|
||||
# Logger created under the name "TorchOpenSourceSignpost". The Thrift struct
# passed as the second argument declares the fields of one log entry; every
# field is optional. Field ids 1-3 and 15 are not declared in this struct.
# The struct text is runtime data handed to make_scribe_logger, so its wording
# is kept verbatim.
open_source_signpost = make_scribe_logger(
    "TorchOpenSourceSignpost",
    """
struct TorchOpenSourceSignpostLogEntry {

# The commit SHA that triggered the workflow, e.g., 02a6b1d30f338206a71d0b75bfa09d85fac0028a. Derived from GITHUB_SHA.
4: optional string commit_sha;

# Commit date (not author date) of the commit in commit_sha as timestamp, e.g., 1724208105. Increasing if merge bot is used, though not monotonic; duplicates occur when stack is landed.
5: optional i64 commit_date;

# The fully-formed ref of the branch or tag that triggered the workflow run, e.g., refs/pull/133891/merge or refs/heads/main. Derived from GITHUB_REF.
6: optional string github_ref;

# Indicates if branch protections or rulesets are configured for the ref that triggered the workflow run. Derived from GITHUB_REF_PROTECTED.
7: optional bool github_ref_protected;

# A unique number for each attempt of a particular workflow run in a repository, e.g., 1. Derived from GITHUB_RUN_ATTEMPT.
8: optional string github_run_attempt;

# A unique number for each workflow run within a repository, e.g., 19471190684. Derived from GITHUB_RUN_ID.
9: optional string github_run_id;

# A unique number for each run of a particular workflow in a repository, e.g., 238742. Derived from GITHUB_RUN_NUMBER.
10: optional string github_run_number_str;

# The name of the current job. Derived from JOB_NAME, e.g., linux-jammy-py3.8-gcc11 / test (default, 3, 4, amz2023.linux.2xlarge).
11: optional string job_name;

# The GitHub user who triggered the job. Derived from GITHUB_TRIGGERING_ACTOR.
12: optional string github_triggering_actor;
13: optional string name; # Event name
14: optional string parameters; # Parameters (JSON data)
16: optional string subsystem; # Subsystem the event is associated with

# The unit timestamp in second for the Scuba Time Column override
17: optional i64 time;

# The weight of the record according to current sampling rate
18: optional i64 weight;
}
""",  # noqa: B950
)
|
Reference in New Issue
Block a user