mirror of
				https://github.com/pytorch/pytorch.git
				synced 2025-11-04 16:04:58 +08:00 
			
		
		
		
	This diff implements the starting steps of https://docs.google.com/document/u/2/d/1kAEBt4AyW7HTAhXHbjoz8FBFHNyyEA2Qo2mPn7v3WUQ/edit?usp=drive_web&ouid=113555078003219714709 It implements the following changes: - Only log spans to scuba, so no start events are ever logged - Log events as the full event name, without "START" or "END" - Only log to scuba major phases from chromium events. These are: - entire_frame_compile (dynamo) - backend_compile (aotdispatch) - inductor_compile (inductor) - codegen (inductor codegen) Tlparse chromium events stay basically the same. But I implemented a few changes to clean that up as well: - When there's a phase name available, log the phase name instead of the function name as the event name. This simplifies the trace to not have two identical rows. The fn_name is available as metadata on the chromium event, if interested - Log new events for pre and post grad passes. These do *not* log to scuba. By making the phases much simpler in Scuba, with only categories for major phases of PT2 Compilation, we pave the way to add **much** more metadata and information to each individual event type. Diffs for that will come later. **IMPLEMENTATION NOTES:** - The logic for `log_chromium_event_internal` (which is the function that logs to Scuba) lives in chromium_events for now, but in the future as we add more metadata, it may belong independently in dynamo_timed or even outside of dynamo_timed. I haven't explored in detail what the refactor will look like. Once we start logging metadata for dynamo, aotdispatch, inductor, I suspect we will call log_pt2_compile_event directly, instead of making chromium event logger handle the pt2_compile_event logic. But that refactor is left for another PR on top of this one. - There's an interesting space after pre grad passes within AOT autograd logic, that's between create_aot_dispatcher_function and pre grad passes. I'm not sure what we're spending time doing in that time, but I'll find out with a profile later.
Differential Revision: [D64479033](https://our.internmc.facebook.com/intern/diff/D64479033/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/138093 Approved by: https://github.com/ezyang
		
			
				
	
	
		
			367 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			367 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
# mypy: allow-untyped-defs
 | 
						|
import functools
 | 
						|
import logging
 | 
						|
import os
 | 
						|
import sys
 | 
						|
import tempfile
 | 
						|
from typing import Any, Dict, List, Optional
 | 
						|
 | 
						|
import torch
 | 
						|
from torch._strobelight.compile_time_profiler import StrobelightCompileTimeProfiler
 | 
						|
 | 
						|
 | 
						|
log = logging.getLogger(__name__)

# Opt-in hook for Strobelight compile-time profiling, gated by an env var.
# NOTE(review): os.environ.get returns a *string* when the variable is set, so
# any non-empty value — including "0" or "false" — is truthy and enables this
# branch; confirm that is the intended contract before tightening it.
if os.environ.get("TORCH_COMPILE_STROBELIGHT", False):
    import shutil

    # strobeclient is only present on FB infra; its absence means we cannot
    # actually profile, so we just inform the user and move on.
    if not shutil.which("strobeclient"):
        log.info(
            "TORCH_COMPILE_STROBELIGHT is true, but seems like you are not on a FB machine."
        )
    else:
        log.info("Strobelight profiler is enabled via environment variable")
        StrobelightCompileTimeProfiler.enable()
 | 
						|
 | 
						|
# this arbitrary-looking assortment of functionality is provided here
# to have a central place for overrideable behavior. The motivating
# use is the FB build environment, where this source file is replaced
# by an equivalent.

if torch._running_with_deploy():
    # __file__ is meaningless in the context of frozen torch used in torch deploy.
    # setting empty torch_parent should allow below functions to operate without crashing,
    # but it's unclear if there is a valid use case for them in the context of deploy.
    torch_parent = ""
else:
    # torch_parent is the directory that *contains* the torch package.  When
    # this module lives in a "shared" subdirectory (an extra nesting level used
    # by some build layouts), we need to go up one more directory to reach it.
    if os.path.basename(os.path.dirname(__file__)) == "shared":
        torch_parent = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
    else:
        torch_parent = os.path.dirname(os.path.dirname(__file__))
 | 
						|
 | 
						|
 | 
						|
def get_file_path(*path_components: str) -> str:
    """Resolve *path_components* relative to the directory containing torch."""
    parts = (torch_parent, *path_components)
    return os.path.join(*parts)
 | 
						|
 | 
						|
 | 
						|
def get_file_path_2(*path_components: str) -> str:
    """Join the given path components; unlike get_file_path, adds no prefix."""
    joined = os.path.join(*path_components)
    return joined
 | 
						|
 | 
						|
 | 
						|
def get_writable_path(path: str) -> str:
    """Return *path* if it is writable, otherwise a fresh temporary directory.

    The fallback directory's name ends with the basename of *path* so its
    origin is recognizable.
    """
    if not os.access(path, os.W_OK):
        return tempfile.mkdtemp(suffix=os.path.basename(path))
    return path
 | 
						|
 | 
						|
 | 
						|
def prepare_multiprocessing_environment(path: str) -> None:
    """No-op in OSS; the FB build replaces this file with an equivalent."""
    return None
 | 
						|
 | 
						|
 | 
						|
def resolve_library_path(path: str) -> str:
    """Canonicalize *path*, resolving symlinks along the way."""
    resolved = os.path.realpath(path)
    return resolved
 | 
						|
 | 
						|
 | 
						|
def throw_abstract_impl_not_imported_error(opname, module, context):
    """Raise NotImplementedError for an operator with no registered fake impl.

    If *module* has not been imported yet, the message additionally suggests
    importing it, since the fake impl may live there.
    """
    base_msg = f"{opname}: We could not find the fake impl for this operator. "
    if module not in sys.modules:
        raise NotImplementedError(
            base_msg
            + f"The operator specified that you may need to import the '{module}' "
            + f"Python module to load the fake impl. {context}"
        )
    raise NotImplementedError(base_msg)
 | 
						|
 | 
						|
 | 
						|
# NB!  This treats "skip" kwarg specially!!
 | 
						|
def compile_time_strobelight_meta(phase_name):
    """Decorator factory attaching Strobelight compile-time profiling.

    When the profiler is disabled the wrapped function runs untouched;
    otherwise it is routed through profile_compile_time under *phase_name*.

    NB: a ``skip`` keyword argument, if present, is incremented by one so the
    extra wrapper frame introduced here is accounted for by the callee.
    """

    def compile_time_strobelight_meta_inner(function):
        @functools.wraps(function)
        def wrapper_function(*args, **kwargs):
            # Account for this wrapper's own stack frame.
            if "skip" in kwargs:
                kwargs["skip"] += 1

            if not StrobelightCompileTimeProfiler.enabled:
                return function(*args, **kwargs)

            return StrobelightCompileTimeProfiler.profile_compile_time(
                function, phase_name, *args, **kwargs
            )

        return wrapper_function

    return compile_time_strobelight_meta_inner
 | 
						|
 | 
						|
 | 
						|
# Meta only, see
 | 
						|
# https://www.internalfb.com/intern/wiki/ML_Workflow_Observability/User_Guides/Adding_instrumentation_to_your_code/
 | 
						|
#
 | 
						|
# This will cause an event to get logged to Scuba via the signposts API.  You
 | 
						|
# can view samples on the API at https://fburl.com/scuba/workflow_signpost/zh9wmpqs
 | 
						|
# we log to subsystem "torch", and the category and name you provide here.
 | 
						|
# Each of the arguments translate into a Scuba column.  We're still figuring
 | 
						|
# out local conventions in PyTorch, but category should be something like
 | 
						|
# "dynamo" or "inductor", and name should be a specific string describing what
 | 
						|
# kind of event happened.
 | 
						|
#
 | 
						|
# Killswitch is at
 | 
						|
# https://www.internalfb.com/intern/justknobs/?name=pytorch%2Fsignpost#event
 | 
						|
def signpost_event(category: str, name: str, parameters: Dict[str, Any]):
    """OSS stub for the FB signposts API: just log the event locally.

    Internally this would emit a Scuba sample (see the comment block above
    this function); here it only records to the module logger.
    """
    log.info("%s %s: %r", category, name, parameters)
 | 
						|
 | 
						|
 | 
						|
def log_compilation_event(metrics):
    """Record compilation *metrics*; the OSS build just logs them."""
    log.info("%s", metrics)
 | 
						|
 | 
						|
 | 
						|
def upload_graph(graph):
    """No-op in OSS; the FB build replaces this file with an equivalent."""
    return None
 | 
						|
 | 
						|
 | 
						|
def set_pytorch_distributed_envs_from_justknobs():
    """No-op in OSS; the FB build replaces this file with an equivalent."""
    return None
 | 
						|
 | 
						|
 | 
						|
def log_export_usage(**kwargs):
    """No-op export-usage telemetry hook (FB-internal builds override it)."""
    return None
 | 
						|
 | 
						|
 | 
						|
def log_trace_structured_event(*args, **kwargs) -> None:
    """No-op structured-trace telemetry hook (FB-internal builds override it)."""
    return None
 | 
						|
 | 
						|
 | 
						|
def log_cache_bypass(*args, **kwargs) -> None:
    """No-op cache-bypass telemetry hook (FB-internal builds override it)."""
    return None
 | 
						|
 | 
						|
 | 
						|
def log_torchscript_usage(api: str, **kwargs):
    """No-op TorchScript usage telemetry hook; arguments are ignored in OSS."""
    return None
 | 
						|
 | 
						|
 | 
						|
def check_if_torch_exportable():
    """Always False in OSS; FB-internal builds may enable exportability checks."""
    return False
 | 
						|
 | 
						|
 | 
						|
def export_training_ir_rollout_check() -> bool:
    """Gate for the training-IR export rollout; fixed at False in OSS."""
    return False
 | 
						|
 | 
						|
 | 
						|
def log_torch_jit_trace_exportability(
    api: str,
    type_of_export: str,
    export_outcome: str,
    result: str,
):
    """No-op exportability telemetry hook; all arguments are ignored in OSS."""
    return None
 | 
						|
 | 
						|
 | 
						|
def capture_pre_autograd_graph_using_training_ir() -> bool:
    """Whether capture_pre_autograd_graph uses the training IR; False in OSS."""
    return False
 | 
						|
 | 
						|
 | 
						|
class JustKnobsConfig:
    """A config knob whose boolean value is resolved lazily via justknobs.

    Declare a configuration entry whose effective value comes from (in order
    of precedence) an explicit ``set()`` call, an environment variable, or
    JK (FB) / the supplied default (OSS)::

        foo.bar = JustknobsConfig(name="//foo:bar", env_name="FORCE_FOO_BAR")
        if foo.bar.get():
            ...

    Resolution happens exactly once, on first access, and the result is then
    frozen — fewer surprises, at the cost of needing a job restart to pick up
    an update.  ``name`` may be omitted, so this can also replace old
    configurations that have no JK entry.
    """

    def __init__(
        self, *, name: Optional[str] = None, env_name=None, default: bool = True
    ):
        self.name = name
        self.env_name = env_name
        self.default = default
        # Explicit override installed via set(); forwarded to justknobs_feature.
        self.value: Optional[bool] = None
        # Memoized result of the first get() call.
        self.executed_value = None

    def set(self, value: bool):
        """Force the knob to *value*, taking precedence over env/JK lookups."""
        self.value = value

    def get(self):
        """Resolve (once) and return the effective boolean value."""
        if self.executed_value is None:
            self.executed_value = justknobs_feature(
                self.name,
                config_value=self.value,
                env_name=self.env_name,
                default=self.default,
            )
        return self.executed_value

    def __bool__(self):
        return self.get()

    def __str__(self):
        v = bool(self)
        return f"JustknobsConfig(name={self.name}, env_name={self.env_name}, default={self.default} - evals_to={v})"
 | 
						|
 | 
						|
 | 
						|
def justknobs_feature(
    name: Optional[str], config_value=None, env_name=None, default: bool = True
):
    """Decide whether the feature guarded by *name* is enabled.

    A slightly higher-level convenience over ``justknobs_check``, designed to
    make it easy to do the right thing: configuration wins over the
    environment variable, which wins over JK (FB) / *default* (OSS).

    Quickstart: have a config variable, make a JK set to your "enabled" value
    (generally true), and use this to check it (if the JK is false, change the
    default).  If you also have an env variable, pass it here too.

    Args:
        name: corresponds 1:1 to a JK name internally at FB; ``None`` is
            allowed (always enabled).
        config_value: anything other than ``None`` is returned as-is,
            bypassing every other source (FB has machinery to force-override
            these configs).
        env_name: environment variable consulted when set; "1"/"TRUE" and
            "0"/"FALSE" (case-insensitive) are recognized, anything else is
            logged and treated as True.
        default: the value effectively returned in OSS; when False, the JK
            result is negated so killswitches read naturally without double
            negatives.

    WARNING: don't use this at import time — simply pass in the existing
    config.  If you want config-time behavior, use JustKnobsConfig.
    """
    if config_value is not None:
        return config_value

    if env_name is not None:
        env = os.getenv(env_name)
        if env is not None:
            env = env.upper()
            if env in ("1", "TRUE"):
                return True
            if env in ("0", "FALSE"):
                return False
            log.error(
                "Difficulty parsing env variable %s=%s for feature %s - Assuming env variable means true and returning True",
                env_name,
                env,
                name,
            )
            # Returning default here would be confusing to log, so return True.
            return True

    if name is None:
        return True
    result = justknobs_check(name)
    return result if default else not result
 | 
						|
 | 
						|
 | 
						|
def justknobs_check(name: str) -> bool:
    """Killswitch hook: always True in OSS.

    At FB this consults JustKnobs, letting functionality be toggled off in
    prod without a code push.  In OSS everything is permanently on, since
    downstream users can simply choose not to update PyTorch — behavior is
    tied to source code, as there is no live server to query.  (If
    finer-grained enable/disable were ever needed, a name-keyed map could
    toggle behavior here.)

    This is the bare minimum needed for some killswitches; a more detailed
    plan lives at
    https://docs.google.com/document/d/1Ukerh9_42SeGh89J-tGtecpHBPwGlkQ043pddkKb3PU/edit
    In particular, some circumstances require reading a knob once at process
    start and using it consistently for the rest of the process; future
    functionality will codify these patterns into a better high-level API.

    WARNING: Do NOT call this at module import time — JK is not fork safe and
    doing so will break anyone who forks the process and then hits JK again.
    """
    return True
 | 
						|
 | 
						|
 | 
						|
def justknobs_getval_int(name: str) -> int:
    """Integer-valued JK read; fixed at 0 in OSS.

    Read the warning on justknobs_check before using.
    """
    return 0
 | 
						|
 | 
						|
 | 
						|
def is_fb_unit_test() -> bool:
    """Always False in OSS; FB-internal test runners override this."""
    return False
 | 
						|
 | 
						|
 | 
						|
@functools.lru_cache(None)
def max_clock_rate():
    """Return the device's max SM clock rate in MHz (cached after first call).

    On CUDA this queries nvidia-smi via triton; on ROCm the values are
    hard-coded per gcnArchName until equivalent nvsmi functionality exists in
    triton.testing or via pyamdsmi.  Required for test_snode_runtime unit
    tests.
    """
    if torch.version.hip:
        gcn_arch = str(torch.cuda.get_device_properties(0).gcnArchName.split(":", 1)[0])
        # Ordered substring match — order matters (e.g. "gfx11" must be
        # checked before "gfx103"/"gfx101" style prefixes would be reached).
        rocm_clocks = (
            ("gfx94", 1700),
            ("gfx90a", 1700),
            ("gfx908", 1502),
            ("gfx11", 1700),
            ("gfx103", 1967),
            ("gfx101", 1144),
        )
        for pattern, mhz in rocm_clocks:
            if pattern in gcn_arch:
                return mhz
        return 1100

    from triton.testing import nvsmi

    return nvsmi(["clocks.max.sm"])[0]
 | 
						|
 | 
						|
 | 
						|
# Loopback rendezvous defaults — presumably consumed by distributed test
# utilities; verify against callers before changing.
TEST_MASTER_ADDR = "127.0.0.1"
TEST_MASTER_PORT = 29500
# USE_GLOBAL_DEPS controls whether __init__.py tries to load
# libtorch_global_deps, see Note [Global dependencies]
USE_GLOBAL_DEPS = True
# USE_RTLD_GLOBAL_WITH_LIBTORCH controls whether __init__.py tries to load
# _C.so with RTLD_GLOBAL during the call to dlopen.
USE_RTLD_GLOBAL_WITH_LIBTORCH = False
# If an op was defined in C++ and extended from Python using the
# torch.library.register_fake, returns if we require that there be a
# m.set_python_module("mylib.ops") call from C++ that associates
# the C++ op with a python module.
REQUIRES_SET_PYTHON_MODULE = False
 | 
						|
 | 
						|
 | 
						|
def maybe_upload_prof_stats_to_manifold(profile_path: str) -> Optional[str]:
    """Upload the profile at *profile_path* (FB-only); OSS just announces the no-op."""
    print("Uploading profile stats (fb-only otherwise no-op)")
    return None
 | 
						|
 | 
						|
 | 
						|
def log_chromium_event_internal(
    event: Dict[str, Any],
    stack: List[str],
    compile_id: Optional[str],
    logger_uuid: str,
    start_time_ns: int,
):
    """No-op in OSS; the FB-internal build logs the chromium *event* (to Scuba,
    per this file's replacement convention). All arguments are ignored here."""
    return None
 |