[cudagraph] fix verbose graph logging (#126694)

According to the [doc](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g0907ca7a1e7d0211b71ee49c5403072b): > enum cudaGraphDebugDotFlags > CUDA Graph debug write options > > Values > cudaGraphDebugDotFlagsVerbose = 1<<0 > Output all debug data as if every debug flag is enabled > cudaGraphDebugDotFlagsKernelNodeParams = 1<<2 > Adds cudaKernelNodeParams to output > cudaGraphDebugDotFlagsMemcpyNodeParams = 1<<3 > Adds cudaMemcpy3DParms to output > cudaGraphDebugDotFlagsMemsetNodeParams = 1<<4 > Adds cudaMemsetParams to output > cudaGraphDebugDotFlagsHostNodeParams = 1<<5 > Adds cudaHostNodeParams to output > cudaGraphDebugDotFlagsEventNodeParams = 1<<6 > Adds cudaEvent_t handle from record and wait nodes to output > cudaGraphDebugDotFlagsExtSemasSignalNodeParams = 1<<7 > Adds cudaExternalSemaphoreSignalNodeParams values to output > cudaGraphDebugDotFlagsExtSemasWaitNodeParams = 1<<8 > Adds cudaExternalSemaphoreWaitNodeParams to output > cudaGraphDebugDotFlagsKernelNodeAttributes = 1<<9 > Adds cudaKernelNodeAttrID values to output > cudaGraphDebugDotFlagsHandles = 1<<10 > Adds node handles and every kernel function handle to output > cudaGraphDebugDotFlagsConditionalNodeParams = 1<<15 > Adds cudaConditionalNodeParams to output > `1 << 10` is not the most verbose flag. It is just one flag that adds node handles and every kernel function handle to the output. `1 << 0` is the most verbose flag, under the name `cudaGraphDebugDotFlagsVerbose`.
Here is an example of a graph dumped with `1 << 10`: ```dot digraph dot { subgraph cluster_1 { label="graph_1" graph[style="dashed"]; "graph_1_node_0"[style="solid" shape="rectangle" label="0 MEM_ALLOC node handle: 0x000055D2889750F0 "]; "graph_1_node_1"[style="bold" shape="octagon" label="1 _Z3addPhS_S_m node handle: 0x000055D288979A20 func handle: 0x000055D288978D40 "]; "graph_1_node_2"[style="solid" shape="trapezium"label="2 MEMCPY node handle: 0x000055D28897A130 (DtoH,1024) "]; "graph_1_node_3"[style="solid" shape="rectangle" label="3 MEM_FREE node handle: 0x000055D2889890C0 "]; "graph_1_node_0" -> "graph_1_node_1"; "graph_1_node_1" -> "graph_1_node_2"; "graph_1_node_2" -> "graph_1_node_3"; } } ``` The same graph dumped with `1 << 0`: ```dot digraph dot { subgraph cluster_1 { label="graph_1" graph[style="dashed"]; "graph_1_node_0"[style="solid" shape="record" label="{ MEM_ALLOC | {{ID | node handle} | {0 (topoId: 3) | 0x000055D2889750F0}} | {{{poolProps | {allocType | handleTypes | {location | {type | id}}} | {PINNED | NONE | DEVICE | 0}}}} | {{bytesize | dptr} | {1024 | 0x0000000A02000000}} }"]; "graph_1_node_1"[style="bold" shape="record" label="{KERNEL | {ID | 1 (topoId: 2) | _Z3addPhS_S_m\<\<\<4,256,0\>\>\>} | {{node handle | func handle} | {0x000055D288979A20 | 0x000055D288978D40}} | {accessPolicyWindow | {base_ptr | num_bytes | hitRatio | hitProp | missProp} | {0x0000000000000000 | 0 | 0.000000 | N | N}} | {cooperative | 0} | {priority | 0} }"]; "graph_1_node_2"[style="solid" shape="record" label="{ MEMCPY | {{ID | node handle} | {2 (topoId: 1) | 0x000055D28897A130}} | {kind | DtoH (DEVICE to HOST PAGEABLE)} | {{srcPtr | dstPtr} | {pitch | ptr | xsize | ysize | pitch | ptr | xsize | ysize} | {0 | 0x0000000A02000000 | 0 | 0 | 0 | 0x000055D287CA6DB0 | 0 | 0}} | {{srcPos | {{x | 0} | {y | 0} | {z | 0}}} | {dstPos | {{x | 0} | {y | 0} | {z | 0}}} | {Extent | {{Width | 1024} | {Height | 1} | {Depth | 1}}}} }"]; "graph_1_node_3"[style="solid" shape="record" 
label="{ MEM_FREE | {{ID | node handle} | {3 (topoId: 0) | 0x000055D2889890C0}} | {{dptr} | {0x0000000A02000000}} }"]; "graph_1_node_0" -> "graph_1_node_1" [headlabel=0]; "graph_1_node_1" -> "graph_1_node_2" [headlabel=0]; "graph_1_node_2" -> "graph_1_node_3" [headlabel=0]; } } ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/126694 Approved by: https://github.com/eqy, https://github.com/eellison
2025-10-21 05:34:18 +08:00 · 2024-05-21 00:55:15 +00:00
parent 4644611b14
commit 82b4528788
3 changed files with 3 additions and 2 deletions
--- a/aten/src/ATen/cuda/CUDAGraph.cpp
+++ b/aten/src/ATen/cuda/CUDAGraph.cpp
@@ -268,7 +268,7 @@ void CUDAGraph::debug_dump(const std::string& debug_path) {
    TORCH_WARN("DEBUG: calling debug_dump()");
    if (has_graph_) {
      TORCH_WARN("DEBUG: calling cudaGraphDebugDotPrint() with ", debug_path);
-      C10_CUDA_CHECK_WARN(cudaGraphDebugDotPrint(graph_, debug_path.c_str(), 1<<10)); // most verbose output
+      C10_CUDA_CHECK_WARN(cudaGraphDebugDotPrint(graph_, debug_path.c_str(), cudaGraphDebugDotFlagsVerbose)); // most verbose output
      AT_CUDA_CHECK(cudaGraphDestroy(graph_));
    }
  } else {
--- a/torch/utils/hipify/constants.py
+++ b/torch/utils/hipify/constants.py
@@ -2,7 +2,7 @@

 The constants defined here are used to annotate the mapping tuples in cuda_to_hip_mappings.py.
 They are based on
-https://github.com/ROCm-Developer-Tools/HIP/blob/master/hipify-clang/src/Statistics.h
+https://github.com/ROCm/HIPIFY/blob/master/src/Statistics.h
 and fall in three categories: 1) type of mapping, 2) API of mapping, 3) unsupported
 mapping.
 """
--- a/torch/utils/hipify/cuda_to_hip_mappings.py
+++ b/torch/utils/hipify/cuda_to_hip_mappings.py
@@ -4163,6 +4163,7 @@ CUDA_IDENTIFIER_MAP = collections.OrderedDict(
        ("cudaGraphLaunch", ("hipGraphLaunch", CONV_TYPE, API_RUNTIME)),
        ("cudaGraphGetNodes", ("hipGraphGetNodes", CONV_TYPE, API_RUNTIME)),
        ("cudaGraphDebugDotPrint", ("hipGraphDebugDotPrint", CONV_TYPE, API_RUNTIME)),
+        ("cudaGraphDebugDotFlagsVerbose", ("hipGraphDebugDotFlagsVerbose", CONV_NUMERIC_LITERAL, API_RUNTIME)),
        ("cudaGraphRetainUserObject", ("hipGraphRetainUserObject", CONV_TYPE, API_RUNTIME)),
        ("cudaGraphUserObjectMove", ("hipGraphUserObjectMove", CONV_TYPE, API_RUNTIME)),
        ("cudaUserObject_t", ("hipUserObject_t", CONV_TYPE, API_RUNTIME)),