pytorch/test/inductor/test_provenance_tracing.py

# Owner(s): ["module: inductor"]
import contextlib
import io
import json
import logging
import os
import re
import shutil
import tempfile
import unittest
import zipfile
from pathlib import Path
import torch
from torch._dynamo.utils import detect_fake_mode
from torch._inductor import config
from torch._inductor.debug import (
create_kernel_information_json,
create_mapping_pre_post_grad_nodes,
create_node_mapping_kernel_to_post_grad,
reset_inductor_kernel_provenance_debug_handle,
)
from torch._inductor.fx_passes.post_grad import post_grad_passes
from torch._inductor.test_case import run_tests, TestCase
from torch._inductor.virtualized import V
from torch.testing._internal.common_utils import IS_MACOS
from torch.testing._internal.triton_utils import requires_cuda_and_triton
try:
from .test_aot_inductor_utils import AOTIRunnerUtil
except ImportError:
from test_aot_inductor_utils import AOTIRunnerUtil
trace_log = logging.getLogger("torch.__trace")
class Model(torch.nn.Module):
def __init__(self):
super().__init__()
def forward(self, a, b, c):
x = a * 3.14
y = torch.addmm(c, x, b)
z = torch.nn.functional.gelu(y)
return z
class Model2(torch.nn.Module):
# this test model is used for combo kernel provenance tracing info
def __init__(self):
super().__init__()
def forward(self, a, b, c):
a1 = torch.nn.functional.relu(a)
b1 = torch.nn.functional.sigmoid(b)
c1 = torch.nn.functional.tanh(c)
return a1, b1, c1
class Model3(torch.nn.Module):
def __init__(self, n, k):
super().__init__()
self.weight = torch.randn(n, k, device="cuda")
self.bias = torch.randn(n, device="cuda")
def forward(self, a):
return torch.nn.functional.linear(a, self.weight, self.bias)
class Model4(torch.nn.Module):
def __init__(self):
super().__init__()
self.fc1 = torch.nn.Linear(10, 16)
self.relu = torch.nn.ReLU()
self.sigmoid = torch.nn.Sigmoid()
def forward(self, x, a, b, c):
x = self.fc1(x)
x = self.relu(x)
x = self.sigmoid(x)
d = a * 3.14
y = torch.addmm(c, d, b)
z = torch.nn.functional.gelu(y)
return x, z
@config.patch("trace.enabled", True)
@config.patch("trace.provenance_tracking_level", 1)
class TestProvenanceTracingArtifact(TestCase):
"""
This test checks that the generated provenance tracing artifact mapping from
"post_grad" nodes to the corresponding "inductor triton kernel node" is as expected.
"""
def _check_provenance_tracing_kernel_to_post_grad(self, filepath, expected_data):
self.assertTrue(filepath.is_dir())
filename = Path(filepath) / "inductor_provenance_tracking_node_mappings.json"
with open(filename) as f:
actual_data = json.load(f)
actual_data = actual_data["cppCodeToPost"]
# check that the generated provenance tracing artifact is as expected
self.assertEqual(sorted(actual_data.items()), sorted(expected_data.items()))
def _check_provenance_tracking_node_mappings(self, filepath, expected_mapping):
self.assertTrue(filepath.is_dir())
filename = Path(filepath) / "inductor_provenance_tracking_node_mappings.json"
with open(filename) as f:
actual_data = json.load(f)
# check that the generated provenance tracing node mapping is as expected
self.assertEqual(sorted(actual_data.items()), sorted(expected_mapping))
def _test_triton_kernel_to_post_grad_tracing(self, device):
a = torch.randn(10, 20, device=device)
b = torch.randn(20, 30, device=device)
c = torch.randn(10, 30, device=device)
example_inputs = (a, b, c)
model = Model().to(device)
filepath = None
for backend in ["aot_inductor", "inductor"]:
reset_inductor_kernel_provenance_debug_handle()
try:
with config.patch(
{
"trace.debug_dir": tempfile.mkdtemp(),
"force_disable_caches": True,
}
):
with self.assertLogs(
logging.getLogger("torch._inductor.debug"),
level=logging.WARNING,
) as cm:
if backend == "aot_inductor":
AOTIRunnerUtil.run(model, example_inputs)
else:
ep = torch.export._trace._export(model, example_inputs)
compiled = torch.compile(ep.module(), backend=backend)
compiled(*example_inputs)
self.assertEqual(len(cm.output), 1)
m = re.match(r"WARNING.* debug trace: (.*)", cm.output[0])
self.assertTrue(m)
filepath = Path(m.group(1))
if device == "cuda":
expected_mapping = [
(
"cppCodeToPost",
{
"triton_poi_fused_mul_0:1": ["mul"],
"triton_poi_fused_addmm_gelu_1:3": [
"mul_3",
"mul_1",
"add_tensor",
"add",
"erf",
"mul_2",
],
},
),
(
"postToCppCode",
{
"mul": ["triton_poi_fused_mul_0:1"],
"mul_3": ["triton_poi_fused_addmm_gelu_1:3"],
"mul_1": ["triton_poi_fused_addmm_gelu_1:3"],
"add_tensor": ["triton_poi_fused_addmm_gelu_1:3"],
"add": ["triton_poi_fused_addmm_gelu_1:3"],
"erf": ["triton_poi_fused_addmm_gelu_1:3"],
"mul_2": ["triton_poi_fused_addmm_gelu_1:3"],
},
),
(
"postToPre",
{
"mul": ["mul"],
"mm_default": ["addmm"],
"add_tensor": ["addmm"],
"mul_1": ["gelu"],
"mul_2": ["gelu"],
"erf": ["gelu"],
"add": ["gelu"],
"mul_3": ["gelu"],
},
),
(
"preToPost",
{
"mul": ["mul"],
"addmm": ["mm_default", "add_tensor"],
"gelu": ["mul_1", "mul_2", "erf", "add", "mul_3"],
},
),
]
if backend == "aot_inductor":
expected_mapping[0][1]["aoti_torch_cuda_mm_out:2"] = [
"mm_default"
]
expected_mapping[1][1]["mm_default"] = [
"aoti_torch_cuda_mm_out:2"
]
else:
expected_mapping[0][1]["extern_kernels.mm:2"] = [
"mm_default"
]
expected_mapping[1][1]["mm_default"] = [
"extern_kernels.mm:2"
]
self._check_provenance_tracking_node_mappings(
filepath, expected_mapping
)
else:
assert device == "cpu"
# check that the inductor kernel to post-grad node mapping is as expected for cpu
if backend == "aot_inductor":
expected_data = {
"cpp_fused_mul_0:1": ["mul"],
"aoti_torch_cpu_addmm_out:2": ["addmm"],
"cpp_fused_gelu_1:3": [
"mul_3",
"mul_1",
"add",
"erf",
"mul_2",
],
}
else:
# backend == "inductor"
expected_data = {
"cpp_fused_mul_0:1": ["mul"],
"cpp_fused_gelu_1:3": [
"mul_3",
"mul_1",
"add",
"erf",
"mul_2",
],
"extern_kernels.addmm:2": ["addmm"],
}
self._check_provenance_tracing_kernel_to_post_grad(
filepath, expected_data
)
finally:
if filepath:
shutil.rmtree(filepath)
@requires_cuda_and_triton
def test_triton_kernel_to_post_grad_tracing_cuda(self):
self._test_triton_kernel_to_post_grad_tracing(device="cuda")
def test_triton_kernel_to_post_grad_tracing_cpu(self):
self._test_triton_kernel_to_post_grad_tracing(device="cpu")
@requires_cuda_and_triton
def test_triton_kernel_to_post_grad_tracing_extern_kernel(self):
M = 8
N = 6
K = 16
model = Model3(N, K)
batch = 2
a = torch.randn(batch, M, K, device="cuda")
example_inputs = (a,)
filepath = None
for backend in ["aot_inductor", "inductor"]:
reset_inductor_kernel_provenance_debug_handle()
try:
with config.patch(
{
"trace.debug_dir": tempfile.mkdtemp(),
"force_disable_caches": True,
}
):
with self.assertLogs(
logging.getLogger("torch._inductor.debug"),
level=logging.WARNING,
) as cm:
if backend == "aot_inductor":
AOTIRunnerUtil.run(model, example_inputs)
else:
ep = torch.export._trace._export(model, example_inputs)
compiled = torch.compile(ep.module(), backend=backend)
compiled(*example_inputs)
self.assertEqual(len(cm.output), 1)
m = re.match(r"WARNING.* debug trace: (.*)", cm.output[0])
self.assertTrue(m)
filepath = Path(m.group(1))
if backend == "inductor":
expected_data = {
"extern_kernels.addmm:1": ["addmm"],
}
else:
# backend == "aot_inductor"
expected_data = {
"aoti_torch_cuda_addmm_out:2": ["addmm"],
"triton_poi_fused_0:1": ["_tensor_constant1"],
}
self._check_provenance_tracing_kernel_to_post_grad(
filepath, expected_data
)
finally:
if filepath:
shutil.rmtree(filepath)
@requires_cuda_and_triton
def _test_pt_tracing_combo_kernel(self, backend):
"""This test checks that generated provenance tracing artifact from triton combo kernel to post grad nodes"""
a = torch.randn(10, 10, device="cuda")
b = torch.randn(20, 20, device="cuda")
c = torch.randn(10, 10, device="cuda")
example_inputs = (a, b, c)
model = Model2()
reset_inductor_kernel_provenance_debug_handle()
with config.patch(
{
"trace.debug_dir": tempfile.mkdtemp(),
"force_disable_caches": True,
"combo_kernels": True,
"benchmark_combo_kernel": False,
}
):
with self.assertLogs(
logging.getLogger("torch._inductor.debug"),
level=logging.WARNING,
) as cm:
if backend == "aot_inductor":
AOTIRunnerUtil.run(model, example_inputs)
else:
ep = torch.export._trace._export(model, example_inputs)
compiled = torch.compile(ep.module(), backend=backend)
compiled(*example_inputs)
self.assertEqual(len(cm.output), 1)
m = re.match(r"WARNING.* debug trace: (.*)", cm.output[0])
self.assertTrue(m)
filepath = Path(m.group(1)).resolve()
expected_data = {"triton_poi_fused_0:1": ["relu", "sigmoid", "tanh"]}
self._check_provenance_tracing_kernel_to_post_grad(filepath, expected_data)
@requires_cuda_and_triton
def test_triton_kernel_to_post_grad_tracing_combo_kernel(self):
self._test_pt_tracing_combo_kernel(backend="inductor")
self._test_pt_tracing_combo_kernel(backend="aot_inductor")
class TestProvenanceTracingNodeMapping(TestCase):
def test_create_node_mapping(self):
pre_grad_graph_id = 140156815043952
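# Minimal hand-written provenance fixture: each post-grad node name maps to a
# list of "from_node" records that chain (via nested "from_node" entries) back
# to nodes in the pre-grad graph, identified here by pre_grad_graph_id.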
post_to_pre_grad_nodes_json = {
"add_tensor": [
{
"from_node": [
{
"from_node": [
{
"from_node": [],
"graph_id": 140156815043952,
"name": "linear",
}
],
"graph_id": 140152856025632,
"name": "addmm",
}
],
"graph_id": 140151961816272,
"name": "add",
},
],
"mm_default": [
{
"from_node": [],
"graph_id": -1,
"name": "",
},
{
"from_node": [
{
"from_node": [
{
"from_node": [],
"graph_id": 140156815043952,
"name": "linear",
}
],
"graph_id": 140152856025632,
"name": "addmm",
}
],
"graph_id": 140151961816272,
"name": "mm",
},
],
"permute": [
{
"from_node": [],
"graph_id": 140156815043952,
"name": "linear",
}
],
"relu": [
{
"from_node": [],
"graph_id": 140156815043952,
"name": "relu",
}
],
}
triton_kernel_to_post_grad_json = {
"triton_poi_fused_addmm_relu_sigmoid_0": ["relu", "add_tensor"]
}
result = create_mapping_pre_post_grad_nodes(
pre_grad_graph_id,
post_to_pre_grad_nodes_json,
)
result = {
**result,
**create_node_mapping_kernel_to_post_grad(
triton_kernel_to_post_grad_json,
),
}
self.assertEqual(
result,
{
"cppCodeToPost": {
"triton_poi_fused_addmm_relu_sigmoid_0": [
"relu",
"add_tensor",
]
},
"postToCppCode": {
"add_tensor": ["triton_poi_fused_addmm_relu_sigmoid_0"],
"relu": ["triton_poi_fused_addmm_relu_sigmoid_0"],
},
"postToPre": {
"add_tensor": ["linear"],
"mm_default": ["linear"],
"permute": ["linear"],
"relu": ["relu"],
},
"preToPost": {
"linear": ["add_tensor", "mm_default", "permute"],
"relu": ["relu"],
},
},
)
class TestProvenanceTracingNodeMeta(TestCase):
def get_node_with_target(self, gm, target):
"""
Return the first node in gm with the given target
"""
return next(node for node in gm.graph.nodes if node.target == target)
@requires_cuda_and_triton # test only works for cuda pattern matcher
def test_pattern_matcher_transfer_meta(self):
"""
Test that the stack trace is transferred when a node is decomposed in post_grad_passes
"""
class Model(torch.nn.Module):
def __init__(self):
super().__init__()
self.fc1 = torch.nn.Linear(10, 16)
self.relu = torch.nn.ReLU()
self.sigmoid = torch.nn.Sigmoid()
def forward(self, x):
x = self.fc1(x)
x = self.relu(x)
x = self.sigmoid(x)
return x * 3
x = torch.randn(8, 10).to("cuda")
example_inputs = (x,)
model = Model().to("cuda")
# mimic the before_post_grad graph
ep = torch.export.export(model, example_inputs).run_decompositions()
gm = ep.module()
# Set fake mode for V
fake_inputs = [
node.meta.get("val") for node in gm.graph.nodes if node.op == "placeholder"
]
fake_mode = detect_fake_mode(fake_inputs)
V.set_fake_mode(fake_mode)
addmm_node = self.get_node_with_target(gm, torch.ops.aten.addmm.default)
stack_trace = addmm_node.meta["stack_trace"]
post_grad_passes(gm, True) # for this test is_inference doesn't matter
mm_node = self.get_node_with_target(gm, torch.ops.aten.mm.default)
add_node = self.get_node_with_target(gm, torch.ops.aten.add.Tensor)
self.assertEqual(add_node.meta["stack_trace"], stack_trace)
self.assertEqual(mm_node.meta["stack_trace"], stack_trace)
class ProvenanceArtifactFilter(logging.Filter):
def filter(self, record):
if "artifact" in record.metadata:
return (
record.metadata["artifact"]["name"]
== "inductor_provenance_tracking_kernel_stack_traces"
)
return False
class StructuredTracePayloadFormatter(logging.Formatter):
def format(self, record):
return record.payload.strip()
class TestProvenanceTracingStackTraces(TestCase):
@contextlib.contextmanager
def _setup_provenance_capture(self):
"""Helper to turn on and capture the 'inductor_tlparse_runtime' structured trace."""
payload_buffer = io.StringIO()
payload_handler = logging.StreamHandler(payload_buffer)
payload_handler.setLevel(logging.DEBUG)
payload_handler.setFormatter(StructuredTracePayloadFormatter())
payload_handler.addFilter(ProvenanceArtifactFilter())
trace_log.addHandler(payload_handler)
try:
yield payload_buffer
finally:
trace_log.removeHandler(payload_handler)
def extract_code_line(self, s, i=-2):
# Extract the i-th line (default: the second-to-last line, which holds the source code line in these stack traces)
return s.split("\n")[i].strip()
@torch._inductor.config.patch({"trace.provenance_tracking_level": 2})
@requires_cuda_and_triton
def test_tlparse_kernel_stack_traces(self):
device = "cuda"
model = Model4().to(device)
x = torch.randn(8, 10).to(device)
a = torch.randn(10, 20).to(device)
b = torch.randn(20, 30).to(device)
c = torch.randn(10, 30).to(device)
example_inputs = (x, a, b, c)
expected = {
"triton_poi_fused_addmm_relu_sigmoid_threshold_backward_0:2": [
"x = self.sigmoid(x)",
"x = self.fc1(x)",
"x = self.relu(x)",
],
"triton_poi_fused_mul_1:3": [
"d = a * 3.14",
],
"triton_poi_fused_addmm_gelu_2:5": [
"z = torch.nn.functional.gelu(y)",
"y = torch.addmm(c, d, b)",
],
"extern_kernels.mm:1": [
"x = self.fc1(x)",
],
"extern_kernels.mm:4": [
"y = torch.addmm(c, d, b)",
],
}
compiled = torch.compile(model)
# should produce the same provenance if there's a cache hit
for _ in range(2):
# reset cache
torch._dynamo.reset()
reset_inductor_kernel_provenance_debug_handle()
with self._setup_provenance_capture() as payload_buffer:
compiled = torch.compile(model)
compiled(*example_inputs)
payload_content = payload_buffer.getvalue().strip()
data = json.loads(payload_content)
self.assertEqual(set(data.keys()), set(expected.keys()))
for key, expected_lines in expected.items():
actual_lines = [self.extract_code_line(s) for s in data[key]]
self.assertEqual(
sorted(actual_lines),
sorted(expected_lines),
f"Mismatch for key: {key}",
)
def _check_kernel_information_json(self, kernel_info, expected_kernels):
"""Validate kernel information JSON structure and content."""
self.assertIsInstance(kernel_info, dict)
for expected in expected_kernels:
self.assertIn(
expected,
kernel_info,
f"Expected kernel {expected} not found in {list(kernel_info)}",
)
for data in kernel_info.values():
self.assertIsInstance(data, dict)
for field in ["stack_traces", "post_grad_nodes", "pre_grad_nodes"]:
self.assertIn(field, data)
self.assertIsInstance(data[field], list)
for item in data[field]:
self.assertIsInstance(item, str)
@requires_cuda_and_triton
@torch._inductor.config.patch("trace.provenance_tracking_level", 1)
def test_kernel_information_generation(self):
"""Test basic kernel information generation in AOTI packages."""
model = Model4().to("cuda")
x = torch.randn(8, 10, device="cuda")
a = torch.randn(10, 20, device="cuda")
b = torch.randn(20, 30, device="cuda")
c = torch.randn(10, 30, device="cuda")
inputs = (x, a, b, c)
with tempfile.TemporaryDirectory() as temp_dir:
ep = torch.export.export(model, inputs, strict=False)
pt2_file = os.path.join(temp_dir, "model.pt2")
reset_inductor_kernel_provenance_debug_handle()
torch._inductor.aoti_compile_and_package(ep, package_path=pt2_file)
# Extract and check kernel_information.json exists in the package
with zipfile.ZipFile(pt2_file, "r") as zip_ref:
zip_ref.extractall(temp_dir)
json_path = os.path.join(
temp_dir,
"model",
"data",
"aotinductor",
"model",
"kernel_information.json",
)
self.assertTrue(
os.path.exists(json_path),
f"kernel_information.json not found in extracted package at {json_path}",
)
with open(json_path) as f:
kernel_info = json.load(f)
expected = {
"triton_poi_fused_addmm_relu_sigmoid_0:2": {
"stack_traces": [
"x = self.sigmoid(x)",
"x = self.fc1(x)",
"x = self.relu(x)",
],
"post_grad_nodes": ["sigmoid", "relu", "add_tensor_1"],
"pre_grad_nodes": ["sigmoid", "relu", "linear"],
},
"triton_poi_fused_mul_1:3": {
"stack_traces": [
"d = a * 3.14",
],
"post_grad_nodes": ["mul"],
"pre_grad_nodes": ["mul"],
},
"triton_poi_fused_addmm_gelu_2:5": {
"stack_traces": [
"z = torch.nn.functional.gelu(y)",
"y = torch.addmm(c, d, b)",
],
"post_grad_nodes": [
"mul_3",
"mul_1",
"add_tensor",
"add",
"erf",
"mul_2",
],
"pre_grad_nodes": ["gelu", "addmm"],
},
"aoti_torch_cuda_mm_out:1": {
"stack_traces": [
"x = self.fc1(x)",
],
"post_grad_nodes": ["mm_default_1"],
"pre_grad_nodes": ["linear"],
},
"aoti_torch_cuda_mm_out:4": {
"stack_traces": [
"y = torch.addmm(c, d, b)",
],
"post_grad_nodes": ["mm_default"],
"pre_grad_nodes": ["addmm"],
},
}
self._check_kernel_information_json(kernel_info, expected.keys())
self.assertEqual(set(kernel_info.keys()), set(expected.keys()))
for key, data in expected.items():
all_lines = ",".join(kernel_info[key]["stack_traces"])
for s in data["stack_traces"]:
self.assertTrue(s in all_lines)
self.assertEqual(
sorted(kernel_info[key]["pre_grad_nodes"]),
sorted(data["pre_grad_nodes"]),
f"Mismatch for key: {key}",
)
self.assertEqual(
sorted(kernel_info[key]["post_grad_nodes"]),
sorted(data["post_grad_nodes"]),
f"Mismatch for key: {key}",
)
@torch._inductor.config.patch("trace.provenance_tracking_level", 0)
def test_no_kernel_information_without_provenance_tracking(self):
"""Test that kernel_information.json is not generated without provenance tracking."""
class SimpleModel(torch.nn.Module):
def forward(self, x):
return x * 2.0
model = SimpleModel()
x = torch.randn(4, 8)
# Compile with AOTI but without provenance tracking
with tempfile.TemporaryDirectory() as temp_dir:
ep = torch.export.export(model, (x,), strict=False)
pt2_file = os.path.join(temp_dir, "model.pt2")
torch._inductor.aoti_compile_and_package(ep, package_path=pt2_file)
# Extract and check kernel_information.json was NOT created in the package
extract_dir = os.path.join(temp_dir, "extracted")
os.makedirs(extract_dir, exist_ok=True)
with zipfile.ZipFile(pt2_file, "r") as zip_ref:
zip_ref.extractall(extract_dir)
expected_json_path = os.path.join(extract_dir, "kernel_information.json")
self.assertFalse(
os.path.exists(expected_json_path),
"kernel_information.json should not exist in package when provenance tracking is disabled",
)
def test_create_kernel_information_json_function(self):
"""Test the create_kernel_information_json function directly."""
# Test with empty state
result = create_kernel_information_json()
self.assertIsInstance(result, dict)
self.assertEqual(len(result), 0) # Should be empty with no provenance data
@unittest.skipIf(
IS_MACOS,
"MacOS generates different debug handles",
)
@torch._inductor.config.patch("trace.provenance_tracking_level", 1)
def test_cpu_extern_kernel(self):
class Foo(torch.nn.Module):
def __init__(self) -> None:
super().__init__()
self.conv = torch.nn.Conv2d(16, 33, 3)
def forward(self, x):
return self.conv(x)
model = Foo()
x = torch.randn(20, 16, 50, 100)
with self._setup_provenance_capture() as payload_buffer:
reset_inductor_kernel_provenance_debug_handle()
ep = torch.export.export(model, (x,))
torch._inductor.aoti_compile_and_package(ep)
payload_content = payload_buffer.getvalue().strip()
data = json.loads(payload_content)
keys = [k.split(":")[0] for k in data]
self.assertTrue("aoti_torch_cpu_convolution" in keys)
if __name__ == "__main__":
run_tests()