Compare commits

...

1 Commits

Author SHA1 Message Date
8cd74b302f compiled 2025-09-04 19:30:27 -07:00
22 changed files with 2781 additions and 0 deletions

View File

@ -0,0 +1 @@
1171719005974771805808300960005001569062

1
model2/.data/version Normal file
View File

@ -0,0 +1 @@
6

33
model2/CMakeLists.txt Normal file
View File

@ -0,0 +1,33 @@
cmake_minimum_required(VERSION 3.27 FATAL_ERROR)

# Host-side example executable that is built from main.cpp and linked against
# the TorchStandalone package plus the CUDA runtime and driver libraries.
#
# Configure and build (point TorchStandalone_DIR at the directory that holds
# TorchStandaloneConfig.cmake of your TorchStandalone install):
#   cmake -DTorchStandalone_DIR="<torchstandalone_install>/lib/cmake/TorchStandalone" ..
#   cmake --build . --config Release
project(aoti_example LANGUAGES CXX CUDA)

# One consistent language level. The compile flags previously asked for
# /std:c++20 while CMAKE_CXX_STANDARD requested 17; C++20 is what the explicit
# flag selected, so request it through CMake instead of a raw flag.
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 17)

find_package(TorchStandalone REQUIRED)
find_package(CUDAToolkit REQUIRED)

add_executable(aoti_example main.cpp)

# Headers are included both relative to each TorchStandalone include root and
# relative to its "standalone" subdirectory. The suffix is applied per list
# entry so that a multi-entry TorchStandalone_INCLUDE_DIRS is handled too.
set(torch_standalone_subdirs ${TorchStandalone_INCLUDE_DIRS})
list(TRANSFORM torch_standalone_subdirs APPEND "/standalone")
target_include_directories(aoti_example PRIVATE
  ${TorchStandalone_INCLUDE_DIRS}
  ${torch_standalone_subdirs}
)

target_compile_definitions(aoti_example PRIVATE NOMINMAX USE_CUDA)

# Always use the release DLL C runtime (the former hard-coded /MD), expressed
# as a target property so CMake does not also emit a conflicting /MDd.
set_property(TARGET aoti_example PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreadedDLL")

# The flags below are cl.exe syntax, so only pass them to MSVC-style drivers.
# NOTE: "/DLL" was dropped on purpose - it is a link.exe option, and on the
# compile line cl.exe reads it as "/D LL", i.e. it defines a macro named LL.
if(MSVC)
  target_compile_options(aoti_example PRIVATE
    # /O2 cannot be combined with /RTC1 from the default Debug flags.
    "$<$<NOT:$<CONFIG:Debug>>:/O2>"
    /wd4819 /wd4251 /wd4244 /wd4267 /wd4275
    /wd4018 /wd4190 /wd4624 /wd4067 /wd4068
    /EHsc
    /Zc:__cplusplus
    /permissive-
    /openmp
    /openmp:experimental
  )
endif()

target_link_libraries(aoti_example PRIVATE
  ${TorchStandalone_LIBRARIES}  # libraries reported by find_package(TorchStandalone)
  CUDA::cudart                  # CUDA runtime library
  CUDA::cuda_driver             # CUDA driver library (nvcuda on Windows)
)

1
model2/archive_format Normal file
View File

@ -0,0 +1 @@
pt2

1
model2/archive_version Normal file
View File

@ -0,0 +1 @@
0

1
model2/byteorder Normal file
View File

@ -0,0 +1 @@
little

View File

@ -0,0 +1,69 @@
cmake_minimum_required(VERSION 3.27 FATAL_ERROR)

# Builds the compiled model as a shared library named "model.pyd" from the
# generated wrapper and weights sources, and assembles the accompanying Triton
# PTX kernels into cubin files in the build directory.
project(model LANGUAGES CXX CUDA)

# One consistent language level. The compile flags previously asked for
# /std:c++20 while CMAKE_CXX_STANDARD requested 17; C++20 is what the explicit
# flag selected, so request it through CMake instead of a raw flag.
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 17)

# GPU architecture number the PTX kernels are assembled for (89 -> sm_89).
set(MODEL_CUBIN_ARCH "89" CACHE STRING
  "CUDA architecture number used when assembling the PTX kernels to cubin")

find_package(TorchStandalone REQUIRED)
find_package(CUDAToolkit REQUIRED)

add_library(model SHARED
  "${CMAKE_CURRENT_SOURCE_DIR}/model.wrapper.cpp"
  "${CMAKE_CURRENT_SOURCE_DIR}/model_consts.weights.cpp"
)

# Headers are included both relative to each TorchStandalone include root and
# relative to its "standalone" subdirectory. The suffix is applied per list
# entry so that a multi-entry TorchStandalone_INCLUDE_DIRS is handled too.
set(torch_standalone_subdirs ${TorchStandalone_INCLUDE_DIRS})
list(TRANSFORM torch_standalone_subdirs APPEND "/standalone")
target_include_directories(model PRIVATE
  ${TorchStandalone_INCLUDE_DIRS}
  ${torch_standalone_subdirs}
)

# Macro definitions (CPU_CAPABILITY_AVX512 is not defined by this build).
target_compile_definitions(model PRIVATE
  NOMINMAX
  TORCH_INDUCTOR_CPP_WRAPPER
  STANDALONE_TORCH_HEADER
  C10_USING_CUSTOM_GENERATED_MACROS
  USE_CUDA
)

# Emit "model.pyd" instead of "model.dll", and always use the release DLL C
# runtime (the former hard-coded /MD) without a conflicting /MDd from CMake.
set_target_properties(model PROPERTIES
  SUFFIX ".pyd"
  MSVC_RUNTIME_LIBRARY "MultiThreadedDLL"
)

# The flags below are cl.exe / link.exe syntax, so only pass them to
# MSVC-style drivers.
# NOTE: "/DLL" was dropped from the compile flags on purpose - on the compile
# line cl.exe reads it as "/D LL" (defines a macro named LL); CMake already
# links SHARED libraries as DLLs.
if(MSVC)
  target_compile_options(model PRIVATE
    # /O2 cannot be combined with /RTC1 from the default Debug flags.
    "$<$<NOT:$<CONFIG:Debug>>:/O2>"
    /wd4819 /wd4251 /wd4244 /wd4267 /wd4275
    /wd4018 /wd4190 /wd4624 /wd4067 /wd4068
    /EHsc
    /Zc:__cplusplus
    /permissive-
    /openmp
    /openmp:experimental
  )
  # Export table. Passed as one link option so that a source directory
  # containing spaces does not split the /DEF: argument.
  target_link_options(model PRIVATE
    "/DEF:${CMAKE_CURRENT_SOURCE_DIR}/model_exports.def"
  )
endif()

# Names of the custom targets that produce the cubin files.
set(KERNEL_TARGETS "")

# embed_gpu_kernel(<KERNEL_NAME> <PTX_FILE>)
#
# Adds a build rule that runs nvcc on PTX_FILE to produce
# ${CMAKE_CURRENT_BINARY_DIR}/<KERNEL_NAME>.cubin, wraps it in the custom target
# build_kernel_object_<KERNEL_NAME>, and appends that target name to
# KERNEL_TARGETS in the caller's scope.
#
# Despite the name, the cubin is only built next to the library; nothing in
# this file links it into the "model" binary.
#
# NVCC_GENCODE_FLAGS may be supplied by the user to pass extra nvcc arguments;
# it expands to nothing when unset.
function(embed_gpu_kernel KERNEL_NAME PTX_FILE)
  set(cubin_path "${CMAKE_CURRENT_BINARY_DIR}/${KERNEL_NAME}.cubin")

  # --- PTX to cubin command & target ---
  add_custom_command(
    OUTPUT "${cubin_path}"
    COMMAND "${CUDAToolkit_NVCC_EXECUTABLE}" --cubin "${PTX_FILE}"
            -o "${cubin_path}" ${NVCC_GENCODE_FLAGS}
            -gencode "arch=compute_${MODEL_CUBIN_ARCH},code=sm_${MODEL_CUBIN_ARCH}"
    DEPENDS "${PTX_FILE}"
    COMMENT "Assembling ${KERNEL_NAME}.cubin"
    VERBATIM
  )
  add_custom_target(build_kernel_object_${KERNEL_NAME} DEPENDS "${cubin_path}")

  list(APPEND KERNEL_TARGETS build_kernel_object_${KERNEL_NAME})
  set(KERNEL_TARGETS "${KERNEL_TARGETS}" PARENT_SCOPE)
endfunction()

embed_gpu_kernel(model_triton_tem_fused_addmm_relu_t_0
  "${CMAKE_CURRENT_SOURCE_DIR}/model_triton_tem_fused_addmm_relu_t_0.ptx")
embed_gpu_kernel(model_triton_tem_fused_addmm_relu_sigmoid_t_1
  "${CMAKE_CURRENT_SOURCE_DIR}/model_triton_tem_fused_addmm_relu_sigmoid_t_1.ptx")

# Building "model" also (re)builds every cubin.
add_dependencies(model ${KERNEL_TARGETS})

target_link_libraries(model PRIVATE
  ${TorchStandalone_LIBRARIES}  # libraries reported by find_package(TorchStandalone)
  CUDA::cuda_driver             # CUDA driver library (nvcuda on Windows)
  CUDA::cudart                  # CUDA runtime library
)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1 @@
{"compiler": "/home/shangdiy/miniconda3/envs/pytorch-3.10/bin/x86_64-conda-linux-gnu-c++", "definitions": ["NOMINMAX", "TORCH_INDUCTOR_CPP_WRAPPER", "STANDALONE_TORCH_HEADER", " C10_USING_CUSTOM_GENERATED_MACROS", "CPU_CAPABILITY_AVX512", " USE_CUDA"], "include_dirs": ["/home/shangdiy/miniconda3/envs/pytorch-3.10/include/python3.10", "/home/shangdiy/miniconda3/envs/pytorch-3.10/Include", "/home/shangdiy/pytorch/torch/include", "/home/shangdiy/pytorch/torch/include/torch/csrc/api/include", "/usr/local/cuda-12/include"], "cflags": ["O2", "DLL", "MD", "std:c++20", "wd4819", "wd4251", "wd4244", "wd4267", "wd4275", "wd4018", "wd4190", "wd4624", "wd4067", "wd4068", "EHsc", "Zc:__cplusplus", "permissive-", "openmp", "openmp:experimental"], "ldflags": [], "libraries_dirs": [], "libraries": [], "passthrough_args": ["", "-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -mavx512bf16"], "aot_mode": true, "use_relative_path": false, "compile_only": true}

View File

@ -0,0 +1 @@
{"compiler": "/home/shangdiy/miniconda3/envs/pytorch-3.10/bin/x86_64-conda-linux-gnu-c++", "definitions": ["NOMINMAX", "TORCH_INDUCTOR_CPP_WRAPPER", "STANDALONE_TORCH_HEADER", " C10_USING_CUSTOM_GENERATED_MACROS", "CPU_CAPABILITY_AVX512", " USE_CUDA"], "include_dirs": ["/home/shangdiy/miniconda3/envs/pytorch-3.10/include/python3.10", "/home/shangdiy/miniconda3/envs/pytorch-3.10/Include", "/home/shangdiy/pytorch/torch/include", "/home/shangdiy/pytorch/torch/include/torch/csrc/api/include", "/usr/local/cuda-12/include"], "cflags": ["O2", "DLL", "MD", "std:c++20", "wd4819", "wd4251", "wd4244", "wd4267", "wd4275", "wd4018", "wd4190", "wd4624", "wd4067", "wd4068", "EHsc", "Zc:__cplusplus", "permissive-", "openmp", "openmp:experimental"], "ldflags": [], "libraries_dirs": ["/home/shangdiy/miniconda3/envs/pytorch-3.10/libs", "/home/shangdiy/pytorch/torch/lib", "/usr/local/cuda-12.8/targets/x86_64-linux/lib", "/usr/local/cuda-12.8/targets/x86_64-linux/lib/stubs"], "libraries": ["torch", "torch_cpu", "sleef", "c10", "c10_cuda", "cuda", "torch_cuda"], "passthrough_args": ["", "-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -mavx512bf16"], "aot_mode": true, "use_relative_path": false, "compile_only": false}

View File

@ -0,0 +1 @@
{"AOTI_DEVICE_KEY": "cuda"}

View File

@ -0,0 +1,58 @@
// ATTRIBUTE_NO_SANITIZE_ADDRESS expands to the no_sanitize("address")
// attribute when compiling with clang or GCC, and to nothing on every other
// compiler, so declarations tagged with it stay portable.
#if defined(__clang__) || defined (__GNUC__)
#define ATTRIBUTE_NO_SANITIZE_ADDRESS __attribute__((no_sanitize("address")))
#else
#define ATTRIBUTE_NO_SANITIZE_ADDRESS
#endif
ATTRIBUTE_NO_SANITIZE_ADDRESS
alignas(64) extern unsigned char _binary_constants_bin_start[768] = {
69, 165, 123, 190, 252, 181, 16, 62, 242, 69, 193, 59, 44, 80, 100, 62,
237, 163, 142, 188, 138, 139, 109, 190, 61, 248, 20, 190, 77, 84, 143, 60,
111, 90, 163, 60, 96, 140, 230, 189, 101, 69, 38, 190, 132, 190, 12, 188,
28, 113, 159, 190, 252, 128, 154, 62, 247, 234, 217, 61, 206, 79, 58, 61,
224, 209, 135, 61, 211, 238, 147, 62, 231, 229, 27, 190, 169, 208, 57, 62,
100, 20, 153, 190, 161, 160, 85, 190, 207, 10, 156, 62, 234, 107, 155, 190,
188, 85, 116, 62, 27, 211, 114, 60, 94, 21, 158, 189, 147, 210, 34, 62,
203, 109, 80, 62, 28, 242, 141, 189, 205, 27, 152, 190, 38, 104, 6, 189,
211, 16, 249, 189, 72, 103, 143, 190, 163, 44, 140, 189, 178, 223, 127, 189,
5, 112, 160, 189, 177, 55, 132, 190, 218, 22, 159, 62, 115, 30, 35, 190,
26, 247, 9, 62, 251, 219, 27, 62, 165, 86, 135, 62, 99, 168, 66, 190,
238, 64, 93, 62, 65, 147, 86, 62, 167, 108, 97, 62, 183, 219, 50, 190,
138, 83, 106, 62, 90, 122, 208, 189, 149, 140, 161, 188, 44, 145, 194, 189,
5, 142, 186, 61, 202, 230, 153, 190, 133, 72, 136, 62, 251, 1, 3, 62,
225, 146, 54, 190, 91, 176, 219, 189, 118, 244, 10, 189, 232, 107, 142, 62,
185, 6, 151, 62, 241, 137, 223, 61, 124, 100, 114, 190, 15, 240, 168, 189,
149, 252, 58, 190, 238, 93, 243, 188, 144, 218, 115, 61, 159, 91, 6, 62,
57, 139, 74, 190, 84, 200, 49, 61, 193, 78, 32, 190, 84, 121, 26, 190,
219, 39, 115, 190, 171, 127, 94, 62, 248, 253, 177, 61, 63, 18, 127, 62,
146, 18, 137, 189, 203, 90, 161, 190, 139, 194, 239, 58, 126, 54, 40, 190,
47, 247, 30, 190, 106, 93, 191, 61, 22, 48, 120, 61, 56, 123, 7, 62,
150, 229, 210, 189, 118, 231, 158, 188, 7, 98, 215, 60, 72, 251, 89, 190,
190, 160, 137, 190, 173, 194, 158, 62, 225, 26, 118, 190, 174, 199, 4, 189,
205, 148, 16, 62, 20, 225, 155, 61, 90, 124, 133, 190, 88, 196, 34, 61,
26, 104, 51, 190, 149, 106, 40, 62, 25, 136, 177, 60, 169, 111, 138, 190,
214, 181, 226, 189, 109, 17, 77, 62, 224, 166, 55, 62, 250, 128, 160, 61,
104, 223, 250, 61, 34, 182, 210, 187, 60, 87, 149, 190, 189, 55, 98, 188,
58, 86, 85, 190, 170, 43, 132, 190, 81, 220, 87, 190, 47, 226, 138, 62,
189, 162, 36, 190, 30, 232, 34, 60, 138, 147, 167, 61, 151, 129, 157, 61,
206, 33, 152, 62, 109, 227, 113, 190, 147, 255, 11, 190, 175, 56, 46, 189,
46, 238, 1, 189, 123, 159, 85, 188, 14, 126, 148, 189, 226, 226, 169, 189,
255, 106, 134, 61, 38, 140, 187, 60, 119, 73, 49, 62, 32, 236, 43, 62,
78, 33, 232, 189, 72, 188, 139, 62, 94, 206, 20, 62, 25, 230, 75, 189,
171, 239, 26, 190, 136, 218, 121, 62, 96, 115, 85, 62, 126, 92, 55, 190,
112, 108, 134, 61, 64, 212, 69, 190, 253, 118, 214, 188, 210, 116, 66, 62,
204, 131, 123, 190, 13, 151, 38, 190, 56, 17, 252, 189, 153, 151, 138, 62,
21, 30, 216, 61, 146, 103, 32, 62, 140, 60, 78, 62, 183, 149, 174, 61,
95, 153, 164, 61, 144, 167, 187, 189, 112, 53, 153, 190, 127, 195, 105, 61,
169, 167, 251, 189, 42, 204, 123, 62, 116, 193, 86, 62, 98, 147, 30, 61,
176, 138, 137, 62, 245, 244, 17, 62, 201, 90, 140, 62, 177, 110, 77, 62,
188, 31, 129, 190, 66, 203, 85, 62, 182, 209, 112, 188, 216, 91, 222, 59,
18, 208, 131, 189, 151, 142, 150, 190, 36, 252, 31, 62, 241, 2, 180, 61,
83, 240, 159, 62, 37, 152, 115, 190, 13, 52, 107, 62, 169, 178, 148, 62,
171, 54, 38, 62, 33, 4, 199, 189, 201, 247, 216, 189, 225, 89, 146, 190,
192, 118, 79, 189, 92, 171, 12, 62, 136, 235, 3, 62, 180, 202, 87, 62,
8, 129, 122, 61, 160, 75, 170, 188, 20, 84, 6, 62, 60, 194, 56, 190,
182, 99, 44, 190, 88, 96, 228, 189, 50, 106, 5, 190, 34, 133, 12, 190,
26, 50, 0, 190, 176, 25, 127, 61, 48, 69, 219, 61, 192, 237, 252, 187,
};
// Declaration only: no definition of this symbol is visible in this file.
// NOTE(review): this is declared as a pointer, whereas
// _binary_constants_bin_start is a 768-byte array - confirm where the symbol
// is defined and that its users expect a pointer object rather than an
// end-of-array address.
alignas(64) extern unsigned char * _binary_constants_bin_end;

View File

@ -0,0 +1,6 @@
; Module-definition file naming the library "model" and listing the four
; AOTInductorModelContainer* functions it exports.
; NOTE(review): presumably this is the model_exports.def handed to the linker
; via /DEF: by the model CMakeLists.txt - confirm the file name.
LIBRARY model
EXPORTS
; Container construction (default and with an explicit device).
AOTInductorModelContainerCreate
AOTInductorModelContainerCreateWithDevice
; Container execution.
AOTInductorModelContainerRun
; Container destruction.
AOTInductorModelContainerDelete

View File

@ -0,0 +1,684 @@
//
// Generated by LLVM NVPTX Back-End
//
// Module header: PTX ISA version 8.7, target architecture sm_89, 64-bit
// addresses. This file is compiler output; regenerate it rather than editing
// it by hand.
.version 8.7
.target sm_89
.address_size 64
// .globl model_triton_tem_fused_addmm_relu_sigmoid_t_1 // -- Begin function model_triton_tem_fused_addmm_relu_sigmoid_t_1
// Externally declared, unsized shared-memory byte array with 16-byte
// alignment; the kernel below addresses all of its shared memory through it.
.extern .shared .align 16 .b8 global_smem[];
// @model_triton_tem_fused_addmm_relu_sigmoid_t_1
.visible .entry model_triton_tem_fused_addmm_relu_sigmoid_t_1(
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_0,
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_1,
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_2,
.param .u32 model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_3,
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_4
)
.reqntid 32
{
.reg .pred %p<12>;
.reg .b32 %r<375>;
.reg .b64 %rd<27>;
.loc 1 18 0 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:18:0
$L__func_begin0:
.loc 1 18 0 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:18:0
// %bb.0:
ld.param.b32 %r1, [model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_3];
$L__tmp0:
.loc 1 34 16 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:34:16
setp.ne.s32 %p1, %r1, 0;
@%p1 bra $L__BB0_2;
bra.uni $L__BB0_1;
$L__BB0_2:
.loc 1 0 16 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:0:16
ld.param.b64 %rd3, [model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_2];
ld.param.b64 %rd2, [model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_1];
ld.param.b64 %rd1, [model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_0];
.loc 1 43 24 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:43:24
mov.u32 %r26, %ctaid.x;
.loc 1 44 28 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:44:28
add.s32 %r27, %r1, 15;
.loc 1 44 34 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:44:34
shr.s32 %r28, %r27, 31;
shr.u32 %r29, %r28, 28;
add.s32 %r30, %r27, %r29;
shr.s32 %r31, %r30, 4;
.loc 1 50 41 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:50:41
and.b32 %r32, %r26, 2147483640;
.loc 1 50 30 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:50:30
sub.s32 %r33, %r31, %r32;
.loc 1 50 50 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:50:50
min.s32 %r34, %r33, 8;
.loc 1 51 40 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:51:40
rem.s32 %r35, %r26, %r34;
.loc 1 51 34 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:51:34
add.s32 %r36, %r35, %r32;
.loc 1 52 19 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:52:19
and.b32 %r37, %r26, 7;
.loc 1 52 30 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:52:30
div.s32 %r38, %r37, %r34;
.loc 1 56 17 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:56:17
shl.b32 %r39, %r36, 4;
.loc 1 56 40 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:56:40
mov.u32 %r40, %tid.x;
shr.u32 %r41, %r40, 2;
and.b32 %r42, %r41, 2;
bfe.u32 %r43, %r40, 2, 2;
and.b32 %r44, %r40, 16;
shr.u32 %r45, %r44, 2;
or.b32 %r46, %r43, %r45;
and.b32 %r47, %r40, 15;
bfe.u32 %r48, %r40, 4, 1;
.loc 1 56 27 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:56:27
or.b32 %r49, %r39, %r46;
or.b32 %r50, %r49, 8;
or.b32 %r51, %r39, %r47;
.loc 1 0 0 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:0
rem.s32 %r52, %r50, %r1;
rem.s32 %r53, %r49, %r1;
.loc 1 71 30 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:71:30
shl.b32 %r54, %r40, 2;
and.b32 %r55, %r54, 12;
.loc 1 76 28 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:76:28
shl.b32 %r56, %r53, 4;
shl.b32 %r57, %r52, 4;
.loc 1 76 25 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:76:25
or.b32 %r58, %r56, %r55;
or.b32 %r59, %r57, %r55;
.loc 1 77 25 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:77:25
mul.wide.s32 %rd16, %r58, 4;
add.s64 %rd4, %rd1, %rd16;
mul.wide.s32 %rd17, %r59, 4;
add.s64 %rd5, %rd1, %rd17;
.loc 1 77 20 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:77:20
// begin inline asm
mov.u32 %r2, 0x0;
mov.u32 %r3, 0x0;
mov.u32 %r4, 0x0;
mov.u32 %r5, 0x0;
ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd4 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r6, 0x0;
mov.u32 %r7, 0x0;
mov.u32 %r8, 0x0;
mov.u32 %r9, 0x0;
ld.global.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd5 + 0 ];
// end inline asm
shl.b32 %r60, %r46, 6;
shl.b32 %r61, %r55, 2;
or.b32 %r62, %r60, %r61;
mov.b32 %r63, global_smem;
add.s32 %r64, %r63, %r62;
st.shared.v4.b32 [%r64], {%r2, %r3, %r4, %r5};
st.shared.v4.b32 [%r64+512], {%r6, %r7, %r8, %r9};
.loc 1 82 25 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:82:25
mul.wide.u32 %rd18, %r55, 4;
add.s64 %rd6, %rd2, %rd18;
.loc 1 82 20 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:82:20
// begin inline asm
mov.u32 %r10, 0x0;
mov.u32 %r11, 0x0;
mov.u32 %r12, 0x0;
mov.u32 %r13, 0x0;
ld.global.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd6 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r14, 0x0;
mov.u32 %r15, 0x0;
mov.u32 %r16, 0x0;
mov.u32 %r17, 0x0;
ld.global.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd6 + 0 ];
// end inline asm
add.s32 %r65, %r63, 1024;
add.s32 %r66, %r65, %r62;
st.shared.v4.b32 [%r66], {%r10, %r11, %r12, %r13};
st.shared.v4.b32 [%r66+512], {%r14, %r15, %r16, %r17};
.loc 1 90 17 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:90:17
shl.b32 %r67, %r38, 4;
.loc 1 90 27 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:90:27
or.b32 %r68, %r67, %r48;
.loc 1 93 20 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:93:20
setp.lt.s32 %p10, %r51, %r1;
.loc 1 93 34 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:93:34
setp.eq.s32 %p11, %r68, 0;
.loc 1 93 26 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:93:26
and.pred %p2, %p10, %p11;
.loc 1 96 21 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:96:21
add.s32 %r69, %r51, %r68;
add.s32 %r70, %r69, 2;
add.s32 %r71, %r69, 4;
add.s32 %r72, %r69, 6;
add.s32 %r73, %r69, 8;
add.s32 %r74, %r69, 10;
add.s32 %r75, %r69, 12;
add.s32 %r76, %r69, 14;
.loc 1 77 20 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:77:20
bar.sync 0;
or.b32 %r77, %r42, %r45;
shl.b32 %r78, %r77, 6;
add.s32 %r79, %r63, %r78;
ld.shared.v4.b32 {%r80, %r81, %r82, %r83}, [%r79+112];
ld.shared.v4.b32 {%r84, %r85, %r86, %r87}, [%r79+96];
ld.shared.v4.b32 {%r88, %r89, %r90, %r91}, [%r79+80];
ld.shared.v4.b32 {%r92, %r93, %r94, %r95}, [%r79+64];
ld.shared.v4.b32 {%r96, %r97, %r98, %r99}, [%r79+48];
ld.shared.v4.b32 {%r100, %r101, %r102, %r103}, [%r79+32];
ld.shared.v4.b32 {%r104, %r105, %r106, %r107}, [%r79+16];
ld.shared.v4.b32 {%r108, %r109, %r110, %r111}, [%r79];
ld.shared.v4.b32 {%r112, %r113, %r114, %r115}, [%r79+624];
ld.shared.v4.b32 {%r116, %r117, %r118, %r119}, [%r79+608];
ld.shared.v4.b32 {%r120, %r121, %r122, %r123}, [%r79+592];
ld.shared.v4.b32 {%r124, %r125, %r126, %r127}, [%r79+576];
ld.shared.v4.b32 {%r128, %r129, %r130, %r131}, [%r79+560];
ld.shared.v4.b32 {%r132, %r133, %r134, %r135}, [%r79+544];
ld.shared.v4.b32 {%r136, %r137, %r138, %r139}, [%r79+528];
ld.shared.v4.b32 {%r140, %r141, %r142, %r143}, [%r79+512];
.loc 1 82 20 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:82:20
shl.b32 %r144, %r40, 1;
and.b32 %r145, %r144, 14;
shl.b32 %r146, %r145, 6;
add.s32 %r147, %r65, %r146;
ld.shared.v4.b32 {%r148, %r149, %r150, %r151}, [%r147];
ld.shared.v4.b32 {%r152, %r153, %r154, %r155}, [%r147+64];
ld.shared.v4.b32 {%r156, %r157, %r158, %r159}, [%r147+16];
ld.shared.v4.b32 {%r160, %r161, %r162, %r163}, [%r147+80];
ld.shared.v4.b32 {%r164, %r165, %r166, %r167}, [%r147+32];
ld.shared.v4.b32 {%r168, %r169, %r170, %r171}, [%r147+96];
ld.shared.v4.b32 {%r172, %r173, %r174, %r175}, [%r147+48];
ld.shared.v4.b32 {%r176, %r177, %r178, %r179}, [%r147+112];
.loc 1 85 25 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:85:25
fma.rn.f32 %r180, %r108, %r148, 0fBE0AE428;
fma.rn.f32 %r181, %r109, %r149, %r180;
fma.rn.f32 %r182, %r110, %r150, %r181;
fma.rn.f32 %r183, %r111, %r151, %r182;
fma.rn.f32 %r184, %r104, %r156, %r183;
fma.rn.f32 %r185, %r105, %r157, %r184;
fma.rn.f32 %r186, %r106, %r158, %r185;
fma.rn.f32 %r187, %r107, %r159, %r186;
fma.rn.f32 %r188, %r100, %r164, %r187;
fma.rn.f32 %r189, %r101, %r165, %r188;
fma.rn.f32 %r190, %r102, %r166, %r189;
fma.rn.f32 %r191, %r103, %r167, %r190;
fma.rn.f32 %r192, %r96, %r172, %r191;
fma.rn.f32 %r193, %r97, %r173, %r192;
fma.rn.f32 %r194, %r98, %r174, %r193;
fma.rn.f32 %r195, %r99, %r175, %r194;
fma.rn.f32 %r196, %r108, %r152, 0fBE0AE428;
fma.rn.f32 %r197, %r109, %r153, %r196;
fma.rn.f32 %r198, %r110, %r154, %r197;
fma.rn.f32 %r199, %r111, %r155, %r198;
fma.rn.f32 %r200, %r104, %r160, %r199;
fma.rn.f32 %r201, %r105, %r161, %r200;
fma.rn.f32 %r202, %r106, %r162, %r201;
fma.rn.f32 %r203, %r107, %r163, %r202;
fma.rn.f32 %r204, %r100, %r168, %r203;
fma.rn.f32 %r205, %r101, %r169, %r204;
fma.rn.f32 %r206, %r102, %r170, %r205;
fma.rn.f32 %r207, %r103, %r171, %r206;
fma.rn.f32 %r208, %r96, %r176, %r207;
fma.rn.f32 %r209, %r97, %r177, %r208;
fma.rn.f32 %r210, %r98, %r178, %r209;
fma.rn.f32 %r211, %r99, %r179, %r210;
fma.rn.f32 %r212, %r92, %r148, 0fBE0AE428;
fma.rn.f32 %r213, %r93, %r149, %r212;
fma.rn.f32 %r214, %r94, %r150, %r213;
fma.rn.f32 %r215, %r95, %r151, %r214;
fma.rn.f32 %r216, %r88, %r156, %r215;
fma.rn.f32 %r217, %r89, %r157, %r216;
fma.rn.f32 %r218, %r90, %r158, %r217;
fma.rn.f32 %r219, %r91, %r159, %r218;
fma.rn.f32 %r220, %r84, %r164, %r219;
fma.rn.f32 %r221, %r85, %r165, %r220;
fma.rn.f32 %r222, %r86, %r166, %r221;
fma.rn.f32 %r223, %r87, %r167, %r222;
fma.rn.f32 %r224, %r80, %r172, %r223;
fma.rn.f32 %r225, %r81, %r173, %r224;
fma.rn.f32 %r226, %r82, %r174, %r225;
fma.rn.f32 %r227, %r83, %r175, %r226;
fma.rn.f32 %r228, %r92, %r152, 0fBE0AE428;
fma.rn.f32 %r229, %r93, %r153, %r228;
fma.rn.f32 %r230, %r94, %r154, %r229;
fma.rn.f32 %r231, %r95, %r155, %r230;
fma.rn.f32 %r232, %r88, %r160, %r231;
fma.rn.f32 %r233, %r89, %r161, %r232;
fma.rn.f32 %r234, %r90, %r162, %r233;
fma.rn.f32 %r235, %r91, %r163, %r234;
fma.rn.f32 %r236, %r84, %r168, %r235;
fma.rn.f32 %r237, %r85, %r169, %r236;
fma.rn.f32 %r238, %r86, %r170, %r237;
fma.rn.f32 %r239, %r87, %r171, %r238;
fma.rn.f32 %r240, %r80, %r176, %r239;
fma.rn.f32 %r241, %r81, %r177, %r240;
fma.rn.f32 %r242, %r82, %r178, %r241;
fma.rn.f32 %r243, %r83, %r179, %r242;
fma.rn.f32 %r244, %r140, %r148, 0fBE0AE428;
fma.rn.f32 %r245, %r141, %r149, %r244;
fma.rn.f32 %r246, %r142, %r150, %r245;
fma.rn.f32 %r247, %r143, %r151, %r246;
fma.rn.f32 %r248, %r136, %r156, %r247;
fma.rn.f32 %r249, %r137, %r157, %r248;
fma.rn.f32 %r250, %r138, %r158, %r249;
fma.rn.f32 %r251, %r139, %r159, %r250;
fma.rn.f32 %r252, %r132, %r164, %r251;
fma.rn.f32 %r253, %r133, %r165, %r252;
fma.rn.f32 %r254, %r134, %r166, %r253;
fma.rn.f32 %r255, %r135, %r167, %r254;
fma.rn.f32 %r256, %r128, %r172, %r255;
fma.rn.f32 %r257, %r129, %r173, %r256;
fma.rn.f32 %r258, %r130, %r174, %r257;
fma.rn.f32 %r259, %r131, %r175, %r258;
fma.rn.f32 %r260, %r140, %r152, 0fBE0AE428;
fma.rn.f32 %r261, %r141, %r153, %r260;
fma.rn.f32 %r262, %r142, %r154, %r261;
fma.rn.f32 %r263, %r143, %r155, %r262;
fma.rn.f32 %r264, %r136, %r160, %r263;
fma.rn.f32 %r265, %r137, %r161, %r264;
fma.rn.f32 %r266, %r138, %r162, %r265;
fma.rn.f32 %r267, %r139, %r163, %r266;
fma.rn.f32 %r268, %r132, %r168, %r267;
fma.rn.f32 %r269, %r133, %r169, %r268;
fma.rn.f32 %r270, %r134, %r170, %r269;
fma.rn.f32 %r271, %r135, %r171, %r270;
fma.rn.f32 %r272, %r128, %r176, %r271;
fma.rn.f32 %r273, %r129, %r177, %r272;
fma.rn.f32 %r274, %r130, %r178, %r273;
fma.rn.f32 %r275, %r131, %r179, %r274;
fma.rn.f32 %r276, %r124, %r148, 0fBE0AE428;
fma.rn.f32 %r277, %r125, %r149, %r276;
fma.rn.f32 %r278, %r126, %r150, %r277;
fma.rn.f32 %r279, %r127, %r151, %r278;
fma.rn.f32 %r280, %r120, %r156, %r279;
fma.rn.f32 %r281, %r121, %r157, %r280;
fma.rn.f32 %r282, %r122, %r158, %r281;
fma.rn.f32 %r283, %r123, %r159, %r282;
fma.rn.f32 %r284, %r116, %r164, %r283;
fma.rn.f32 %r285, %r117, %r165, %r284;
fma.rn.f32 %r286, %r118, %r166, %r285;
fma.rn.f32 %r287, %r119, %r167, %r286;
fma.rn.f32 %r288, %r112, %r172, %r287;
fma.rn.f32 %r289, %r113, %r173, %r288;
fma.rn.f32 %r290, %r114, %r174, %r289;
fma.rn.f32 %r291, %r115, %r175, %r290;
fma.rn.f32 %r292, %r124, %r152, 0fBE0AE428;
fma.rn.f32 %r293, %r125, %r153, %r292;
fma.rn.f32 %r294, %r126, %r154, %r293;
fma.rn.f32 %r295, %r127, %r155, %r294;
fma.rn.f32 %r296, %r120, %r160, %r295;
fma.rn.f32 %r297, %r121, %r161, %r296;
fma.rn.f32 %r298, %r122, %r162, %r297;
fma.rn.f32 %r299, %r123, %r163, %r298;
fma.rn.f32 %r300, %r116, %r168, %r299;
fma.rn.f32 %r301, %r117, %r169, %r300;
fma.rn.f32 %r302, %r118, %r170, %r301;
fma.rn.f32 %r303, %r119, %r171, %r302;
fma.rn.f32 %r304, %r112, %r176, %r303;
fma.rn.f32 %r305, %r113, %r177, %r304;
fma.rn.f32 %r306, %r114, %r178, %r305;
fma.rn.f32 %r307, %r115, %r179, %r306;
mov.b32 %r308, 0f00000000;
$L__tmp1:
.loc 2 47 30 // standard.py:47:30 @[ cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:99:22 ]
sub.f32 %r309, %r308, %r195;
sub.f32 %r310, %r308, %r211;
sub.f32 %r311, %r308, %r227;
sub.f32 %r312, %r308, %r243;
sub.f32 %r313, %r308, %r259;
sub.f32 %r314, %r308, %r275;
sub.f32 %r315, %r308, %r291;
sub.f32 %r316, %r308, %r307;
.loc 2 47 29 // standard.py:47:29 @[ cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:99:22 ]
mul.f32 %r317, %r309, 0f3FB8AA3B;
ex2.approx.f32 %r318, %r317;
mul.f32 %r319, %r310, 0f3FB8AA3B;
ex2.approx.f32 %r320, %r319;
mul.f32 %r321, %r311, 0f3FB8AA3B;
ex2.approx.f32 %r322, %r321;
mul.f32 %r323, %r312, 0f3FB8AA3B;
ex2.approx.f32 %r324, %r323;
mul.f32 %r325, %r313, 0f3FB8AA3B;
ex2.approx.f32 %r326, %r325;
mul.f32 %r327, %r314, 0f3FB8AA3B;
ex2.approx.f32 %r328, %r327;
mul.f32 %r329, %r315, 0f3FB8AA3B;
ex2.approx.f32 %r330, %r329;
mul.f32 %r331, %r316, 0f3FB8AA3B;
ex2.approx.f32 %r332, %r331;
.loc 2 47 20 // standard.py:47:20 @[ cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:99:22 ]
add.f32 %r333, %r318, 0f3F800000;
add.f32 %r334, %r320, 0f3F800000;
add.f32 %r335, %r322, 0f3F800000;
add.f32 %r336, %r324, 0f3F800000;
add.f32 %r337, %r326, 0f3F800000;
add.f32 %r338, %r328, 0f3F800000;
add.f32 %r339, %r330, 0f3F800000;
add.f32 %r340, %r332, 0f3F800000;
mov.b32 %r341, 0f3F800000;
.loc 2 47 16 // standard.py:47:16 @[ cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:99:22 ]
div.full.f32 %r342, %r341, %r333;
div.full.f32 %r343, %r341, %r334;
div.full.f32 %r344, %r341, %r335;
div.full.f32 %r345, %r341, %r336;
div.full.f32 %r346, %r341, %r337;
div.full.f32 %r347, %r341, %r338;
div.full.f32 %r348, %r341, %r339;
div.full.f32 %r349, %r341, %r340;
$L__tmp2:
.loc 1 100 25 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:100:25
mul.wide.s32 %rd19, %r69, 4;
add.s64 %rd8, %rd3, %rd19;
mul.wide.s32 %rd20, %r70, 4;
add.s64 %rd9, %rd3, %rd20;
mul.wide.s32 %rd21, %r71, 4;
add.s64 %rd10, %rd3, %rd21;
mul.wide.s32 %rd22, %r72, 4;
add.s64 %rd11, %rd3, %rd22;
mul.wide.s32 %rd23, %r73, 4;
add.s64 %rd12, %rd3, %rd23;
mul.wide.s32 %rd24, %r74, 4;
add.s64 %rd13, %rd3, %rd24;
mul.wide.s32 %rd25, %r75, 4;
add.s64 %rd14, %rd3, %rd25;
mul.wide.s32 %rd26, %r76, 4;
add.s64 %rd15, %rd3, %rd26;
.loc 1 100 68 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:100:68
bar.sync 0;
and.b32 %r350, %r54, 96;
or.b32 %r351, %r145, %r350;
shl.b32 %r352, %r40, 4;
and.b32 %r353, %r352, 240;
shr.u32 %r354, %r350, 1;
add.s32 %r355, %r63, %r354;
shl.b32 %r356, %r351, 2;
add.s32 %r357, %r355, %r356;
st.shared.v2.b32 [%r357], {%r342, %r343};
or.b32 %r358, %r350, 16;
shr.u32 %r359, %r358, 1;
add.s32 %r360, %r63, %r359;
add.s32 %r361, %r360, %r356;
st.shared.v2.b32 [%r361+64], {%r344, %r345};
or.b32 %r362, %r350, 128;
shr.u32 %r363, %r362, 1;
add.s32 %r364, %r63, %r363;
add.s32 %r365, %r364, %r356;
st.shared.v2.b32 [%r365+512], {%r346, %r347};
or.b32 %r366, %r350, 144;
shr.u32 %r367, %r366, 1;
add.s32 %r368, %r63, %r367;
add.s32 %r369, %r368, %r356;
st.shared.v2.b32 [%r369+576], {%r348, %r349};
bar.sync 0;
shr.u32 %r370, %r353, 1;
add.s32 %r371, %r63, %r370;
shl.b32 %r372, %r353, 2;
add.s32 %r373, %r371, %r372;
add.s32 %r374, %r373, %r45;
ld.shared.b32 %r18, [%r374];
ld.shared.b32 %r19, [%r374+8];
ld.shared.b32 %r20, [%r374+16];
ld.shared.b32 %r21, [%r374+24];
ld.shared.b32 %r22, [%r374+32];
ld.shared.b32 %r23, [%r374+40];
ld.shared.b32 %r24, [%r374+48];
ld.shared.b32 %r25, [%r374+56];
// begin inline asm
@%p2 st.global.b32 [ %rd8 + 0 ], { %r18 };
// end inline asm
mov.pred %p3, 0;
// begin inline asm
@%p3 st.global.b32 [ %rd9 + 0 ], { %r19 };
// end inline asm
// begin inline asm
@%p3 st.global.b32 [ %rd10 + 0 ], { %r20 };
// end inline asm
// begin inline asm
@%p3 st.global.b32 [ %rd11 + 0 ], { %r21 };
// end inline asm
// begin inline asm
@%p3 st.global.b32 [ %rd12 + 0 ], { %r22 };
// end inline asm
// begin inline asm
@%p3 st.global.b32 [ %rd13 + 0 ], { %r23 };
// end inline asm
// begin inline asm
@%p3 st.global.b32 [ %rd14 + 0 ], { %r24 };
// end inline asm
// begin inline asm
@%p3 st.global.b32 [ %rd15 + 0 ], { %r25 };
// end inline asm
$L__BB0_1: // %common.ret
.loc 1 0 0 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:0
ret;
$L__tmp3:
$L__func_end0:
// -- End function
}
.file 1 "/tmp/torchinductor_shangdiy/q4/cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py"
.file 2 "/home/shangdiy/miniconda3/envs/pytorch-3.10/lib/python3.10/site-packages/triton/language/standard.py"
.section .debug_abbrev
{
.b8 1 // Abbreviation Code
.b8 17 // DW_TAG_compile_unit
.b8 1 // DW_CHILDREN_yes
.b8 37 // DW_AT_producer
.b8 8 // DW_FORM_string
.b8 19 // DW_AT_language
.b8 5 // DW_FORM_data2
.b8 3 // DW_AT_name
.b8 8 // DW_FORM_string
.b8 16 // DW_AT_stmt_list
.b8 6 // DW_FORM_data4
.b8 27 // DW_AT_comp_dir
.b8 8 // DW_FORM_string
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 2 // Abbreviation Code
.b8 46 // DW_TAG_subprogram
.b8 0 // DW_CHILDREN_no
.b8 3 // DW_AT_name
.b8 8 // DW_FORM_string
.b8 32 // DW_AT_inline
.b8 11 // DW_FORM_data1
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 3 // Abbreviation Code
.b8 46 // DW_TAG_subprogram
.b8 1 // DW_CHILDREN_yes
.b8 17 // DW_AT_low_pc
.b8 1 // DW_FORM_addr
.b8 18 // DW_AT_high_pc
.b8 1 // DW_FORM_addr
.b8 49 // DW_AT_abstract_origin
.b8 19 // DW_FORM_ref4
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 4 // Abbreviation Code
.b8 29 // DW_TAG_inlined_subroutine
.b8 0 // DW_CHILDREN_no
.b8 49 // DW_AT_abstract_origin
.b8 19 // DW_FORM_ref4
.b8 17 // DW_AT_low_pc
.b8 1 // DW_FORM_addr
.b8 18 // DW_AT_high_pc
.b8 1 // DW_FORM_addr
.b8 88 // DW_AT_call_file
.b8 11 // DW_FORM_data1
.b8 89 // DW_AT_call_line
.b8 11 // DW_FORM_data1
.b8 87 // DW_AT_call_column
.b8 11 // DW_FORM_data1
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 0 // EOM(3)
}
.section .debug_info
{
.b32 203 // Length of Unit
.b8 2 // DWARF version number
.b8 0
.b32 .debug_abbrev // Offset Into Abbrev. Section
.b8 8 // Address Size (in bytes)
.b8 1 // Abbrev [1] 0xb:0xc4 DW_TAG_compile_unit
.b8 116 // DW_AT_producer
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 0
.b8 2 // DW_AT_language
.b8 0
.b8 99 // DW_AT_name
.b8 113
.b8 52
.b8 54
.b8 118
.b8 111
.b8 108
.b8 114
.b8 122
.b8 109
.b8 97
.b8 54
.b8 55
.b8 105
.b8 110
.b8 100
.b8 116
.b8 119
.b8 104
.b8 106
.b8 53
.b8 97
.b8 52
.b8 110
.b8 54
.b8 122
.b8 114
.b8 51
.b8 54
.b8 55
.b8 103
.b8 113
.b8 112
.b8 107
.b8 99
.b8 119
.b8 111
.b8 50
.b8 104
.b8 108
.b8 112
.b8 104
.b8 121
.b8 110
.b8 104
.b8 106
.b8 116
.b8 104
.b8 110
.b8 51
.b8 117
.b8 107
.b8 46
.b8 112
.b8 121
.b8 0
.b32 .debug_line // DW_AT_stmt_list
.b8 47 // DW_AT_comp_dir
.b8 116
.b8 109
.b8 112
.b8 47
.b8 116
.b8 111
.b8 114
.b8 99
.b8 104
.b8 105
.b8 110
.b8 100
.b8 117
.b8 99
.b8 116
.b8 111
.b8 114
.b8 95
.b8 115
.b8 104
.b8 97
.b8 110
.b8 103
.b8 100
.b8 105
.b8 121
.b8 47
.b8 113
.b8 52
.b8 0
.b8 2 // Abbrev [2] 0x70:0x30 DW_TAG_subprogram
.b8 109 // DW_AT_name
.b8 111
.b8 100
.b8 101
.b8 108
.b8 95
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 116
.b8 101
.b8 109
.b8 95
.b8 102
.b8 117
.b8 115
.b8 101
.b8 100
.b8 95
.b8 97
.b8 100
.b8 100
.b8 109
.b8 109
.b8 95
.b8 114
.b8 101
.b8 108
.b8 117
.b8 95
.b8 115
.b8 105
.b8 103
.b8 109
.b8 111
.b8 105
.b8 100
.b8 95
.b8 116
.b8 95
.b8 49
.b8 0
.b8 1 // DW_AT_inline
.b8 3 // Abbrev [3] 0xa0:0x2e DW_TAG_subprogram
.b64 $L__func_begin0 // DW_AT_low_pc
.b64 $L__func_end0 // DW_AT_high_pc
.b32 112 // DW_AT_abstract_origin
.b8 4 // Abbrev [4] 0xb5:0x18 DW_TAG_inlined_subroutine
.b32 112 // DW_AT_abstract_origin
.b64 $L__tmp1 // DW_AT_low_pc
.b64 $L__tmp2 // DW_AT_high_pc
.b8 1 // DW_AT_call_file
.b8 99 // DW_AT_call_line
.b8 22 // DW_AT_call_column
.b8 0 // End Of Children Mark
.b8 0 // End Of Children Mark
}
.section .debug_macinfo { }

View File

@ -0,0 +1,727 @@
//
// Generated by LLVM NVPTX Back-End
//
// Module header: PTX ISA version 8.7, target architecture sm_89, 64-bit
// addresses. This file is compiler output; regenerate it rather than editing
// it by hand.
.version 8.7
.target sm_89
.address_size 64
// .globl model_triton_tem_fused_addmm_relu_t_0 // -- Begin function model_triton_tem_fused_addmm_relu_t_0
// Externally declared, unsized shared-memory byte array with 16-byte
// alignment.
.extern .shared .align 16 .b8 global_smem[];
// @model_triton_tem_fused_addmm_relu_t_0
.visible .entry model_triton_tem_fused_addmm_relu_t_0(
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_t_0_param_0,
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_t_0_param_1,
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_t_0_param_2,
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_t_0_param_3,
.param .u32 model_triton_tem_fused_addmm_relu_t_0_param_4,
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_t_0_param_5
)
.reqntid 32
{
.reg .pred %p<27>;
.reg .b32 %r<398>;
.reg .b64 %rd<29>;
.loc 1 18 0 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:18:0
$L__func_begin0:
.loc 1 18 0 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:18:0
// %bb.0:
ld.param.b32 %r1, [model_triton_tem_fused_addmm_relu_t_0_param_4];
$L__tmp0:
.loc 1 34 16 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:34:16
and.b32 %r2, %r1, 268435455;
setp.ne.s32 %p1, %r2, 0;
@%p1 bra $L__BB0_2;
bra.uni $L__BB0_1;
$L__BB0_2:
.loc 1 0 16 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:0:16
ld.param.b64 %rd4, [model_triton_tem_fused_addmm_relu_t_0_param_3];
ld.param.b64 %rd3, [model_triton_tem_fused_addmm_relu_t_0_param_2];
ld.param.b64 %rd2, [model_triton_tem_fused_addmm_relu_t_0_param_1];
ld.param.b64 %rd1, [model_triton_tem_fused_addmm_relu_t_0_param_0];
.loc 1 43 24 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:43:24
mov.u32 %r51, %ctaid.x;
.loc 1 44 28 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:44:28
add.s32 %r52, %r1, 15;
.loc 1 44 34 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:44:34
shr.s32 %r53, %r52, 31;
shr.u32 %r54, %r53, 28;
add.s32 %r55, %r52, %r54;
shr.s32 %r56, %r55, 4;
.loc 1 50 41 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:50:41
and.b32 %r57, %r51, 2147483640;
.loc 1 50 30 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:50:30
sub.s32 %r58, %r56, %r57;
.loc 1 50 50 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:50:50
min.s32 %r59, %r58, 8;
.loc 1 51 40 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:51:40
rem.s32 %r60, %r51, %r59;
.loc 1 51 34 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:51:34
add.s32 %r61, %r60, %r57;
.loc 1 52 19 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:52:19
and.b32 %r62, %r51, 7;
.loc 1 52 30 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:52:30
div.s32 %r63, %r62, %r59;
.loc 1 56 17 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:56:17
shl.b32 %r64, %r61, 4;
.loc 1 56 40 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:56:40
mov.u32 %r65, %tid.x;
and.b32 %r66, %r65, 4;
bfe.u32 %r67, %r65, 2, 1;
shr.u32 %r68, %r65, 2;
and.b32 %r69, %r68, 6;
or.b32 %r70, %r69, %r67;
bfe.u32 %r71, %r65, 3, 2;
and.b32 %r72, %r65, 1;
shl.b32 %r73, %r72, 2;
shl.b32 %r74, %r65, 2;
and.b32 %r75, %r74, 12;
or.b32 %r76, %r75, 2;
.loc 1 56 27 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:56:27
or.b32 %r77, %r64, %r70;
or.b32 %r78, %r77, 8;
.loc 1 57 17 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:57:17
shl.b32 %r79, %r63, 4;
.loc 1 57 27 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:57:27
or.b32 %r80, %r79, %r75;
.loc 1 0 0 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:0
rem.s32 %r81, %r78, %r1;
rem.s32 %r82, %r77, %r1;
.loc 1 71 36 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:71:36
setp.lt.u32 %p2, %r75, 10;
setp.lt.u32 %p3, %r76, 10;
.loc 1 72 24 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:72:24
shl.b32 %r83, %r65, 1;
and.b32 %r84, %r83, 6;
shl.b32 %r85, %r66, 1;
or.b32 %r86, %r84, %r85;
.loc 1 72 36 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:72:36
setp.lt.u32 %p6, %r86, 10;
.loc 1 79 28 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:79:28
mul.lo.s32 %r87, %r82, 10;
mul.lo.s32 %r88, %r81, 10;
.loc 1 79 25 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:79:25
add.s32 %r89, %r87, %r75;
add.s32 %r90, %r87, %r76;
add.s32 %r91, %r88, %r75;
add.s32 %r92, %r88, %r76;
.loc 1 80 25 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:80:25
mul.wide.s32 %rd21, %r89, 4;
add.s64 %rd5, %rd1, %rd21;
mul.wide.s32 %rd22, %r90, 4;
add.s64 %rd6, %rd1, %rd22;
mul.wide.s32 %rd23, %r91, 4;
add.s64 %rd7, %rd1, %rd23;
mul.wide.s32 %rd24, %r92, 4;
add.s64 %rd8, %rd1, %rd24;
mov.b32 %r5, 0;
.loc 1 80 20 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:80:20
// begin inline asm
mov.u32 %r3, %r5;
mov.u32 %r4, %r5;
@%p2 ld.global.v2.b32 { %r3, %r4 }, [ %rd5 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r7, %r5;
mov.u32 %r8, %r5;
@%p3 ld.global.v2.b32 { %r7, %r8 }, [ %rd6 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r11, %r5;
mov.u32 %r12, %r5;
@%p2 ld.global.v2.b32 { %r11, %r12 }, [ %rd7 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r15, %r5;
mov.u32 %r16, %r5;
@%p3 ld.global.v2.b32 { %r15, %r16 }, [ %rd8 + 0 ];
// end inline asm
shl.b32 %r93, %r75, 2;
mov.b32 %r94, global_smem;
add.s32 %r95, %r94, %r93;
shl.b32 %r96, %r70, 6;
add.s32 %r97, %r95, %r96;
st.shared.v4.b32 [%r97], {%r3, %r4, %r7, %r8};
st.shared.v4.b32 [%r97+512], {%r11, %r12, %r15, %r16};
.loc 1 85 50 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:50
mad.lo.s32 %r98, %r71, 10, %r86;
.loc 1 85 25 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:25
mul.wide.u32 %rd25, %r98, 4;
add.s64 %rd9, %rd2, %rd25;
add.s64 %rd10, %rd9, 160;
add.s64 %rd11, %rd9, 320;
add.s64 %rd12, %rd9, 480;
.loc 1 85 20 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:20
// begin inline asm
mov.u32 %r19, %r5;
mov.u32 %r20, %r5;
@%p6 ld.global.v2.b32 { %r19, %r20 }, [ %rd9 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r23, %r5;
mov.u32 %r24, %r5;
@%p6 ld.global.v2.b32 { %r23, %r24 }, [ %rd10 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r27, %r5;
mov.u32 %r28, %r5;
@%p6 ld.global.v2.b32 { %r27, %r28 }, [ %rd11 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r31, %r5;
mov.u32 %r32, %r5;
@%p6 ld.global.v2.b32 { %r31, %r32 }, [ %rd12 + 0 ];
// end inline asm
shl.b32 %r99, %r86, 2;
add.s32 %r100, %r94, 1024;
add.s32 %r101, %r100, %r99;
shl.b32 %r102, %r71, 6;
add.s32 %r103, %r101, %r102;
st.shared.v2.b32 [%r103], {%r19, %r20};
st.shared.v2.b32 [%r103+256], {%r23, %r24};
st.shared.v2.b32 [%r103+512], {%r27, %r28};
st.shared.v2.b32 [%r103+768], {%r31, %r32};
.loc 1 96 20 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:96:20
setp.lt.s32 %p14, %r77, %r1;
setp.lt.s32 %p15, %r78, %r1;
.loc 1 96 34 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:96:34
setp.eq.s32 %p16, %r63, 0;
.loc 1 96 26 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:96:26
and.pred %p10, %p16, %p14;
and.pred %p11, %p15, %p16;
.loc 1 100 30 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:100:30
mul.wide.u32 %rd26, %r80, 4;
add.s64 %rd14, %rd3, %rd26;
.loc 1 100 66 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:100:66
// begin inline asm
mov.u64 %rd15, 0x0;
createpolicy.fractional.L2::evict_last.b64 %rd15, 1.0;
// end inline asm
// begin inline asm
mov.u32 %r35, 0x0;
mov.u32 %r36, 0x0;
mov.u32 %r37, 0x0;
mov.u32 %r38, 0x0;
@%p10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r35, %r36, %r37, %r38 }, [ %rd14 + 0 ], %rd15;
// end inline asm
// begin inline asm
mov.u64 %rd18, 0x0;
createpolicy.fractional.L2::evict_last.b64 %rd18, 1.0;
// end inline asm
// begin inline asm
mov.u32 %r39, 0x0;
mov.u32 %r40, 0x0;
mov.u32 %r41, 0x0;
mov.u32 %r42, 0x0;
@%p11 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r39, %r40, %r41, %r42 }, [ %rd14 + 0 ], %rd18;
// end inline asm
.loc 1 80 20 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:80:20
bar.sync 0;
shl.b32 %r104, %r69, 6;
add.s32 %r105, %r94, %r104;
ld.shared.v4.b32 {%r106, %r107, %r108, %r109}, [%r105+48];
ld.shared.v4.b32 {%r110, %r111, %r112, %r113}, [%r105+112];
ld.shared.v4.b32 {%r114, %r115, %r116, %r117}, [%r105+32];
ld.shared.v4.b32 {%r118, %r119, %r120, %r121}, [%r105+96];
ld.shared.v4.b32 {%r122, %r123, %r124, %r125}, [%r105+16];
ld.shared.v4.b32 {%r126, %r127, %r128, %r129}, [%r105+80];
ld.shared.v4.b32 {%r130, %r131, %r132, %r133}, [%r105];
ld.shared.v4.b32 {%r134, %r135, %r136, %r137}, [%r105+64];
ld.shared.v4.b32 {%r138, %r139, %r140, %r141}, [%r105+560];
ld.shared.v4.b32 {%r142, %r143, %r144, %r145}, [%r105+624];
ld.shared.v4.b32 {%r146, %r147, %r148, %r149}, [%r105+544];
ld.shared.v4.b32 {%r150, %r151, %r152, %r153}, [%r105+608];
ld.shared.v4.b32 {%r154, %r155, %r156, %r157}, [%r105+528];
ld.shared.v4.b32 {%r158, %r159, %r160, %r161}, [%r105+592];
ld.shared.v4.b32 {%r162, %r163, %r164, %r165}, [%r105+512];
ld.shared.v4.b32 {%r166, %r167, %r168, %r169}, [%r105+576];
.loc 1 85 20 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:20
mad.lo.s32 %r170, %r86, 60, %r101;
or.b32 %r171, %r84, 1;
or.b32 %r172, %r171, %r85;
shl.b32 %r173, %r172, 6;
add.s32 %r174, %r100, %r173;
ld.shared.b32 %r175, [%r170+16];
ld.shared.b32 %r176, [%r174+16];
ld.shared.b32 %r177, [%r174+52];
ld.shared.b32 %r178, [%r170+60];
.loc 1 100 66 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:100:66
shr.u32 %r179, %r66, 1;
setp.eq.s32 %p17, %r66, 0;
bfe.u32 %r180, %r65, 1, 1;
or.b32 %r181, %r73, %r180;
or.b32 %r182, %r181, %r179;
and.b32 %r183, %r65, 24;
or.b32 %r184, %r182, %r183;
selp.b32 %r185, %r35, %r37, %p17;
shfl.sync.idx.b32 %r186, %r185, %r184, 31, -1;
selp.b32 %r187, %r36, %r38, %p17;
shfl.sync.idx.b32 %r188, %r187, %r184, 31, -1;
selp.b32 %r189, %r37, %r35, %p17;
xor.b32 %r190, %r184, 4;
shfl.sync.idx.b32 %r191, %r189, %r190, 31, -1;
selp.b32 %r192, %r38, %r36, %p17;
shfl.sync.idx.b32 %r193, %r192, %r190, 31, -1;
selp.b32 %r194, %r39, %r41, %p17;
shfl.sync.idx.b32 %r195, %r194, %r184, 31, -1;
selp.b32 %r196, %r40, %r42, %p17;
shfl.sync.idx.b32 %r197, %r196, %r184, 31, -1;
selp.b32 %r198, %r41, %r39, %p17;
shfl.sync.idx.b32 %r199, %r198, %r190, 31, -1;
selp.b32 %r200, %r42, %r40, %p17;
shfl.sync.idx.b32 %r201, %r200, %r190, 31, -1;
setp.eq.s32 %p18, %r72, 0;
.loc 1 85 20 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:20
ld.shared.v4.b32 {%r202, %r203, %r204, %r205}, [%r170];
ld.shared.v4.b32 {%r206, %r207, %r208, %r209}, [%r174];
.loc 1 100 66 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:100:66
selp.f32 %r210, %r186, %r191, %p18;
selp.f32 %r211, %r188, %r193, %p18;
selp.f32 %r212, %r191, %r186, %p18;
selp.f32 %r213, %r193, %r188, %p18;
selp.f32 %r214, %r195, %r199, %p18;
selp.f32 %r215, %r197, %r201, %p18;
selp.f32 %r216, %r199, %r195, %p18;
selp.f32 %r217, %r201, %r197, %p18;
.loc 1 88 25 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:88:25
fma.rn.f32 %r218, %r166, %r206, %r217;
fma.rn.f32 %r219, %r166, %r202, %r216;
fma.rn.f32 %r220, %r162, %r206, %r215;
fma.rn.f32 %r221, %r162, %r202, %r214;
fma.rn.f32 %r222, %r134, %r206, %r213;
fma.rn.f32 %r223, %r134, %r202, %r212;
fma.rn.f32 %r224, %r130, %r206, %r211;
fma.rn.f32 %r225, %r130, %r202, %r210;
fma.rn.f32 %r226, %r131, %r203, %r225;
fma.rn.f32 %r227, %r131, %r207, %r224;
fma.rn.f32 %r228, %r135, %r203, %r223;
fma.rn.f32 %r229, %r135, %r207, %r222;
fma.rn.f32 %r230, %r163, %r203, %r221;
fma.rn.f32 %r231, %r163, %r207, %r220;
fma.rn.f32 %r232, %r167, %r203, %r219;
fma.rn.f32 %r233, %r167, %r207, %r218;
fma.rn.f32 %r234, %r168, %r208, %r233;
fma.rn.f32 %r235, %r168, %r204, %r232;
fma.rn.f32 %r236, %r164, %r208, %r231;
fma.rn.f32 %r237, %r164, %r204, %r230;
fma.rn.f32 %r238, %r136, %r208, %r229;
fma.rn.f32 %r239, %r136, %r204, %r228;
fma.rn.f32 %r240, %r132, %r208, %r227;
fma.rn.f32 %r241, %r132, %r204, %r226;
fma.rn.f32 %r242, %r133, %r205, %r241;
fma.rn.f32 %r243, %r133, %r209, %r240;
fma.rn.f32 %r244, %r137, %r205, %r239;
fma.rn.f32 %r245, %r137, %r209, %r238;
fma.rn.f32 %r246, %r165, %r205, %r237;
fma.rn.f32 %r247, %r165, %r209, %r236;
fma.rn.f32 %r248, %r169, %r205, %r235;
fma.rn.f32 %r249, %r169, %r209, %r234;
fma.rn.f32 %r250, %r158, %r176, %r249;
fma.rn.f32 %r251, %r158, %r175, %r248;
fma.rn.f32 %r252, %r154, %r176, %r247;
fma.rn.f32 %r253, %r154, %r175, %r246;
fma.rn.f32 %r254, %r126, %r176, %r245;
fma.rn.f32 %r255, %r126, %r175, %r244;
fma.rn.f32 %r256, %r122, %r176, %r243;
fma.rn.f32 %r257, %r122, %r175, %r242;
.loc 1 85 20 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:20
ld.shared.b32 %r258, [%r170+24];
ld.shared.b32 %r259, [%r170+20];
ld.shared.b32 %r260, [%r174+24];
ld.shared.b32 %r261, [%r174+20];
ld.shared.b32 %r262, [%r170+32];
ld.shared.b32 %r263, [%r170+28];
ld.shared.b32 %r264, [%r174+32];
ld.shared.b32 %r265, [%r174+28];
ld.shared.b32 %r266, [%r170+40];
ld.shared.b32 %r267, [%r170+36];
ld.shared.b32 %r268, [%r174+40];
ld.shared.b32 %r269, [%r174+36];
ld.shared.b32 %r270, [%r170+48];
ld.shared.b32 %r271, [%r170+44];
ld.shared.b32 %r272, [%r174+48];
ld.shared.b32 %r273, [%r174+44];
ld.shared.b32 %r274, [%r170+56];
ld.shared.b32 %r275, [%r170+52];
ld.shared.v2.b32 {%r276, %r277}, [%r174+56];
.loc 1 88 25 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:88:25
fma.rn.f32 %r278, %r123, %r259, %r257;
fma.rn.f32 %r279, %r123, %r261, %r256;
fma.rn.f32 %r280, %r127, %r259, %r255;
fma.rn.f32 %r281, %r127, %r261, %r254;
fma.rn.f32 %r282, %r155, %r259, %r253;
fma.rn.f32 %r283, %r155, %r261, %r252;
fma.rn.f32 %r284, %r159, %r259, %r251;
fma.rn.f32 %r285, %r159, %r261, %r250;
fma.rn.f32 %r286, %r160, %r260, %r285;
fma.rn.f32 %r287, %r160, %r258, %r284;
fma.rn.f32 %r288, %r156, %r260, %r283;
fma.rn.f32 %r289, %r156, %r258, %r282;
fma.rn.f32 %r290, %r128, %r260, %r281;
fma.rn.f32 %r291, %r128, %r258, %r280;
fma.rn.f32 %r292, %r124, %r260, %r279;
fma.rn.f32 %r293, %r124, %r258, %r278;
fma.rn.f32 %r294, %r125, %r263, %r293;
fma.rn.f32 %r295, %r125, %r265, %r292;
fma.rn.f32 %r296, %r129, %r263, %r291;
fma.rn.f32 %r297, %r129, %r265, %r290;
fma.rn.f32 %r298, %r157, %r263, %r289;
fma.rn.f32 %r299, %r157, %r265, %r288;
fma.rn.f32 %r300, %r161, %r263, %r287;
fma.rn.f32 %r301, %r161, %r265, %r286;
fma.rn.f32 %r302, %r150, %r264, %r301;
fma.rn.f32 %r303, %r150, %r262, %r300;
fma.rn.f32 %r304, %r146, %r264, %r299;
fma.rn.f32 %r305, %r146, %r262, %r298;
fma.rn.f32 %r306, %r118, %r264, %r297;
fma.rn.f32 %r307, %r118, %r262, %r296;
fma.rn.f32 %r308, %r114, %r264, %r295;
fma.rn.f32 %r309, %r114, %r262, %r294;
fma.rn.f32 %r310, %r115, %r267, %r309;
fma.rn.f32 %r311, %r115, %r269, %r308;
fma.rn.f32 %r312, %r119, %r267, %r307;
fma.rn.f32 %r313, %r119, %r269, %r306;
fma.rn.f32 %r314, %r147, %r267, %r305;
fma.rn.f32 %r315, %r147, %r269, %r304;
fma.rn.f32 %r316, %r151, %r267, %r303;
fma.rn.f32 %r317, %r151, %r269, %r302;
fma.rn.f32 %r318, %r152, %r268, %r317;
fma.rn.f32 %r319, %r152, %r266, %r316;
fma.rn.f32 %r320, %r148, %r268, %r315;
fma.rn.f32 %r321, %r148, %r266, %r314;
fma.rn.f32 %r322, %r120, %r268, %r313;
fma.rn.f32 %r323, %r120, %r266, %r312;
fma.rn.f32 %r324, %r116, %r268, %r311;
fma.rn.f32 %r325, %r116, %r266, %r310;
fma.rn.f32 %r326, %r117, %r271, %r325;
fma.rn.f32 %r327, %r117, %r273, %r324;
fma.rn.f32 %r328, %r121, %r271, %r323;
fma.rn.f32 %r329, %r121, %r273, %r322;
fma.rn.f32 %r330, %r149, %r271, %r321;
fma.rn.f32 %r331, %r149, %r273, %r320;
fma.rn.f32 %r332, %r153, %r271, %r319;
fma.rn.f32 %r333, %r153, %r273, %r318;
fma.rn.f32 %r334, %r142, %r272, %r333;
fma.rn.f32 %r335, %r142, %r270, %r332;
fma.rn.f32 %r336, %r138, %r272, %r331;
fma.rn.f32 %r337, %r138, %r270, %r330;
fma.rn.f32 %r338, %r110, %r272, %r329;
fma.rn.f32 %r339, %r110, %r270, %r328;
fma.rn.f32 %r340, %r106, %r272, %r327;
fma.rn.f32 %r341, %r106, %r270, %r326;
fma.rn.f32 %r342, %r107, %r275, %r341;
fma.rn.f32 %r343, %r107, %r177, %r340;
fma.rn.f32 %r344, %r111, %r275, %r339;
fma.rn.f32 %r345, %r111, %r177, %r338;
fma.rn.f32 %r346, %r139, %r275, %r337;
fma.rn.f32 %r347, %r139, %r177, %r336;
fma.rn.f32 %r348, %r143, %r275, %r335;
fma.rn.f32 %r349, %r143, %r177, %r334;
fma.rn.f32 %r350, %r144, %r276, %r349;
fma.rn.f32 %r351, %r144, %r274, %r348;
fma.rn.f32 %r352, %r140, %r276, %r347;
fma.rn.f32 %r353, %r140, %r274, %r346;
fma.rn.f32 %r354, %r112, %r276, %r345;
fma.rn.f32 %r355, %r112, %r274, %r344;
fma.rn.f32 %r356, %r108, %r276, %r343;
fma.rn.f32 %r357, %r108, %r274, %r342;
fma.rn.f32 %r358, %r109, %r178, %r357;
fma.rn.f32 %r359, %r109, %r277, %r356;
fma.rn.f32 %r360, %r113, %r178, %r355;
fma.rn.f32 %r361, %r113, %r277, %r354;
fma.rn.f32 %r362, %r141, %r178, %r353;
fma.rn.f32 %r363, %r141, %r277, %r352;
fma.rn.f32 %r364, %r145, %r178, %r351;
fma.rn.f32 %r365, %r145, %r277, %r350;
$L__tmp1:
.loc 2 110 15 // triton_helpers.py:110:15 @[ cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:103:40 ]
setp.lt.f32 %p19, %r365, 0f00000000;
setp.lt.f32 %p20, %r364, 0f00000000;
setp.lt.f32 %p21, %r363, 0f00000000;
setp.lt.f32 %p22, %r362, 0f00000000;
setp.lt.f32 %p23, %r361, 0f00000000;
setp.lt.f32 %p24, %r360, 0f00000000;
setp.lt.f32 %p25, %r359, 0f00000000;
setp.lt.f32 %p26, %r358, 0f00000000;
.loc 2 113 29 // triton_helpers.py:113:29 @[ cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:103:40 ]
selp.f32 %r366, 0f00000000, %r358, %p26;
selp.f32 %r367, 0f00000000, %r359, %p25;
selp.f32 %r368, 0f00000000, %r360, %p24;
selp.f32 %r369, 0f00000000, %r361, %p23;
selp.f32 %r370, 0f00000000, %r362, %p22;
selp.f32 %r371, 0f00000000, %r363, %p21;
selp.f32 %r372, 0f00000000, %r364, %p20;
selp.f32 %r373, 0f00000000, %r365, %p19;
$L__tmp2:
.loc 1 104 52 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:104:52
shl.b32 %r374, %r77, 4;
shl.b32 %r375, %r78, 4;
.loc 1 104 49 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:104:49
add.s32 %r376, %r374, %r80;
add.s32 %r377, %r80, %r375;
.loc 1 104 25 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:104:25
mul.wide.s32 %rd27, %r376, 4;
add.s64 %rd19, %rd4, %rd27;
mul.wide.s32 %rd28, %r377, 4;
add.s64 %rd20, %rd4, %rd28;
.loc 1 104 78 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:104:78
selp.f32 %r378, %r366, %r368, %p18;
or.b32 %r379, %r183, %r67;
or.b32 %r380, %r379, %r84;
shfl.sync.idx.b32 %r381, %r378, %r380, 31, -1;
selp.f32 %r382, %r367, %r369, %p18;
shfl.sync.idx.b32 %r383, %r382, %r380, 31, -1;
selp.f32 %r384, %r368, %r366, %p18;
xor.b32 %r385, %r171, %r67;
or.b32 %r386, %r183, %r385;
shfl.sync.idx.b32 %r387, %r384, %r386, 31, -1;
selp.f32 %r388, %r369, %r367, %p18;
shfl.sync.idx.b32 %r389, %r388, %r386, 31, -1;
selp.f32 %r390, %r370, %r372, %p18;
shfl.sync.idx.b32 %r391, %r390, %r380, 31, -1;
selp.f32 %r392, %r371, %r373, %p18;
shfl.sync.idx.b32 %r393, %r392, %r380, 31, -1;
selp.f32 %r394, %r372, %r370, %p18;
shfl.sync.idx.b32 %r395, %r394, %r386, 31, -1;
selp.f32 %r396, %r373, %r371, %p18;
shfl.sync.idx.b32 %r397, %r396, %r386, 31, -1;
selp.b32 %r45, %r387, %r381, %p17;
selp.b32 %r46, %r389, %r383, %p17;
selp.b32 %r49, %r395, %r391, %p17;
selp.b32 %r50, %r397, %r393, %p17;
selp.b32 %r43, %r381, %r387, %p17;
selp.b32 %r44, %r383, %r389, %p17;
// begin inline asm
@%p10 st.global.v4.b32 [ %rd19 + 0 ], { %r43, %r44, %r45, %r46 };
// end inline asm
selp.b32 %r47, %r391, %r395, %p17;
selp.b32 %r48, %r393, %r397, %p17;
// begin inline asm
@%p11 st.global.v4.b32 [ %rd20 + 0 ], { %r47, %r48, %r49, %r50 };
// end inline asm
$L__BB0_1: // %common.ret
.loc 1 0 0 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:0
ret;
$L__tmp3:
$L__func_end0:
// -- End function
}
.file 1 "/tmp/torchinductor_shangdiy/uw/cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py"
.file 2 "/home/shangdiy/pytorch/torch/_inductor/runtime/triton_helpers.py"
.section .debug_abbrev
{
.b8 1 // Abbreviation Code
.b8 17 // DW_TAG_compile_unit
.b8 1 // DW_CHILDREN_yes
.b8 37 // DW_AT_producer
.b8 8 // DW_FORM_string
.b8 19 // DW_AT_language
.b8 5 // DW_FORM_data2
.b8 3 // DW_AT_name
.b8 8 // DW_FORM_string
.b8 16 // DW_AT_stmt_list
.b8 6 // DW_FORM_data4
.b8 27 // DW_AT_comp_dir
.b8 8 // DW_FORM_string
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 2 // Abbreviation Code
.b8 46 // DW_TAG_subprogram
.b8 0 // DW_CHILDREN_no
.b8 3 // DW_AT_name
.b8 8 // DW_FORM_string
.b8 32 // DW_AT_inline
.b8 11 // DW_FORM_data1
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 3 // Abbreviation Code
.b8 46 // DW_TAG_subprogram
.b8 1 // DW_CHILDREN_yes
.b8 17 // DW_AT_low_pc
.b8 1 // DW_FORM_addr
.b8 18 // DW_AT_high_pc
.b8 1 // DW_FORM_addr
.b8 49 // DW_AT_abstract_origin
.b8 19 // DW_FORM_ref4
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 4 // Abbreviation Code
.b8 29 // DW_TAG_inlined_subroutine
.b8 0 // DW_CHILDREN_no
.b8 49 // DW_AT_abstract_origin
.b8 19 // DW_FORM_ref4
.b8 17 // DW_AT_low_pc
.b8 1 // DW_FORM_addr
.b8 18 // DW_AT_high_pc
.b8 1 // DW_FORM_addr
.b8 88 // DW_AT_call_file
.b8 11 // DW_FORM_data1
.b8 89 // DW_AT_call_line
.b8 11 // DW_FORM_data1
.b8 87 // DW_AT_call_column
.b8 11 // DW_FORM_data1
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 0 // EOM(3)
}
.section .debug_info
{
.b32 195 // Length of Unit
.b8 2 // DWARF version number
.b8 0
.b32 .debug_abbrev // Offset Into Abbrev. Section
.b8 8 // Address Size (in bytes)
.b8 1 // Abbrev [1] 0xb:0xbc DW_TAG_compile_unit
.b8 116 // DW_AT_producer
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 0
.b8 2 // DW_AT_language
.b8 0
.b8 99 // DW_AT_name
.b8 117
.b8 119
.b8 104
.b8 98
.b8 112
.b8 101
.b8 99
.b8 100
.b8 50
.b8 117
.b8 107
.b8 101
.b8 115
.b8 111
.b8 51
.b8 106
.b8 120
.b8 101
.b8 107
.b8 99
.b8 122
.b8 118
.b8 103
.b8 104
.b8 98
.b8 55
.b8 97
.b8 104
.b8 50
.b8 104
.b8 107
.b8 50
.b8 122
.b8 111
.b8 98
.b8 54
.b8 55
.b8 111
.b8 112
.b8 99
.b8 52
.b8 51
.b8 97
.b8 103
.b8 104
.b8 101
.b8 53
.b8 119
.b8 108
.b8 118
.b8 51
.b8 46
.b8 112
.b8 121
.b8 0
.b32 .debug_line // DW_AT_stmt_list
.b8 47 // DW_AT_comp_dir
.b8 116
.b8 109
.b8 112
.b8 47
.b8 116
.b8 111
.b8 114
.b8 99
.b8 104
.b8 105
.b8 110
.b8 100
.b8 117
.b8 99
.b8 116
.b8 111
.b8 114
.b8 95
.b8 115
.b8 104
.b8 97
.b8 110
.b8 103
.b8 100
.b8 105
.b8 121
.b8 47
.b8 117
.b8 119
.b8 0
.b8 2 // Abbrev [2] 0x70:0x28 DW_TAG_subprogram
.b8 109 // DW_AT_name
.b8 111
.b8 100
.b8 101
.b8 108
.b8 95
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 116
.b8 101
.b8 109
.b8 95
.b8 102
.b8 117
.b8 115
.b8 101
.b8 100
.b8 95
.b8 97
.b8 100
.b8 100
.b8 109
.b8 109
.b8 95
.b8 114
.b8 101
.b8 108
.b8 117
.b8 95
.b8 116
.b8 95
.b8 48
.b8 0
.b8 1 // DW_AT_inline
.b8 3 // Abbrev [3] 0x98:0x2e DW_TAG_subprogram
.b64 $L__func_begin0 // DW_AT_low_pc
.b64 $L__func_end0 // DW_AT_high_pc
.b32 112 // DW_AT_abstract_origin
.b8 4 // Abbrev [4] 0xad:0x18 DW_TAG_inlined_subroutine
.b32 112 // DW_AT_abstract_origin
.b64 $L__tmp1 // DW_AT_low_pc
.b64 $L__tmp2 // DW_AT_high_pc
.b8 1 // DW_AT_call_file
.b8 103 // DW_AT_call_line
.b8 40 // DW_AT_call_column
.b8 0 // End Of Children Mark
.b8 0 // End Of Children Mark
}
.section .debug_macinfo { }

View File

@ -0,0 +1,8 @@
SECTIONS {
/* By default, in LLD 16, .lrodata is placed immediately after .rodata.
* However, .lrodata can be very large in our compiled models, which leads to
* relocation out-of-range errors for relative relocations. So we place it
* after the other sections that are referenced from .text using relative
* relocations. This is the default behavior in GNU ld. */
.lrodata : { *(.lrodata) }
} INSERT AFTER .bss;

147
model2/main.cpp Normal file
View File

@ -0,0 +1,147 @@
// Windows for #include <dlfcn.h>
#include <windows.h>
#include <stdio.h>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>
// Include the AOTInductor headers
// #include <torch/csrc/inductor/aoti_runner/model_container_runner_cuda.h>
#include <torch/csrc/inductor/aoti_runtime/interface.h>
// #include <torch/csrc/inductor/aoti_runtime/model_container.h>
// #include <torch/csrc/inductor/aoti_torch/tensor_converter.h> // @manual
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
#include <standalone/slim/core/Empty.h>
#include <standalone/slim/cuda/Guard.h>
#include <standalone/torch/csrc/inductor/aoti_torch/tensor_converter.h>
// Converts a NUL-terminated UTF-8 string into a UTF-16 std::wstring.
//
// The conversion is done in two passes with MultiByteToWideChar: the first
// call (no output buffer) reports how many wide characters are required, the
// second call fills a buffer of exactly that size. Because the input length
// is passed as -1, the reported size covers the terminating NUL as well, so
// one trailing NUL is stripped from the result before it is returned.
static std::wstring u8u16(const char* s) {
  const int required = MultiByteToWideChar(CP_UTF8, 0, s, -1, NULL, 0);

  std::wstring wide(required, L'\0');
  MultiByteToWideChar(CP_UTF8, 0, s, -1, &wide[0], required);

  // Nothing to trim when the buffer is empty or does not end with a NUL.
  if (wide.empty() || wide.back() != L'\0') {
    return wide;
  }
  wide.pop_back();
  return wide;
}
int main() {
try {
// Load the DLL (model.pyd is a DLL on Windows)
HMODULE handle = nullptr;
{
auto wname = u8u16(R"(C:\Users\shangdiy\source\repos\pytorch\model2\model.pyd)");
// Try LoadLibraryExW with safe search flags if supported
if (GetProcAddress(GetModuleHandleW(L"KERNEL32.DLL"), "AddDllDirectory") != NULL) {
handle = LoadLibraryExW(
wname.c_str(),
NULL,
LOAD_LIBRARY_SEARCH_DEFAULT_DIRS);
}
// Fallback if that failed
if (!handle) {
handle = LoadLibraryW(wname.c_str());
}
if (!handle) {
DWORD dw = GetLastError();
char buf[512];
FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
NULL, dw, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
buf, sizeof(buf), NULL);
std::cerr << "Failed to load model.pyd. WinError " << dw << ": " << buf << std::endl;
return 1;
} else {
std::cout << "Loaded model.pyd" << std::endl;
}
}
decltype(&AOTInductorModelContainerCreateWithDevice) create_model{nullptr};
decltype(&AOTInductorModelContainerDelete) delete_model{nullptr};
decltype(&AOTInductorModelContainerRun) run_model{nullptr};
#define AOTI_LOAD_SYMBOL(handle_, var, name_str) \
var = reinterpret_cast<decltype(var)>(GetProcAddress(handle_, name_str)); \
if (!var) { \
throw std::runtime_error("Could not GetProcAddress " name_str); \
}
AOTI_LOAD_SYMBOL(handle, create_model, "AOTInductorModelContainerCreateWithDevice");
AOTI_LOAD_SYMBOL(handle, run_model, "AOTInductorModelContainerRun");
AOTI_LOAD_SYMBOL(handle, delete_model, "AOTInductorModelContainerDelete");
#undef AOTI_LOAD_SYMBOL
// Create array of input/output handles
slim::SlimTensor x = slim::empty({8, 10}, c10::kFloat, c10::Device(c10::kCUDA, 0));
float fill_value = 1.0;
x.fill_(fill_value);
// AOTInductorModel::run will steal the ownership of the input and output
// tensor pointers
std::vector<slim::SlimTensor> inputs = {x};
std::vector<AtenTensorHandle> input_handles =
unsafe_alloc_new_handles_from_tensors(inputs);
AtenTensorHandle output_handle;
AOTInductorModelContainerHandle container_handle;
cudaStream_t stream = slim::cuda::getCurrentCUDAStream(0);
// aoti_torch_get_current_cuda_stream(0, (void**)&stream);
// Reinterpret as the opaque handle for AOTInductor
AOTInductorStreamHandle stream_handle = reinterpret_cast<AOTInductorStreamHandle>(stream);
// Construct model
const char* cubin_dir = R"(C:\Users\shangdiy\source\repos\pytorch\model2\)";
AOTIRuntimeError err =
create_model(&container_handle, 1, "cuda", cubin_dir);
if (err != AOTI_RUNTIME_SUCCESS) {
throw std::runtime_error("Failed to create model container");
} else {
std::cout << "Created model\n";
}
// Run the model
err = run_model(container_handle, input_handles.data(),
1, // num_inputs
&output_handle,
1, // num_outputs
stream_handle, // stream
nullptr // proxy_executor
);
if (err != AOTI_RUNTIME_SUCCESS) {
throw std::runtime_error("Failed to run model");
} else {
std::cout << "Finish model\n";
}
std::vector<slim::SlimTensor> outputs =
alloc_tensors_by_stealing_from_handles(&output_handle, 1);
// Print the result
slim::SlimTensor slim_tensor = outputs[0];
auto slim_cpu = slim_tensor.cpu();
float *slim_data = static_cast<float *>(slim_cpu.data_ptr());
std::cout << "Output" << std::endl;
std::cout << "slim_data ptr: " << slim_data << "\n";
size_t num_elements = slim_cpu.numel(); // or equivalent method
std::cout << num_elements << std::endl;
for (size_t i = 0; i < num_elements; ++i) {
std::cout << slim_data[i] << "\n";
}
std::cout << "Done" << std::endl;
delete_model(container_handle);
FreeLibrary(handle);
return 0;
} catch (const std::exception &e) {
std::cerr << "Error: " << e.what() << std::endl;
return 1;
}
}

BIN
model2/model.exp Normal file

Binary file not shown.

Binary file not shown.