mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-26 00:24:53 +08:00
Compare commits
1 Commits
ciflow/tru
...
windows_li
| Author | SHA1 | Date | |
|---|---|---|---|
| 8cd74b302f |
1
model2/.data/serialization_id
Normal file
1
model2/.data/serialization_id
Normal file
@ -0,0 +1 @@
|
||||
1171719005974771805808300960005001569062
|
||||
1
model2/.data/version
Normal file
1
model2/.data/version
Normal file
@ -0,0 +1 @@
|
||||
6
|
||||
33
model2/CMakeLists.txt
Normal file
33
model2/CMakeLists.txt
Normal file
@ -0,0 +1,33 @@
|
||||
# Build script for `aoti_example`, a host executable built from main.cpp that
# links against TorchStandalone and the CUDA runtime/driver libraries.
#
# The compiler options below use MSVC (cl.exe) syntax, so this project is
# expected to be configured with an MSVC-compatible toolchain.
#
# Typical usage (from an empty build directory):
#   cmake -DTorchStandalone_DIR="<torchstandalone_install>/lib/cmake/TorchStandalone" ..
#   cmake --build . --config Release
cmake_minimum_required(VERSION 3.27 FATAL_ERROR)

# CUDA is enabled together with CXX so every target sees a consistent set of
# languages (previously enable_language(CUDA) was called after the target).
project(aoti_example LANGUAGES CXX CUDA)

# Single source of truth for the language level. The option list used to carry
# a raw /std:c++20 while CMAKE_CXX_STANDARD and the target property both said
# 17, so the effective standard depended on flag ordering in the generator.
# C++20 is kept because it is what the explicit compiler flag requested.
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 17)

find_package(TorchStandalone REQUIRED)
find_package(CUDAToolkit REQUIRED)

add_executable(aoti_example main.cpp)

# Headers are looked up both from the install include root and from its
# `standalone` subdirectory.
target_include_directories(aoti_example PRIVATE
  ${TorchStandalone_INCLUDE_DIRS}
  ${TorchStandalone_INCLUDE_DIRS}/standalone
)

target_compile_definitions(aoti_example PRIVATE
  NOMINMAX
  USE_CUDA
)

# MSVC compile flags.
# NOTE: `/DLL` was removed from this list. It is a linker option; on the cl.exe
# command line it is parsed as `/D LL` (defines a macro named LL), and it has
# no meaning for an executable in any case.
target_compile_options(aoti_example PRIVATE
  /O2
  /MD
  /wd4819
  /wd4251
  /wd4244
  /wd4267
  /wd4275
  /wd4018
  /wd4190
  /wd4624
  /wd4067
  /wd4068
  /EHsc
  /Zc:__cplusplus
  /permissive-
  /openmp
  /openmp:experimental
)

# Link through FindCUDAToolkit's imported targets instead of the bare name
# `cuda`, so the driver import library is found by full path rather than
# depending on the linker search path.
target_link_libraries(aoti_example PRIVATE
  ${TorchStandalone_LIBRARIES}  # provided by find_package(TorchStandalone)
  CUDA::cudart                  # CUDA runtime library
  CUDA::cuda_driver             # CUDA driver API library
)
|
||||
1
model2/archive_format
Normal file
1
model2/archive_format
Normal file
@ -0,0 +1 @@
|
||||
pt2
|
||||
1
model2/archive_version
Normal file
1
model2/archive_version
Normal file
@ -0,0 +1 @@
|
||||
0
|
||||
1
model2/byteorder
Normal file
1
model2/byteorder
Normal file
@ -0,0 +1 @@
|
||||
little
|
||||
69
model2/data/aotinductor/model/CMakeLists.txt
Normal file
69
model2/data/aotinductor/model/CMakeLists.txt
Normal file
@ -0,0 +1,69 @@
|
||||
|
||||
# Build script for the AOTInductor-generated `model` shared library.
# This first part declares the target, its include paths, definitions, MSVC
# compile flags and export list; sources and GPU kernels are attached below.
cmake_minimum_required(VERSION 3.27 FATAL_ERROR)

# CUDA is enabled up front, together with CXX, rather than after the target
# has been created.
project(model LANGUAGES CXX CUDA)

# Single source of truth for the language level. The option list used to carry
# a raw /std:c++20 while CMAKE_CXX_STANDARD said 17, so the effective standard
# depended on flag ordering in the generator. C++20 is kept because it is what
# the explicit compiler flag requested.
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 17)

# The library target; its sources are added with target_sources() further down.
add_library(model SHARED)

find_package(TorchStandalone REQUIRED)
find_package(CUDAToolkit REQUIRED)

# Headers are looked up both from the install include root and from its
# `standalone` subdirectory.
target_include_directories(model PRIVATE
  ${TorchStandalone_INCLUDE_DIRS}
  ${TorchStandalone_INCLUDE_DIRS}/standalone
)

# Macro definitions.
target_compile_definitions(model PRIVATE
  NOMINMAX
  TORCH_INDUCTOR_CPP_WRAPPER
  STANDALONE_TORCH_HEADER
  C10_USING_CUSTOM_GENERATED_MACROS
  USE_CUDA
)

# MSVC compile flags.
# NOTE: `/DLL` was removed from this list. It is a linker option; on the cl.exe
# command line it is parsed as `/D LL` (defines a macro named LL). CMake already
# links SHARED libraries as DLLs.
target_compile_options(model PRIVATE
  /O2
  /MD
  /wd4819
  /wd4251
  /wd4244
  /wd4267
  /wd4275
  /wd4018
  /wd4190
  /wd4624
  /wd4067
  /wd4068
  /EHsc
  /Zc:__cplusplus
  /permissive-
  /openmp
  /openmp:experimental
)

# Export list. Listing the module-definition file as a source lets CMake pass
# it to the linker with correct quoting (the previous raw LINK_FLAGS string
# broke on source paths containing spaces) and relinks when the file changes.
target_sources(model PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/model_exports.def")

# Make the output use .pyd instead of .dll.
set_target_properties(model PROPERTIES SUFFIX ".pyd")

# Accumulators filled in while registering GPU kernels below.
set(KERNEL_TARGETS "")
set(KERNEL_OBJECT_FILES "")
|
||||
# embed_gpu_kernel(<kernel_name> <ptx_file> [<sm_arch>])
#
# Registers a build rule that compiles <ptx_file> into
# ${CMAKE_CURRENT_BINARY_DIR}/<kernel_name>.cubin with nvcc, plus a custom
# target `build_kernel_object_<kernel_name>` that drives that rule.
#
# Arguments:
#   kernel_name - base name used for the .cubin file and the custom target.
#   ptx_file    - path to the PTX input; the rule re-runs when it changes.
#   sm_arch     - optional SM architecture number; defaults to 89, the value
#                 that used to be hard-coded here.
#
# Side effect: appends the new target name to KERNEL_TARGETS in the caller's
# scope. Requires find_package(CUDAToolkit) to have run before the call.
#
# Example:
#   embed_gpu_kernel(my_kernel ${CMAKE_CURRENT_SOURCE_DIR}/my_kernel.ptx)
function(embed_gpu_kernel KERNEL_NAME PTX_FILE)
  if(ARGC GREATER 2)
    set(sm_arch "${ARGV2}")
  else()
    set(sm_arch 89)
  endif()

  # Fail at configure time instead of emitting a rule with an empty command.
  if(NOT CUDAToolkit_NVCC_EXECUTABLE)
    message(FATAL_ERROR
      "embed_gpu_kernel: nvcc not found; call find_package(CUDAToolkit REQUIRED) first")
  endif()

  set(cubin_file "${CMAKE_CURRENT_BINARY_DIR}/${KERNEL_NAME}.cubin")

  # --- PTX to CUBIN command ---
  # NVCC_GENCODE_FLAGS is not set anywhere in this file; it is passed through
  # unchanged so a caller that defines it keeps the previous behaviour.
  add_custom_command(
    OUTPUT "${cubin_file}"
    COMMAND "${CUDAToolkit_NVCC_EXECUTABLE}" --cubin "${PTX_FILE}"
            -o "${cubin_file}" ${NVCC_GENCODE_FLAGS}
            -gencode "arch=compute_${sm_arch},code=sm_${sm_arch}"
    DEPENDS "${PTX_FILE}"
    COMMENT "Compiling ${KERNEL_NAME}.cubin from PTX"
    VERBATIM
  )

  # Named target so other targets can order themselves after the cubin build.
  add_custom_target(build_kernel_object_${KERNEL_NAME} DEPENDS "${cubin_file}")

  list(APPEND KERNEL_TARGETS build_kernel_object_${KERNEL_NAME})
  set(KERNEL_TARGETS "${KERNEL_TARGETS}" PARENT_SCOPE)
endfunction()
|
||||
# Generated C++ sources: the model wrapper and the embedded constant weights.
target_sources(model PRIVATE
  "${CMAKE_CURRENT_SOURCE_DIR}/model.wrapper.cpp"
  "${CMAKE_CURRENT_SOURCE_DIR}/model_consts.weights.cpp"
)

# One cubin build rule per Triton kernel shipped as PTX next to this file.
embed_gpu_kernel(model_triton_tem_fused_addmm_relu_t_0 ${CMAKE_CURRENT_SOURCE_DIR}/model_triton_tem_fused_addmm_relu_t_0.ptx)
embed_gpu_kernel(model_triton_tem_fused_addmm_relu_sigmoid_t_1 ${CMAKE_CURRENT_SOURCE_DIR}/model_triton_tem_fused_addmm_relu_sigmoid_t_1.ptx)

# add_dependencies() only orders the build: the cubin files are produced
# whenever `model` is built, but they are not linked into it. The former
# `target_link_libraries(model PRIVATE ${KERNEL_OBJECT_FILES})` and empty
# `target_link_options(model PRIVATE )` calls were dropped because
# KERNEL_OBJECT_FILES is never populated, so both expanded to nothing.
if(KERNEL_TARGETS)
  add_dependencies(model ${KERNEL_TARGETS})
endif()

# Libraries. The CUDA driver and runtime are linked through FindCUDAToolkit's
# imported targets instead of the bare name `cuda`, so they are found by full
# path rather than depending on the linker search path.
target_link_libraries(model PRIVATE
  ${TorchStandalone_LIBRARIES}
  CUDA::cuda_driver
  CUDA::cudart
)
|
||||
1041
model2/data/aotinductor/model/model.wrapper.cpp
Normal file
1041
model2/data/aotinductor/model/model.wrapper.cpp
Normal file
File diff suppressed because it is too large
Load Diff
@ -0,0 +1 @@
|
||||
{"compiler": "/home/shangdiy/miniconda3/envs/pytorch-3.10/bin/x86_64-conda-linux-gnu-c++", "definitions": ["NOMINMAX", "TORCH_INDUCTOR_CPP_WRAPPER", "STANDALONE_TORCH_HEADER", " C10_USING_CUSTOM_GENERATED_MACROS", "CPU_CAPABILITY_AVX512", " USE_CUDA"], "include_dirs": ["/home/shangdiy/miniconda3/envs/pytorch-3.10/include/python3.10", "/home/shangdiy/miniconda3/envs/pytorch-3.10/Include", "/home/shangdiy/pytorch/torch/include", "/home/shangdiy/pytorch/torch/include/torch/csrc/api/include", "/usr/local/cuda-12/include"], "cflags": ["O2", "DLL", "MD", "std:c++20", "wd4819", "wd4251", "wd4244", "wd4267", "wd4275", "wd4018", "wd4190", "wd4624", "wd4067", "wd4068", "EHsc", "Zc:__cplusplus", "permissive-", "openmp", "openmp:experimental"], "ldflags": [], "libraries_dirs": [], "libraries": [], "passthrough_args": ["", "-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -mavx512bf16"], "aot_mode": true, "use_relative_path": false, "compile_only": true}
|
||||
@ -0,0 +1 @@
|
||||
{"compiler": "/home/shangdiy/miniconda3/envs/pytorch-3.10/bin/x86_64-conda-linux-gnu-c++", "definitions": ["NOMINMAX", "TORCH_INDUCTOR_CPP_WRAPPER", "STANDALONE_TORCH_HEADER", " C10_USING_CUSTOM_GENERATED_MACROS", "CPU_CAPABILITY_AVX512", " USE_CUDA"], "include_dirs": ["/home/shangdiy/miniconda3/envs/pytorch-3.10/include/python3.10", "/home/shangdiy/miniconda3/envs/pytorch-3.10/Include", "/home/shangdiy/pytorch/torch/include", "/home/shangdiy/pytorch/torch/include/torch/csrc/api/include", "/usr/local/cuda-12/include"], "cflags": ["O2", "DLL", "MD", "std:c++20", "wd4819", "wd4251", "wd4244", "wd4267", "wd4275", "wd4018", "wd4190", "wd4624", "wd4067", "wd4068", "EHsc", "Zc:__cplusplus", "permissive-", "openmp", "openmp:experimental"], "ldflags": [], "libraries_dirs": ["/home/shangdiy/miniconda3/envs/pytorch-3.10/libs", "/home/shangdiy/pytorch/torch/lib", "/usr/local/cuda-12.8/targets/x86_64-linux/lib", "/usr/local/cuda-12.8/targets/x86_64-linux/lib/stubs"], "libraries": ["torch", "torch_cpu", "sleef", "c10", "c10_cuda", "cuda", "torch_cuda"], "passthrough_args": ["", "-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -mavx512bf16"], "aot_mode": true, "use_relative_path": false, "compile_only": false}
|
||||
@ -0,0 +1 @@
|
||||
{"AOTI_DEVICE_KEY": "cuda"}
|
||||
58
model2/data/aotinductor/model/model_consts.weights.cpp
Normal file
58
model2/data/aotinductor/model/model_consts.weights.cpp
Normal file
@ -0,0 +1,58 @@
|
||||
#if defined(__clang__) || defined (__GNUC__)
|
||||
#define ATTRIBUTE_NO_SANITIZE_ADDRESS __attribute__((no_sanitize("address")))
|
||||
#else
|
||||
#define ATTRIBUTE_NO_SANITIZE_ADDRESS
|
||||
#endif
|
||||
|
||||
ATTRIBUTE_NO_SANITIZE_ADDRESS
|
||||
alignas(64) extern unsigned char _binary_constants_bin_start[768] = {
|
||||
69, 165, 123, 190, 252, 181, 16, 62, 242, 69, 193, 59, 44, 80, 100, 62,
|
||||
237, 163, 142, 188, 138, 139, 109, 190, 61, 248, 20, 190, 77, 84, 143, 60,
|
||||
111, 90, 163, 60, 96, 140, 230, 189, 101, 69, 38, 190, 132, 190, 12, 188,
|
||||
28, 113, 159, 190, 252, 128, 154, 62, 247, 234, 217, 61, 206, 79, 58, 61,
|
||||
224, 209, 135, 61, 211, 238, 147, 62, 231, 229, 27, 190, 169, 208, 57, 62,
|
||||
100, 20, 153, 190, 161, 160, 85, 190, 207, 10, 156, 62, 234, 107, 155, 190,
|
||||
188, 85, 116, 62, 27, 211, 114, 60, 94, 21, 158, 189, 147, 210, 34, 62,
|
||||
203, 109, 80, 62, 28, 242, 141, 189, 205, 27, 152, 190, 38, 104, 6, 189,
|
||||
211, 16, 249, 189, 72, 103, 143, 190, 163, 44, 140, 189, 178, 223, 127, 189,
|
||||
5, 112, 160, 189, 177, 55, 132, 190, 218, 22, 159, 62, 115, 30, 35, 190,
|
||||
26, 247, 9, 62, 251, 219, 27, 62, 165, 86, 135, 62, 99, 168, 66, 190,
|
||||
238, 64, 93, 62, 65, 147, 86, 62, 167, 108, 97, 62, 183, 219, 50, 190,
|
||||
138, 83, 106, 62, 90, 122, 208, 189, 149, 140, 161, 188, 44, 145, 194, 189,
|
||||
5, 142, 186, 61, 202, 230, 153, 190, 133, 72, 136, 62, 251, 1, 3, 62,
|
||||
225, 146, 54, 190, 91, 176, 219, 189, 118, 244, 10, 189, 232, 107, 142, 62,
|
||||
185, 6, 151, 62, 241, 137, 223, 61, 124, 100, 114, 190, 15, 240, 168, 189,
|
||||
149, 252, 58, 190, 238, 93, 243, 188, 144, 218, 115, 61, 159, 91, 6, 62,
|
||||
57, 139, 74, 190, 84, 200, 49, 61, 193, 78, 32, 190, 84, 121, 26, 190,
|
||||
219, 39, 115, 190, 171, 127, 94, 62, 248, 253, 177, 61, 63, 18, 127, 62,
|
||||
146, 18, 137, 189, 203, 90, 161, 190, 139, 194, 239, 58, 126, 54, 40, 190,
|
||||
47, 247, 30, 190, 106, 93, 191, 61, 22, 48, 120, 61, 56, 123, 7, 62,
|
||||
150, 229, 210, 189, 118, 231, 158, 188, 7, 98, 215, 60, 72, 251, 89, 190,
|
||||
190, 160, 137, 190, 173, 194, 158, 62, 225, 26, 118, 190, 174, 199, 4, 189,
|
||||
205, 148, 16, 62, 20, 225, 155, 61, 90, 124, 133, 190, 88, 196, 34, 61,
|
||||
26, 104, 51, 190, 149, 106, 40, 62, 25, 136, 177, 60, 169, 111, 138, 190,
|
||||
214, 181, 226, 189, 109, 17, 77, 62, 224, 166, 55, 62, 250, 128, 160, 61,
|
||||
104, 223, 250, 61, 34, 182, 210, 187, 60, 87, 149, 190, 189, 55, 98, 188,
|
||||
58, 86, 85, 190, 170, 43, 132, 190, 81, 220, 87, 190, 47, 226, 138, 62,
|
||||
189, 162, 36, 190, 30, 232, 34, 60, 138, 147, 167, 61, 151, 129, 157, 61,
|
||||
206, 33, 152, 62, 109, 227, 113, 190, 147, 255, 11, 190, 175, 56, 46, 189,
|
||||
46, 238, 1, 189, 123, 159, 85, 188, 14, 126, 148, 189, 226, 226, 169, 189,
|
||||
255, 106, 134, 61, 38, 140, 187, 60, 119, 73, 49, 62, 32, 236, 43, 62,
|
||||
78, 33, 232, 189, 72, 188, 139, 62, 94, 206, 20, 62, 25, 230, 75, 189,
|
||||
171, 239, 26, 190, 136, 218, 121, 62, 96, 115, 85, 62, 126, 92, 55, 190,
|
||||
112, 108, 134, 61, 64, 212, 69, 190, 253, 118, 214, 188, 210, 116, 66, 62,
|
||||
204, 131, 123, 190, 13, 151, 38, 190, 56, 17, 252, 189, 153, 151, 138, 62,
|
||||
21, 30, 216, 61, 146, 103, 32, 62, 140, 60, 78, 62, 183, 149, 174, 61,
|
||||
95, 153, 164, 61, 144, 167, 187, 189, 112, 53, 153, 190, 127, 195, 105, 61,
|
||||
169, 167, 251, 189, 42, 204, 123, 62, 116, 193, 86, 62, 98, 147, 30, 61,
|
||||
176, 138, 137, 62, 245, 244, 17, 62, 201, 90, 140, 62, 177, 110, 77, 62,
|
||||
188, 31, 129, 190, 66, 203, 85, 62, 182, 209, 112, 188, 216, 91, 222, 59,
|
||||
18, 208, 131, 189, 151, 142, 150, 190, 36, 252, 31, 62, 241, 2, 180, 61,
|
||||
83, 240, 159, 62, 37, 152, 115, 190, 13, 52, 107, 62, 169, 178, 148, 62,
|
||||
171, 54, 38, 62, 33, 4, 199, 189, 201, 247, 216, 189, 225, 89, 146, 190,
|
||||
192, 118, 79, 189, 92, 171, 12, 62, 136, 235, 3, 62, 180, 202, 87, 62,
|
||||
8, 129, 122, 61, 160, 75, 170, 188, 20, 84, 6, 62, 60, 194, 56, 190,
|
||||
182, 99, 44, 190, 88, 96, 228, 189, 50, 106, 5, 190, 34, 133, 12, 190,
|
||||
26, 50, 0, 190, 176, 25, 127, 61, 48, 69, 219, 61, 192, 237, 252, 187,
|
||||
};
|
||||
alignas(64) extern unsigned char * _binary_constants_bin_end;
|
||||
6
model2/data/aotinductor/model/model_exports.def
Normal file
6
model2/data/aotinductor/model/model_exports.def
Normal file
@ -0,0 +1,6 @@
|
||||
LIBRARY model
|
||||
EXPORTS
|
||||
AOTInductorModelContainerCreate
|
||||
AOTInductorModelContainerCreateWithDevice
|
||||
AOTInductorModelContainerRun
|
||||
AOTInductorModelContainerDelete
|
||||
Binary file not shown.
@ -0,0 +1,684 @@
|
||||
//
|
||||
// Generated by LLVM NVPTX Back-End
|
||||
//
|
||||
|
||||
.version 8.7
|
||||
.target sm_89
|
||||
.address_size 64
|
||||
|
||||
// .globl model_triton_tem_fused_addmm_relu_sigmoid_t_1 // -- Begin function model_triton_tem_fused_addmm_relu_sigmoid_t_1
|
||||
.extern .shared .align 16 .b8 global_smem[];
|
||||
// @model_triton_tem_fused_addmm_relu_sigmoid_t_1
|
||||
.visible .entry model_triton_tem_fused_addmm_relu_sigmoid_t_1(
|
||||
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_0,
|
||||
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_1,
|
||||
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_2,
|
||||
.param .u32 model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_3,
|
||||
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_4
|
||||
)
|
||||
.reqntid 32
|
||||
{
|
||||
.reg .pred %p<12>;
|
||||
.reg .b32 %r<375>;
|
||||
.reg .b64 %rd<27>;
|
||||
.loc 1 18 0 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:18:0
|
||||
$L__func_begin0:
|
||||
.loc 1 18 0 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:18:0
|
||||
|
||||
// %bb.0:
|
||||
ld.param.b32 %r1, [model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_3];
|
||||
$L__tmp0:
|
||||
.loc 1 34 16 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:34:16
|
||||
setp.ne.s32 %p1, %r1, 0;
|
||||
@%p1 bra $L__BB0_2;
|
||||
bra.uni $L__BB0_1;
|
||||
$L__BB0_2:
|
||||
.loc 1 0 16 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:0:16
|
||||
ld.param.b64 %rd3, [model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_2];
|
||||
ld.param.b64 %rd2, [model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_1];
|
||||
ld.param.b64 %rd1, [model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_0];
|
||||
.loc 1 43 24 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:43:24
|
||||
mov.u32 %r26, %ctaid.x;
|
||||
.loc 1 44 28 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:44:28
|
||||
add.s32 %r27, %r1, 15;
|
||||
.loc 1 44 34 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:44:34
|
||||
shr.s32 %r28, %r27, 31;
|
||||
shr.u32 %r29, %r28, 28;
|
||||
add.s32 %r30, %r27, %r29;
|
||||
shr.s32 %r31, %r30, 4;
|
||||
.loc 1 50 41 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:50:41
|
||||
and.b32 %r32, %r26, 2147483640;
|
||||
.loc 1 50 30 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:50:30
|
||||
sub.s32 %r33, %r31, %r32;
|
||||
.loc 1 50 50 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:50:50
|
||||
min.s32 %r34, %r33, 8;
|
||||
.loc 1 51 40 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:51:40
|
||||
rem.s32 %r35, %r26, %r34;
|
||||
.loc 1 51 34 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:51:34
|
||||
add.s32 %r36, %r35, %r32;
|
||||
.loc 1 52 19 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:52:19
|
||||
and.b32 %r37, %r26, 7;
|
||||
.loc 1 52 30 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:52:30
|
||||
div.s32 %r38, %r37, %r34;
|
||||
.loc 1 56 17 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:56:17
|
||||
shl.b32 %r39, %r36, 4;
|
||||
.loc 1 56 40 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:56:40
|
||||
mov.u32 %r40, %tid.x;
|
||||
shr.u32 %r41, %r40, 2;
|
||||
and.b32 %r42, %r41, 2;
|
||||
bfe.u32 %r43, %r40, 2, 2;
|
||||
and.b32 %r44, %r40, 16;
|
||||
shr.u32 %r45, %r44, 2;
|
||||
or.b32 %r46, %r43, %r45;
|
||||
and.b32 %r47, %r40, 15;
|
||||
bfe.u32 %r48, %r40, 4, 1;
|
||||
.loc 1 56 27 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:56:27
|
||||
or.b32 %r49, %r39, %r46;
|
||||
or.b32 %r50, %r49, 8;
|
||||
or.b32 %r51, %r39, %r47;
|
||||
.loc 1 0 0 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:0
|
||||
rem.s32 %r52, %r50, %r1;
|
||||
rem.s32 %r53, %r49, %r1;
|
||||
.loc 1 71 30 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:71:30
|
||||
shl.b32 %r54, %r40, 2;
|
||||
and.b32 %r55, %r54, 12;
|
||||
.loc 1 76 28 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:76:28
|
||||
shl.b32 %r56, %r53, 4;
|
||||
shl.b32 %r57, %r52, 4;
|
||||
.loc 1 76 25 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:76:25
|
||||
or.b32 %r58, %r56, %r55;
|
||||
or.b32 %r59, %r57, %r55;
|
||||
.loc 1 77 25 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:77:25
|
||||
mul.wide.s32 %rd16, %r58, 4;
|
||||
add.s64 %rd4, %rd1, %rd16;
|
||||
mul.wide.s32 %rd17, %r59, 4;
|
||||
add.s64 %rd5, %rd1, %rd17;
|
||||
.loc 1 77 20 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:77:20
|
||||
// begin inline asm
|
||||
mov.u32 %r2, 0x0;
|
||||
mov.u32 %r3, 0x0;
|
||||
mov.u32 %r4, 0x0;
|
||||
mov.u32 %r5, 0x0;
|
||||
ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd4 + 0 ];
|
||||
// end inline asm
|
||||
// begin inline asm
|
||||
mov.u32 %r6, 0x0;
|
||||
mov.u32 %r7, 0x0;
|
||||
mov.u32 %r8, 0x0;
|
||||
mov.u32 %r9, 0x0;
|
||||
ld.global.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd5 + 0 ];
|
||||
// end inline asm
|
||||
shl.b32 %r60, %r46, 6;
|
||||
shl.b32 %r61, %r55, 2;
|
||||
or.b32 %r62, %r60, %r61;
|
||||
mov.b32 %r63, global_smem;
|
||||
add.s32 %r64, %r63, %r62;
|
||||
st.shared.v4.b32 [%r64], {%r2, %r3, %r4, %r5};
|
||||
st.shared.v4.b32 [%r64+512], {%r6, %r7, %r8, %r9};
|
||||
.loc 1 82 25 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:82:25
|
||||
mul.wide.u32 %rd18, %r55, 4;
|
||||
add.s64 %rd6, %rd2, %rd18;
|
||||
.loc 1 82 20 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:82:20
|
||||
// begin inline asm
|
||||
mov.u32 %r10, 0x0;
|
||||
mov.u32 %r11, 0x0;
|
||||
mov.u32 %r12, 0x0;
|
||||
mov.u32 %r13, 0x0;
|
||||
ld.global.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd6 + 0 ];
|
||||
// end inline asm
|
||||
// begin inline asm
|
||||
mov.u32 %r14, 0x0;
|
||||
mov.u32 %r15, 0x0;
|
||||
mov.u32 %r16, 0x0;
|
||||
mov.u32 %r17, 0x0;
|
||||
ld.global.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd6 + 0 ];
|
||||
// end inline asm
|
||||
add.s32 %r65, %r63, 1024;
|
||||
add.s32 %r66, %r65, %r62;
|
||||
st.shared.v4.b32 [%r66], {%r10, %r11, %r12, %r13};
|
||||
st.shared.v4.b32 [%r66+512], {%r14, %r15, %r16, %r17};
|
||||
.loc 1 90 17 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:90:17
|
||||
shl.b32 %r67, %r38, 4;
|
||||
.loc 1 90 27 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:90:27
|
||||
or.b32 %r68, %r67, %r48;
|
||||
.loc 1 93 20 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:93:20
|
||||
setp.lt.s32 %p10, %r51, %r1;
|
||||
.loc 1 93 34 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:93:34
|
||||
setp.eq.s32 %p11, %r68, 0;
|
||||
.loc 1 93 26 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:93:26
|
||||
and.pred %p2, %p10, %p11;
|
||||
.loc 1 96 21 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:96:21
|
||||
add.s32 %r69, %r51, %r68;
|
||||
add.s32 %r70, %r69, 2;
|
||||
add.s32 %r71, %r69, 4;
|
||||
add.s32 %r72, %r69, 6;
|
||||
add.s32 %r73, %r69, 8;
|
||||
add.s32 %r74, %r69, 10;
|
||||
add.s32 %r75, %r69, 12;
|
||||
add.s32 %r76, %r69, 14;
|
||||
.loc 1 77 20 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:77:20
|
||||
bar.sync 0;
|
||||
or.b32 %r77, %r42, %r45;
|
||||
shl.b32 %r78, %r77, 6;
|
||||
add.s32 %r79, %r63, %r78;
|
||||
ld.shared.v4.b32 {%r80, %r81, %r82, %r83}, [%r79+112];
|
||||
ld.shared.v4.b32 {%r84, %r85, %r86, %r87}, [%r79+96];
|
||||
ld.shared.v4.b32 {%r88, %r89, %r90, %r91}, [%r79+80];
|
||||
ld.shared.v4.b32 {%r92, %r93, %r94, %r95}, [%r79+64];
|
||||
ld.shared.v4.b32 {%r96, %r97, %r98, %r99}, [%r79+48];
|
||||
ld.shared.v4.b32 {%r100, %r101, %r102, %r103}, [%r79+32];
|
||||
ld.shared.v4.b32 {%r104, %r105, %r106, %r107}, [%r79+16];
|
||||
ld.shared.v4.b32 {%r108, %r109, %r110, %r111}, [%r79];
|
||||
ld.shared.v4.b32 {%r112, %r113, %r114, %r115}, [%r79+624];
|
||||
ld.shared.v4.b32 {%r116, %r117, %r118, %r119}, [%r79+608];
|
||||
ld.shared.v4.b32 {%r120, %r121, %r122, %r123}, [%r79+592];
|
||||
ld.shared.v4.b32 {%r124, %r125, %r126, %r127}, [%r79+576];
|
||||
ld.shared.v4.b32 {%r128, %r129, %r130, %r131}, [%r79+560];
|
||||
ld.shared.v4.b32 {%r132, %r133, %r134, %r135}, [%r79+544];
|
||||
ld.shared.v4.b32 {%r136, %r137, %r138, %r139}, [%r79+528];
|
||||
ld.shared.v4.b32 {%r140, %r141, %r142, %r143}, [%r79+512];
|
||||
.loc 1 82 20 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:82:20
|
||||
shl.b32 %r144, %r40, 1;
|
||||
and.b32 %r145, %r144, 14;
|
||||
shl.b32 %r146, %r145, 6;
|
||||
add.s32 %r147, %r65, %r146;
|
||||
ld.shared.v4.b32 {%r148, %r149, %r150, %r151}, [%r147];
|
||||
ld.shared.v4.b32 {%r152, %r153, %r154, %r155}, [%r147+64];
|
||||
ld.shared.v4.b32 {%r156, %r157, %r158, %r159}, [%r147+16];
|
||||
ld.shared.v4.b32 {%r160, %r161, %r162, %r163}, [%r147+80];
|
||||
ld.shared.v4.b32 {%r164, %r165, %r166, %r167}, [%r147+32];
|
||||
ld.shared.v4.b32 {%r168, %r169, %r170, %r171}, [%r147+96];
|
||||
ld.shared.v4.b32 {%r172, %r173, %r174, %r175}, [%r147+48];
|
||||
ld.shared.v4.b32 {%r176, %r177, %r178, %r179}, [%r147+112];
|
||||
.loc 1 85 25 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:85:25
|
||||
fma.rn.f32 %r180, %r108, %r148, 0fBE0AE428;
|
||||
fma.rn.f32 %r181, %r109, %r149, %r180;
|
||||
fma.rn.f32 %r182, %r110, %r150, %r181;
|
||||
fma.rn.f32 %r183, %r111, %r151, %r182;
|
||||
fma.rn.f32 %r184, %r104, %r156, %r183;
|
||||
fma.rn.f32 %r185, %r105, %r157, %r184;
|
||||
fma.rn.f32 %r186, %r106, %r158, %r185;
|
||||
fma.rn.f32 %r187, %r107, %r159, %r186;
|
||||
fma.rn.f32 %r188, %r100, %r164, %r187;
|
||||
fma.rn.f32 %r189, %r101, %r165, %r188;
|
||||
fma.rn.f32 %r190, %r102, %r166, %r189;
|
||||
fma.rn.f32 %r191, %r103, %r167, %r190;
|
||||
fma.rn.f32 %r192, %r96, %r172, %r191;
|
||||
fma.rn.f32 %r193, %r97, %r173, %r192;
|
||||
fma.rn.f32 %r194, %r98, %r174, %r193;
|
||||
fma.rn.f32 %r195, %r99, %r175, %r194;
|
||||
fma.rn.f32 %r196, %r108, %r152, 0fBE0AE428;
|
||||
fma.rn.f32 %r197, %r109, %r153, %r196;
|
||||
fma.rn.f32 %r198, %r110, %r154, %r197;
|
||||
fma.rn.f32 %r199, %r111, %r155, %r198;
|
||||
fma.rn.f32 %r200, %r104, %r160, %r199;
|
||||
fma.rn.f32 %r201, %r105, %r161, %r200;
|
||||
fma.rn.f32 %r202, %r106, %r162, %r201;
|
||||
fma.rn.f32 %r203, %r107, %r163, %r202;
|
||||
fma.rn.f32 %r204, %r100, %r168, %r203;
|
||||
fma.rn.f32 %r205, %r101, %r169, %r204;
|
||||
fma.rn.f32 %r206, %r102, %r170, %r205;
|
||||
fma.rn.f32 %r207, %r103, %r171, %r206;
|
||||
fma.rn.f32 %r208, %r96, %r176, %r207;
|
||||
fma.rn.f32 %r209, %r97, %r177, %r208;
|
||||
fma.rn.f32 %r210, %r98, %r178, %r209;
|
||||
fma.rn.f32 %r211, %r99, %r179, %r210;
|
||||
fma.rn.f32 %r212, %r92, %r148, 0fBE0AE428;
|
||||
fma.rn.f32 %r213, %r93, %r149, %r212;
|
||||
fma.rn.f32 %r214, %r94, %r150, %r213;
|
||||
fma.rn.f32 %r215, %r95, %r151, %r214;
|
||||
fma.rn.f32 %r216, %r88, %r156, %r215;
|
||||
fma.rn.f32 %r217, %r89, %r157, %r216;
|
||||
fma.rn.f32 %r218, %r90, %r158, %r217;
|
||||
fma.rn.f32 %r219, %r91, %r159, %r218;
|
||||
fma.rn.f32 %r220, %r84, %r164, %r219;
|
||||
fma.rn.f32 %r221, %r85, %r165, %r220;
|
||||
fma.rn.f32 %r222, %r86, %r166, %r221;
|
||||
fma.rn.f32 %r223, %r87, %r167, %r222;
|
||||
fma.rn.f32 %r224, %r80, %r172, %r223;
|
||||
fma.rn.f32 %r225, %r81, %r173, %r224;
|
||||
fma.rn.f32 %r226, %r82, %r174, %r225;
|
||||
fma.rn.f32 %r227, %r83, %r175, %r226;
|
||||
fma.rn.f32 %r228, %r92, %r152, 0fBE0AE428;
|
||||
fma.rn.f32 %r229, %r93, %r153, %r228;
|
||||
fma.rn.f32 %r230, %r94, %r154, %r229;
|
||||
fma.rn.f32 %r231, %r95, %r155, %r230;
|
||||
fma.rn.f32 %r232, %r88, %r160, %r231;
|
||||
fma.rn.f32 %r233, %r89, %r161, %r232;
|
||||
fma.rn.f32 %r234, %r90, %r162, %r233;
|
||||
fma.rn.f32 %r235, %r91, %r163, %r234;
|
||||
fma.rn.f32 %r236, %r84, %r168, %r235;
|
||||
fma.rn.f32 %r237, %r85, %r169, %r236;
|
||||
fma.rn.f32 %r238, %r86, %r170, %r237;
|
||||
fma.rn.f32 %r239, %r87, %r171, %r238;
|
||||
fma.rn.f32 %r240, %r80, %r176, %r239;
|
||||
fma.rn.f32 %r241, %r81, %r177, %r240;
|
||||
fma.rn.f32 %r242, %r82, %r178, %r241;
|
||||
fma.rn.f32 %r243, %r83, %r179, %r242;
|
||||
fma.rn.f32 %r244, %r140, %r148, 0fBE0AE428;
|
||||
fma.rn.f32 %r245, %r141, %r149, %r244;
|
||||
fma.rn.f32 %r246, %r142, %r150, %r245;
|
||||
fma.rn.f32 %r247, %r143, %r151, %r246;
|
||||
fma.rn.f32 %r248, %r136, %r156, %r247;
|
||||
fma.rn.f32 %r249, %r137, %r157, %r248;
|
||||
fma.rn.f32 %r250, %r138, %r158, %r249;
|
||||
fma.rn.f32 %r251, %r139, %r159, %r250;
|
||||
fma.rn.f32 %r252, %r132, %r164, %r251;
|
||||
fma.rn.f32 %r253, %r133, %r165, %r252;
|
||||
fma.rn.f32 %r254, %r134, %r166, %r253;
|
||||
fma.rn.f32 %r255, %r135, %r167, %r254;
|
||||
fma.rn.f32 %r256, %r128, %r172, %r255;
|
||||
fma.rn.f32 %r257, %r129, %r173, %r256;
|
||||
fma.rn.f32 %r258, %r130, %r174, %r257;
|
||||
fma.rn.f32 %r259, %r131, %r175, %r258;
|
||||
fma.rn.f32 %r260, %r140, %r152, 0fBE0AE428;
|
||||
fma.rn.f32 %r261, %r141, %r153, %r260;
|
||||
fma.rn.f32 %r262, %r142, %r154, %r261;
|
||||
fma.rn.f32 %r263, %r143, %r155, %r262;
|
||||
fma.rn.f32 %r264, %r136, %r160, %r263;
|
||||
fma.rn.f32 %r265, %r137, %r161, %r264;
|
||||
fma.rn.f32 %r266, %r138, %r162, %r265;
|
||||
fma.rn.f32 %r267, %r139, %r163, %r266;
|
||||
fma.rn.f32 %r268, %r132, %r168, %r267;
|
||||
fma.rn.f32 %r269, %r133, %r169, %r268;
|
||||
fma.rn.f32 %r270, %r134, %r170, %r269;
|
||||
fma.rn.f32 %r271, %r135, %r171, %r270;
|
||||
fma.rn.f32 %r272, %r128, %r176, %r271;
|
||||
fma.rn.f32 %r273, %r129, %r177, %r272;
|
||||
fma.rn.f32 %r274, %r130, %r178, %r273;
|
||||
fma.rn.f32 %r275, %r131, %r179, %r274;
|
||||
fma.rn.f32 %r276, %r124, %r148, 0fBE0AE428;
|
||||
fma.rn.f32 %r277, %r125, %r149, %r276;
|
||||
fma.rn.f32 %r278, %r126, %r150, %r277;
|
||||
fma.rn.f32 %r279, %r127, %r151, %r278;
|
||||
fma.rn.f32 %r280, %r120, %r156, %r279;
|
||||
fma.rn.f32 %r281, %r121, %r157, %r280;
|
||||
fma.rn.f32 %r282, %r122, %r158, %r281;
|
||||
fma.rn.f32 %r283, %r123, %r159, %r282;
|
||||
fma.rn.f32 %r284, %r116, %r164, %r283;
|
||||
fma.rn.f32 %r285, %r117, %r165, %r284;
|
||||
fma.rn.f32 %r286, %r118, %r166, %r285;
|
||||
fma.rn.f32 %r287, %r119, %r167, %r286;
|
||||
fma.rn.f32 %r288, %r112, %r172, %r287;
|
||||
fma.rn.f32 %r289, %r113, %r173, %r288;
|
||||
fma.rn.f32 %r290, %r114, %r174, %r289;
|
||||
fma.rn.f32 %r291, %r115, %r175, %r290;
|
||||
fma.rn.f32 %r292, %r124, %r152, 0fBE0AE428;
|
||||
fma.rn.f32 %r293, %r125, %r153, %r292;
|
||||
fma.rn.f32 %r294, %r126, %r154, %r293;
|
||||
fma.rn.f32 %r295, %r127, %r155, %r294;
|
||||
fma.rn.f32 %r296, %r120, %r160, %r295;
|
||||
fma.rn.f32 %r297, %r121, %r161, %r296;
|
||||
fma.rn.f32 %r298, %r122, %r162, %r297;
|
||||
fma.rn.f32 %r299, %r123, %r163, %r298;
|
||||
fma.rn.f32 %r300, %r116, %r168, %r299;
|
||||
fma.rn.f32 %r301, %r117, %r169, %r300;
|
||||
fma.rn.f32 %r302, %r118, %r170, %r301;
|
||||
fma.rn.f32 %r303, %r119, %r171, %r302;
|
||||
fma.rn.f32 %r304, %r112, %r176, %r303;
|
||||
fma.rn.f32 %r305, %r113, %r177, %r304;
|
||||
fma.rn.f32 %r306, %r114, %r178, %r305;
|
||||
fma.rn.f32 %r307, %r115, %r179, %r306;
|
||||
mov.b32 %r308, 0f00000000;
|
||||
$L__tmp1:
|
||||
.loc 2 47 30 // standard.py:47:30 @[ cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:99:22 ]
|
||||
sub.f32 %r309, %r308, %r195;
|
||||
sub.f32 %r310, %r308, %r211;
|
||||
sub.f32 %r311, %r308, %r227;
|
||||
sub.f32 %r312, %r308, %r243;
|
||||
sub.f32 %r313, %r308, %r259;
|
||||
sub.f32 %r314, %r308, %r275;
|
||||
sub.f32 %r315, %r308, %r291;
|
||||
sub.f32 %r316, %r308, %r307;
|
||||
.loc 2 47 29 // standard.py:47:29 @[ cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:99:22 ]
|
||||
mul.f32 %r317, %r309, 0f3FB8AA3B;
|
||||
ex2.approx.f32 %r318, %r317;
|
||||
mul.f32 %r319, %r310, 0f3FB8AA3B;
|
||||
ex2.approx.f32 %r320, %r319;
|
||||
mul.f32 %r321, %r311, 0f3FB8AA3B;
|
||||
ex2.approx.f32 %r322, %r321;
|
||||
mul.f32 %r323, %r312, 0f3FB8AA3B;
|
||||
ex2.approx.f32 %r324, %r323;
|
||||
mul.f32 %r325, %r313, 0f3FB8AA3B;
|
||||
ex2.approx.f32 %r326, %r325;
|
||||
mul.f32 %r327, %r314, 0f3FB8AA3B;
|
||||
ex2.approx.f32 %r328, %r327;
|
||||
mul.f32 %r329, %r315, 0f3FB8AA3B;
|
||||
ex2.approx.f32 %r330, %r329;
|
||||
mul.f32 %r331, %r316, 0f3FB8AA3B;
|
||||
ex2.approx.f32 %r332, %r331;
|
||||
.loc 2 47 20 // standard.py:47:20 @[ cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:99:22 ]
|
||||
add.f32 %r333, %r318, 0f3F800000;
|
||||
add.f32 %r334, %r320, 0f3F800000;
|
||||
add.f32 %r335, %r322, 0f3F800000;
|
||||
add.f32 %r336, %r324, 0f3F800000;
|
||||
add.f32 %r337, %r326, 0f3F800000;
|
||||
add.f32 %r338, %r328, 0f3F800000;
|
||||
add.f32 %r339, %r330, 0f3F800000;
|
||||
add.f32 %r340, %r332, 0f3F800000;
|
||||
mov.b32 %r341, 0f3F800000;
|
||||
.loc 2 47 16 // standard.py:47:16 @[ cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:99:22 ]
|
||||
div.full.f32 %r342, %r341, %r333;
|
||||
div.full.f32 %r343, %r341, %r334;
|
||||
div.full.f32 %r344, %r341, %r335;
|
||||
div.full.f32 %r345, %r341, %r336;
|
||||
div.full.f32 %r346, %r341, %r337;
|
||||
div.full.f32 %r347, %r341, %r338;
|
||||
div.full.f32 %r348, %r341, %r339;
|
||||
div.full.f32 %r349, %r341, %r340;
|
||||
$L__tmp2:
|
||||
.loc 1 100 25 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:100:25
|
||||
mul.wide.s32 %rd19, %r69, 4;
|
||||
add.s64 %rd8, %rd3, %rd19;
|
||||
mul.wide.s32 %rd20, %r70, 4;
|
||||
add.s64 %rd9, %rd3, %rd20;
|
||||
mul.wide.s32 %rd21, %r71, 4;
|
||||
add.s64 %rd10, %rd3, %rd21;
|
||||
mul.wide.s32 %rd22, %r72, 4;
|
||||
add.s64 %rd11, %rd3, %rd22;
|
||||
mul.wide.s32 %rd23, %r73, 4;
|
||||
add.s64 %rd12, %rd3, %rd23;
|
||||
mul.wide.s32 %rd24, %r74, 4;
|
||||
add.s64 %rd13, %rd3, %rd24;
|
||||
mul.wide.s32 %rd25, %r75, 4;
|
||||
add.s64 %rd14, %rd3, %rd25;
|
||||
mul.wide.s32 %rd26, %r76, 4;
|
||||
add.s64 %rd15, %rd3, %rd26;
|
||||
.loc 1 100 68 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:100:68
|
||||
bar.sync 0;
|
||||
and.b32 %r350, %r54, 96;
|
||||
or.b32 %r351, %r145, %r350;
|
||||
shl.b32 %r352, %r40, 4;
|
||||
and.b32 %r353, %r352, 240;
|
||||
shr.u32 %r354, %r350, 1;
|
||||
add.s32 %r355, %r63, %r354;
|
||||
shl.b32 %r356, %r351, 2;
|
||||
add.s32 %r357, %r355, %r356;
|
||||
st.shared.v2.b32 [%r357], {%r342, %r343};
|
||||
or.b32 %r358, %r350, 16;
|
||||
shr.u32 %r359, %r358, 1;
|
||||
add.s32 %r360, %r63, %r359;
|
||||
add.s32 %r361, %r360, %r356;
|
||||
st.shared.v2.b32 [%r361+64], {%r344, %r345};
|
||||
or.b32 %r362, %r350, 128;
|
||||
shr.u32 %r363, %r362, 1;
|
||||
add.s32 %r364, %r63, %r363;
|
||||
add.s32 %r365, %r364, %r356;
|
||||
st.shared.v2.b32 [%r365+512], {%r346, %r347};
|
||||
or.b32 %r366, %r350, 144;
|
||||
shr.u32 %r367, %r366, 1;
|
||||
add.s32 %r368, %r63, %r367;
|
||||
add.s32 %r369, %r368, %r356;
|
||||
st.shared.v2.b32 [%r369+576], {%r348, %r349};
|
||||
bar.sync 0;
|
||||
shr.u32 %r370, %r353, 1;
|
||||
add.s32 %r371, %r63, %r370;
|
||||
shl.b32 %r372, %r353, 2;
|
||||
add.s32 %r373, %r371, %r372;
|
||||
add.s32 %r374, %r373, %r45;
|
||||
ld.shared.b32 %r18, [%r374];
|
||||
ld.shared.b32 %r19, [%r374+8];
|
||||
ld.shared.b32 %r20, [%r374+16];
|
||||
ld.shared.b32 %r21, [%r374+24];
|
||||
ld.shared.b32 %r22, [%r374+32];
|
||||
ld.shared.b32 %r23, [%r374+40];
|
||||
ld.shared.b32 %r24, [%r374+48];
|
||||
ld.shared.b32 %r25, [%r374+56];
|
||||
// begin inline asm
|
||||
@%p2 st.global.b32 [ %rd8 + 0 ], { %r18 };
|
||||
// end inline asm
|
||||
mov.pred %p3, 0;
|
||||
// begin inline asm
|
||||
@%p3 st.global.b32 [ %rd9 + 0 ], { %r19 };
|
||||
// end inline asm
|
||||
// begin inline asm
|
||||
@%p3 st.global.b32 [ %rd10 + 0 ], { %r20 };
|
||||
// end inline asm
|
||||
// begin inline asm
|
||||
@%p3 st.global.b32 [ %rd11 + 0 ], { %r21 };
|
||||
// end inline asm
|
||||
// begin inline asm
|
||||
@%p3 st.global.b32 [ %rd12 + 0 ], { %r22 };
|
||||
// end inline asm
|
||||
// begin inline asm
|
||||
@%p3 st.global.b32 [ %rd13 + 0 ], { %r23 };
|
||||
// end inline asm
|
||||
// begin inline asm
|
||||
@%p3 st.global.b32 [ %rd14 + 0 ], { %r24 };
|
||||
// end inline asm
|
||||
// begin inline asm
|
||||
@%p3 st.global.b32 [ %rd15 + 0 ], { %r25 };
|
||||
// end inline asm
|
||||
$L__BB0_1: // %common.ret
|
||||
.loc 1 0 0 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:0
|
||||
ret;
|
||||
$L__tmp3:
|
||||
$L__func_end0:
|
||||
// -- End function
|
||||
}
|
||||
.file 1 "/tmp/torchinductor_shangdiy/q4/cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py"
|
||||
.file 2 "/home/shangdiy/miniconda3/envs/pytorch-3.10/lib/python3.10/site-packages/triton/language/standard.py"
|
||||
.section .debug_abbrev
|
||||
{
|
||||
.b8 1 // Abbreviation Code
|
||||
.b8 17 // DW_TAG_compile_unit
|
||||
.b8 1 // DW_CHILDREN_yes
|
||||
.b8 37 // DW_AT_producer
|
||||
.b8 8 // DW_FORM_string
|
||||
.b8 19 // DW_AT_language
|
||||
.b8 5 // DW_FORM_data2
|
||||
.b8 3 // DW_AT_name
|
||||
.b8 8 // DW_FORM_string
|
||||
.b8 16 // DW_AT_stmt_list
|
||||
.b8 6 // DW_FORM_data4
|
||||
.b8 27 // DW_AT_comp_dir
|
||||
.b8 8 // DW_FORM_string
|
||||
.b8 0 // EOM(1)
|
||||
.b8 0 // EOM(2)
|
||||
.b8 2 // Abbreviation Code
|
||||
.b8 46 // DW_TAG_subprogram
|
||||
.b8 0 // DW_CHILDREN_no
|
||||
.b8 3 // DW_AT_name
|
||||
.b8 8 // DW_FORM_string
|
||||
.b8 32 // DW_AT_inline
|
||||
.b8 11 // DW_FORM_data1
|
||||
.b8 0 // EOM(1)
|
||||
.b8 0 // EOM(2)
|
||||
.b8 3 // Abbreviation Code
|
||||
.b8 46 // DW_TAG_subprogram
|
||||
.b8 1 // DW_CHILDREN_yes
|
||||
.b8 17 // DW_AT_low_pc
|
||||
.b8 1 // DW_FORM_addr
|
||||
.b8 18 // DW_AT_high_pc
|
||||
.b8 1 // DW_FORM_addr
|
||||
.b8 49 // DW_AT_abstract_origin
|
||||
.b8 19 // DW_FORM_ref4
|
||||
.b8 0 // EOM(1)
|
||||
.b8 0 // EOM(2)
|
||||
.b8 4 // Abbreviation Code
|
||||
.b8 29 // DW_TAG_inlined_subroutine
|
||||
.b8 0 // DW_CHILDREN_no
|
||||
.b8 49 // DW_AT_abstract_origin
|
||||
.b8 19 // DW_FORM_ref4
|
||||
.b8 17 // DW_AT_low_pc
|
||||
.b8 1 // DW_FORM_addr
|
||||
.b8 18 // DW_AT_high_pc
|
||||
.b8 1 // DW_FORM_addr
|
||||
.b8 88 // DW_AT_call_file
|
||||
.b8 11 // DW_FORM_data1
|
||||
.b8 89 // DW_AT_call_line
|
||||
.b8 11 // DW_FORM_data1
|
||||
.b8 87 // DW_AT_call_column
|
||||
.b8 11 // DW_FORM_data1
|
||||
.b8 0 // EOM(1)
|
||||
.b8 0 // EOM(2)
|
||||
.b8 0 // EOM(3)
|
||||
}
|
||||
.section .debug_info
|
||||
{
|
||||
.b32 203 // Length of Unit
|
||||
.b8 2 // DWARF version number
|
||||
.b8 0
|
||||
.b32 .debug_abbrev // Offset Into Abbrev. Section
|
||||
.b8 8 // Address Size (in bytes)
|
||||
.b8 1 // Abbrev [1] 0xb:0xc4 DW_TAG_compile_unit
|
||||
.b8 116 // DW_AT_producer
|
||||
.b8 114
|
||||
.b8 105
|
||||
.b8 116
|
||||
.b8 111
|
||||
.b8 110
|
||||
.b8 0
|
||||
.b8 2 // DW_AT_language
|
||||
.b8 0
|
||||
.b8 99 // DW_AT_name
|
||||
.b8 113
|
||||
.b8 52
|
||||
.b8 54
|
||||
.b8 118
|
||||
.b8 111
|
||||
.b8 108
|
||||
.b8 114
|
||||
.b8 122
|
||||
.b8 109
|
||||
.b8 97
|
||||
.b8 54
|
||||
.b8 55
|
||||
.b8 105
|
||||
.b8 110
|
||||
.b8 100
|
||||
.b8 116
|
||||
.b8 119
|
||||
.b8 104
|
||||
.b8 106
|
||||
.b8 53
|
||||
.b8 97
|
||||
.b8 52
|
||||
.b8 110
|
||||
.b8 54
|
||||
.b8 122
|
||||
.b8 114
|
||||
.b8 51
|
||||
.b8 54
|
||||
.b8 55
|
||||
.b8 103
|
||||
.b8 113
|
||||
.b8 112
|
||||
.b8 107
|
||||
.b8 99
|
||||
.b8 119
|
||||
.b8 111
|
||||
.b8 50
|
||||
.b8 104
|
||||
.b8 108
|
||||
.b8 112
|
||||
.b8 104
|
||||
.b8 121
|
||||
.b8 110
|
||||
.b8 104
|
||||
.b8 106
|
||||
.b8 116
|
||||
.b8 104
|
||||
.b8 110
|
||||
.b8 51
|
||||
.b8 117
|
||||
.b8 107
|
||||
.b8 46
|
||||
.b8 112
|
||||
.b8 121
|
||||
.b8 0
|
||||
.b32 .debug_line // DW_AT_stmt_list
|
||||
.b8 47 // DW_AT_comp_dir
|
||||
.b8 116
|
||||
.b8 109
|
||||
.b8 112
|
||||
.b8 47
|
||||
.b8 116
|
||||
.b8 111
|
||||
.b8 114
|
||||
.b8 99
|
||||
.b8 104
|
||||
.b8 105
|
||||
.b8 110
|
||||
.b8 100
|
||||
.b8 117
|
||||
.b8 99
|
||||
.b8 116
|
||||
.b8 111
|
||||
.b8 114
|
||||
.b8 95
|
||||
.b8 115
|
||||
.b8 104
|
||||
.b8 97
|
||||
.b8 110
|
||||
.b8 103
|
||||
.b8 100
|
||||
.b8 105
|
||||
.b8 121
|
||||
.b8 47
|
||||
.b8 113
|
||||
.b8 52
|
||||
.b8 0
|
||||
.b8 2 // Abbrev [2] 0x70:0x30 DW_TAG_subprogram
|
||||
.b8 109 // DW_AT_name
|
||||
.b8 111
|
||||
.b8 100
|
||||
.b8 101
|
||||
.b8 108
|
||||
.b8 95
|
||||
.b8 116
|
||||
.b8 114
|
||||
.b8 105
|
||||
.b8 116
|
||||
.b8 111
|
||||
.b8 110
|
||||
.b8 95
|
||||
.b8 116
|
||||
.b8 101
|
||||
.b8 109
|
||||
.b8 95
|
||||
.b8 102
|
||||
.b8 117
|
||||
.b8 115
|
||||
.b8 101
|
||||
.b8 100
|
||||
.b8 95
|
||||
.b8 97
|
||||
.b8 100
|
||||
.b8 100
|
||||
.b8 109
|
||||
.b8 109
|
||||
.b8 95
|
||||
.b8 114
|
||||
.b8 101
|
||||
.b8 108
|
||||
.b8 117
|
||||
.b8 95
|
||||
.b8 115
|
||||
.b8 105
|
||||
.b8 103
|
||||
.b8 109
|
||||
.b8 111
|
||||
.b8 105
|
||||
.b8 100
|
||||
.b8 95
|
||||
.b8 116
|
||||
.b8 95
|
||||
.b8 49
|
||||
.b8 0
|
||||
.b8 1 // DW_AT_inline
|
||||
.b8 3 // Abbrev [3] 0xa0:0x2e DW_TAG_subprogram
|
||||
.b64 $L__func_begin0 // DW_AT_low_pc
|
||||
.b64 $L__func_end0 // DW_AT_high_pc
|
||||
.b32 112 // DW_AT_abstract_origin
|
||||
.b8 4 // Abbrev [4] 0xb5:0x18 DW_TAG_inlined_subroutine
|
||||
.b32 112 // DW_AT_abstract_origin
|
||||
.b64 $L__tmp1 // DW_AT_low_pc
|
||||
.b64 $L__tmp2 // DW_AT_high_pc
|
||||
.b8 1 // DW_AT_call_file
|
||||
.b8 99 // DW_AT_call_line
|
||||
.b8 22 // DW_AT_call_column
|
||||
.b8 0 // End Of Children Mark
|
||||
.b8 0 // End Of Children Mark
|
||||
}
|
||||
.section .debug_macinfo { }
|
||||
Binary file not shown.
@ -0,0 +1,727 @@
|
||||
//
|
||||
// Generated by LLVM NVPTX Back-End
|
||||
//
|
||||
|
||||
.version 8.7
|
||||
.target sm_89
|
||||
.address_size 64
|
||||
|
||||
// .globl model_triton_tem_fused_addmm_relu_t_0 // -- Begin function model_triton_tem_fused_addmm_relu_t_0
|
||||
.extern .shared .align 16 .b8 global_smem[];
|
||||
// @model_triton_tem_fused_addmm_relu_t_0
|
||||
.visible .entry model_triton_tem_fused_addmm_relu_t_0(
|
||||
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_t_0_param_0,
|
||||
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_t_0_param_1,
|
||||
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_t_0_param_2,
|
||||
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_t_0_param_3,
|
||||
.param .u32 model_triton_tem_fused_addmm_relu_t_0_param_4,
|
||||
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_t_0_param_5
|
||||
)
|
||||
.reqntid 32
|
||||
{
|
||||
.reg .pred %p<27>;
|
||||
.reg .b32 %r<398>;
|
||||
.reg .b64 %rd<29>;
|
||||
.loc 1 18 0 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:18:0
|
||||
$L__func_begin0:
|
||||
.loc 1 18 0 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:18:0
|
||||
|
||||
// %bb.0:
|
||||
ld.param.b32 %r1, [model_triton_tem_fused_addmm_relu_t_0_param_4];
|
||||
$L__tmp0:
|
||||
.loc 1 34 16 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:34:16
|
||||
and.b32 %r2, %r1, 268435455;
|
||||
setp.ne.s32 %p1, %r2, 0;
|
||||
@%p1 bra $L__BB0_2;
|
||||
bra.uni $L__BB0_1;
|
||||
$L__BB0_2:
|
||||
.loc 1 0 16 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:0:16
|
||||
ld.param.b64 %rd4, [model_triton_tem_fused_addmm_relu_t_0_param_3];
|
||||
ld.param.b64 %rd3, [model_triton_tem_fused_addmm_relu_t_0_param_2];
|
||||
ld.param.b64 %rd2, [model_triton_tem_fused_addmm_relu_t_0_param_1];
|
||||
ld.param.b64 %rd1, [model_triton_tem_fused_addmm_relu_t_0_param_0];
|
||||
.loc 1 43 24 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:43:24
|
||||
mov.u32 %r51, %ctaid.x;
|
||||
.loc 1 44 28 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:44:28
|
||||
add.s32 %r52, %r1, 15;
|
||||
.loc 1 44 34 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:44:34
|
||||
shr.s32 %r53, %r52, 31;
|
||||
shr.u32 %r54, %r53, 28;
|
||||
add.s32 %r55, %r52, %r54;
|
||||
shr.s32 %r56, %r55, 4;
|
||||
.loc 1 50 41 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:50:41
|
||||
and.b32 %r57, %r51, 2147483640;
|
||||
.loc 1 50 30 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:50:30
|
||||
sub.s32 %r58, %r56, %r57;
|
||||
.loc 1 50 50 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:50:50
|
||||
min.s32 %r59, %r58, 8;
|
||||
.loc 1 51 40 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:51:40
|
||||
rem.s32 %r60, %r51, %r59;
|
||||
.loc 1 51 34 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:51:34
|
||||
add.s32 %r61, %r60, %r57;
|
||||
.loc 1 52 19 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:52:19
|
||||
and.b32 %r62, %r51, 7;
|
||||
.loc 1 52 30 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:52:30
|
||||
div.s32 %r63, %r62, %r59;
|
||||
.loc 1 56 17 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:56:17
|
||||
shl.b32 %r64, %r61, 4;
|
||||
.loc 1 56 40 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:56:40
|
||||
mov.u32 %r65, %tid.x;
|
||||
and.b32 %r66, %r65, 4;
|
||||
bfe.u32 %r67, %r65, 2, 1;
|
||||
shr.u32 %r68, %r65, 2;
|
||||
and.b32 %r69, %r68, 6;
|
||||
or.b32 %r70, %r69, %r67;
|
||||
bfe.u32 %r71, %r65, 3, 2;
|
||||
and.b32 %r72, %r65, 1;
|
||||
shl.b32 %r73, %r72, 2;
|
||||
shl.b32 %r74, %r65, 2;
|
||||
and.b32 %r75, %r74, 12;
|
||||
or.b32 %r76, %r75, 2;
|
||||
.loc 1 56 27 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:56:27
|
||||
or.b32 %r77, %r64, %r70;
|
||||
or.b32 %r78, %r77, 8;
|
||||
.loc 1 57 17 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:57:17
|
||||
shl.b32 %r79, %r63, 4;
|
||||
.loc 1 57 27 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:57:27
|
||||
or.b32 %r80, %r79, %r75;
|
||||
.loc 1 0 0 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:0
|
||||
rem.s32 %r81, %r78, %r1;
|
||||
rem.s32 %r82, %r77, %r1;
|
||||
.loc 1 71 36 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:71:36
|
||||
setp.lt.u32 %p2, %r75, 10;
|
||||
setp.lt.u32 %p3, %r76, 10;
|
||||
.loc 1 72 24 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:72:24
|
||||
shl.b32 %r83, %r65, 1;
|
||||
and.b32 %r84, %r83, 6;
|
||||
shl.b32 %r85, %r66, 1;
|
||||
or.b32 %r86, %r84, %r85;
|
||||
.loc 1 72 36 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:72:36
|
||||
setp.lt.u32 %p6, %r86, 10;
|
||||
.loc 1 79 28 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:79:28
|
||||
mul.lo.s32 %r87, %r82, 10;
|
||||
mul.lo.s32 %r88, %r81, 10;
|
||||
.loc 1 79 25 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:79:25
|
||||
add.s32 %r89, %r87, %r75;
|
||||
add.s32 %r90, %r87, %r76;
|
||||
add.s32 %r91, %r88, %r75;
|
||||
add.s32 %r92, %r88, %r76;
|
||||
.loc 1 80 25 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:80:25
|
||||
mul.wide.s32 %rd21, %r89, 4;
|
||||
add.s64 %rd5, %rd1, %rd21;
|
||||
mul.wide.s32 %rd22, %r90, 4;
|
||||
add.s64 %rd6, %rd1, %rd22;
|
||||
mul.wide.s32 %rd23, %r91, 4;
|
||||
add.s64 %rd7, %rd1, %rd23;
|
||||
mul.wide.s32 %rd24, %r92, 4;
|
||||
add.s64 %rd8, %rd1, %rd24;
|
||||
mov.b32 %r5, 0;
|
||||
.loc 1 80 20 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:80:20
|
||||
// begin inline asm
|
||||
mov.u32 %r3, %r5;
|
||||
mov.u32 %r4, %r5;
|
||||
@%p2 ld.global.v2.b32 { %r3, %r4 }, [ %rd5 + 0 ];
|
||||
// end inline asm
|
||||
// begin inline asm
|
||||
mov.u32 %r7, %r5;
|
||||
mov.u32 %r8, %r5;
|
||||
@%p3 ld.global.v2.b32 { %r7, %r8 }, [ %rd6 + 0 ];
|
||||
// end inline asm
|
||||
// begin inline asm
|
||||
mov.u32 %r11, %r5;
|
||||
mov.u32 %r12, %r5;
|
||||
@%p2 ld.global.v2.b32 { %r11, %r12 }, [ %rd7 + 0 ];
|
||||
// end inline asm
|
||||
// begin inline asm
|
||||
mov.u32 %r15, %r5;
|
||||
mov.u32 %r16, %r5;
|
||||
@%p3 ld.global.v2.b32 { %r15, %r16 }, [ %rd8 + 0 ];
|
||||
// end inline asm
|
||||
shl.b32 %r93, %r75, 2;
|
||||
mov.b32 %r94, global_smem;
|
||||
add.s32 %r95, %r94, %r93;
|
||||
shl.b32 %r96, %r70, 6;
|
||||
add.s32 %r97, %r95, %r96;
|
||||
st.shared.v4.b32 [%r97], {%r3, %r4, %r7, %r8};
|
||||
st.shared.v4.b32 [%r97+512], {%r11, %r12, %r15, %r16};
|
||||
.loc 1 85 50 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:50
|
||||
mad.lo.s32 %r98, %r71, 10, %r86;
|
||||
.loc 1 85 25 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:25
|
||||
mul.wide.u32 %rd25, %r98, 4;
|
||||
add.s64 %rd9, %rd2, %rd25;
|
||||
add.s64 %rd10, %rd9, 160;
|
||||
add.s64 %rd11, %rd9, 320;
|
||||
add.s64 %rd12, %rd9, 480;
|
||||
.loc 1 85 20 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:20
|
||||
// begin inline asm
|
||||
mov.u32 %r19, %r5;
|
||||
mov.u32 %r20, %r5;
|
||||
@%p6 ld.global.v2.b32 { %r19, %r20 }, [ %rd9 + 0 ];
|
||||
// end inline asm
|
||||
// begin inline asm
|
||||
mov.u32 %r23, %r5;
|
||||
mov.u32 %r24, %r5;
|
||||
@%p6 ld.global.v2.b32 { %r23, %r24 }, [ %rd10 + 0 ];
|
||||
// end inline asm
|
||||
// begin inline asm
|
||||
mov.u32 %r27, %r5;
|
||||
mov.u32 %r28, %r5;
|
||||
@%p6 ld.global.v2.b32 { %r27, %r28 }, [ %rd11 + 0 ];
|
||||
// end inline asm
|
||||
// begin inline asm
|
||||
mov.u32 %r31, %r5;
|
||||
mov.u32 %r32, %r5;
|
||||
@%p6 ld.global.v2.b32 { %r31, %r32 }, [ %rd12 + 0 ];
|
||||
// end inline asm
|
||||
shl.b32 %r99, %r86, 2;
|
||||
add.s32 %r100, %r94, 1024;
|
||||
add.s32 %r101, %r100, %r99;
|
||||
shl.b32 %r102, %r71, 6;
|
||||
add.s32 %r103, %r101, %r102;
|
||||
st.shared.v2.b32 [%r103], {%r19, %r20};
|
||||
st.shared.v2.b32 [%r103+256], {%r23, %r24};
|
||||
st.shared.v2.b32 [%r103+512], {%r27, %r28};
|
||||
st.shared.v2.b32 [%r103+768], {%r31, %r32};
|
||||
.loc 1 96 20 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:96:20
|
||||
setp.lt.s32 %p14, %r77, %r1;
|
||||
setp.lt.s32 %p15, %r78, %r1;
|
||||
.loc 1 96 34 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:96:34
|
||||
setp.eq.s32 %p16, %r63, 0;
|
||||
.loc 1 96 26 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:96:26
|
||||
and.pred %p10, %p16, %p14;
|
||||
and.pred %p11, %p15, %p16;
|
||||
.loc 1 100 30 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:100:30
|
||||
mul.wide.u32 %rd26, %r80, 4;
|
||||
add.s64 %rd14, %rd3, %rd26;
|
||||
.loc 1 100 66 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:100:66
|
||||
// begin inline asm
|
||||
mov.u64 %rd15, 0x0;
|
||||
createpolicy.fractional.L2::evict_last.b64 %rd15, 1.0;
|
||||
// end inline asm
|
||||
// begin inline asm
|
||||
mov.u32 %r35, 0x0;
|
||||
mov.u32 %r36, 0x0;
|
||||
mov.u32 %r37, 0x0;
|
||||
mov.u32 %r38, 0x0;
|
||||
@%p10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r35, %r36, %r37, %r38 }, [ %rd14 + 0 ], %rd15;
|
||||
// end inline asm
|
||||
// begin inline asm
|
||||
mov.u64 %rd18, 0x0;
|
||||
createpolicy.fractional.L2::evict_last.b64 %rd18, 1.0;
|
||||
// end inline asm
|
||||
// begin inline asm
|
||||
mov.u32 %r39, 0x0;
|
||||
mov.u32 %r40, 0x0;
|
||||
mov.u32 %r41, 0x0;
|
||||
mov.u32 %r42, 0x0;
|
||||
@%p11 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r39, %r40, %r41, %r42 }, [ %rd14 + 0 ], %rd18;
|
||||
// end inline asm
|
||||
.loc 1 80 20 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:80:20
|
||||
bar.sync 0;
|
||||
shl.b32 %r104, %r69, 6;
|
||||
add.s32 %r105, %r94, %r104;
|
||||
ld.shared.v4.b32 {%r106, %r107, %r108, %r109}, [%r105+48];
|
||||
ld.shared.v4.b32 {%r110, %r111, %r112, %r113}, [%r105+112];
|
||||
ld.shared.v4.b32 {%r114, %r115, %r116, %r117}, [%r105+32];
|
||||
ld.shared.v4.b32 {%r118, %r119, %r120, %r121}, [%r105+96];
|
||||
ld.shared.v4.b32 {%r122, %r123, %r124, %r125}, [%r105+16];
|
||||
ld.shared.v4.b32 {%r126, %r127, %r128, %r129}, [%r105+80];
|
||||
ld.shared.v4.b32 {%r130, %r131, %r132, %r133}, [%r105];
|
||||
ld.shared.v4.b32 {%r134, %r135, %r136, %r137}, [%r105+64];
|
||||
ld.shared.v4.b32 {%r138, %r139, %r140, %r141}, [%r105+560];
|
||||
ld.shared.v4.b32 {%r142, %r143, %r144, %r145}, [%r105+624];
|
||||
ld.shared.v4.b32 {%r146, %r147, %r148, %r149}, [%r105+544];
|
||||
ld.shared.v4.b32 {%r150, %r151, %r152, %r153}, [%r105+608];
|
||||
ld.shared.v4.b32 {%r154, %r155, %r156, %r157}, [%r105+528];
|
||||
ld.shared.v4.b32 {%r158, %r159, %r160, %r161}, [%r105+592];
|
||||
ld.shared.v4.b32 {%r162, %r163, %r164, %r165}, [%r105+512];
|
||||
ld.shared.v4.b32 {%r166, %r167, %r168, %r169}, [%r105+576];
|
||||
.loc 1 85 20 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:20
|
||||
mad.lo.s32 %r170, %r86, 60, %r101;
|
||||
or.b32 %r171, %r84, 1;
|
||||
or.b32 %r172, %r171, %r85;
|
||||
shl.b32 %r173, %r172, 6;
|
||||
add.s32 %r174, %r100, %r173;
|
||||
ld.shared.b32 %r175, [%r170+16];
|
||||
ld.shared.b32 %r176, [%r174+16];
|
||||
ld.shared.b32 %r177, [%r174+52];
|
||||
ld.shared.b32 %r178, [%r170+60];
|
||||
.loc 1 100 66 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:100:66
|
||||
shr.u32 %r179, %r66, 1;
|
||||
setp.eq.s32 %p17, %r66, 0;
|
||||
bfe.u32 %r180, %r65, 1, 1;
|
||||
or.b32 %r181, %r73, %r180;
|
||||
or.b32 %r182, %r181, %r179;
|
||||
and.b32 %r183, %r65, 24;
|
||||
or.b32 %r184, %r182, %r183;
|
||||
selp.b32 %r185, %r35, %r37, %p17;
|
||||
shfl.sync.idx.b32 %r186, %r185, %r184, 31, -1;
|
||||
selp.b32 %r187, %r36, %r38, %p17;
|
||||
shfl.sync.idx.b32 %r188, %r187, %r184, 31, -1;
|
||||
selp.b32 %r189, %r37, %r35, %p17;
|
||||
xor.b32 %r190, %r184, 4;
|
||||
shfl.sync.idx.b32 %r191, %r189, %r190, 31, -1;
|
||||
selp.b32 %r192, %r38, %r36, %p17;
|
||||
shfl.sync.idx.b32 %r193, %r192, %r190, 31, -1;
|
||||
selp.b32 %r194, %r39, %r41, %p17;
|
||||
shfl.sync.idx.b32 %r195, %r194, %r184, 31, -1;
|
||||
selp.b32 %r196, %r40, %r42, %p17;
|
||||
shfl.sync.idx.b32 %r197, %r196, %r184, 31, -1;
|
||||
selp.b32 %r198, %r41, %r39, %p17;
|
||||
shfl.sync.idx.b32 %r199, %r198, %r190, 31, -1;
|
||||
selp.b32 %r200, %r42, %r40, %p17;
|
||||
shfl.sync.idx.b32 %r201, %r200, %r190, 31, -1;
|
||||
setp.eq.s32 %p18, %r72, 0;
|
||||
.loc 1 85 20 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:20
|
||||
ld.shared.v4.b32 {%r202, %r203, %r204, %r205}, [%r170];
|
||||
ld.shared.v4.b32 {%r206, %r207, %r208, %r209}, [%r174];
|
||||
.loc 1 100 66 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:100:66
|
||||
selp.f32 %r210, %r186, %r191, %p18;
|
||||
selp.f32 %r211, %r188, %r193, %p18;
|
||||
selp.f32 %r212, %r191, %r186, %p18;
|
||||
selp.f32 %r213, %r193, %r188, %p18;
|
||||
selp.f32 %r214, %r195, %r199, %p18;
|
||||
selp.f32 %r215, %r197, %r201, %p18;
|
||||
selp.f32 %r216, %r199, %r195, %p18;
|
||||
selp.f32 %r217, %r201, %r197, %p18;
|
||||
.loc 1 88 25 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:88:25
|
||||
fma.rn.f32 %r218, %r166, %r206, %r217;
|
||||
fma.rn.f32 %r219, %r166, %r202, %r216;
|
||||
fma.rn.f32 %r220, %r162, %r206, %r215;
|
||||
fma.rn.f32 %r221, %r162, %r202, %r214;
|
||||
fma.rn.f32 %r222, %r134, %r206, %r213;
|
||||
fma.rn.f32 %r223, %r134, %r202, %r212;
|
||||
fma.rn.f32 %r224, %r130, %r206, %r211;
|
||||
fma.rn.f32 %r225, %r130, %r202, %r210;
|
||||
fma.rn.f32 %r226, %r131, %r203, %r225;
|
||||
fma.rn.f32 %r227, %r131, %r207, %r224;
|
||||
fma.rn.f32 %r228, %r135, %r203, %r223;
|
||||
fma.rn.f32 %r229, %r135, %r207, %r222;
|
||||
fma.rn.f32 %r230, %r163, %r203, %r221;
|
||||
fma.rn.f32 %r231, %r163, %r207, %r220;
|
||||
fma.rn.f32 %r232, %r167, %r203, %r219;
|
||||
fma.rn.f32 %r233, %r167, %r207, %r218;
|
||||
fma.rn.f32 %r234, %r168, %r208, %r233;
|
||||
fma.rn.f32 %r235, %r168, %r204, %r232;
|
||||
fma.rn.f32 %r236, %r164, %r208, %r231;
|
||||
fma.rn.f32 %r237, %r164, %r204, %r230;
|
||||
fma.rn.f32 %r238, %r136, %r208, %r229;
|
||||
fma.rn.f32 %r239, %r136, %r204, %r228;
|
||||
fma.rn.f32 %r240, %r132, %r208, %r227;
|
||||
fma.rn.f32 %r241, %r132, %r204, %r226;
|
||||
fma.rn.f32 %r242, %r133, %r205, %r241;
|
||||
fma.rn.f32 %r243, %r133, %r209, %r240;
|
||||
fma.rn.f32 %r244, %r137, %r205, %r239;
|
||||
fma.rn.f32 %r245, %r137, %r209, %r238;
|
||||
fma.rn.f32 %r246, %r165, %r205, %r237;
|
||||
fma.rn.f32 %r247, %r165, %r209, %r236;
|
||||
fma.rn.f32 %r248, %r169, %r205, %r235;
|
||||
fma.rn.f32 %r249, %r169, %r209, %r234;
|
||||
fma.rn.f32 %r250, %r158, %r176, %r249;
|
||||
fma.rn.f32 %r251, %r158, %r175, %r248;
|
||||
fma.rn.f32 %r252, %r154, %r176, %r247;
|
||||
fma.rn.f32 %r253, %r154, %r175, %r246;
|
||||
fma.rn.f32 %r254, %r126, %r176, %r245;
|
||||
fma.rn.f32 %r255, %r126, %r175, %r244;
|
||||
fma.rn.f32 %r256, %r122, %r176, %r243;
|
||||
fma.rn.f32 %r257, %r122, %r175, %r242;
|
||||
.loc 1 85 20 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:20
|
||||
ld.shared.b32 %r258, [%r170+24];
|
||||
ld.shared.b32 %r259, [%r170+20];
|
||||
ld.shared.b32 %r260, [%r174+24];
|
||||
ld.shared.b32 %r261, [%r174+20];
|
||||
ld.shared.b32 %r262, [%r170+32];
|
||||
ld.shared.b32 %r263, [%r170+28];
|
||||
ld.shared.b32 %r264, [%r174+32];
|
||||
ld.shared.b32 %r265, [%r174+28];
|
||||
ld.shared.b32 %r266, [%r170+40];
|
||||
ld.shared.b32 %r267, [%r170+36];
|
||||
ld.shared.b32 %r268, [%r174+40];
|
||||
ld.shared.b32 %r269, [%r174+36];
|
||||
ld.shared.b32 %r270, [%r170+48];
|
||||
ld.shared.b32 %r271, [%r170+44];
|
||||
ld.shared.b32 %r272, [%r174+48];
|
||||
ld.shared.b32 %r273, [%r174+44];
|
||||
ld.shared.b32 %r274, [%r170+56];
|
||||
ld.shared.b32 %r275, [%r170+52];
|
||||
ld.shared.v2.b32 {%r276, %r277}, [%r174+56];
|
||||
.loc 1 88 25 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:88:25
|
||||
fma.rn.f32 %r278, %r123, %r259, %r257;
|
||||
fma.rn.f32 %r279, %r123, %r261, %r256;
|
||||
fma.rn.f32 %r280, %r127, %r259, %r255;
|
||||
fma.rn.f32 %r281, %r127, %r261, %r254;
|
||||
fma.rn.f32 %r282, %r155, %r259, %r253;
|
||||
fma.rn.f32 %r283, %r155, %r261, %r252;
|
||||
fma.rn.f32 %r284, %r159, %r259, %r251;
|
||||
fma.rn.f32 %r285, %r159, %r261, %r250;
|
||||
fma.rn.f32 %r286, %r160, %r260, %r285;
|
||||
fma.rn.f32 %r287, %r160, %r258, %r284;
|
||||
fma.rn.f32 %r288, %r156, %r260, %r283;
|
||||
fma.rn.f32 %r289, %r156, %r258, %r282;
|
||||
fma.rn.f32 %r290, %r128, %r260, %r281;
|
||||
fma.rn.f32 %r291, %r128, %r258, %r280;
|
||||
fma.rn.f32 %r292, %r124, %r260, %r279;
|
||||
fma.rn.f32 %r293, %r124, %r258, %r278;
|
||||
fma.rn.f32 %r294, %r125, %r263, %r293;
|
||||
fma.rn.f32 %r295, %r125, %r265, %r292;
|
||||
fma.rn.f32 %r296, %r129, %r263, %r291;
|
||||
fma.rn.f32 %r297, %r129, %r265, %r290;
|
||||
fma.rn.f32 %r298, %r157, %r263, %r289;
|
||||
fma.rn.f32 %r299, %r157, %r265, %r288;
|
||||
fma.rn.f32 %r300, %r161, %r263, %r287;
|
||||
fma.rn.f32 %r301, %r161, %r265, %r286;
|
||||
fma.rn.f32 %r302, %r150, %r264, %r301;
|
||||
fma.rn.f32 %r303, %r150, %r262, %r300;
|
||||
fma.rn.f32 %r304, %r146, %r264, %r299;
|
||||
fma.rn.f32 %r305, %r146, %r262, %r298;
|
||||
fma.rn.f32 %r306, %r118, %r264, %r297;
|
||||
fma.rn.f32 %r307, %r118, %r262, %r296;
|
||||
fma.rn.f32 %r308, %r114, %r264, %r295;
|
||||
fma.rn.f32 %r309, %r114, %r262, %r294;
|
||||
fma.rn.f32 %r310, %r115, %r267, %r309;
|
||||
fma.rn.f32 %r311, %r115, %r269, %r308;
|
||||
fma.rn.f32 %r312, %r119, %r267, %r307;
|
||||
fma.rn.f32 %r313, %r119, %r269, %r306;
|
||||
fma.rn.f32 %r314, %r147, %r267, %r305;
|
||||
fma.rn.f32 %r315, %r147, %r269, %r304;
|
||||
fma.rn.f32 %r316, %r151, %r267, %r303;
|
||||
fma.rn.f32 %r317, %r151, %r269, %r302;
|
||||
fma.rn.f32 %r318, %r152, %r268, %r317;
|
||||
fma.rn.f32 %r319, %r152, %r266, %r316;
|
||||
fma.rn.f32 %r320, %r148, %r268, %r315;
|
||||
fma.rn.f32 %r321, %r148, %r266, %r314;
|
||||
fma.rn.f32 %r322, %r120, %r268, %r313;
|
||||
fma.rn.f32 %r323, %r120, %r266, %r312;
|
||||
fma.rn.f32 %r324, %r116, %r268, %r311;
|
||||
fma.rn.f32 %r325, %r116, %r266, %r310;
|
||||
fma.rn.f32 %r326, %r117, %r271, %r325;
|
||||
fma.rn.f32 %r327, %r117, %r273, %r324;
|
||||
fma.rn.f32 %r328, %r121, %r271, %r323;
|
||||
fma.rn.f32 %r329, %r121, %r273, %r322;
|
||||
fma.rn.f32 %r330, %r149, %r271, %r321;
|
||||
fma.rn.f32 %r331, %r149, %r273, %r320;
|
||||
fma.rn.f32 %r332, %r153, %r271, %r319;
|
||||
fma.rn.f32 %r333, %r153, %r273, %r318;
|
||||
fma.rn.f32 %r334, %r142, %r272, %r333;
|
||||
fma.rn.f32 %r335, %r142, %r270, %r332;
|
||||
fma.rn.f32 %r336, %r138, %r272, %r331;
|
||||
fma.rn.f32 %r337, %r138, %r270, %r330;
|
||||
fma.rn.f32 %r338, %r110, %r272, %r329;
|
||||
fma.rn.f32 %r339, %r110, %r270, %r328;
|
||||
fma.rn.f32 %r340, %r106, %r272, %r327;
|
||||
fma.rn.f32 %r341, %r106, %r270, %r326;
|
||||
fma.rn.f32 %r342, %r107, %r275, %r341;
|
||||
fma.rn.f32 %r343, %r107, %r177, %r340;
|
||||
fma.rn.f32 %r344, %r111, %r275, %r339;
|
||||
fma.rn.f32 %r345, %r111, %r177, %r338;
|
||||
fma.rn.f32 %r346, %r139, %r275, %r337;
|
||||
fma.rn.f32 %r347, %r139, %r177, %r336;
|
||||
fma.rn.f32 %r348, %r143, %r275, %r335;
|
||||
fma.rn.f32 %r349, %r143, %r177, %r334;
|
||||
fma.rn.f32 %r350, %r144, %r276, %r349;
|
||||
fma.rn.f32 %r351, %r144, %r274, %r348;
|
||||
fma.rn.f32 %r352, %r140, %r276, %r347;
|
||||
fma.rn.f32 %r353, %r140, %r274, %r346;
|
||||
fma.rn.f32 %r354, %r112, %r276, %r345;
|
||||
fma.rn.f32 %r355, %r112, %r274, %r344;
|
||||
fma.rn.f32 %r356, %r108, %r276, %r343;
|
||||
fma.rn.f32 %r357, %r108, %r274, %r342;
|
||||
fma.rn.f32 %r358, %r109, %r178, %r357;
|
||||
fma.rn.f32 %r359, %r109, %r277, %r356;
|
||||
fma.rn.f32 %r360, %r113, %r178, %r355;
|
||||
fma.rn.f32 %r361, %r113, %r277, %r354;
|
||||
fma.rn.f32 %r362, %r141, %r178, %r353;
|
||||
fma.rn.f32 %r363, %r141, %r277, %r352;
|
||||
fma.rn.f32 %r364, %r145, %r178, %r351;
|
||||
fma.rn.f32 %r365, %r145, %r277, %r350;
|
||||
$L__tmp1:
|
||||
.loc 2 110 15 // triton_helpers.py:110:15 @[ cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:103:40 ]
|
||||
setp.lt.f32 %p19, %r365, 0f00000000;
|
||||
setp.lt.f32 %p20, %r364, 0f00000000;
|
||||
setp.lt.f32 %p21, %r363, 0f00000000;
|
||||
setp.lt.f32 %p22, %r362, 0f00000000;
|
||||
setp.lt.f32 %p23, %r361, 0f00000000;
|
||||
setp.lt.f32 %p24, %r360, 0f00000000;
|
||||
setp.lt.f32 %p25, %r359, 0f00000000;
|
||||
setp.lt.f32 %p26, %r358, 0f00000000;
|
||||
.loc 2 113 29 // triton_helpers.py:113:29 @[ cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:103:40 ]
|
||||
selp.f32 %r366, 0f00000000, %r358, %p26;
|
||||
selp.f32 %r367, 0f00000000, %r359, %p25;
|
||||
selp.f32 %r368, 0f00000000, %r360, %p24;
|
||||
selp.f32 %r369, 0f00000000, %r361, %p23;
|
||||
selp.f32 %r370, 0f00000000, %r362, %p22;
|
||||
selp.f32 %r371, 0f00000000, %r363, %p21;
|
||||
selp.f32 %r372, 0f00000000, %r364, %p20;
|
||||
selp.f32 %r373, 0f00000000, %r365, %p19;
|
||||
$L__tmp2:
|
||||
.loc 1 104 52 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:104:52
|
||||
shl.b32 %r374, %r77, 4;
|
||||
shl.b32 %r375, %r78, 4;
|
||||
.loc 1 104 49 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:104:49
|
||||
add.s32 %r376, %r374, %r80;
|
||||
add.s32 %r377, %r80, %r375;
|
||||
.loc 1 104 25 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:104:25
|
||||
mul.wide.s32 %rd27, %r376, 4;
|
||||
add.s64 %rd19, %rd4, %rd27;
|
||||
mul.wide.s32 %rd28, %r377, 4;
|
||||
add.s64 %rd20, %rd4, %rd28;
|
||||
.loc 1 104 78 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:104:78
|
||||
selp.f32 %r378, %r366, %r368, %p18;
|
||||
or.b32 %r379, %r183, %r67;
|
||||
or.b32 %r380, %r379, %r84;
|
||||
shfl.sync.idx.b32 %r381, %r378, %r380, 31, -1;
|
||||
selp.f32 %r382, %r367, %r369, %p18;
|
||||
shfl.sync.idx.b32 %r383, %r382, %r380, 31, -1;
|
||||
selp.f32 %r384, %r368, %r366, %p18;
|
||||
xor.b32 %r385, %r171, %r67;
|
||||
or.b32 %r386, %r183, %r385;
|
||||
shfl.sync.idx.b32 %r387, %r384, %r386, 31, -1;
|
||||
selp.f32 %r388, %r369, %r367, %p18;
|
||||
shfl.sync.idx.b32 %r389, %r388, %r386, 31, -1;
|
||||
selp.f32 %r390, %r370, %r372, %p18;
|
||||
shfl.sync.idx.b32 %r391, %r390, %r380, 31, -1;
|
||||
selp.f32 %r392, %r371, %r373, %p18;
|
||||
shfl.sync.idx.b32 %r393, %r392, %r380, 31, -1;
|
||||
selp.f32 %r394, %r372, %r370, %p18;
|
||||
shfl.sync.idx.b32 %r395, %r394, %r386, 31, -1;
|
||||
selp.f32 %r396, %r373, %r371, %p18;
|
||||
shfl.sync.idx.b32 %r397, %r396, %r386, 31, -1;
|
||||
selp.b32 %r45, %r387, %r381, %p17;
|
||||
selp.b32 %r46, %r389, %r383, %p17;
|
||||
selp.b32 %r49, %r395, %r391, %p17;
|
||||
selp.b32 %r50, %r397, %r393, %p17;
|
||||
selp.b32 %r43, %r381, %r387, %p17;
|
||||
selp.b32 %r44, %r383, %r389, %p17;
|
||||
// begin inline asm
|
||||
@%p10 st.global.v4.b32 [ %rd19 + 0 ], { %r43, %r44, %r45, %r46 };
|
||||
// end inline asm
|
||||
selp.b32 %r47, %r391, %r395, %p17;
|
||||
selp.b32 %r48, %r393, %r397, %p17;
|
||||
// begin inline asm
|
||||
@%p11 st.global.v4.b32 [ %rd20 + 0 ], { %r47, %r48, %r49, %r50 };
|
||||
// end inline asm
|
||||
$L__BB0_1: // %common.ret
|
||||
.loc 1 0 0 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:0
|
||||
ret;
|
||||
$L__tmp3:
|
||||
$L__func_end0:
|
||||
// -- End function
|
||||
}
|
||||
.file 1 "/tmp/torchinductor_shangdiy/uw/cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py"
|
||||
.file 2 "/home/shangdiy/pytorch/torch/_inductor/runtime/triton_helpers.py"
|
||||
.section .debug_abbrev
|
||||
{
|
||||
.b8 1 // Abbreviation Code
|
||||
.b8 17 // DW_TAG_compile_unit
|
||||
.b8 1 // DW_CHILDREN_yes
|
||||
.b8 37 // DW_AT_producer
|
||||
.b8 8 // DW_FORM_string
|
||||
.b8 19 // DW_AT_language
|
||||
.b8 5 // DW_FORM_data2
|
||||
.b8 3 // DW_AT_name
|
||||
.b8 8 // DW_FORM_string
|
||||
.b8 16 // DW_AT_stmt_list
|
||||
.b8 6 // DW_FORM_data4
|
||||
.b8 27 // DW_AT_comp_dir
|
||||
.b8 8 // DW_FORM_string
|
||||
.b8 0 // EOM(1)
|
||||
.b8 0 // EOM(2)
|
||||
.b8 2 // Abbreviation Code
|
||||
.b8 46 // DW_TAG_subprogram
|
||||
.b8 0 // DW_CHILDREN_no
|
||||
.b8 3 // DW_AT_name
|
||||
.b8 8 // DW_FORM_string
|
||||
.b8 32 // DW_AT_inline
|
||||
.b8 11 // DW_FORM_data1
|
||||
.b8 0 // EOM(1)
|
||||
.b8 0 // EOM(2)
|
||||
.b8 3 // Abbreviation Code
|
||||
.b8 46 // DW_TAG_subprogram
|
||||
.b8 1 // DW_CHILDREN_yes
|
||||
.b8 17 // DW_AT_low_pc
|
||||
.b8 1 // DW_FORM_addr
|
||||
.b8 18 // DW_AT_high_pc
|
||||
.b8 1 // DW_FORM_addr
|
||||
.b8 49 // DW_AT_abstract_origin
|
||||
.b8 19 // DW_FORM_ref4
|
||||
.b8 0 // EOM(1)
|
||||
.b8 0 // EOM(2)
|
||||
.b8 4 // Abbreviation Code
|
||||
.b8 29 // DW_TAG_inlined_subroutine
|
||||
.b8 0 // DW_CHILDREN_no
|
||||
.b8 49 // DW_AT_abstract_origin
|
||||
.b8 19 // DW_FORM_ref4
|
||||
.b8 17 // DW_AT_low_pc
|
||||
.b8 1 // DW_FORM_addr
|
||||
.b8 18 // DW_AT_high_pc
|
||||
.b8 1 // DW_FORM_addr
|
||||
.b8 88 // DW_AT_call_file
|
||||
.b8 11 // DW_FORM_data1
|
||||
.b8 89 // DW_AT_call_line
|
||||
.b8 11 // DW_FORM_data1
|
||||
.b8 87 // DW_AT_call_column
|
||||
.b8 11 // DW_FORM_data1
|
||||
.b8 0 // EOM(1)
|
||||
.b8 0 // EOM(2)
|
||||
.b8 0 // EOM(3)
|
||||
}
|
||||
.section .debug_info
|
||||
{
|
||||
.b32 195 // Length of Unit
|
||||
.b8 2 // DWARF version number
|
||||
.b8 0
|
||||
.b32 .debug_abbrev // Offset Into Abbrev. Section
|
||||
.b8 8 // Address Size (in bytes)
|
||||
.b8 1 // Abbrev [1] 0xb:0xbc DW_TAG_compile_unit
|
||||
.b8 116 // DW_AT_producer
|
||||
.b8 114
|
||||
.b8 105
|
||||
.b8 116
|
||||
.b8 111
|
||||
.b8 110
|
||||
.b8 0
|
||||
.b8 2 // DW_AT_language
|
||||
.b8 0
|
||||
.b8 99 // DW_AT_name
|
||||
.b8 117
|
||||
.b8 119
|
||||
.b8 104
|
||||
.b8 98
|
||||
.b8 112
|
||||
.b8 101
|
||||
.b8 99
|
||||
.b8 100
|
||||
.b8 50
|
||||
.b8 117
|
||||
.b8 107
|
||||
.b8 101
|
||||
.b8 115
|
||||
.b8 111
|
||||
.b8 51
|
||||
.b8 106
|
||||
.b8 120
|
||||
.b8 101
|
||||
.b8 107
|
||||
.b8 99
|
||||
.b8 122
|
||||
.b8 118
|
||||
.b8 103
|
||||
.b8 104
|
||||
.b8 98
|
||||
.b8 55
|
||||
.b8 97
|
||||
.b8 104
|
||||
.b8 50
|
||||
.b8 104
|
||||
.b8 107
|
||||
.b8 50
|
||||
.b8 122
|
||||
.b8 111
|
||||
.b8 98
|
||||
.b8 54
|
||||
.b8 55
|
||||
.b8 111
|
||||
.b8 112
|
||||
.b8 99
|
||||
.b8 52
|
||||
.b8 51
|
||||
.b8 97
|
||||
.b8 103
|
||||
.b8 104
|
||||
.b8 101
|
||||
.b8 53
|
||||
.b8 119
|
||||
.b8 108
|
||||
.b8 118
|
||||
.b8 51
|
||||
.b8 46
|
||||
.b8 112
|
||||
.b8 121
|
||||
.b8 0
|
||||
.b32 .debug_line // DW_AT_stmt_list
|
||||
.b8 47 // DW_AT_comp_dir
|
||||
.b8 116
|
||||
.b8 109
|
||||
.b8 112
|
||||
.b8 47
|
||||
.b8 116
|
||||
.b8 111
|
||||
.b8 114
|
||||
.b8 99
|
||||
.b8 104
|
||||
.b8 105
|
||||
.b8 110
|
||||
.b8 100
|
||||
.b8 117
|
||||
.b8 99
|
||||
.b8 116
|
||||
.b8 111
|
||||
.b8 114
|
||||
.b8 95
|
||||
.b8 115
|
||||
.b8 104
|
||||
.b8 97
|
||||
.b8 110
|
||||
.b8 103
|
||||
.b8 100
|
||||
.b8 105
|
||||
.b8 121
|
||||
.b8 47
|
||||
.b8 117
|
||||
.b8 119
|
||||
.b8 0
|
||||
.b8 2 // Abbrev [2] 0x70:0x28 DW_TAG_subprogram
|
||||
.b8 109 // DW_AT_name
|
||||
.b8 111
|
||||
.b8 100
|
||||
.b8 101
|
||||
.b8 108
|
||||
.b8 95
|
||||
.b8 116
|
||||
.b8 114
|
||||
.b8 105
|
||||
.b8 116
|
||||
.b8 111
|
||||
.b8 110
|
||||
.b8 95
|
||||
.b8 116
|
||||
.b8 101
|
||||
.b8 109
|
||||
.b8 95
|
||||
.b8 102
|
||||
.b8 117
|
||||
.b8 115
|
||||
.b8 101
|
||||
.b8 100
|
||||
.b8 95
|
||||
.b8 97
|
||||
.b8 100
|
||||
.b8 100
|
||||
.b8 109
|
||||
.b8 109
|
||||
.b8 95
|
||||
.b8 114
|
||||
.b8 101
|
||||
.b8 108
|
||||
.b8 117
|
||||
.b8 95
|
||||
.b8 116
|
||||
.b8 95
|
||||
.b8 48
|
||||
.b8 0
|
||||
.b8 1 // DW_AT_inline
|
||||
.b8 3 // Abbrev [3] 0x98:0x2e DW_TAG_subprogram
|
||||
.b64 $L__func_begin0 // DW_AT_low_pc
|
||||
.b64 $L__func_end0 // DW_AT_high_pc
|
||||
.b32 112 // DW_AT_abstract_origin
|
||||
.b8 4 // Abbrev [4] 0xad:0x18 DW_TAG_inlined_subroutine
|
||||
.b32 112 // DW_AT_abstract_origin
|
||||
.b64 $L__tmp1 // DW_AT_low_pc
|
||||
.b64 $L__tmp2 // DW_AT_high_pc
|
||||
.b8 1 // DW_AT_call_file
|
||||
.b8 103 // DW_AT_call_line
|
||||
.b8 40 // DW_AT_call_column
|
||||
.b8 0 // End Of Children Mark
|
||||
.b8 0 // End Of Children Mark
|
||||
}
|
||||
.section .debug_macinfo { }
|
||||
8
model2/data/aotinductor/model/script.ld
Normal file
8
model2/data/aotinductor/model/script.ld
Normal file
@ -0,0 +1,8 @@
|
||||
SECTIONS {
|
||||
/* By default, in LLD 16, .lrodata is placed immediately after .rodata.
|
||||
* However, .lrodata can be very large in our compiled models, which leads to
|
||||
* relocation out-of-range errors for relative relocations. So we place it
|
||||
* after the other sections that are referenced from .text using relative
|
||||
* relocations. This is the default behavior in GNU ld. */
|
||||
.lrodata : { *(.lrodata) }
|
||||
} INSERT AFTER .bss;
|
||||
147
model2/main.cpp
Normal file
147
model2/main.cpp
Normal file
@ -0,0 +1,147 @@
|
||||
// Win32 API (LoadLibrary/GetProcAddress): the Windows counterpart of <dlfcn.h>
#include <windows.h>
#include <stdio.h>

#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

// Include the AOTInductor headers
// #include <torch/csrc/inductor/aoti_runner/model_container_runner_cuda.h>
#include <torch/csrc/inductor/aoti_runtime/interface.h>
// #include <torch/csrc/inductor/aoti_runtime/model_container.h>
// #include <torch/csrc/inductor/aoti_torch/tensor_converter.h> // @manual
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
#include <standalone/slim/core/Empty.h>
#include <standalone/slim/cuda/Guard.h>
#include <standalone/torch/csrc/inductor/aoti_torch/tensor_converter.h>
|
||||
|
||||
// Converts a NUL-terminated UTF-8 string to a UTF-16 std::wstring using the
// Win32 MultiByteToWideChar API.
//
// Parameters:
//   s - NUL-terminated UTF-8 input; may be null.
// Returns:
//   The converted string without a trailing NUL. An empty string is returned
//   when `s` is null or when either conversion call reports failure.
static std::wstring u8u16(const char* s) {
  if (s == nullptr) {
    return std::wstring();
  }

  // First pass: ask for the required length in wide characters. Because the
  // input length is passed as -1, the count includes the terminating NUL.
  const int len = MultiByteToWideChar(CP_UTF8, 0, s, -1, NULL, 0);
  if (len <= 0) {
    return std::wstring();
  }

  std::wstring wbuf(static_cast<std::wstring::size_type>(len), L'\0');

  // Second pass: perform the conversion. Check the result so that a failed
  // conversion is not returned as a buffer full of embedded NUL characters.
  const int written = MultiByteToWideChar(CP_UTF8, 0, s, -1, &wbuf[0], len);
  if (written <= 0) {
    return std::wstring();
  }
  wbuf.resize(static_cast<std::wstring::size_type>(written));

  // Drop the terminator that was converted along with the payload;
  // std::wstring tracks its own length.
  if (!wbuf.empty() && wbuf.back() == L'\0') {
    wbuf.pop_back();
  }
  return wbuf;
}
|
||||
|
||||
int main() {
|
||||
try {
|
||||
|
||||
// Load the DLL (model.pyd is a DLL on Windows)
|
||||
HMODULE handle = nullptr;
|
||||
{
|
||||
auto wname = u8u16(R"(C:\Users\shangdiy\source\repos\pytorch\model2\model.pyd)");
|
||||
|
||||
// Try LoadLibraryExW with safe search flags if supported
|
||||
if (GetProcAddress(GetModuleHandleW(L"KERNEL32.DLL"), "AddDllDirectory") != NULL) {
|
||||
handle = LoadLibraryExW(
|
||||
wname.c_str(),
|
||||
NULL,
|
||||
LOAD_LIBRARY_SEARCH_DEFAULT_DIRS);
|
||||
}
|
||||
|
||||
// Fallback if that failed
|
||||
if (!handle) {
|
||||
handle = LoadLibraryW(wname.c_str());
|
||||
}
|
||||
|
||||
if (!handle) {
|
||||
DWORD dw = GetLastError();
|
||||
char buf[512];
|
||||
FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
|
||||
NULL, dw, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
|
||||
buf, sizeof(buf), NULL);
|
||||
std::cerr << "Failed to load model.pyd. WinError " << dw << ": " << buf << std::endl;
|
||||
return 1;
|
||||
} else {
|
||||
std::cout << "Loaded model.pyd" << std::endl;
|
||||
}
|
||||
}
|
||||
decltype(&AOTInductorModelContainerCreateWithDevice) create_model{nullptr};
|
||||
decltype(&AOTInductorModelContainerDelete) delete_model{nullptr};
|
||||
decltype(&AOTInductorModelContainerRun) run_model{nullptr};
|
||||
|
||||
|
||||
#define AOTI_LOAD_SYMBOL(handle_, var, name_str) \
|
||||
var = reinterpret_cast<decltype(var)>(GetProcAddress(handle_, name_str)); \
|
||||
if (!var) { \
|
||||
throw std::runtime_error("Could not GetProcAddress " name_str); \
|
||||
}
|
||||
|
||||
AOTI_LOAD_SYMBOL(handle, create_model, "AOTInductorModelContainerCreateWithDevice");
|
||||
AOTI_LOAD_SYMBOL(handle, run_model, "AOTInductorModelContainerRun");
|
||||
AOTI_LOAD_SYMBOL(handle, delete_model, "AOTInductorModelContainerDelete");
|
||||
#undef AOTI_LOAD_SYMBOL
|
||||
|
||||
// Create array of input/output handles
|
||||
slim::SlimTensor x = slim::empty({8, 10}, c10::kFloat, c10::Device(c10::kCUDA, 0));
|
||||
float fill_value = 1.0;
|
||||
x.fill_(fill_value);
|
||||
// AOTInductorModel::run will steal the ownership of the input and output
|
||||
// tensor pointers
|
||||
std::vector<slim::SlimTensor> inputs = {x};
|
||||
std::vector<AtenTensorHandle> input_handles =
|
||||
unsafe_alloc_new_handles_from_tensors(inputs);
|
||||
|
||||
AtenTensorHandle output_handle;
|
||||
AOTInductorModelContainerHandle container_handle;
|
||||
cudaStream_t stream = slim::cuda::getCurrentCUDAStream(0);
|
||||
// aoti_torch_get_current_cuda_stream(0, (void**)&stream);
|
||||
|
||||
// Reinterpret as the opaque handle for AOTInductor
|
||||
AOTInductorStreamHandle stream_handle = reinterpret_cast<AOTInductorStreamHandle>(stream);
|
||||
|
||||
// Construct model
|
||||
const char* cubin_dir = R"(C:\Users\shangdiy\source\repos\pytorch\model2\)";
|
||||
AOTIRuntimeError err =
|
||||
create_model(&container_handle, 1, "cuda", cubin_dir);
|
||||
if (err != AOTI_RUNTIME_SUCCESS) {
|
||||
throw std::runtime_error("Failed to create model container");
|
||||
} else {
|
||||
std::cout << "Created model\n";
|
||||
}
|
||||
|
||||
// Run the model
|
||||
err = run_model(container_handle, input_handles.data(),
|
||||
1, // num_inputs
|
||||
&output_handle,
|
||||
1, // num_outputs
|
||||
stream_handle, // stream
|
||||
nullptr // proxy_executor
|
||||
);
|
||||
if (err != AOTI_RUNTIME_SUCCESS) {
|
||||
throw std::runtime_error("Failed to run model");
|
||||
} else {
|
||||
std::cout << "Finish model\n";
|
||||
}
|
||||
|
||||
std::vector<slim::SlimTensor> outputs =
|
||||
alloc_tensors_by_stealing_from_handles(&output_handle, 1);
|
||||
|
||||
// Print the result
|
||||
slim::SlimTensor slim_tensor = outputs[0];
|
||||
auto slim_cpu = slim_tensor.cpu();
|
||||
float *slim_data = static_cast<float *>(slim_cpu.data_ptr());
|
||||
std::cout << "Output" << std::endl;
|
||||
std::cout << "slim_data ptr: " << slim_data << "\n";
|
||||
size_t num_elements = slim_cpu.numel(); // or equivalent method
|
||||
std::cout << num_elements << std::endl;
|
||||
|
||||
for (size_t i = 0; i < num_elements; ++i) {
|
||||
std::cout << slim_data[i] << "\n";
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
|
||||
delete_model(container_handle);
|
||||
FreeLibrary(handle);
|
||||
|
||||
return 0;
|
||||
} catch (const std::exception &e) {
|
||||
std::cerr << "Error: " << e.what() << std::endl;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
BIN
model2/model.exp
Normal file
BIN
model2/model.exp
Normal file
Binary file not shown.
BIN
model2/model_triton_tem_fused_addmm_relu_sigmoid_t_1.cubin
Normal file
BIN
model2/model_triton_tem_fused_addmm_relu_sigmoid_t_1.cubin
Normal file
Binary file not shown.
BIN
model2/model_triton_tem_fused_addmm_relu_t_0.cubin
Normal file
BIN
model2/model_triton_tem_fused_addmm_relu_t_0.cubin
Normal file
Binary file not shown.
Reference in New Issue
Block a user