mirror of
				https://github.com/pytorch/pytorch.git
				synced 2025-11-04 08:00:58 +08:00 
			
		
		
		
	Compare commits
	
		
			1 Commits
		
	
	
		
			ciflow/ind
			...
			windows_li
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 8cd74b302f | 
							
								
								
									
										1
									
								
								model2/.data/serialization_id
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								model2/.data/serialization_id
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1 @@
 | 
			
		||||
1171719005974771805808300960005001569062
 | 
			
		||||
							
								
								
									
										1
									
								
								model2/.data/version
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								model2/.data/version
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1 @@
 | 
			
		||||
6
 | 
			
		||||
							
								
								
									
										33
									
								
								model2/CMakeLists.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										33
									
								
								model2/CMakeLists.txt
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,33 @@
 | 
			
		||||
cmake_minimum_required(VERSION 3.27 FATAL_ERROR)
project(aoti_example LANGUAGES CXX)

# Build the example as C++20. The standard is requested through CMake instead
# of a raw /std:c++20 flag, so the compiler no longer receives two conflicting
# /std options (the previous setup asked CMake for C++17 and then passed
# /std:c++20 by hand).
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

add_executable(aoti_example main.cpp)

find_package(TorchStandalone REQUIRED)

# Set up include directories to find headers at the correct paths.
# Every entry of TorchStandalone_INCLUDE_DIRS also gets a "/standalone"
# sibling; list(TRANSFORM) keeps this correct when the variable holds more
# than one directory (plain "${var}/standalone" would only extend the last
# entry) and adds nothing when the variable is empty.
set(aoti_example_standalone_include_dirs ${TorchStandalone_INCLUDE_DIRS})
list(TRANSFORM aoti_example_standalone_include_dirs APPEND "/standalone")
target_include_directories(aoti_example PRIVATE
    ${TorchStandalone_INCLUDE_DIRS}
    ${aoti_example_standalone_include_dirs}
)

# CUDA toolkit: provides the CUDA::cudart and CUDA::cuda_driver imported targets.
enable_language(CUDA)
set(CMAKE_CUDA_STANDARD 17)
find_package(CUDAToolkit REQUIRED)

target_compile_definitions(aoti_example PRIVATE NOMINMAX USE_CUDA)

# Add compile flags. All of these are cl.exe-style options, so they are only
# applied when an MSVC-compatible compiler is in use.
if(MSVC)
    # Always link against the release DLL runtime (/MD). Using the target
    # property lets CMake emit the flag itself instead of fighting the
    # per-configuration default it would otherwise add.
    set_target_properties(aoti_example PROPERTIES
        MSVC_RUNTIME_LIBRARY "MultiThreadedDLL"
    )
    target_compile_options(aoti_example PRIVATE
        # /O2 is skipped for Debug because CMake's default MSVC Debug flags
        # include /RTC1, which cl.exe refuses to combine with /O2.
        $<$<NOT:$<CONFIG:Debug>>:/O2>
        /wd4819 /wd4251 /wd4244 /wd4267 /wd4275
        /wd4018 /wd4190 /wd4624 /wd4067 /wd4068
        /EHsc
        /Zc:__cplusplus
        /permissive-
        /openmp
        /openmp:experimental
    )
endif()

target_link_libraries(aoti_example PRIVATE
    ${TorchStandalone_LIBRARIES}  # libraries reported by find_package(TorchStandalone)
    CUDA::cudart                  # CUDA runtime library
    CUDA::cuda_driver             # CUDA driver library (import library cuda.lib on Windows)
)

# Typical usage from a build directory:
#   cmake -DTorchStandalone_DIR="<torchstandalone-install-prefix>/lib/cmake/TorchStandalone" ..
#   cmake --build . --config Release
 | 
			
		||||
							
								
								
									
										1
									
								
								model2/archive_format
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								model2/archive_format
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1 @@
 | 
			
		||||
pt2
 | 
			
		||||
							
								
								
									
										1
									
								
								model2/archive_version
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								model2/archive_version
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1 @@
 | 
			
		||||
0
 | 
			
		||||
							
								
								
									
										1
									
								
								model2/byteorder
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								model2/byteorder
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1 @@
 | 
			
		||||
little
 | 
			
		||||
							
								
								
									
										69
									
								
								model2/data/aotinductor/model/CMakeLists.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										69
									
								
								model2/data/aotinductor/model/CMakeLists.txt
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,69 @@
 | 
			
		||||
 | 
			
		||||
cmake_minimum_required(VERSION 3.27 FATAL_ERROR)
project(model LANGUAGES CXX)

# Compile the generated sources as C++20. The standard is requested through
# CMake rather than a raw /std:c++20 flag so the compiler does not receive a
# /std:c++17 from CMake followed by a hand-written /std:c++20.
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Shared library target; its sources are attached further down with
# target_sources().
add_library(model SHARED)

find_package(TorchStandalone REQUIRED)

# Set up include directories to find headers at the correct paths.
# Each entry of TorchStandalone_INCLUDE_DIRS also gets a "/standalone" sibling;
# list(TRANSFORM) handles multi-entry and empty lists correctly.
set(model_standalone_include_dirs ${TorchStandalone_INCLUDE_DIRS})
list(TRANSFORM model_standalone_include_dirs APPEND "/standalone")
target_include_directories(model PRIVATE
    ${TorchStandalone_INCLUDE_DIRS}
    ${model_standalone_include_dirs}
)

# Add macro definitions.
# NOTE(review): CPU_CAPABILITY_AVX512 is not defined here (it was only present
# as a trailing comment) - confirm whether it should be enabled.
target_compile_definitions(model PRIVATE
    NOMINMAX
    TORCH_INDUCTOR_CPP_WRAPPER
    STANDALONE_TORCH_HEADER
    C10_USING_CUSTOM_GENERATED_MACROS
    USE_CUDA
)

# Add compile flags. These are cl.exe-style options, so they are only applied
# when an MSVC-compatible compiler is in use.
if(MSVC)
    # Always use the release DLL runtime (/MD); let CMake emit the flag.
    set_target_properties(model PROPERTIES
        MSVC_RUNTIME_LIBRARY "MultiThreadedDLL"
    )
    target_compile_options(model PRIVATE
        # /O2 is skipped for Debug because CMake's default MSVC Debug flags
        # include /RTC1, which cl.exe refuses to combine with /O2.
        $<$<NOT:$<CONFIG:Debug>>:/O2>
        /wd4819 /wd4251 /wd4244 /wd4267 /wd4275
        /wd4018 /wd4190 /wd4624 /wd4067 /wd4068
        /EHsc
        /Zc:__cplusplus
        /permissive-
        /openmp
        /openmp:experimental
    )
endif()

# CUDA toolkit: provides nvcc plus the CUDA::cudart / CUDA::cuda_driver targets.
enable_language(CUDA)
set(CMAKE_CUDA_STANDARD 17)
find_package(CUDAToolkit REQUIRED)

# Make output use .pyd instead of .dll.
set_target_properties(model PROPERTIES SUFFIX ".pyd")

# Export exactly the symbols listed in model_exports.def. target_link_options
# passes the option as a single argument, so a source path containing spaces
# no longer splits the /DEF: switch.
if(MSVC)
    target_link_options(model PRIVATE
        "/DEF:${CMAKE_CURRENT_SOURCE_DIR}/model_exports.def"
    )
endif()

# Accumulators filled in below: custom targets that build GPU kernels, and any
# kernel object files that must be linked into the library.
set(KERNEL_TARGETS "")
set(KERNEL_OBJECT_FILES "")
 | 
			
		||||
# embed_gpu_kernel(<kernel_name> <ptx_file> [<sm_arch>])
#
# Adds a build rule that compiles <ptx_file> with nvcc into
# ${CMAKE_CURRENT_BINARY_DIR}/<kernel_name>.cubin, wraps that rule in the
# custom target build_kernel_object_<kernel_name>, and appends the target name
# to KERNEL_TARGETS in the caller's scope.
#
# Arguments:
#   kernel_name - base name for the .cubin output and the custom target.
#   ptx_file    - path of the PTX input file.
#   sm_arch     - optional compute capability number used for the -gencode
#                 option; defaults to 89, the value that was hard-coded before.
#
# Extra nvcc options can be supplied through NVCC_GENCODE_FLAGS (either a
# CMake list or a single space-separated string).
# find_package(CUDAToolkit) must have run before this function is called.
function(embed_gpu_kernel KERNEL_NAME PTX_FILE)
    if(NOT CUDAToolkit_NVCC_EXECUTABLE)
        message(FATAL_ERROR
            "embed_gpu_kernel: CUDAToolkit_NVCC_EXECUTABLE is not set; "
            "call find_package(CUDAToolkit REQUIRED) first")
    endif()

    # Optional third argument selects the GPU architecture.
    if(ARGC GREATER 2)
        set(sm_arch "${ARGV2}")
    else()
        set(sm_arch 89)
    endif()

    # Normalise NVCC_GENCODE_FLAGS into a proper argument list so that each
    # flag stays a separate command-line argument under VERBATIM.
    string(REPLACE ";" " " extra_nvcc_args "${NVCC_GENCODE_FLAGS}")
    separate_arguments(extra_nvcc_args NATIVE_COMMAND "${extra_nvcc_args}")

    set(cubin_file "${CMAKE_CURRENT_BINARY_DIR}/${KERNEL_NAME}.cubin")

    # --- PTX to CUBIN command & target ---
    add_custom_command(
        OUTPUT "${cubin_file}"
        COMMAND "${CUDAToolkit_NVCC_EXECUTABLE}" --cubin "${PTX_FILE}"
                -o "${cubin_file}" ${extra_nvcc_args}
                -gencode "arch=compute_${sm_arch},code=sm_${sm_arch}"
        DEPENDS "${PTX_FILE}"
        COMMENT "Compiling ${KERNEL_NAME} PTX to cubin (sm_${sm_arch})"
        VERBATIM
    )

    add_custom_target(build_kernel_object_${KERNEL_NAME} DEPENDS "${cubin_file}")

    # Hand the new target name back to the caller.
    list(APPEND KERNEL_TARGETS build_kernel_object_${KERNEL_NAME})
    set(KERNEL_TARGETS "${KERNEL_TARGETS}" PARENT_SCOPE)
endfunction()
 | 
			
		||||
# Generated C++ sources: the model wrapper and the embedded constant weights.
target_sources(model PRIVATE
    "${CMAKE_CURRENT_SOURCE_DIR}/model.wrapper.cpp"
    "${CMAKE_CURRENT_SOURCE_DIR}/model_consts.weights.cpp"
)

# One cubin build rule per Triton kernel shipped as PTX next to this file.
embed_gpu_kernel(
    model_triton_tem_fused_addmm_relu_t_0
    "${CMAKE_CURRENT_SOURCE_DIR}/model_triton_tem_fused_addmm_relu_t_0.ptx"
)
embed_gpu_kernel(
    model_triton_tem_fused_addmm_relu_sigmoid_t_1
    "${CMAKE_CURRENT_SOURCE_DIR}/model_triton_tem_fused_addmm_relu_sigmoid_t_1.ptx"
)

# Build the kernels whenever the library is built. add_dependencies() rejects
# an empty dependency list, so only call it when kernels were registered.
if(KERNEL_TARGETS)
    add_dependencies(model ${KERNEL_TARGETS})
endif()

# Link kernel object files, if any were collected.
if(KERNEL_OBJECT_FILES)
    target_link_libraries(model PRIVATE ${KERNEL_OBJECT_FILES})
endif()

# Add libraries: TorchStandalone plus the CUDA driver and runtime. The
# imported CUDA:: targets carry the full library paths, so no link directory
# has to be configured by hand.
target_link_libraries(model PRIVATE
    ${TorchStandalone_LIBRARIES}
    CUDA::cuda_driver
    CUDA::cudart
)
 | 
			
		||||
							
								
								
									
										1041
									
								
								model2/data/aotinductor/model/model.wrapper.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1041
									
								
								model2/data/aotinductor/model/model.wrapper.cpp
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							@ -0,0 +1 @@
 | 
			
		||||
{"compiler": "/home/shangdiy/miniconda3/envs/pytorch-3.10/bin/x86_64-conda-linux-gnu-c++", "definitions": ["NOMINMAX", "TORCH_INDUCTOR_CPP_WRAPPER", "STANDALONE_TORCH_HEADER", " C10_USING_CUSTOM_GENERATED_MACROS", "CPU_CAPABILITY_AVX512", " USE_CUDA"], "include_dirs": ["/home/shangdiy/miniconda3/envs/pytorch-3.10/include/python3.10", "/home/shangdiy/miniconda3/envs/pytorch-3.10/Include", "/home/shangdiy/pytorch/torch/include", "/home/shangdiy/pytorch/torch/include/torch/csrc/api/include", "/usr/local/cuda-12/include"], "cflags": ["O2", "DLL", "MD", "std:c++20", "wd4819", "wd4251", "wd4244", "wd4267", "wd4275", "wd4018", "wd4190", "wd4624", "wd4067", "wd4068", "EHsc", "Zc:__cplusplus", "permissive-", "openmp", "openmp:experimental"], "ldflags": [], "libraries_dirs": [], "libraries": [], "passthrough_args": ["", "-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -mavx512bf16"], "aot_mode": true, "use_relative_path": false, "compile_only": true}
 | 
			
		||||
@ -0,0 +1 @@
 | 
			
		||||
{"compiler": "/home/shangdiy/miniconda3/envs/pytorch-3.10/bin/x86_64-conda-linux-gnu-c++", "definitions": ["NOMINMAX", "TORCH_INDUCTOR_CPP_WRAPPER", "STANDALONE_TORCH_HEADER", " C10_USING_CUSTOM_GENERATED_MACROS", "CPU_CAPABILITY_AVX512", " USE_CUDA"], "include_dirs": ["/home/shangdiy/miniconda3/envs/pytorch-3.10/include/python3.10", "/home/shangdiy/miniconda3/envs/pytorch-3.10/Include", "/home/shangdiy/pytorch/torch/include", "/home/shangdiy/pytorch/torch/include/torch/csrc/api/include", "/usr/local/cuda-12/include"], "cflags": ["O2", "DLL", "MD", "std:c++20", "wd4819", "wd4251", "wd4244", "wd4267", "wd4275", "wd4018", "wd4190", "wd4624", "wd4067", "wd4068", "EHsc", "Zc:__cplusplus", "permissive-", "openmp", "openmp:experimental"], "ldflags": [], "libraries_dirs": ["/home/shangdiy/miniconda3/envs/pytorch-3.10/libs", "/home/shangdiy/pytorch/torch/lib", "/usr/local/cuda-12.8/targets/x86_64-linux/lib", "/usr/local/cuda-12.8/targets/x86_64-linux/lib/stubs"], "libraries": ["torch", "torch_cpu", "sleef", "c10", "c10_cuda", "cuda", "torch_cuda"], "passthrough_args": ["", "-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -mavx512bf16"], "aot_mode": true, "use_relative_path": false, "compile_only": false}
 | 
			
		||||
@ -0,0 +1 @@
 | 
			
		||||
{"AOTI_DEVICE_KEY": "cuda"}
 | 
			
		||||
							
								
								
									
										58
									
								
								model2/data/aotinductor/model/model_consts.weights.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										58
									
								
								model2/data/aotinductor/model/model_consts.weights.cpp
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,58 @@
 | 
			
		||||
// ATTRIBUTE_NO_SANITIZE_ADDRESS expands to the no_sanitize("address")
// attribute when compiling with Clang or GCC, and to nothing on every other
// compiler.
#if !defined(__clang__) && !defined(__GNUC__)
#  define ATTRIBUTE_NO_SANITIZE_ADDRESS
#else
#  define ATTRIBUTE_NO_SANITIZE_ADDRESS __attribute__((no_sanitize("address")))
#endif
 | 
			
		||||
	
 | 
			
		||||
ATTRIBUTE_NO_SANITIZE_ADDRESS	
 | 
			
		||||
alignas(64) extern unsigned char _binary_constants_bin_start[768] = {	
 | 
			
		||||
69, 165, 123, 190, 252, 181, 16, 62, 242, 69, 193, 59, 44, 80, 100, 62, 	
 | 
			
		||||
237, 163, 142, 188, 138, 139, 109, 190, 61, 248, 20, 190, 77, 84, 143, 60, 	
 | 
			
		||||
111, 90, 163, 60, 96, 140, 230, 189, 101, 69, 38, 190, 132, 190, 12, 188, 	
 | 
			
		||||
28, 113, 159, 190, 252, 128, 154, 62, 247, 234, 217, 61, 206, 79, 58, 61, 	
 | 
			
		||||
224, 209, 135, 61, 211, 238, 147, 62, 231, 229, 27, 190, 169, 208, 57, 62, 	
 | 
			
		||||
100, 20, 153, 190, 161, 160, 85, 190, 207, 10, 156, 62, 234, 107, 155, 190, 	
 | 
			
		||||
188, 85, 116, 62, 27, 211, 114, 60, 94, 21, 158, 189, 147, 210, 34, 62, 	
 | 
			
		||||
203, 109, 80, 62, 28, 242, 141, 189, 205, 27, 152, 190, 38, 104, 6, 189, 	
 | 
			
		||||
211, 16, 249, 189, 72, 103, 143, 190, 163, 44, 140, 189, 178, 223, 127, 189, 	
 | 
			
		||||
5, 112, 160, 189, 177, 55, 132, 190, 218, 22, 159, 62, 115, 30, 35, 190, 	
 | 
			
		||||
26, 247, 9, 62, 251, 219, 27, 62, 165, 86, 135, 62, 99, 168, 66, 190, 	
 | 
			
		||||
238, 64, 93, 62, 65, 147, 86, 62, 167, 108, 97, 62, 183, 219, 50, 190, 	
 | 
			
		||||
138, 83, 106, 62, 90, 122, 208, 189, 149, 140, 161, 188, 44, 145, 194, 189, 	
 | 
			
		||||
5, 142, 186, 61, 202, 230, 153, 190, 133, 72, 136, 62, 251, 1, 3, 62, 	
 | 
			
		||||
225, 146, 54, 190, 91, 176, 219, 189, 118, 244, 10, 189, 232, 107, 142, 62, 	
 | 
			
		||||
185, 6, 151, 62, 241, 137, 223, 61, 124, 100, 114, 190, 15, 240, 168, 189, 	
 | 
			
		||||
149, 252, 58, 190, 238, 93, 243, 188, 144, 218, 115, 61, 159, 91, 6, 62, 	
 | 
			
		||||
57, 139, 74, 190, 84, 200, 49, 61, 193, 78, 32, 190, 84, 121, 26, 190, 	
 | 
			
		||||
219, 39, 115, 190, 171, 127, 94, 62, 248, 253, 177, 61, 63, 18, 127, 62, 	
 | 
			
		||||
146, 18, 137, 189, 203, 90, 161, 190, 139, 194, 239, 58, 126, 54, 40, 190, 	
 | 
			
		||||
47, 247, 30, 190, 106, 93, 191, 61, 22, 48, 120, 61, 56, 123, 7, 62, 	
 | 
			
		||||
150, 229, 210, 189, 118, 231, 158, 188, 7, 98, 215, 60, 72, 251, 89, 190, 	
 | 
			
		||||
190, 160, 137, 190, 173, 194, 158, 62, 225, 26, 118, 190, 174, 199, 4, 189, 	
 | 
			
		||||
205, 148, 16, 62, 20, 225, 155, 61, 90, 124, 133, 190, 88, 196, 34, 61, 	
 | 
			
		||||
26, 104, 51, 190, 149, 106, 40, 62, 25, 136, 177, 60, 169, 111, 138, 190, 	
 | 
			
		||||
214, 181, 226, 189, 109, 17, 77, 62, 224, 166, 55, 62, 250, 128, 160, 61, 	
 | 
			
		||||
104, 223, 250, 61, 34, 182, 210, 187, 60, 87, 149, 190, 189, 55, 98, 188, 	
 | 
			
		||||
58, 86, 85, 190, 170, 43, 132, 190, 81, 220, 87, 190, 47, 226, 138, 62, 	
 | 
			
		||||
189, 162, 36, 190, 30, 232, 34, 60, 138, 147, 167, 61, 151, 129, 157, 61, 	
 | 
			
		||||
206, 33, 152, 62, 109, 227, 113, 190, 147, 255, 11, 190, 175, 56, 46, 189, 	
 | 
			
		||||
46, 238, 1, 189, 123, 159, 85, 188, 14, 126, 148, 189, 226, 226, 169, 189, 	
 | 
			
		||||
255, 106, 134, 61, 38, 140, 187, 60, 119, 73, 49, 62, 32, 236, 43, 62, 	
 | 
			
		||||
78, 33, 232, 189, 72, 188, 139, 62, 94, 206, 20, 62, 25, 230, 75, 189, 	
 | 
			
		||||
171, 239, 26, 190, 136, 218, 121, 62, 96, 115, 85, 62, 126, 92, 55, 190, 	
 | 
			
		||||
112, 108, 134, 61, 64, 212, 69, 190, 253, 118, 214, 188, 210, 116, 66, 62, 	
 | 
			
		||||
204, 131, 123, 190, 13, 151, 38, 190, 56, 17, 252, 189, 153, 151, 138, 62, 	
 | 
			
		||||
21, 30, 216, 61, 146, 103, 32, 62, 140, 60, 78, 62, 183, 149, 174, 61, 	
 | 
			
		||||
95, 153, 164, 61, 144, 167, 187, 189, 112, 53, 153, 190, 127, 195, 105, 61, 	
 | 
			
		||||
169, 167, 251, 189, 42, 204, 123, 62, 116, 193, 86, 62, 98, 147, 30, 61, 	
 | 
			
		||||
176, 138, 137, 62, 245, 244, 17, 62, 201, 90, 140, 62, 177, 110, 77, 62, 	
 | 
			
		||||
188, 31, 129, 190, 66, 203, 85, 62, 182, 209, 112, 188, 216, 91, 222, 59, 	
 | 
			
		||||
18, 208, 131, 189, 151, 142, 150, 190, 36, 252, 31, 62, 241, 2, 180, 61, 	
 | 
			
		||||
83, 240, 159, 62, 37, 152, 115, 190, 13, 52, 107, 62, 169, 178, 148, 62, 	
 | 
			
		||||
171, 54, 38, 62, 33, 4, 199, 189, 201, 247, 216, 189, 225, 89, 146, 190, 	
 | 
			
		||||
192, 118, 79, 189, 92, 171, 12, 62, 136, 235, 3, 62, 180, 202, 87, 62, 	
 | 
			
		||||
8, 129, 122, 61, 160, 75, 170, 188, 20, 84, 6, 62, 60, 194, 56, 190, 	
 | 
			
		||||
182, 99, 44, 190, 88, 96, 228, 189, 50, 106, 5, 190, 34, 133, 12, 190, 	
 | 
			
		||||
26, 50, 0, 190, 176, 25, 127, 61, 48, 69, 219, 61, 192, 237, 252, 187, 	
 | 
			
		||||
};	
 | 
			
		||||
// Declaration only: names an external `unsigned char *` called
// _binary_constants_bin_end. No definition or initializer for it appears in
// this file.
// NOTE(review): presumably marks the end of the _binary_constants_bin_start
// payload - confirm where it is defined and how the wrapper uses it.
alignas(64) extern unsigned char * _binary_constants_bin_end;	
 | 
			
		||||
							
								
								
									
										6
									
								
								model2/data/aotinductor/model/model_exports.def
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										6
									
								
								model2/data/aotinductor/model/model_exports.def
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,6 @@
 | 
			
		||||
; Module-definition file for the "model" library.
; It is handed to the linker through the /DEF: option set in the CMake script
; next to it, and names the entry points that are exported.
LIBRARY model

; AOTInductor model-container entry points exported by name.
EXPORTS
    AOTInductorModelContainerCreate
    AOTInductorModelContainerCreateWithDevice
    AOTInductorModelContainerRun
    AOTInductorModelContainerDelete
 | 
			
		||||
										
											Binary file not shown.
										
									
								
							@ -0,0 +1,684 @@
 | 
			
		||||
//
 | 
			
		||||
// Generated by LLVM NVPTX Back-End
 | 
			
		||||
//
 | 
			
		||||
 | 
			
		||||
// PTX ISA version this module is written against.
.version 8.7
 | 
			
		||||
// Target architecture for this module.
.target sm_89
 | 
			
		||||
// Addresses are 64 bits wide.
.address_size 64
 | 
			
		||||
 | 
			
		||||
	// .globl	model_triton_tem_fused_addmm_relu_sigmoid_t_1 // -- Begin function model_triton_tem_fused_addmm_relu_sigmoid_t_1
 | 
			
		||||
// Shared-memory byte array declared extern with no size given here, 16-byte aligned.
.extern .shared .align 16 .b8 global_smem[];
 | 
			
		||||
                                        // @model_triton_tem_fused_addmm_relu_sigmoid_t_1
 | 
			
		||||
.visible .entry model_triton_tem_fused_addmm_relu_sigmoid_t_1(
 | 
			
		||||
	.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_0,
 | 
			
		||||
	.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_1,
 | 
			
		||||
	.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_2,
 | 
			
		||||
	.param .u32 model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_3,
 | 
			
		||||
	.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_4
 | 
			
		||||
)
 | 
			
		||||
.reqntid 32
 | 
			
		||||
{
 | 
			
		||||
	.reg .pred 	%p<12>;
 | 
			
		||||
	.reg .b32 	%r<375>;
 | 
			
		||||
	.reg .b64 	%rd<27>;
 | 
			
		||||
	.loc	1 18 0                          // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:18:0
 | 
			
		||||
$L__func_begin0:
 | 
			
		||||
	.loc	1 18 0                          // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:18:0
 | 
			
		||||
 | 
			
		||||
// %bb.0:
 | 
			
		||||
	ld.param.b32 	%r1, [model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_3];
 | 
			
		||||
$L__tmp0:
 | 
			
		||||
	.loc	1 34 16                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:34:16
 | 
			
		||||
	setp.ne.s32 	%p1, %r1, 0;
 | 
			
		||||
	@%p1 bra 	$L__BB0_2;
 | 
			
		||||
	bra.uni 	$L__BB0_1;
 | 
			
		||||
$L__BB0_2:
 | 
			
		||||
	.loc	1 0 16                          // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:0:16
 | 
			
		||||
	ld.param.b64 	%rd3, [model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_2];
 | 
			
		||||
	ld.param.b64 	%rd2, [model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_1];
 | 
			
		||||
	ld.param.b64 	%rd1, [model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_0];
 | 
			
		||||
	.loc	1 43 24                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:43:24
 | 
			
		||||
	mov.u32 	%r26, %ctaid.x;
 | 
			
		||||
	.loc	1 44 28                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:44:28
 | 
			
		||||
	add.s32 	%r27, %r1, 15;
 | 
			
		||||
	.loc	1 44 34                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:44:34
 | 
			
		||||
	shr.s32 	%r28, %r27, 31;
 | 
			
		||||
	shr.u32 	%r29, %r28, 28;
 | 
			
		||||
	add.s32 	%r30, %r27, %r29;
 | 
			
		||||
	shr.s32 	%r31, %r30, 4;
 | 
			
		||||
	.loc	1 50 41                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:50:41
 | 
			
		||||
	and.b32 	%r32, %r26, 2147483640;
 | 
			
		||||
	.loc	1 50 30                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:50:30
 | 
			
		||||
	sub.s32 	%r33, %r31, %r32;
 | 
			
		||||
	.loc	1 50 50                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:50:50
 | 
			
		||||
	min.s32 	%r34, %r33, 8;
 | 
			
		||||
	.loc	1 51 40                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:51:40
 | 
			
		||||
	rem.s32 	%r35, %r26, %r34;
 | 
			
		||||
	.loc	1 51 34                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:51:34
 | 
			
		||||
	add.s32 	%r36, %r35, %r32;
 | 
			
		||||
	.loc	1 52 19                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:52:19
 | 
			
		||||
	and.b32 	%r37, %r26, 7;
 | 
			
		||||
	.loc	1 52 30                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:52:30
 | 
			
		||||
	div.s32 	%r38, %r37, %r34;
 | 
			
		||||
	.loc	1 56 17                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:56:17
 | 
			
		||||
	shl.b32 	%r39, %r36, 4;
 | 
			
		||||
	.loc	1 56 40                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:56:40
 | 
			
		||||
	mov.u32 	%r40, %tid.x;
 | 
			
		||||
	shr.u32 	%r41, %r40, 2;
 | 
			
		||||
	and.b32 	%r42, %r41, 2;
 | 
			
		||||
	bfe.u32 	%r43, %r40, 2, 2;
 | 
			
		||||
	and.b32 	%r44, %r40, 16;
 | 
			
		||||
	shr.u32 	%r45, %r44, 2;
 | 
			
		||||
	or.b32 	%r46, %r43, %r45;
 | 
			
		||||
	and.b32 	%r47, %r40, 15;
 | 
			
		||||
	bfe.u32 	%r48, %r40, 4, 1;
 | 
			
		||||
	.loc	1 56 27                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:56:27
 | 
			
		||||
	or.b32 	%r49, %r39, %r46;
 | 
			
		||||
	or.b32 	%r50, %r49, 8;
 | 
			
		||||
	or.b32 	%r51, %r39, %r47;
 | 
			
		||||
	.loc	1 0 0                           // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:0
 | 
			
		||||
	rem.s32 	%r52, %r50, %r1;
 | 
			
		||||
	rem.s32 	%r53, %r49, %r1;
 | 
			
		||||
	.loc	1 71 30                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:71:30
 | 
			
		||||
	shl.b32 	%r54, %r40, 2;
 | 
			
		||||
	and.b32 	%r55, %r54, 12;
 | 
			
		||||
	.loc	1 76 28                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:76:28
 | 
			
		||||
	shl.b32 	%r56, %r53, 4;
 | 
			
		||||
	shl.b32 	%r57, %r52, 4;
 | 
			
		||||
	.loc	1 76 25                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:76:25
 | 
			
		||||
	or.b32 	%r58, %r56, %r55;
 | 
			
		||||
	or.b32 	%r59, %r57, %r55;
 | 
			
		||||
	.loc	1 77 25                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:77:25
 | 
			
		||||
	mul.wide.s32 	%rd16, %r58, 4;
 | 
			
		||||
	add.s64 	%rd4, %rd1, %rd16;
 | 
			
		||||
	mul.wide.s32 	%rd17, %r59, 4;
 | 
			
		||||
	add.s64 	%rd5, %rd1, %rd17;
 | 
			
		||||
	.loc	1 77 20                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:77:20
 | 
			
		||||
	// begin inline asm
 | 
			
		||||
	mov.u32 %r2, 0x0;
 | 
			
		||||
	mov.u32 %r3, 0x0;
 | 
			
		||||
	mov.u32 %r4, 0x0;
 | 
			
		||||
	mov.u32 %r5, 0x0;
 | 
			
		||||
	ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd4 + 0 ];
 | 
			
		||||
	// end inline asm
 | 
			
		||||
	// begin inline asm
 | 
			
		||||
	mov.u32 %r6, 0x0;
 | 
			
		||||
	mov.u32 %r7, 0x0;
 | 
			
		||||
	mov.u32 %r8, 0x0;
 | 
			
		||||
	mov.u32 %r9, 0x0;
 | 
			
		||||
	ld.global.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd5 + 0 ];
 | 
			
		||||
	// end inline asm
 | 
			
		||||
	shl.b32 	%r60, %r46, 6;
 | 
			
		||||
	shl.b32 	%r61, %r55, 2;
 | 
			
		||||
	or.b32 	%r62, %r60, %r61;
 | 
			
		||||
	mov.b32 	%r63, global_smem;
 | 
			
		||||
	add.s32 	%r64, %r63, %r62;
 | 
			
		||||
	st.shared.v4.b32 	[%r64], {%r2, %r3, %r4, %r5};
 | 
			
		||||
	st.shared.v4.b32 	[%r64+512], {%r6, %r7, %r8, %r9};
 | 
			
		||||
	.loc	1 82 25                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:82:25
 | 
			
		||||
	mul.wide.u32 	%rd18, %r55, 4;
 | 
			
		||||
	add.s64 	%rd6, %rd2, %rd18;
 | 
			
		||||
	.loc	1 82 20                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:82:20
 | 
			
		||||
	// begin inline asm
 | 
			
		||||
	mov.u32 %r10, 0x0;
 | 
			
		||||
	mov.u32 %r11, 0x0;
 | 
			
		||||
	mov.u32 %r12, 0x0;
 | 
			
		||||
	mov.u32 %r13, 0x0;
 | 
			
		||||
	ld.global.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd6 + 0 ];
 | 
			
		||||
	// end inline asm
 | 
			
		||||
	// begin inline asm
 | 
			
		||||
	mov.u32 %r14, 0x0;
 | 
			
		||||
	mov.u32 %r15, 0x0;
 | 
			
		||||
	mov.u32 %r16, 0x0;
 | 
			
		||||
	mov.u32 %r17, 0x0;
 | 
			
		||||
	ld.global.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd6 + 0 ];
 | 
			
		||||
	// end inline asm
 | 
			
		||||
	add.s32 	%r65, %r63, 1024;
 | 
			
		||||
	add.s32 	%r66, %r65, %r62;
 | 
			
		||||
	st.shared.v4.b32 	[%r66], {%r10, %r11, %r12, %r13};
 | 
			
		||||
	st.shared.v4.b32 	[%r66+512], {%r14, %r15, %r16, %r17};
 | 
			
		||||
	.loc	1 90 17                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:90:17
 | 
			
		||||
	shl.b32 	%r67, %r38, 4;
 | 
			
		||||
	.loc	1 90 27                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:90:27
 | 
			
		||||
	or.b32 	%r68, %r67, %r48;
 | 
			
		||||
	.loc	1 93 20                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:93:20
 | 
			
		||||
	setp.lt.s32 	%p10, %r51, %r1;
 | 
			
		||||
	.loc	1 93 34                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:93:34
 | 
			
		||||
	setp.eq.s32 	%p11, %r68, 0;
 | 
			
		||||
	.loc	1 93 26                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:93:26
 | 
			
		||||
	and.pred 	%p2, %p10, %p11;
 | 
			
		||||
	.loc	1 96 21                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:96:21
 | 
			
		||||
	add.s32 	%r69, %r51, %r68;
 | 
			
		||||
	add.s32 	%r70, %r69, 2;
 | 
			
		||||
	add.s32 	%r71, %r69, 4;
 | 
			
		||||
	add.s32 	%r72, %r69, 6;
 | 
			
		||||
	add.s32 	%r73, %r69, 8;
 | 
			
		||||
	add.s32 	%r74, %r69, 10;
 | 
			
		||||
	add.s32 	%r75, %r69, 12;
 | 
			
		||||
	add.s32 	%r76, %r69, 14;
 | 
			
		||||
	.loc	1 77 20                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:77:20
 | 
			
		||||
	bar.sync 	0;
 | 
			
		||||
	or.b32 	%r77, %r42, %r45;
 | 
			
		||||
	shl.b32 	%r78, %r77, 6;
 | 
			
		||||
	add.s32 	%r79, %r63, %r78;
 | 
			
		||||
	ld.shared.v4.b32 	{%r80, %r81, %r82, %r83}, [%r79+112];
 | 
			
		||||
	ld.shared.v4.b32 	{%r84, %r85, %r86, %r87}, [%r79+96];
 | 
			
		||||
	ld.shared.v4.b32 	{%r88, %r89, %r90, %r91}, [%r79+80];
 | 
			
		||||
	ld.shared.v4.b32 	{%r92, %r93, %r94, %r95}, [%r79+64];
 | 
			
		||||
	ld.shared.v4.b32 	{%r96, %r97, %r98, %r99}, [%r79+48];
 | 
			
		||||
	ld.shared.v4.b32 	{%r100, %r101, %r102, %r103}, [%r79+32];
 | 
			
		||||
	ld.shared.v4.b32 	{%r104, %r105, %r106, %r107}, [%r79+16];
 | 
			
		||||
	ld.shared.v4.b32 	{%r108, %r109, %r110, %r111}, [%r79];
 | 
			
		||||
	ld.shared.v4.b32 	{%r112, %r113, %r114, %r115}, [%r79+624];
 | 
			
		||||
	ld.shared.v4.b32 	{%r116, %r117, %r118, %r119}, [%r79+608];
 | 
			
		||||
	ld.shared.v4.b32 	{%r120, %r121, %r122, %r123}, [%r79+592];
 | 
			
		||||
	ld.shared.v4.b32 	{%r124, %r125, %r126, %r127}, [%r79+576];
 | 
			
		||||
	ld.shared.v4.b32 	{%r128, %r129, %r130, %r131}, [%r79+560];
 | 
			
		||||
	ld.shared.v4.b32 	{%r132, %r133, %r134, %r135}, [%r79+544];
 | 
			
		||||
	ld.shared.v4.b32 	{%r136, %r137, %r138, %r139}, [%r79+528];
 | 
			
		||||
	ld.shared.v4.b32 	{%r140, %r141, %r142, %r143}, [%r79+512];
 | 
			
		||||
	.loc	1 82 20                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:82:20
 | 
			
		||||
	shl.b32 	%r144, %r40, 1;
 | 
			
		||||
	and.b32 	%r145, %r144, 14;
 | 
			
		||||
	shl.b32 	%r146, %r145, 6;
 | 
			
		||||
	add.s32 	%r147, %r65, %r146;
 | 
			
		||||
	ld.shared.v4.b32 	{%r148, %r149, %r150, %r151}, [%r147];
 | 
			
		||||
	ld.shared.v4.b32 	{%r152, %r153, %r154, %r155}, [%r147+64];
 | 
			
		||||
	ld.shared.v4.b32 	{%r156, %r157, %r158, %r159}, [%r147+16];
 | 
			
		||||
	ld.shared.v4.b32 	{%r160, %r161, %r162, %r163}, [%r147+80];
 | 
			
		||||
	ld.shared.v4.b32 	{%r164, %r165, %r166, %r167}, [%r147+32];
 | 
			
		||||
	ld.shared.v4.b32 	{%r168, %r169, %r170, %r171}, [%r147+96];
 | 
			
		||||
	ld.shared.v4.b32 	{%r172, %r173, %r174, %r175}, [%r147+48];
 | 
			
		||||
	ld.shared.v4.b32 	{%r176, %r177, %r178, %r179}, [%r147+112];
 | 
			
		||||
	.loc	1 85 25                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:85:25
 | 
			
		||||
	fma.rn.f32 	%r180, %r108, %r148, 0fBE0AE428;
 | 
			
		||||
	fma.rn.f32 	%r181, %r109, %r149, %r180;
 | 
			
		||||
	fma.rn.f32 	%r182, %r110, %r150, %r181;
 | 
			
		||||
	fma.rn.f32 	%r183, %r111, %r151, %r182;
 | 
			
		||||
	fma.rn.f32 	%r184, %r104, %r156, %r183;
 | 
			
		||||
	fma.rn.f32 	%r185, %r105, %r157, %r184;
 | 
			
		||||
	fma.rn.f32 	%r186, %r106, %r158, %r185;
 | 
			
		||||
	fma.rn.f32 	%r187, %r107, %r159, %r186;
 | 
			
		||||
	fma.rn.f32 	%r188, %r100, %r164, %r187;
 | 
			
		||||
	fma.rn.f32 	%r189, %r101, %r165, %r188;
 | 
			
		||||
	fma.rn.f32 	%r190, %r102, %r166, %r189;
 | 
			
		||||
	fma.rn.f32 	%r191, %r103, %r167, %r190;
 | 
			
		||||
	fma.rn.f32 	%r192, %r96, %r172, %r191;
 | 
			
		||||
	fma.rn.f32 	%r193, %r97, %r173, %r192;
 | 
			
		||||
	fma.rn.f32 	%r194, %r98, %r174, %r193;
 | 
			
		||||
	fma.rn.f32 	%r195, %r99, %r175, %r194;
 | 
			
		||||
	fma.rn.f32 	%r196, %r108, %r152, 0fBE0AE428;
 | 
			
		||||
	fma.rn.f32 	%r197, %r109, %r153, %r196;
 | 
			
		||||
	fma.rn.f32 	%r198, %r110, %r154, %r197;
 | 
			
		||||
	fma.rn.f32 	%r199, %r111, %r155, %r198;
 | 
			
		||||
	fma.rn.f32 	%r200, %r104, %r160, %r199;
 | 
			
		||||
	fma.rn.f32 	%r201, %r105, %r161, %r200;
 | 
			
		||||
	fma.rn.f32 	%r202, %r106, %r162, %r201;
 | 
			
		||||
	fma.rn.f32 	%r203, %r107, %r163, %r202;
 | 
			
		||||
	fma.rn.f32 	%r204, %r100, %r168, %r203;
 | 
			
		||||
	fma.rn.f32 	%r205, %r101, %r169, %r204;
 | 
			
		||||
	fma.rn.f32 	%r206, %r102, %r170, %r205;
 | 
			
		||||
	fma.rn.f32 	%r207, %r103, %r171, %r206;
 | 
			
		||||
	fma.rn.f32 	%r208, %r96, %r176, %r207;
 | 
			
		||||
	fma.rn.f32 	%r209, %r97, %r177, %r208;
 | 
			
		||||
	fma.rn.f32 	%r210, %r98, %r178, %r209;
 | 
			
		||||
	fma.rn.f32 	%r211, %r99, %r179, %r210;
 | 
			
		||||
	fma.rn.f32 	%r212, %r92, %r148, 0fBE0AE428;
 | 
			
		||||
	fma.rn.f32 	%r213, %r93, %r149, %r212;
 | 
			
		||||
	fma.rn.f32 	%r214, %r94, %r150, %r213;
 | 
			
		||||
	fma.rn.f32 	%r215, %r95, %r151, %r214;
 | 
			
		||||
	fma.rn.f32 	%r216, %r88, %r156, %r215;
 | 
			
		||||
	fma.rn.f32 	%r217, %r89, %r157, %r216;
 | 
			
		||||
	fma.rn.f32 	%r218, %r90, %r158, %r217;
 | 
			
		||||
	fma.rn.f32 	%r219, %r91, %r159, %r218;
 | 
			
		||||
	fma.rn.f32 	%r220, %r84, %r164, %r219;
 | 
			
		||||
	fma.rn.f32 	%r221, %r85, %r165, %r220;
 | 
			
		||||
	fma.rn.f32 	%r222, %r86, %r166, %r221;
 | 
			
		||||
	fma.rn.f32 	%r223, %r87, %r167, %r222;
 | 
			
		||||
	fma.rn.f32 	%r224, %r80, %r172, %r223;
 | 
			
		||||
	fma.rn.f32 	%r225, %r81, %r173, %r224;
 | 
			
		||||
	fma.rn.f32 	%r226, %r82, %r174, %r225;
 | 
			
		||||
	fma.rn.f32 	%r227, %r83, %r175, %r226;
 | 
			
		||||
	fma.rn.f32 	%r228, %r92, %r152, 0fBE0AE428;
 | 
			
		||||
	fma.rn.f32 	%r229, %r93, %r153, %r228;
 | 
			
		||||
	fma.rn.f32 	%r230, %r94, %r154, %r229;
 | 
			
		||||
	fma.rn.f32 	%r231, %r95, %r155, %r230;
 | 
			
		||||
	fma.rn.f32 	%r232, %r88, %r160, %r231;
 | 
			
		||||
	fma.rn.f32 	%r233, %r89, %r161, %r232;
 | 
			
		||||
	fma.rn.f32 	%r234, %r90, %r162, %r233;
 | 
			
		||||
	fma.rn.f32 	%r235, %r91, %r163, %r234;
 | 
			
		||||
	fma.rn.f32 	%r236, %r84, %r168, %r235;
 | 
			
		||||
	fma.rn.f32 	%r237, %r85, %r169, %r236;
 | 
			
		||||
	fma.rn.f32 	%r238, %r86, %r170, %r237;
 | 
			
		||||
	fma.rn.f32 	%r239, %r87, %r171, %r238;
 | 
			
		||||
	fma.rn.f32 	%r240, %r80, %r176, %r239;
 | 
			
		||||
	fma.rn.f32 	%r241, %r81, %r177, %r240;
 | 
			
		||||
	fma.rn.f32 	%r242, %r82, %r178, %r241;
 | 
			
		||||
	fma.rn.f32 	%r243, %r83, %r179, %r242;
 | 
			
		||||
	fma.rn.f32 	%r244, %r140, %r148, 0fBE0AE428;
 | 
			
		||||
	fma.rn.f32 	%r245, %r141, %r149, %r244;
 | 
			
		||||
	fma.rn.f32 	%r246, %r142, %r150, %r245;
 | 
			
		||||
	fma.rn.f32 	%r247, %r143, %r151, %r246;
 | 
			
		||||
	fma.rn.f32 	%r248, %r136, %r156, %r247;
 | 
			
		||||
	fma.rn.f32 	%r249, %r137, %r157, %r248;
 | 
			
		||||
	fma.rn.f32 	%r250, %r138, %r158, %r249;
 | 
			
		||||
	fma.rn.f32 	%r251, %r139, %r159, %r250;
 | 
			
		||||
	fma.rn.f32 	%r252, %r132, %r164, %r251;
 | 
			
		||||
	fma.rn.f32 	%r253, %r133, %r165, %r252;
 | 
			
		||||
	fma.rn.f32 	%r254, %r134, %r166, %r253;
 | 
			
		||||
	fma.rn.f32 	%r255, %r135, %r167, %r254;
 | 
			
		||||
	fma.rn.f32 	%r256, %r128, %r172, %r255;
 | 
			
		||||
	fma.rn.f32 	%r257, %r129, %r173, %r256;
 | 
			
		||||
	fma.rn.f32 	%r258, %r130, %r174, %r257;
 | 
			
		||||
	fma.rn.f32 	%r259, %r131, %r175, %r258;
 | 
			
		||||
	fma.rn.f32 	%r260, %r140, %r152, 0fBE0AE428;
 | 
			
		||||
	fma.rn.f32 	%r261, %r141, %r153, %r260;
 | 
			
		||||
	fma.rn.f32 	%r262, %r142, %r154, %r261;
 | 
			
		||||
	fma.rn.f32 	%r263, %r143, %r155, %r262;
 | 
			
		||||
	fma.rn.f32 	%r264, %r136, %r160, %r263;
 | 
			
		||||
	fma.rn.f32 	%r265, %r137, %r161, %r264;
 | 
			
		||||
	fma.rn.f32 	%r266, %r138, %r162, %r265;
 | 
			
		||||
	fma.rn.f32 	%r267, %r139, %r163, %r266;
 | 
			
		||||
	fma.rn.f32 	%r268, %r132, %r168, %r267;
 | 
			
		||||
	fma.rn.f32 	%r269, %r133, %r169, %r268;
 | 
			
		||||
	fma.rn.f32 	%r270, %r134, %r170, %r269;
 | 
			
		||||
	fma.rn.f32 	%r271, %r135, %r171, %r270;
 | 
			
		||||
	fma.rn.f32 	%r272, %r128, %r176, %r271;
 | 
			
		||||
	fma.rn.f32 	%r273, %r129, %r177, %r272;
 | 
			
		||||
	fma.rn.f32 	%r274, %r130, %r178, %r273;
 | 
			
		||||
	fma.rn.f32 	%r275, %r131, %r179, %r274;
 | 
			
		||||
	fma.rn.f32 	%r276, %r124, %r148, 0fBE0AE428;
 | 
			
		||||
	fma.rn.f32 	%r277, %r125, %r149, %r276;
 | 
			
		||||
	fma.rn.f32 	%r278, %r126, %r150, %r277;
 | 
			
		||||
	fma.rn.f32 	%r279, %r127, %r151, %r278;
 | 
			
		||||
	fma.rn.f32 	%r280, %r120, %r156, %r279;
 | 
			
		||||
	fma.rn.f32 	%r281, %r121, %r157, %r280;
 | 
			
		||||
	fma.rn.f32 	%r282, %r122, %r158, %r281;
 | 
			
		||||
	fma.rn.f32 	%r283, %r123, %r159, %r282;
 | 
			
		||||
	fma.rn.f32 	%r284, %r116, %r164, %r283;
 | 
			
		||||
	fma.rn.f32 	%r285, %r117, %r165, %r284;
 | 
			
		||||
	fma.rn.f32 	%r286, %r118, %r166, %r285;
 | 
			
		||||
	fma.rn.f32 	%r287, %r119, %r167, %r286;
 | 
			
		||||
	fma.rn.f32 	%r288, %r112, %r172, %r287;
 | 
			
		||||
	fma.rn.f32 	%r289, %r113, %r173, %r288;
 | 
			
		||||
	fma.rn.f32 	%r290, %r114, %r174, %r289;
 | 
			
		||||
	fma.rn.f32 	%r291, %r115, %r175, %r290;
 | 
			
		||||
	fma.rn.f32 	%r292, %r124, %r152, 0fBE0AE428;
 | 
			
		||||
	fma.rn.f32 	%r293, %r125, %r153, %r292;
 | 
			
		||||
	fma.rn.f32 	%r294, %r126, %r154, %r293;
 | 
			
		||||
	fma.rn.f32 	%r295, %r127, %r155, %r294;
 | 
			
		||||
	fma.rn.f32 	%r296, %r120, %r160, %r295;
 | 
			
		||||
	fma.rn.f32 	%r297, %r121, %r161, %r296;
 | 
			
		||||
	fma.rn.f32 	%r298, %r122, %r162, %r297;
 | 
			
		||||
	fma.rn.f32 	%r299, %r123, %r163, %r298;
 | 
			
		||||
	fma.rn.f32 	%r300, %r116, %r168, %r299;
 | 
			
		||||
	fma.rn.f32 	%r301, %r117, %r169, %r300;
 | 
			
		||||
	fma.rn.f32 	%r302, %r118, %r170, %r301;
 | 
			
		||||
	fma.rn.f32 	%r303, %r119, %r171, %r302;
 | 
			
		||||
	fma.rn.f32 	%r304, %r112, %r176, %r303;
 | 
			
		||||
	fma.rn.f32 	%r305, %r113, %r177, %r304;
 | 
			
		||||
	fma.rn.f32 	%r306, %r114, %r178, %r305;
 | 
			
		||||
	fma.rn.f32 	%r307, %r115, %r179, %r306;
 | 
			
		||||
	mov.b32 	%r308, 0f00000000;
 | 
			
		||||
$L__tmp1:
 | 
			
		||||
	.loc	2 47 30                         // standard.py:47:30 @[ cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:99:22 ]
 | 
			
		||||
	sub.f32 	%r309, %r308, %r195;
 | 
			
		||||
	sub.f32 	%r310, %r308, %r211;
 | 
			
		||||
	sub.f32 	%r311, %r308, %r227;
 | 
			
		||||
	sub.f32 	%r312, %r308, %r243;
 | 
			
		||||
	sub.f32 	%r313, %r308, %r259;
 | 
			
		||||
	sub.f32 	%r314, %r308, %r275;
 | 
			
		||||
	sub.f32 	%r315, %r308, %r291;
 | 
			
		||||
	sub.f32 	%r316, %r308, %r307;
 | 
			
		||||
	.loc	2 47 29                         // standard.py:47:29 @[ cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:99:22 ]
 | 
			
		||||
	mul.f32 	%r317, %r309, 0f3FB8AA3B;
 | 
			
		||||
	ex2.approx.f32 	%r318, %r317;
 | 
			
		||||
	mul.f32 	%r319, %r310, 0f3FB8AA3B;
 | 
			
		||||
	ex2.approx.f32 	%r320, %r319;
 | 
			
		||||
	mul.f32 	%r321, %r311, 0f3FB8AA3B;
 | 
			
		||||
	ex2.approx.f32 	%r322, %r321;
 | 
			
		||||
	mul.f32 	%r323, %r312, 0f3FB8AA3B;
 | 
			
		||||
	ex2.approx.f32 	%r324, %r323;
 | 
			
		||||
	mul.f32 	%r325, %r313, 0f3FB8AA3B;
 | 
			
		||||
	ex2.approx.f32 	%r326, %r325;
 | 
			
		||||
	mul.f32 	%r327, %r314, 0f3FB8AA3B;
 | 
			
		||||
	ex2.approx.f32 	%r328, %r327;
 | 
			
		||||
	mul.f32 	%r329, %r315, 0f3FB8AA3B;
 | 
			
		||||
	ex2.approx.f32 	%r330, %r329;
 | 
			
		||||
	mul.f32 	%r331, %r316, 0f3FB8AA3B;
 | 
			
		||||
	ex2.approx.f32 	%r332, %r331;
 | 
			
		||||
	.loc	2 47 20                         // standard.py:47:20 @[ cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:99:22 ]
 | 
			
		||||
	add.f32 	%r333, %r318, 0f3F800000;
 | 
			
		||||
	add.f32 	%r334, %r320, 0f3F800000;
 | 
			
		||||
	add.f32 	%r335, %r322, 0f3F800000;
 | 
			
		||||
	add.f32 	%r336, %r324, 0f3F800000;
 | 
			
		||||
	add.f32 	%r337, %r326, 0f3F800000;
 | 
			
		||||
	add.f32 	%r338, %r328, 0f3F800000;
 | 
			
		||||
	add.f32 	%r339, %r330, 0f3F800000;
 | 
			
		||||
	add.f32 	%r340, %r332, 0f3F800000;
 | 
			
		||||
	mov.b32 	%r341, 0f3F800000;
 | 
			
		||||
	.loc	2 47 16                         // standard.py:47:16 @[ cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:99:22 ]
 | 
			
		||||
	div.full.f32 	%r342, %r341, %r333;
 | 
			
		||||
	div.full.f32 	%r343, %r341, %r334;
 | 
			
		||||
	div.full.f32 	%r344, %r341, %r335;
 | 
			
		||||
	div.full.f32 	%r345, %r341, %r336;
 | 
			
		||||
	div.full.f32 	%r346, %r341, %r337;
 | 
			
		||||
	div.full.f32 	%r347, %r341, %r338;
 | 
			
		||||
	div.full.f32 	%r348, %r341, %r339;
 | 
			
		||||
	div.full.f32 	%r349, %r341, %r340;
 | 
			
		||||
$L__tmp2:
 | 
			
		||||
	.loc	1 100 25                        // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:100:25
 | 
			
		||||
	mul.wide.s32 	%rd19, %r69, 4;
 | 
			
		||||
	add.s64 	%rd8, %rd3, %rd19;
 | 
			
		||||
	mul.wide.s32 	%rd20, %r70, 4;
 | 
			
		||||
	add.s64 	%rd9, %rd3, %rd20;
 | 
			
		||||
	mul.wide.s32 	%rd21, %r71, 4;
 | 
			
		||||
	add.s64 	%rd10, %rd3, %rd21;
 | 
			
		||||
	mul.wide.s32 	%rd22, %r72, 4;
 | 
			
		||||
	add.s64 	%rd11, %rd3, %rd22;
 | 
			
		||||
	mul.wide.s32 	%rd23, %r73, 4;
 | 
			
		||||
	add.s64 	%rd12, %rd3, %rd23;
 | 
			
		||||
	mul.wide.s32 	%rd24, %r74, 4;
 | 
			
		||||
	add.s64 	%rd13, %rd3, %rd24;
 | 
			
		||||
	mul.wide.s32 	%rd25, %r75, 4;
 | 
			
		||||
	add.s64 	%rd14, %rd3, %rd25;
 | 
			
		||||
	mul.wide.s32 	%rd26, %r76, 4;
 | 
			
		||||
	add.s64 	%rd15, %rd3, %rd26;
 | 
			
		||||
	.loc	1 100 68                        // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:100:68
 | 
			
		||||
	bar.sync 	0;
 | 
			
		||||
	and.b32 	%r350, %r54, 96;
 | 
			
		||||
	or.b32 	%r351, %r145, %r350;
 | 
			
		||||
	shl.b32 	%r352, %r40, 4;
 | 
			
		||||
	and.b32 	%r353, %r352, 240;
 | 
			
		||||
	shr.u32 	%r354, %r350, 1;
 | 
			
		||||
	add.s32 	%r355, %r63, %r354;
 | 
			
		||||
	shl.b32 	%r356, %r351, 2;
 | 
			
		||||
	add.s32 	%r357, %r355, %r356;
 | 
			
		||||
	st.shared.v2.b32 	[%r357], {%r342, %r343};
 | 
			
		||||
	or.b32 	%r358, %r350, 16;
 | 
			
		||||
	shr.u32 	%r359, %r358, 1;
 | 
			
		||||
	add.s32 	%r360, %r63, %r359;
 | 
			
		||||
	add.s32 	%r361, %r360, %r356;
 | 
			
		||||
	st.shared.v2.b32 	[%r361+64], {%r344, %r345};
 | 
			
		||||
	or.b32 	%r362, %r350, 128;
 | 
			
		||||
	shr.u32 	%r363, %r362, 1;
 | 
			
		||||
	add.s32 	%r364, %r63, %r363;
 | 
			
		||||
	add.s32 	%r365, %r364, %r356;
 | 
			
		||||
	st.shared.v2.b32 	[%r365+512], {%r346, %r347};
 | 
			
		||||
	or.b32 	%r366, %r350, 144;
 | 
			
		||||
	shr.u32 	%r367, %r366, 1;
 | 
			
		||||
	add.s32 	%r368, %r63, %r367;
 | 
			
		||||
	add.s32 	%r369, %r368, %r356;
 | 
			
		||||
	st.shared.v2.b32 	[%r369+576], {%r348, %r349};
 | 
			
		||||
	bar.sync 	0;
 | 
			
		||||
	shr.u32 	%r370, %r353, 1;
 | 
			
		||||
	add.s32 	%r371, %r63, %r370;
 | 
			
		||||
	shl.b32 	%r372, %r353, 2;
 | 
			
		||||
	add.s32 	%r373, %r371, %r372;
 | 
			
		||||
	add.s32 	%r374, %r373, %r45;
 | 
			
		||||
	ld.shared.b32 	%r18, [%r374];
 | 
			
		||||
	ld.shared.b32 	%r19, [%r374+8];
 | 
			
		||||
	ld.shared.b32 	%r20, [%r374+16];
 | 
			
		||||
	ld.shared.b32 	%r21, [%r374+24];
 | 
			
		||||
	ld.shared.b32 	%r22, [%r374+32];
 | 
			
		||||
	ld.shared.b32 	%r23, [%r374+40];
 | 
			
		||||
	ld.shared.b32 	%r24, [%r374+48];
 | 
			
		||||
	ld.shared.b32 	%r25, [%r374+56];
 | 
			
		||||
	// begin inline asm
 | 
			
		||||
	@%p2 st.global.b32 [ %rd8 + 0 ], { %r18 };
 | 
			
		||||
	// end inline asm
 | 
			
		||||
	mov.pred 	%p3, 0;
 | 
			
		||||
	// begin inline asm
 | 
			
		||||
	@%p3 st.global.b32 [ %rd9 + 0 ], { %r19 };
 | 
			
		||||
	// end inline asm
 | 
			
		||||
	// begin inline asm
 | 
			
		||||
	@%p3 st.global.b32 [ %rd10 + 0 ], { %r20 };
 | 
			
		||||
	// end inline asm
 | 
			
		||||
	// begin inline asm
 | 
			
		||||
	@%p3 st.global.b32 [ %rd11 + 0 ], { %r21 };
 | 
			
		||||
	// end inline asm
 | 
			
		||||
	// begin inline asm
 | 
			
		||||
	@%p3 st.global.b32 [ %rd12 + 0 ], { %r22 };
 | 
			
		||||
	// end inline asm
 | 
			
		||||
	// begin inline asm
 | 
			
		||||
	@%p3 st.global.b32 [ %rd13 + 0 ], { %r23 };
 | 
			
		||||
	// end inline asm
 | 
			
		||||
	// begin inline asm
 | 
			
		||||
	@%p3 st.global.b32 [ %rd14 + 0 ], { %r24 };
 | 
			
		||||
	// end inline asm
 | 
			
		||||
	// begin inline asm
 | 
			
		||||
	@%p3 st.global.b32 [ %rd15 + 0 ], { %r25 };
 | 
			
		||||
	// end inline asm
 | 
			
		||||
$L__BB0_1:                              // %common.ret
 | 
			
		||||
	.loc	1 0 0                           // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:0
 | 
			
		||||
	ret;
 | 
			
		||||
$L__tmp3:
 | 
			
		||||
$L__func_end0:
 | 
			
		||||
                                        // -- End function
 | 
			
		||||
}
 | 
			
		||||
	.file	1 "/tmp/torchinductor_shangdiy/q4/cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py"
 | 
			
		||||
	.file	2 "/home/shangdiy/miniconda3/envs/pytorch-3.10/lib/python3.10/site-packages/triton/language/standard.py"
 | 
			
		||||
	.section	.debug_abbrev
 | 
			
		||||
	{
 | 
			
		||||
.b8 1                                   // Abbreviation Code
 | 
			
		||||
.b8 17                                  // DW_TAG_compile_unit
 | 
			
		||||
.b8 1                                   // DW_CHILDREN_yes
 | 
			
		||||
.b8 37                                  // DW_AT_producer
 | 
			
		||||
.b8 8                                   // DW_FORM_string
 | 
			
		||||
.b8 19                                  // DW_AT_language
 | 
			
		||||
.b8 5                                   // DW_FORM_data2
 | 
			
		||||
.b8 3                                   // DW_AT_name
 | 
			
		||||
.b8 8                                   // DW_FORM_string
 | 
			
		||||
.b8 16                                  // DW_AT_stmt_list
 | 
			
		||||
.b8 6                                   // DW_FORM_data4
 | 
			
		||||
.b8 27                                  // DW_AT_comp_dir
 | 
			
		||||
.b8 8                                   // DW_FORM_string
 | 
			
		||||
.b8 0                                   // EOM(1)
 | 
			
		||||
.b8 0                                   // EOM(2)
 | 
			
		||||
.b8 2                                   // Abbreviation Code
 | 
			
		||||
.b8 46                                  // DW_TAG_subprogram
 | 
			
		||||
.b8 0                                   // DW_CHILDREN_no
 | 
			
		||||
.b8 3                                   // DW_AT_name
 | 
			
		||||
.b8 8                                   // DW_FORM_string
 | 
			
		||||
.b8 32                                  // DW_AT_inline
 | 
			
		||||
.b8 11                                  // DW_FORM_data1
 | 
			
		||||
.b8 0                                   // EOM(1)
 | 
			
		||||
.b8 0                                   // EOM(2)
 | 
			
		||||
.b8 3                                   // Abbreviation Code
 | 
			
		||||
.b8 46                                  // DW_TAG_subprogram
 | 
			
		||||
.b8 1                                   // DW_CHILDREN_yes
 | 
			
		||||
.b8 17                                  // DW_AT_low_pc
 | 
			
		||||
.b8 1                                   // DW_FORM_addr
 | 
			
		||||
.b8 18                                  // DW_AT_high_pc
 | 
			
		||||
.b8 1                                   // DW_FORM_addr
 | 
			
		||||
.b8 49                                  // DW_AT_abstract_origin
 | 
			
		||||
.b8 19                                  // DW_FORM_ref4
 | 
			
		||||
.b8 0                                   // EOM(1)
 | 
			
		||||
.b8 0                                   // EOM(2)
 | 
			
		||||
.b8 4                                   // Abbreviation Code
 | 
			
		||||
.b8 29                                  // DW_TAG_inlined_subroutine
 | 
			
		||||
.b8 0                                   // DW_CHILDREN_no
 | 
			
		||||
.b8 49                                  // DW_AT_abstract_origin
 | 
			
		||||
.b8 19                                  // DW_FORM_ref4
 | 
			
		||||
.b8 17                                  // DW_AT_low_pc
 | 
			
		||||
.b8 1                                   // DW_FORM_addr
 | 
			
		||||
.b8 18                                  // DW_AT_high_pc
 | 
			
		||||
.b8 1                                   // DW_FORM_addr
 | 
			
		||||
.b8 88                                  // DW_AT_call_file
 | 
			
		||||
.b8 11                                  // DW_FORM_data1
 | 
			
		||||
.b8 89                                  // DW_AT_call_line
 | 
			
		||||
.b8 11                                  // DW_FORM_data1
 | 
			
		||||
.b8 87                                  // DW_AT_call_column
 | 
			
		||||
.b8 11                                  // DW_FORM_data1
 | 
			
		||||
.b8 0                                   // EOM(1)
 | 
			
		||||
.b8 0                                   // EOM(2)
 | 
			
		||||
.b8 0                                   // EOM(3)
 | 
			
		||||
	}
 | 
			
		||||
	.section	.debug_info
 | 
			
		||||
	{
 | 
			
		||||
.b32 203                                // Length of Unit
 | 
			
		||||
.b8 2                                   // DWARF version number
 | 
			
		||||
.b8 0
 | 
			
		||||
.b32 .debug_abbrev                      // Offset Into Abbrev. Section
 | 
			
		||||
.b8 8                                   // Address Size (in bytes)
 | 
			
		||||
.b8 1                                   // Abbrev [1] 0xb:0xc4 DW_TAG_compile_unit
 | 
			
		||||
.b8 116                                 // DW_AT_producer
 | 
			
		||||
.b8 114
 | 
			
		||||
.b8 105
 | 
			
		||||
.b8 116
 | 
			
		||||
.b8 111
 | 
			
		||||
.b8 110
 | 
			
		||||
.b8 0
 | 
			
		||||
.b8 2                                   // DW_AT_language
 | 
			
		||||
.b8 0
 | 
			
		||||
.b8 99                                  // DW_AT_name
 | 
			
		||||
.b8 113
 | 
			
		||||
.b8 52
 | 
			
		||||
.b8 54
 | 
			
		||||
.b8 118
 | 
			
		||||
.b8 111
 | 
			
		||||
.b8 108
 | 
			
		||||
.b8 114
 | 
			
		||||
.b8 122
 | 
			
		||||
.b8 109
 | 
			
		||||
.b8 97
 | 
			
		||||
.b8 54
 | 
			
		||||
.b8 55
 | 
			
		||||
.b8 105
 | 
			
		||||
.b8 110
 | 
			
		||||
.b8 100
 | 
			
		||||
.b8 116
 | 
			
		||||
.b8 119
 | 
			
		||||
.b8 104
 | 
			
		||||
.b8 106
 | 
			
		||||
.b8 53
 | 
			
		||||
.b8 97
 | 
			
		||||
.b8 52
 | 
			
		||||
.b8 110
 | 
			
		||||
.b8 54
 | 
			
		||||
.b8 122
 | 
			
		||||
.b8 114
 | 
			
		||||
.b8 51
 | 
			
		||||
.b8 54
 | 
			
		||||
.b8 55
 | 
			
		||||
.b8 103
 | 
			
		||||
.b8 113
 | 
			
		||||
.b8 112
 | 
			
		||||
.b8 107
 | 
			
		||||
.b8 99
 | 
			
		||||
.b8 119
 | 
			
		||||
.b8 111
 | 
			
		||||
.b8 50
 | 
			
		||||
.b8 104
 | 
			
		||||
.b8 108
 | 
			
		||||
.b8 112
 | 
			
		||||
.b8 104
 | 
			
		||||
.b8 121
 | 
			
		||||
.b8 110
 | 
			
		||||
.b8 104
 | 
			
		||||
.b8 106
 | 
			
		||||
.b8 116
 | 
			
		||||
.b8 104
 | 
			
		||||
.b8 110
 | 
			
		||||
.b8 51
 | 
			
		||||
.b8 117
 | 
			
		||||
.b8 107
 | 
			
		||||
.b8 46
 | 
			
		||||
.b8 112
 | 
			
		||||
.b8 121
 | 
			
		||||
.b8 0
 | 
			
		||||
.b32 .debug_line                        // DW_AT_stmt_list
 | 
			
		||||
.b8 47                                  // DW_AT_comp_dir
 | 
			
		||||
.b8 116
 | 
			
		||||
.b8 109
 | 
			
		||||
.b8 112
 | 
			
		||||
.b8 47
 | 
			
		||||
.b8 116
 | 
			
		||||
.b8 111
 | 
			
		||||
.b8 114
 | 
			
		||||
.b8 99
 | 
			
		||||
.b8 104
 | 
			
		||||
.b8 105
 | 
			
		||||
.b8 110
 | 
			
		||||
.b8 100
 | 
			
		||||
.b8 117
 | 
			
		||||
.b8 99
 | 
			
		||||
.b8 116
 | 
			
		||||
.b8 111
 | 
			
		||||
.b8 114
 | 
			
		||||
.b8 95
 | 
			
		||||
.b8 115
 | 
			
		||||
.b8 104
 | 
			
		||||
.b8 97
 | 
			
		||||
.b8 110
 | 
			
		||||
.b8 103
 | 
			
		||||
.b8 100
 | 
			
		||||
.b8 105
 | 
			
		||||
.b8 121
 | 
			
		||||
.b8 47
 | 
			
		||||
.b8 113
 | 
			
		||||
.b8 52
 | 
			
		||||
.b8 0
 | 
			
		||||
.b8 2                                   // Abbrev [2] 0x70:0x30 DW_TAG_subprogram
 | 
			
		||||
.b8 109                                 // DW_AT_name
 | 
			
		||||
.b8 111
 | 
			
		||||
.b8 100
 | 
			
		||||
.b8 101
 | 
			
		||||
.b8 108
 | 
			
		||||
.b8 95
 | 
			
		||||
.b8 116
 | 
			
		||||
.b8 114
 | 
			
		||||
.b8 105
 | 
			
		||||
.b8 116
 | 
			
		||||
.b8 111
 | 
			
		||||
.b8 110
 | 
			
		||||
.b8 95
 | 
			
		||||
.b8 116
 | 
			
		||||
.b8 101
 | 
			
		||||
.b8 109
 | 
			
		||||
.b8 95
 | 
			
		||||
.b8 102
 | 
			
		||||
.b8 117
 | 
			
		||||
.b8 115
 | 
			
		||||
.b8 101
 | 
			
		||||
.b8 100
 | 
			
		||||
.b8 95
 | 
			
		||||
.b8 97
 | 
			
		||||
.b8 100
 | 
			
		||||
.b8 100
 | 
			
		||||
.b8 109
 | 
			
		||||
.b8 109
 | 
			
		||||
.b8 95
 | 
			
		||||
.b8 114
 | 
			
		||||
.b8 101
 | 
			
		||||
.b8 108
 | 
			
		||||
.b8 117
 | 
			
		||||
.b8 95
 | 
			
		||||
.b8 115
 | 
			
		||||
.b8 105
 | 
			
		||||
.b8 103
 | 
			
		||||
.b8 109
 | 
			
		||||
.b8 111
 | 
			
		||||
.b8 105
 | 
			
		||||
.b8 100
 | 
			
		||||
.b8 95
 | 
			
		||||
.b8 116
 | 
			
		||||
.b8 95
 | 
			
		||||
.b8 49
 | 
			
		||||
.b8 0
 | 
			
		||||
.b8 1                                   // DW_AT_inline
 | 
			
		||||
.b8 3                                   // Abbrev [3] 0xa0:0x2e DW_TAG_subprogram
 | 
			
		||||
.b64 $L__func_begin0                    // DW_AT_low_pc
 | 
			
		||||
.b64 $L__func_end0                      // DW_AT_high_pc
 | 
			
		||||
.b32 112                                // DW_AT_abstract_origin
 | 
			
		||||
.b8 4                                   // Abbrev [4] 0xb5:0x18 DW_TAG_inlined_subroutine
 | 
			
		||||
.b32 112                                // DW_AT_abstract_origin
 | 
			
		||||
.b64 $L__tmp1                           // DW_AT_low_pc
 | 
			
		||||
.b64 $L__tmp2                           // DW_AT_high_pc
 | 
			
		||||
.b8 1                                   // DW_AT_call_file
 | 
			
		||||
.b8 99                                  // DW_AT_call_line
 | 
			
		||||
.b8 22                                  // DW_AT_call_column
 | 
			
		||||
.b8 0                                   // End Of Children Mark
 | 
			
		||||
.b8 0                                   // End Of Children Mark
 | 
			
		||||
	}
 | 
			
		||||
	.section	.debug_macinfo	{	}
 | 
			
		||||
										
											Binary file not shown.
										
									
								
							@ -0,0 +1,727 @@
 | 
			
		||||
//
 | 
			
		||||
// Generated by LLVM NVPTX Back-End
 | 
			
		||||
//
 | 
			
		||||
 | 
			
		||||
.version 8.7
 | 
			
		||||
.target sm_89
 | 
			
		||||
.address_size 64
 | 
			
		||||
 | 
			
		||||
	// .globl	model_triton_tem_fused_addmm_relu_t_0 // -- Begin function model_triton_tem_fused_addmm_relu_t_0
 | 
			
		||||
.extern .shared .align 16 .b8 global_smem[];
 | 
			
		||||
                                        // @model_triton_tem_fused_addmm_relu_t_0
 | 
			
		||||
.visible .entry model_triton_tem_fused_addmm_relu_t_0(
 | 
			
		||||
	.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_t_0_param_0,
 | 
			
		||||
	.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_t_0_param_1,
 | 
			
		||||
	.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_t_0_param_2,
 | 
			
		||||
	.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_t_0_param_3,
 | 
			
		||||
	.param .u32 model_triton_tem_fused_addmm_relu_t_0_param_4,
 | 
			
		||||
	.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_t_0_param_5
 | 
			
		||||
)
 | 
			
		||||
.reqntid 32
 | 
			
		||||
{
 | 
			
		||||
	.reg .pred 	%p<27>;
 | 
			
		||||
	.reg .b32 	%r<398>;
 | 
			
		||||
	.reg .b64 	%rd<29>;
 | 
			
		||||
	.loc	1 18 0                          // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:18:0
 | 
			
		||||
$L__func_begin0:
 | 
			
		||||
	.loc	1 18 0                          // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:18:0
 | 
			
		||||
 | 
			
		||||
// %bb.0:
 | 
			
		||||
	ld.param.b32 	%r1, [model_triton_tem_fused_addmm_relu_t_0_param_4];
 | 
			
		||||
$L__tmp0:
 | 
			
		||||
	.loc	1 34 16                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:34:16
 | 
			
		||||
	and.b32 	%r2, %r1, 268435455;
 | 
			
		||||
	setp.ne.s32 	%p1, %r2, 0;
 | 
			
		||||
	@%p1 bra 	$L__BB0_2;
 | 
			
		||||
	bra.uni 	$L__BB0_1;
 | 
			
		||||
$L__BB0_2:
 | 
			
		||||
	.loc	1 0 16                          // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:0:16
 | 
			
		||||
	ld.param.b64 	%rd4, [model_triton_tem_fused_addmm_relu_t_0_param_3];
 | 
			
		||||
	ld.param.b64 	%rd3, [model_triton_tem_fused_addmm_relu_t_0_param_2];
 | 
			
		||||
	ld.param.b64 	%rd2, [model_triton_tem_fused_addmm_relu_t_0_param_1];
 | 
			
		||||
	ld.param.b64 	%rd1, [model_triton_tem_fused_addmm_relu_t_0_param_0];
 | 
			
		||||
	.loc	1 43 24                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:43:24
 | 
			
		||||
	mov.u32 	%r51, %ctaid.x;
 | 
			
		||||
	.loc	1 44 28                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:44:28
 | 
			
		||||
	add.s32 	%r52, %r1, 15;
 | 
			
		||||
	.loc	1 44 34                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:44:34
 | 
			
		||||
	shr.s32 	%r53, %r52, 31;
 | 
			
		||||
	shr.u32 	%r54, %r53, 28;
 | 
			
		||||
	add.s32 	%r55, %r52, %r54;
 | 
			
		||||
	shr.s32 	%r56, %r55, 4;
 | 
			
		||||
	.loc	1 50 41                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:50:41
 | 
			
		||||
	and.b32 	%r57, %r51, 2147483640;
 | 
			
		||||
	.loc	1 50 30                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:50:30
 | 
			
		||||
	sub.s32 	%r58, %r56, %r57;
 | 
			
		||||
	.loc	1 50 50                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:50:50
 | 
			
		||||
	min.s32 	%r59, %r58, 8;
 | 
			
		||||
	.loc	1 51 40                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:51:40
 | 
			
		||||
	rem.s32 	%r60, %r51, %r59;
 | 
			
		||||
	.loc	1 51 34                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:51:34
 | 
			
		||||
	add.s32 	%r61, %r60, %r57;
 | 
			
		||||
	.loc	1 52 19                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:52:19
 | 
			
		||||
	and.b32 	%r62, %r51, 7;
 | 
			
		||||
	.loc	1 52 30                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:52:30
 | 
			
		||||
	div.s32 	%r63, %r62, %r59;
 | 
			
		||||
	.loc	1 56 17                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:56:17
 | 
			
		||||
	shl.b32 	%r64, %r61, 4;
 | 
			
		||||
	.loc	1 56 40                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:56:40
 | 
			
		||||
	mov.u32 	%r65, %tid.x;
 | 
			
		||||
	and.b32 	%r66, %r65, 4;
 | 
			
		||||
	bfe.u32 	%r67, %r65, 2, 1;
 | 
			
		||||
	shr.u32 	%r68, %r65, 2;
 | 
			
		||||
	and.b32 	%r69, %r68, 6;
 | 
			
		||||
	or.b32 	%r70, %r69, %r67;
 | 
			
		||||
	bfe.u32 	%r71, %r65, 3, 2;
 | 
			
		||||
	and.b32 	%r72, %r65, 1;
 | 
			
		||||
	shl.b32 	%r73, %r72, 2;
 | 
			
		||||
	shl.b32 	%r74, %r65, 2;
 | 
			
		||||
	and.b32 	%r75, %r74, 12;
 | 
			
		||||
	or.b32 	%r76, %r75, 2;
 | 
			
		||||
	.loc	1 56 27                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:56:27
 | 
			
		||||
	or.b32 	%r77, %r64, %r70;
 | 
			
		||||
	or.b32 	%r78, %r77, 8;
 | 
			
		||||
	.loc	1 57 17                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:57:17
 | 
			
		||||
	shl.b32 	%r79, %r63, 4;
 | 
			
		||||
	.loc	1 57 27                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:57:27
 | 
			
		||||
	or.b32 	%r80, %r79, %r75;
 | 
			
		||||
	.loc	1 0 0                           // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:0
 | 
			
		||||
	rem.s32 	%r81, %r78, %r1;
 | 
			
		||||
	rem.s32 	%r82, %r77, %r1;
 | 
			
		||||
	.loc	1 71 36                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:71:36
 | 
			
		||||
	setp.lt.u32 	%p2, %r75, 10;
 | 
			
		||||
	setp.lt.u32 	%p3, %r76, 10;
 | 
			
		||||
	.loc	1 72 24                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:72:24
 | 
			
		||||
	shl.b32 	%r83, %r65, 1;
 | 
			
		||||
	and.b32 	%r84, %r83, 6;
 | 
			
		||||
	shl.b32 	%r85, %r66, 1;
 | 
			
		||||
	or.b32 	%r86, %r84, %r85;
 | 
			
		||||
	.loc	1 72 36                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:72:36
 | 
			
		||||
	setp.lt.u32 	%p6, %r86, 10;
 | 
			
		||||
	.loc	1 79 28                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:79:28
 | 
			
		||||
	mul.lo.s32 	%r87, %r82, 10;
 | 
			
		||||
	mul.lo.s32 	%r88, %r81, 10;
 | 
			
		||||
	.loc	1 79 25                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:79:25
 | 
			
		||||
	add.s32 	%r89, %r87, %r75;
 | 
			
		||||
	add.s32 	%r90, %r87, %r76;
 | 
			
		||||
	add.s32 	%r91, %r88, %r75;
 | 
			
		||||
	add.s32 	%r92, %r88, %r76;
 | 
			
		||||
	.loc	1 80 25                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:80:25
 | 
			
		||||
	mul.wide.s32 	%rd21, %r89, 4;
 | 
			
		||||
	add.s64 	%rd5, %rd1, %rd21;
 | 
			
		||||
	mul.wide.s32 	%rd22, %r90, 4;
 | 
			
		||||
	add.s64 	%rd6, %rd1, %rd22;
 | 
			
		||||
	mul.wide.s32 	%rd23, %r91, 4;
 | 
			
		||||
	add.s64 	%rd7, %rd1, %rd23;
 | 
			
		||||
	mul.wide.s32 	%rd24, %r92, 4;
 | 
			
		||||
	add.s64 	%rd8, %rd1, %rd24;
 | 
			
		||||
	mov.b32 	%r5, 0;
 | 
			
		||||
	.loc	1 80 20                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:80:20
 | 
			
		||||
	// begin inline asm
 | 
			
		||||
	mov.u32 %r3, %r5;
 | 
			
		||||
	mov.u32 %r4, %r5;
 | 
			
		||||
	@%p2 ld.global.v2.b32 { %r3, %r4 }, [ %rd5 + 0 ];
 | 
			
		||||
	// end inline asm
 | 
			
		||||
	// begin inline asm
 | 
			
		||||
	mov.u32 %r7, %r5;
 | 
			
		||||
	mov.u32 %r8, %r5;
 | 
			
		||||
	@%p3 ld.global.v2.b32 { %r7, %r8 }, [ %rd6 + 0 ];
 | 
			
		||||
	// end inline asm
 | 
			
		||||
	// begin inline asm
 | 
			
		||||
	mov.u32 %r11, %r5;
 | 
			
		||||
	mov.u32 %r12, %r5;
 | 
			
		||||
	@%p2 ld.global.v2.b32 { %r11, %r12 }, [ %rd7 + 0 ];
 | 
			
		||||
	// end inline asm
 | 
			
		||||
	// begin inline asm
 | 
			
		||||
	mov.u32 %r15, %r5;
 | 
			
		||||
	mov.u32 %r16, %r5;
 | 
			
		||||
	@%p3 ld.global.v2.b32 { %r15, %r16 }, [ %rd8 + 0 ];
 | 
			
		||||
	// end inline asm
 | 
			
		||||
	shl.b32 	%r93, %r75, 2;
 | 
			
		||||
	mov.b32 	%r94, global_smem;
 | 
			
		||||
	add.s32 	%r95, %r94, %r93;
 | 
			
		||||
	shl.b32 	%r96, %r70, 6;
 | 
			
		||||
	add.s32 	%r97, %r95, %r96;
 | 
			
		||||
	st.shared.v4.b32 	[%r97], {%r3, %r4, %r7, %r8};
 | 
			
		||||
	st.shared.v4.b32 	[%r97+512], {%r11, %r12, %r15, %r16};
 | 
			
		||||
	.loc	1 85 50                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:50
 | 
			
		||||
	mad.lo.s32 	%r98, %r71, 10, %r86;
 | 
			
		||||
	.loc	1 85 25                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:25
 | 
			
		||||
	mul.wide.u32 	%rd25, %r98, 4;
 | 
			
		||||
	add.s64 	%rd9, %rd2, %rd25;
 | 
			
		||||
	add.s64 	%rd10, %rd9, 160;
 | 
			
		||||
	add.s64 	%rd11, %rd9, 320;
 | 
			
		||||
	add.s64 	%rd12, %rd9, 480;
 | 
			
		||||
	.loc	1 85 20                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:20
 | 
			
		||||
	// begin inline asm
 | 
			
		||||
	mov.u32 %r19, %r5;
 | 
			
		||||
	mov.u32 %r20, %r5;
 | 
			
		||||
	@%p6 ld.global.v2.b32 { %r19, %r20 }, [ %rd9 + 0 ];
 | 
			
		||||
	// end inline asm
 | 
			
		||||
	// begin inline asm
 | 
			
		||||
	mov.u32 %r23, %r5;
 | 
			
		||||
	mov.u32 %r24, %r5;
 | 
			
		||||
	@%p6 ld.global.v2.b32 { %r23, %r24 }, [ %rd10 + 0 ];
 | 
			
		||||
	// end inline asm
 | 
			
		||||
	// begin inline asm
 | 
			
		||||
	mov.u32 %r27, %r5;
 | 
			
		||||
	mov.u32 %r28, %r5;
 | 
			
		||||
	@%p6 ld.global.v2.b32 { %r27, %r28 }, [ %rd11 + 0 ];
 | 
			
		||||
	// end inline asm
 | 
			
		||||
	// begin inline asm
 | 
			
		||||
	mov.u32 %r31, %r5;
 | 
			
		||||
	mov.u32 %r32, %r5;
 | 
			
		||||
	@%p6 ld.global.v2.b32 { %r31, %r32 }, [ %rd12 + 0 ];
 | 
			
		||||
	// end inline asm
 | 
			
		||||
	shl.b32 	%r99, %r86, 2;
 | 
			
		||||
	add.s32 	%r100, %r94, 1024;
 | 
			
		||||
	add.s32 	%r101, %r100, %r99;
 | 
			
		||||
	shl.b32 	%r102, %r71, 6;
 | 
			
		||||
	add.s32 	%r103, %r101, %r102;
 | 
			
		||||
	st.shared.v2.b32 	[%r103], {%r19, %r20};
 | 
			
		||||
	st.shared.v2.b32 	[%r103+256], {%r23, %r24};
 | 
			
		||||
	st.shared.v2.b32 	[%r103+512], {%r27, %r28};
 | 
			
		||||
	st.shared.v2.b32 	[%r103+768], {%r31, %r32};
 | 
			
		||||
	.loc	1 96 20                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:96:20
 | 
			
		||||
	setp.lt.s32 	%p14, %r77, %r1;
 | 
			
		||||
	setp.lt.s32 	%p15, %r78, %r1;
 | 
			
		||||
	.loc	1 96 34                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:96:34
 | 
			
		||||
	setp.eq.s32 	%p16, %r63, 0;
 | 
			
		||||
	.loc	1 96 26                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:96:26
 | 
			
		||||
	and.pred 	%p10, %p16, %p14;
 | 
			
		||||
	and.pred 	%p11, %p15, %p16;
 | 
			
		||||
	.loc	1 100 30                        // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:100:30
 | 
			
		||||
	mul.wide.u32 	%rd26, %r80, 4;
 | 
			
		||||
	add.s64 	%rd14, %rd3, %rd26;
 | 
			
		||||
	.loc	1 100 66                        // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:100:66
 | 
			
		||||
	// begin inline asm
 | 
			
		||||
	mov.u64 %rd15, 0x0;
 | 
			
		||||
	createpolicy.fractional.L2::evict_last.b64 %rd15, 1.0;
 | 
			
		||||
	// end inline asm
 | 
			
		||||
	// begin inline asm
 | 
			
		||||
	mov.u32 %r35, 0x0;
 | 
			
		||||
	mov.u32 %r36, 0x0;
 | 
			
		||||
	mov.u32 %r37, 0x0;
 | 
			
		||||
	mov.u32 %r38, 0x0;
 | 
			
		||||
	@%p10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r35, %r36, %r37, %r38 }, [ %rd14 + 0 ], %rd15;
 | 
			
		||||
	// end inline asm
 | 
			
		||||
	// begin inline asm
 | 
			
		||||
	mov.u64 %rd18, 0x0;
 | 
			
		||||
	createpolicy.fractional.L2::evict_last.b64 %rd18, 1.0;
 | 
			
		||||
	// end inline asm
 | 
			
		||||
	// begin inline asm
 | 
			
		||||
	mov.u32 %r39, 0x0;
 | 
			
		||||
	mov.u32 %r40, 0x0;
 | 
			
		||||
	mov.u32 %r41, 0x0;
 | 
			
		||||
	mov.u32 %r42, 0x0;
 | 
			
		||||
	@%p11 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r39, %r40, %r41, %r42 }, [ %rd14 + 0 ], %rd18;
 | 
			
		||||
	// end inline asm
 | 
			
		||||
	.loc	1 80 20                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:80:20
 | 
			
		||||
	bar.sync 	0;
 | 
			
		||||
	shl.b32 	%r104, %r69, 6;
 | 
			
		||||
	add.s32 	%r105, %r94, %r104;
 | 
			
		||||
	ld.shared.v4.b32 	{%r106, %r107, %r108, %r109}, [%r105+48];
 | 
			
		||||
	ld.shared.v4.b32 	{%r110, %r111, %r112, %r113}, [%r105+112];
 | 
			
		||||
	ld.shared.v4.b32 	{%r114, %r115, %r116, %r117}, [%r105+32];
 | 
			
		||||
	ld.shared.v4.b32 	{%r118, %r119, %r120, %r121}, [%r105+96];
 | 
			
		||||
	ld.shared.v4.b32 	{%r122, %r123, %r124, %r125}, [%r105+16];
 | 
			
		||||
	ld.shared.v4.b32 	{%r126, %r127, %r128, %r129}, [%r105+80];
 | 
			
		||||
	ld.shared.v4.b32 	{%r130, %r131, %r132, %r133}, [%r105];
 | 
			
		||||
	ld.shared.v4.b32 	{%r134, %r135, %r136, %r137}, [%r105+64];
 | 
			
		||||
	ld.shared.v4.b32 	{%r138, %r139, %r140, %r141}, [%r105+560];
 | 
			
		||||
	ld.shared.v4.b32 	{%r142, %r143, %r144, %r145}, [%r105+624];
 | 
			
		||||
	ld.shared.v4.b32 	{%r146, %r147, %r148, %r149}, [%r105+544];
 | 
			
		||||
	ld.shared.v4.b32 	{%r150, %r151, %r152, %r153}, [%r105+608];
 | 
			
		||||
	ld.shared.v4.b32 	{%r154, %r155, %r156, %r157}, [%r105+528];
 | 
			
		||||
	ld.shared.v4.b32 	{%r158, %r159, %r160, %r161}, [%r105+592];
 | 
			
		||||
	ld.shared.v4.b32 	{%r162, %r163, %r164, %r165}, [%r105+512];
 | 
			
		||||
	ld.shared.v4.b32 	{%r166, %r167, %r168, %r169}, [%r105+576];
 | 
			
		||||
	.loc	1 85 20                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:20
 | 
			
		||||
	mad.lo.s32 	%r170, %r86, 60, %r101;
 | 
			
		||||
	or.b32 	%r171, %r84, 1;
 | 
			
		||||
	or.b32 	%r172, %r171, %r85;
 | 
			
		||||
	shl.b32 	%r173, %r172, 6;
 | 
			
		||||
	add.s32 	%r174, %r100, %r173;
 | 
			
		||||
	ld.shared.b32 	%r175, [%r170+16];
 | 
			
		||||
	ld.shared.b32 	%r176, [%r174+16];
 | 
			
		||||
	ld.shared.b32 	%r177, [%r174+52];
 | 
			
		||||
	ld.shared.b32 	%r178, [%r170+60];
 | 
			
		||||
	.loc	1 100 66                        // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:100:66
 | 
			
		||||
	shr.u32 	%r179, %r66, 1;
 | 
			
		||||
	setp.eq.s32 	%p17, %r66, 0;
 | 
			
		||||
	bfe.u32 	%r180, %r65, 1, 1;
 | 
			
		||||
	or.b32 	%r181, %r73, %r180;
 | 
			
		||||
	or.b32 	%r182, %r181, %r179;
 | 
			
		||||
	and.b32 	%r183, %r65, 24;
 | 
			
		||||
	or.b32 	%r184, %r182, %r183;
 | 
			
		||||
	selp.b32 	%r185, %r35, %r37, %p17;
 | 
			
		||||
	shfl.sync.idx.b32 	%r186, %r185, %r184, 31, -1;
 | 
			
		||||
	selp.b32 	%r187, %r36, %r38, %p17;
 | 
			
		||||
	shfl.sync.idx.b32 	%r188, %r187, %r184, 31, -1;
 | 
			
		||||
	selp.b32 	%r189, %r37, %r35, %p17;
 | 
			
		||||
	xor.b32 	%r190, %r184, 4;
 | 
			
		||||
	shfl.sync.idx.b32 	%r191, %r189, %r190, 31, -1;
 | 
			
		||||
	selp.b32 	%r192, %r38, %r36, %p17;
 | 
			
		||||
	shfl.sync.idx.b32 	%r193, %r192, %r190, 31, -1;
 | 
			
		||||
	selp.b32 	%r194, %r39, %r41, %p17;
 | 
			
		||||
	shfl.sync.idx.b32 	%r195, %r194, %r184, 31, -1;
 | 
			
		||||
	selp.b32 	%r196, %r40, %r42, %p17;
 | 
			
		||||
	shfl.sync.idx.b32 	%r197, %r196, %r184, 31, -1;
 | 
			
		||||
	selp.b32 	%r198, %r41, %r39, %p17;
 | 
			
		||||
	shfl.sync.idx.b32 	%r199, %r198, %r190, 31, -1;
 | 
			
		||||
	selp.b32 	%r200, %r42, %r40, %p17;
 | 
			
		||||
	shfl.sync.idx.b32 	%r201, %r200, %r190, 31, -1;
 | 
			
		||||
	setp.eq.s32 	%p18, %r72, 0;
 | 
			
		||||
	.loc	1 85 20                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:20
 | 
			
		||||
	ld.shared.v4.b32 	{%r202, %r203, %r204, %r205}, [%r170];
 | 
			
		||||
	ld.shared.v4.b32 	{%r206, %r207, %r208, %r209}, [%r174];
 | 
			
		||||
	.loc	1 100 66                        // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:100:66
 | 
			
		||||
	selp.f32 	%r210, %r186, %r191, %p18;
 | 
			
		||||
	selp.f32 	%r211, %r188, %r193, %p18;
 | 
			
		||||
	selp.f32 	%r212, %r191, %r186, %p18;
 | 
			
		||||
	selp.f32 	%r213, %r193, %r188, %p18;
 | 
			
		||||
	selp.f32 	%r214, %r195, %r199, %p18;
 | 
			
		||||
	selp.f32 	%r215, %r197, %r201, %p18;
 | 
			
		||||
	selp.f32 	%r216, %r199, %r195, %p18;
 | 
			
		||||
	selp.f32 	%r217, %r201, %r197, %p18;
 | 
			
		||||
	.loc	1 88 25                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:88:25
 | 
			
		||||
	fma.rn.f32 	%r218, %r166, %r206, %r217;
 | 
			
		||||
	fma.rn.f32 	%r219, %r166, %r202, %r216;
 | 
			
		||||
	fma.rn.f32 	%r220, %r162, %r206, %r215;
 | 
			
		||||
	fma.rn.f32 	%r221, %r162, %r202, %r214;
 | 
			
		||||
	fma.rn.f32 	%r222, %r134, %r206, %r213;
 | 
			
		||||
	fma.rn.f32 	%r223, %r134, %r202, %r212;
 | 
			
		||||
	fma.rn.f32 	%r224, %r130, %r206, %r211;
 | 
			
		||||
	fma.rn.f32 	%r225, %r130, %r202, %r210;
 | 
			
		||||
	fma.rn.f32 	%r226, %r131, %r203, %r225;
 | 
			
		||||
	fma.rn.f32 	%r227, %r131, %r207, %r224;
 | 
			
		||||
	fma.rn.f32 	%r228, %r135, %r203, %r223;
 | 
			
		||||
	fma.rn.f32 	%r229, %r135, %r207, %r222;
 | 
			
		||||
	fma.rn.f32 	%r230, %r163, %r203, %r221;
 | 
			
		||||
	fma.rn.f32 	%r231, %r163, %r207, %r220;
 | 
			
		||||
	fma.rn.f32 	%r232, %r167, %r203, %r219;
 | 
			
		||||
	fma.rn.f32 	%r233, %r167, %r207, %r218;
 | 
			
		||||
	fma.rn.f32 	%r234, %r168, %r208, %r233;
 | 
			
		||||
	fma.rn.f32 	%r235, %r168, %r204, %r232;
 | 
			
		||||
	fma.rn.f32 	%r236, %r164, %r208, %r231;
 | 
			
		||||
	fma.rn.f32 	%r237, %r164, %r204, %r230;
 | 
			
		||||
	fma.rn.f32 	%r238, %r136, %r208, %r229;
 | 
			
		||||
	fma.rn.f32 	%r239, %r136, %r204, %r228;
 | 
			
		||||
	fma.rn.f32 	%r240, %r132, %r208, %r227;
 | 
			
		||||
	fma.rn.f32 	%r241, %r132, %r204, %r226;
 | 
			
		||||
	fma.rn.f32 	%r242, %r133, %r205, %r241;
 | 
			
		||||
	fma.rn.f32 	%r243, %r133, %r209, %r240;
 | 
			
		||||
	fma.rn.f32 	%r244, %r137, %r205, %r239;
 | 
			
		||||
	fma.rn.f32 	%r245, %r137, %r209, %r238;
 | 
			
		||||
	fma.rn.f32 	%r246, %r165, %r205, %r237;
 | 
			
		||||
	fma.rn.f32 	%r247, %r165, %r209, %r236;
 | 
			
		||||
	fma.rn.f32 	%r248, %r169, %r205, %r235;
 | 
			
		||||
	fma.rn.f32 	%r249, %r169, %r209, %r234;
 | 
			
		||||
	fma.rn.f32 	%r250, %r158, %r176, %r249;
 | 
			
		||||
	fma.rn.f32 	%r251, %r158, %r175, %r248;
 | 
			
		||||
	fma.rn.f32 	%r252, %r154, %r176, %r247;
 | 
			
		||||
	fma.rn.f32 	%r253, %r154, %r175, %r246;
 | 
			
		||||
	fma.rn.f32 	%r254, %r126, %r176, %r245;
 | 
			
		||||
	fma.rn.f32 	%r255, %r126, %r175, %r244;
 | 
			
		||||
	fma.rn.f32 	%r256, %r122, %r176, %r243;
 | 
			
		||||
	fma.rn.f32 	%r257, %r122, %r175, %r242;
 | 
			
		||||
	.loc	1 85 20                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:20
 | 
			
		||||
	ld.shared.b32 	%r258, [%r170+24];
 | 
			
		||||
	ld.shared.b32 	%r259, [%r170+20];
 | 
			
		||||
	ld.shared.b32 	%r260, [%r174+24];
 | 
			
		||||
	ld.shared.b32 	%r261, [%r174+20];
 | 
			
		||||
	ld.shared.b32 	%r262, [%r170+32];
 | 
			
		||||
	ld.shared.b32 	%r263, [%r170+28];
 | 
			
		||||
	ld.shared.b32 	%r264, [%r174+32];
 | 
			
		||||
	ld.shared.b32 	%r265, [%r174+28];
 | 
			
		||||
	ld.shared.b32 	%r266, [%r170+40];
 | 
			
		||||
	ld.shared.b32 	%r267, [%r170+36];
 | 
			
		||||
	ld.shared.b32 	%r268, [%r174+40];
 | 
			
		||||
	ld.shared.b32 	%r269, [%r174+36];
 | 
			
		||||
	ld.shared.b32 	%r270, [%r170+48];
 | 
			
		||||
	ld.shared.b32 	%r271, [%r170+44];
 | 
			
		||||
	ld.shared.b32 	%r272, [%r174+48];
 | 
			
		||||
	ld.shared.b32 	%r273, [%r174+44];
 | 
			
		||||
	ld.shared.b32 	%r274, [%r170+56];
 | 
			
		||||
	ld.shared.b32 	%r275, [%r170+52];
 | 
			
		||||
	ld.shared.v2.b32 	{%r276, %r277}, [%r174+56];
 | 
			
		||||
	.loc	1 88 25                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:88:25
 | 
			
		||||
	fma.rn.f32 	%r278, %r123, %r259, %r257;
 | 
			
		||||
	fma.rn.f32 	%r279, %r123, %r261, %r256;
 | 
			
		||||
	fma.rn.f32 	%r280, %r127, %r259, %r255;
 | 
			
		||||
	fma.rn.f32 	%r281, %r127, %r261, %r254;
 | 
			
		||||
	fma.rn.f32 	%r282, %r155, %r259, %r253;
 | 
			
		||||
	fma.rn.f32 	%r283, %r155, %r261, %r252;
 | 
			
		||||
	fma.rn.f32 	%r284, %r159, %r259, %r251;
 | 
			
		||||
	fma.rn.f32 	%r285, %r159, %r261, %r250;
 | 
			
		||||
	fma.rn.f32 	%r286, %r160, %r260, %r285;
 | 
			
		||||
	fma.rn.f32 	%r287, %r160, %r258, %r284;
 | 
			
		||||
	fma.rn.f32 	%r288, %r156, %r260, %r283;
 | 
			
		||||
	fma.rn.f32 	%r289, %r156, %r258, %r282;
 | 
			
		||||
	fma.rn.f32 	%r290, %r128, %r260, %r281;
 | 
			
		||||
	fma.rn.f32 	%r291, %r128, %r258, %r280;
 | 
			
		||||
	fma.rn.f32 	%r292, %r124, %r260, %r279;
 | 
			
		||||
	fma.rn.f32 	%r293, %r124, %r258, %r278;
 | 
			
		||||
	fma.rn.f32 	%r294, %r125, %r263, %r293;
 | 
			
		||||
	fma.rn.f32 	%r295, %r125, %r265, %r292;
 | 
			
		||||
	fma.rn.f32 	%r296, %r129, %r263, %r291;
 | 
			
		||||
	fma.rn.f32 	%r297, %r129, %r265, %r290;
 | 
			
		||||
	fma.rn.f32 	%r298, %r157, %r263, %r289;
 | 
			
		||||
	fma.rn.f32 	%r299, %r157, %r265, %r288;
 | 
			
		||||
	fma.rn.f32 	%r300, %r161, %r263, %r287;
 | 
			
		||||
	fma.rn.f32 	%r301, %r161, %r265, %r286;
 | 
			
		||||
	fma.rn.f32 	%r302, %r150, %r264, %r301;
 | 
			
		||||
	fma.rn.f32 	%r303, %r150, %r262, %r300;
 | 
			
		||||
	fma.rn.f32 	%r304, %r146, %r264, %r299;
 | 
			
		||||
	fma.rn.f32 	%r305, %r146, %r262, %r298;
 | 
			
		||||
	fma.rn.f32 	%r306, %r118, %r264, %r297;
 | 
			
		||||
	fma.rn.f32 	%r307, %r118, %r262, %r296;
 | 
			
		||||
	fma.rn.f32 	%r308, %r114, %r264, %r295;
 | 
			
		||||
	fma.rn.f32 	%r309, %r114, %r262, %r294;
 | 
			
		||||
	fma.rn.f32 	%r310, %r115, %r267, %r309;
 | 
			
		||||
	fma.rn.f32 	%r311, %r115, %r269, %r308;
 | 
			
		||||
	fma.rn.f32 	%r312, %r119, %r267, %r307;
 | 
			
		||||
	fma.rn.f32 	%r313, %r119, %r269, %r306;
 | 
			
		||||
	fma.rn.f32 	%r314, %r147, %r267, %r305;
 | 
			
		||||
	fma.rn.f32 	%r315, %r147, %r269, %r304;
 | 
			
		||||
	fma.rn.f32 	%r316, %r151, %r267, %r303;
 | 
			
		||||
	fma.rn.f32 	%r317, %r151, %r269, %r302;
 | 
			
		||||
	fma.rn.f32 	%r318, %r152, %r268, %r317;
 | 
			
		||||
	fma.rn.f32 	%r319, %r152, %r266, %r316;
 | 
			
		||||
	fma.rn.f32 	%r320, %r148, %r268, %r315;
 | 
			
		||||
	fma.rn.f32 	%r321, %r148, %r266, %r314;
 | 
			
		||||
	fma.rn.f32 	%r322, %r120, %r268, %r313;
 | 
			
		||||
	fma.rn.f32 	%r323, %r120, %r266, %r312;
 | 
			
		||||
	fma.rn.f32 	%r324, %r116, %r268, %r311;
 | 
			
		||||
	fma.rn.f32 	%r325, %r116, %r266, %r310;
 | 
			
		||||
	fma.rn.f32 	%r326, %r117, %r271, %r325;
 | 
			
		||||
	fma.rn.f32 	%r327, %r117, %r273, %r324;
 | 
			
		||||
	fma.rn.f32 	%r328, %r121, %r271, %r323;
 | 
			
		||||
	fma.rn.f32 	%r329, %r121, %r273, %r322;
 | 
			
		||||
	fma.rn.f32 	%r330, %r149, %r271, %r321;
 | 
			
		||||
	fma.rn.f32 	%r331, %r149, %r273, %r320;
 | 
			
		||||
	fma.rn.f32 	%r332, %r153, %r271, %r319;
 | 
			
		||||
	fma.rn.f32 	%r333, %r153, %r273, %r318;
 | 
			
		||||
	fma.rn.f32 	%r334, %r142, %r272, %r333;
 | 
			
		||||
	fma.rn.f32 	%r335, %r142, %r270, %r332;
 | 
			
		||||
	fma.rn.f32 	%r336, %r138, %r272, %r331;
 | 
			
		||||
	fma.rn.f32 	%r337, %r138, %r270, %r330;
 | 
			
		||||
	fma.rn.f32 	%r338, %r110, %r272, %r329;
 | 
			
		||||
	fma.rn.f32 	%r339, %r110, %r270, %r328;
 | 
			
		||||
	fma.rn.f32 	%r340, %r106, %r272, %r327;
 | 
			
		||||
	fma.rn.f32 	%r341, %r106, %r270, %r326;
 | 
			
		||||
	fma.rn.f32 	%r342, %r107, %r275, %r341;
 | 
			
		||||
	fma.rn.f32 	%r343, %r107, %r177, %r340;
 | 
			
		||||
	fma.rn.f32 	%r344, %r111, %r275, %r339;
 | 
			
		||||
	fma.rn.f32 	%r345, %r111, %r177, %r338;
 | 
			
		||||
	fma.rn.f32 	%r346, %r139, %r275, %r337;
 | 
			
		||||
	fma.rn.f32 	%r347, %r139, %r177, %r336;
 | 
			
		||||
	fma.rn.f32 	%r348, %r143, %r275, %r335;
 | 
			
		||||
	fma.rn.f32 	%r349, %r143, %r177, %r334;
 | 
			
		||||
	fma.rn.f32 	%r350, %r144, %r276, %r349;
 | 
			
		||||
	fma.rn.f32 	%r351, %r144, %r274, %r348;
 | 
			
		||||
	fma.rn.f32 	%r352, %r140, %r276, %r347;
 | 
			
		||||
	fma.rn.f32 	%r353, %r140, %r274, %r346;
 | 
			
		||||
	fma.rn.f32 	%r354, %r112, %r276, %r345;
 | 
			
		||||
	fma.rn.f32 	%r355, %r112, %r274, %r344;
 | 
			
		||||
	fma.rn.f32 	%r356, %r108, %r276, %r343;
 | 
			
		||||
	fma.rn.f32 	%r357, %r108, %r274, %r342;
 | 
			
		||||
	fma.rn.f32 	%r358, %r109, %r178, %r357;
 | 
			
		||||
	fma.rn.f32 	%r359, %r109, %r277, %r356;
 | 
			
		||||
	fma.rn.f32 	%r360, %r113, %r178, %r355;
 | 
			
		||||
	fma.rn.f32 	%r361, %r113, %r277, %r354;
 | 
			
		||||
	fma.rn.f32 	%r362, %r141, %r178, %r353;
 | 
			
		||||
	fma.rn.f32 	%r363, %r141, %r277, %r352;
 | 
			
		||||
	fma.rn.f32 	%r364, %r145, %r178, %r351;
 | 
			
		||||
	fma.rn.f32 	%r365, %r145, %r277, %r350;
 | 
			
		||||
$L__tmp1:
 | 
			
		||||
	.loc	2 110 15                        // triton_helpers.py:110:15 @[ cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:103:40 ]
 | 
			
		||||
	setp.lt.f32 	%p19, %r365, 0f00000000;
 | 
			
		||||
	setp.lt.f32 	%p20, %r364, 0f00000000;
 | 
			
		||||
	setp.lt.f32 	%p21, %r363, 0f00000000;
 | 
			
		||||
	setp.lt.f32 	%p22, %r362, 0f00000000;
 | 
			
		||||
	setp.lt.f32 	%p23, %r361, 0f00000000;
 | 
			
		||||
	setp.lt.f32 	%p24, %r360, 0f00000000;
 | 
			
		||||
	setp.lt.f32 	%p25, %r359, 0f00000000;
 | 
			
		||||
	setp.lt.f32 	%p26, %r358, 0f00000000;
 | 
			
		||||
	.loc	2 113 29                        // triton_helpers.py:113:29 @[ cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:103:40 ]
 | 
			
		||||
	selp.f32 	%r366, 0f00000000, %r358, %p26;
 | 
			
		||||
	selp.f32 	%r367, 0f00000000, %r359, %p25;
 | 
			
		||||
	selp.f32 	%r368, 0f00000000, %r360, %p24;
 | 
			
		||||
	selp.f32 	%r369, 0f00000000, %r361, %p23;
 | 
			
		||||
	selp.f32 	%r370, 0f00000000, %r362, %p22;
 | 
			
		||||
	selp.f32 	%r371, 0f00000000, %r363, %p21;
 | 
			
		||||
	selp.f32 	%r372, 0f00000000, %r364, %p20;
 | 
			
		||||
	selp.f32 	%r373, 0f00000000, %r365, %p19;
 | 
			
		||||
$L__tmp2:
 | 
			
		||||
	.loc	1 104 52                        // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:104:52
 | 
			
		||||
	shl.b32 	%r374, %r77, 4;
 | 
			
		||||
	shl.b32 	%r375, %r78, 4;
 | 
			
		||||
	.loc	1 104 49                        // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:104:49
 | 
			
		||||
	add.s32 	%r376, %r374, %r80;
 | 
			
		||||
	add.s32 	%r377, %r80, %r375;
 | 
			
		||||
	.loc	1 104 25                        // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:104:25
 | 
			
		||||
	mul.wide.s32 	%rd27, %r376, 4;
 | 
			
		||||
	add.s64 	%rd19, %rd4, %rd27;
 | 
			
		||||
	mul.wide.s32 	%rd28, %r377, 4;
 | 
			
		||||
	add.s64 	%rd20, %rd4, %rd28;
 | 
			
		||||
	.loc	1 104 78                        // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:104:78
 | 
			
		||||
	selp.f32 	%r378, %r366, %r368, %p18;
 | 
			
		||||
	or.b32 	%r379, %r183, %r67;
 | 
			
		||||
	or.b32 	%r380, %r379, %r84;
 | 
			
		||||
	shfl.sync.idx.b32 	%r381, %r378, %r380, 31, -1;
 | 
			
		||||
	selp.f32 	%r382, %r367, %r369, %p18;
 | 
			
		||||
	shfl.sync.idx.b32 	%r383, %r382, %r380, 31, -1;
 | 
			
		||||
	selp.f32 	%r384, %r368, %r366, %p18;
 | 
			
		||||
	xor.b32 	%r385, %r171, %r67;
 | 
			
		||||
	or.b32 	%r386, %r183, %r385;
 | 
			
		||||
	shfl.sync.idx.b32 	%r387, %r384, %r386, 31, -1;
 | 
			
		||||
	selp.f32 	%r388, %r369, %r367, %p18;
 | 
			
		||||
	shfl.sync.idx.b32 	%r389, %r388, %r386, 31, -1;
 | 
			
		||||
	selp.f32 	%r390, %r370, %r372, %p18;
 | 
			
		||||
	shfl.sync.idx.b32 	%r391, %r390, %r380, 31, -1;
 | 
			
		||||
	selp.f32 	%r392, %r371, %r373, %p18;
 | 
			
		||||
	shfl.sync.idx.b32 	%r393, %r392, %r380, 31, -1;
 | 
			
		||||
	selp.f32 	%r394, %r372, %r370, %p18;
 | 
			
		||||
	shfl.sync.idx.b32 	%r395, %r394, %r386, 31, -1;
 | 
			
		||||
	selp.f32 	%r396, %r373, %r371, %p18;
 | 
			
		||||
	shfl.sync.idx.b32 	%r397, %r396, %r386, 31, -1;
 | 
			
		||||
	selp.b32 	%r45, %r387, %r381, %p17;
 | 
			
		||||
	selp.b32 	%r46, %r389, %r383, %p17;
 | 
			
		||||
	selp.b32 	%r49, %r395, %r391, %p17;
 | 
			
		||||
	selp.b32 	%r50, %r397, %r393, %p17;
 | 
			
		||||
	selp.b32 	%r43, %r381, %r387, %p17;
 | 
			
		||||
	selp.b32 	%r44, %r383, %r389, %p17;
 | 
			
		||||
	// begin inline asm
 | 
			
		||||
	@%p10 st.global.v4.b32 [ %rd19 + 0 ], { %r43, %r44, %r45, %r46 };
 | 
			
		||||
	// end inline asm
 | 
			
		||||
	selp.b32 	%r47, %r391, %r395, %p17;
 | 
			
		||||
	selp.b32 	%r48, %r393, %r397, %p17;
 | 
			
		||||
	// begin inline asm
 | 
			
		||||
	@%p11 st.global.v4.b32 [ %rd20 + 0 ], { %r47, %r48, %r49, %r50 };
 | 
			
		||||
	// end inline asm
 | 
			
		||||
$L__BB0_1:                              // %common.ret
 | 
			
		||||
	.loc	1 0 0                           // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:0
 | 
			
		||||
	ret;
 | 
			
		||||
$L__tmp3:
 | 
			
		||||
$L__func_end0:
 | 
			
		||||
                                        // -- End function
 | 
			
		||||
}
 | 
			
		||||
	.file	1 "/tmp/torchinductor_shangdiy/uw/cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py"
 | 
			
		||||
	.file	2 "/home/shangdiy/pytorch/torch/_inductor/runtime/triton_helpers.py"
 | 
			
		||||
	.section	.debug_abbrev
 | 
			
		||||
	{
 | 
			
		||||
.b8 1                                   // Abbreviation Code
 | 
			
		||||
.b8 17                                  // DW_TAG_compile_unit
 | 
			
		||||
.b8 1                                   // DW_CHILDREN_yes
 | 
			
		||||
.b8 37                                  // DW_AT_producer
 | 
			
		||||
.b8 8                                   // DW_FORM_string
 | 
			
		||||
.b8 19                                  // DW_AT_language
 | 
			
		||||
.b8 5                                   // DW_FORM_data2
 | 
			
		||||
.b8 3                                   // DW_AT_name
 | 
			
		||||
.b8 8                                   // DW_FORM_string
 | 
			
		||||
.b8 16                                  // DW_AT_stmt_list
 | 
			
		||||
.b8 6                                   // DW_FORM_data4
 | 
			
		||||
.b8 27                                  // DW_AT_comp_dir
 | 
			
		||||
.b8 8                                   // DW_FORM_string
 | 
			
		||||
.b8 0                                   // EOM(1)
 | 
			
		||||
.b8 0                                   // EOM(2)
 | 
			
		||||
.b8 2                                   // Abbreviation Code
 | 
			
		||||
.b8 46                                  // DW_TAG_subprogram
 | 
			
		||||
.b8 0                                   // DW_CHILDREN_no
 | 
			
		||||
.b8 3                                   // DW_AT_name
 | 
			
		||||
.b8 8                                   // DW_FORM_string
 | 
			
		||||
.b8 32                                  // DW_AT_inline
 | 
			
		||||
.b8 11                                  // DW_FORM_data1
 | 
			
		||||
.b8 0                                   // EOM(1)
 | 
			
		||||
.b8 0                                   // EOM(2)
 | 
			
		||||
.b8 3                                   // Abbreviation Code
 | 
			
		||||
.b8 46                                  // DW_TAG_subprogram
 | 
			
		||||
.b8 1                                   // DW_CHILDREN_yes
 | 
			
		||||
.b8 17                                  // DW_AT_low_pc
 | 
			
		||||
.b8 1                                   // DW_FORM_addr
 | 
			
		||||
.b8 18                                  // DW_AT_high_pc
 | 
			
		||||
.b8 1                                   // DW_FORM_addr
 | 
			
		||||
.b8 49                                  // DW_AT_abstract_origin
 | 
			
		||||
.b8 19                                  // DW_FORM_ref4
 | 
			
		||||
.b8 0                                   // EOM(1)
 | 
			
		||||
.b8 0                                   // EOM(2)
 | 
			
		||||
.b8 4                                   // Abbreviation Code
 | 
			
		||||
.b8 29                                  // DW_TAG_inlined_subroutine
 | 
			
		||||
.b8 0                                   // DW_CHILDREN_no
 | 
			
		||||
.b8 49                                  // DW_AT_abstract_origin
 | 
			
		||||
.b8 19                                  // DW_FORM_ref4
 | 
			
		||||
.b8 17                                  // DW_AT_low_pc
 | 
			
		||||
.b8 1                                   // DW_FORM_addr
 | 
			
		||||
.b8 18                                  // DW_AT_high_pc
 | 
			
		||||
.b8 1                                   // DW_FORM_addr
 | 
			
		||||
.b8 88                                  // DW_AT_call_file
 | 
			
		||||
.b8 11                                  // DW_FORM_data1
 | 
			
		||||
.b8 89                                  // DW_AT_call_line
 | 
			
		||||
.b8 11                                  // DW_FORM_data1
 | 
			
		||||
.b8 87                                  // DW_AT_call_column
 | 
			
		||||
.b8 11                                  // DW_FORM_data1
 | 
			
		||||
.b8 0                                   // EOM(1)
 | 
			
		||||
.b8 0                                   // EOM(2)
 | 
			
		||||
.b8 0                                   // EOM(3)
 | 
			
		||||
	}
 | 
			
		||||
	.section	.debug_info
 | 
			
		||||
	{
 | 
			
		||||
.b32 195                                // Length of Unit
 | 
			
		||||
.b8 2                                   // DWARF version number
 | 
			
		||||
.b8 0
 | 
			
		||||
.b32 .debug_abbrev                      // Offset Into Abbrev. Section
 | 
			
		||||
.b8 8                                   // Address Size (in bytes)
 | 
			
		||||
.b8 1                                   // Abbrev [1] 0xb:0xbc DW_TAG_compile_unit
 | 
			
		||||
.b8 116                                 // DW_AT_producer
 | 
			
		||||
.b8 114
 | 
			
		||||
.b8 105
 | 
			
		||||
.b8 116
 | 
			
		||||
.b8 111
 | 
			
		||||
.b8 110
 | 
			
		||||
.b8 0
 | 
			
		||||
.b8 2                                   // DW_AT_language
 | 
			
		||||
.b8 0
 | 
			
		||||
.b8 99                                  // DW_AT_name
 | 
			
		||||
.b8 117
 | 
			
		||||
.b8 119
 | 
			
		||||
.b8 104
 | 
			
		||||
.b8 98
 | 
			
		||||
.b8 112
 | 
			
		||||
.b8 101
 | 
			
		||||
.b8 99
 | 
			
		||||
.b8 100
 | 
			
		||||
.b8 50
 | 
			
		||||
.b8 117
 | 
			
		||||
.b8 107
 | 
			
		||||
.b8 101
 | 
			
		||||
.b8 115
 | 
			
		||||
.b8 111
 | 
			
		||||
.b8 51
 | 
			
		||||
.b8 106
 | 
			
		||||
.b8 120
 | 
			
		||||
.b8 101
 | 
			
		||||
.b8 107
 | 
			
		||||
.b8 99
 | 
			
		||||
.b8 122
 | 
			
		||||
.b8 118
 | 
			
		||||
.b8 103
 | 
			
		||||
.b8 104
 | 
			
		||||
.b8 98
 | 
			
		||||
.b8 55
 | 
			
		||||
.b8 97
 | 
			
		||||
.b8 104
 | 
			
		||||
.b8 50
 | 
			
		||||
.b8 104
 | 
			
		||||
.b8 107
 | 
			
		||||
.b8 50
 | 
			
		||||
.b8 122
 | 
			
		||||
.b8 111
 | 
			
		||||
.b8 98
 | 
			
		||||
.b8 54
 | 
			
		||||
.b8 55
 | 
			
		||||
.b8 111
 | 
			
		||||
.b8 112
 | 
			
		||||
.b8 99
 | 
			
		||||
.b8 52
 | 
			
		||||
.b8 51
 | 
			
		||||
.b8 97
 | 
			
		||||
.b8 103
 | 
			
		||||
.b8 104
 | 
			
		||||
.b8 101
 | 
			
		||||
.b8 53
 | 
			
		||||
.b8 119
 | 
			
		||||
.b8 108
 | 
			
		||||
.b8 118
 | 
			
		||||
.b8 51
 | 
			
		||||
.b8 46
 | 
			
		||||
.b8 112
 | 
			
		||||
.b8 121
 | 
			
		||||
.b8 0
 | 
			
		||||
.b32 .debug_line                        // DW_AT_stmt_list
 | 
			
		||||
.b8 47                                  // DW_AT_comp_dir
 | 
			
		||||
.b8 116
 | 
			
		||||
.b8 109
 | 
			
		||||
.b8 112
 | 
			
		||||
.b8 47
 | 
			
		||||
.b8 116
 | 
			
		||||
.b8 111
 | 
			
		||||
.b8 114
 | 
			
		||||
.b8 99
 | 
			
		||||
.b8 104
 | 
			
		||||
.b8 105
 | 
			
		||||
.b8 110
 | 
			
		||||
.b8 100
 | 
			
		||||
.b8 117
 | 
			
		||||
.b8 99
 | 
			
		||||
.b8 116
 | 
			
		||||
.b8 111
 | 
			
		||||
.b8 114
 | 
			
		||||
.b8 95
 | 
			
		||||
.b8 115
 | 
			
		||||
.b8 104
 | 
			
		||||
.b8 97
 | 
			
		||||
.b8 110
 | 
			
		||||
.b8 103
 | 
			
		||||
.b8 100
 | 
			
		||||
.b8 105
 | 
			
		||||
.b8 121
 | 
			
		||||
.b8 47
 | 
			
		||||
.b8 117
 | 
			
		||||
.b8 119
 | 
			
		||||
.b8 0
 | 
			
		||||
.b8 2                                   // Abbrev [2] 0x70:0x28 DW_TAG_subprogram
 | 
			
		||||
.b8 109                                 // DW_AT_name
 | 
			
		||||
.b8 111
 | 
			
		||||
.b8 100
 | 
			
		||||
.b8 101
 | 
			
		||||
.b8 108
 | 
			
		||||
.b8 95
 | 
			
		||||
.b8 116
 | 
			
		||||
.b8 114
 | 
			
		||||
.b8 105
 | 
			
		||||
.b8 116
 | 
			
		||||
.b8 111
 | 
			
		||||
.b8 110
 | 
			
		||||
.b8 95
 | 
			
		||||
.b8 116
 | 
			
		||||
.b8 101
 | 
			
		||||
.b8 109
 | 
			
		||||
.b8 95
 | 
			
		||||
.b8 102
 | 
			
		||||
.b8 117
 | 
			
		||||
.b8 115
 | 
			
		||||
.b8 101
 | 
			
		||||
.b8 100
 | 
			
		||||
.b8 95
 | 
			
		||||
.b8 97
 | 
			
		||||
.b8 100
 | 
			
		||||
.b8 100
 | 
			
		||||
.b8 109
 | 
			
		||||
.b8 109
 | 
			
		||||
.b8 95
 | 
			
		||||
.b8 114
 | 
			
		||||
.b8 101
 | 
			
		||||
.b8 108
 | 
			
		||||
.b8 117
 | 
			
		||||
.b8 95
 | 
			
		||||
.b8 116
 | 
			
		||||
.b8 95
 | 
			
		||||
.b8 48
 | 
			
		||||
.b8 0
 | 
			
		||||
.b8 1                                   // DW_AT_inline
 | 
			
		||||
.b8 3                                   // Abbrev [3] 0x98:0x2e DW_TAG_subprogram
 | 
			
		||||
.b64 $L__func_begin0                    // DW_AT_low_pc
 | 
			
		||||
.b64 $L__func_end0                      // DW_AT_high_pc
 | 
			
		||||
.b32 112                                // DW_AT_abstract_origin
 | 
			
		||||
.b8 4                                   // Abbrev [4] 0xad:0x18 DW_TAG_inlined_subroutine
 | 
			
		||||
.b32 112                                // DW_AT_abstract_origin
 | 
			
		||||
.b64 $L__tmp1                           // DW_AT_low_pc
 | 
			
		||||
.b64 $L__tmp2                           // DW_AT_high_pc
 | 
			
		||||
.b8 1                                   // DW_AT_call_file
 | 
			
		||||
.b8 103                                 // DW_AT_call_line
 | 
			
		||||
.b8 40                                  // DW_AT_call_column
 | 
			
		||||
.b8 0                                   // End Of Children Mark
 | 
			
		||||
.b8 0                                   // End Of Children Mark
 | 
			
		||||
	}
 | 
			
		||||
	.section	.debug_macinfo	{	}
 | 
			
		||||
							
								
								
									
										8
									
								
								model2/data/aotinductor/model/script.ld
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										8
									
								
								model2/data/aotinductor/model/script.ld
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,8 @@
 | 
			
		||||
SECTIONS {
 | 
			
		||||
  /* By default, in LLD 16, .lrodata is placed immediately after .rodata.
 | 
			
		||||
   * However, .lrodata can be very large in our compiled models, which leads to
 | 
			
		||||
   * relocation out-of-range errors for relative relocations. So we place it
 | 
			
		||||
   * after the other sections that are referenced from .text using relative
 | 
			
		||||
   * relocations. This is the default behavior in GNU ld. */
 | 
			
		||||
  .lrodata : { *(.lrodata) }
 | 
			
		||||
 } INSERT AFTER .bss;
 | 
			
		||||
							
								
								
									
										147
									
								
								model2/main.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										147
									
								
								model2/main.cpp
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,147 @@
 | 
			
		||||
// Windows counterpart of #include <dlfcn.h>
 | 
			
		||||
#include <windows.h>
 | 
			
		||||
#include <stdio.h>
 | 
			
		||||
 | 
			
		||||
#include <iostream>
 | 
			
		||||
#include <memory>
 | 
			
		||||
#include <vector>
 | 
			
		||||
#include <string>
 | 
			
		||||
 | 
			
		||||
// Include the AOTInductor headers
 | 
			
		||||
// #include <torch/csrc/inductor/aoti_runner/model_container_runner_cuda.h>
 | 
			
		||||
#include <torch/csrc/inductor/aoti_runtime/interface.h>
 | 
			
		||||
// #include <torch/csrc/inductor/aoti_runtime/model_container.h>
 | 
			
		||||
// #include <torch/csrc/inductor/aoti_torch/tensor_converter.h> // @manual
 | 
			
		||||
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
 | 
			
		||||
#include <standalone/slim/core/Empty.h>
 | 
			
		||||
#include <standalone/slim/cuda/Guard.h>
 | 
			
		||||
#include <standalone/torch/csrc/inductor/aoti_torch/tensor_converter.h>
 | 
			
		||||
 | 
			
		||||
// Converts a NUL-terminated UTF-8 C string into a UTF-16 std::wstring using
// the Win32 MultiByteToWideChar API.
//
// s: NUL-terminated UTF-8 input.
// Returns the converted wide string with a single trailing L'\0' removed if
// one is present. No error reporting is done here: if the sizing call yields
// 0, an empty string is returned.
static std::wstring u8u16(const char* s) {
    // Sizing pass: with a null output buffer and size 0 the API only reports
    // how many wide characters are needed. Because the input length is given
    // as -1, that count is expected to include the terminating NUL
    // (see the MultiByteToWideChar documentation).
    const int required = MultiByteToWideChar(CP_UTF8, 0, s, -1, nullptr, 0);

    // Conversion pass straight into the string's own storage.
    std::wstring wide(required, L'\0');
    MultiByteToWideChar(CP_UTF8, 0, s, -1, &wide[0], required);

    // Nothing to trim when the result is empty or does not end in a NUL.
    if (wide.empty() || wide.back() != L'\0') {
        return wide;
    }

    // Drop the terminator written by the API so that size() reflects only
    // the real characters.
    wide.pop_back();
    return wide;
}
 | 
			
		||||
 | 
			
		||||
int main() {
 | 
			
		||||
  try {
 | 
			
		||||
 | 
			
		||||
    // Load the DLL (model.pyd is a DLL on Windows)
 | 
			
		||||
HMODULE handle = nullptr;
 | 
			
		||||
{
 | 
			
		||||
    auto wname = u8u16(R"(C:\Users\shangdiy\source\repos\pytorch\model2\model.pyd)");
 | 
			
		||||
 | 
			
		||||
    // Try LoadLibraryExW with safe search flags if supported
 | 
			
		||||
    if (GetProcAddress(GetModuleHandleW(L"KERNEL32.DLL"), "AddDllDirectory") != NULL) {
 | 
			
		||||
        handle = LoadLibraryExW(
 | 
			
		||||
            wname.c_str(),
 | 
			
		||||
            NULL,
 | 
			
		||||
            LOAD_LIBRARY_SEARCH_DEFAULT_DIRS);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Fallback if that failed
 | 
			
		||||
    if (!handle) {
 | 
			
		||||
        handle = LoadLibraryW(wname.c_str());
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if (!handle) {
 | 
			
		||||
        DWORD dw = GetLastError();
 | 
			
		||||
        char buf[512];
 | 
			
		||||
        FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
 | 
			
		||||
                       NULL, dw, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
 | 
			
		||||
                       buf, sizeof(buf), NULL);
 | 
			
		||||
        std::cerr << "Failed to load model.pyd. WinError " << dw << ": " << buf << std::endl;
 | 
			
		||||
        return 1;
 | 
			
		||||
    } else {
 | 
			
		||||
        std::cout << "Loaded model.pyd" << std::endl;
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
    decltype(&AOTInductorModelContainerCreateWithDevice) create_model{nullptr}; 
 | 
			
		||||
    decltype(&AOTInductorModelContainerDelete) delete_model{nullptr}; 
 | 
			
		||||
    decltype(&AOTInductorModelContainerRun) run_model{nullptr};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define AOTI_LOAD_SYMBOL(handle_, var, name_str) \
 | 
			
		||||
    var = reinterpret_cast<decltype(var)>(GetProcAddress(handle_, name_str)); \
 | 
			
		||||
    if (!var) { \
 | 
			
		||||
        throw std::runtime_error("Could not GetProcAddress " name_str); \
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
        AOTI_LOAD_SYMBOL(handle, create_model, "AOTInductorModelContainerCreateWithDevice");
 | 
			
		||||
        AOTI_LOAD_SYMBOL(handle, run_model, "AOTInductorModelContainerRun");
 | 
			
		||||
        AOTI_LOAD_SYMBOL(handle, delete_model, "AOTInductorModelContainerDelete");
 | 
			
		||||
#undef AOTI_LOAD_SYMBOL
 | 
			
		||||
 | 
			
		||||
    // Create array of input/output handles
 | 
			
		||||
        slim::SlimTensor x = slim::empty({8, 10}, c10::kFloat, c10::Device(c10::kCUDA, 0));
 | 
			
		||||
        float fill_value = 1.0;
 | 
			
		||||
        x.fill_(fill_value);
 | 
			
		||||
    // AOTInductorModel::run will steal the ownership of the input and output
 | 
			
		||||
    // tensor pointers
 | 
			
		||||
        std::vector<slim::SlimTensor> inputs = {x};
 | 
			
		||||
        std::vector<AtenTensorHandle> input_handles =
 | 
			
		||||
            unsafe_alloc_new_handles_from_tensors(inputs);
 | 
			
		||||
 | 
			
		||||
        AtenTensorHandle output_handle;
 | 
			
		||||
        AOTInductorModelContainerHandle container_handle;
 | 
			
		||||
        cudaStream_t stream = slim::cuda::getCurrentCUDAStream(0);
 | 
			
		||||
        // aoti_torch_get_current_cuda_stream(0, (void**)&stream);
 | 
			
		||||
 | 
			
		||||
        // Reinterpret as the opaque handle for AOTInductor
 | 
			
		||||
        AOTInductorStreamHandle stream_handle = reinterpret_cast<AOTInductorStreamHandle>(stream);
 | 
			
		||||
 | 
			
		||||
        // Construct model
 | 
			
		||||
       const char* cubin_dir = R"(C:\Users\shangdiy\source\repos\pytorch\model2\)";
 | 
			
		||||
        AOTIRuntimeError err =
 | 
			
		||||
            create_model(&container_handle, 1, "cuda", cubin_dir);
 | 
			
		||||
        if (err != AOTI_RUNTIME_SUCCESS) {
 | 
			
		||||
          throw std::runtime_error("Failed to create model container");
 | 
			
		||||
        } else {
 | 
			
		||||
          std::cout << "Created model\n";
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // Run the model
 | 
			
		||||
        err = run_model(container_handle, input_handles.data(),
 | 
			
		||||
                          1, // num_inputs
 | 
			
		||||
                          &output_handle,
 | 
			
		||||
                          1,       // num_outputs
 | 
			
		||||
                          stream_handle, // stream
 | 
			
		||||
                          nullptr  // proxy_executor
 | 
			
		||||
        );
 | 
			
		||||
        if (err != AOTI_RUNTIME_SUCCESS) {
 | 
			
		||||
          throw std::runtime_error("Failed to run model");
 | 
			
		||||
        } else {
 | 
			
		||||
          std::cout << "Finish model\n";
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        std::vector<slim::SlimTensor> outputs =
 | 
			
		||||
            alloc_tensors_by_stealing_from_handles(&output_handle, 1);
 | 
			
		||||
 | 
			
		||||
    // Print the result
 | 
			
		||||
    slim::SlimTensor slim_tensor = outputs[0];
 | 
			
		||||
    auto slim_cpu = slim_tensor.cpu();
 | 
			
		||||
    float *slim_data = static_cast<float *>(slim_cpu.data_ptr());
 | 
			
		||||
     std::cout << "Output" << std::endl;
 | 
			
		||||
     std::cout << "slim_data ptr: " << slim_data << "\n";
 | 
			
		||||
    size_t num_elements = slim_cpu.numel(); // or equivalent method
 | 
			
		||||
     std::cout << num_elements << std::endl;
 | 
			
		||||
 | 
			
		||||
    for (size_t i = 0; i <  num_elements; ++i) {
 | 
			
		||||
      std::cout << slim_data[i] << "\n";
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    std::cout << "Done" << std::endl;
 | 
			
		||||
 | 
			
		||||
    delete_model(container_handle);
 | 
			
		||||
    FreeLibrary(handle);
 | 
			
		||||
 | 
			
		||||
    return 0;
 | 
			
		||||
  } catch (const std::exception &e) {
 | 
			
		||||
    std::cerr << "Error: " << e.what() << std::endl;
 | 
			
		||||
    return 1;
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
							
								
								
									
										
											BIN
										
									
								
								model2/model.exp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								model2/model.exp
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								model2/model_triton_tem_fused_addmm_relu_sigmoid_t_1.cubin
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								model2/model_triton_tem_fused_addmm_relu_sigmoid_t_1.cubin
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								model2/model_triton_tem_fused_addmm_relu_t_0.cubin
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								model2/model_triton_tem_fused_addmm_relu_t_0.cubin
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
		Reference in New Issue
	
	Block a user