mirror of
				https://github.com/pytorch/pytorch.git
				synced 2025-10-31 04:04:57 +08:00 
			
		
		
		
	Compare commits
	
		
			1 Commits
		
	
	
		
			ciflow/tru
			...
			windows_li
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 8cd74b302f | 
							
								
								
									
										1
									
								
								model2/.data/serialization_id
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								model2/.data/serialization_id
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1 @@ | ||||
| 1171719005974771805808300960005001569062 | ||||
							
								
								
									
										1
									
								
								model2/.data/version
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								model2/.data/version
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1 @@ | ||||
| 6 | ||||
							
								
								
									
										33
									
								
								model2/CMakeLists.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										33
									
								
								model2/CMakeLists.txt
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,33 @@ | ||||
| # Builds the aoti_example executable from main.cpp against the TorchStandalone | ||||
| # package and the CUDA runtime/driver libraries. The compile options below are | ||||
| # MSVC (cl.exe) switches. | ||||
| cmake_minimum_required(VERSION 3.27 FATAL_ERROR) | ||||
| project(aoti_example LANGUAGES CXX) | ||||
|  | ||||
| # The language standard is selected through CMake only (no raw /std: switch in | ||||
| # the compile options), so exactly one standard option reaches the compiler. | ||||
| set(CMAKE_CXX_STANDARD 20) | ||||
| set(CMAKE_CXX_STANDARD_REQUIRED ON) | ||||
|  | ||||
| add_executable(aoti_example main.cpp) | ||||
|  | ||||
| find_package(TorchStandalone REQUIRED) | ||||
| # Set up include directories to find headers at the correct paths: both the | ||||
| # package include root and its standalone/ subdirectory are searched. | ||||
| target_include_directories(aoti_example PRIVATE | ||||
|     ${TorchStandalone_INCLUDE_DIRS} | ||||
|     ${TorchStandalone_INCLUDE_DIRS}/standalone | ||||
| ) | ||||
|  | ||||
| enable_language(CUDA) | ||||
| # Only initializes CUDA_STANDARD on targets created after this point. | ||||
| set(CMAKE_CUDA_STANDARD 17) | ||||
| # Provides the CUDA::cudart and CUDA::cuda_driver imported targets used below. | ||||
| find_package(CUDAToolkit REQUIRED) | ||||
|  | ||||
| target_compile_definitions(aoti_example PRIVATE NOMINMAX USE_CUDA) | ||||
|  | ||||
| # Add compile flags. /DLL is not listed: it is a linker switch and this target | ||||
| # is an executable. | ||||
| target_compile_options(aoti_example PRIVATE | ||||
|     /O2 | ||||
|     /MD | ||||
|     /wd4819 /wd4251 /wd4244 /wd4267 /wd4275 | ||||
|     /wd4018 /wd4190 /wd4624 /wd4067 /wd4068 | ||||
|     /EHsc | ||||
|     /Zc:__cplusplus | ||||
|     /permissive- | ||||
|     /openmp | ||||
|     /openmp:experimental | ||||
| ) | ||||
|  | ||||
| target_link_libraries(aoti_example PRIVATE | ||||
|     ${TorchStandalone_LIBRARIES}  # if you have this variable from find_package(TorchStandalone) | ||||
|     CUDA::cudart                  # CUDA runtime library | ||||
|     CUDA::cuda_driver             # CUDA driver library, resolved by FindCUDAToolkit | ||||
| ) | ||||
|  | ||||
| # Example invocation from a build directory: | ||||
| #   cmake -DTorchStandalone_DIR="<torchstandalone_install>/lib/cmake/TorchStandalone" .. | ||||
| #   cmake --build . --config Release | ||||
							
								
								
									
										1
									
								
								model2/archive_format
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								model2/archive_format
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1 @@ | ||||
| pt2 | ||||
							
								
								
									
										1
									
								
								model2/archive_version
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								model2/archive_version
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1 @@ | ||||
| 0 | ||||
							
								
								
									
										1
									
								
								model2/byteorder
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								model2/byteorder
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1 @@ | ||||
| little | ||||
							
								
								
									
										69
									
								
								model2/data/aotinductor/model/CMakeLists.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										69
									
								
								model2/data/aotinductor/model/CMakeLists.txt
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,69 @@ | ||||
|  | ||||
| # Configures the `model` shared library (emitted with a .pyd suffix). Its | ||||
| # sources and link libraries are attached further down in this file. | ||||
| cmake_minimum_required(VERSION 3.27 FATAL_ERROR) | ||||
| project(model LANGUAGES CXX) | ||||
|  | ||||
| # The language standard is selected through CMake only (no raw /std: switch in | ||||
| # the compile options), so exactly one standard option reaches the compiler. | ||||
| set(CMAKE_CXX_STANDARD 20) | ||||
| set(CMAKE_CXX_STANDARD_REQUIRED ON) | ||||
|  | ||||
| # Set a library target | ||||
| add_library(model SHARED) | ||||
|  | ||||
| find_package(TorchStandalone REQUIRED) | ||||
| # Set up include directories to find headers at the correct paths: both the | ||||
| # package include root and its standalone/ subdirectory are searched. | ||||
| target_include_directories(model PRIVATE | ||||
|     ${TorchStandalone_INCLUDE_DIRS} | ||||
|     ${TorchStandalone_INCLUDE_DIRS}/standalone | ||||
| ) | ||||
|  | ||||
| # Add macro definitions. CPU_CAPABILITY_AVX512 is left disabled. | ||||
| target_compile_definitions(model PRIVATE | ||||
|     NOMINMAX | ||||
|     TORCH_INDUCTOR_CPP_WRAPPER | ||||
|     STANDALONE_TORCH_HEADER | ||||
|     C10_USING_CUSTOM_GENERATED_MACROS | ||||
|     USE_CUDA | ||||
| ) | ||||
|  | ||||
| # Add compile flags (MSVC switches). /DLL is not listed: it is a linker | ||||
| # switch, and CMake already links a SHARED library as a DLL. | ||||
| target_compile_options(model PRIVATE | ||||
|     /O2 | ||||
|     /MD | ||||
|     /wd4819 /wd4251 /wd4244 /wd4267 /wd4275 | ||||
|     /wd4018 /wd4190 /wd4624 /wd4067 /wd4068 | ||||
|     /EHsc | ||||
|     /Zc:__cplusplus | ||||
|     /permissive- | ||||
|     /openmp | ||||
|     /openmp:experimental | ||||
| ) | ||||
|  | ||||
| enable_language(CUDA) | ||||
| # Only initializes CUDA_STANDARD on targets created after this point. | ||||
| set(CMAKE_CUDA_STANDARD 17) | ||||
| # Provides CUDAToolkit_NVCC_EXECUTABLE and the CUDA:: imported targets. | ||||
| find_package(CUDAToolkit REQUIRED) | ||||
|  | ||||
| # Make output use .pyd instead of .dll | ||||
| set_target_properties(model PROPERTIES SUFFIX ".pyd") | ||||
|  | ||||
| # Listing the module-definition file as a source lets CMake hand it to the | ||||
| # linker itself, which stays correct when the source path contains spaces | ||||
| # (a raw LINK_FLAGS string would be split at the space). | ||||
| target_sources(model PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/model_exports.def") | ||||
|  | ||||
| # Accumulators filled in by the kernel helper calls below. | ||||
| set(KERNEL_TARGETS "") | ||||
| set(KERNEL_OBJECT_FILES "") | ||||
| # embed_gpu_kernel(<KERNEL_NAME> <PTX_FILE>) | ||||
| # | ||||
| # Compiles <PTX_FILE> with nvcc into <KERNEL_NAME>.cubin inside the current | ||||
| # binary directory and creates the custom target | ||||
| # build_kernel_object_<KERNEL_NAME> that drives that step. The target name is | ||||
| # appended to KERNEL_TARGETS in the caller's scope. | ||||
| # | ||||
| # The architecture is fixed at sm_89; NVCC_GENCODE_FLAGS, if the caller has set | ||||
| # it, is passed through as additional nvcc arguments. The cubin is only built | ||||
| # here: this function does not link or embed it into any target. | ||||
| # | ||||
| # Example: | ||||
| #   embed_gpu_kernel(my_kernel ${CMAKE_CURRENT_SOURCE_DIR}/my_kernel.ptx) | ||||
| function(embed_gpu_kernel KERNEL_NAME PTX_FILE) | ||||
|     set(cubin_file "${CMAKE_CURRENT_BINARY_DIR}/${KERNEL_NAME}.cubin") | ||||
|  | ||||
|     # --- PTX to CUBIN command --- | ||||
|     # VERBATIM keeps argument escaping identical across generators/platforms. | ||||
|     add_custom_command( | ||||
|         OUTPUT "${cubin_file}" | ||||
|         COMMAND "${CUDAToolkit_NVCC_EXECUTABLE}" --cubin "${PTX_FILE}" | ||||
|                 -o "${cubin_file}" ${NVCC_GENCODE_FLAGS} | ||||
|                 -gencode arch=compute_89,code=sm_89 | ||||
|         DEPENDS "${PTX_FILE}" | ||||
|         COMMENT "Compiling ${KERNEL_NAME}.cubin from PTX" | ||||
|         VERBATIM | ||||
|     ) | ||||
|  | ||||
|     add_custom_target(build_kernel_object_${KERNEL_NAME} DEPENDS "${cubin_file}") | ||||
|  | ||||
|     # A function has its own variable scope, so the extended list must be | ||||
|     # written back to the caller explicitly. | ||||
|     list(APPEND KERNEL_TARGETS build_kernel_object_${KERNEL_NAME}) | ||||
|     set(KERNEL_TARGETS "${KERNEL_TARGETS}" PARENT_SCOPE) | ||||
| endfunction() | ||||
| # Generated C++ sources that make up the model library. | ||||
| target_sources(model PRIVATE | ||||
|     ${CMAKE_CURRENT_SOURCE_DIR}/model.wrapper.cpp | ||||
|     ${CMAKE_CURRENT_SOURCE_DIR}/model_consts.weights.cpp | ||||
| ) | ||||
|  | ||||
| # One cubin build step per Triton kernel shipped next to this file. | ||||
| embed_gpu_kernel(model_triton_tem_fused_addmm_relu_t_0 ${CMAKE_CURRENT_SOURCE_DIR}/model_triton_tem_fused_addmm_relu_t_0.ptx) | ||||
| embed_gpu_kernel(model_triton_tem_fused_addmm_relu_sigmoid_t_1 ${CMAKE_CURRENT_SOURCE_DIR}/model_triton_tem_fused_addmm_relu_sigmoid_t_1.ptx) | ||||
|  | ||||
| # Building `model` also builds every registered kernel target. The guards keep | ||||
| # the calls well-formed if a list ends up empty. | ||||
| if(KERNEL_TARGETS) | ||||
|     add_dependencies(model ${KERNEL_TARGETS}) | ||||
| endif() | ||||
| if(KERNEL_OBJECT_FILES) | ||||
|     target_link_libraries(model PRIVATE ${KERNEL_OBJECT_FILES}) | ||||
| endif() | ||||
|  | ||||
| # Add libraries. The CUDA driver is linked through the imported target from | ||||
| # find_package(CUDAToolkit) rather than by bare library name, so the linker | ||||
| # does not depend on its own search path to locate it. | ||||
| target_link_libraries(model PRIVATE | ||||
|     ${TorchStandalone_LIBRARIES} | ||||
|     CUDA::cuda_driver | ||||
|     CUDA::cudart | ||||
| ) | ||||
							
								
								
									
										1041
									
								
								model2/data/aotinductor/model/model.wrapper.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1041
									
								
								model2/data/aotinductor/model/model.wrapper.cpp
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @ -0,0 +1 @@ | ||||
| {"compiler": "/home/shangdiy/miniconda3/envs/pytorch-3.10/bin/x86_64-conda-linux-gnu-c++", "definitions": ["NOMINMAX", "TORCH_INDUCTOR_CPP_WRAPPER", "STANDALONE_TORCH_HEADER", " C10_USING_CUSTOM_GENERATED_MACROS", "CPU_CAPABILITY_AVX512", " USE_CUDA"], "include_dirs": ["/home/shangdiy/miniconda3/envs/pytorch-3.10/include/python3.10", "/home/shangdiy/miniconda3/envs/pytorch-3.10/Include", "/home/shangdiy/pytorch/torch/include", "/home/shangdiy/pytorch/torch/include/torch/csrc/api/include", "/usr/local/cuda-12/include"], "cflags": ["O2", "DLL", "MD", "std:c++20", "wd4819", "wd4251", "wd4244", "wd4267", "wd4275", "wd4018", "wd4190", "wd4624", "wd4067", "wd4068", "EHsc", "Zc:__cplusplus", "permissive-", "openmp", "openmp:experimental"], "ldflags": [], "libraries_dirs": [], "libraries": [], "passthrough_args": ["", "-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -mavx512bf16"], "aot_mode": true, "use_relative_path": false, "compile_only": true} | ||||
| @ -0,0 +1 @@ | ||||
| {"compiler": "/home/shangdiy/miniconda3/envs/pytorch-3.10/bin/x86_64-conda-linux-gnu-c++", "definitions": ["NOMINMAX", "TORCH_INDUCTOR_CPP_WRAPPER", "STANDALONE_TORCH_HEADER", " C10_USING_CUSTOM_GENERATED_MACROS", "CPU_CAPABILITY_AVX512", " USE_CUDA"], "include_dirs": ["/home/shangdiy/miniconda3/envs/pytorch-3.10/include/python3.10", "/home/shangdiy/miniconda3/envs/pytorch-3.10/Include", "/home/shangdiy/pytorch/torch/include", "/home/shangdiy/pytorch/torch/include/torch/csrc/api/include", "/usr/local/cuda-12/include"], "cflags": ["O2", "DLL", "MD", "std:c++20", "wd4819", "wd4251", "wd4244", "wd4267", "wd4275", "wd4018", "wd4190", "wd4624", "wd4067", "wd4068", "EHsc", "Zc:__cplusplus", "permissive-", "openmp", "openmp:experimental"], "ldflags": [], "libraries_dirs": ["/home/shangdiy/miniconda3/envs/pytorch-3.10/libs", "/home/shangdiy/pytorch/torch/lib", "/usr/local/cuda-12.8/targets/x86_64-linux/lib", "/usr/local/cuda-12.8/targets/x86_64-linux/lib/stubs"], "libraries": ["torch", "torch_cpu", "sleef", "c10", "c10_cuda", "cuda", "torch_cuda"], "passthrough_args": ["", "-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -mavx512bf16"], "aot_mode": true, "use_relative_path": false, "compile_only": false} | ||||
| @ -0,0 +1 @@ | ||||
| {"AOTI_DEVICE_KEY": "cuda"} | ||||
							
								
								
									
										58
									
								
								model2/data/aotinductor/model/model_consts.weights.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										58
									
								
								model2/data/aotinductor/model/model_consts.weights.cpp
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,58 @@ | ||||
| // ATTRIBUTE_NO_SANITIZE_ADDRESS expands to the no_sanitize("address")	 | ||||
| // attribute when compiling with Clang or GCC, and to nothing elsewhere.	 | ||||
| #if defined(__clang__) || defined (__GNUC__)	 | ||||
| #define ATTRIBUTE_NO_SANITIZE_ADDRESS __attribute__((no_sanitize("address")))	 | ||||
| #else	 | ||||
| #define ATTRIBUTE_NO_SANITIZE_ADDRESS	 | ||||
| #endif	 | ||||
| 	 | ||||
| ATTRIBUTE_NO_SANITIZE_ADDRESS	 | ||||
| alignas(64) extern unsigned char _binary_constants_bin_start[768] = {	 | ||||
| 69, 165, 123, 190, 252, 181, 16, 62, 242, 69, 193, 59, 44, 80, 100, 62, 	 | ||||
| 237, 163, 142, 188, 138, 139, 109, 190, 61, 248, 20, 190, 77, 84, 143, 60, 	 | ||||
| 111, 90, 163, 60, 96, 140, 230, 189, 101, 69, 38, 190, 132, 190, 12, 188, 	 | ||||
| 28, 113, 159, 190, 252, 128, 154, 62, 247, 234, 217, 61, 206, 79, 58, 61, 	 | ||||
| 224, 209, 135, 61, 211, 238, 147, 62, 231, 229, 27, 190, 169, 208, 57, 62, 	 | ||||
| 100, 20, 153, 190, 161, 160, 85, 190, 207, 10, 156, 62, 234, 107, 155, 190, 	 | ||||
| 188, 85, 116, 62, 27, 211, 114, 60, 94, 21, 158, 189, 147, 210, 34, 62, 	 | ||||
| 203, 109, 80, 62, 28, 242, 141, 189, 205, 27, 152, 190, 38, 104, 6, 189, 	 | ||||
| 211, 16, 249, 189, 72, 103, 143, 190, 163, 44, 140, 189, 178, 223, 127, 189, 	 | ||||
| 5, 112, 160, 189, 177, 55, 132, 190, 218, 22, 159, 62, 115, 30, 35, 190, 	 | ||||
| 26, 247, 9, 62, 251, 219, 27, 62, 165, 86, 135, 62, 99, 168, 66, 190, 	 | ||||
| 238, 64, 93, 62, 65, 147, 86, 62, 167, 108, 97, 62, 183, 219, 50, 190, 	 | ||||
| 138, 83, 106, 62, 90, 122, 208, 189, 149, 140, 161, 188, 44, 145, 194, 189, 	 | ||||
| 5, 142, 186, 61, 202, 230, 153, 190, 133, 72, 136, 62, 251, 1, 3, 62, 	 | ||||
| 225, 146, 54, 190, 91, 176, 219, 189, 118, 244, 10, 189, 232, 107, 142, 62, 	 | ||||
| 185, 6, 151, 62, 241, 137, 223, 61, 124, 100, 114, 190, 15, 240, 168, 189, 	 | ||||
| 149, 252, 58, 190, 238, 93, 243, 188, 144, 218, 115, 61, 159, 91, 6, 62, 	 | ||||
| 57, 139, 74, 190, 84, 200, 49, 61, 193, 78, 32, 190, 84, 121, 26, 190, 	 | ||||
| 219, 39, 115, 190, 171, 127, 94, 62, 248, 253, 177, 61, 63, 18, 127, 62, 	 | ||||
| 146, 18, 137, 189, 203, 90, 161, 190, 139, 194, 239, 58, 126, 54, 40, 190, 	 | ||||
| 47, 247, 30, 190, 106, 93, 191, 61, 22, 48, 120, 61, 56, 123, 7, 62, 	 | ||||
| 150, 229, 210, 189, 118, 231, 158, 188, 7, 98, 215, 60, 72, 251, 89, 190, 	 | ||||
| 190, 160, 137, 190, 173, 194, 158, 62, 225, 26, 118, 190, 174, 199, 4, 189, 	 | ||||
| 205, 148, 16, 62, 20, 225, 155, 61, 90, 124, 133, 190, 88, 196, 34, 61, 	 | ||||
| 26, 104, 51, 190, 149, 106, 40, 62, 25, 136, 177, 60, 169, 111, 138, 190, 	 | ||||
| 214, 181, 226, 189, 109, 17, 77, 62, 224, 166, 55, 62, 250, 128, 160, 61, 	 | ||||
| 104, 223, 250, 61, 34, 182, 210, 187, 60, 87, 149, 190, 189, 55, 98, 188, 	 | ||||
| 58, 86, 85, 190, 170, 43, 132, 190, 81, 220, 87, 190, 47, 226, 138, 62, 	 | ||||
| 189, 162, 36, 190, 30, 232, 34, 60, 138, 147, 167, 61, 151, 129, 157, 61, 	 | ||||
| 206, 33, 152, 62, 109, 227, 113, 190, 147, 255, 11, 190, 175, 56, 46, 189, 	 | ||||
| 46, 238, 1, 189, 123, 159, 85, 188, 14, 126, 148, 189, 226, 226, 169, 189, 	 | ||||
| 255, 106, 134, 61, 38, 140, 187, 60, 119, 73, 49, 62, 32, 236, 43, 62, 	 | ||||
| 78, 33, 232, 189, 72, 188, 139, 62, 94, 206, 20, 62, 25, 230, 75, 189, 	 | ||||
| 171, 239, 26, 190, 136, 218, 121, 62, 96, 115, 85, 62, 126, 92, 55, 190, 	 | ||||
| 112, 108, 134, 61, 64, 212, 69, 190, 253, 118, 214, 188, 210, 116, 66, 62, 	 | ||||
| 204, 131, 123, 190, 13, 151, 38, 190, 56, 17, 252, 189, 153, 151, 138, 62, 	 | ||||
| 21, 30, 216, 61, 146, 103, 32, 62, 140, 60, 78, 62, 183, 149, 174, 61, 	 | ||||
| 95, 153, 164, 61, 144, 167, 187, 189, 112, 53, 153, 190, 127, 195, 105, 61, 	 | ||||
| 169, 167, 251, 189, 42, 204, 123, 62, 116, 193, 86, 62, 98, 147, 30, 61, 	 | ||||
| 176, 138, 137, 62, 245, 244, 17, 62, 201, 90, 140, 62, 177, 110, 77, 62, 	 | ||||
| 188, 31, 129, 190, 66, 203, 85, 62, 182, 209, 112, 188, 216, 91, 222, 59, 	 | ||||
| 18, 208, 131, 189, 151, 142, 150, 190, 36, 252, 31, 62, 241, 2, 180, 61, 	 | ||||
| 83, 240, 159, 62, 37, 152, 115, 190, 13, 52, 107, 62, 169, 178, 148, 62, 	 | ||||
| 171, 54, 38, 62, 33, 4, 199, 189, 201, 247, 216, 189, 225, 89, 146, 190, 	 | ||||
| 192, 118, 79, 189, 92, 171, 12, 62, 136, 235, 3, 62, 180, 202, 87, 62, 	 | ||||
| 8, 129, 122, 61, 160, 75, 170, 188, 20, 84, 6, 62, 60, 194, 56, 190, 	 | ||||
| 182, 99, 44, 190, 88, 96, 228, 189, 50, 106, 5, 190, 34, 133, 12, 190, 	 | ||||
| 26, 50, 0, 190, 176, 25, 127, 61, 48, 69, 219, 61, 192, 237, 252, 187, 	 | ||||
| };	 | ||||
| alignas(64) extern unsigned char * _binary_constants_bin_end;	 | ||||
							
								
								
									
										6
									
								
								model2/data/aotinductor/model/model_exports.def
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										6
									
								
								model2/data/aotinductor/model/model_exports.def
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,6 @@ | ||||
| ; Module-definition file: names the library and lists the functions it exports. | ||||
| LIBRARY model | ||||
| EXPORTS | ||||
|     AOTInductorModelContainerCreate | ||||
|     AOTInductorModelContainerCreateWithDevice | ||||
|     AOTInductorModelContainerRun | ||||
|     AOTInductorModelContainerDelete | ||||
										
											Binary file not shown.
										
									
								
							| @ -0,0 +1,684 @@ | ||||
| // | ||||
| // Generated by LLVM NVPTX Back-End | ||||
| // | ||||
|  | ||||
| .version 8.7 | ||||
| .target sm_89 | ||||
| .address_size 64 | ||||
|  | ||||
| 	// .globl	model_triton_tem_fused_addmm_relu_sigmoid_t_1 // -- Begin function model_triton_tem_fused_addmm_relu_sigmoid_t_1 | ||||
| .extern .shared .align 16 .b8 global_smem[]; | ||||
|                                         // @model_triton_tem_fused_addmm_relu_sigmoid_t_1 | ||||
| .visible .entry model_triton_tem_fused_addmm_relu_sigmoid_t_1( | ||||
| 	.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_0, | ||||
| 	.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_1, | ||||
| 	.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_2, | ||||
| 	.param .u32 model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_3, | ||||
| 	.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_4 | ||||
| ) | ||||
| .reqntid 32 | ||||
| { | ||||
| 	.reg .pred 	%p<12>; | ||||
| 	.reg .b32 	%r<375>; | ||||
| 	.reg .b64 	%rd<27>; | ||||
| 	.loc	1 18 0                          // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:18:0 | ||||
| $L__func_begin0: | ||||
| 	.loc	1 18 0                          // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:18:0 | ||||
|  | ||||
| // %bb.0: | ||||
| 	ld.param.b32 	%r1, [model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_3]; | ||||
| $L__tmp0: | ||||
| 	.loc	1 34 16                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:34:16 | ||||
| 	setp.ne.s32 	%p1, %r1, 0; | ||||
| 	@%p1 bra 	$L__BB0_2; | ||||
| 	bra.uni 	$L__BB0_1; | ||||
| $L__BB0_2: | ||||
| 	.loc	1 0 16                          // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:0:16 | ||||
| 	ld.param.b64 	%rd3, [model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_2]; | ||||
| 	ld.param.b64 	%rd2, [model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_1]; | ||||
| 	ld.param.b64 	%rd1, [model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_0]; | ||||
| 	.loc	1 43 24                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:43:24 | ||||
| 	mov.u32 	%r26, %ctaid.x; | ||||
| 	.loc	1 44 28                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:44:28 | ||||
| 	add.s32 	%r27, %r1, 15; | ||||
| 	.loc	1 44 34                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:44:34 | ||||
| 	shr.s32 	%r28, %r27, 31; | ||||
| 	shr.u32 	%r29, %r28, 28; | ||||
| 	add.s32 	%r30, %r27, %r29; | ||||
| 	shr.s32 	%r31, %r30, 4; | ||||
| 	.loc	1 50 41                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:50:41 | ||||
| 	and.b32 	%r32, %r26, 2147483640; | ||||
| 	.loc	1 50 30                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:50:30 | ||||
| 	sub.s32 	%r33, %r31, %r32; | ||||
| 	.loc	1 50 50                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:50:50 | ||||
| 	min.s32 	%r34, %r33, 8; | ||||
| 	.loc	1 51 40                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:51:40 | ||||
| 	rem.s32 	%r35, %r26, %r34; | ||||
| 	.loc	1 51 34                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:51:34 | ||||
| 	add.s32 	%r36, %r35, %r32; | ||||
| 	.loc	1 52 19                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:52:19 | ||||
| 	and.b32 	%r37, %r26, 7; | ||||
| 	.loc	1 52 30                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:52:30 | ||||
| 	div.s32 	%r38, %r37, %r34; | ||||
| 	.loc	1 56 17                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:56:17 | ||||
| 	shl.b32 	%r39, %r36, 4; | ||||
| 	.loc	1 56 40                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:56:40 | ||||
| 	mov.u32 	%r40, %tid.x; | ||||
| 	shr.u32 	%r41, %r40, 2; | ||||
| 	and.b32 	%r42, %r41, 2; | ||||
| 	bfe.u32 	%r43, %r40, 2, 2; | ||||
| 	and.b32 	%r44, %r40, 16; | ||||
| 	shr.u32 	%r45, %r44, 2; | ||||
| 	or.b32 	%r46, %r43, %r45; | ||||
| 	and.b32 	%r47, %r40, 15; | ||||
| 	bfe.u32 	%r48, %r40, 4, 1; | ||||
| 	.loc	1 56 27                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:56:27 | ||||
| 	or.b32 	%r49, %r39, %r46; | ||||
| 	or.b32 	%r50, %r49, 8; | ||||
| 	or.b32 	%r51, %r39, %r47; | ||||
| 	.loc	1 0 0                           // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:0 | ||||
| 	rem.s32 	%r52, %r50, %r1; | ||||
| 	rem.s32 	%r53, %r49, %r1; | ||||
| 	.loc	1 71 30                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:71:30 | ||||
| 	shl.b32 	%r54, %r40, 2; | ||||
| 	and.b32 	%r55, %r54, 12; | ||||
| 	.loc	1 76 28                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:76:28 | ||||
| 	shl.b32 	%r56, %r53, 4; | ||||
| 	shl.b32 	%r57, %r52, 4; | ||||
| 	.loc	1 76 25                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:76:25 | ||||
| 	or.b32 	%r58, %r56, %r55; | ||||
| 	or.b32 	%r59, %r57, %r55; | ||||
| 	.loc	1 77 25                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:77:25 | ||||
| 	mul.wide.s32 	%rd16, %r58, 4; | ||||
| 	add.s64 	%rd4, %rd1, %rd16; | ||||
| 	mul.wide.s32 	%rd17, %r59, 4; | ||||
| 	add.s64 	%rd5, %rd1, %rd17; | ||||
| 	.loc	1 77 20                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:77:20 | ||||
| 	// begin inline asm | ||||
| 	mov.u32 %r2, 0x0; | ||||
| 	mov.u32 %r3, 0x0; | ||||
| 	mov.u32 %r4, 0x0; | ||||
| 	mov.u32 %r5, 0x0; | ||||
| 	ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd4 + 0 ]; | ||||
| 	// end inline asm | ||||
| 	// begin inline asm | ||||
| 	mov.u32 %r6, 0x0; | ||||
| 	mov.u32 %r7, 0x0; | ||||
| 	mov.u32 %r8, 0x0; | ||||
| 	mov.u32 %r9, 0x0; | ||||
| 	ld.global.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd5 + 0 ]; | ||||
| 	// end inline asm | ||||
| 	shl.b32 	%r60, %r46, 6; | ||||
| 	shl.b32 	%r61, %r55, 2; | ||||
| 	or.b32 	%r62, %r60, %r61; | ||||
| 	mov.b32 	%r63, global_smem; | ||||
| 	add.s32 	%r64, %r63, %r62; | ||||
| 	st.shared.v4.b32 	[%r64], {%r2, %r3, %r4, %r5}; | ||||
| 	st.shared.v4.b32 	[%r64+512], {%r6, %r7, %r8, %r9}; | ||||
| 	.loc	1 82 25                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:82:25 | ||||
| 	mul.wide.u32 	%rd18, %r55, 4; | ||||
| 	add.s64 	%rd6, %rd2, %rd18; | ||||
| 	.loc	1 82 20                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:82:20 | ||||
| 	// begin inline asm | ||||
| 	mov.u32 %r10, 0x0; | ||||
| 	mov.u32 %r11, 0x0; | ||||
| 	mov.u32 %r12, 0x0; | ||||
| 	mov.u32 %r13, 0x0; | ||||
| 	ld.global.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd6 + 0 ]; | ||||
| 	// end inline asm | ||||
| 	// begin inline asm | ||||
| 	mov.u32 %r14, 0x0; | ||||
| 	mov.u32 %r15, 0x0; | ||||
| 	mov.u32 %r16, 0x0; | ||||
| 	mov.u32 %r17, 0x0; | ||||
| 	ld.global.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd6 + 0 ]; | ||||
| 	// end inline asm | ||||
| 	add.s32 	%r65, %r63, 1024; | ||||
| 	add.s32 	%r66, %r65, %r62; | ||||
| 	st.shared.v4.b32 	[%r66], {%r10, %r11, %r12, %r13}; | ||||
| 	st.shared.v4.b32 	[%r66+512], {%r14, %r15, %r16, %r17}; | ||||
| 	.loc	1 90 17                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:90:17 | ||||
| 	shl.b32 	%r67, %r38, 4; | ||||
| 	.loc	1 90 27                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:90:27 | ||||
| 	or.b32 	%r68, %r67, %r48; | ||||
| 	.loc	1 93 20                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:93:20 | ||||
| 	setp.lt.s32 	%p10, %r51, %r1; | ||||
| 	.loc	1 93 34                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:93:34 | ||||
| 	setp.eq.s32 	%p11, %r68, 0; | ||||
| 	.loc	1 93 26                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:93:26 | ||||
| 	and.pred 	%p2, %p10, %p11; | ||||
| 	.loc	1 96 21                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:96:21 | ||||
| 	add.s32 	%r69, %r51, %r68; | ||||
| 	add.s32 	%r70, %r69, 2; | ||||
| 	add.s32 	%r71, %r69, 4; | ||||
| 	add.s32 	%r72, %r69, 6; | ||||
| 	add.s32 	%r73, %r69, 8; | ||||
| 	add.s32 	%r74, %r69, 10; | ||||
| 	add.s32 	%r75, %r69, 12; | ||||
| 	add.s32 	%r76, %r69, 14; | ||||
| 	.loc	1 77 20                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:77:20 | ||||
| 	bar.sync 	0; | ||||
| 	or.b32 	%r77, %r42, %r45; | ||||
| 	shl.b32 	%r78, %r77, 6; | ||||
| 	add.s32 	%r79, %r63, %r78; | ||||
| 	ld.shared.v4.b32 	{%r80, %r81, %r82, %r83}, [%r79+112]; | ||||
| 	ld.shared.v4.b32 	{%r84, %r85, %r86, %r87}, [%r79+96]; | ||||
| 	ld.shared.v4.b32 	{%r88, %r89, %r90, %r91}, [%r79+80]; | ||||
| 	ld.shared.v4.b32 	{%r92, %r93, %r94, %r95}, [%r79+64]; | ||||
| 	ld.shared.v4.b32 	{%r96, %r97, %r98, %r99}, [%r79+48]; | ||||
| 	ld.shared.v4.b32 	{%r100, %r101, %r102, %r103}, [%r79+32]; | ||||
| 	ld.shared.v4.b32 	{%r104, %r105, %r106, %r107}, [%r79+16]; | ||||
| 	ld.shared.v4.b32 	{%r108, %r109, %r110, %r111}, [%r79]; | ||||
| 	ld.shared.v4.b32 	{%r112, %r113, %r114, %r115}, [%r79+624]; | ||||
| 	ld.shared.v4.b32 	{%r116, %r117, %r118, %r119}, [%r79+608]; | ||||
| 	ld.shared.v4.b32 	{%r120, %r121, %r122, %r123}, [%r79+592]; | ||||
| 	ld.shared.v4.b32 	{%r124, %r125, %r126, %r127}, [%r79+576]; | ||||
| 	ld.shared.v4.b32 	{%r128, %r129, %r130, %r131}, [%r79+560]; | ||||
| 	ld.shared.v4.b32 	{%r132, %r133, %r134, %r135}, [%r79+544]; | ||||
| 	ld.shared.v4.b32 	{%r136, %r137, %r138, %r139}, [%r79+528]; | ||||
| 	ld.shared.v4.b32 	{%r140, %r141, %r142, %r143}, [%r79+512]; | ||||
| 	.loc	1 82 20                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:82:20 | ||||
| 	shl.b32 	%r144, %r40, 1; | ||||
| 	and.b32 	%r145, %r144, 14; | ||||
| 	shl.b32 	%r146, %r145, 6; | ||||
| 	add.s32 	%r147, %r65, %r146; | ||||
| 	ld.shared.v4.b32 	{%r148, %r149, %r150, %r151}, [%r147]; | ||||
| 	ld.shared.v4.b32 	{%r152, %r153, %r154, %r155}, [%r147+64]; | ||||
| 	ld.shared.v4.b32 	{%r156, %r157, %r158, %r159}, [%r147+16]; | ||||
| 	ld.shared.v4.b32 	{%r160, %r161, %r162, %r163}, [%r147+80]; | ||||
| 	ld.shared.v4.b32 	{%r164, %r165, %r166, %r167}, [%r147+32]; | ||||
| 	ld.shared.v4.b32 	{%r168, %r169, %r170, %r171}, [%r147+96]; | ||||
| 	ld.shared.v4.b32 	{%r172, %r173, %r174, %r175}, [%r147+48]; | ||||
| 	ld.shared.v4.b32 	{%r176, %r177, %r178, %r179}, [%r147+112]; | ||||
| 	.loc	1 85 25                         // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:85:25 | ||||
| 	fma.rn.f32 	%r180, %r108, %r148, 0fBE0AE428; | ||||
| 	fma.rn.f32 	%r181, %r109, %r149, %r180; | ||||
| 	fma.rn.f32 	%r182, %r110, %r150, %r181; | ||||
| 	fma.rn.f32 	%r183, %r111, %r151, %r182; | ||||
| 	fma.rn.f32 	%r184, %r104, %r156, %r183; | ||||
| 	fma.rn.f32 	%r185, %r105, %r157, %r184; | ||||
| 	fma.rn.f32 	%r186, %r106, %r158, %r185; | ||||
| 	fma.rn.f32 	%r187, %r107, %r159, %r186; | ||||
| 	fma.rn.f32 	%r188, %r100, %r164, %r187; | ||||
| 	fma.rn.f32 	%r189, %r101, %r165, %r188; | ||||
| 	fma.rn.f32 	%r190, %r102, %r166, %r189; | ||||
| 	fma.rn.f32 	%r191, %r103, %r167, %r190; | ||||
| 	fma.rn.f32 	%r192, %r96, %r172, %r191; | ||||
| 	fma.rn.f32 	%r193, %r97, %r173, %r192; | ||||
| 	fma.rn.f32 	%r194, %r98, %r174, %r193; | ||||
| 	fma.rn.f32 	%r195, %r99, %r175, %r194; | ||||
| 	fma.rn.f32 	%r196, %r108, %r152, 0fBE0AE428; | ||||
| 	fma.rn.f32 	%r197, %r109, %r153, %r196; | ||||
| 	fma.rn.f32 	%r198, %r110, %r154, %r197; | ||||
| 	fma.rn.f32 	%r199, %r111, %r155, %r198; | ||||
| 	fma.rn.f32 	%r200, %r104, %r160, %r199; | ||||
| 	fma.rn.f32 	%r201, %r105, %r161, %r200; | ||||
| 	fma.rn.f32 	%r202, %r106, %r162, %r201; | ||||
| 	fma.rn.f32 	%r203, %r107, %r163, %r202; | ||||
| 	fma.rn.f32 	%r204, %r100, %r168, %r203; | ||||
| 	fma.rn.f32 	%r205, %r101, %r169, %r204; | ||||
| 	fma.rn.f32 	%r206, %r102, %r170, %r205; | ||||
| 	fma.rn.f32 	%r207, %r103, %r171, %r206; | ||||
| 	fma.rn.f32 	%r208, %r96, %r176, %r207; | ||||
| 	fma.rn.f32 	%r209, %r97, %r177, %r208; | ||||
| 	fma.rn.f32 	%r210, %r98, %r178, %r209; | ||||
| 	fma.rn.f32 	%r211, %r99, %r179, %r210; | ||||
| 	fma.rn.f32 	%r212, %r92, %r148, 0fBE0AE428; | ||||
| 	fma.rn.f32 	%r213, %r93, %r149, %r212; | ||||
| 	fma.rn.f32 	%r214, %r94, %r150, %r213; | ||||
| 	fma.rn.f32 	%r215, %r95, %r151, %r214; | ||||
| 	fma.rn.f32 	%r216, %r88, %r156, %r215; | ||||
| 	fma.rn.f32 	%r217, %r89, %r157, %r216; | ||||
| 	fma.rn.f32 	%r218, %r90, %r158, %r217; | ||||
| 	fma.rn.f32 	%r219, %r91, %r159, %r218; | ||||
| 	fma.rn.f32 	%r220, %r84, %r164, %r219; | ||||
| 	fma.rn.f32 	%r221, %r85, %r165, %r220; | ||||
| 	fma.rn.f32 	%r222, %r86, %r166, %r221; | ||||
| 	fma.rn.f32 	%r223, %r87, %r167, %r222; | ||||
| 	fma.rn.f32 	%r224, %r80, %r172, %r223; | ||||
| 	fma.rn.f32 	%r225, %r81, %r173, %r224; | ||||
| 	fma.rn.f32 	%r226, %r82, %r174, %r225; | ||||
| 	fma.rn.f32 	%r227, %r83, %r175, %r226; | ||||
| 	fma.rn.f32 	%r228, %r92, %r152, 0fBE0AE428; | ||||
| 	fma.rn.f32 	%r229, %r93, %r153, %r228; | ||||
| 	fma.rn.f32 	%r230, %r94, %r154, %r229; | ||||
| 	fma.rn.f32 	%r231, %r95, %r155, %r230; | ||||
| 	fma.rn.f32 	%r232, %r88, %r160, %r231; | ||||
| 	fma.rn.f32 	%r233, %r89, %r161, %r232; | ||||
| 	fma.rn.f32 	%r234, %r90, %r162, %r233; | ||||
| 	fma.rn.f32 	%r235, %r91, %r163, %r234; | ||||
| 	fma.rn.f32 	%r236, %r84, %r168, %r235; | ||||
| 	fma.rn.f32 	%r237, %r85, %r169, %r236; | ||||
| 	fma.rn.f32 	%r238, %r86, %r170, %r237; | ||||
| 	fma.rn.f32 	%r239, %r87, %r171, %r238; | ||||
| 	fma.rn.f32 	%r240, %r80, %r176, %r239; | ||||
| 	fma.rn.f32 	%r241, %r81, %r177, %r240; | ||||
| 	fma.rn.f32 	%r242, %r82, %r178, %r241; | ||||
| 	fma.rn.f32 	%r243, %r83, %r179, %r242; | ||||
| 	fma.rn.f32 	%r244, %r140, %r148, 0fBE0AE428; | ||||
| 	fma.rn.f32 	%r245, %r141, %r149, %r244; | ||||
| 	fma.rn.f32 	%r246, %r142, %r150, %r245; | ||||
| 	fma.rn.f32 	%r247, %r143, %r151, %r246; | ||||
| 	fma.rn.f32 	%r248, %r136, %r156, %r247; | ||||
| 	fma.rn.f32 	%r249, %r137, %r157, %r248; | ||||
| 	fma.rn.f32 	%r250, %r138, %r158, %r249; | ||||
| 	fma.rn.f32 	%r251, %r139, %r159, %r250; | ||||
| 	fma.rn.f32 	%r252, %r132, %r164, %r251; | ||||
| 	fma.rn.f32 	%r253, %r133, %r165, %r252; | ||||
| 	fma.rn.f32 	%r254, %r134, %r166, %r253; | ||||
| 	fma.rn.f32 	%r255, %r135, %r167, %r254; | ||||
| 	fma.rn.f32 	%r256, %r128, %r172, %r255; | ||||
| 	fma.rn.f32 	%r257, %r129, %r173, %r256; | ||||
| 	fma.rn.f32 	%r258, %r130, %r174, %r257; | ||||
| 	fma.rn.f32 	%r259, %r131, %r175, %r258; | ||||
| 	fma.rn.f32 	%r260, %r140, %r152, 0fBE0AE428; | ||||
| 	fma.rn.f32 	%r261, %r141, %r153, %r260; | ||||
| 	fma.rn.f32 	%r262, %r142, %r154, %r261; | ||||
| 	fma.rn.f32 	%r263, %r143, %r155, %r262; | ||||
| 	fma.rn.f32 	%r264, %r136, %r160, %r263; | ||||
| 	fma.rn.f32 	%r265, %r137, %r161, %r264; | ||||
| 	fma.rn.f32 	%r266, %r138, %r162, %r265; | ||||
| 	fma.rn.f32 	%r267, %r139, %r163, %r266; | ||||
| 	fma.rn.f32 	%r268, %r132, %r168, %r267; | ||||
| 	fma.rn.f32 	%r269, %r133, %r169, %r268; | ||||
| 	fma.rn.f32 	%r270, %r134, %r170, %r269; | ||||
| 	fma.rn.f32 	%r271, %r135, %r171, %r270; | ||||
| 	fma.rn.f32 	%r272, %r128, %r176, %r271; | ||||
| 	fma.rn.f32 	%r273, %r129, %r177, %r272; | ||||
| 	fma.rn.f32 	%r274, %r130, %r178, %r273; | ||||
| 	fma.rn.f32 	%r275, %r131, %r179, %r274; | ||||
| 	fma.rn.f32 	%r276, %r124, %r148, 0fBE0AE428; | ||||
| 	fma.rn.f32 	%r277, %r125, %r149, %r276; | ||||
| 	fma.rn.f32 	%r278, %r126, %r150, %r277; | ||||
| 	fma.rn.f32 	%r279, %r127, %r151, %r278; | ||||
| 	fma.rn.f32 	%r280, %r120, %r156, %r279; | ||||
| 	fma.rn.f32 	%r281, %r121, %r157, %r280; | ||||
| 	fma.rn.f32 	%r282, %r122, %r158, %r281; | ||||
| 	fma.rn.f32 	%r283, %r123, %r159, %r282; | ||||
| 	fma.rn.f32 	%r284, %r116, %r164, %r283; | ||||
| 	fma.rn.f32 	%r285, %r117, %r165, %r284; | ||||
| 	fma.rn.f32 	%r286, %r118, %r166, %r285; | ||||
| 	fma.rn.f32 	%r287, %r119, %r167, %r286; | ||||
| 	fma.rn.f32 	%r288, %r112, %r172, %r287; | ||||
| 	fma.rn.f32 	%r289, %r113, %r173, %r288; | ||||
| 	fma.rn.f32 	%r290, %r114, %r174, %r289; | ||||
| 	fma.rn.f32 	%r291, %r115, %r175, %r290; | ||||
| 	fma.rn.f32 	%r292, %r124, %r152, 0fBE0AE428; | ||||
| 	fma.rn.f32 	%r293, %r125, %r153, %r292; | ||||
| 	fma.rn.f32 	%r294, %r126, %r154, %r293; | ||||
| 	fma.rn.f32 	%r295, %r127, %r155, %r294; | ||||
| 	fma.rn.f32 	%r296, %r120, %r160, %r295; | ||||
| 	fma.rn.f32 	%r297, %r121, %r161, %r296; | ||||
| 	fma.rn.f32 	%r298, %r122, %r162, %r297; | ||||
| 	fma.rn.f32 	%r299, %r123, %r163, %r298; | ||||
| 	fma.rn.f32 	%r300, %r116, %r168, %r299; | ||||
| 	fma.rn.f32 	%r301, %r117, %r169, %r300; | ||||
| 	fma.rn.f32 	%r302, %r118, %r170, %r301; | ||||
| 	fma.rn.f32 	%r303, %r119, %r171, %r302; | ||||
| 	fma.rn.f32 	%r304, %r112, %r176, %r303; | ||||
| 	fma.rn.f32 	%r305, %r113, %r177, %r304; | ||||
| 	fma.rn.f32 	%r306, %r114, %r178, %r305; | ||||
| 	fma.rn.f32 	%r307, %r115, %r179, %r306; | ||||
| 	mov.b32 	%r308, 0f00000000; | ||||
| $L__tmp1: | ||||
| 	.loc	2 47 30                         // standard.py:47:30 @[ cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:99:22 ] | ||||
| 	sub.f32 	%r309, %r308, %r195; | ||||
| 	sub.f32 	%r310, %r308, %r211; | ||||
| 	sub.f32 	%r311, %r308, %r227; | ||||
| 	sub.f32 	%r312, %r308, %r243; | ||||
| 	sub.f32 	%r313, %r308, %r259; | ||||
| 	sub.f32 	%r314, %r308, %r275; | ||||
| 	sub.f32 	%r315, %r308, %r291; | ||||
| 	sub.f32 	%r316, %r308, %r307; | ||||
| 	.loc	2 47 29                         // standard.py:47:29 @[ cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:99:22 ] | ||||
| 	mul.f32 	%r317, %r309, 0f3FB8AA3B; | ||||
| 	ex2.approx.f32 	%r318, %r317; | ||||
| 	mul.f32 	%r319, %r310, 0f3FB8AA3B; | ||||
| 	ex2.approx.f32 	%r320, %r319; | ||||
| 	mul.f32 	%r321, %r311, 0f3FB8AA3B; | ||||
| 	ex2.approx.f32 	%r322, %r321; | ||||
| 	mul.f32 	%r323, %r312, 0f3FB8AA3B; | ||||
| 	ex2.approx.f32 	%r324, %r323; | ||||
| 	mul.f32 	%r325, %r313, 0f3FB8AA3B; | ||||
| 	ex2.approx.f32 	%r326, %r325; | ||||
| 	mul.f32 	%r327, %r314, 0f3FB8AA3B; | ||||
| 	ex2.approx.f32 	%r328, %r327; | ||||
| 	mul.f32 	%r329, %r315, 0f3FB8AA3B; | ||||
| 	ex2.approx.f32 	%r330, %r329; | ||||
| 	mul.f32 	%r331, %r316, 0f3FB8AA3B; | ||||
| 	ex2.approx.f32 	%r332, %r331; | ||||
| 	.loc	2 47 20                         // standard.py:47:20 @[ cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:99:22 ] | ||||
| 	add.f32 	%r333, %r318, 0f3F800000; | ||||
| 	add.f32 	%r334, %r320, 0f3F800000; | ||||
| 	add.f32 	%r335, %r322, 0f3F800000; | ||||
| 	add.f32 	%r336, %r324, 0f3F800000; | ||||
| 	add.f32 	%r337, %r326, 0f3F800000; | ||||
| 	add.f32 	%r338, %r328, 0f3F800000; | ||||
| 	add.f32 	%r339, %r330, 0f3F800000; | ||||
| 	add.f32 	%r340, %r332, 0f3F800000; | ||||
| 	mov.b32 	%r341, 0f3F800000; | ||||
| 	.loc	2 47 16                         // standard.py:47:16 @[ cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:99:22 ] | ||||
| 	div.full.f32 	%r342, %r341, %r333; | ||||
| 	div.full.f32 	%r343, %r341, %r334; | ||||
| 	div.full.f32 	%r344, %r341, %r335; | ||||
| 	div.full.f32 	%r345, %r341, %r336; | ||||
| 	div.full.f32 	%r346, %r341, %r337; | ||||
| 	div.full.f32 	%r347, %r341, %r338; | ||||
| 	div.full.f32 	%r348, %r341, %r339; | ||||
| 	div.full.f32 	%r349, %r341, %r340; | ||||
| $L__tmp2: | ||||
| 	.loc	1 100 25                        // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:100:25 | ||||
| 	mul.wide.s32 	%rd19, %r69, 4; | ||||
| 	add.s64 	%rd8, %rd3, %rd19; | ||||
| 	mul.wide.s32 	%rd20, %r70, 4; | ||||
| 	add.s64 	%rd9, %rd3, %rd20; | ||||
| 	mul.wide.s32 	%rd21, %r71, 4; | ||||
| 	add.s64 	%rd10, %rd3, %rd21; | ||||
| 	mul.wide.s32 	%rd22, %r72, 4; | ||||
| 	add.s64 	%rd11, %rd3, %rd22; | ||||
| 	mul.wide.s32 	%rd23, %r73, 4; | ||||
| 	add.s64 	%rd12, %rd3, %rd23; | ||||
| 	mul.wide.s32 	%rd24, %r74, 4; | ||||
| 	add.s64 	%rd13, %rd3, %rd24; | ||||
| 	mul.wide.s32 	%rd25, %r75, 4; | ||||
| 	add.s64 	%rd14, %rd3, %rd25; | ||||
| 	mul.wide.s32 	%rd26, %r76, 4; | ||||
| 	add.s64 	%rd15, %rd3, %rd26; | ||||
| 	.loc	1 100 68                        // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:100:68 | ||||
| 	bar.sync 	0; | ||||
| 	and.b32 	%r350, %r54, 96; | ||||
| 	or.b32 	%r351, %r145, %r350; | ||||
| 	shl.b32 	%r352, %r40, 4; | ||||
| 	and.b32 	%r353, %r352, 240; | ||||
| 	shr.u32 	%r354, %r350, 1; | ||||
| 	add.s32 	%r355, %r63, %r354; | ||||
| 	shl.b32 	%r356, %r351, 2; | ||||
| 	add.s32 	%r357, %r355, %r356; | ||||
| 	st.shared.v2.b32 	[%r357], {%r342, %r343}; | ||||
| 	or.b32 	%r358, %r350, 16; | ||||
| 	shr.u32 	%r359, %r358, 1; | ||||
| 	add.s32 	%r360, %r63, %r359; | ||||
| 	add.s32 	%r361, %r360, %r356; | ||||
| 	st.shared.v2.b32 	[%r361+64], {%r344, %r345}; | ||||
| 	or.b32 	%r362, %r350, 128; | ||||
| 	shr.u32 	%r363, %r362, 1; | ||||
| 	add.s32 	%r364, %r63, %r363; | ||||
| 	add.s32 	%r365, %r364, %r356; | ||||
| 	st.shared.v2.b32 	[%r365+512], {%r346, %r347}; | ||||
| 	or.b32 	%r366, %r350, 144; | ||||
| 	shr.u32 	%r367, %r366, 1; | ||||
| 	add.s32 	%r368, %r63, %r367; | ||||
| 	add.s32 	%r369, %r368, %r356; | ||||
| 	st.shared.v2.b32 	[%r369+576], {%r348, %r349}; | ||||
| 	bar.sync 	0; | ||||
| 	shr.u32 	%r370, %r353, 1; | ||||
| 	add.s32 	%r371, %r63, %r370; | ||||
| 	shl.b32 	%r372, %r353, 2; | ||||
| 	add.s32 	%r373, %r371, %r372; | ||||
| 	add.s32 	%r374, %r373, %r45; | ||||
| 	ld.shared.b32 	%r18, [%r374]; | ||||
| 	ld.shared.b32 	%r19, [%r374+8]; | ||||
| 	ld.shared.b32 	%r20, [%r374+16]; | ||||
| 	ld.shared.b32 	%r21, [%r374+24]; | ||||
| 	ld.shared.b32 	%r22, [%r374+32]; | ||||
| 	ld.shared.b32 	%r23, [%r374+40]; | ||||
| 	ld.shared.b32 	%r24, [%r374+48]; | ||||
| 	ld.shared.b32 	%r25, [%r374+56]; | ||||
| 	// begin inline asm | ||||
| 	@%p2 st.global.b32 [ %rd8 + 0 ], { %r18 }; | ||||
| 	// end inline asm | ||||
| 	mov.pred 	%p3, 0; | ||||
| 	// begin inline asm | ||||
| 	@%p3 st.global.b32 [ %rd9 + 0 ], { %r19 }; | ||||
| 	// end inline asm | ||||
| 	// begin inline asm | ||||
| 	@%p3 st.global.b32 [ %rd10 + 0 ], { %r20 }; | ||||
| 	// end inline asm | ||||
| 	// begin inline asm | ||||
| 	@%p3 st.global.b32 [ %rd11 + 0 ], { %r21 }; | ||||
| 	// end inline asm | ||||
| 	// begin inline asm | ||||
| 	@%p3 st.global.b32 [ %rd12 + 0 ], { %r22 }; | ||||
| 	// end inline asm | ||||
| 	// begin inline asm | ||||
| 	@%p3 st.global.b32 [ %rd13 + 0 ], { %r23 }; | ||||
| 	// end inline asm | ||||
| 	// begin inline asm | ||||
| 	@%p3 st.global.b32 [ %rd14 + 0 ], { %r24 }; | ||||
| 	// end inline asm | ||||
| 	// begin inline asm | ||||
| 	@%p3 st.global.b32 [ %rd15 + 0 ], { %r25 }; | ||||
| 	// end inline asm | ||||
| $L__BB0_1:                              // %common.ret | ||||
| 	.loc	1 0 0                           // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:0 | ||||
| 	ret; | ||||
| $L__tmp3: | ||||
| $L__func_end0: | ||||
|                                         // -- End function | ||||
| } | ||||
| 	.file	1 "/tmp/torchinductor_shangdiy/q4/cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py" | ||||
| 	.file	2 "/home/shangdiy/miniconda3/envs/pytorch-3.10/lib/python3.10/site-packages/triton/language/standard.py" | ||||
| 	.section	.debug_abbrev | ||||
| 	{ | ||||
| .b8 1                                   // Abbreviation Code | ||||
| .b8 17                                  // DW_TAG_compile_unit | ||||
| .b8 1                                   // DW_CHILDREN_yes | ||||
| .b8 37                                  // DW_AT_producer | ||||
| .b8 8                                   // DW_FORM_string | ||||
| .b8 19                                  // DW_AT_language | ||||
| .b8 5                                   // DW_FORM_data2 | ||||
| .b8 3                                   // DW_AT_name | ||||
| .b8 8                                   // DW_FORM_string | ||||
| .b8 16                                  // DW_AT_stmt_list | ||||
| .b8 6                                   // DW_FORM_data4 | ||||
| .b8 27                                  // DW_AT_comp_dir | ||||
| .b8 8                                   // DW_FORM_string | ||||
| .b8 0                                   // EOM(1) | ||||
| .b8 0                                   // EOM(2) | ||||
| .b8 2                                   // Abbreviation Code | ||||
| .b8 46                                  // DW_TAG_subprogram | ||||
| .b8 0                                   // DW_CHILDREN_no | ||||
| .b8 3                                   // DW_AT_name | ||||
| .b8 8                                   // DW_FORM_string | ||||
| .b8 32                                  // DW_AT_inline | ||||
| .b8 11                                  // DW_FORM_data1 | ||||
| .b8 0                                   // EOM(1) | ||||
| .b8 0                                   // EOM(2) | ||||
| .b8 3                                   // Abbreviation Code | ||||
| .b8 46                                  // DW_TAG_subprogram | ||||
| .b8 1                                   // DW_CHILDREN_yes | ||||
| .b8 17                                  // DW_AT_low_pc | ||||
| .b8 1                                   // DW_FORM_addr | ||||
| .b8 18                                  // DW_AT_high_pc | ||||
| .b8 1                                   // DW_FORM_addr | ||||
| .b8 49                                  // DW_AT_abstract_origin | ||||
| .b8 19                                  // DW_FORM_ref4 | ||||
| .b8 0                                   // EOM(1) | ||||
| .b8 0                                   // EOM(2) | ||||
| .b8 4                                   // Abbreviation Code | ||||
| .b8 29                                  // DW_TAG_inlined_subroutine | ||||
| .b8 0                                   // DW_CHILDREN_no | ||||
| .b8 49                                  // DW_AT_abstract_origin | ||||
| .b8 19                                  // DW_FORM_ref4 | ||||
| .b8 17                                  // DW_AT_low_pc | ||||
| .b8 1                                   // DW_FORM_addr | ||||
| .b8 18                                  // DW_AT_high_pc | ||||
| .b8 1                                   // DW_FORM_addr | ||||
| .b8 88                                  // DW_AT_call_file | ||||
| .b8 11                                  // DW_FORM_data1 | ||||
| .b8 89                                  // DW_AT_call_line | ||||
| .b8 11                                  // DW_FORM_data1 | ||||
| .b8 87                                  // DW_AT_call_column | ||||
| .b8 11                                  // DW_FORM_data1 | ||||
| .b8 0                                   // EOM(1) | ||||
| .b8 0                                   // EOM(2) | ||||
| .b8 0                                   // EOM(3) | ||||
| 	} | ||||
| 	.section	.debug_info | ||||
| 	{ | ||||
| .b32 203                                // Length of Unit | ||||
| .b8 2                                   // DWARF version number | ||||
| .b8 0 | ||||
| .b32 .debug_abbrev                      // Offset Into Abbrev. Section | ||||
| .b8 8                                   // Address Size (in bytes) | ||||
| .b8 1                                   // Abbrev [1] 0xb:0xc4 DW_TAG_compile_unit | ||||
| .b8 116                                 // DW_AT_producer | ||||
| .b8 114 | ||||
| .b8 105 | ||||
| .b8 116 | ||||
| .b8 111 | ||||
| .b8 110 | ||||
| .b8 0 | ||||
| .b8 2                                   // DW_AT_language | ||||
| .b8 0 | ||||
| .b8 99                                  // DW_AT_name | ||||
| .b8 113 | ||||
| .b8 52 | ||||
| .b8 54 | ||||
| .b8 118 | ||||
| .b8 111 | ||||
| .b8 108 | ||||
| .b8 114 | ||||
| .b8 122 | ||||
| .b8 109 | ||||
| .b8 97 | ||||
| .b8 54 | ||||
| .b8 55 | ||||
| .b8 105 | ||||
| .b8 110 | ||||
| .b8 100 | ||||
| .b8 116 | ||||
| .b8 119 | ||||
| .b8 104 | ||||
| .b8 106 | ||||
| .b8 53 | ||||
| .b8 97 | ||||
| .b8 52 | ||||
| .b8 110 | ||||
| .b8 54 | ||||
| .b8 122 | ||||
| .b8 114 | ||||
| .b8 51 | ||||
| .b8 54 | ||||
| .b8 55 | ||||
| .b8 103 | ||||
| .b8 113 | ||||
| .b8 112 | ||||
| .b8 107 | ||||
| .b8 99 | ||||
| .b8 119 | ||||
| .b8 111 | ||||
| .b8 50 | ||||
| .b8 104 | ||||
| .b8 108 | ||||
| .b8 112 | ||||
| .b8 104 | ||||
| .b8 121 | ||||
| .b8 110 | ||||
| .b8 104 | ||||
| .b8 106 | ||||
| .b8 116 | ||||
| .b8 104 | ||||
| .b8 110 | ||||
| .b8 51 | ||||
| .b8 117 | ||||
| .b8 107 | ||||
| .b8 46 | ||||
| .b8 112 | ||||
| .b8 121 | ||||
| .b8 0 | ||||
| .b32 .debug_line                        // DW_AT_stmt_list | ||||
| .b8 47                                  // DW_AT_comp_dir | ||||
| .b8 116 | ||||
| .b8 109 | ||||
| .b8 112 | ||||
| .b8 47 | ||||
| .b8 116 | ||||
| .b8 111 | ||||
| .b8 114 | ||||
| .b8 99 | ||||
| .b8 104 | ||||
| .b8 105 | ||||
| .b8 110 | ||||
| .b8 100 | ||||
| .b8 117 | ||||
| .b8 99 | ||||
| .b8 116 | ||||
| .b8 111 | ||||
| .b8 114 | ||||
| .b8 95 | ||||
| .b8 115 | ||||
| .b8 104 | ||||
| .b8 97 | ||||
| .b8 110 | ||||
| .b8 103 | ||||
| .b8 100 | ||||
| .b8 105 | ||||
| .b8 121 | ||||
| .b8 47 | ||||
| .b8 113 | ||||
| .b8 52 | ||||
| .b8 0 | ||||
| .b8 2                                   // Abbrev [2] 0x70:0x30 DW_TAG_subprogram | ||||
| .b8 109                                 // DW_AT_name | ||||
| .b8 111 | ||||
| .b8 100 | ||||
| .b8 101 | ||||
| .b8 108 | ||||
| .b8 95 | ||||
| .b8 116 | ||||
| .b8 114 | ||||
| .b8 105 | ||||
| .b8 116 | ||||
| .b8 111 | ||||
| .b8 110 | ||||
| .b8 95 | ||||
| .b8 116 | ||||
| .b8 101 | ||||
| .b8 109 | ||||
| .b8 95 | ||||
| .b8 102 | ||||
| .b8 117 | ||||
| .b8 115 | ||||
| .b8 101 | ||||
| .b8 100 | ||||
| .b8 95 | ||||
| .b8 97 | ||||
| .b8 100 | ||||
| .b8 100 | ||||
| .b8 109 | ||||
| .b8 109 | ||||
| .b8 95 | ||||
| .b8 114 | ||||
| .b8 101 | ||||
| .b8 108 | ||||
| .b8 117 | ||||
| .b8 95 | ||||
| .b8 115 | ||||
| .b8 105 | ||||
| .b8 103 | ||||
| .b8 109 | ||||
| .b8 111 | ||||
| .b8 105 | ||||
| .b8 100 | ||||
| .b8 95 | ||||
| .b8 116 | ||||
| .b8 95 | ||||
| .b8 49 | ||||
| .b8 0 | ||||
| .b8 1                                   // DW_AT_inline | ||||
| .b8 3                                   // Abbrev [3] 0xa0:0x2e DW_TAG_subprogram | ||||
| .b64 $L__func_begin0                    // DW_AT_low_pc | ||||
| .b64 $L__func_end0                      // DW_AT_high_pc | ||||
| .b32 112                                // DW_AT_abstract_origin | ||||
| .b8 4                                   // Abbrev [4] 0xb5:0x18 DW_TAG_inlined_subroutine | ||||
| .b32 112                                // DW_AT_abstract_origin | ||||
| .b64 $L__tmp1                           // DW_AT_low_pc | ||||
| .b64 $L__tmp2                           // DW_AT_high_pc | ||||
| .b8 1                                   // DW_AT_call_file | ||||
| .b8 99                                  // DW_AT_call_line | ||||
| .b8 22                                  // DW_AT_call_column | ||||
| .b8 0                                   // End Of Children Mark | ||||
| .b8 0                                   // End Of Children Mark | ||||
| 	} | ||||
| 	.section	.debug_macinfo	{	} | ||||
										
											Binary file not shown.
										
									
								
							| @ -0,0 +1,727 @@ | ||||
| // | ||||
| // Generated by LLVM NVPTX Back-End | ||||
| // | ||||
|  | ||||
| .version 8.7 | ||||
| .target sm_89 | ||||
| .address_size 64 | ||||
|  | ||||
| 	// .globl	model_triton_tem_fused_addmm_relu_t_0 // -- Begin function model_triton_tem_fused_addmm_relu_t_0 | ||||
| .extern .shared .align 16 .b8 global_smem[]; | ||||
|                                         // @model_triton_tem_fused_addmm_relu_t_0 | ||||
| .visible .entry model_triton_tem_fused_addmm_relu_t_0( | ||||
| 	.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_t_0_param_0, | ||||
| 	.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_t_0_param_1, | ||||
| 	.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_t_0_param_2, | ||||
| 	.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_t_0_param_3, | ||||
| 	.param .u32 model_triton_tem_fused_addmm_relu_t_0_param_4, | ||||
| 	.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_t_0_param_5 | ||||
| ) | ||||
| .reqntid 32 | ||||
| { | ||||
| 	.reg .pred 	%p<27>; | ||||
| 	.reg .b32 	%r<398>; | ||||
| 	.reg .b64 	%rd<29>; | ||||
| 	.loc	1 18 0                          // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:18:0 | ||||
| $L__func_begin0: | ||||
| 	.loc	1 18 0                          // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:18:0 | ||||
|  | ||||
| // %bb.0: | ||||
| 	ld.param.b32 	%r1, [model_triton_tem_fused_addmm_relu_t_0_param_4]; | ||||
| $L__tmp0: | ||||
| 	.loc	1 34 16                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:34:16 | ||||
| 	and.b32 	%r2, %r1, 268435455; | ||||
| 	setp.ne.s32 	%p1, %r2, 0; | ||||
| 	@%p1 bra 	$L__BB0_2; | ||||
| 	bra.uni 	$L__BB0_1; | ||||
| $L__BB0_2: | ||||
| 	.loc	1 0 16                          // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:0:16 | ||||
| 	ld.param.b64 	%rd4, [model_triton_tem_fused_addmm_relu_t_0_param_3]; | ||||
| 	ld.param.b64 	%rd3, [model_triton_tem_fused_addmm_relu_t_0_param_2]; | ||||
| 	ld.param.b64 	%rd2, [model_triton_tem_fused_addmm_relu_t_0_param_1]; | ||||
| 	ld.param.b64 	%rd1, [model_triton_tem_fused_addmm_relu_t_0_param_0]; | ||||
| 	.loc	1 43 24                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:43:24 | ||||
| 	mov.u32 	%r51, %ctaid.x; | ||||
| 	.loc	1 44 28                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:44:28 | ||||
| 	add.s32 	%r52, %r1, 15; | ||||
| 	.loc	1 44 34                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:44:34 | ||||
| 	shr.s32 	%r53, %r52, 31; | ||||
| 	shr.u32 	%r54, %r53, 28; | ||||
| 	add.s32 	%r55, %r52, %r54; | ||||
| 	shr.s32 	%r56, %r55, 4; | ||||
| 	.loc	1 50 41                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:50:41 | ||||
| 	and.b32 	%r57, %r51, 2147483640; | ||||
| 	.loc	1 50 30                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:50:30 | ||||
| 	sub.s32 	%r58, %r56, %r57; | ||||
| 	.loc	1 50 50                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:50:50 | ||||
| 	min.s32 	%r59, %r58, 8; | ||||
| 	.loc	1 51 40                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:51:40 | ||||
| 	rem.s32 	%r60, %r51, %r59; | ||||
| 	.loc	1 51 34                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:51:34 | ||||
| 	add.s32 	%r61, %r60, %r57; | ||||
| 	.loc	1 52 19                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:52:19 | ||||
| 	and.b32 	%r62, %r51, 7; | ||||
| 	.loc	1 52 30                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:52:30 | ||||
| 	div.s32 	%r63, %r62, %r59; | ||||
| 	.loc	1 56 17                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:56:17 | ||||
| 	shl.b32 	%r64, %r61, 4; | ||||
| 	.loc	1 56 40                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:56:40 | ||||
| 	mov.u32 	%r65, %tid.x; | ||||
| 	and.b32 	%r66, %r65, 4; | ||||
| 	bfe.u32 	%r67, %r65, 2, 1; | ||||
| 	shr.u32 	%r68, %r65, 2; | ||||
| 	and.b32 	%r69, %r68, 6; | ||||
| 	or.b32 	%r70, %r69, %r67; | ||||
| 	bfe.u32 	%r71, %r65, 3, 2; | ||||
| 	and.b32 	%r72, %r65, 1; | ||||
| 	shl.b32 	%r73, %r72, 2; | ||||
| 	shl.b32 	%r74, %r65, 2; | ||||
| 	and.b32 	%r75, %r74, 12; | ||||
| 	or.b32 	%r76, %r75, 2; | ||||
| 	.loc	1 56 27                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:56:27 | ||||
| 	or.b32 	%r77, %r64, %r70; | ||||
| 	or.b32 	%r78, %r77, 8; | ||||
| 	.loc	1 57 17                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:57:17 | ||||
| 	shl.b32 	%r79, %r63, 4; | ||||
| 	.loc	1 57 27                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:57:27 | ||||
| 	or.b32 	%r80, %r79, %r75; | ||||
| 	.loc	1 0 0                           // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:0 | ||||
| 	rem.s32 	%r81, %r78, %r1; | ||||
| 	rem.s32 	%r82, %r77, %r1; | ||||
| 	.loc	1 71 36                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:71:36 | ||||
| 	setp.lt.u32 	%p2, %r75, 10; | ||||
| 	setp.lt.u32 	%p3, %r76, 10; | ||||
| 	.loc	1 72 24                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:72:24 | ||||
| 	shl.b32 	%r83, %r65, 1; | ||||
| 	and.b32 	%r84, %r83, 6; | ||||
| 	shl.b32 	%r85, %r66, 1; | ||||
| 	or.b32 	%r86, %r84, %r85; | ||||
| 	.loc	1 72 36                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:72:36 | ||||
| 	setp.lt.u32 	%p6, %r86, 10; | ||||
| 	.loc	1 79 28                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:79:28 | ||||
| 	mul.lo.s32 	%r87, %r82, 10; | ||||
| 	mul.lo.s32 	%r88, %r81, 10; | ||||
| 	.loc	1 79 25                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:79:25 | ||||
| 	add.s32 	%r89, %r87, %r75; | ||||
| 	add.s32 	%r90, %r87, %r76; | ||||
| 	add.s32 	%r91, %r88, %r75; | ||||
| 	add.s32 	%r92, %r88, %r76; | ||||
| 	.loc	1 80 25                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:80:25 | ||||
| 	mul.wide.s32 	%rd21, %r89, 4; | ||||
| 	add.s64 	%rd5, %rd1, %rd21; | ||||
| 	mul.wide.s32 	%rd22, %r90, 4; | ||||
| 	add.s64 	%rd6, %rd1, %rd22; | ||||
| 	mul.wide.s32 	%rd23, %r91, 4; | ||||
| 	add.s64 	%rd7, %rd1, %rd23; | ||||
| 	mul.wide.s32 	%rd24, %r92, 4; | ||||
| 	add.s64 	%rd8, %rd1, %rd24; | ||||
| 	mov.b32 	%r5, 0; | ||||
| 	.loc	1 80 20                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:80:20 | ||||
| 	// begin inline asm | ||||
| 	mov.u32 %r3, %r5; | ||||
| 	mov.u32 %r4, %r5; | ||||
| 	@%p2 ld.global.v2.b32 { %r3, %r4 }, [ %rd5 + 0 ]; | ||||
| 	// end inline asm | ||||
| 	// begin inline asm | ||||
| 	mov.u32 %r7, %r5; | ||||
| 	mov.u32 %r8, %r5; | ||||
| 	@%p3 ld.global.v2.b32 { %r7, %r8 }, [ %rd6 + 0 ]; | ||||
| 	// end inline asm | ||||
| 	// begin inline asm | ||||
| 	mov.u32 %r11, %r5; | ||||
| 	mov.u32 %r12, %r5; | ||||
| 	@%p2 ld.global.v2.b32 { %r11, %r12 }, [ %rd7 + 0 ]; | ||||
| 	// end inline asm | ||||
| 	// begin inline asm | ||||
| 	mov.u32 %r15, %r5; | ||||
| 	mov.u32 %r16, %r5; | ||||
| 	@%p3 ld.global.v2.b32 { %r15, %r16 }, [ %rd8 + 0 ]; | ||||
| 	// end inline asm | ||||
| 	shl.b32 	%r93, %r75, 2; | ||||
| 	mov.b32 	%r94, global_smem; | ||||
| 	add.s32 	%r95, %r94, %r93; | ||||
| 	shl.b32 	%r96, %r70, 6; | ||||
| 	add.s32 	%r97, %r95, %r96; | ||||
| 	st.shared.v4.b32 	[%r97], {%r3, %r4, %r7, %r8}; | ||||
| 	st.shared.v4.b32 	[%r97+512], {%r11, %r12, %r15, %r16}; | ||||
| 	.loc	1 85 50                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:50 | ||||
| 	mad.lo.s32 	%r98, %r71, 10, %r86; | ||||
| 	.loc	1 85 25                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:25 | ||||
| 	mul.wide.u32 	%rd25, %r98, 4; | ||||
| 	add.s64 	%rd9, %rd2, %rd25; | ||||
| 	add.s64 	%rd10, %rd9, 160; | ||||
| 	add.s64 	%rd11, %rd9, 320; | ||||
| 	add.s64 	%rd12, %rd9, 480; | ||||
| 	.loc	1 85 20                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:20 | ||||
| 	// begin inline asm | ||||
| 	mov.u32 %r19, %r5; | ||||
| 	mov.u32 %r20, %r5; | ||||
| 	@%p6 ld.global.v2.b32 { %r19, %r20 }, [ %rd9 + 0 ]; | ||||
| 	// end inline asm | ||||
| 	// begin inline asm | ||||
| 	mov.u32 %r23, %r5; | ||||
| 	mov.u32 %r24, %r5; | ||||
| 	@%p6 ld.global.v2.b32 { %r23, %r24 }, [ %rd10 + 0 ]; | ||||
| 	// end inline asm | ||||
| 	// begin inline asm | ||||
| 	mov.u32 %r27, %r5; | ||||
| 	mov.u32 %r28, %r5; | ||||
| 	@%p6 ld.global.v2.b32 { %r27, %r28 }, [ %rd11 + 0 ]; | ||||
| 	// end inline asm | ||||
| 	// begin inline asm | ||||
| 	mov.u32 %r31, %r5; | ||||
| 	mov.u32 %r32, %r5; | ||||
| 	@%p6 ld.global.v2.b32 { %r31, %r32 }, [ %rd12 + 0 ]; | ||||
| 	// end inline asm | ||||
| 	shl.b32 	%r99, %r86, 2; | ||||
| 	add.s32 	%r100, %r94, 1024; | ||||
| 	add.s32 	%r101, %r100, %r99; | ||||
| 	shl.b32 	%r102, %r71, 6; | ||||
| 	add.s32 	%r103, %r101, %r102; | ||||
| 	st.shared.v2.b32 	[%r103], {%r19, %r20}; | ||||
| 	st.shared.v2.b32 	[%r103+256], {%r23, %r24}; | ||||
| 	st.shared.v2.b32 	[%r103+512], {%r27, %r28}; | ||||
| 	st.shared.v2.b32 	[%r103+768], {%r31, %r32}; | ||||
| 	.loc	1 96 20                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:96:20 | ||||
| 	setp.lt.s32 	%p14, %r77, %r1; | ||||
| 	setp.lt.s32 	%p15, %r78, %r1; | ||||
| 	.loc	1 96 34                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:96:34 | ||||
| 	setp.eq.s32 	%p16, %r63, 0; | ||||
| 	.loc	1 96 26                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:96:26 | ||||
| 	and.pred 	%p10, %p16, %p14; | ||||
| 	and.pred 	%p11, %p15, %p16; | ||||
| 	.loc	1 100 30                        // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:100:30 | ||||
| 	mul.wide.u32 	%rd26, %r80, 4; | ||||
| 	add.s64 	%rd14, %rd3, %rd26; | ||||
| 	.loc	1 100 66                        // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:100:66 | ||||
| 	// begin inline asm | ||||
| 	mov.u64 %rd15, 0x0; | ||||
| 	createpolicy.fractional.L2::evict_last.b64 %rd15, 1.0; | ||||
| 	// end inline asm | ||||
| 	// begin inline asm | ||||
| 	mov.u32 %r35, 0x0; | ||||
| 	mov.u32 %r36, 0x0; | ||||
| 	mov.u32 %r37, 0x0; | ||||
| 	mov.u32 %r38, 0x0; | ||||
| 	@%p10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r35, %r36, %r37, %r38 }, [ %rd14 + 0 ], %rd15; | ||||
| 	// end inline asm | ||||
| 	// begin inline asm | ||||
| 	mov.u64 %rd18, 0x0; | ||||
| 	createpolicy.fractional.L2::evict_last.b64 %rd18, 1.0; | ||||
| 	// end inline asm | ||||
| 	// begin inline asm | ||||
| 	mov.u32 %r39, 0x0; | ||||
| 	mov.u32 %r40, 0x0; | ||||
| 	mov.u32 %r41, 0x0; | ||||
| 	mov.u32 %r42, 0x0; | ||||
| 	@%p11 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r39, %r40, %r41, %r42 }, [ %rd14 + 0 ], %rd18; | ||||
| 	// end inline asm | ||||
| 	.loc	1 80 20                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:80:20 | ||||
| 	bar.sync 	0; | ||||
| 	shl.b32 	%r104, %r69, 6; | ||||
| 	add.s32 	%r105, %r94, %r104; | ||||
| 	ld.shared.v4.b32 	{%r106, %r107, %r108, %r109}, [%r105+48]; | ||||
| 	ld.shared.v4.b32 	{%r110, %r111, %r112, %r113}, [%r105+112]; | ||||
| 	ld.shared.v4.b32 	{%r114, %r115, %r116, %r117}, [%r105+32]; | ||||
| 	ld.shared.v4.b32 	{%r118, %r119, %r120, %r121}, [%r105+96]; | ||||
| 	ld.shared.v4.b32 	{%r122, %r123, %r124, %r125}, [%r105+16]; | ||||
| 	ld.shared.v4.b32 	{%r126, %r127, %r128, %r129}, [%r105+80]; | ||||
| 	ld.shared.v4.b32 	{%r130, %r131, %r132, %r133}, [%r105]; | ||||
| 	ld.shared.v4.b32 	{%r134, %r135, %r136, %r137}, [%r105+64]; | ||||
| 	ld.shared.v4.b32 	{%r138, %r139, %r140, %r141}, [%r105+560]; | ||||
| 	ld.shared.v4.b32 	{%r142, %r143, %r144, %r145}, [%r105+624]; | ||||
| 	ld.shared.v4.b32 	{%r146, %r147, %r148, %r149}, [%r105+544]; | ||||
| 	ld.shared.v4.b32 	{%r150, %r151, %r152, %r153}, [%r105+608]; | ||||
| 	ld.shared.v4.b32 	{%r154, %r155, %r156, %r157}, [%r105+528]; | ||||
| 	ld.shared.v4.b32 	{%r158, %r159, %r160, %r161}, [%r105+592]; | ||||
| 	ld.shared.v4.b32 	{%r162, %r163, %r164, %r165}, [%r105+512]; | ||||
| 	ld.shared.v4.b32 	{%r166, %r167, %r168, %r169}, [%r105+576]; | ||||
| 	.loc	1 85 20                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:20 | ||||
| 	mad.lo.s32 	%r170, %r86, 60, %r101; | ||||
| 	or.b32 	%r171, %r84, 1; | ||||
| 	or.b32 	%r172, %r171, %r85; | ||||
| 	shl.b32 	%r173, %r172, 6; | ||||
| 	add.s32 	%r174, %r100, %r173; | ||||
| 	ld.shared.b32 	%r175, [%r170+16]; | ||||
| 	ld.shared.b32 	%r176, [%r174+16]; | ||||
| 	ld.shared.b32 	%r177, [%r174+52]; | ||||
| 	ld.shared.b32 	%r178, [%r170+60]; | ||||
| 	.loc	1 100 66                        // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:100:66 | ||||
| 	shr.u32 	%r179, %r66, 1; | ||||
| 	setp.eq.s32 	%p17, %r66, 0; | ||||
| 	bfe.u32 	%r180, %r65, 1, 1; | ||||
| 	or.b32 	%r181, %r73, %r180; | ||||
| 	or.b32 	%r182, %r181, %r179; | ||||
| 	and.b32 	%r183, %r65, 24; | ||||
| 	or.b32 	%r184, %r182, %r183; | ||||
| 	selp.b32 	%r185, %r35, %r37, %p17; | ||||
| 	shfl.sync.idx.b32 	%r186, %r185, %r184, 31, -1; | ||||
| 	selp.b32 	%r187, %r36, %r38, %p17; | ||||
| 	shfl.sync.idx.b32 	%r188, %r187, %r184, 31, -1; | ||||
| 	selp.b32 	%r189, %r37, %r35, %p17; | ||||
| 	xor.b32 	%r190, %r184, 4; | ||||
| 	shfl.sync.idx.b32 	%r191, %r189, %r190, 31, -1; | ||||
| 	selp.b32 	%r192, %r38, %r36, %p17; | ||||
| 	shfl.sync.idx.b32 	%r193, %r192, %r190, 31, -1; | ||||
| 	selp.b32 	%r194, %r39, %r41, %p17; | ||||
| 	shfl.sync.idx.b32 	%r195, %r194, %r184, 31, -1; | ||||
| 	selp.b32 	%r196, %r40, %r42, %p17; | ||||
| 	shfl.sync.idx.b32 	%r197, %r196, %r184, 31, -1; | ||||
| 	selp.b32 	%r198, %r41, %r39, %p17; | ||||
| 	shfl.sync.idx.b32 	%r199, %r198, %r190, 31, -1; | ||||
| 	selp.b32 	%r200, %r42, %r40, %p17; | ||||
| 	shfl.sync.idx.b32 	%r201, %r200, %r190, 31, -1; | ||||
| 	setp.eq.s32 	%p18, %r72, 0; | ||||
| 	.loc	1 85 20                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:20 | ||||
| 	ld.shared.v4.b32 	{%r202, %r203, %r204, %r205}, [%r170]; | ||||
| 	ld.shared.v4.b32 	{%r206, %r207, %r208, %r209}, [%r174]; | ||||
| 	.loc	1 100 66                        // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:100:66 | ||||
| 	selp.f32 	%r210, %r186, %r191, %p18; | ||||
| 	selp.f32 	%r211, %r188, %r193, %p18; | ||||
| 	selp.f32 	%r212, %r191, %r186, %p18; | ||||
| 	selp.f32 	%r213, %r193, %r188, %p18; | ||||
| 	selp.f32 	%r214, %r195, %r199, %p18; | ||||
| 	selp.f32 	%r215, %r197, %r201, %p18; | ||||
| 	selp.f32 	%r216, %r199, %r195, %p18; | ||||
| 	selp.f32 	%r217, %r201, %r197, %p18; | ||||
| 	.loc	1 88 25                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:88:25 | ||||
| 	fma.rn.f32 	%r218, %r166, %r206, %r217; | ||||
| 	fma.rn.f32 	%r219, %r166, %r202, %r216; | ||||
| 	fma.rn.f32 	%r220, %r162, %r206, %r215; | ||||
| 	fma.rn.f32 	%r221, %r162, %r202, %r214; | ||||
| 	fma.rn.f32 	%r222, %r134, %r206, %r213; | ||||
| 	fma.rn.f32 	%r223, %r134, %r202, %r212; | ||||
| 	fma.rn.f32 	%r224, %r130, %r206, %r211; | ||||
| 	fma.rn.f32 	%r225, %r130, %r202, %r210; | ||||
| 	fma.rn.f32 	%r226, %r131, %r203, %r225; | ||||
| 	fma.rn.f32 	%r227, %r131, %r207, %r224; | ||||
| 	fma.rn.f32 	%r228, %r135, %r203, %r223; | ||||
| 	fma.rn.f32 	%r229, %r135, %r207, %r222; | ||||
| 	fma.rn.f32 	%r230, %r163, %r203, %r221; | ||||
| 	fma.rn.f32 	%r231, %r163, %r207, %r220; | ||||
| 	fma.rn.f32 	%r232, %r167, %r203, %r219; | ||||
| 	fma.rn.f32 	%r233, %r167, %r207, %r218; | ||||
| 	fma.rn.f32 	%r234, %r168, %r208, %r233; | ||||
| 	fma.rn.f32 	%r235, %r168, %r204, %r232; | ||||
| 	fma.rn.f32 	%r236, %r164, %r208, %r231; | ||||
| 	fma.rn.f32 	%r237, %r164, %r204, %r230; | ||||
| 	fma.rn.f32 	%r238, %r136, %r208, %r229; | ||||
| 	fma.rn.f32 	%r239, %r136, %r204, %r228; | ||||
| 	fma.rn.f32 	%r240, %r132, %r208, %r227; | ||||
| 	fma.rn.f32 	%r241, %r132, %r204, %r226; | ||||
| 	fma.rn.f32 	%r242, %r133, %r205, %r241; | ||||
| 	fma.rn.f32 	%r243, %r133, %r209, %r240; | ||||
| 	fma.rn.f32 	%r244, %r137, %r205, %r239; | ||||
| 	fma.rn.f32 	%r245, %r137, %r209, %r238; | ||||
| 	fma.rn.f32 	%r246, %r165, %r205, %r237; | ||||
| 	fma.rn.f32 	%r247, %r165, %r209, %r236; | ||||
| 	fma.rn.f32 	%r248, %r169, %r205, %r235; | ||||
| 	fma.rn.f32 	%r249, %r169, %r209, %r234; | ||||
| 	fma.rn.f32 	%r250, %r158, %r176, %r249; | ||||
| 	fma.rn.f32 	%r251, %r158, %r175, %r248; | ||||
| 	fma.rn.f32 	%r252, %r154, %r176, %r247; | ||||
| 	fma.rn.f32 	%r253, %r154, %r175, %r246; | ||||
| 	fma.rn.f32 	%r254, %r126, %r176, %r245; | ||||
| 	fma.rn.f32 	%r255, %r126, %r175, %r244; | ||||
| 	fma.rn.f32 	%r256, %r122, %r176, %r243; | ||||
| 	fma.rn.f32 	%r257, %r122, %r175, %r242; | ||||
| 	.loc	1 85 20                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:20 | ||||
| 	ld.shared.b32 	%r258, [%r170+24]; | ||||
| 	ld.shared.b32 	%r259, [%r170+20]; | ||||
| 	ld.shared.b32 	%r260, [%r174+24]; | ||||
| 	ld.shared.b32 	%r261, [%r174+20]; | ||||
| 	ld.shared.b32 	%r262, [%r170+32]; | ||||
| 	ld.shared.b32 	%r263, [%r170+28]; | ||||
| 	ld.shared.b32 	%r264, [%r174+32]; | ||||
| 	ld.shared.b32 	%r265, [%r174+28]; | ||||
| 	ld.shared.b32 	%r266, [%r170+40]; | ||||
| 	ld.shared.b32 	%r267, [%r170+36]; | ||||
| 	ld.shared.b32 	%r268, [%r174+40]; | ||||
| 	ld.shared.b32 	%r269, [%r174+36]; | ||||
| 	ld.shared.b32 	%r270, [%r170+48]; | ||||
| 	ld.shared.b32 	%r271, [%r170+44]; | ||||
| 	ld.shared.b32 	%r272, [%r174+48]; | ||||
| 	ld.shared.b32 	%r273, [%r174+44]; | ||||
| 	ld.shared.b32 	%r274, [%r170+56]; | ||||
| 	ld.shared.b32 	%r275, [%r170+52]; | ||||
| 	ld.shared.v2.b32 	{%r276, %r277}, [%r174+56]; | ||||
| 	.loc	1 88 25                         // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:88:25 | ||||
| 	fma.rn.f32 	%r278, %r123, %r259, %r257; | ||||
| 	fma.rn.f32 	%r279, %r123, %r261, %r256; | ||||
| 	fma.rn.f32 	%r280, %r127, %r259, %r255; | ||||
| 	fma.rn.f32 	%r281, %r127, %r261, %r254; | ||||
| 	fma.rn.f32 	%r282, %r155, %r259, %r253; | ||||
| 	fma.rn.f32 	%r283, %r155, %r261, %r252; | ||||
| 	fma.rn.f32 	%r284, %r159, %r259, %r251; | ||||
| 	fma.rn.f32 	%r285, %r159, %r261, %r250; | ||||
| 	fma.rn.f32 	%r286, %r160, %r260, %r285; | ||||
| 	fma.rn.f32 	%r287, %r160, %r258, %r284; | ||||
| 	fma.rn.f32 	%r288, %r156, %r260, %r283; | ||||
| 	fma.rn.f32 	%r289, %r156, %r258, %r282; | ||||
| 	fma.rn.f32 	%r290, %r128, %r260, %r281; | ||||
| 	fma.rn.f32 	%r291, %r128, %r258, %r280; | ||||
| 	fma.rn.f32 	%r292, %r124, %r260, %r279; | ||||
| 	fma.rn.f32 	%r293, %r124, %r258, %r278; | ||||
| 	fma.rn.f32 	%r294, %r125, %r263, %r293; | ||||
| 	fma.rn.f32 	%r295, %r125, %r265, %r292; | ||||
| 	fma.rn.f32 	%r296, %r129, %r263, %r291; | ||||
| 	fma.rn.f32 	%r297, %r129, %r265, %r290; | ||||
| 	fma.rn.f32 	%r298, %r157, %r263, %r289; | ||||
| 	fma.rn.f32 	%r299, %r157, %r265, %r288; | ||||
| 	fma.rn.f32 	%r300, %r161, %r263, %r287; | ||||
| 	fma.rn.f32 	%r301, %r161, %r265, %r286; | ||||
| 	fma.rn.f32 	%r302, %r150, %r264, %r301; | ||||
| 	fma.rn.f32 	%r303, %r150, %r262, %r300; | ||||
| 	fma.rn.f32 	%r304, %r146, %r264, %r299; | ||||
| 	fma.rn.f32 	%r305, %r146, %r262, %r298; | ||||
| 	fma.rn.f32 	%r306, %r118, %r264, %r297; | ||||
| 	fma.rn.f32 	%r307, %r118, %r262, %r296; | ||||
| 	fma.rn.f32 	%r308, %r114, %r264, %r295; | ||||
| 	fma.rn.f32 	%r309, %r114, %r262, %r294; | ||||
| 	fma.rn.f32 	%r310, %r115, %r267, %r309; | ||||
| 	fma.rn.f32 	%r311, %r115, %r269, %r308; | ||||
| 	fma.rn.f32 	%r312, %r119, %r267, %r307; | ||||
| 	fma.rn.f32 	%r313, %r119, %r269, %r306; | ||||
| 	fma.rn.f32 	%r314, %r147, %r267, %r305; | ||||
| 	fma.rn.f32 	%r315, %r147, %r269, %r304; | ||||
| 	fma.rn.f32 	%r316, %r151, %r267, %r303; | ||||
| 	fma.rn.f32 	%r317, %r151, %r269, %r302; | ||||
| 	fma.rn.f32 	%r318, %r152, %r268, %r317; | ||||
| 	fma.rn.f32 	%r319, %r152, %r266, %r316; | ||||
| 	fma.rn.f32 	%r320, %r148, %r268, %r315; | ||||
| 	fma.rn.f32 	%r321, %r148, %r266, %r314; | ||||
| 	fma.rn.f32 	%r322, %r120, %r268, %r313; | ||||
| 	fma.rn.f32 	%r323, %r120, %r266, %r312; | ||||
| 	fma.rn.f32 	%r324, %r116, %r268, %r311; | ||||
| 	fma.rn.f32 	%r325, %r116, %r266, %r310; | ||||
| 	fma.rn.f32 	%r326, %r117, %r271, %r325; | ||||
| 	fma.rn.f32 	%r327, %r117, %r273, %r324; | ||||
| 	fma.rn.f32 	%r328, %r121, %r271, %r323; | ||||
| 	fma.rn.f32 	%r329, %r121, %r273, %r322; | ||||
| 	fma.rn.f32 	%r330, %r149, %r271, %r321; | ||||
| 	fma.rn.f32 	%r331, %r149, %r273, %r320; | ||||
| 	fma.rn.f32 	%r332, %r153, %r271, %r319; | ||||
| 	fma.rn.f32 	%r333, %r153, %r273, %r318; | ||||
| 	fma.rn.f32 	%r334, %r142, %r272, %r333; | ||||
| 	fma.rn.f32 	%r335, %r142, %r270, %r332; | ||||
| 	fma.rn.f32 	%r336, %r138, %r272, %r331; | ||||
| 	fma.rn.f32 	%r337, %r138, %r270, %r330; | ||||
| 	fma.rn.f32 	%r338, %r110, %r272, %r329; | ||||
| 	fma.rn.f32 	%r339, %r110, %r270, %r328; | ||||
| 	fma.rn.f32 	%r340, %r106, %r272, %r327; | ||||
| 	fma.rn.f32 	%r341, %r106, %r270, %r326; | ||||
| 	fma.rn.f32 	%r342, %r107, %r275, %r341; | ||||
| 	fma.rn.f32 	%r343, %r107, %r177, %r340; | ||||
| 	fma.rn.f32 	%r344, %r111, %r275, %r339; | ||||
| 	fma.rn.f32 	%r345, %r111, %r177, %r338; | ||||
| 	fma.rn.f32 	%r346, %r139, %r275, %r337; | ||||
| 	fma.rn.f32 	%r347, %r139, %r177, %r336; | ||||
| 	fma.rn.f32 	%r348, %r143, %r275, %r335; | ||||
| 	fma.rn.f32 	%r349, %r143, %r177, %r334; | ||||
| 	fma.rn.f32 	%r350, %r144, %r276, %r349; | ||||
| 	fma.rn.f32 	%r351, %r144, %r274, %r348; | ||||
| 	fma.rn.f32 	%r352, %r140, %r276, %r347; | ||||
| 	fma.rn.f32 	%r353, %r140, %r274, %r346; | ||||
| 	fma.rn.f32 	%r354, %r112, %r276, %r345; | ||||
| 	fma.rn.f32 	%r355, %r112, %r274, %r344; | ||||
| 	fma.rn.f32 	%r356, %r108, %r276, %r343; | ||||
| 	fma.rn.f32 	%r357, %r108, %r274, %r342; | ||||
| 	fma.rn.f32 	%r358, %r109, %r178, %r357; | ||||
| 	fma.rn.f32 	%r359, %r109, %r277, %r356; | ||||
| 	fma.rn.f32 	%r360, %r113, %r178, %r355; | ||||
| 	fma.rn.f32 	%r361, %r113, %r277, %r354; | ||||
| 	fma.rn.f32 	%r362, %r141, %r178, %r353; | ||||
| 	fma.rn.f32 	%r363, %r141, %r277, %r352; | ||||
| 	fma.rn.f32 	%r364, %r145, %r178, %r351; | ||||
| 	fma.rn.f32 	%r365, %r145, %r277, %r350; | ||||
| $L__tmp1: | ||||
| 	.loc	2 110 15                        // triton_helpers.py:110:15 @[ cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:103:40 ] | ||||
| 	setp.lt.f32 	%p19, %r365, 0f00000000; | ||||
| 	setp.lt.f32 	%p20, %r364, 0f00000000; | ||||
| 	setp.lt.f32 	%p21, %r363, 0f00000000; | ||||
| 	setp.lt.f32 	%p22, %r362, 0f00000000; | ||||
| 	setp.lt.f32 	%p23, %r361, 0f00000000; | ||||
| 	setp.lt.f32 	%p24, %r360, 0f00000000; | ||||
| 	setp.lt.f32 	%p25, %r359, 0f00000000; | ||||
| 	setp.lt.f32 	%p26, %r358, 0f00000000; | ||||
| 	.loc	2 113 29                        // triton_helpers.py:113:29 @[ cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:103:40 ] | ||||
| 	selp.f32 	%r366, 0f00000000, %r358, %p26; | ||||
| 	selp.f32 	%r367, 0f00000000, %r359, %p25; | ||||
| 	selp.f32 	%r368, 0f00000000, %r360, %p24; | ||||
| 	selp.f32 	%r369, 0f00000000, %r361, %p23; | ||||
| 	selp.f32 	%r370, 0f00000000, %r362, %p22; | ||||
| 	selp.f32 	%r371, 0f00000000, %r363, %p21; | ||||
| 	selp.f32 	%r372, 0f00000000, %r364, %p20; | ||||
| 	selp.f32 	%r373, 0f00000000, %r365, %p19; | ||||
| $L__tmp2: | ||||
| 	.loc	1 104 52                        // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:104:52 | ||||
| 	shl.b32 	%r374, %r77, 4; | ||||
| 	shl.b32 	%r375, %r78, 4; | ||||
| 	.loc	1 104 49                        // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:104:49 | ||||
| 	add.s32 	%r376, %r374, %r80; | ||||
| 	add.s32 	%r377, %r80, %r375; | ||||
| 	.loc	1 104 25                        // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:104:25 | ||||
| 	mul.wide.s32 	%rd27, %r376, 4; | ||||
| 	add.s64 	%rd19, %rd4, %rd27; | ||||
| 	mul.wide.s32 	%rd28, %r377, 4; | ||||
| 	add.s64 	%rd20, %rd4, %rd28; | ||||
| 	.loc	1 104 78                        // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:104:78 | ||||
| 	selp.f32 	%r378, %r366, %r368, %p18; | ||||
| 	or.b32 	%r379, %r183, %r67; | ||||
| 	or.b32 	%r380, %r379, %r84; | ||||
| 	shfl.sync.idx.b32 	%r381, %r378, %r380, 31, -1; | ||||
| 	selp.f32 	%r382, %r367, %r369, %p18; | ||||
| 	shfl.sync.idx.b32 	%r383, %r382, %r380, 31, -1; | ||||
| 	selp.f32 	%r384, %r368, %r366, %p18; | ||||
| 	xor.b32 	%r385, %r171, %r67; | ||||
| 	or.b32 	%r386, %r183, %r385; | ||||
| 	shfl.sync.idx.b32 	%r387, %r384, %r386, 31, -1; | ||||
| 	selp.f32 	%r388, %r369, %r367, %p18; | ||||
| 	shfl.sync.idx.b32 	%r389, %r388, %r386, 31, -1; | ||||
| 	selp.f32 	%r390, %r370, %r372, %p18; | ||||
| 	shfl.sync.idx.b32 	%r391, %r390, %r380, 31, -1; | ||||
| 	selp.f32 	%r392, %r371, %r373, %p18; | ||||
| 	shfl.sync.idx.b32 	%r393, %r392, %r380, 31, -1; | ||||
| 	selp.f32 	%r394, %r372, %r370, %p18; | ||||
| 	shfl.sync.idx.b32 	%r395, %r394, %r386, 31, -1; | ||||
| 	selp.f32 	%r396, %r373, %r371, %p18; | ||||
| 	shfl.sync.idx.b32 	%r397, %r396, %r386, 31, -1; | ||||
| 	selp.b32 	%r45, %r387, %r381, %p17; | ||||
| 	selp.b32 	%r46, %r389, %r383, %p17; | ||||
| 	selp.b32 	%r49, %r395, %r391, %p17; | ||||
| 	selp.b32 	%r50, %r397, %r393, %p17; | ||||
| 	selp.b32 	%r43, %r381, %r387, %p17; | ||||
| 	selp.b32 	%r44, %r383, %r389, %p17; | ||||
| 	// begin inline asm | ||||
| 	@%p10 st.global.v4.b32 [ %rd19 + 0 ], { %r43, %r44, %r45, %r46 }; | ||||
| 	// end inline asm | ||||
| 	selp.b32 	%r47, %r391, %r395, %p17; | ||||
| 	selp.b32 	%r48, %r393, %r397, %p17; | ||||
| 	// begin inline asm | ||||
| 	@%p11 st.global.v4.b32 [ %rd20 + 0 ], { %r47, %r48, %r49, %r50 }; | ||||
| 	// end inline asm | ||||
| $L__BB0_1:                              // %common.ret | ||||
| 	.loc	1 0 0                           // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:0 | ||||
| 	ret; | ||||
| $L__tmp3: | ||||
| $L__func_end0: | ||||
|                                         // -- End function | ||||
| } | ||||
| 	.file	1 "/tmp/torchinductor_shangdiy/uw/cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py" | ||||
| 	.file	2 "/home/shangdiy/pytorch/torch/_inductor/runtime/triton_helpers.py" | ||||
| 	.section	.debug_abbrev | ||||
| 	{ | ||||
| .b8 1                                   // Abbreviation Code | ||||
| .b8 17                                  // DW_TAG_compile_unit | ||||
| .b8 1                                   // DW_CHILDREN_yes | ||||
| .b8 37                                  // DW_AT_producer | ||||
| .b8 8                                   // DW_FORM_string | ||||
| .b8 19                                  // DW_AT_language | ||||
| .b8 5                                   // DW_FORM_data2 | ||||
| .b8 3                                   // DW_AT_name | ||||
| .b8 8                                   // DW_FORM_string | ||||
| .b8 16                                  // DW_AT_stmt_list | ||||
| .b8 6                                   // DW_FORM_data4 | ||||
| .b8 27                                  // DW_AT_comp_dir | ||||
| .b8 8                                   // DW_FORM_string | ||||
| .b8 0                                   // EOM(1) | ||||
| .b8 0                                   // EOM(2) | ||||
| .b8 2                                   // Abbreviation Code | ||||
| .b8 46                                  // DW_TAG_subprogram | ||||
| .b8 0                                   // DW_CHILDREN_no | ||||
| .b8 3                                   // DW_AT_name | ||||
| .b8 8                                   // DW_FORM_string | ||||
| .b8 32                                  // DW_AT_inline | ||||
| .b8 11                                  // DW_FORM_data1 | ||||
| .b8 0                                   // EOM(1) | ||||
| .b8 0                                   // EOM(2) | ||||
| .b8 3                                   // Abbreviation Code | ||||
| .b8 46                                  // DW_TAG_subprogram | ||||
| .b8 1                                   // DW_CHILDREN_yes | ||||
| .b8 17                                  // DW_AT_low_pc | ||||
| .b8 1                                   // DW_FORM_addr | ||||
| .b8 18                                  // DW_AT_high_pc | ||||
| .b8 1                                   // DW_FORM_addr | ||||
| .b8 49                                  // DW_AT_abstract_origin | ||||
| .b8 19                                  // DW_FORM_ref4 | ||||
| .b8 0                                   // EOM(1) | ||||
| .b8 0                                   // EOM(2) | ||||
| .b8 4                                   // Abbreviation Code | ||||
| .b8 29                                  // DW_TAG_inlined_subroutine | ||||
| .b8 0                                   // DW_CHILDREN_no | ||||
| .b8 49                                  // DW_AT_abstract_origin | ||||
| .b8 19                                  // DW_FORM_ref4 | ||||
| .b8 17                                  // DW_AT_low_pc | ||||
| .b8 1                                   // DW_FORM_addr | ||||
| .b8 18                                  // DW_AT_high_pc | ||||
| .b8 1                                   // DW_FORM_addr | ||||
| .b8 88                                  // DW_AT_call_file | ||||
| .b8 11                                  // DW_FORM_data1 | ||||
| .b8 89                                  // DW_AT_call_line | ||||
| .b8 11                                  // DW_FORM_data1 | ||||
| .b8 87                                  // DW_AT_call_column | ||||
| .b8 11                                  // DW_FORM_data1 | ||||
| .b8 0                                   // EOM(1) | ||||
| .b8 0                                   // EOM(2) | ||||
| .b8 0                                   // EOM(3) | ||||
| 	} | ||||
| 	.section	.debug_info | ||||
| 	{ | ||||
| .b32 195                                // Length of Unit | ||||
| .b8 2                                   // DWARF version number | ||||
| .b8 0 | ||||
| .b32 .debug_abbrev                      // Offset Into Abbrev. Section | ||||
| .b8 8                                   // Address Size (in bytes) | ||||
| .b8 1                                   // Abbrev [1] 0xb:0xbc DW_TAG_compile_unit | ||||
| .b8 116                                 // DW_AT_producer | ||||
| .b8 114 | ||||
| .b8 105 | ||||
| .b8 116 | ||||
| .b8 111 | ||||
| .b8 110 | ||||
| .b8 0 | ||||
| .b8 2                                   // DW_AT_language | ||||
| .b8 0 | ||||
| .b8 99                                  // DW_AT_name | ||||
| .b8 117 | ||||
| .b8 119 | ||||
| .b8 104 | ||||
| .b8 98 | ||||
| .b8 112 | ||||
| .b8 101 | ||||
| .b8 99 | ||||
| .b8 100 | ||||
| .b8 50 | ||||
| .b8 117 | ||||
| .b8 107 | ||||
| .b8 101 | ||||
| .b8 115 | ||||
| .b8 111 | ||||
| .b8 51 | ||||
| .b8 106 | ||||
| .b8 120 | ||||
| .b8 101 | ||||
| .b8 107 | ||||
| .b8 99 | ||||
| .b8 122 | ||||
| .b8 118 | ||||
| .b8 103 | ||||
| .b8 104 | ||||
| .b8 98 | ||||
| .b8 55 | ||||
| .b8 97 | ||||
| .b8 104 | ||||
| .b8 50 | ||||
| .b8 104 | ||||
| .b8 107 | ||||
| .b8 50 | ||||
| .b8 122 | ||||
| .b8 111 | ||||
| .b8 98 | ||||
| .b8 54 | ||||
| .b8 55 | ||||
| .b8 111 | ||||
| .b8 112 | ||||
| .b8 99 | ||||
| .b8 52 | ||||
| .b8 51 | ||||
| .b8 97 | ||||
| .b8 103 | ||||
| .b8 104 | ||||
| .b8 101 | ||||
| .b8 53 | ||||
| .b8 119 | ||||
| .b8 108 | ||||
| .b8 118 | ||||
| .b8 51 | ||||
| .b8 46 | ||||
| .b8 112 | ||||
| .b8 121 | ||||
| .b8 0 | ||||
| .b32 .debug_line                        // DW_AT_stmt_list | ||||
| .b8 47                                  // DW_AT_comp_dir | ||||
| .b8 116 | ||||
| .b8 109 | ||||
| .b8 112 | ||||
| .b8 47 | ||||
| .b8 116 | ||||
| .b8 111 | ||||
| .b8 114 | ||||
| .b8 99 | ||||
| .b8 104 | ||||
| .b8 105 | ||||
| .b8 110 | ||||
| .b8 100 | ||||
| .b8 117 | ||||
| .b8 99 | ||||
| .b8 116 | ||||
| .b8 111 | ||||
| .b8 114 | ||||
| .b8 95 | ||||
| .b8 115 | ||||
| .b8 104 | ||||
| .b8 97 | ||||
| .b8 110 | ||||
| .b8 103 | ||||
| .b8 100 | ||||
| .b8 105 | ||||
| .b8 121 | ||||
| .b8 47 | ||||
| .b8 117 | ||||
| .b8 119 | ||||
| .b8 0 | ||||
| .b8 2                                   // Abbrev [2] 0x70:0x28 DW_TAG_subprogram | ||||
| .b8 109                                 // DW_AT_name | ||||
| .b8 111 | ||||
| .b8 100 | ||||
| .b8 101 | ||||
| .b8 108 | ||||
| .b8 95 | ||||
| .b8 116 | ||||
| .b8 114 | ||||
| .b8 105 | ||||
| .b8 116 | ||||
| .b8 111 | ||||
| .b8 110 | ||||
| .b8 95 | ||||
| .b8 116 | ||||
| .b8 101 | ||||
| .b8 109 | ||||
| .b8 95 | ||||
| .b8 102 | ||||
| .b8 117 | ||||
| .b8 115 | ||||
| .b8 101 | ||||
| .b8 100 | ||||
| .b8 95 | ||||
| .b8 97 | ||||
| .b8 100 | ||||
| .b8 100 | ||||
| .b8 109 | ||||
| .b8 109 | ||||
| .b8 95 | ||||
| .b8 114 | ||||
| .b8 101 | ||||
| .b8 108 | ||||
| .b8 117 | ||||
| .b8 95 | ||||
| .b8 116 | ||||
| .b8 95 | ||||
| .b8 48 | ||||
| .b8 0 | ||||
| .b8 1                                   // DW_AT_inline | ||||
| .b8 3                                   // Abbrev [3] 0x98:0x2e DW_TAG_subprogram | ||||
| .b64 $L__func_begin0                    // DW_AT_low_pc | ||||
| .b64 $L__func_end0                      // DW_AT_high_pc | ||||
| .b32 112                                // DW_AT_abstract_origin | ||||
| .b8 4                                   // Abbrev [4] 0xad:0x18 DW_TAG_inlined_subroutine | ||||
| .b32 112                                // DW_AT_abstract_origin | ||||
| .b64 $L__tmp1                           // DW_AT_low_pc | ||||
| .b64 $L__tmp2                           // DW_AT_high_pc | ||||
| .b8 1                                   // DW_AT_call_file | ||||
| .b8 103                                 // DW_AT_call_line | ||||
| .b8 40                                  // DW_AT_call_column | ||||
| .b8 0                                   // End Of Children Mark | ||||
| .b8 0                                   // End Of Children Mark | ||||
| 	} | ||||
| 	.section	.debug_macinfo	{	} | ||||
							
								
								
									
										8
									
								
								model2/data/aotinductor/model/script.ld
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										8
									
								
								model2/data/aotinductor/model/script.ld
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,8 @@ | ||||
| SECTIONS { | ||||
|   /* Emit .lrodata as its own output section, positioned after .bss. | ||||
|    * | ||||
|    * By default, in LLD 16, .lrodata is placed immediately after .rodata. | ||||
|    * However, .lrodata can be very large in our compiled models, which leads to | ||||
|    * relocation out-of-range errors for relative relocations. So we place it | ||||
|    * after the other sections that are referenced from .text using relative | ||||
|    * relocations. This is the default behavior in GNU ld. */ | ||||
|   .lrodata : { *(.lrodata) } | ||||
|  } INSERT AFTER .bss; | ||||
							
								
								
									
										147
									
								
								model2/main.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										147
									
								
								model2/main.cpp
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,147 @@ | ||||
| // <windows.h> provides LoadLibrary/GetProcAddress/FreeLibrary, the Windows counterpart of POSIX <dlfcn.h> | ||||
| #include <windows.h> | ||||
| #include <stdio.h> | ||||
|  | ||||
| #include <iostream> | ||||
| #include <memory> | ||||
| #include <stdexcept> | ||||
| #include <string> | ||||
| #include <vector> | ||||
|  | ||||
| // Include the AOTInductor headers | ||||
| // #include <torch/csrc/inductor/aoti_runner/model_container_runner_cuda.h> | ||||
| #include <torch/csrc/inductor/aoti_runtime/interface.h> | ||||
| // #include <torch/csrc/inductor/aoti_runtime/model_container.h> | ||||
| // #include <torch/csrc/inductor/aoti_torch/tensor_converter.h> // @manual | ||||
| #include <torch/csrc/inductor/aoti_torch/c/shim.h> | ||||
| #include <standalone/slim/core/Empty.h> | ||||
| #include <standalone/slim/cuda/Guard.h> | ||||
| #include <standalone/torch/csrc/inductor/aoti_torch/tensor_converter.h> | ||||
|  | ||||
| // Converts the NUL-terminated UTF-8 string `s` into a UTF-16 std::wstring | ||||
| // via the Win32 MultiByteToWideChar API. The returned string does not carry | ||||
| // the terminator that the API writes into the buffer. | ||||
| static std::wstring u8u16(const char* s) { | ||||
|     // Size query: with an input length of -1 the API processes the | ||||
|     // terminating NUL too, so the reported count includes it. | ||||
|     const int wide_count = MultiByteToWideChar(CP_UTF8, 0, s, -1, nullptr, 0); | ||||
|  | ||||
|     // Conversion pass into a buffer of exactly the reported size. | ||||
|     std::wstring converted(wide_count, L'\0'); | ||||
|     MultiByteToWideChar(CP_UTF8, 0, s, -1, &converted[0], wide_count); | ||||
|  | ||||
|     // Trim the trailing NUL so size() reflects the text only. | ||||
|     const bool ends_with_nul = !converted.empty() && converted.back() == L'\0'; | ||||
|     if (ends_with_nul) { | ||||
|         converted.resize(converted.size() - 1); | ||||
|     } | ||||
|     return converted; | ||||
| } | ||||
|  | ||||
| // Demo driver: loads the AOTInductor-compiled model library, runs it once on | ||||
| // an 8x10 CUDA float tensor filled with 1.0, and prints each output element. | ||||
| // Returns 0 on success and 1 when loading the library, resolving a symbol, | ||||
| // creating the container, or running the model fails. | ||||
| int main() { | ||||
|   // Unloads the model library on every exit path, including exceptions. | ||||
|   struct LibraryGuard { | ||||
|     HMODULE module = nullptr; | ||||
|     ~LibraryGuard() { | ||||
|       if (module) { | ||||
|         FreeLibrary(module); | ||||
|       } | ||||
|     } | ||||
|   }; | ||||
|  | ||||
|   // Deletes the model container when an error skips the explicit delete at | ||||
|   // the end of the happy path. | ||||
|   struct ContainerGuard { | ||||
|     decltype(&AOTInductorModelContainerDelete) destroy = nullptr; | ||||
|     AOTInductorModelContainerHandle handle = nullptr; | ||||
|     ~ContainerGuard() { | ||||
|       if (destroy && handle) { | ||||
|         destroy(handle); | ||||
|       } | ||||
|     } | ||||
|   }; | ||||
|  | ||||
|   try { | ||||
|     // Load the DLL (model.pyd is a DLL on Windows) | ||||
|     LibraryGuard library; | ||||
|     { | ||||
|       auto wname = u8u16(R"(C:\Users\shangdiy\source\repos\pytorch\model2\model.pyd)"); | ||||
|  | ||||
|       // Try LoadLibraryExW with safe search flags if supported | ||||
|       if (GetProcAddress(GetModuleHandleW(L"KERNEL32.DLL"), "AddDllDirectory") != NULL) { | ||||
|         library.module = LoadLibraryExW( | ||||
|             wname.c_str(), | ||||
|             NULL, | ||||
|             LOAD_LIBRARY_SEARCH_DEFAULT_DIRS); | ||||
|       } | ||||
|  | ||||
|       // Fallback if that failed | ||||
|       if (!library.module) { | ||||
|         library.module = LoadLibraryW(wname.c_str()); | ||||
|       } | ||||
|  | ||||
|       if (!library.module) { | ||||
|         DWORD dw = GetLastError(); | ||||
|         // Zero-initialised so the buffer is a valid empty string even when | ||||
|         // FormatMessageA cannot produce a message. | ||||
|         char buf[512] = {0}; | ||||
|         DWORD written = FormatMessageA( | ||||
|             FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, | ||||
|             NULL, dw, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), | ||||
|             buf, sizeof(buf), NULL); | ||||
|         std::cerr << "Failed to load model.pyd. WinError " << dw << ": " | ||||
|                   << (written != 0 ? buf : "(no system message available)") | ||||
|                   << std::endl; | ||||
|         return 1; | ||||
|       } | ||||
|       std::cout << "Loaded model.pyd" << std::endl; | ||||
|     } | ||||
|  | ||||
|     // Entry points resolved from the loaded library. | ||||
|     decltype(&AOTInductorModelContainerCreateWithDevice) create_model{nullptr}; | ||||
|     decltype(&AOTInductorModelContainerDelete) delete_model{nullptr}; | ||||
|     decltype(&AOTInductorModelContainerRun) run_model{nullptr}; | ||||
|  | ||||
| // Wrapped in do/while(0) so each use behaves as a single statement. | ||||
| #define AOTI_LOAD_SYMBOL(handle_, var, name_str) \ | ||||
|     do { \ | ||||
|         var = reinterpret_cast<decltype(var)>(GetProcAddress(handle_, name_str)); \ | ||||
|         if (!var) { \ | ||||
|             throw std::runtime_error("Could not GetProcAddress " name_str); \ | ||||
|         } \ | ||||
|     } while (0) | ||||
|  | ||||
|     AOTI_LOAD_SYMBOL(library.module, create_model, "AOTInductorModelContainerCreateWithDevice"); | ||||
|     AOTI_LOAD_SYMBOL(library.module, run_model, "AOTInductorModelContainerRun"); | ||||
|     AOTI_LOAD_SYMBOL(library.module, delete_model, "AOTInductorModelContainerDelete"); | ||||
| #undef AOTI_LOAD_SYMBOL | ||||
|  | ||||
|     // Create array of input/output handles | ||||
|     slim::SlimTensor x = slim::empty({8, 10}, c10::kFloat, c10::Device(c10::kCUDA, 0)); | ||||
|     float fill_value = 1.0; | ||||
|     x.fill_(fill_value); | ||||
|     // AOTInductorModel::run will steal the ownership of the input and output | ||||
|     // tensor pointers | ||||
|     std::vector<slim::SlimTensor> inputs = {x}; | ||||
|     std::vector<AtenTensorHandle> input_handles = | ||||
|         unsafe_alloc_new_handles_from_tensors(inputs); | ||||
|  | ||||
|     AtenTensorHandle output_handle = nullptr; | ||||
|     ContainerGuard container; | ||||
|     container.destroy = delete_model; | ||||
|     cudaStream_t stream = slim::cuda::getCurrentCUDAStream(0); | ||||
|  | ||||
|     // Reinterpret as the opaque handle for AOTInductor | ||||
|     AOTInductorStreamHandle stream_handle = reinterpret_cast<AOTInductorStreamHandle>(stream); | ||||
|  | ||||
|     // Construct model | ||||
|     const char* cubin_dir = R"(C:\Users\shangdiy\source\repos\pytorch\model2\)"; | ||||
|     AOTIRuntimeError err = | ||||
|         create_model(&container.handle, 1, "cuda", cubin_dir); | ||||
|     if (err != AOTI_RUNTIME_SUCCESS) { | ||||
|       throw std::runtime_error("Failed to create model container"); | ||||
|     } | ||||
|     std::cout << "Created model\n"; | ||||
|  | ||||
|     // Run the model | ||||
|     err = run_model(container.handle, input_handles.data(), | ||||
|                     input_handles.size(), // num_inputs | ||||
|                     &output_handle, | ||||
|                     1,             // num_outputs | ||||
|                     stream_handle, // stream | ||||
|                     nullptr        // proxy_executor | ||||
|     ); | ||||
|     if (err != AOTI_RUNTIME_SUCCESS) { | ||||
|       throw std::runtime_error("Failed to run model"); | ||||
|     } | ||||
|     std::cout << "Finish model\n"; | ||||
|  | ||||
|     std::vector<slim::SlimTensor> outputs = | ||||
|         alloc_tensors_by_stealing_from_handles(&output_handle, 1); | ||||
|  | ||||
|     // Print the result | ||||
|     slim::SlimTensor slim_tensor = outputs[0]; | ||||
|     auto slim_cpu = slim_tensor.cpu(); | ||||
|     float *slim_data = static_cast<float *>(slim_cpu.data_ptr()); | ||||
|     std::cout << "Output" << std::endl; | ||||
|     std::cout << "slim_data ptr: " << slim_data << "\n"; | ||||
|     size_t num_elements = slim_cpu.numel(); // or equivalent method | ||||
|     std::cout << num_elements << std::endl; | ||||
|  | ||||
|     for (size_t i = 0; i < num_elements; ++i) { | ||||
|       std::cout << slim_data[i] << "\n"; | ||||
|     } | ||||
|  | ||||
|     std::cout << "Done" << std::endl; | ||||
|  | ||||
|     // Delete the container explicitly on the happy path, then disarm the | ||||
|     // guard so it is not deleted a second time. The library itself is | ||||
|     // unloaded by LibraryGuard at scope exit, after the tensors declared | ||||
|     // above have been destroyed. | ||||
|     delete_model(container.handle); | ||||
|     container.handle = nullptr; | ||||
|  | ||||
|     return 0; | ||||
|   } catch (const std::exception &e) { | ||||
|     std::cerr << "Error: " << e.what() << std::endl; | ||||
|     return 1; | ||||
|   } | ||||
| } | ||||
							
								
								
									
										
											BIN
										
									
								
								model2/model.exp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								model2/model.exp
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								model2/model_triton_tem_fused_addmm_relu_sigmoid_t_1.cubin
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								model2/model_triton_tem_fused_addmm_relu_sigmoid_t_1.cubin
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								model2/model_triton_tem_fused_addmm_relu_t_0.cubin
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								model2/model_triton_tem_fused_addmm_relu_t_0.cubin
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
		Reference in New Issue
	
	Block a user
	