mirror of
				https://github.com/pytorch/pytorch.git
				synced 2025-10-20 21:14:14 +08:00 
			
		
		
		
	Enable optimized dynamic quantization on aarch64 (#126687)
oneDNN+ACL has optimized kernels for s8s8 matmul, so input is signed. This change leaves behaviour on all other platforms the same. This change requires https://github.com/intel/ideep/pull/313 to go in, and oneDNN 3.5 for the optimized kernels. This change speeds up dynamic quantized linear by ~10x. Also, do you have a policy on copyright headers? Arm's usual policy when contributing to open source projects is to include a copyright header on any file which is modified. Would this be acceptable? If not, is there somewhere else suitable to note copyright? Pull Request resolved: https://github.com/pytorch/pytorch/pull/126687 Approved by: https://github.com/jgong5, https://github.com/malfet, https://github.com/snadampal Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
This commit is contained in:
		
				
					committed by
					
						 PyTorch MergeBot
						PyTorch MergeBot
					
				
			
			
				
	
			
			
			
						parent
						
							f71c3d265a
						
					
				
				
					commit
					50d5aa8c10
				
			| @ -26,6 +26,7 @@ | ||||
|  | ||||
| #include <algorithm> | ||||
| #include <string> | ||||
| #include <type_traits> | ||||
|  | ||||
| int register_linear_params(); | ||||
|  | ||||
| @ -530,12 +531,19 @@ at::Tensor PackedLinearWeightsOnednn::apply_dynamic_impl( | ||||
|     x_min = t_min.item<float>(); | ||||
|   } | ||||
| #endif | ||||
|   const int precision = 8; | ||||
|  | ||||
| #if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED() | ||||
|   // oneDNN+ACL has optimized kernels for s8s8 matmul, so input is signed | ||||
|   using input_qtype = int8_t; | ||||
| #else | ||||
|   using input_qtype = uint8_t; | ||||
| #endif | ||||
|  | ||||
|   auto q_params = quant_utils::ChooseQuantizationParams( | ||||
|       /*min=*/x_min, | ||||
|       /*max=*/x_max, | ||||
|       /*qmin=*/0, | ||||
|       /*qmax=*/(1 << precision) - 1, | ||||
|       /*qmin=*/std::numeric_limits<input_qtype>::min(), | ||||
|       /*qmax=*/std::numeric_limits<input_qtype>::max(), | ||||
|       /*preserve_sparsity=*/false, | ||||
|       /*force_scale_power_of_two=*/false, | ||||
|       /*reduce_range=*/reduce_range); | ||||
| @ -573,7 +581,8 @@ at::Tensor PackedLinearWeightsOnednn::apply_dynamic_impl( | ||||
|       ideep::matmul_forward::prepare</*is_dynamic=*/true>( | ||||
|           params, x, w, b, y, | ||||
|           src_scales, weights_scales, ideep::scale_t(), | ||||
|           src_zero_point, ideep::zero_point_t(), 1.0f, 1.0f, op_attr); | ||||
|           src_zero_point, ideep::zero_point_t(), 1.0f, 1.0f, op_attr, | ||||
|           ideep::tensor::data_type::f32, std::is_signed_v<input_qtype> ? ideep::s8s8 : ideep::u8s8); | ||||
|       get_cache() = LinearPrimitiveCache(cache_key, params); | ||||
|       w = w.reorder_if_differ_in(params.pd.weights_desc()); | ||||
|   }); | ||||
|  | ||||
		Reference in New Issue
	
	Block a user