mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
[Caffe2] Improve SVE batch box cox by 2% (#163360)
Summary: Improve bound checking in the exp computation, shortening the longest dependency chain by 1. Box-Cox benchmarks show a throughput improvement of about 2%. Precision remains unchanged.
Before: NonZeroLambdaBatch 155.30us 6.44K
After: NonZeroLambdaBatch 151.78us 6.59K
Test Plan:
Correctness: buck2 test @//mode/opt //koski/functions_contrib/df4ai/tests:batch_box_cox_test
Performance: buck2 run @//mode/opt //koski/functions_contrib/df4ai/benchmark:boxcox_benchmark
Differential Revision: D82847111
Privacy Context Container: L1208939
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163360
Approved by: https://github.com/Skylion007
This commit is contained in:
committed by
PyTorch MergeBot
parent
5050cfa363
commit
eb11d172e3
@@ -114,14 +114,20 @@ inline float32x4_t vexpq_f32(float32x4_t x) {
|
||||
|
||||
auto poly = svset_neonq(svundef_f32(), vfmaq_f32(scale, p12345, scale));
|
||||
|
||||
auto pHigh = svcmpgt_f32(svptrue_b8(), svset_neonq(svundef_f32(), x), max_input);
|
||||
auto pLow = svcmplt_f32(svptrue_b8(), svset_neonq(svundef_f32(), x), min_input);
|
||||
|
||||
auto bound = svsel_f32(
|
||||
pHigh,
|
||||
inf,
|
||||
zero);
|
||||
|
||||
auto pCombined = svorr_b_z(svptrue_b8(), pLow, pHigh);
|
||||
|
||||
// Handle underflow and overflow.
|
||||
poly = svsel_f32(
|
||||
svcmplt_f32(svptrue_b8(), svset_neonq(svundef_f32(), x), min_input),
|
||||
zero,
|
||||
poly);
|
||||
poly = svsel_f32(
|
||||
svcmpgt_f32(svptrue_b8(), svset_neonq(svundef_f32(), x), max_input),
|
||||
inf,
|
||||
pCombined,
|
||||
bound,
|
||||
poly);
|
||||
|
||||
return svget_neonq(poly);
|
||||
|
Reference in New Issue
Block a user