mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 12:54:11 +08:00
[ROCm][layer_norm] Use __builtin_amdgcn_rcpf(x) instead of 1.f/x (#165589)
Replace (more) exact calculation with hardware approximation. Benefits: Reduced code size. Improved performance for certain scenarios. Experiments show low reduction in precision. Experiments show no significant performance regressions. bfloat16 as well as float16 related calculations may benefit largely from this change. Co-author: @mhalk @amd-hhashemi Pull Request resolved: https://github.com/pytorch/pytorch/pull/165589 Approved by: https://github.com/jeffdaily
This commit is contained in:
committed by
PyTorch MergeBot
parent
9fe3b2afbe
commit
202f83dc4e
@ -1044,6 +1044,17 @@ if(USE_ROCM)
|
||||
list(APPEND HIP_HIPCC_FLAGS -fdebug-info-for-profiling)
|
||||
endif(CMAKE_BUILD_TYPE MATCHES Debug)
|
||||
|
||||
# Layer-norm fast reciprocal toggle.
#
# Reads the environment variable 'USE_LAYERNORM_FAST_RECIPROCAL' at configure
# time; when the variable is not present in the environment the feature
# defaults to ON. If the resulting value is truthy, the preprocessor symbol
# USE_LAYERNORM_FAST_RECIPROCAL is defined for the sources configured from
# this directory scope.
#
# The expansion is quoted on purpose: an environment variable that is set but
# empty would otherwise expand to `set(USE_LAYERNORM_FAST_RECIPROCAL)`, which
# *unsets* the variable instead of assigning an empty (false) value, and a
# same-named cache entry could then leak into the check below.
if(DEFINED ENV{USE_LAYERNORM_FAST_RECIPROCAL})
  set(USE_LAYERNORM_FAST_RECIPROCAL "$ENV{USE_LAYERNORM_FAST_RECIPROCAL}")
else()
  set(USE_LAYERNORM_FAST_RECIPROCAL ON)
endif()

if(USE_LAYERNORM_FAST_RECIPROCAL)
  add_definitions(-DUSE_LAYERNORM_FAST_RECIPROCAL)
endif()
|
||||
|
||||
# Needed for compatibility with newer versions of hip-clang that introduced
# C++20 mangling rules: appends -fclang-abi-compat=17 to HIP_HIPCC_FLAGS.
|
||||
list(APPEND HIP_HIPCC_FLAGS -fclang-abi-compat=17)
|
||||
|
||||
|
Reference in New Issue
Block a user