diff --git a/aten/src/ATen/native/cuda/layer_norm_kernel.cu b/aten/src/ATen/native/cuda/layer_norm_kernel.cu index 940680eb3682..c457bd3dba75 100644 --- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu +++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu @@ -141,7 +141,11 @@ WelfordDataLN cuWelfordOnlineSum( if constexpr (!rms_norm){ U delta = val - curr_sum.mean; U new_count = curr_sum.count + 1.f; +#if defined(USE_ROCM) && defined(USE_LAYERNORM_FAST_RECIPROCAL) + U new_mean = curr_sum.mean + delta * __builtin_amdgcn_rcpf(new_count); +#else U new_mean = curr_sum.mean + delta * (1.f/new_count); //proper division is slow, this is less accurate but noticeably faster +#endif return {new_mean, curr_sum.sigma2 + delta * (val - new_mean), new_count}; } else{ return {0.f, curr_sum.sigma2 + val * val, 0}; @@ -159,7 +163,11 @@ WelfordDataLN cuWelfordCombine( U count = dataA.count + dataB.count; U mean, sigma2; if (count > decltype(dataB.count){0}) { +#if defined(USE_ROCM) && defined(USE_LAYERNORM_FAST_RECIPROCAL) + auto coef = __builtin_amdgcn_rcpf(count); +#else auto coef = 1.f/count; //NB we don't use --use_fast_math, but this is emulation, 1./count goes to intrinsic, `* coef` is multiplication, instead of slow fp division +#endif auto nA = dataA.count * coef; auto nB = dataB.count * coef; mean = nA*dataA.mean + nB*dataB.mean; diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 90fc3f284ac7..733183ef50bd 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1044,6 +1044,17 @@ if(USE_ROCM) list(APPEND HIP_HIPCC_FLAGS -fdebug-info-for-profiling) endif(CMAKE_BUILD_TYPE MATCHES Debug) + # Get EnVar 'USE_LAYERNORM_FAST_RECIPROCAL' (or default to on). + if(DEFINED ENV{USE_LAYERNORM_FAST_RECIPROCAL}) + set(USE_LAYERNORM_FAST_RECIPROCAL $ENV{USE_LAYERNORM_FAST_RECIPROCAL}) + else() + set(USE_LAYERNORM_FAST_RECIPROCAL ON) + endif() + + if(USE_LAYERNORM_FAST_RECIPROCAL) + add_definitions(-DUSE_LAYERNORM_FAST_RECIPROCAL) + endif() + # needed for compat with newer versions of hip-clang that introduced C++20 mangling rules list(APPEND HIP_HIPCC_FLAGS -fclang-abi-compat=17) diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 1fa1398a8917..60951d6c6867 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -128,11 +128,12 @@ function(caffe2_print_configuration_summary) endif() message(STATUS " USE_ROCM : ${USE_ROCM}") if(${USE_ROCM}) - message(STATUS " ROCM_VERSION : ${ROCM_VERSION}") - message(STATUS " USE_FLASH_ATTENTION : ${USE_FLASH_ATTENTION}") - message(STATUS " USE_MEM_EFF_ATTENTION : ${USE_MEM_EFF_ATTENTION}") - message(STATUS " USE_ROCM_CK_SDPA : ${USE_ROCM_CK_SDPA}") - message(STATUS " USE_ROCM_CK_GEMM : ${USE_ROCM_CK_GEMM}") + message(STATUS " ROCM_VERSION : ${ROCM_VERSION}") + message(STATUS " USE_FLASH_ATTENTION : ${USE_FLASH_ATTENTION}") + message(STATUS " USE_MEM_EFF_ATTENTION : ${USE_MEM_EFF_ATTENTION}") + message(STATUS " USE_ROCM_CK_SDPA : ${USE_ROCM_CK_SDPA}") + message(STATUS " USE_ROCM_CK_GEMM : ${USE_ROCM_CK_GEMM}") + message(STATUS " USE_LAYERNORM_FAST_RECIPROCAL : ${USE_LAYERNORM_FAST_RECIPROCAL}") endif() message(STATUS " BUILD_NVFUSER : ${BUILD_NVFUSER}") message(STATUS " USE_EIGEN_FOR_BLAS : ${CAFFE2_USE_EIGEN_FOR_BLAS}") diff --git a/setup.py b/setup.py index bdfab24a0b32..a980a5f35216 100644 --- a/setup.py +++ b/setup.py @@ -156,6 +156,10 @@ # USE_ROCM_KERNEL_ASSERT=1 # Enable kernel assert in ROCm platform # +# USE_LAYERNORM_FAST_RECIPROCAL +# If set, enables the use of builtin functions for fast reciprocals (1/x) w.r.t. +# layer normalization. Default: enabled. +# # USE_ROCM_CK_GEMM=1 # Enable building CK GEMM backend in ROCm platform #