[Kernel] Improve machete memory bound perf (#21556)

Signed-off-by: czhu-cohere <conway.zhu@cohere.com>
This commit is contained in:
czhu-cohere
2025-07-25 06:53:21 -07:00
committed by GitHub
parent b3caeb82e7
commit 136d750f5f

View File

@ -187,8 +187,12 @@ struct PrepackedLayoutBTemplate {
// Builds the layout used when copying the prepacked B operand.
//
// Starts from the layout returned by TVbNbKL_to_offset(shape_mkl) and
// replaces its mode 0 with a freshly built two-dimensional layout of shape
// (256, size<0>(layout) / 256). Modes 1 and 2 of the source layout are
// forwarded unchanged.
//
// @param shape_mkl  Problem shape, passed straight through to
//                   TVbNbKL_to_offset.
// @return           A three-mode layout: (inner_layout, mode 1, mode 2).
//
// NOTE(review): the previous implementation returned
// `make_layout(coalesce(get<0>(layout)), get<1>(layout), get<2>(layout))`.
// That statement must not be kept next to the code below: it would return
// first, leaving the 256-wide inner layout unreachable, and would give this
// `auto` function two return statements of (generally) different types.
CUTE_HOST_DEVICE static constexpr auto TVbNbKL_to_offset_copy(
    Shape_NKL shape_mkl) {
  auto layout = TVbNbKL_to_offset(shape_mkl);
  // for 4-bit elements, having >= 64 values per column
  // allows TMA to load full 32-byte sectors
  // NOTE(review): the division assumes size<0>(layout) is a multiple of
  // 256 -- TODO confirm this holds for every supported instantiation.
  auto inner_layout =
      make_layout(make_shape(_256{}, size<0>(layout) / _256{}));

  return make_layout(inner_layout, get<1>(layout), get<2>(layout));
}
// ((BlockN, BlockK), (BlocksN, BlocksK), L) -> (storage_idx)