mirror of
https://github.com/vllm-project/vllm.git
synced 2025-10-20 23:03:52 +08:00
[Kernel] Marlin_24: Ensure the mma.sp instruction is using the ::ordered_metadata modifier (introduced with PTX 8.5) (#5136)
This commit is contained in:
committed by
GitHub
parent
b35be5403f
commit
6d21fa1cad
@ -32,7 +32,8 @@ __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1,
|
|||||||
float* c = reinterpret_cast<float*>(&frag_c);
|
float* c = reinterpret_cast<float*>(&frag_c);
|
||||||
if (psel == 0) {
|
if (psel == 0) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
|
"mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16."
|
||||||
|
"f32 "
|
||||||
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
|
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
|
||||||
"{%12,%13,%14,%15}, %16, 0x0;\n"
|
"{%12,%13,%14,%15}, %16, 0x0;\n"
|
||||||
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
|
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
|
||||||
@ -40,7 +41,8 @@ __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1,
|
|||||||
"r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]),
|
"r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]),
|
||||||
"r"(e[0]));
|
"r"(e[0]));
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
|
"mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16."
|
||||||
|
"f32 "
|
||||||
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
|
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
|
||||||
"{%12,%13,%14,%15}, %16, 0x0;\n"
|
"{%12,%13,%14,%15}, %16, 0x0;\n"
|
||||||
: "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7])
|
: "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7])
|
||||||
@ -49,7 +51,8 @@ __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1,
|
|||||||
"r"(e[0]));
|
"r"(e[0]));
|
||||||
} else {
|
} else {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
|
"mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16."
|
||||||
|
"f32 "
|
||||||
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
|
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
|
||||||
"{%12,%13,%14,%15}, %16, 0x1;\n"
|
"{%12,%13,%14,%15}, %16, 0x1;\n"
|
||||||
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
|
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
|
||||||
@ -57,7 +60,8 @@ __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1,
|
|||||||
"r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]),
|
"r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]),
|
||||||
"r"(e[0]));
|
"r"(e[0]));
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
|
"mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16."
|
||||||
|
"f32 "
|
||||||
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
|
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
|
||||||
"{%12,%13,%14,%15}, %16, 0x1;\n"
|
"{%12,%13,%14,%15}, %16, 0x1;\n"
|
||||||
: "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7])
|
: "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7])
|
||||||
|
|||||||
Reference in New Issue
Block a user