diff --git a/amd/comgr/CMakeLists.txt b/amd/comgr/CMakeLists.txt index b72fc5444b769..e42fa26aff3a7 100644 --- a/amd/comgr/CMakeLists.txt +++ b/amd/comgr/CMakeLists.txt @@ -108,6 +108,7 @@ set(SOURCES src/comgr-hotswap-llvm.cpp src/comgr-hotswap-patch-f32-to-e5m3.cpp src/comgr-hotswap-patch-inplace.cpp + src/comgr-hotswap-patch-vop3px-wrap.cpp src/comgr-hotswap-patch-vop3px2-src2.cpp src/comgr-hotswap-patch-wmma-hazard.cpp src/comgr-hotswap-patch-wmma-scale16.cpp diff --git a/amd/comgr/src/comgr-hotswap-b0a0.cpp b/amd/comgr/src/comgr-hotswap-b0a0.cpp index 38599efbc96e2..bf51eba5a7868 100644 --- a/amd/comgr/src/comgr-hotswap-b0a0.cpp +++ b/amd/comgr/src/comgr-hotswap-b0a0.cpp @@ -404,6 +404,10 @@ applyGfx1250B0toA0Rules(std::vector &Decoded, Patched += VT.applyWmmaHazardPatch(Ctx); if (VT.applyVop3px2Src2Fix) Patched += VT.applyVop3px2Src2Fix(Ctx); + // Must run after per-instruction splitters so their trampoline bodies can be + // wrapped before trampoline branches are finalized. + if (VT.applyVop3pxWrapPatch) + Patched += VT.applyVop3pxWrapPatch(Ctx); for (const llvm::StringMapEntry &KV : KernelStats) { StringRef KName = KV.first(); diff --git a/amd/comgr/src/comgr-hotswap-internal.h b/amd/comgr/src/comgr-hotswap-internal.h index 98666f42e07a9..067ed08efd693 100644 --- a/amd/comgr/src/comgr-hotswap-internal.h +++ b/amd/comgr/src/comgr-hotswap-internal.h @@ -601,6 +601,7 @@ struct HotswapPatchVTable { // loop completes. uint32_t (*applyWmmaHazardPatch)(PatchContext &) = nullptr; uint32_t (*applyVop3px2Src2Fix)(PatchContext &) = nullptr; + uint32_t (*applyVop3pxWrapPatch)(PatchContext &) = nullptr; }; /// Walk comgr-hotswap-patches.def and bind every patch module's diff --git a/amd/comgr/src/comgr-hotswap-patch-vop3px-wrap.cpp b/amd/comgr/src/comgr-hotswap-patch-vop3px-wrap.cpp new file mode 100644 index 0000000000000..cb6cc8ac585d6 --- /dev/null +++ b/amd/comgr/src/comgr-hotswap-patch-vop3px-wrap.cpp @@ -0,0 +1,291 @@ +//===- comgr-hotswap-patch-vop3px-wrap.cpp - VOP3PX wrap patch ----------===// +// +// Part of Comgr, under the Apache License v2.0 with LLVM Exceptions. See +// amd/comgr/LICENSE.TXT in this repository for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Strong-symbol override for applyVop3pxWrapPatch. VOP3PX2 wrapping for +/// V_WMMA_F32_16X16X128_F8F6F4 on GFX1250 A0 silicon. +/// +/// On A0, an async trap fired between LD_SCALE and the WMMA half of a +/// VOP3PX2 pair is unrecoverable. The trap handler can rewind the PC for +/// known-paired forms (see ROCm/rocm-systems commit 74c647e6605, "rocr: +/// GFX12.5 : VOP3PX instruction split in trap handler"), but a standalone +/// v_wmma_f32_16x16x128_f8f6f4 (no preceding LD_SCALE) cannot be safely +/// rewound at trap time. +/// +/// Workaround: prepend an inline-zero LD_SCALE prefix to every standalone +/// V_WMMA_F32_16X16X128_F8F6F4, turning it into a fused VOP3PX2 with +/// scale=1.0 (a no-op). The trap handler's rewind path then handles it. +/// +/// The replacement is byte-level: an 8-byte LD_SCALE prefix is prepended +/// to the original 8-byte WMMA, leaving the WMMA portion bit-identical. +/// This avoids re-encoding modifier-rich operand layouts (matrix_a_fmt, +/// matrix_b_fmt, neg_lo, neg_hi, matrix_a_reuse, matrix_b_reuse, ...). +/// +/// Two-pass operation: +/// 1. Decoded[] scan -- wraps user-written standalone WMMAs. +/// 2. Trampoline scan -- wraps splitter-emitted WMMAs sitting in +/// trampoline bytes (the K=128 32x16x128_f4 splitter emits f8f6f4 +/// into trampolines). +/// +//===----------------------------------------------------------------------===// + +#include "comgr-hotswap-internal.h" + +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" + +using namespace llvm; + +namespace COMGR { +namespace hotswap { +namespace { + +// LD_SCALE prefix encoding for inline-0 scale (= scale 1.0): +// DWORD 0: 0xCC35_0000 (ENCODING=0xCC3, SCLOP=0x5) +// DWORD 1: 0x0401_0080 = (0x080) | (0x080 << 9) | (0x100 << 18) +// SCL_SRC0[40:32] = 0x80 (= inline 0) +// SCL_SRC1[49:41] = 0x80 (= inline 0) +// Constant[58:50] = 0x100 (= VGPR0; the VOP3PX2 scale_src2 field is +// architecturally unused, but if left at 0 the SQ mis-decodes it as +// an SGPR reference and stalls the SALU for 3 cycles; setting it to +// a VGPR encoding eliminates the false dependency. Same workaround +// the in-place vop3px2-src2 patch applies to user-emitted VOP3PX2 +// instructions; baking it into the wrap pass's prefix bytes keeps +// wrap-emitted trampolines stall-free at creation.) +constexpr uint8_t LdScalePrefix[8] = { + 0x00, 0x00, 0x35, 0xCC, // DWORD 0 + 0x80, 0x00, 0x01, 0x04, // DWORD 1 +}; +constexpr size_t LdScalePrefixSize = sizeof(LdScalePrefix); +constexpr size_t WmmaInstSize = 8; + +// 9 type combinations of f8f6f4 share the same printed mnemonic; the +// matrix_a_fmt / matrix_b_fmt modifiers distinguish them at the encoding +// level. We don't care about the variant distinction for wrapping -- the +// WMMA bytes are preserved verbatim and only the LD_SCALE prefix is +// prepended. +constexpr StringLiteral StandaloneWmma = "v_wmma_f32_16x16x128_f8f6f4"; + +// Already-wrapped detection: per ISA doc page 158, the SCALE prefix MUST +// be immediately preceding the WMMA -- no intervening instructions +// allowed. If the previous decoded instruction is a SCALE form, the WMMA +// is the trailing half of an existing VOP3PX2 and must NOT be wrapped. +constexpr StringLiteral AlreadyWrappedScale = + "v_wmma_scale_f32_16x16x128_f8f6f4"; +constexpr StringLiteral AlreadyWrappedScale16 = + "v_wmma_scale16_f32_16x16x128_f8f6f4"; + +// Defensive: 32x16x128_f4 should be eliminated by the K=128 splitter +// before the wrap pass runs. V_WMMA_SCALE_F32_32X16X128_F4 doesn't exist +// on A0 and a leftover would cause the trap-handler rewind to misdecode +// garbage. +constexpr StringLiteral F4Mnemonic = "v_wmma_f32_32x16x128_f4"; + +// Per-instruction patches (e.g. the K=128 splitter) record their rewrites +// by appending a Trampoline whose `OriginalOffset` points at the source +// instruction. The actual text-byte overwrite (s_branch over the original) +// only happens later in fixupTrampolineBranches, so within the dispatch +// pipeline the canonical "was this offset patched?" signal is "appears as +// a Trampoline.OriginalOffset", NOT a text-byte check. +bool offsetIsPatched(const std::vector &Trampolines, + uint64_t Offset) { + for (const Trampoline &T : Trampolines) + if (T.OriginalOffset == Offset) + return true; + return false; +} + +// Build a trampoline carrying the wrapped form. Replacement is a fixed +// 16 bytes (8-byte LD_SCALE prefix + 8-byte WMMA copied verbatim from +// text). The branch-back goes at the tail; fixupTrampolineBranches +// re-encodes it once the final layout is known. +Trampoline buildWrappedTrampoline(const uint8_t *OriginalWmmaBytes, + uint64_t OriginalOffset, + uint32_t OriginalSize, + uint64_t TrampTextOffset, + const LLVMState &LS) { + Trampoline T; + T.OriginalOffset = OriginalOffset; + T.OriginalSize = OriginalSize; + T.Bytes.reserve(LdScalePrefixSize + WmmaInstSize + MinInstSize); + T.Bytes.insert(T.Bytes.end(), LdScalePrefix, + LdScalePrefix + LdScalePrefixSize); + T.Bytes.insert(T.Bytes.end(), OriginalWmmaBytes, + OriginalWmmaBytes + WmmaInstSize); + + SmallVector Branch = LS.encodeSBranch( + TrampTextOffset + T.Bytes.size(), OriginalOffset + OriginalSize); + if (Branch.empty()) { + T.Bytes.clear(); + return T; + } + T.Bytes.insert(T.Bytes.end(), Branch.begin(), Branch.end()); + return T; +} + +std::string getMnemonic(const LLVMState &LS, const MCInst &Inst) { + if (LS.MCIP) { + std::pair Mnem = LS.MCIP->getMnemonic(Inst); + if (Mnem.first) + return StringRef(Mnem.first).rtrim().str(); + } + return LS.MCII->getName(Inst.getOpcode()).str(); +} + +bool decodeTrampolineInstruction(const LLVMState &LS, ArrayRef Body, + size_t Pos, InternalDecodedInst &DI) { + if (Pos >= Body.size()) + return false; + + ArrayRef Bytes = Body.drop_front(Pos); + uint64_t InstSize = 0; + MCDisassembler::DecodeStatus Status = + LS.MCD->getInstruction(DI.Inst, InstSize, Bytes, Pos, nulls()); + if (Status == MCDisassembler::Fail || InstSize == 0 || + Pos + InstSize > Body.size()) + return false; + + DI.Offset = Pos; + DI.Size = static_cast(InstSize); + DI.Mnemonic = getMnemonic(LS, DI.Inst); + return true; +} + +// Pass 1: wrap user-written standalone WMMAs found in Decoded[] whose +// bytes still match the WMMA encoding (i.e., not already replaced by +// another patch's s_branch). +uint32_t wrapDecodedInstructions(PatchContext &Ctx) { + uint32_t Patched = 0; + for (size_t I = 0, E = Ctx.Decoded.size(); I < E; ++I) { + const InternalDecodedInst &DI = Ctx.Decoded[I]; + if (DI.Mnemonic != StandaloneWmma) + continue; + if (DI.Size != WmmaInstSize) { + log() << "hotswap: error: VOP3PX wrap: " << DI.Mnemonic << " at offset 0x" + << utohexstr(DI.Offset) << " has unexpected size " << DI.Size + << "\n"; + continue; + } + if (DI.Offset + DI.Size > Ctx.TextSize) + continue; + if (offsetIsPatched(Ctx.OutTrampolines, DI.Offset)) + continue; // Another patch already claimed this offset. + if (I > 0) { + const InternalDecodedInst &Prev = Ctx.Decoded[I - 1]; + if (Prev.Mnemonic == AlreadyWrappedScale || + Prev.Mnemonic == AlreadyWrappedScale16) + continue; + } + + uint64_t TrampTextOffset = Ctx.TextSize; + for (const Trampoline &T : Ctx.OutTrampolines) + TrampTextOffset += T.Bytes.size(); + + Trampoline T = buildWrappedTrampoline(Ctx.Text + DI.Offset, DI.Offset, + DI.Size, TrampTextOffset, Ctx.LS); + if (T.Bytes.empty()) { + log() << "hotswap: error: VOP3PX wrap: trampoline encoding failed at 0x" + << utohexstr(DI.Offset) << "\n"; + continue; + } + Ctx.OutTrampolines.push_back(std::move(T)); + + log() << "hotswap: VOP3PX wrap: patched " << DI.Mnemonic << " at offset 0x" + << utohexstr(DI.Offset) << "\n"; + ++Patched; + } + return Patched; +} + +// Pass 2: decode trampoline bodies for splitter-emitted standalone WMMAs +// and prepend the LD_SCALE prefix in-place. Trampoline layout (per +// buildTrampoline / buildWrappedTrampoline): +// [replacement bytes ... ][branch-back 4 bytes] +// We only walk the body, not the branch-back placeholder. Each insert +// grows T.Bytes by LdScalePrefixSize; fixupTrampolineBranches re-encodes +// the branch-back later with the correct trampoline-end offset. +uint32_t wrapTrampolineInstructions(PatchContext &Ctx) { + uint32_t Patched = 0; + for (Trampoline &T : Ctx.OutTrampolines) { + if (T.Bytes.size() < MinInstSize) + continue; + size_t BodyEnd = T.Bytes.size() - MinInstSize; + size_t Pos = 0; + while (Pos < BodyEnd) { + InternalDecodedInst DI; + ArrayRef Body(T.Bytes.data(), BodyEnd); + if (!decodeTrampolineInstruction(Ctx.LS, Body, Pos, DI)) { + log() << "hotswap: error: VOP3PX wrap: could not decode " + << "trampoline body at offset 0x" << utohexstr(Pos) + << " for original offset 0x" << utohexstr(T.OriginalOffset) + << "\n"; + Pos += MinInstSize; + continue; + } + if (DI.Mnemonic != StandaloneWmma) { + Pos += DI.Size; + continue; + } + if (DI.Size != WmmaInstSize) { + log() << "hotswap: error: VOP3PX wrap: " << DI.Mnemonic + << " in trampoline for original offset 0x" + << utohexstr(T.OriginalOffset) << " has unexpected size " + << DI.Size << "\n"; + Pos += DI.Size; + continue; + } + T.Bytes.insert(T.Bytes.begin() + Pos, LdScalePrefix, + LdScalePrefix + LdScalePrefixSize); + BodyEnd += LdScalePrefixSize; + Pos += LdScalePrefixSize + WmmaInstSize; + ++Patched; + log() << "hotswap: VOP3PX wrap: patched in-trampoline WMMA (orig at 0x" + << utohexstr(T.OriginalOffset) << ")\n"; + } + } + return Patched; +} + +// Defensive: refuse to retarget if an unsupported 32x16x128_f4 leftover +// exists in Decoded[] -- the K=128 splitter should have eliminated all of +// these. A leftover would cause the trap-handler rewind to misdecode +// garbage, since V_WMMA_SCALE_F32_32X16X128_F4 doesn't exist on A0. +bool checkNoF4Leftovers(PatchContext &Ctx) { + bool Found = false; + for (const InternalDecodedInst &DI : Ctx.Decoded) { + if (DI.Mnemonic != F4Mnemonic) + continue; + if (DI.Offset + DI.Size > Ctx.TextSize) + continue; + if (offsetIsPatched(Ctx.OutTrampolines, DI.Offset)) + continue; // K=128 splitter handled it. + log() << "hotswap: error: VOP3PX wrap: unsplit " << F4Mnemonic << " at 0x" + << utohexstr(DI.Offset) + << " -- K=128 splitter must run before VOP3PX wrap\n"; + Found = true; + } + return Found; +} + +uint32_t applyVop3pxWrapPatchImpl(PatchContext &Ctx) { + if (checkNoF4Leftovers(Ctx)) + return 0; + uint32_t Patched = wrapDecodedInstructions(Ctx); + Patched += wrapTrampolineInstructions(Ctx); + return Patched; +} + +} // namespace + +void registerVop3pxWrapPatch(HotswapPatchVTable &VT) { + VT.applyVop3pxWrapPatch = &applyVop3pxWrapPatchImpl; +} + +} // namespace hotswap +} // namespace COMGR diff --git a/amd/comgr/src/comgr-hotswap-patches.def b/amd/comgr/src/comgr-hotswap-patches.def index f2fea35988888..b355a7b64c678 100644 --- a/amd/comgr/src/comgr-hotswap-patches.def +++ b/amd/comgr/src/comgr-hotswap-patches.def @@ -31,6 +31,7 @@ HOTSWAP_PATCH(InPlace) HOTSWAP_PATCH(Scratch) HOTSWAP_PATCH(Trampoline) HOTSWAP_PATCH(Vop3px2Src2) +HOTSWAP_PATCH(Vop3pxWrap) HOTSWAP_PATCH(WmmaHazard) HOTSWAP_PATCH(WmmaScale16) HOTSWAP_PATCH(WmmaSplit) diff --git a/amd/comgr/test-lit/hotswap-vop3px-wrap.s b/amd/comgr/test-lit/hotswap-vop3px-wrap.s new file mode 100644 index 0000000000000..fd76ba7f417c5 --- /dev/null +++ b/amd/comgr/test-lit/hotswap-vop3px-wrap.s @@ -0,0 +1,127 @@ +// Test VOP3PX2 wrap patches for GFX1250 B0-to-A0 hotswap. +// +// On A0 silicon, an async trap fired between LD_SCALE and the WMMA half +// of a VOP3PX2 pair is unrecoverable. The trap handler rewinds the PC +// for known-paired forms (ROCm/rocm-systems commit 74c647e6605); this +// pass ensures every standalone V_WMMA_F32_16X16X128_F8F6F4 (encoding +// 0xCC33) is paired with an inline-zero LD_SCALE prefix (effectively +// scale=1.0, a no-op) so the rewind path always has a pair to walk +// back to. +// +// The wrap is byte-level: an 8-byte LD_SCALE prefix is prepended to the +// original 8-byte WMMA, leaving the WMMA portion bit-identical. In +// disassembly the result reads as a single fused +// `v_wmma_scale_f32_16x16x128_f8f6f4` instruction with `0, 0` for the +// two scale operands. +// +// Per amd/comgr/AGENT_CONVENTIONS.md, LIT inputs are compiled with +// %clang directly (not through Comgr actions), and llvm-objdump / +// FileCheck go through their lit substitutions. + +// RUN: %clang -target amdgcn-amd-amdhsa -mcpu=gfx1250 -nostdlib %s -o %t.elf +// RUN: hotswap-rewrite %t.elf \ +// RUN: amdgcn-amd-amdhsa--gfx1250 amdgcn-amd-amdhsa--gfx1250 \ +// RUN: --output %t.out.elf \ +// RUN: | %FileCheck --check-prefix=API %s +// API: RESULT: SUCCESS +// RUN: %llvm-objdump -d %t.out.elf | %FileCheck %s + +.amdgcn_target "amdgcn-amd-amdhsa--gfx1250" + +// ── Test 1: bare standalone WMMA gets wrapped ─────────────────────────────── +// +// The original WMMA must be replaced by an s_branch into the trampoline, +// where the wrapped form appears as a SCALE-prefixed VOP3PX2. +// +// CHECK-LABEL: : +// CHECK-NOT: v_wmma_f32_16x16x128_f8f6f4 v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +// CHECK: s_branch +.globl test_standalone_f8f6f4 +.p2align 8 +.type test_standalone_f8f6f4,@function +test_standalone_f8f6f4: + v_wmma_f32_16x16x128_f8f6f4 v[16:23], v[0:15], v[8:23], v[16:23] + s_endpgm +.size test_standalone_f8f6f4, .-test_standalone_f8f6f4 + +// ── Test 2: standalone with explicit FP8/FP8 modifiers ────────────────────── +// +// Modifiers (matrix_a_fmt, matrix_b_fmt) must be preserved verbatim in +// the wrapped form because the WMMA bytes are copied unchanged. +// +// CHECK-LABEL: : +// CHECK-NOT: v_wmma_f32_16x16x128_f8f6f4 v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] matrix_a_fmt +// CHECK: s_branch +.globl test_standalone_fp8_fp8 +.p2align 8 +.type test_standalone_fp8_fp8,@function +test_standalone_fp8_fp8: + v_wmma_f32_16x16x128_f8f6f4 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_fmt:MATRIX_FMT_FP8 matrix_b_fmt:MATRIX_FMT_FP8 + s_endpgm +.size test_standalone_fp8_fp8, .-test_standalone_fp8_fp8 + +// ── Test 3: standalone with FP6/FP4 mixed modifiers ───────────────────────── +// +// Verifies that all 9 type combinations are handled — the wrap is +// modifier-agnostic; only the WMMA's byte-level opcode (0xCC33) matters. +// +// CHECK-LABEL: : +// CHECK-NOT: v_wmma_f32_16x16x128_f8f6f4 v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] matrix_a_fmt +// CHECK: s_branch +.globl test_standalone_fp6_fp4 +.p2align 8 +.type test_standalone_fp6_fp4,@function +test_standalone_fp6_fp4: + v_wmma_f32_16x16x128_f8f6f4 v[16:23], v[0:11], v[8:15], v[16:23] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4 + s_endpgm +.size test_standalone_fp6_fp4, .-test_standalone_fp6_fp4 + +// ── Test 4: already-wrapped (SCALE prefix present) is left alone ───────────── +// +// A v_wmma_scale_f32_16x16x128_f8f6f4 is already a fused VOP3PX2. The +// wrap pass MUST NOT add a second LD_SCALE prefix in front of it (that +// would corrupt the encoding). We assert that no s_branch is installed +// over the user's already-wrapped form. +// +// CHECK-LABEL: : +// CHECK: v_wmma_scale_f32_16x16x128_f8f6f4 +// CHECK-NOT: s_branch +// CHECK: s_endpgm +.globl test_already_wrapped +.p2align 8 +.type test_already_wrapped,@function +test_already_wrapped: + v_wmma_scale_f32_16x16x128_f8f6f4 v[16:23], v[0:15], v[8:23], v[16:23], 0, 0 + s_endpgm +.size test_already_wrapped, .-test_already_wrapped + +// ── Test 5: K=128 splitter interaction ────────────────────────────────────── +// +// `v_wmma_f32_32x16x128_f4` is split by the K=128 splitter into two +// f8f6f4 WMMAs that land in a trampoline. The wrap pass's pass-2 +// trampoline scan must wrap BOTH of them. Disassembled trampoline +// region should contain `v_wmma_scale_f32_16x16x128_f8f6f4` (the wrapped +// form) and NO bare `v_wmma_f32_16x16x128_f8f6f4` (the unwrapped form). +// +// CHECK-LABEL: : +// CHECK-NOT: v_wmma_f32_32x16x128_f4 +// CHECK: s_branch +.globl test_f4_split_then_wrap +.p2align 8 +.type test_f4_split_then_wrap,@function +test_f4_split_then_wrap: + v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], 0 + s_endpgm +.size test_f4_split_then_wrap, .-test_f4_split_then_wrap + +// ── Trampoline region asserts ─────────────────────────────────────────────── +// +// At least one wrapped form (LD_SCALE + WMMA) must appear in the +// trampoline tail. We use CHECK-DAG since trampolines are emitted in +// patch order and the lit harness's textual order may differ. We also +// assert that no bare standalone WMMA leaks through the splitter+wrap +// pipeline — every f8f6f4 in the rewritten ELF must be paired with a +// SCALE prefix (i.e., printed as `v_wmma_scale_*`). +// +// CHECK-DAG: v_wmma_scale_f32_16x16x128_f8f6f4 +// CHECK-NOT: v_wmma_f32_16x16x128_f8f6f4 v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} diff --git a/amd/comgr/test-lit/hotswap-vop3px2-src2-noop.s b/amd/comgr/test-lit/hotswap-vop3px2-src2-noop.s index 44308d1f30cd4..aa19fa05e467d 100644 --- a/amd/comgr/test-lit/hotswap-vop3px2-src2-noop.s +++ b/amd/comgr/test-lit/hotswap-vop3px2-src2-noop.s @@ -1,7 +1,16 @@ // COM: Passthrough test for the VOP3PX2 scale_src2 bit-field fix. A kernel -// COM: with no V_WMMA_SCALE* instructions must be left structurally -// COM: unchanged: no bits are modified, and the disassembly must match the -// COM: original layout. +// COM: with no V_WMMA_SCALE* instructions must not have the in-place +// COM: scale_src2 patch fire on it. +// COM: +// COM: Since the VOP3PX2 wrap pass landed (comgr-hotswap-patch-vop3px- +// COM: wrap.cpp), the bare f8f6f4 in this kernel now gets wrapped into a +// COM: VOP3PX2 in a trampoline; the resulting v_wmma_scale_* carries +// COM: scale_src2 = VGPR0 (0x100) baked into the wrap pass's prefix bytes +// COM: (the same SALU-stall workaround the in-place vop3px2-src2 fix +// COM: applies to user-emitted VOP3PX2). So the in-place vop3px2-src2 fix +// COM: still has nothing to do here -- it only fires on user-emitted +// COM: v_wmma_scale_* instructions in the kernel body, not on +// COM: wrap-emitted ones in trampolines. // RUN: %clang --target=amdgcn-amd-amdhsa -mcpu=gfx1250 -nostdlib %s -o %t.elf @@ -11,13 +20,15 @@ // RUN: | %FileCheck --check-prefix=API %s // API: RESULT: SUCCESS -// COM: No V_WMMA_SCALE instructions, so the patch must not fire. -// COM: Verify the disassembly layout is preserved and that v_wmma_scale -// COM: does not appear (DISASM-NOT scope: between v_wmma_f32 and s_endpgm). +// COM: Kernel body: bare f8f6f4 becomes an s_branch into the wrap trampoline. +// COM: Trampoline body: v_wmma_scale_* (the wrap-produced fused VOP3PX2); +// COM: this is from the VOP3PX2 wrap pass, not from the in-place +// COM: scale_src2 patch. // RUN: %llvm-objdump -d %t.out.elf | %FileCheck --check-prefix=DISASM %s -// DISASM: v_wmma_f32_16x16x128_f8f6f4 -// DISASM-NOT: v_wmma_scale -// DISASM: s_endpgm +// DISASM-LABEL: : +// DISASM: s_branch +// DISASM: s_endpgm +// DISASM: v_wmma_scale_f32_16x16x128_f8f6f4{{.*}}, 0, 0{{.*}}matrix_a_fmt:MATRIX_FMT_BF8{{.*}}matrix_b_fmt:MATRIX_FMT_FP6 // COM: Idempotency: second rewrite must produce identical bytes. // RUN: hotswap-rewrite %t.out.elf \ diff --git a/amd/comgr/test-lit/hotswap-wmma-split-msplit-fp-imm.s b/amd/comgr/test-lit/hotswap-wmma-split-msplit-fp-imm.s index 5062feeae1689..b884fa25e4573 100644 --- a/amd/comgr/test-lit/hotswap-wmma-split-msplit-fp-imm.s +++ b/amd/comgr/test-lit/hotswap-wmma-split-msplit-fp-imm.s @@ -26,8 +26,12 @@ // DISASM: s_branch // DISASM: s_endpgm -// DISASM: v_wmma_f32_16x16x128_f8f6f4 v[64:71], v[0:7], v[2:9], 1.0 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 -// DISASM-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[72:79], v[8:15], v[2:9], 1.0 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 +// COM: After the K=128 splitter emits two f8f6f4 halves into a trampoline, +// COM: the VOP3PX2 wrap pass prepends an inline-0 LD_SCALE prefix to each. +// COM: The fused VOP3PX2 disassembles as v_wmma_scale_f32_16x16x128_f8f6f4 +// COM: with `, 0, 0` for the two scale operands (= scale 1.0, no-op). +// DISASM: v_wmma_scale_f32_16x16x128_f8f6f4 v[64:71], v[0:7], v[2:9], 1.0, 0, 0 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 +// DISASM-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[72:79], v[8:15], v[2:9], 1.0, 0, 0 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 // DISASM-NEXT: s_branch .globl kernel .p2align 8 diff --git a/amd/comgr/test-lit/hotswap-wmma-split-msplit-neg-lo.s b/amd/comgr/test-lit/hotswap-wmma-split-msplit-neg-lo.s index 0d8b8b8ab5d84..3f154883b4cb2 100644 --- a/amd/comgr/test-lit/hotswap-wmma-split-msplit-neg-lo.s +++ b/amd/comgr/test-lit/hotswap-wmma-split-msplit-neg-lo.s @@ -22,8 +22,12 @@ // DISASM: s_branch // DISASM: s_endpgm -// DISASM: v_wmma_f32_16x16x128_f8f6f4 v[80:87], v[0:7], v[2:9], v[80:87] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 neg_lo:[0,0,1] -// DISASM-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[88:95], v[8:15], v[2:9], v[88:95] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 neg_lo:[0,0,1] +// COM: After the K=128 splitter emits two f8f6f4 halves into a trampoline, +// COM: the VOP3PX2 wrap pass prepends an inline-0 LD_SCALE prefix, so the +// COM: f8f6f4 disassembles as the fused v_wmma_scale form with `, 0, 0` +// COM: scale operands. neg_lo:[0,0,1] is preserved on both halves. +// DISASM: v_wmma_scale_f32_16x16x128_f8f6f4 v[80:87], v[0:7], v[2:9], v[80:87], 0, 0 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 neg_lo:[0,0,1] +// DISASM-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[88:95], v[8:15], v[2:9], v[88:95], 0, 0 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 neg_lo:[0,0,1] // DISASM-NEXT: s_branch .globl kernel .p2align 8 diff --git a/amd/comgr/test-lit/hotswap-wmma-split.s b/amd/comgr/test-lit/hotswap-wmma-split.s index 1f5dfb3989e27..ff38f925f924c 100644 --- a/amd/comgr/test-lit/hotswap-wmma-split.s +++ b/amd/comgr/test-lit/hotswap-wmma-split.s @@ -222,8 +222,14 @@ test_no_split_required: // COM: The replacement opcode is v_wmma_f32_16x16x128_f8f6f4 with both // COM: matrix-format modifiers literally MATRIX_FMT_FP4 so the f8f6f4 // COM: form interprets the data as f4 (matching the original opcode). -// DISASM-DAG: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:7], v[16:23], v[32:39]{{.*}}matrix_a_fmt:MATRIX_FMT_FP4{{.*}}matrix_b_fmt:MATRIX_FMT_FP4 -// DISASM-DAG: v_wmma_f32_16x16x128_f8f6f4 v[40:47], v[8:15], v[16:23], v[40:47]{{.*}}matrix_a_fmt:MATRIX_FMT_FP4{{.*}}matrix_b_fmt:MATRIX_FMT_FP4 +// COM: After the VOP3PX2 wrap pass runs, each f8f6f4 in the trampoline gets +// COM: an inline-0 LD_SCALE prefix prepended; the disassembler greedily +// COM: matches the 16-byte fused VOP3PX2, so the disassembly shows +// COM: `v_wmma_scale_f32_16x16x128_f8f6f4 ... , 0, 0 ...` instead of the +// COM: bare f8f6f4 mnemonic. The `, 0, 0` are the two scale operands +// COM: (= scale 1.0, no-op). +// DISASM-DAG: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:7], v[16:23], v[32:39], 0, 0{{.*}}matrix_a_fmt:MATRIX_FMT_FP4{{.*}}matrix_b_fmt:MATRIX_FMT_FP4 +// DISASM-DAG: v_wmma_scale_f32_16x16x128_f8f6f4 v[40:47], v[8:15], v[16:23], v[40:47], 0, 0{{.*}}matrix_a_fmt:MATRIX_FMT_FP4{{.*}}matrix_b_fmt:MATRIX_FMT_FP4 // Idempotency: rewriting the patched output again should produce identical // bytes (the splitter only fires on K=128 mnemonics, which no longer exist