Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions amd/comgr/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ set(SOURCES
src/comgr-hotswap-llvm.cpp
src/comgr-hotswap-patch-f32-to-e5m3.cpp
src/comgr-hotswap-patch-inplace.cpp
src/comgr-hotswap-patch-sethalt-fix.cpp
src/comgr-hotswap-patch-vop3px-wrap.cpp
src/comgr-hotswap-patch-vop3px2-src2.cpp
src/comgr-hotswap-patch-wmma-hazard.cpp
src/comgr-hotswap-patch-wmma-split.cpp
Expand Down
14 changes: 14 additions & 0 deletions amd/comgr/src/comgr-hotswap-b0a0.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,20 @@ applyGfx1250B0toA0Rules(std::vector<InternalDecodedInst> &Decoded,
Patched += VT.applyWmmaHazardPatch(Ctx);
if (VT.applyVop3px2Src2Fix)
Patched += VT.applyVop3px2Src2Fix(Ctx);
// The VOP3PX2 wrap pass must run after the K=128 splitter (it relies
// on the splitter having already eliminated 32x16x128_f4 and emitted
// f8f6f4 into trampolines). The wrap pass scans both Decoded[] and
// trampoline bodies and prepends an LD_SCALE prefix to each standalone
// f8f6f4 WMMA, turning it into a VOP3PX2 that the trap handler's
// rewind path can safely retry.
if (VT.applyVop3pxWrapPatch)
Patched += VT.applyVop3pxWrapPatch(Ctx);
// sethalt-fix: defensive in-place s_sethalt -> s_nop. LLVM almost
// never emits s_sethalt (only via int_amdgcn_s_sethalt in debug
// builds), so this pass is dormant on production code; it exists to
// catch hand-written assembly / inline asm that ships s_sethalt.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should ideally document all of the pass pipeline in a single place, and I dont think this is the right place to do it.

Something to remember for the refactor.

if (VT.applySethaltFixPatch)
Patched += VT.applySethaltFixPatch(Ctx);

for (const llvm::StringMapEntry<KernelPatchStats> &KV : KernelStats) {
StringRef KName = KV.first();
Expand Down
2 changes: 2 additions & 0 deletions amd/comgr/src/comgr-hotswap-internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -601,6 +601,8 @@ struct HotswapPatchVTable {
// loop completes.
uint32_t (*applyWmmaHazardPatch)(PatchContext &) = nullptr;
uint32_t (*applyVop3px2Src2Fix)(PatchContext &) = nullptr;
uint32_t (*applyVop3pxWrapPatch)(PatchContext &) = nullptr;
uint32_t (*applySethaltFixPatch)(PatchContext &) = nullptr;
};

/// Walk comgr-hotswap-patches.def and bind every patch module's
Expand Down
96 changes: 96 additions & 0 deletions amd/comgr/src/comgr-hotswap-patch-sethalt-fix.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
//===- comgr-hotswap-patch-sethalt-fix.cpp - in-shader sethalt fix ------===//
//
// Part of Comgr, under the Apache License v2.0 with LLVM Exceptions. See
// amd/comgr/LICENSE.TXT in this repository for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// Strong-symbol override for applySethaltFixPatch. Defensive hotswap
/// pass for the GFX1250 A0 LD_SCALE+WMMA clause-break bug as triggered
/// by the in-shader `s_sethalt` instruction.
///
/// On GFX1250 A0, the SQ has an FSM bug where a wave halt issued just
/// before LD_SCALE breaks the implicit LD_SCALE+WMMA clause: an
/// asynchronous clause-break event during the halt allows other waves
/// to interleave between this wave's LD_SCALE and its WMMA, which
/// either picks up a leaked scale (corrupting another wave's WMMA) or
/// loses its own scale (this wave's WMMA runs unscaled).
///
/// "Software running on A0 cannot use SQ_CMD.HALT operations" covers
/// two halt sources:
/// 1. `s_sethalt` -- in-shader instruction. Reachable from compiler
/// code objects; this pass handles it.
/// 2. SQ_CMD.HALT (SETHALT) -- external register write from the host
/// (debugger breakpoints, rocprofiler PC sampling, kernel CWSR
/// paths). NOT reachable from COMGR; needs ROCdbgapi /
/// rocprofiler / runtime changes to skip on GFX1250 A0. Out of
/// scope for this pass.
///
/// LLVM almost never emits `s_sethalt` in production code -- it appears
/// only via the `int_amdgcn_s_sethalt` intrinsic, which is largely
/// confined to debug builds. This pass exists for defense-in-depth
/// against any input ELF (compiler-emitted or hand-written) that
/// contains the instruction, and would otherwise hit the SQ FSM bug
/// when the kernel also uses VOP3PX2.
///
/// Strategy: detect every `s_sethalt` (mnemonic match against the MC
/// printer output) and rewrite to `s_nop 0` in-place. Same 4 bytes, no
/// relocation, no length change. Replacement bytes come from
/// LS.SNopBytes (pre-encoded at initLLVM() time -- same source the
/// splitter uses for trampoline padding). The shader proceeds without
/// the halt; if the halt was for an in-shader debug breakpoint, that's
/// the correct trade-off on A0 (the debugger should switch to
/// trap-based breakpoints, which is the source-2 fix the debugger team
/// owns).
///
//===----------------------------------------------------------------------===//

#include "comgr-hotswap-internal.h"

#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"

using namespace llvm;

namespace COMGR {
namespace hotswap {
namespace {

constexpr StringLiteral SSethaltMnemonic = "s_sethalt";

uint32_t applySethaltFixPatchImpl(PatchContext &Ctx) {
if (Ctx.LS.SNopBytes.size() != MinInstSize) {
log() << "hotswap: error: sethalt-fix: LS.SNopBytes is not "
<< MinInstSize << " bytes (got " << Ctx.LS.SNopBytes.size() << ")\n";
return 0;
}

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this ever happen ? Just curious

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the expected GFX1250 path, no. SNopBytes is produced by the MC encoder for s_nop 0, and scalar s_nop should encode to the 4-byte MinInstSize.

I kept this as a defensive invariant check before the memcpy: if LLVMState/MC encoding initialization ever fails, or a future target path returns an unexpected encoding size, we should skip the patch instead of writing malformed bytes into .text. This is not expected to be
triggered by a valid code object.

Would you prefer we remove this? it seems codex is more defensive than usual here.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, thanks for the context.

if LLVMState/MC encoding initialization ever fails

I think for this one, we should catch init failure when it happens.

future target path returns an unexpected encoding size

I'd find it hard to envision a case where we update the encoding for s_nop. Maybe we should replace this with an assert ? No strong opinions there IMO. I'll leave it up to you.


uint32_t Patched = 0;
for (const InternalDecodedInst &DI : Ctx.Decoded) {
if (DI.Mnemonic != SSethaltMnemonic)
continue;
if (DI.Size != MinInstSize)
continue;
if (DI.Offset + DI.Size > Ctx.TextSize)
continue;
Comment thread
jammm marked this conversation as resolved.
Outdated

std::memcpy(Ctx.Text + DI.Offset, Ctx.LS.SNopBytes.data(), MinInstSize);

log() << "hotswap: sethalt-fix: neutralized s_sethalt at offset 0x"
<< utohexstr(DI.Offset) << "\n";
++Patched;
}

return Patched;
}

} // namespace

void registerSethaltFixPatch(HotswapPatchVTable &VT) {
VT.applySethaltFixPatch = &applySethaltFixPatchImpl;
}

} // namespace hotswap
} // namespace COMGR
263 changes: 263 additions & 0 deletions amd/comgr/src/comgr-hotswap-patch-vop3px-wrap.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,263 @@
//===- comgr-hotswap-patch-vop3px-wrap.cpp - VOP3PX wrap patch ----------===//
//
// Part of Comgr, under the Apache License v2.0 with LLVM Exceptions. See
// amd/comgr/LICENSE.TXT in this repository for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// Strong-symbol override for applyVop3pxWrapPatch. VOP3PX2 wrapping for
/// V_WMMA_F32_16X16X128_F8F6F4 on GFX1250 A0 silicon.
///
/// On A0, an async trap fired between LD_SCALE and the WMMA half of a
/// VOP3PX2 pair is unrecoverable. The trap handler can rewind the PC for
/// known-paired forms (see ROCm/rocm-systems commit 74c647e6605, "rocr:
/// GFX12.5 : VOP3PX instruction split in trap handler"), but a standalone
/// v_wmma_f32_16x16x128_f8f6f4 (no preceding LD_SCALE) cannot be safely
/// rewound at trap time.
///
/// Workaround: prepend an inline-zero LD_SCALE prefix to every standalone
/// V_WMMA_F32_16X16X128_F8F6F4, turning it into a fused VOP3PX2 with
/// scale=1.0 (a no-op). The trap handler's rewind path then handles it.
///
/// The replacement is byte-level: an 8-byte LD_SCALE prefix is prepended
/// to the original 8-byte WMMA, leaving the WMMA portion bit-identical.
/// This avoids re-encoding modifier-rich operand layouts (matrix_a_fmt,
/// matrix_b_fmt, neg_lo, neg_hi, matrix_a_reuse, matrix_b_reuse, ...).
///
/// Two-pass operation:
/// 1. Decoded[] scan -- wraps user-written standalone WMMAs.
/// 2. Trampoline scan -- wraps splitter-emitted WMMAs sitting in
/// trampoline bytes (the K=128 32x16x128_f4 splitter emits f8f6f4
/// into trampolines).
///
//===----------------------------------------------------------------------===//

#include "comgr-hotswap-internal.h"

#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"

using namespace llvm;

namespace COMGR {
namespace hotswap {
namespace {

// LD_SCALE prefix encoding for inline-0 scale (= scale 1.0):
// DWORD 0: 0xCC35_0000 (ENCODING=0xCC3, SCLOP=0x5)
// DWORD 1: 0x0401_0080 = (0x080) | (0x080 << 9) | (0x100 << 18)
// SCL_SRC0[40:32] = 0x80 (= inline 0)
// SCL_SRC1[49:41] = 0x80 (= inline 0)
// Constant[58:50] = 0x100 (= VGPR0; the VOP3PX2 scale_src2 field is
// architecturally unused, but if left at 0 the SQ mis-decodes it as
// an SGPR reference and stalls the SALU for 3 cycles; setting it to
// a VGPR encoding eliminates the false dependency. Same workaround
// the in-place vop3px2-src2 patch applies to user-emitted VOP3PX2
// instructions; baking it into the wrap pass's prefix bytes keeps
// wrap-emitted trampolines stall-free at creation.)
constexpr uint8_t LdScalePrefix[8] = {
0x00, 0x00, 0x35, 0xCC, // DWORD 0
0x80, 0x00, 0x01, 0x04, // DWORD 1
};
constexpr size_t LdScalePrefixSize = sizeof(LdScalePrefix);
constexpr size_t WmmaInstSize = 8;

// 9 type combinations of f8f6f4 share the same printed mnemonic; the
// matrix_a_fmt / matrix_b_fmt modifiers distinguish them at the encoding
// level. We don't care about the variant distinction for wrapping -- the
// WMMA bytes are preserved verbatim and only the LD_SCALE prefix is
// prepended.
constexpr StringLiteral StandaloneWmma = "v_wmma_f32_16x16x128_f8f6f4";

// Already-wrapped detection: per ISA doc page 158, the SCALE prefix MUST
// be immediately preceding the WMMA -- no intervening instructions
// allowed. If the previous decoded instruction is a SCALE form, the WMMA
// is the trailing half of an existing VOP3PX2 and must NOT be wrapped.
constexpr StringLiteral AlreadyWrappedScale =
"v_wmma_scale_f32_16x16x128_f8f6f4";
constexpr StringLiteral AlreadyWrappedScale16 =
"v_wmma_scale16_f32_16x16x128_f8f6f4";

// Defensive: 32x16x128_f4 should be eliminated by the K=128 splitter
// before the wrap pass runs. V_WMMA_SCALE_F32_32X16X128_F4 doesn't exist
// on A0 and a leftover would cause the trap-handler rewind to misdecode
// garbage.
constexpr StringLiteral F4Mnemonic = "v_wmma_f32_32x16x128_f4";

// Byte-level WMMA detection (high two bytes of DWORD 0): 0xCC33.
constexpr uint8_t WmmaOpcodeByte2 = 0x33;
constexpr uint8_t WmmaOpcodeByte3 = 0xCC;
// LD_SCALE prefix detection (high two bytes of DWORD 0): 0xCC35.
constexpr uint8_t ScalePrefixByte2 = 0x35;
constexpr uint8_t ScalePrefixByte3 = 0xCC;

bool isWmmaBytes(const uint8_t *P) {
return P[2] == WmmaOpcodeByte2 && P[3] == WmmaOpcodeByte3;
}

bool isLdScaleBytes(const uint8_t *P) {
return P[2] == ScalePrefixByte2 && P[3] == ScalePrefixByte3;
}

// Per-instruction patches (e.g. the K=128 splitter) record their rewrites
// by appending a Trampoline whose `OriginalOffset` points at the source
// instruction. The actual text-byte overwrite (s_branch over the original)
// only happens later in fixupTrampolineBranches, so within the dispatch
// pipeline the canonical "was this offset patched?" signal is "appears as
// a Trampoline.OriginalOffset", NOT a text-byte check.
bool offsetIsPatched(const std::vector<Trampoline> &Trampolines,
uint64_t Offset) {
for (const Trampoline &T : Trampolines)
if (T.OriginalOffset == Offset)
return true;
return false;
}

// Build a trampoline carrying the wrapped form. Replacement is a fixed
// 16 bytes (8-byte LD_SCALE prefix + 8-byte WMMA copied verbatim from
// text). The branch-back goes at the tail; fixupTrampolineBranches
// re-encodes it once the final layout is known.
Trampoline buildWrappedTrampoline(const uint8_t *OriginalWmmaBytes,
uint64_t OriginalOffset,
uint32_t OriginalSize,
uint64_t TrampTextOffset,
const LLVMState &LS) {
Trampoline T;
T.OriginalOffset = OriginalOffset;
T.OriginalSize = OriginalSize;
T.Bytes.reserve(LdScalePrefixSize + WmmaInstSize + MinInstSize);
T.Bytes.insert(T.Bytes.end(), LdScalePrefix,
LdScalePrefix + LdScalePrefixSize);
T.Bytes.insert(T.Bytes.end(), OriginalWmmaBytes,
OriginalWmmaBytes + WmmaInstSize);

SmallVector<uint8_t> Branch =
LS.encodeSBranch(TrampTextOffset + T.Bytes.size(),
OriginalOffset + OriginalSize);
if (Branch.empty()) {
T.Bytes.clear();
return T;
}
T.Bytes.insert(T.Bytes.end(), Branch.begin(), Branch.end());
return T;
}

// Pass 1: wrap user-written standalone WMMAs found in Decoded[] whose
// bytes still match the WMMA encoding (i.e., not already replaced by
// another patch's s_branch).
uint32_t wrapDecodedInstructions(PatchContext &Ctx) {
uint32_t Patched = 0;
for (size_t I = 0, E = Ctx.Decoded.size(); I < E; ++I) {
const InternalDecodedInst &DI = Ctx.Decoded[I];
if (DI.Mnemonic != StandaloneWmma)
continue;
if (DI.Size != WmmaInstSize) {
log() << "hotswap: error: VOP3PX wrap: " << DI.Mnemonic
<< " at offset 0x" << utohexstr(DI.Offset)
<< " has unexpected size " << DI.Size << "\n";
continue;
}
if (DI.Offset + DI.Size > Ctx.TextSize)
continue;
if (offsetIsPatched(Ctx.OutTrampolines, DI.Offset))
continue; // Another patch already claimed this offset.
if (I > 0) {
const InternalDecodedInst &Prev = Ctx.Decoded[I - 1];
if (Prev.Mnemonic == AlreadyWrappedScale ||
Prev.Mnemonic == AlreadyWrappedScale16)
continue;
}

uint64_t TrampTextOffset = Ctx.TextSize;
for (const Trampoline &T : Ctx.OutTrampolines)
TrampTextOffset += T.Bytes.size();

Trampoline T = buildWrappedTrampoline(Ctx.Text + DI.Offset, DI.Offset,
DI.Size, TrampTextOffset, Ctx.LS);
if (T.Bytes.empty()) {
log() << "hotswap: error: VOP3PX wrap: trampoline encoding failed at 0x"
<< utohexstr(DI.Offset) << "\n";
continue;
}
Ctx.OutTrampolines.push_back(std::move(T));

log() << "hotswap: VOP3PX wrap: patched " << DI.Mnemonic << " at offset 0x"
<< utohexstr(DI.Offset) << "\n";
++Patched;
}
return Patched;
}

// Pass 2: scan trampoline bodies for splitter-emitted standalone WMMAs
// and prepend the LD_SCALE prefix in-place. Trampoline layout (per
// buildTrampoline / buildWrappedTrampoline):
// [replacement bytes ... ][branch-back 4 bytes]
// We only walk the body, not the branch-back placeholder. Each insert
// grows T.Bytes by LdScalePrefixSize; fixupTrampolineBranches re-encodes
// the branch-back later with the correct trampoline-end offset.
uint32_t wrapTrampolineInstructions(PatchContext &Ctx) {
uint32_t Patched = 0;
for (Trampoline &T : Ctx.OutTrampolines) {
if (T.Bytes.size() < MinInstSize)
continue;
size_t BodyEnd = T.Bytes.size() - MinInstSize;
size_t Pos = 0;
while (Pos + WmmaInstSize <= BodyEnd) {
const uint8_t *P = T.Bytes.data() + Pos;
if (!isWmmaBytes(P)) {
Pos += MinInstSize;
Comment thread
jammm marked this conversation as resolved.
continue;
}
if (Pos >= LdScalePrefixSize &&
isLdScaleBytes(T.Bytes.data() + Pos - LdScalePrefixSize)) {
Pos += WmmaInstSize;
continue;
}
T.Bytes.insert(T.Bytes.begin() + Pos, LdScalePrefix,
LdScalePrefix + LdScalePrefixSize);
BodyEnd += LdScalePrefixSize;
Pos += LdScalePrefixSize + WmmaInstSize;
++Patched;
log() << "hotswap: VOP3PX wrap: patched in-trampoline WMMA (orig at 0x"
<< utohexstr(T.OriginalOffset) << ")\n";
}
}
return Patched;
}

// Defensive: refuse to retarget if an unsupported 32x16x128_f4 leftover
// exists in Decoded[] -- the K=128 splitter should have eliminated all of
// these. A leftover would cause the trap-handler rewind to misdecode
// garbage, since V_WMMA_SCALE_F32_32X16X128_F4 doesn't exist on A0.
void checkNoF4Leftovers(PatchContext &Ctx) {
for (const InternalDecodedInst &DI : Ctx.Decoded) {
if (DI.Mnemonic != F4Mnemonic)
continue;
if (DI.Offset + DI.Size > Ctx.TextSize)
continue;
if (offsetIsPatched(Ctx.OutTrampolines, DI.Offset))
continue; // K=128 splitter handled it.
log() << "hotswap: error: VOP3PX wrap: unsplit " << F4Mnemonic
<< " at 0x" << utohexstr(DI.Offset)
<< " -- K=128 splitter must run before VOP3PX wrap\n";
}
}

uint32_t applyVop3pxWrapPatchImpl(PatchContext &Ctx) {
checkNoF4Leftovers(Ctx);
Comment thread
jammm marked this conversation as resolved.
Outdated
uint32_t Patched = wrapDecodedInstructions(Ctx);
Patched += wrapTrampolineInstructions(Ctx);
return Patched;
}

} // namespace

void registerVop3pxWrapPatch(HotswapPatchVTable &VT) {
VT.applyVop3pxWrapPatch = &applyVop3pxWrapPatchImpl;
}

} // namespace hotswap
} // namespace COMGR
Loading
Loading