diff --git a/amd/comgr/CMakeLists.txt b/amd/comgr/CMakeLists.txt
index b72fc5444b769..e42fa26aff3a7 100644
--- a/amd/comgr/CMakeLists.txt
+++ b/amd/comgr/CMakeLists.txt
@@ -108,6 +108,7 @@ set(SOURCES
   src/comgr-hotswap-llvm.cpp
   src/comgr-hotswap-patch-f32-to-e5m3.cpp
   src/comgr-hotswap-patch-inplace.cpp
+  src/comgr-hotswap-patch-vop3px-wrap.cpp
   src/comgr-hotswap-patch-vop3px2-src2.cpp
   src/comgr-hotswap-patch-wmma-hazard.cpp
   src/comgr-hotswap-patch-wmma-scale16.cpp
diff --git a/amd/comgr/src/comgr-hotswap-b0a0.cpp b/amd/comgr/src/comgr-hotswap-b0a0.cpp
index 38599efbc96e2..bf51eba5a7868 100644
--- a/amd/comgr/src/comgr-hotswap-b0a0.cpp
+++ b/amd/comgr/src/comgr-hotswap-b0a0.cpp
@@ -404,6 +404,10 @@ applyGfx1250B0toA0Rules(std::vector<InternalDecodedInst> &Decoded,
     Patched += VT.applyWmmaHazardPatch(Ctx);
   if (VT.applyVop3px2Src2Fix)
     Patched += VT.applyVop3px2Src2Fix(Ctx);
+  // Must run after per-instruction splitters so their trampoline bodies can be
+  // wrapped before trampoline branches are finalized.
+  if (VT.applyVop3pxWrapPatch)
+    Patched += VT.applyVop3pxWrapPatch(Ctx);
 
   for (const llvm::StringMapEntry<KernelPatchStats> &KV : KernelStats) {
     StringRef KName = KV.first();
diff --git a/amd/comgr/src/comgr-hotswap-internal.h b/amd/comgr/src/comgr-hotswap-internal.h
index 98666f42e07a9..067ed08efd693 100644
--- a/amd/comgr/src/comgr-hotswap-internal.h
+++ b/amd/comgr/src/comgr-hotswap-internal.h
@@ -601,6 +601,7 @@ struct HotswapPatchVTable {
   // loop completes.
   uint32_t (*applyWmmaHazardPatch)(PatchContext &) = nullptr;
   uint32_t (*applyVop3px2Src2Fix)(PatchContext &) = nullptr;
+  uint32_t (*applyVop3pxWrapPatch)(PatchContext &) = nullptr;
 };
 
 /// Walk comgr-hotswap-patches.def and bind every patch module's
diff --git a/amd/comgr/src/comgr-hotswap-patch-vop3px-wrap.cpp b/amd/comgr/src/comgr-hotswap-patch-vop3px-wrap.cpp
new file mode 100644
index 0000000000000..cb6cc8ac585d6
--- /dev/null
+++ b/amd/comgr/src/comgr-hotswap-patch-vop3px-wrap.cpp
@@ -0,0 +1,291 @@
+//===- comgr-hotswap-patch-vop3px-wrap.cpp - VOP3PX wrap patch ----------===//
+//
+// Part of Comgr, under the Apache License v2.0 with LLVM Exceptions. See
+// amd/comgr/LICENSE.TXT in this repository for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Strong-symbol override for applyVop3pxWrapPatch. VOP3PX2 wrapping for
+/// V_WMMA_F32_16X16X128_F8F6F4 on GFX1250 A0 silicon.
+///
+/// On A0, an async trap fired between LD_SCALE and the WMMA half of a
+/// VOP3PX2 pair is unrecoverable. The trap handler can rewind the PC for
+/// known-paired forms (see ROCm/rocm-systems commit 74c647e6605, "rocr:
+/// GFX12.5 : VOP3PX instruction split in trap handler"), but a standalone
+/// v_wmma_f32_16x16x128_f8f6f4 (no preceding LD_SCALE) cannot be safely
+/// rewound at trap time.
+///
+/// Workaround: prepend an inline-zero LD_SCALE prefix to every standalone
+/// V_WMMA_F32_16X16X128_F8F6F4, turning it into a fused VOP3PX2 with
+/// scale=1.0 (a no-op). The trap handler's rewind path then handles it.
+///
+/// The replacement is byte-level: an 8-byte LD_SCALE prefix is prepended
+/// to the original 8-byte WMMA, leaving the WMMA portion bit-identical.
+/// This avoids re-encoding modifier-rich operand layouts (matrix_a_fmt,
+/// matrix_b_fmt, neg_lo, neg_hi, matrix_a_reuse, matrix_b_reuse, ...).
+///
+/// Two-pass operation:
+///   1. Decoded[] scan -- wraps user-written standalone WMMAs.
+///   2. Trampoline scan -- wraps splitter-emitted WMMAs sitting in
+///      trampoline bytes (the K=128 32x16x128_f4 splitter emits f8f6f4
+///      into trampolines).
+///
+//===----------------------------------------------------------------------===//
+
+#include "comgr-hotswap-internal.h"
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+
+using namespace llvm;
+
+namespace COMGR {
+namespace hotswap {
+namespace {
+
+// LD_SCALE prefix for an inline-0 scale (= scale 1.0), stored below in
+// little-endian byte order (least-significant byte of each DWORD first):
+//   DWORD 0: 0xCC35_0000 (ENCODING=0xCC3, SCLOP=0x5)
+//   DWORD 1: 0x0401_0080 = (0x080) | (0x080 << 9) | (0x100 << 18)
+//     SCL_SRC0[40:32] = 0x80 (= inline 0)
+//     SCL_SRC1[49:41] = 0x80 (= inline 0)
+//     Constant[58:50] = 0x100 (= VGPR0). This is the VOP3PX2 scale_src2
+//     field: architecturally unused, but if left at 0 the SQ mis-decodes
+//     it as an SGPR reference and stalls the SALU for 3 cycles; a VGPR
+//     encoding removes the false dependency. The in-place vop3px2-src2
+//     patch applies the same workaround to user-emitted VOP3PX2; baking
+//     it into these bytes keeps wrap-emitted code stall-free at creation.
+constexpr uint8_t LdScalePrefix[8] = {
+    0x00, 0x00, 0x35, 0xCC, // DWORD 0
+    0x80, 0x00, 0x01, 0x04, // DWORD 1
+};
+constexpr size_t LdScalePrefixSize = sizeof(LdScalePrefix);
+constexpr size_t WmmaInstSize = 8; // Encoded size of the standalone WMMA.
+
+// Printed mnemonic of the standalone (unscaled) WMMA this pass wraps. All 9
+// f8f6f4 type combinations print the same mnemonic; matrix_a_fmt /
+// matrix_b_fmt distinguish them only at the encoding level. The variant is
+// irrelevant for wrapping -- the WMMA bytes are preserved verbatim and only
+// the LD_SCALE prefix is prepended.
+constexpr StringLiteral StandaloneWmma = "v_wmma_f32_16x16x128_f8f6f4";
+
+// Mnemonics of the SCALE-prefixed (VOP3PX2) forms, used for already-wrapped
+// detection. Per ISA doc page 158 the SCALE prefix MUST immediately precede
+// the WMMA with no intervening instruction, so a WMMA that is the trailing
+// half of such a pair must NOT be given a second prefix.
+constexpr StringLiteral AlreadyWrappedScale =
+    "v_wmma_scale_f32_16x16x128_f8f6f4";
+constexpr StringLiteral AlreadyWrappedScale16 =
+    "v_wmma_scale16_f32_16x16x128_f8f6f4";
+
+// Mnemonic checked by the defensive leftover scan: 32x16x128_f4 should be
+// eliminated by the K=128 splitter before the wrap pass runs.
+// V_WMMA_SCALE_F32_32X16X128_F4 doesn't exist on A0, so a leftover would
+// cause the trap-handler rewind to misdecode garbage.
+constexpr StringLiteral F4Mnemonic = "v_wmma_f32_32x16x128_f4";
+
+// Returns true when an already-recorded trampoline replaces the instruction
+// at `Offset`. Per-instruction patches (e.g. the K=128 splitter) register a
+// rewrite by appending a Trampoline whose OriginalOffset names the source
+// instruction; the s_branch overwriting the original text bytes is emitted
+// only later, in fixupTrampolineBranches. Inside the dispatch pipeline this
+// list -- NOT a text-byte check -- is the canonical "was it patched?" signal.
+bool offsetIsPatched(const std::vector<Trampoline> &Trampolines,
+                     uint64_t Offset) {
+  bool Claimed = false;
+  for (size_t Idx = 0, End = Trampolines.size(); Idx != End && !Claimed; ++Idx)
+    Claimed = Trampolines[Idx].OriginalOffset == Offset;
+  return Claimed;
+}
+
+// Assembles the trampoline for one wrapped WMMA: the 8-byte LD_SCALE prefix,
+// the 8 WMMA bytes copied verbatim from text, then a branch back past the
+// original instruction (re-encoded by fixupTrampolineBranches once the final
+// layout is known). Bytes is left empty if the branch cannot be encoded.
+Trampoline buildWrappedTrampoline(const uint8_t *OriginalWmmaBytes,
+                                  uint64_t OriginalOffset,
+                                  uint32_t OriginalSize,
+                                  uint64_t TrampTextOffset,
+                                  const LLVMState &LS) {
+  constexpr size_t WrappedSize = LdScalePrefixSize + WmmaInstSize;
+  const uint64_t ResumeOffset = OriginalOffset + OriginalSize;
+  Trampoline Result;
+  Result.OriginalOffset = OriginalOffset;
+  Result.OriginalSize = OriginalSize;
+  Result.Bytes.reserve(WrappedSize + MinInstSize);
+  Result.Bytes.insert(Result.Bytes.end(), LdScalePrefix,
+                      LdScalePrefix + LdScalePrefixSize);
+  Result.Bytes.insert(Result.Bytes.end(), OriginalWmmaBytes,
+                      OriginalWmmaBytes + WmmaInstSize);
+  SmallVector<uint8_t> Branch =
+      LS.encodeSBranch(TrampTextOffset + Result.Bytes.size(), ResumeOffset);
+  if (Branch.empty())
+    Result.Bytes.clear(); // Report failure through an empty trampoline.
+  else
+    Result.Bytes.insert(Result.Bytes.end(), Branch.begin(), Branch.end());
+  return Result;
+}
+
+// Prefer the printer's mnemonic (trailing whitespace stripped); fall back to
+// the MCInstrInfo opcode name when no printer or no mnemonic is available.
+std::string getMnemonic(const LLVMState &LS, const MCInst &Inst) {
+  const char *Printed = LS.MCIP ? LS.MCIP->getMnemonic(Inst).first : nullptr;
+  if (!Printed)
+    return LS.MCII->getName(Inst.getOpcode()).str();
+  return StringRef(Printed).rtrim().str();
+}
+
+// Decodes the single instruction at byte `Pos` of the trampoline `Body` into
+// `DI` (Offset is body-relative). Returns false when `Pos` is out of range,
+// decoding fails, or the instruction would extend past the end of `Body`.
+bool decodeTrampolineInstruction(const LLVMState &LS, ArrayRef<uint8_t> Body,
+                                 size_t Pos, InternalDecodedInst &DI) {
+  const size_t BodySize = Body.size();
+  if (Pos >= BodySize)
+    return false;
+  uint64_t Len = 0;
+  const MCDisassembler::DecodeStatus Result = LS.MCD->getInstruction(
+      DI.Inst, Len, Body.drop_front(Pos), Pos, nulls());
+  if (Result == MCDisassembler::Fail || Len == 0 || Pos + Len > BodySize)
+    return false;
+  DI.Mnemonic = getMnemonic(LS, DI.Inst);
+  DI.Size = static_cast<uint32_t>(Len);
+  DI.Offset = Pos;
+  return true;
+}
+
+// Pass 1: wrap user-written standalone WMMAs found in Decoded[] that no
+// earlier patch has claimed (see offsetIsPatched; text bytes are untouched
+// until fixupTrampolineBranches). Returns the number of WMMAs wrapped.
+uint32_t wrapDecodedInstructions(PatchContext &Ctx) {
+  uint32_t Patched = 0;
+  // Text offset where the next trampoline lands, kept as a running total.
+  uint64_t TrampTextOffset = Ctx.TextSize;
+  for (const Trampoline &T : Ctx.OutTrampolines)
+    TrampTextOffset += T.Bytes.size();
+  for (size_t I = 0, E = Ctx.Decoded.size(); I < E; ++I) {
+    const InternalDecodedInst &DI = Ctx.Decoded[I];
+    if (DI.Mnemonic != StandaloneWmma)
+      continue;
+    if (DI.Size != WmmaInstSize) {
+      log() << "hotswap: error: VOP3PX wrap: " << DI.Mnemonic << " at offset 0x"
+            << utohexstr(DI.Offset) << " has unexpected size " << DI.Size
+            << "\n";
+      continue;
+    }
+    if (DI.Offset + DI.Size > Ctx.TextSize ||
+        offsetIsPatched(Ctx.OutTrampolines, DI.Offset))
+      continue; // Out of bounds, or another patch already claimed it.
+    // Skip only the trailing half of a split pair: a bare 8-byte SCALE prefix
+    // ending exactly here. A fused 16-byte decode already owns its WMMA.
+    const InternalDecodedInst *Prev = I ? &Ctx.Decoded[I - 1] : nullptr;
+    if (Prev && Prev->Size == LdScalePrefixSize &&
+        Prev->Offset + Prev->Size == DI.Offset &&
+        (Prev->Mnemonic == AlreadyWrappedScale ||
+         Prev->Mnemonic == AlreadyWrappedScale16))
+      continue;
+    Trampoline T = buildWrappedTrampoline(Ctx.Text + DI.Offset, DI.Offset,
+                                          DI.Size, TrampTextOffset, Ctx.LS);
+    if (T.Bytes.empty()) {
+      log() << "hotswap: error: VOP3PX wrap: trampoline encoding failed at 0x"
+            << utohexstr(DI.Offset) << "\n";
+      continue;
+    }
+    TrampTextOffset += T.Bytes.size();
+    Ctx.OutTrampolines.push_back(std::move(T));
+    log() << "hotswap: VOP3PX wrap: patched " << DI.Mnemonic << " at offset 0x"
+          << utohexstr(DI.Offset) << "\n";
+    ++Patched;
+  }
+  return Patched;
+}
+
+// Pass 2: walk every trampoline body (layout per buildTrampoline /
+// buildWrappedTrampoline: [replacement bytes ...][branch-back]) and prepend
+// the LD_SCALE prefix in place to each standalone WMMA found there, e.g.
+// the halves emitted by the K=128 splitter. The trailing MinInstSize-byte
+// branch-back is never decoded; fixupTrampolineBranches re-encodes it later
+// for the grown trampoline. Undecodable bytes are logged and skipped
+// MinInstSize at a time. Returns the number of WMMAs wrapped.
+uint32_t wrapTrampolineInstructions(PatchContext &Ctx) {
+  uint32_t Patched = 0;
+  for (Trampoline &T : Ctx.OutTrampolines) {
+    if (T.Bytes.size() < MinInstSize)
+      continue; // Too short to hold even the branch-back.
+    size_t BodyEnd = T.Bytes.size() - MinInstSize;
+    size_t Pos = 0;
+    while (Pos < BodyEnd) {
+      InternalDecodedInst DI;
+      ArrayRef<uint8_t> Body(T.Bytes.data(), BodyEnd); // Re-viewed each step.
+      if (!decodeTrampolineInstruction(Ctx.LS, Body, Pos, DI)) {
+        log() << "hotswap: error: VOP3PX wrap: could not decode "
+              << "trampoline body at offset 0x" << utohexstr(Pos)
+              << " for original offset 0x" << utohexstr(T.OriginalOffset)
+              << "\n";
+        Pos += MinInstSize;
+        continue;
+      }
+      if (DI.Mnemonic != StandaloneWmma) {
+        Pos += DI.Size;
+        continue;
+      }
+      if (DI.Size != WmmaInstSize) {
+        log() << "hotswap: error: VOP3PX wrap: " << DI.Mnemonic
+              << " in trampoline for original offset 0x"
+              << utohexstr(T.OriginalOffset) << " has unexpected size "
+              << DI.Size << "\n";
+        Pos += DI.Size;
+        continue;
+      }
+      T.Bytes.insert(T.Bytes.begin() + Pos, LdScalePrefix,
+                     LdScalePrefix + LdScalePrefixSize);
+      BodyEnd += LdScalePrefixSize; // Body grew by the inserted prefix.
+      Pos += LdScalePrefixSize + WmmaInstSize; // Step past prefix + WMMA.
+      ++Patched;
+      log() << "hotswap: VOP3PX wrap: patched in-trampoline WMMA (orig at 0x"
+            << utohexstr(T.OriginalOffset) << ")\n";
+    }
+  }
+  return Patched;
+}
+
+// Defensive scan for 32x16x128_f4 instructions the K=128 splitter failed to
+// claim. Logs each one and returns true if ANY leftover exists (note: true
+// means "found", despite the name); the caller then refuses to wrap, since
+// V_WMMA_SCALE_F32_32X16X128_F4 doesn't exist on A0 and the trap-handler
+// rewind would misdecode garbage.
+bool checkNoF4Leftovers(PatchContext &Ctx) {
+  unsigned Leftovers = 0;
+  for (const InternalDecodedInst &DI : Ctx.Decoded) {
+    const bool IsUnsplitF4 = DI.Mnemonic == F4Mnemonic &&
+                             DI.Offset + DI.Size <= Ctx.TextSize &&
+                             !offsetIsPatched(Ctx.OutTrampolines, DI.Offset);
+    if (!IsUnsplitF4)
+      continue; // Not f4, out of bounds, or already handled by the splitter.
+    log() << "hotswap: error: VOP3PX wrap: unsplit " << F4Mnemonic << " at 0x"
+          << utohexstr(DI.Offset)
+          << " -- K=128 splitter must run before VOP3PX wrap\n";
+    ++Leftovers;
+  }
+  return Leftovers != 0;
+}
+
+// Entry point: bail out if unsplit f4 WMMAs remain, else run pass 1 then 2.
+uint32_t applyVop3pxWrapPatchImpl(PatchContext &Ctx) {
+  if (checkNoF4Leftovers(Ctx))
+    return 0; // Leftovers were logged; wrapping would be unsafe.
+  const uint32_t FromText = wrapDecodedInstructions(Ctx);
+  return FromText + wrapTrampolineInstructions(Ctx);
+}
+
+} // namespace
+
+void registerVop3pxWrapPatch(HotswapPatchVTable &VT) {
+  VT.applyVop3pxWrapPatch = &applyVop3pxWrapPatchImpl; // Bind the vtable slot.
+}
+
+} // namespace hotswap
+} // namespace COMGR
diff --git a/amd/comgr/src/comgr-hotswap-patches.def b/amd/comgr/src/comgr-hotswap-patches.def
index f2fea35988888..b355a7b64c678 100644
--- a/amd/comgr/src/comgr-hotswap-patches.def
+++ b/amd/comgr/src/comgr-hotswap-patches.def
@@ -31,6 +31,7 @@ HOTSWAP_PATCH(InPlace)
 HOTSWAP_PATCH(Scratch)
 HOTSWAP_PATCH(Trampoline)
 HOTSWAP_PATCH(Vop3px2Src2)
+HOTSWAP_PATCH(Vop3pxWrap)
 HOTSWAP_PATCH(WmmaHazard)
 HOTSWAP_PATCH(WmmaScale16)
 HOTSWAP_PATCH(WmmaSplit)
diff --git a/amd/comgr/test-lit/hotswap-vop3px-wrap.s b/amd/comgr/test-lit/hotswap-vop3px-wrap.s
new file mode 100644
index 0000000000000..fd76ba7f417c5
--- /dev/null
+++ b/amd/comgr/test-lit/hotswap-vop3px-wrap.s
@@ -0,0 +1,127 @@
+// Test VOP3PX2 wrap patches for GFX1250 B0-to-A0 hotswap.
+//
+// On A0 silicon, an async trap fired between LD_SCALE and the WMMA half
+// of a VOP3PX2 pair is unrecoverable.  The trap handler rewinds the PC
+// for known-paired forms (ROCm/rocm-systems commit 74c647e6605); this
+// pass ensures every standalone V_WMMA_F32_16X16X128_F8F6F4 (encoding
+// 0xCC33) is paired with an inline-zero LD_SCALE prefix (effectively
+// scale=1.0, a no-op) so the rewind path always has a pair to walk
+// back to.
+//
+// The wrap is byte-level: an 8-byte LD_SCALE prefix is prepended to the
+// original 8-byte WMMA, leaving the WMMA portion bit-identical.  In
+// disassembly the result reads as a single fused
+// `v_wmma_scale_f32_16x16x128_f8f6f4` instruction with `0, 0` for the
+// two scale operands.
+//
+// Per amd/comgr/AGENT_CONVENTIONS.md, LIT inputs are compiled with
+// %clang directly (not through Comgr actions), and llvm-objdump /
+// FileCheck go through their lit substitutions.
+
+// RUN: %clang -target amdgcn-amd-amdhsa -mcpu=gfx1250 -nostdlib %s -o %t.elf
+// RUN: hotswap-rewrite %t.elf \
+// RUN:   amdgcn-amd-amdhsa--gfx1250 amdgcn-amd-amdhsa--gfx1250 \
+// RUN:   --output %t.out.elf \
+// RUN:   | %FileCheck --check-prefix=API %s
+// API: RESULT: SUCCESS
+// RUN: %llvm-objdump -d %t.out.elf | %FileCheck %s
+
+.amdgcn_target "amdgcn-amd-amdhsa--gfx1250"
+
+// ── Test 1: bare standalone WMMA gets wrapped ───────────────────────────────
+//
+// The original WMMA must be replaced by an s_branch into the trampoline,
+// where the wrapped form appears as a SCALE-prefixed VOP3PX2.
+// (No end-of-line anchor: objdump appends `// <addr>: <bytes>` to each line.)
+// CHECK-LABEL: <test_standalone_f8f6f4>:
+// CHECK-NOT:   v_wmma_f32_16x16x128_f8f6f4
+// CHECK:       s_branch
+.globl test_standalone_f8f6f4
+.p2align 8
+.type test_standalone_f8f6f4,@function
+test_standalone_f8f6f4:
+  v_wmma_f32_16x16x128_f8f6f4 v[16:23], v[0:15], v[8:23], v[16:23]
+  s_endpgm
+.size test_standalone_f8f6f4, .-test_standalone_f8f6f4
+
+// ── Test 2: standalone with explicit FP8/FP8 modifiers ──────────────────────
+//
+// Modifiers (matrix_a_fmt, matrix_b_fmt) must be preserved verbatim in
+// the wrapped form because the WMMA bytes are copied unchanged.
+// (Match the bare mnemonic only: default-valued modifiers may not print.)
+// CHECK-LABEL: <test_standalone_fp8_fp8>:
+// CHECK-NOT:   v_wmma_f32_16x16x128_f8f6f4
+// CHECK:       s_branch
+.globl test_standalone_fp8_fp8
+.p2align 8
+.type test_standalone_fp8_fp8,@function
+test_standalone_fp8_fp8:
+  v_wmma_f32_16x16x128_f8f6f4 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_fmt:MATRIX_FMT_FP8 matrix_b_fmt:MATRIX_FMT_FP8
+  s_endpgm
+.size test_standalone_fp8_fp8, .-test_standalone_fp8_fp8
+
+// ── Test 3: standalone with FP6/FP4 mixed modifiers ─────────────────────────
+//
+// Verifies that all 9 type combinations are handled — the wrap is
+// modifier-agnostic; only the WMMA's byte-level opcode (0xCC33) matters.
+//
+// CHECK-LABEL: <test_standalone_fp6_fp4>:
+// CHECK-NOT:   v_wmma_f32_16x16x128_f8f6f4 v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] matrix_a_fmt
+// CHECK:       s_branch
+.globl test_standalone_fp6_fp4
+.p2align 8
+.type test_standalone_fp6_fp4,@function
+test_standalone_fp6_fp4:
+  v_wmma_f32_16x16x128_f8f6f4 v[16:23], v[0:11], v[8:15], v[16:23] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4
+  s_endpgm
+.size test_standalone_fp6_fp4, .-test_standalone_fp6_fp4
+
+// ── Test 4: already-wrapped (SCALE prefix present) is left alone ─────────────
+//
+// A v_wmma_scale_f32_16x16x128_f8f6f4 is already a fused VOP3PX2.  The
+// wrap pass MUST NOT add a second LD_SCALE prefix in front of it (that
+// would corrupt the encoding).  We assert that no s_branch is installed
+// over the user's already-wrapped form.
+//
+// CHECK-LABEL: <test_already_wrapped>:
+// CHECK:       v_wmma_scale_f32_16x16x128_f8f6f4
+// CHECK-NOT:   s_branch
+// CHECK:       s_endpgm
+.globl test_already_wrapped
+.p2align 8
+.type test_already_wrapped,@function
+test_already_wrapped:
+  v_wmma_scale_f32_16x16x128_f8f6f4 v[16:23], v[0:15], v[8:23], v[16:23], 0, 0
+  s_endpgm
+.size test_already_wrapped, .-test_already_wrapped
+
+// ── Test 5: K=128 splitter interaction ──────────────────────────────────────
+//
+// `v_wmma_f32_32x16x128_f4` is split by the K=128 splitter into two
+// f8f6f4 WMMAs that land in a trampoline.  The wrap pass's pass-2
+// trampoline scan must wrap BOTH of them.  Disassembled trampoline
+// region should contain `v_wmma_scale_f32_16x16x128_f8f6f4` (the wrapped
+// form) and NO bare `v_wmma_f32_16x16x128_f8f6f4` (the unwrapped form).
+//
+// CHECK-LABEL: <test_f4_split_then_wrap>:
+// CHECK-NOT:   v_wmma_f32_32x16x128_f4
+// CHECK:       s_branch
+.globl test_f4_split_then_wrap
+.p2align 8
+.type test_f4_split_then_wrap,@function
+test_f4_split_then_wrap:
+  v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], 0
+  s_endpgm
+.size test_f4_split_then_wrap, .-test_f4_split_then_wrap
+
+// ── Trampoline region asserts ───────────────────────────────────────────────
+//
+// At least one wrapped form (LD_SCALE + WMMA) must appear in the
+// trampoline tail.  We use CHECK-DAG since trampolines are emitted in
+// patch order and the lit harness's textual order may differ.  We also
+// assert that no bare standalone WMMA leaks through the splitter+wrap
+// pipeline — every f8f6f4 in the rewritten ELF must be paired with a
+// SCALE prefix (i.e., printed as `v_wmma_scale_*`).
+// (No end-of-line anchor: objdump appends `// <addr>: <bytes>` to each line.)
+// CHECK-DAG: v_wmma_scale_f32_16x16x128_f8f6f4
+// CHECK-NOT: v_wmma_f32_16x16x128_f8f6f4
diff --git a/amd/comgr/test-lit/hotswap-vop3px2-src2-noop.s b/amd/comgr/test-lit/hotswap-vop3px2-src2-noop.s
index 44308d1f30cd4..aa19fa05e467d 100644
--- a/amd/comgr/test-lit/hotswap-vop3px2-src2-noop.s
+++ b/amd/comgr/test-lit/hotswap-vop3px2-src2-noop.s
@@ -1,7 +1,16 @@
 // COM: Passthrough test for the VOP3PX2 scale_src2 bit-field fix. A kernel
-// COM: with no V_WMMA_SCALE* instructions must be left structurally
-// COM: unchanged: no bits are modified, and the disassembly must match the
-// COM: original layout.
+// COM: with no V_WMMA_SCALE* instructions must not have the in-place
+// COM: scale_src2 patch fire on it.
+// COM:
+// COM: Since the VOP3PX2 wrap pass landed (comgr-hotswap-patch-vop3px-
+// COM: wrap.cpp), the bare f8f6f4 in this kernel now gets wrapped into a
+// COM: VOP3PX2 in a trampoline; the resulting v_wmma_scale_* carries
+// COM: scale_src2 = VGPR0 (0x100) baked into the wrap pass's prefix bytes
+// COM: (the same SALU-stall workaround the in-place vop3px2-src2 fix
+// COM: applies to user-emitted VOP3PX2). So the in-place vop3px2-src2 fix
+// COM: still has nothing to do here -- it only fires on user-emitted
+// COM: v_wmma_scale_* instructions in the kernel body, not on
+// COM: wrap-emitted ones in trampolines.
 
 // RUN: %clang --target=amdgcn-amd-amdhsa -mcpu=gfx1250 -nostdlib %s -o %t.elf
 
@@ -11,13 +20,15 @@
 // RUN:   | %FileCheck --check-prefix=API %s
 // API: RESULT: SUCCESS
 
-// COM: No V_WMMA_SCALE instructions, so the patch must not fire.
-// COM: Verify the disassembly layout is preserved and that v_wmma_scale
-// COM: does not appear (DISASM-NOT scope: between v_wmma_f32 and s_endpgm).
+// COM: Kernel body: bare f8f6f4 becomes an s_branch into the wrap trampoline.
+// COM: Trampoline body: v_wmma_scale_* (the wrap-produced fused VOP3PX2);
+// COM: this is from the VOP3PX2 wrap pass, not from the in-place
+// COM: scale_src2 patch.
 // RUN: %llvm-objdump -d %t.out.elf | %FileCheck --check-prefix=DISASM %s
-// DISASM: v_wmma_f32_16x16x128_f8f6f4
-// DISASM-NOT: v_wmma_scale
-// DISASM: s_endpgm
+// DISASM-LABEL: <test_vop3px2_noop>:
+// DISASM:       s_branch
+// DISASM:       s_endpgm
+// DISASM:       v_wmma_scale_f32_16x16x128_f8f6f4{{.*}}, 0, 0{{.*}}matrix_a_fmt:MATRIX_FMT_BF8{{.*}}matrix_b_fmt:MATRIX_FMT_FP6
 
 // COM: Idempotency: second rewrite must produce identical bytes.
 // RUN: hotswap-rewrite %t.out.elf \
diff --git a/amd/comgr/test-lit/hotswap-wmma-split-msplit-fp-imm.s b/amd/comgr/test-lit/hotswap-wmma-split-msplit-fp-imm.s
index 5062feeae1689..b884fa25e4573 100644
--- a/amd/comgr/test-lit/hotswap-wmma-split-msplit-fp-imm.s
+++ b/amd/comgr/test-lit/hotswap-wmma-split-msplit-fp-imm.s
@@ -26,8 +26,12 @@
 // DISASM:       s_branch
 // DISASM:       s_endpgm
 
-// DISASM:       v_wmma_f32_16x16x128_f8f6f4 v[64:71], v[0:7], v[2:9], 1.0 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4
-// DISASM-NEXT:  v_wmma_f32_16x16x128_f8f6f4 v[72:79], v[8:15], v[2:9], 1.0 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4
+// COM: After the K=128 splitter emits two f8f6f4 halves into a trampoline,
+// COM: the VOP3PX2 wrap pass prepends an inline-0 LD_SCALE prefix to each.
+// COM: The fused VOP3PX2 disassembles as v_wmma_scale_f32_16x16x128_f8f6f4
+// COM: with `, 0, 0` for the two scale operands (= scale 1.0, no-op).
+// DISASM:       v_wmma_scale_f32_16x16x128_f8f6f4 v[64:71], v[0:7], v[2:9], 1.0, 0, 0 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4
+// DISASM-NEXT:  v_wmma_scale_f32_16x16x128_f8f6f4 v[72:79], v[8:15], v[2:9], 1.0, 0, 0 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4
 // DISASM-NEXT:  s_branch
 .globl kernel
 .p2align 8
diff --git a/amd/comgr/test-lit/hotswap-wmma-split-msplit-neg-lo.s b/amd/comgr/test-lit/hotswap-wmma-split-msplit-neg-lo.s
index 0d8b8b8ab5d84..3f154883b4cb2 100644
--- a/amd/comgr/test-lit/hotswap-wmma-split-msplit-neg-lo.s
+++ b/amd/comgr/test-lit/hotswap-wmma-split-msplit-neg-lo.s
@@ -22,8 +22,12 @@
 // DISASM:       s_branch
 // DISASM:       s_endpgm
 
-// DISASM:       v_wmma_f32_16x16x128_f8f6f4 v[80:87], v[0:7], v[2:9], v[80:87] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 neg_lo:[0,0,1]
-// DISASM-NEXT:  v_wmma_f32_16x16x128_f8f6f4 v[88:95], v[8:15], v[2:9], v[88:95] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 neg_lo:[0,0,1]
+// COM: After the K=128 splitter emits two f8f6f4 halves into a trampoline,
+// COM: the VOP3PX2 wrap pass prepends an inline-0 LD_SCALE prefix, so the
+// COM: f8f6f4 disassembles as the fused v_wmma_scale form with `, 0, 0`
+// COM: scale operands. neg_lo:[0,0,1] is preserved on both halves.
+// DISASM:       v_wmma_scale_f32_16x16x128_f8f6f4 v[80:87], v[0:7], v[2:9], v[80:87], 0, 0 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 neg_lo:[0,0,1]
+// DISASM-NEXT:  v_wmma_scale_f32_16x16x128_f8f6f4 v[88:95], v[8:15], v[2:9], v[88:95], 0, 0 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 neg_lo:[0,0,1]
 // DISASM-NEXT:  s_branch
 .globl kernel
 .p2align 8
diff --git a/amd/comgr/test-lit/hotswap-wmma-split.s b/amd/comgr/test-lit/hotswap-wmma-split.s
index 1f5dfb3989e27..ff38f925f924c 100644
--- a/amd/comgr/test-lit/hotswap-wmma-split.s
+++ b/amd/comgr/test-lit/hotswap-wmma-split.s
@@ -222,8 +222,14 @@ test_no_split_required:
 // COM: The replacement opcode is v_wmma_f32_16x16x128_f8f6f4 with both
 // COM: matrix-format modifiers literally MATRIX_FMT_FP4 so the f8f6f4
 // COM: form interprets the data as f4 (matching the original opcode).
-// DISASM-DAG: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:7], v[16:23], v[32:39]{{.*}}matrix_a_fmt:MATRIX_FMT_FP4{{.*}}matrix_b_fmt:MATRIX_FMT_FP4
-// DISASM-DAG: v_wmma_f32_16x16x128_f8f6f4 v[40:47], v[8:15], v[16:23], v[40:47]{{.*}}matrix_a_fmt:MATRIX_FMT_FP4{{.*}}matrix_b_fmt:MATRIX_FMT_FP4
+// COM: After the VOP3PX2 wrap pass runs, each f8f6f4 in the trampoline gets
+// COM: an inline-0 LD_SCALE prefix prepended; the disassembler greedily
+// COM: matches the 16-byte fused VOP3PX2, so the disassembly shows
+// COM: `v_wmma_scale_f32_16x16x128_f8f6f4 ... , 0, 0 ...` instead of the
+// COM: bare f8f6f4 mnemonic. The `, 0, 0` are the two scale operands
+// COM: (= scale 1.0, no-op).
+// DISASM-DAG: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:7], v[16:23], v[32:39], 0, 0{{.*}}matrix_a_fmt:MATRIX_FMT_FP4{{.*}}matrix_b_fmt:MATRIX_FMT_FP4
+// DISASM-DAG: v_wmma_scale_f32_16x16x128_f8f6f4 v[40:47], v[8:15], v[16:23], v[40:47], 0, 0{{.*}}matrix_a_fmt:MATRIX_FMT_FP4{{.*}}matrix_b_fmt:MATRIX_FMT_FP4
 
 // Idempotency: rewriting the patched output again should produce identical
 // bytes (the splitter only fires on K=128 mnemonics, which no longer exist