Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 39 additions & 30 deletions llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1647,6 +1647,13 @@ class ControlFlowRewriter {

Register PrimarySuccessorExec;

// EXEC mask saved by the S_AND_SAVEEXEC(_TERM) instruction; used to rejoin at
// the secondary block.
Register SavedExec;

// Instruction that defines SavedExec.
MachineInstr *SavedExecMI = nullptr;

// Opcode for branches with implicit or opaque conditions:
// S_CBRANCH_EXECZ/NZ S_CBRANCH_VCCZ/NZ S_CBRANCH_SCC0/1
// -- all active threads branch uniformly.
Expand Down Expand Up @@ -2276,14 +2283,11 @@ void ControlFlowRewriter::rewrite() {

MachineBasicBlock::iterator MBBIOriginNodeEnd = OriginNode->Block->end();

// FIXME: Find a way to avoid adding MovTermOpc, instead add MovOpc. This
// Term operator being the first terminator, acts as an anchor point for
// finding the right insertion point in other parts of the Wave Transform.
// Since accumulator reset instructions may be added after this
// instruction, this move operation cannot be a terminator.
BuildMI(*OriginNode->Block, MBBIOriginNodeEnd, {},
TII.get(LMC.MovTermOpc), LMC.ExecReg)
.addReg(OriginCFGNodeInfo.PrimarySuccessorExec);
OriginCFGNodeInfo.SavedExec = LMU.createLaneMaskReg();
OriginCFGNodeInfo.SavedExecMI =
BuildMI(*OriginNode->Block, MBBIOriginNodeEnd, {},
TII.get(LMC.AndSaveExecTermOpc), OriginCFGNodeInfo.SavedExec)
.addReg(OriginCFGNodeInfo.PrimarySuccessorExec);
BuildMI(*OriginNode->Block, MBBIOriginNodeEnd, {},
TII.get(AMDGPU::SI_WAVE_CF_EDGE));
BuildMI(*OriginNode->Block, MBBIOriginNodeEnd, {},
Expand Down Expand Up @@ -2334,32 +2338,27 @@ void ControlFlowRewriter::rewrite() {
continue;

CFGNodeInfo &PredInfo = NodeInfo.find(Pred)->second;
Register PrimaryExec = PredInfo.PrimarySuccessorExec;

Register Rejoin;
if (!Rejoin) {
// Try to find a previously generated XOR (or merely masked) value
// for reuse.
auto MapIt = RegMap.find(std::make_pair(Pred->Block, PrimaryExec));
if (MapIt != RegMap.end()) {
Rejoin = MapIt->second.second;
if (!Rejoin)
PrimaryExec = MapIt->second.first;
}
}

if (!Rejoin) {
Rejoin = LMU.createLaneMaskReg();
BuildMI(*Pred->Block, Pred->Block->getFirstTerminator(), {},
TII.get(LMC.XorOpc), Rejoin)
.addReg(LMC.ExecReg)
.addReg(PrimaryExec);
}
// The rejoin contribution is the full EXEC saved by the
// S_AND_SAVEEXEC emitted at this OriginBranch in Step 2.2, bookkept on
// the pred's CFGNodeInfo.
Register Rejoin = PredInfo.SavedExec;

if (HasSingleDivergentPred)
if (HasSingleDivergentPred) {
DirectRejoin = Rejoin;
else
} else {
// The _term form of S_AND_SAVEEXEC is required while building the
// primary exec mask: it lets the updater machinery insert instructions
// at the terminator. For the secondary (rejoin) exec mask, however, the
// inserted instructions must land *after* the S_AND_SAVEEXEC since they
// consume its def. Demote the pred's terminator to its non-terminator
// form in place so getFirstTerminator() points past it, and subsequent
// rejoin-mask building iterations insert at the correct place.
if (PredInfo.SavedExecMI &&
PredInfo.SavedExecMI->getOpcode() == LMC.AndSaveExecTermOpc)
PredInfo.SavedExecMI->setDesc(TII.get(LMC.AndSaveExecOpc));
Updater.addAvailable(*Pred->Block, Rejoin);
}
}

Register RejoinMask =
Expand Down Expand Up @@ -2675,6 +2674,16 @@ class ForwardPropSimplifier {
const unsigned Opc = MI->getOpcode();
const Register Dst = MI->getOperand(0).getReg();

// SavedExec = S_AND_SAVEEXEC Prim
// Dst (SavedExec) = old EXEC; EXEC = EXEC & Prim; def SCC.
// Both the terminator and non-terminator forms appear: the non-terminator
// form for OriginBranches consumed by a rejoin accumulator (demoted in
// Step 3) and the terminator form otherwise.
if (Opc == LMC.AndSaveExecOpc || Opc == LMC.AndSaveExecTermOpc) {
Cur[Dst] = RegIntVariant{Dst};
continue;
}

// ACC = MOV 0
if (AccRegs.count(Dst) && Opc == LMC.MovOpc) {
const MachineOperand &Imm = MI->getOperand(1);
Expand Down
21 changes: 15 additions & 6 deletions llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,16 @@ void GCNLaneMaskUtils::buildMergeLaneMasks(MachineBasicBlock &MBB,
if (!PrevConstant) {
PrevMaskedReg = PrevReg;
}
// Do not mask CurReg if CurReg is defined by S_AND_SAVEEXEC(_TERM).
// A contribution from this opcode implies we are building the rejoin merge at
// the secondary block, and the contribution should be used as is, without
// EXEC AND masking.
if (!CurConstant) {
if ((PrevConstant && PrevVal) ||
const MachineInstr *CurDef = MF.getRegInfo().getUniqueVRegDef(CurReg);
bool IsSaveExecDef =
CurDef && (CurDef->getOpcode() == LMC.AndSaveExecOpc ||
CurDef->getOpcode() == LMC.AndSaveExecTermOpc);
if ((PrevConstant && PrevVal) || IsSaveExecDef ||
(LMA && LMA->isSubsetOfExec(CurReg, MBB, I))) {
CurMaskedReg = CurReg;
} else {
Expand Down Expand Up @@ -472,13 +480,14 @@ void GCNLaneMaskUpdater::insertAccumulatorResets() {

// TODO: We only need to compute EndInsertPt if any of B's AccFlagPairs has
// ResetAtEnd.
const AMDGPU::LaneMaskConstants &LMConsts = LMU.getLaneMaskConsts();
MachineBasicBlock::iterator EndInsertPt;
EndInsertPt = B->getFirstTerminator();
if (EndInsertPt != B->end() && EndInsertPt->getOpcode() == LMU.getLaneMaskConsts().MovTermOpc &&
EndInsertPt->getOperand(0).getReg() ==
LMU.getLaneMaskConsts().ExecReg) {
EndInsertPt->setDesc(TII->get(LMU.getLaneMaskConsts().MovOpc));
EndInsertPt++;
if (EndInsertPt != B->end()) {
if (EndInsertPt->getOpcode() == LMConsts.AndSaveExecTermOpc) {
EndInsertPt->setDesc(TII->get(LMConsts.AndSaveExecOpc));
++EndInsertPt;
}
}

for (auto &[Acc, Flags] : AccFlagPairs) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX90A-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[SI_PS_LIVE]], implicit $exec
; GFX90A-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
; GFX90A-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 killed [[V_CNDMASK_B32_e64_]], killed [[S_MOV_B32_]], implicit $exec
; GFX90A-NEXT: SI_BRCOND %bb.4, killed [[V_CMP_NE_U32_e64_]]
; GFX90A-NEXT: SI_BRCOND %bb.4, killed [[V_CMP_NE_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec
; GFX90A-NEXT: S_BRANCH %bb.1
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.1 (%ir-block.2):
Expand Down Expand Up @@ -100,7 +100,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX90A-NEXT: [[DEF2:%[0-9]+]]:av_32 = IMPLICIT_DEF
; GFX90A-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, [[V_CMP_EQ_U32_e64_]], implicit $exec
; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
; GFX90A-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_NE_U32_e64_1]]
; GFX90A-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_NE_U32_e64_1]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec
; GFX90A-NEXT: S_BRANCH %bb.2
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.2 (%ir-block.26):
Expand Down Expand Up @@ -144,7 +144,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX1200-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[SI_PS_LIVE]], implicit $exec
; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
; GFX1200-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 killed [[V_CNDMASK_B32_e64_]], killed [[S_MOV_B32_]], implicit $exec
; GFX1200-NEXT: SI_BRCOND %bb.4, killed [[V_CMP_NE_U32_e64_]]
; GFX1200-NEXT: SI_BRCOND %bb.4, killed [[V_CMP_NE_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc_lo, implicit $exec
; GFX1200-NEXT: S_BRANCH %bb.1
; GFX1200-NEXT: {{ $}}
; GFX1200-NEXT: bb.1 (%ir-block.2):
Expand Down Expand Up @@ -182,7 +182,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX1200-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[V_MBCNT_LO_U32_B32_e64_]], [[S_MOV_B32_1]], implicit $exec
; GFX1200-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GFX1200-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, [[V_CMP_EQ_U32_e64_]], implicit $exec
; GFX1200-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_NE_U32_e64_1]]
; GFX1200-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_NE_U32_e64_1]], implicit-def dead $exec, implicit-def dead $vcc_lo, implicit $exec
; GFX1200-NEXT: S_BRANCH %bb.2
; GFX1200-NEXT: {{ $}}
; GFX1200-NEXT: bb.2 (%ir-block.23):
Expand Down Expand Up @@ -222,7 +222,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; ITERATE-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[SI_PS_LIVE]], implicit $exec
; ITERATE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
; ITERATE-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 killed [[V_CNDMASK_B32_e64_]], killed [[S_MOV_B32_]], implicit $exec
; ITERATE-NEXT: SI_BRCOND %bb.4, killed [[V_CMP_NE_U32_e64_]]
; ITERATE-NEXT: SI_BRCOND %bb.4, killed [[V_CMP_NE_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc_lo, implicit $exec
; ITERATE-NEXT: S_BRANCH %bb.1
; ITERATE-NEXT: {{ $}}
; ITERATE-NEXT: bb.1 (%ir-block.2):
Expand Down Expand Up @@ -283,7 +283,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; ITERATE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 killed [[V_MBCNT_LO_U32_B32_e64_]], [[S_MOV_B32_4]], implicit $exec
; ITERATE-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; ITERATE-NEXT: [[V_CNDMASK_B32_e64_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, [[V_CMP_EQ_U32_e64_]], implicit $exec
; ITERATE-NEXT: SI_BRCOND %bb.2, [[V_CMP_EQ_U32_e64_]]
; ITERATE-NEXT: SI_BRCOND %bb.2, [[V_CMP_EQ_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc_lo, implicit $exec
; ITERATE-NEXT: S_BRANCH %bb.3
%ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret float %ret
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AMDGPU/WaveTransform/loop-i1.ll
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ define amdgpu_kernel void @loop_i1(ptr addrspace(1) %filter.coerce, ptr addrspac
; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_AND_B32_e64_]]
; GFX90A-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 6
; GFX90A-NEXT: [[V_CMP_LT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_U32_e64 [[V_AND_B32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
; GFX90A-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_LT_U32_e64_]]
; GFX90A-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_LT_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec
; GFX90A-NEXT: S_BRANCH %bb.1
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.1.do.body.preheader:
Expand Down Expand Up @@ -90,7 +90,7 @@ define amdgpu_kernel void @loop_i1(ptr addrspace(1) %filter.coerce, ptr addrspac
; GFX90A-NEXT: [[COPY20:%[0-9]+]]:vreg_64_align2 = COPY [[COPY19]], implicit $exec
; GFX90A-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY13]], implicit $exec
; GFX90A-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY13]], implicit $exec
; GFX90A-NEXT: SI_BRCOND %bb.2, killed [[V_CMP_GT_I32_e64_]]
; GFX90A-NEXT: SI_BRCOND %bb.2, killed [[V_CMP_GT_I32_e64_]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec
; GFX90A-NEXT: S_BRANCH %bb.4
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.3.if.else:
Expand All @@ -111,7 +111,7 @@ define amdgpu_kernel void @loop_i1(ptr addrspace(1) %filter.coerce, ptr addrspac
; GFX90A-NEXT: [[V_CNDMASK_B32_e64_3:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_NE_U32_e64_1]], implicit $exec
; GFX90A-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 1
; GFX90A-NEXT: [[V_CMP_NE_U32_e64_2:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 killed [[V_CNDMASK_B32_e64_3]], killed [[S_MOV_B32_7]], implicit $exec
; GFX90A-NEXT: SI_BRCOND %bb.6, killed [[V_CMP_NE_U32_e64_2]]
; GFX90A-NEXT: SI_BRCOND %bb.6, killed [[V_CMP_NE_U32_e64_2]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec
; GFX90A-NEXT: S_BRANCH %bb.5
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.5.if.then11:
Expand Down Expand Up @@ -154,7 +154,7 @@ define amdgpu_kernel void @loop_i1(ptr addrspace(1) %filter.coerce, ptr addrspac
; GFX1200-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]](s32), killed [[S_MOV_B32_]], implicit $exec
; GFX1200-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 6
; GFX1200-NEXT: [[V_CMP_LT_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_U32_e64 [[V_AND_B32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
; GFX1200-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_LT_U32_e64_]]
; GFX1200-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_LT_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc_lo, implicit $exec
; GFX1200-NEXT: S_BRANCH %bb.1
; GFX1200-NEXT: {{ $}}
; GFX1200-NEXT: bb.1.do.body.preheader:
Expand Down Expand Up @@ -198,7 +198,7 @@ define amdgpu_kernel void @loop_i1(ptr addrspace(1) %filter.coerce, ptr addrspac
; GFX1200-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY9]], [[COPY10]], 0, implicit $exec
; GFX1200-NEXT: [[V_ADDC_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY11]], [[COPY12]], killed [[V_ADD_CO_U32_e64_3]], 0, implicit $exec
; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_2]], %subreg.sub0, [[V_ADDC_U32_e64_2]], %subreg.sub1
; GFX1200-NEXT: SI_BRCOND %bb.2, killed [[V_CMP_GT_I32_e64_]]
; GFX1200-NEXT: SI_BRCOND %bb.2, killed [[V_CMP_GT_I32_e64_]], implicit-def dead $exec, implicit-def dead $vcc_lo, implicit $exec
; GFX1200-NEXT: S_BRANCH %bb.4
; GFX1200-NEXT: {{ $}}
; GFX1200-NEXT: bb.3.if.else:
Expand All @@ -219,7 +219,7 @@ define amdgpu_kernel void @loop_i1(ptr addrspace(1) %filter.coerce, ptr addrspac
; GFX1200-NEXT: [[V_CNDMASK_B32_e64_3:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_NE_U32_e64_1]], implicit $exec
; GFX1200-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 1
; GFX1200-NEXT: [[V_CMP_NE_U32_e64_2:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 killed [[V_CNDMASK_B32_e64_3]], killed [[S_MOV_B32_8]], implicit $exec
; GFX1200-NEXT: SI_BRCOND %bb.6, killed [[V_CMP_NE_U32_e64_2]]
; GFX1200-NEXT: SI_BRCOND %bb.6, killed [[V_CMP_NE_U32_e64_2]], implicit-def dead $exec, implicit-def dead $vcc_lo, implicit $exec
; GFX1200-NEXT: S_BRANCH %bb.5
; GFX1200-NEXT: {{ $}}
; GFX1200-NEXT: bb.5.if.then11:
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/WaveTransform/loop-mix-i1.ll
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ define amdgpu_kernel void @loop_mix_i1(ptr addrspace(1) %filter.coerce, ptr addr
; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_AND_B32_e64_]]
; GFX90A-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 6
; GFX90A-NEXT: [[V_CMP_LT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_U32_e64 [[V_AND_B32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
; GFX90A-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_LT_U32_e64_]]
; GFX90A-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_LT_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec
; GFX90A-NEXT: S_BRANCH %bb.1
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.1.do.body.preheader:
Expand Down Expand Up @@ -89,7 +89,7 @@ define amdgpu_kernel void @loop_mix_i1(ptr addrspace(1) %filter.coerce, ptr addr
; GFX90A-NEXT: [[V_CNDMASK_B32_e64_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_NE_U32_e64_]], implicit $exec
; GFX90A-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 1
; GFX90A-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 killed [[V_CNDMASK_B32_e64_2]], killed [[S_MOV_B32_5]], implicit $exec
; GFX90A-NEXT: SI_BRCOND %bb.6, killed [[V_CMP_NE_U32_e64_1]]
; GFX90A-NEXT: SI_BRCOND %bb.6, killed [[V_CMP_NE_U32_e64_1]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec
; GFX90A-NEXT: S_BRANCH %bb.5
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.5.if.then11:
Expand Down Expand Up @@ -117,7 +117,7 @@ define amdgpu_kernel void @loop_mix_i1(ptr addrspace(1) %filter.coerce, ptr addr
; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec_xnull = COPY [[REG_SEQUENCE]]
; GFX1200-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1023, killed [[COPY1]](s32), implicit $exec
; GFX1200-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_U32_e64 6, [[V_AND_B32_e64_]], implicit $exec
; GFX1200-NEXT: SI_BRCOND %bb.4, killed [[V_CMP_GT_U32_e64_]]
; GFX1200-NEXT: SI_BRCOND %bb.4, killed [[V_CMP_GT_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc_lo, implicit $exec
; GFX1200-NEXT: S_BRANCH %bb.1
; GFX1200-NEXT: {{ $}}
; GFX1200-NEXT: bb.1.do.body.preheader:
Expand Down Expand Up @@ -165,7 +165,7 @@ define amdgpu_kernel void @loop_mix_i1(ptr addrspace(1) %filter.coerce, ptr addr
; GFX1200-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, killed [[COPY8]], implicit $exec
; GFX1200-NEXT: [[V_CNDMASK_B32_e64_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[V_CMP_NE_U32_e64_]], implicit $exec
; GFX1200-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 1, killed [[V_CNDMASK_B32_e64_2]], implicit $exec
; GFX1200-NEXT: SI_BRCOND %bb.7, killed [[V_CMP_NE_U32_e64_1]]
; GFX1200-NEXT: SI_BRCOND %bb.7, killed [[V_CMP_NE_U32_e64_1]], implicit-def dead $exec, implicit-def dead $vcc_lo, implicit $exec
; GFX1200-NEXT: S_BRANCH %bb.6
; GFX1200-NEXT: {{ $}}
; GFX1200-NEXT: bb.6.if.then11:
Expand Down
Loading