diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp index ec4fc1daea6c6..6082b6defda95 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp @@ -1647,6 +1647,13 @@ class ControlFlowRewriter { Register PrimarySuccessorExec; + // Track saved exec from S_AND_SAVEEXEC(_TERM) opc to rejoin at secondary + // block. + Register SavedExec; + + // Track Def of above register. + MachineInstr *SavedExecMI = nullptr; + // Opcode for branches with implicit or opaque conditions: // S_CBRANCH_EXECZ/NZ S_CBRANCH_VCCZ/NZ S_CBRANCH_SCC0/1 // -- all active threads branch uniformly. @@ -2276,14 +2283,11 @@ void ControlFlowRewriter::rewrite() { MachineBasicBlock::iterator MBBIOriginNodeEnd = OriginNode->Block->end(); - // FIXME: Find a way to avoid adding MovTermOpc, instead add MovOpc. This - // Term operator being the first terminator, acts as an anchor point for - // finding the right insertion point in other parts of the Wave Transform. - // Since accumulator reset instructions may be added after this - // instruction, this move operation cannot be a terminator. 
- BuildMI(*OriginNode->Block, MBBIOriginNodeEnd, {}, - TII.get(LMC.MovTermOpc), LMC.ExecReg) - .addReg(OriginCFGNodeInfo.PrimarySuccessorExec); + OriginCFGNodeInfo.SavedExec = LMU.createLaneMaskReg(); + OriginCFGNodeInfo.SavedExecMI = + BuildMI(*OriginNode->Block, MBBIOriginNodeEnd, {}, + TII.get(LMC.AndSaveExecTermOpc), OriginCFGNodeInfo.SavedExec) + .addReg(OriginCFGNodeInfo.PrimarySuccessorExec); BuildMI(*OriginNode->Block, MBBIOriginNodeEnd, {}, TII.get(AMDGPU::SI_WAVE_CF_EDGE)); BuildMI(*OriginNode->Block, MBBIOriginNodeEnd, {}, @@ -2334,32 +2338,27 @@ void ControlFlowRewriter::rewrite() { continue; CFGNodeInfo &PredInfo = NodeInfo.find(Pred)->second; - Register PrimaryExec = PredInfo.PrimarySuccessorExec; - - Register Rejoin; - if (!Rejoin) { - // Try to find a previously generated XOR (or merely masked) value - // for reuse. - auto MapIt = RegMap.find(std::make_pair(Pred->Block, PrimaryExec)); - if (MapIt != RegMap.end()) { - Rejoin = MapIt->second.second; - if (!Rejoin) - PrimaryExec = MapIt->second.first; - } - } - if (!Rejoin) { - Rejoin = LMU.createLaneMaskReg(); - BuildMI(*Pred->Block, Pred->Block->getFirstTerminator(), {}, - TII.get(LMC.XorOpc), Rejoin) - .addReg(LMC.ExecReg) - .addReg(PrimaryExec); - } + // The rejoin contribution is the full EXEC saved by the + // S_AND_SAVEEXEC emitted at this OriginBranch in Step 2.2, bookkept on + // the pred's CFGNodeInfo. + Register Rejoin = PredInfo.SavedExec; - if (HasSingleDivergentPred) + if (HasSingleDivergentPred) { DirectRejoin = Rejoin; - else + } else { + // The _term form of S_AND_SAVEEXEC is required while building the + // primary exec mask: it lets the updater machinery insert instructions + // at the terminator. For the secondary (rejoin) exec mask, however, the + // inserted instructions must land *after* the S_AND_SAVEEXEC since they + // consume its def. 
Demote the pred's terminator to its non-terminator + // form in place so getFirstTerminator() points past it, and subsequent + // rejoin-mask building iterations insert at the correct place. + if (PredInfo.SavedExecMI && + PredInfo.SavedExecMI->getOpcode() == LMC.AndSaveExecTermOpc) + PredInfo.SavedExecMI->setDesc(TII.get(LMC.AndSaveExecOpc)); Updater.addAvailable(*Pred->Block, Rejoin); + } } Register RejoinMask = @@ -2675,6 +2674,16 @@ class ForwardPropSimplifier { const unsigned Opc = MI->getOpcode(); const Register Dst = MI->getOperand(0).getReg(); + // SavedExec = S_AND_SAVEEXEC Prim + // Dst (SavedExec) = old EXEC; EXEC = EXEC & Prim; def SCC. + // Both the terminator and non-terminator forms appear: the non-terminator + // form for OriginBranches consumed by a rejoin accumulator (demoted in + // Step 3) and the terminator form otherwise. + if (Opc == LMC.AndSaveExecOpc || Opc == LMC.AndSaveExecTermOpc) { + Cur[Dst] = RegIntVariant{Dst}; + continue; + } + // ACC = MOV 0 if (AccRegs.count(Dst) && Opc == LMC.MovOpc) { const MachineOperand &Imm = MI->getOperand(1); diff --git a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp index ee6b0aab6333d..15f854cc68dad 100644 --- a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp +++ b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp @@ -138,8 +138,16 @@ void GCNLaneMaskUtils::buildMergeLaneMasks(MachineBasicBlock &MBB, if (!PrevConstant) { PrevMaskedReg = PrevReg; } + // Do not mask CurReg if CurReg = S_AND_SAVEEXEC(_TERM) Reg. + // Contributions from this Opc imply we are building the rejoin merge at + // the secondary block and the contribution should be used as is, without + // EXEC AND masking. 
if (!CurConstant) { - if ((PrevConstant && PrevVal) || + const MachineInstr *CurDef = MF.getRegInfo().getUniqueVRegDef(CurReg); + bool IsSaveExecDef = + CurDef && (CurDef->getOpcode() == LMC.AndSaveExecOpc || + CurDef->getOpcode() == LMC.AndSaveExecTermOpc); + if ((PrevConstant && PrevVal) || IsSaveExecDef || (LMA && LMA->isSubsetOfExec(CurReg, MBB, I))) { CurMaskedReg = CurReg; } else { @@ -472,13 +480,14 @@ void GCNLaneMaskUpdater::insertAccumulatorResets() { // TODO : We only need to compute EndInsertPt if any of B's AccFlagPairs has // ResetAtEnd + const AMDGPU::LaneMaskConstants &LMConsts = LMU.getLaneMaskConsts(); MachineBasicBlock::iterator EndInsertPt; EndInsertPt = B->getFirstTerminator(); - if (EndInsertPt != B->end() && EndInsertPt->getOpcode() == LMU.getLaneMaskConsts().MovTermOpc && - EndInsertPt->getOperand(0).getReg() == - LMU.getLaneMaskConsts().ExecReg) { - EndInsertPt->setDesc(TII->get(LMU.getLaneMaskConsts().MovOpc)); - EndInsertPt++; + if (EndInsertPt != B->end()) { + if (EndInsertPt->getOpcode() == LMConsts.AndSaveExecTermOpc) { + EndInsertPt->setDesc(TII->get(LMConsts.AndSaveExecOpc)); + ++EndInsertPt; + } } for (auto &[Acc, Flags] : AccFlagPairs) { diff --git a/llvm/test/CodeGen/AMDGPU/WaveTransform/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/WaveTransform/global-atomic-fadd.f32-rtn.ll index d9b82a8dd2db4..5d8efa4f89163 100644 --- a/llvm/test/CodeGen/AMDGPU/WaveTransform/global-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/WaveTransform/global-atomic-fadd.f32-rtn.ll @@ -62,7 +62,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX90A-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[SI_PS_LIVE]], implicit $exec ; GFX90A-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; GFX90A-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 killed [[V_CNDMASK_B32_e64_]], killed [[S_MOV_B32_]], implicit $exec - ; GFX90A-NEXT: SI_BRCOND %bb.4, killed 
[[V_CMP_NE_U32_e64_]] + ; GFX90A-NEXT: SI_BRCOND %bb.4, killed [[V_CMP_NE_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.1 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.1 (%ir-block.2): @@ -100,7 +100,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:av_32 = IMPLICIT_DEF ; GFX90A-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, [[V_CMP_EQ_U32_e64_]], implicit $exec ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[DEF2]] - ; GFX90A-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_NE_U32_e64_1]] + ; GFX90A-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_NE_U32_e64_1]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.2 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2 (%ir-block.26): @@ -144,7 +144,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX1200-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[SI_PS_LIVE]], implicit $exec ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; GFX1200-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 killed [[V_CNDMASK_B32_e64_]], killed [[S_MOV_B32_]], implicit $exec - ; GFX1200-NEXT: SI_BRCOND %bb.4, killed [[V_CMP_NE_U32_e64_]] + ; GFX1200-NEXT: SI_BRCOND %bb.4, killed [[V_CMP_NE_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc_lo, implicit $exec ; GFX1200-NEXT: S_BRANCH %bb.1 ; GFX1200-NEXT: {{ $}} ; GFX1200-NEXT: bb.1 (%ir-block.2): @@ -182,7 +182,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX1200-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[V_MBCNT_LO_U32_B32_e64_]], [[S_MOV_B32_1]], implicit $exec ; GFX1200-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GFX1200-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, [[V_CMP_EQ_U32_e64_]], implicit $exec - ; 
GFX1200-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_NE_U32_e64_1]] + ; GFX1200-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_NE_U32_e64_1]], implicit-def dead $exec, implicit-def dead $vcc_lo, implicit $exec ; GFX1200-NEXT: S_BRANCH %bb.2 ; GFX1200-NEXT: {{ $}} ; GFX1200-NEXT: bb.2 (%ir-block.23): @@ -222,7 +222,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; ITERATE-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[SI_PS_LIVE]], implicit $exec ; ITERATE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; ITERATE-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 killed [[V_CNDMASK_B32_e64_]], killed [[S_MOV_B32_]], implicit $exec - ; ITERATE-NEXT: SI_BRCOND %bb.4, killed [[V_CMP_NE_U32_e64_]] + ; ITERATE-NEXT: SI_BRCOND %bb.4, killed [[V_CMP_NE_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc_lo, implicit $exec ; ITERATE-NEXT: S_BRANCH %bb.1 ; ITERATE-NEXT: {{ $}} ; ITERATE-NEXT: bb.1 (%ir-block.2): @@ -283,7 +283,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; ITERATE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 killed [[V_MBCNT_LO_U32_B32_e64_]], [[S_MOV_B32_4]], implicit $exec ; ITERATE-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; ITERATE-NEXT: [[V_CNDMASK_B32_e64_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, [[V_CMP_EQ_U32_e64_]], implicit $exec - ; ITERATE-NEXT: SI_BRCOND %bb.2, [[V_CMP_EQ_U32_e64_]] + ; ITERATE-NEXT: SI_BRCOND %bb.2, [[V_CMP_EQ_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc_lo, implicit $exec ; ITERATE-NEXT: S_BRANCH %bb.3 %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret float %ret diff --git a/llvm/test/CodeGen/AMDGPU/WaveTransform/loop-i1.ll b/llvm/test/CodeGen/AMDGPU/WaveTransform/loop-i1.ll index 68d43a65f6455..76fe163c82be8 100644 --- 
a/llvm/test/CodeGen/AMDGPU/WaveTransform/loop-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/WaveTransform/loop-i1.ll @@ -37,7 +37,7 @@ define amdgpu_kernel void @loop_i1(ptr addrspace(1) %filter.coerce, ptr addrspac ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_AND_B32_e64_]] ; GFX90A-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 6 ; GFX90A-NEXT: [[V_CMP_LT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_U32_e64 [[V_AND_B32_e64_]], killed [[S_MOV_B32_1]], implicit $exec - ; GFX90A-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_LT_U32_e64_]] + ; GFX90A-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_LT_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.1 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.1.do.body.preheader: @@ -90,7 +90,7 @@ define amdgpu_kernel void @loop_i1(ptr addrspace(1) %filter.coerce, ptr addrspac ; GFX90A-NEXT: [[COPY20:%[0-9]+]]:vreg_64_align2 = COPY [[COPY19]], implicit $exec ; GFX90A-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY13]], implicit $exec ; GFX90A-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY13]], implicit $exec - ; GFX90A-NEXT: SI_BRCOND %bb.2, killed [[V_CMP_GT_I32_e64_]] + ; GFX90A-NEXT: SI_BRCOND %bb.2, killed [[V_CMP_GT_I32_e64_]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.4 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.3.if.else: @@ -111,7 +111,7 @@ define amdgpu_kernel void @loop_i1(ptr addrspace(1) %filter.coerce, ptr addrspac ; GFX90A-NEXT: [[V_CNDMASK_B32_e64_3:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_NE_U32_e64_1]], implicit $exec ; GFX90A-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; GFX90A-NEXT: [[V_CMP_NE_U32_e64_2:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 killed [[V_CNDMASK_B32_e64_3]], killed [[S_MOV_B32_7]], implicit $exec - ; GFX90A-NEXT: SI_BRCOND %bb.6, killed [[V_CMP_NE_U32_e64_2]] + ; GFX90A-NEXT: SI_BRCOND %bb.6, killed [[V_CMP_NE_U32_e64_2]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GFX90A-NEXT: 
S_BRANCH %bb.5 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5.if.then11: @@ -154,7 +154,7 @@ define amdgpu_kernel void @loop_i1(ptr addrspace(1) %filter.coerce, ptr addrspac ; GFX1200-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]](s32), killed [[S_MOV_B32_]], implicit $exec ; GFX1200-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 6 ; GFX1200-NEXT: [[V_CMP_LT_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_U32_e64 [[V_AND_B32_e64_]], killed [[S_MOV_B32_1]], implicit $exec - ; GFX1200-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_LT_U32_e64_]] + ; GFX1200-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_LT_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc_lo, implicit $exec ; GFX1200-NEXT: S_BRANCH %bb.1 ; GFX1200-NEXT: {{ $}} ; GFX1200-NEXT: bb.1.do.body.preheader: @@ -198,7 +198,7 @@ define amdgpu_kernel void @loop_i1(ptr addrspace(1) %filter.coerce, ptr addrspac ; GFX1200-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY9]], [[COPY10]], 0, implicit $exec ; GFX1200-NEXT: [[V_ADDC_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY11]], [[COPY12]], killed [[V_ADD_CO_U32_e64_3]], 0, implicit $exec ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_2]], %subreg.sub0, [[V_ADDC_U32_e64_2]], %subreg.sub1 - ; GFX1200-NEXT: SI_BRCOND %bb.2, killed [[V_CMP_GT_I32_e64_]] + ; GFX1200-NEXT: SI_BRCOND %bb.2, killed [[V_CMP_GT_I32_e64_]], implicit-def dead $exec, implicit-def dead $vcc_lo, implicit $exec ; GFX1200-NEXT: S_BRANCH %bb.4 ; GFX1200-NEXT: {{ $}} ; GFX1200-NEXT: bb.3.if.else: @@ -219,7 +219,7 @@ define amdgpu_kernel void @loop_i1(ptr addrspace(1) %filter.coerce, ptr addrspac ; GFX1200-NEXT: [[V_CNDMASK_B32_e64_3:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_NE_U32_e64_1]], implicit $exec ; GFX1200-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; GFX1200-NEXT: [[V_CMP_NE_U32_e64_2:%[0-9]+]]:sreg_32 
= V_CMP_NE_U32_e64 killed [[V_CNDMASK_B32_e64_3]], killed [[S_MOV_B32_8]], implicit $exec - ; GFX1200-NEXT: SI_BRCOND %bb.6, killed [[V_CMP_NE_U32_e64_2]] + ; GFX1200-NEXT: SI_BRCOND %bb.6, killed [[V_CMP_NE_U32_e64_2]], implicit-def dead $exec, implicit-def dead $vcc_lo, implicit $exec ; GFX1200-NEXT: S_BRANCH %bb.5 ; GFX1200-NEXT: {{ $}} ; GFX1200-NEXT: bb.5.if.then11: diff --git a/llvm/test/CodeGen/AMDGPU/WaveTransform/loop-mix-i1.ll b/llvm/test/CodeGen/AMDGPU/WaveTransform/loop-mix-i1.ll index b5b69fd65fc56..f0fe6b36c743b 100644 --- a/llvm/test/CodeGen/AMDGPU/WaveTransform/loop-mix-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/WaveTransform/loop-mix-i1.ll @@ -39,7 +39,7 @@ define amdgpu_kernel void @loop_mix_i1(ptr addrspace(1) %filter.coerce, ptr addr ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_AND_B32_e64_]] ; GFX90A-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 6 ; GFX90A-NEXT: [[V_CMP_LT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_U32_e64 [[V_AND_B32_e64_]], killed [[S_MOV_B32_1]], implicit $exec - ; GFX90A-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_LT_U32_e64_]] + ; GFX90A-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_LT_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.1 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.1.do.body.preheader: @@ -89,7 +89,7 @@ define amdgpu_kernel void @loop_mix_i1(ptr addrspace(1) %filter.coerce, ptr addr ; GFX90A-NEXT: [[V_CNDMASK_B32_e64_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_NE_U32_e64_]], implicit $exec ; GFX90A-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; GFX90A-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 killed [[V_CNDMASK_B32_e64_2]], killed [[S_MOV_B32_5]], implicit $exec - ; GFX90A-NEXT: SI_BRCOND %bb.6, killed [[V_CMP_NE_U32_e64_1]] + ; GFX90A-NEXT: SI_BRCOND %bb.6, killed [[V_CMP_NE_U32_e64_1]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.5 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: 
bb.5.if.then11: @@ -117,7 +117,7 @@ define amdgpu_kernel void @loop_mix_i1(ptr addrspace(1) %filter.coerce, ptr addr ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec_xnull = COPY [[REG_SEQUENCE]] ; GFX1200-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1023, killed [[COPY1]](s32), implicit $exec ; GFX1200-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_U32_e64 6, [[V_AND_B32_e64_]], implicit $exec - ; GFX1200-NEXT: SI_BRCOND %bb.4, killed [[V_CMP_GT_U32_e64_]] + ; GFX1200-NEXT: SI_BRCOND %bb.4, killed [[V_CMP_GT_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc_lo, implicit $exec ; GFX1200-NEXT: S_BRANCH %bb.1 ; GFX1200-NEXT: {{ $}} ; GFX1200-NEXT: bb.1.do.body.preheader: @@ -165,7 +165,7 @@ define amdgpu_kernel void @loop_mix_i1(ptr addrspace(1) %filter.coerce, ptr addr ; GFX1200-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, killed [[COPY8]], implicit $exec ; GFX1200-NEXT: [[V_CNDMASK_B32_e64_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[V_CMP_NE_U32_e64_]], implicit $exec ; GFX1200-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 1, killed [[V_CNDMASK_B32_e64_2]], implicit $exec - ; GFX1200-NEXT: SI_BRCOND %bb.7, killed [[V_CMP_NE_U32_e64_1]] + ; GFX1200-NEXT: SI_BRCOND %bb.7, killed [[V_CMP_NE_U32_e64_1]], implicit-def dead $exec, implicit-def dead $vcc_lo, implicit $exec ; GFX1200-NEXT: S_BRANCH %bb.6 ; GFX1200-NEXT: {{ $}} ; GFX1200-NEXT: bb.6.if.then11: diff --git a/llvm/test/CodeGen/AMDGPU/WaveTransform/switch-i1.ll b/llvm/test/CodeGen/AMDGPU/WaveTransform/switch-i1.ll index 2070d6c907a13..3b6d1b079cf7e 100644 --- a/llvm/test/CodeGen/AMDGPU/WaveTransform/switch-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/WaveTransform/switch-i1.ll @@ -48,7 +48,7 @@ define amdgpu_kernel void @switch_i1(ptr addrspace(1) %filter.coerce, ptr addrsp ; GFX90A-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[REG_SEQUENCE]], killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit 
$exec :: (load (s32) from %ir.arrayidx, addrspace 1) ; GFX90A-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; GFX90A-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[V_AND_B32_e64_]], killed [[S_MOV_B32_2]], implicit $exec - ; GFX90A-NEXT: SI_BRCOND %bb.4, killed [[V_CMP_LT_I32_e64_]] + ; GFX90A-NEXT: SI_BRCOND %bb.4, killed [[V_CMP_LT_I32_e64_]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.2 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2.NodeBlock: @@ -58,7 +58,7 @@ define amdgpu_kernel void @switch_i1(ptr addrspace(1) %filter.coerce, ptr addrsp ; GFX90A-NEXT: [[V_CMP_LT_I32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]], killed [[S_MOV_B32_3]], implicit $exec ; GFX90A-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 ; GFX90A-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, [[S_MOV_B64_]], implicit $exec - ; GFX90A-NEXT: SI_BRCOND %bb.5, killed [[V_CMP_LT_I32_e64_1]] + ; GFX90A-NEXT: SI_BRCOND %bb.5, killed [[V_CMP_LT_I32_e64_1]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.3.LeafBlock: @@ -68,7 +68,7 @@ define amdgpu_kernel void @switch_i1(ptr addrspace(1) %filter.coerce, ptr addrsp ; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[COPY4]], killed [[S_MOV_B32_4]], implicit $exec ; GFX90A-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 ; GFX90A-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, [[S_MOV_B64_1]], implicit $exec - ; GFX90A-NEXT: SI_BRCOND %bb.6, killed [[V_CMP_EQ_U32_e64_]] + ; GFX90A-NEXT: SI_BRCOND %bb.6, killed [[V_CMP_EQ_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.7 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4.sw.bb: @@ -114,7 +114,7 @@ define amdgpu_kernel void @switch_i1(ptr addrspace(1) %filter.coerce, ptr addrsp ; GFX90A-NEXT: 
[[V_CNDMASK_B32_e64_6:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_NE_U32_e64_2]], implicit $exec ; GFX90A-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; GFX90A-NEXT: [[V_CMP_NE_U32_e64_3:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 killed [[V_CNDMASK_B32_e64_6]], killed [[S_MOV_B32_9]], implicit $exec - ; GFX90A-NEXT: SI_BRCOND %bb.10, killed [[V_CMP_NE_U32_e64_3]] + ; GFX90A-NEXT: SI_BRCOND %bb.10, killed [[V_CMP_NE_U32_e64_3]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.9 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.9.if.then: @@ -156,7 +156,7 @@ define amdgpu_kernel void @switch_i1(ptr addrspace(1) %filter.coerce, ptr addrsp ; GFX1200-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[REG_SEQUENCE]], killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s32) from %ir.arrayidx, addrspace 1) ; GFX1200-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; GFX1200-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[V_AND_B32_e64_]], killed [[S_MOV_B32_2]], implicit $exec - ; GFX1200-NEXT: SI_BRCOND %bb.4, killed [[V_CMP_LT_I32_e64_]] + ; GFX1200-NEXT: SI_BRCOND %bb.4, killed [[V_CMP_LT_I32_e64_]], implicit-def dead $exec, implicit-def dead $vcc_lo, implicit $exec ; GFX1200-NEXT: S_BRANCH %bb.2 ; GFX1200-NEXT: {{ $}} ; GFX1200-NEXT: bb.2.NodeBlock: @@ -166,7 +166,7 @@ define amdgpu_kernel void @switch_i1(ptr addrspace(1) %filter.coerce, ptr addrsp ; GFX1200-NEXT: [[V_CMP_LT_I32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[V_AND_B32_e64_]], killed [[S_MOV_B32_3]], implicit $exec ; GFX1200-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 0 ; GFX1200-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, [[S_MOV_B32_4]], implicit $exec - ; GFX1200-NEXT: SI_BRCOND %bb.5, killed [[V_CMP_LT_I32_e64_1]] + ; GFX1200-NEXT: SI_BRCOND %bb.5, killed [[V_CMP_LT_I32_e64_1]], implicit-def dead $exec, implicit-def dead $vcc_lo, implicit 
$exec ; GFX1200-NEXT: S_BRANCH %bb.3 ; GFX1200-NEXT: {{ $}} ; GFX1200-NEXT: bb.3.LeafBlock: @@ -176,7 +176,7 @@ define amdgpu_kernel void @switch_i1(ptr addrspace(1) %filter.coerce, ptr addrsp ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[V_AND_B32_e64_]], killed [[S_MOV_B32_5]], implicit $exec ; GFX1200-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 0 ; GFX1200-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, [[S_MOV_B32_6]], implicit $exec - ; GFX1200-NEXT: SI_BRCOND %bb.6, killed [[V_CMP_EQ_U32_e64_]] + ; GFX1200-NEXT: SI_BRCOND %bb.6, killed [[V_CMP_EQ_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc_lo, implicit $exec ; GFX1200-NEXT: S_BRANCH %bb.7 ; GFX1200-NEXT: {{ $}} ; GFX1200-NEXT: bb.4.sw.bb: @@ -222,7 +222,7 @@ define amdgpu_kernel void @switch_i1(ptr addrspace(1) %filter.coerce, ptr addrsp ; GFX1200-NEXT: [[V_CNDMASK_B32_e64_6:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_NE_U32_e64_2]], implicit $exec ; GFX1200-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; GFX1200-NEXT: [[V_CMP_NE_U32_e64_3:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 killed [[V_CNDMASK_B32_e64_6]], killed [[S_MOV_B32_11]], implicit $exec - ; GFX1200-NEXT: SI_BRCOND %bb.10, killed [[V_CMP_NE_U32_e64_3]] + ; GFX1200-NEXT: SI_BRCOND %bb.10, killed [[V_CMP_NE_U32_e64_3]], implicit-def dead $exec, implicit-def dead $vcc_lo, implicit $exec ; GFX1200-NEXT: S_BRANCH %bb.9 ; GFX1200-NEXT: {{ $}} ; GFX1200-NEXT: bb.9.if.then: diff --git a/llvm/test/CodeGen/AMDGPU/WaveTransform/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/WaveTransform/unstructured-cfg-def-use-issue.ll index 771af2ca4a4b3..4083cf7d2d87d 100644 --- a/llvm/test/CodeGen/AMDGPU/WaveTransform/unstructured-cfg-def-use-issue.ll +++ b/llvm/test/CodeGen/AMDGPU/WaveTransform/unstructured-cfg-def-use-issue.ll @@ -84,7 +84,7 @@ define hidden void @widget() { ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX90A-NEXT: 
[[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 ; GFX90A-NEXT: [[V_CMP_GT_F32_e64_:%[0-9]+]]:sreg_64 = nsz nofpexcept V_CMP_GT_F32_e64 0, [[COPY12]], 0, killed [[S_MOV_B32_3]], 0, implicit $mode, implicit $exec - ; GFX90A-NEXT: SI_BRCOND %bb.6, killed [[V_CMP_GT_F32_e64_]] + ; GFX90A-NEXT: SI_BRCOND %bb.6, killed [[V_CMP_GT_F32_e64_]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.5 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5.bb12: @@ -174,7 +174,7 @@ define hidden void @widget() { ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX1200-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 ; GFX1200-NEXT: [[V_CMP_GT_F32_e64_:%[0-9]+]]:sreg_32 = nsz nofpexcept V_CMP_GT_F32_e64 0, [[COPY10]], 0, killed [[S_MOV_B32_3]], 0, implicit $mode, implicit $exec - ; GFX1200-NEXT: SI_BRCOND %bb.6, killed [[V_CMP_GT_F32_e64_]] + ; GFX1200-NEXT: SI_BRCOND %bb.6, killed [[V_CMP_GT_F32_e64_]], implicit-def dead $exec, implicit-def dead $vcc_lo, implicit $exec ; GFX1200-NEXT: S_BRANCH %bb.5 ; GFX1200-NEXT: {{ $}} ; GFX1200-NEXT: bb.5.bb12: @@ -254,7 +254,7 @@ define hidden void @blam() { ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 3 ; GFX90A-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[GLOBAL_LOAD_DWORD]], killed [[S_MOV_B32_3]], implicit $exec - ; GFX90A-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_LT_I32_e64_]] + ; GFX90A-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_LT_I32_e64_]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.2 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2.bb6: @@ -262,7 +262,7 @@ define hidden void @blam() { ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 3 ; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 
[[GLOBAL_LOAD_DWORD]], killed [[S_MOV_B32_4]], implicit $exec - ; GFX90A-NEXT: SI_BRCOND %bb.5, killed [[V_CMP_EQ_U32_e64_]] + ; GFX90A-NEXT: SI_BRCOND %bb.5, killed [[V_CMP_EQ_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.1 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.3.bb8: @@ -270,7 +270,7 @@ define hidden void @blam() { ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; GFX90A-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 [[GLOBAL_LOAD_DWORD]], killed [[S_MOV_B32_5]], implicit $exec - ; GFX90A-NEXT: SI_BRCOND %bb.1, killed [[V_CMP_NE_U32_e64_]] + ; GFX90A-NEXT: SI_BRCOND %bb.1, killed [[V_CMP_NE_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.4 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4.bb10: @@ -301,14 +301,14 @@ define hidden void @blam() { ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX90A-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 ; GFX90A-NEXT: [[V_CMP_EQ_F32_e64_1:%[0-9]+]]:sreg_64 = nsz nofpexcept V_CMP_EQ_F32_e64 0, [[COPY11]], 0, killed [[S_MOV_B32_6]], 0, implicit $mode, implicit $exec - ; GFX90A-NEXT: SI_BRCOND %bb.1, killed [[V_CMP_EQ_F32_e64_1]] + ; GFX90A-NEXT: SI_BRCOND %bb.1, killed [[V_CMP_EQ_F32_e64_1]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.6 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.6.bb14: ; GFX90A-NEXT: successors: %bb.8(0x50000000), %bb.7(0x30000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 [[V_CNDMASK_B32_e64_]], 0, implicit $exec - ; GFX90A-NEXT: SI_BRCOND %bb.8, [[V_CMP_NE_U32_e64_1]] + ; GFX90A-NEXT: SI_BRCOND %bb.8, [[V_CMP_NE_U32_e64_1]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.7 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.7.bb16: @@ -373,7 +373,7 @@ define hidden void @blam() { ; GFX1200-NEXT: 
SCRATCH_STORE_DWORD_SADDR [[COPY10]], [[S_MOV_B32_3]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX1200-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 3 ; GFX1200-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[GLOBAL_LOAD_DWORD]], killed [[S_MOV_B32_4]], implicit $exec - ; GFX1200-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_LT_I32_e64_]] + ; GFX1200-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_LT_I32_e64_]], implicit-def dead $exec, implicit-def dead $vcc_lo, implicit $exec ; GFX1200-NEXT: S_BRANCH %bb.2 ; GFX1200-NEXT: {{ $}} ; GFX1200-NEXT: bb.2.bb6: @@ -381,7 +381,7 @@ define hidden void @blam() { ; GFX1200-NEXT: {{ $}} ; GFX1200-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 3 ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[GLOBAL_LOAD_DWORD]], killed [[S_MOV_B32_5]], implicit $exec - ; GFX1200-NEXT: SI_BRCOND %bb.5, killed [[V_CMP_EQ_U32_e64_]] + ; GFX1200-NEXT: SI_BRCOND %bb.5, killed [[V_CMP_EQ_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc_lo, implicit $exec ; GFX1200-NEXT: S_BRANCH %bb.1 ; GFX1200-NEXT: {{ $}} ; GFX1200-NEXT: bb.3.bb8: @@ -389,7 +389,7 @@ define hidden void @blam() { ; GFX1200-NEXT: {{ $}} ; GFX1200-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; GFX1200-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[GLOBAL_LOAD_DWORD]], killed [[S_MOV_B32_6]], implicit $exec - ; GFX1200-NEXT: SI_BRCOND %bb.1, killed [[V_CMP_NE_U32_e64_]] + ; GFX1200-NEXT: SI_BRCOND %bb.1, killed [[V_CMP_NE_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc_lo, implicit $exec ; GFX1200-NEXT: S_BRANCH %bb.4 ; GFX1200-NEXT: {{ $}} ; GFX1200-NEXT: bb.4.bb10: @@ -419,14 +419,14 @@ define hidden void @blam() { ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX1200-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 ; GFX1200-NEXT: [[V_CMP_EQ_F32_e64_1:%[0-9]+]]:sreg_32 = nsz nofpexcept V_CMP_EQ_F32_e64 0, 
[[COPY11]], 0, killed [[S_MOV_B32_8]], 0, implicit $mode, implicit $exec - ; GFX1200-NEXT: SI_BRCOND %bb.1, killed [[V_CMP_EQ_F32_e64_1]] + ; GFX1200-NEXT: SI_BRCOND %bb.1, killed [[V_CMP_EQ_F32_e64_1]], implicit-def dead $exec, implicit-def dead $vcc_lo, implicit $exec ; GFX1200-NEXT: S_BRANCH %bb.6 ; GFX1200-NEXT: {{ $}} ; GFX1200-NEXT: bb.6.bb14: ; GFX1200-NEXT: successors: %bb.8(0x50000000), %bb.7(0x30000000) ; GFX1200-NEXT: {{ $}} ; GFX1200-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[V_CNDMASK_B32_e64_]], 0, implicit $exec - ; GFX1200-NEXT: SI_BRCOND %bb.8, [[V_CMP_NE_U32_e64_1]] + ; GFX1200-NEXT: SI_BRCOND %bb.8, [[V_CMP_NE_U32_e64_1]], implicit-def dead $exec, implicit-def dead $vcc_lo, implicit $exec ; GFX1200-NEXT: S_BRANCH %bb.7 ; GFX1200-NEXT: {{ $}} ; GFX1200-NEXT: bb.7.bb16: diff --git a/llvm/test/CodeGen/AMDGPU/WaveTransform/wavetransform-basic.mir b/llvm/test/CodeGen/AMDGPU/WaveTransform/wavetransform-basic.mir index 3ba45bb47b795..d174440a3798e 100644 --- a/llvm/test/CodeGen/AMDGPU/WaveTransform/wavetransform-basic.mir +++ b/llvm/test/CodeGen/AMDGPU/WaveTransform/wavetransform-basic.mir @@ -17,7 +17,7 @@ body: | ; POSTWT-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; POSTWT-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[COPY]], killed [[S_MOV_B32_]], implicit $exec ; POSTWT-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_term:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32_term [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.1 @@ -28,7 +28,7 @@ body: | ; POSTWT-NEXT: S_BRANCH %bb.2 ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: bb.2: - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_]], implicit-def $scc + ; 
POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_term]], implicit-def $scc ; POSTWT-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.2 @@ -63,8 +63,7 @@ body: | ; POSTWT-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; POSTWT-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; POSTWT-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[COPY]], killed [[S_MOV_B32_]], implicit $exec - ; POSTWT-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[V_CMP_EQ_U32_e64_]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_term:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32_term [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.1 @@ -75,7 +74,7 @@ body: | ; POSTWT-NEXT: S_BRANCH %bb.2 ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: bb.2: - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_]], implicit-def $scc + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_term]], implicit-def $scc ; POSTWT-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.2 @@ -111,7 +110,7 @@ body: | ; POSTWT-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; POSTWT-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[COPY]], killed [[S_MOV_B32_]], implicit $exec ; POSTWT-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_term:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32_term [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.1 @@ -122,7 +121,7 @@ body: | ; POSTWT-NEXT: S_BRANCH %bb.2 ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: bb.2: - ; POSTWT-NEXT: 
$exec_lo = S_OR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_]], implicit-def $scc + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_term]], implicit-def $scc ; POSTWT-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.2 @@ -155,7 +154,7 @@ body: | ; POSTWT-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; POSTWT-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[COPY]], killed [[S_MOV_B32_]], implicit $exec ; POSTWT-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_term:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32_term [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.1 @@ -168,9 +167,8 @@ body: | ; POSTWT-NEXT: bb.4: ; POSTWT-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[V_CMP_EQ_U32_e64_]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_term]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_term1:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32_term [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.2 @@ -181,7 +179,7 @@ body: | ; POSTWT-NEXT: S_BRANCH %bb.3 ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: bb.3: - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_1]], implicit-def $scc + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_term1]], implicit-def $scc ; POSTWT-NEXT: S_ENDPGM 0 bb.0: successors: 
%bb.1, %bb.2 @@ -284,8 +282,8 @@ body: | ; POSTWT-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; POSTWT-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[COPY]], [[S_MOV_B32_]], implicit $exec ; POSTWT-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[V_CMP_EQ_U32_e64_]] - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; POSTWT-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_AND_SAVEEXEC_B32_]] ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.9, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.1 @@ -295,7 +293,7 @@ body: | ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[COPY1]], [[S_MOV_B32_]], implicit $exec ; POSTWT-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_1]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[V_CMP_EQ_U32_e64_1]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_term:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32_term [[V_CMP_EQ_U32_e64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.10, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.3 @@ -304,10 +302,8 @@ body: | ; POSTWT-NEXT: successors: %bb.2(0x40000000), %bb.7(0x40000000) ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[COPY3]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_2:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_]], implicit-def $scc - ; POSTWT-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_2]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_AND_B32_]] - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[V_CMP_EQ_U32_e64_]] + ; 
POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_1:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; POSTWT-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_AND_SAVEEXEC_B32_1]] ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.2 @@ -316,8 +312,8 @@ body: | ; POSTWT-NEXT: successors: %bb.5(0x40000000), %bb.8(0x40000000) ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: [[V_CMP_EQ_U32_e64_2:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[COPY2]], [[S_MOV_B32_]], implicit $exec - ; POSTWT-NEXT: [[S_XOR_B32_3:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_2]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_3]] + ; POSTWT-NEXT: [[S_XOR_B32_2:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_2]], $exec_lo, implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_term1:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32_term [[S_XOR_B32_2]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.8, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.5 @@ -330,11 +326,9 @@ body: | ; POSTWT-NEXT: bb.10: ; POSTWT-NEXT: successors: %bb.4(0x40000000), %bb.9(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_1]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_4:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_XOR_B32_1]], implicit-def $scc - ; POSTWT-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_4]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = S_OR_B32 [[V_CMP_EQ_U32_e64_]], [[S_AND_B32_1]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_1]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_term]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_2:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_XOR_B32_1]], implicit-def $exec, implicit-def 
$scc, implicit $exec + ; POSTWT-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_SAVEEXEC_B32_]], [[S_AND_SAVEEXEC_B32_2]], implicit-def $scc ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.9, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.4 @@ -352,11 +346,9 @@ body: | ; POSTWT-NEXT: bb.8: ; POSTWT-NEXT: successors: %bb.6(0x40000000), %bb.7(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_2]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_5:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_2]], implicit-def $scc - ; POSTWT-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_5]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], [[S_AND_B32_2]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[V_CMP_EQ_U32_e64_2]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_term1]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_3:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[V_CMP_EQ_U32_e64_2]], implicit-def $exec, implicit-def $scc, implicit $exec + ; POSTWT-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_SAVEEXEC_B32_1]], [[S_AND_SAVEEXEC_B32_3]], implicit-def $scc ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.6 @@ -449,7 +441,7 @@ body: | ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[COPY1]], [[S_MOV_B32_]], implicit $exec ; POSTWT-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[V_CMP_EQ_U32_e64_]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_term:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32_term [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.9, 
implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.3 @@ -459,7 +451,7 @@ body: | ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[COPY2]], [[S_MOV_B32_]], implicit $exec ; POSTWT-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_1]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_1]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_term1:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32_term [[S_XOR_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.8, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.5 @@ -472,11 +464,9 @@ body: | ; POSTWT-NEXT: bb.9: ; POSTWT-NEXT: successors: %bb.4(0x40000000), %bb.7(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_2:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_XOR_B32_]], implicit-def $scc - ; POSTWT-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_2]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_AND_B32_]] - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_term]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; POSTWT-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_AND_SAVEEXEC_B32_]] ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.4 @@ -494,11 +484,9 @@ body: | ; POSTWT-NEXT: bb.8: ; POSTWT-NEXT: successors: %bb.6(0x40000000), %bb.7(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_1]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_3:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_1]], 
implicit-def $scc - ; POSTWT-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_3]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_AND_B32_1]] - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[V_CMP_EQ_U32_e64_1]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_term1]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_1:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[V_CMP_EQ_U32_e64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; POSTWT-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_AND_SAVEEXEC_B32_1]] ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.6 @@ -585,7 +573,7 @@ body: | ; POSTWT-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; POSTWT-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[COPY]], [[S_MOV_B32_]], implicit $exec ; POSTWT-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_term:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32_term [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.8, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.1 @@ -600,9 +588,8 @@ body: | ; POSTWT-NEXT: bb.8: ; POSTWT-NEXT: successors: %bb.2(0x40000000), %bb.7(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[V_CMP_EQ_U32_e64_]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_term]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_term1:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32_term [[V_CMP_EQ_U32_e64_]], implicit-def 
$exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.2 @@ -635,7 +622,7 @@ body: | ; POSTWT-NEXT: S_BRANCH %bb.7 ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: bb.7: - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_1]], implicit-def $scc + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_term1]], implicit-def $scc ; POSTWT-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.2 @@ -709,7 +696,7 @@ body: | ; POSTWT-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[COPY]], [[S_MOV_B32_]], implicit $exec ; POSTWT-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_]], $exec_lo, implicit-def $scc ; POSTWT-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[V_CMP_EQ_U32_e64_]] - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_term:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32_term [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.1 @@ -726,9 +713,8 @@ body: | ; POSTWT-NEXT: bb.4: ; POSTWT-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[COPY2]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[COPY2]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_term]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[COPY2]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.2 @@ -739,7 +725,7 @@ body: | ; POSTWT-NEXT: S_BRANCH %bb.3 ; POSTWT-NEXT: {{ $}} ; 
POSTWT-NEXT: bb.3: - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_1]], implicit-def $scc + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc ; POSTWT-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.2 @@ -1031,7 +1017,7 @@ body: | ; POSTWT-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; POSTWT-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[COPY]], killed [[S_MOV_B32_]], implicit $exec ; POSTWT-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_term:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32_term [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.1 @@ -1043,7 +1029,7 @@ body: | ; POSTWT-NEXT: S_BRANCH %bb.2 ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: bb.2: - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_]], implicit-def $scc + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_term]], implicit-def $scc ; POSTWT-NEXT: S_ENDPGM 0 ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: bb.3: @@ -1096,7 +1082,7 @@ body: | ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 0, killed $vgpr5, implicit $exec ; POSTWT-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_term:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32_term [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.1 @@ -1136,9 +1122,8 @@ body: | ; POSTWT-NEXT: bb.7: ; POSTWT-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000) ; 
POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[V_CMP_EQ_U32_e64_]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_term]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_term1:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32_term [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.5 @@ -1149,7 +1134,7 @@ body: | ; POSTWT-NEXT: S_BRANCH %bb.6 ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: bb.6: - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_1]], implicit-def $scc + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_term1]], implicit-def $scc ; POSTWT-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1(0x40000000), %bb.5(0x40000000) @@ -1251,7 +1236,7 @@ body: | ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] ; POSTWT-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 0, killed $vgpr5, implicit $exec ; POSTWT-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_1]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_term:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32_term [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.4 @@ -1264,9 +1249,8 @@ body: | ; POSTWT-NEXT: bb.7: ; POSTWT-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_1]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, 
[[V_CMP_EQ_U32_e64_1]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[V_CMP_EQ_U32_e64_1]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_term]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_term1:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32_term [[V_CMP_EQ_U32_e64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.5 @@ -1277,7 +1261,7 @@ body: | ; POSTWT-NEXT: S_BRANCH %bb.6 ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: bb.6: - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_1]], implicit-def $scc + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_term1]], implicit-def $scc ; POSTWT-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1(0x80000000) diff --git a/llvm/test/CodeGen/AMDGPU/WaveTransform/wavetransform-divergent-cf.ll b/llvm/test/CodeGen/AMDGPU/WaveTransform/wavetransform-divergent-cf.ll index 3fcb2aa0a282a..0fce2ddbd7be6 100644 --- a/llvm/test/CodeGen/AMDGPU/WaveTransform/wavetransform-divergent-cf.ll +++ b/llvm/test/CodeGen/AMDGPU/WaveTransform/wavetransform-divergent-cf.ll @@ -38,7 +38,8 @@ define amdgpu_cs void @triangle(ptr addrspace(1) %out, i32 %val) { ; CHECK-LABEL: triangle: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, 15, v2 -; CHECK-NEXT: s_xor_b64 exec, vcc, exec +; CHECK-NEXT: s_xor_b64 s[0:1], vcc, exec +; CHECK-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; CHECK-NEXT: ; divergent control-flow edge ; CHECK-NEXT: s_cbranch_execz .LBB0_2 ; CHECK-NEXT: .LBB0_1: ; %then @@ -82,17 +83,19 @@ define amdgpu_cs void @double_triangle(ptr addrspace(1) %out, i32 %val1, i32 %va ; CHECK-LABEL: double_triangle: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, 15, v2 -; CHECK-NEXT: s_xor_b64 exec, vcc, exec +; CHECK-NEXT: s_xor_b64 s[0:1], vcc, exec +; CHECK-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; CHECK-NEXT: ; divergent control-flow edge ; 
CHECK-NEXT: s_cbranch_execz .LBB1_2 ; CHECK-NEXT: .LBB1_1: ; %then1 ; CHECK-NEXT: v_mov_b32_e32 v2, 1 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: .LBB1_2: ; %mid -; CHECK-NEXT: s_or_b64 exec, exec, vcc +; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] ; CHECK-NEXT: s_movk_i32 s0, 0x65 ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s0, v3 -; CHECK-NEXT: s_xor_b64 exec, vcc, exec +; CHECK-NEXT: s_xor_b64 s[0:1], vcc, exec +; CHECK-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; CHECK-NEXT: ; divergent control-flow edge ; CHECK-NEXT: s_cbranch_execz .LBB1_4 ; CHECK-NEXT: .LBB1_3: ; %then2 @@ -137,16 +140,16 @@ define amdgpu_cs void @diamond(ptr addrspace(1) %out, i32 %val) { ; CHECK-LABEL: diamond: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, 15, v2 -; CHECK-NEXT: s_xor_b64 exec, vcc, exec +; CHECK-NEXT: s_xor_b64 s[0:1], vcc, exec +; CHECK-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; CHECK-NEXT: ; divergent control-flow edge ; CHECK-NEXT: s_cbranch_execz .LBB2_2 ; CHECK-NEXT: .LBB2_1: ; %then ; CHECK-NEXT: v_mov_b32_e32 v2, 1 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: .LBB2_2: -; CHECK-NEXT: s_or_b64 exec, exec, vcc -; CHECK-NEXT: s_xor_b64 s[0:1], exec, vcc -; CHECK-NEXT: s_mov_b64 exec, vcc +; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] +; CHECK-NEXT: s_and_saveexec_b64 s[0:1], vcc ; CHECK-NEXT: ; divergent control-flow edge ; CHECK-NEXT: s_cbranch_execz .LBB2_4 ; CHECK-NEXT: .LBB2_3: ; %else @@ -191,33 +194,31 @@ define amdgpu_cs void @nested_if(ptr addrspace(1) %out, i32 %val1, i32 %val2) { ; CHECK-LABEL: nested_if: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, 15, v2 -; CHECK-NEXT: s_xor_b64 exec, vcc, exec +; CHECK-NEXT: s_xor_b64 s[0:1], vcc, exec +; CHECK-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; CHECK-NEXT: ; divergent control-flow edge ; CHECK-NEXT: s_cbranch_execz .LBB3_2 ; CHECK-NEXT: .LBB3_1: ; %outer_then ; CHECK-NEXT: v_mov_b32_e32 v2, 1 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: .LBB3_2: 
-; CHECK-NEXT: s_or_b64 exec, exec, vcc -; CHECK-NEXT: s_xor_b64 s[0:1], exec, vcc -; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CHECK-NEXT: s_mov_b64 exec, vcc +; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] +; CHECK-NEXT: s_and_saveexec_b64 s[0:1], vcc ; CHECK-NEXT: ; divergent control-flow edge ; CHECK-NEXT: s_cbranch_execz .LBB3_7 ; CHECK-NEXT: .LBB3_3: ; %outer_else ; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, 31, v3 -; CHECK-NEXT: s_xor_b64 exec, vcc, exec +; CHECK-NEXT: s_xor_b64 s[2:3], vcc, exec +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], s[2:3] ; CHECK-NEXT: ; divergent control-flow edge ; CHECK-NEXT: s_cbranch_execz .LBB3_5 ; CHECK-NEXT: .LBB3_4: ; %inner_then ; CHECK-NEXT: v_mov_b32_e32 v2, 2 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: .LBB3_5: -; CHECK-NEXT: s_or_b64 exec, exec, vcc -; CHECK-NEXT: s_xor_b64 s[2:3], exec, vcc -; CHECK-NEXT: s_and_b64 s[2:3], s[2:3], exec +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc ; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_mov_b64 exec, vcc ; CHECK-NEXT: ; divergent control-flow edge ; CHECK-NEXT: s_cbranch_execz .LBB3_7 ; CHECK-NEXT: .LBB3_6: ; %inner_else @@ -270,7 +271,7 @@ define amdgpu_cs void @cascaded_if_shared_target(ptr addrspace(1) %out, i32 %val ; CHECK-NEXT: s_xor_b64 s[4:5], vcc, exec ; CHECK-NEXT: s_mov_b64 s[2:3], -1 ; CHECK-NEXT: s_mov_b64 s[0:1], 0 -; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; CHECK-NEXT: ; divergent control-flow edge ; CHECK-NEXT: s_cbranch_execz .LBB4_2 ; CHECK-NEXT: .LBB4_1: ; %path_a @@ -278,9 +279,8 @@ define amdgpu_cs void @cascaded_if_shared_target(ptr addrspace(1) %out, i32 %val ; CHECK-NEXT: v_cmp_gt_u32_e64 s[0:1], 32, v3 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: .LBB4_2: -; CHECK-NEXT: s_or_b64 exec, exec, vcc -; CHECK-NEXT: s_xor_b64 s[4:5], exec, vcc -; CHECK-NEXT: s_mov_b64 exec, vcc +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: 
s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: ; divergent control-flow edge ; CHECK-NEXT: s_cbranch_execz .LBB4_4 ; CHECK-NEXT: .LBB4_3: ; %path_b @@ -290,8 +290,7 @@ define amdgpu_cs void @cascaded_if_shared_target(ptr addrspace(1) %out, i32 %val ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: .LBB4_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[0:1] -; CHECK-NEXT: s_mov_b64 exec, s[0:1] +; CHECK-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; CHECK-NEXT: ; divergent control-flow edge ; CHECK-NEXT: s_cbranch_execz .LBB4_6 ; CHECK-NEXT: .LBB4_5: ; %target @@ -352,8 +351,9 @@ define amdgpu_cs void @uniform_bypass_divergent(ptr addrspace(1) %out, i32 %val1 ; CHECK-NEXT: s_mov_b64 s[0:1], 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB5_3 ; CHECK-NEXT: ; %bb.1: ; %div_block -; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 31, v3 -; CHECK-NEXT: s_xor_b64 exec, s[0:1], exec +; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, 31, v3 +; CHECK-NEXT: s_xor_b64 s[0:1], vcc, exec +; CHECK-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; CHECK-NEXT: ; divergent control-flow edge ; CHECK-NEXT: s_cbranch_execz .LBB5_5 ; CHECK-NEXT: .LBB5_2: ; %then @@ -421,10 +421,9 @@ define amdgpu_cs void @self_loop(ptr addrspace(1) %out, i32 %val) { ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_add_i32 s2, s2, 1 ; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s2, v2 -; CHECK-NEXT: s_xor_b64 s[4:5], exec, vcc ; CHECK-NEXT: v_mov_b32_e32 v3, s2 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; CHECK-NEXT: s_mov_b64 exec, vcc ; CHECK-NEXT: ; divergent control-flow edge ; CHECK-NEXT: s_cbranch_execnz .LBB6_1 ; CHECK-NEXT: .LBB6_2: ; %exit @@ -480,10 +479,8 @@ define amdgpu_cs void @loop_two_exits(ptr addrspace(1) %out, i32 %val1, i32 %val ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, s4, v4 ; CHECK-NEXT: s_xor_b64 s[6:7], vcc, exec -; CHECK-NEXT: s_xor_b64 s[8:9], exec, s[6:7] -; CHECK-NEXT: s_and_b64 
s[8:9], s[8:9], exec -; CHECK-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; CHECK-NEXT: s_mov_b64 exec, s[6:7] +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] +; CHECK-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] ; CHECK-NEXT: ; divergent control-flow edge ; CHECK-NEXT: s_cbranch_execz .LBB7_4 ; CHECK-NEXT: .LBB7_2: ; %body @@ -497,11 +494,9 @@ define amdgpu_cs void @loop_two_exits(ptr addrspace(1) %out, i32 %val1, i32 %val ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 1, v7 ; CHECK-NEXT: s_xor_b64 s[6:7], vcc, exec -; CHECK-NEXT: s_xor_b64 s[8:9], exec, s[6:7] -; CHECK-NEXT: s_and_b64 s[8:9], s[8:9], exec -; CHECK-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] ; CHECK-NEXT: global_store_dword v[5:6], v2, off -; CHECK-NEXT: s_mov_b64 exec, s[6:7] +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] +; CHECK-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] ; CHECK-NEXT: ; divergent control-flow edge ; CHECK-NEXT: s_cbranch_execz .LBB7_4 ; CHECK-NEXT: .LBB7_3: ; %latch diff --git a/llvm/test/CodeGen/AMDGPU/WaveTransform/wavetransform-fix-missing-defs.mir b/llvm/test/CodeGen/AMDGPU/WaveTransform/wavetransform-fix-missing-defs.mir index 58a80d459f0d4..57b6079dd94ca 100644 --- a/llvm/test/CodeGen/AMDGPU/WaveTransform/wavetransform-fix-missing-defs.mir +++ b/llvm/test/CodeGen/AMDGPU/WaveTransform/wavetransform-fix-missing-defs.mir @@ -26,7 +26,7 @@ body: | ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[COPY]], [[S_MOV_B32_]], implicit $exec ; CHECK-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_]], $exec_lo, implicit-def $scc - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_]] + ; CHECK-NEXT: [[S_AND_SAVEEXEC_B32_term:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32_term [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.1 @@ 
-39,7 +39,7 @@ body: | ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_]], implicit-def $scc + ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_term]], implicit-def $scc ; CHECK-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.2 @@ -87,7 +87,7 @@ body: | ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[COPY]], [[S_MOV_B32_]], implicit $exec ; CHECK-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_]], $exec_lo, implicit-def $scc ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_]] + ; CHECK-NEXT: [[S_AND_SAVEEXEC_B32_term:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32_term [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.1 @@ -101,9 +101,8 @@ body: | ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_]], implicit-def $scc - ; CHECK-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_]], implicit-def $scc - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[V_CMP_EQ_U32_e64_]] + ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_term]], implicit-def $scc + ; CHECK-NEXT: [[S_AND_SAVEEXEC_B32_term1:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32_term [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.2 @@ -115,7 +114,7 @@ body: | ; CHECK-NEXT: S_BRANCH %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_1]], implicit-def $scc + ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, 
[[S_AND_SAVEEXEC_B32_term1]], implicit-def $scc ; CHECK-NEXT: $vgpr1 = COPY [[DEF]] ; CHECK-NEXT: S_ENDPGM 0 bb.0: @@ -170,7 +169,7 @@ body: | ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[COPY]], [[S_MOV_B32_]], implicit $exec ; CHECK-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_]], $exec_lo, implicit-def $scc ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_]] + ; CHECK-NEXT: [[S_AND_SAVEEXEC_B32_term:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32_term [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.1 @@ -184,9 +183,8 @@ body: | ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_]], implicit-def $scc - ; CHECK-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_]], implicit-def $scc - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[V_CMP_EQ_U32_e64_]] + ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_term]], implicit-def $scc + ; CHECK-NEXT: [[S_AND_SAVEEXEC_B32_term1:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32_term [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.2 @@ -198,7 +196,7 @@ body: | ; CHECK-NEXT: S_BRANCH %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_1]], implicit-def $scc + ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_term1]], implicit-def $scc ; CHECK-NEXT: $sgpr1 = COPY [[DEF]] ; CHECK-NEXT: S_ENDPGM 0 bb.0: @@ -252,7 +250,7 @@ body: | ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; CHECK-NEXT: 
[[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[COPY]], [[S_MOV_B32_]], implicit $exec ; CHECK-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_]], $exec_lo, implicit-def $scc - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_]] + ; CHECK-NEXT: [[S_AND_SAVEEXEC_B32_term:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32_term [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.1 @@ -266,9 +264,8 @@ body: | ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_]], implicit-def $scc - ; CHECK-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_]], implicit-def $scc - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[V_CMP_EQ_U32_e64_]] + ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_term]], implicit-def $scc + ; CHECK-NEXT: [[S_AND_SAVEEXEC_B32_term1:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32_term [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.2 @@ -280,7 +277,7 @@ body: | ; CHECK-NEXT: S_BRANCH %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_1]], implicit-def $scc + ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_term1]], implicit-def $scc ; CHECK-NEXT: $vgpr1 = COPY undef [[V_MOV_B32_e32_]] ; CHECK-NEXT: S_ENDPGM 0 bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/WaveTransform/wavetransform-inlineasm-br.mir b/llvm/test/CodeGen/AMDGPU/WaveTransform/wavetransform-inlineasm-br.mir index 6498c3fd33f03..b0cf178750a45 100644 --- a/llvm/test/CodeGen/AMDGPU/WaveTransform/wavetransform-inlineasm-br.mir +++ 
b/llvm/test/CodeGen/AMDGPU/WaveTransform/wavetransform-inlineasm-br.mir @@ -52,7 +52,7 @@ body: | ; POSTWT-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[V_CMP_EQ_U32_e64_]] ; POSTWT-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; POSTWT-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_term:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32_term [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.8, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.1 @@ -70,9 +70,8 @@ body: | ; POSTWT-NEXT: bb.8 (inlineasm-br-indirect-target): ; POSTWT-NEXT: successors: %bb.2(0x40000000), %bb.7(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[COPY2]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[COPY2]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_term]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[COPY2]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.2 @@ -82,17 +81,16 @@ body: | ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; POSTWT-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[COPY]], [[S_MOV_B32_4]], implicit $exec - ; POSTWT-NEXT: [[S_XOR_B32_2:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_1]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_2]], [[S_XOR_B32_2]], implicit-def $scc + ; POSTWT-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_1]], $exec_lo, implicit-def $scc + ; POSTWT-NEXT: 
[[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_2]], [[S_XOR_B32_1]], implicit-def $scc ; POSTWT-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = COPY [[V_CMP_EQ_U32_e64_1]] ; POSTWT-NEXT: S_BRANCH %bb.7 ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: bb.7: ; POSTWT-NEXT: successors: %bb.3(0x40000000), %bb.6(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_1]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_3:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_MOV_B32_2]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_2]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_1:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_MOV_B32_2]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.3 @@ -105,9 +103,8 @@ body: | ; POSTWT-NEXT: bb.6: ; POSTWT-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_3]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_4:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_MOV_B32_3]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_3]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_1]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_2:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_MOV_B32_3]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.4 @@ -118,7 +115,7 @@ body: | ; POSTWT-NEXT: S_BRANCH %bb.5 ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: bb.5: - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_4]], implicit-def $scc + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_2]], implicit-def $scc ; POSTWT-NEXT: S_ENDPGM 0 
bb.0: successors: %bb.1, %bb.2 diff --git a/llvm/test/CodeGen/AMDGPU/WaveTransform/wavetransform-natural-loops.mir b/llvm/test/CodeGen/AMDGPU/WaveTransform/wavetransform-natural-loops.mir index 07aecdd15ea6e..2feca0a00d9c5 100644 --- a/llvm/test/CodeGen/AMDGPU/WaveTransform/wavetransform-natural-loops.mir +++ b/llvm/test/CodeGen/AMDGPU/WaveTransform/wavetransform-natural-loops.mir @@ -26,9 +26,8 @@ body: | ; POSTWT-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], 1, implicit-def $scc ; POSTWT-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[S_ADD_U32_]], [[COPY]], implicit $exec ; POSTWT-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U32_]] - ; POSTWT-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[V_CMP_NE_U32_e64_]], implicit-def $scc - ; POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], [[S_XOR_B32_]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[V_CMP_NE_U32_e64_]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[V_CMP_NE_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], [[S_AND_SAVEEXEC_B32_]], implicit-def $scc ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.1 @@ -83,9 +82,7 @@ body: | ; POSTWT-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U32_]] ; POSTWT-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[V_CMP_NE_U32_e64_]] ; POSTWT-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_XOR_B32_]], implicit-def $scc - ; POSTWT-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_1]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[S_XOR_B32_]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_XOR_B32_]], 
implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.2 @@ -101,11 +98,9 @@ body: | ; POSTWT-NEXT: bb.4: ; POSTWT-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_B32_]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_2:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[COPY3]], implicit-def $scc - ; POSTWT-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_2]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], [[S_AND_B32_1]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[COPY3]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_1:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[COPY3]], implicit-def $exec, implicit-def $scc, implicit $exec + ; POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], [[S_AND_SAVEEXEC_B32_1]], implicit-def $scc ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.1 @@ -175,10 +170,8 @@ body: | ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[S_ADD_U32_]], [[COPY1]], implicit $exec ; POSTWT-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U32_]] - ; POSTWT-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[V_CMP_NE_U32_e64_]], implicit-def $scc - ; POSTWT-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], [[S_AND_B32_]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[V_CMP_NE_U32_e64_]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[V_CMP_NE_U32_e64_]], implicit-def $exec, 
implicit-def $scc, implicit $exec + ; POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], [[S_AND_SAVEEXEC_B32_]], implicit-def $scc ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.1 @@ -242,9 +235,7 @@ body: | ; POSTWT-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U32_]] ; POSTWT-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[V_CMP_NE_U32_e64_]] ; POSTWT-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_XOR_B32_]], implicit-def $scc - ; POSTWT-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_1]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[S_XOR_B32_]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.2 @@ -261,11 +252,9 @@ body: | ; POSTWT-NEXT: bb.4: ; POSTWT-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_B32_]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_2:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[COPY3]], implicit-def $scc - ; POSTWT-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_2]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], [[S_AND_B32_1]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[COPY3]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_1:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[COPY3]], implicit-def $exec, implicit-def $scc, implicit $exec + ; POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], 
[[S_AND_SAVEEXEC_B32_1]], implicit-def $scc ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.1 @@ -430,10 +419,8 @@ body: | ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; POSTWT-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[COPY3]], [[COPY]], implicit $exec - ; POSTWT-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[V_CMP_NE_U32_e64_]], implicit-def $scc - ; POSTWT-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], [[S_AND_B32_]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[V_CMP_NE_U32_e64_]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[V_CMP_NE_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], [[S_AND_SAVEEXEC_B32_]], implicit-def $scc ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.2 @@ -443,10 +430,8 @@ body: | ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY3]], 1, 0, implicit $exec ; POSTWT-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[V_ADD_U32_e64_]], [[COPY1]], implicit $exec - ; POSTWT-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[V_CMP_NE_U32_e64_1]], implicit-def $scc - ; POSTWT-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_1]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_]], [[S_AND_B32_1]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[V_CMP_NE_U32_e64_1]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_1:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[V_CMP_NE_U32_e64_1]], implicit-def $exec, implicit-def $scc, implicit 
$exec + ; POSTWT-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_]], [[S_AND_SAVEEXEC_B32_1]], implicit-def $scc ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.3 @@ -457,11 +442,9 @@ body: | ; POSTWT-NEXT: [[V_CMP_NE_U32_e64_2:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[V_ADD_U32_e64_]], [[COPY2]], implicit $exec ; POSTWT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] ; POSTWT-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_2]], [[V_CMP_NE_U32_e64_2]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_2:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_2]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_3:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_XOR_B32_2]], implicit-def $scc - ; POSTWT-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_3]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_]], [[S_AND_B32_2]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[S_XOR_B32_2]] + ; POSTWT-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_2]], $exec_lo, implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_2:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; POSTWT-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_]], [[S_AND_SAVEEXEC_B32_2]], implicit-def $scc ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.2 @@ -470,10 +453,8 @@ body: | ; POSTWT-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_MOV_B32_]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_4:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_MOV_B32_2]], implicit-def $scc - ; POSTWT-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_4]], $exec_lo, implicit-def $scc - ; 
POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], [[S_AND_B32_3]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_2]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_3:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_MOV_B32_2]], implicit-def $exec, implicit-def $scc, implicit $exec + ; POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], [[S_AND_SAVEEXEC_B32_3]], implicit-def $scc ; POSTWT-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec @@ -656,9 +637,8 @@ body: | ; POSTWT-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY3]], 1, 0, implicit $exec ; POSTWT-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[V_ADD_U32_e64_]], [[COPY1]], implicit $exec ; POSTWT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; POSTWT-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[V_CMP_NE_U32_e64_]], implicit-def $scc - ; POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], [[S_XOR_B32_]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[V_CMP_NE_U32_e64_]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[V_CMP_NE_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], [[S_AND_SAVEEXEC_B32_]], implicit-def $scc ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.2 @@ -671,10 +651,8 @@ body: | ; POSTWT-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[V_ADD_U32_e64_1]], [[COPY]], implicit $exec ; POSTWT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_1]] ; POSTWT-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_3]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, 
[[S_MOV_B32_3]], implicit-def $scc - ; POSTWT-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_1]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_2]], [[S_AND_B32_1]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_3]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_1:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_MOV_B32_3]], implicit-def $exec, implicit-def $scc, implicit $exec + ; POSTWT-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_2]], [[S_AND_SAVEEXEC_B32_1]], implicit-def $scc ; POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec @@ -759,9 +737,7 @@ body: | ; POSTWT-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY3]], 1, implicit-def $scc ; POSTWT-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[S_ADD_U32_]], [[COPY]], implicit $exec ; POSTWT-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[V_CMP_NE_U32_e64_]], implicit-def $scc - ; POSTWT-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_1]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[V_CMP_NE_U32_e64_]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[V_CMP_NE_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.10, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.2 @@ -772,17 +748,15 @@ body: | ; POSTWT-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[S_ADD_U32_]], [[COPY1]], implicit $exec ; POSTWT-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U32_]] ; POSTWT-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = COPY [[V_CMP_NE_U32_e64_1]] - ; POSTWT-NEXT: [[S_XOR_B32_2:%[0-9]+]]:sreg_32 = S_XOR_B32 
[[V_CMP_NE_U32_e64_1]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_2]], [[S_XOR_B32_2]], implicit-def $scc + ; POSTWT-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_1]], $exec_lo, implicit-def $scc + ; POSTWT-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_2]], [[S_XOR_B32_1]], implicit-def $scc ; POSTWT-NEXT: S_BRANCH %bb.10 ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: bb.10: ; POSTWT-NEXT: successors: %bb.3(0x40000000), %bb.9(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_B32_]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_3:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_XOR_B32_]], implicit-def $scc - ; POSTWT-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_3]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[S_XOR_B32_]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_1:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.9, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.3 @@ -793,16 +767,15 @@ body: | ; POSTWT-NEXT: [[V_CMP_NE_U32_e64_2:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[S_ADD_U32_]], [[COPY2]], implicit $exec ; POSTWT-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U32_]] ; POSTWT-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_4]], [[V_CMP_NE_U32_e64_2]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_4:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_2]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], [[S_XOR_B32_4]], implicit-def $scc + ; POSTWT-NEXT: [[S_XOR_B32_2:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_2]], $exec_lo, implicit-def $scc + ; POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 
[[S_MOV_B32_1]], [[S_XOR_B32_2]], implicit-def $scc ; POSTWT-NEXT: S_BRANCH %bb.9 ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: bb.7: ; POSTWT-NEXT: successors: %bb.4(0x40000000), %bb.6(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, %28, implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_5:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_MOV_B32_2]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_2]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, %20, implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_2:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_MOV_B32_2]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.4 @@ -815,11 +788,9 @@ body: | ; POSTWT-NEXT: bb.9: ; POSTWT-NEXT: successors: %bb.1(0x40000000), %bb.8(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_B32_1]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_6:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_MOV_B32_4]], implicit-def $scc - ; POSTWT-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_6]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_3]], [[S_AND_B32_2]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_4]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_1]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_3:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_MOV_B32_4]], implicit-def $exec, implicit-def $scc, implicit $exec + ; POSTWT-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_3]], [[S_AND_SAVEEXEC_B32_3]], implicit-def $scc ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.8, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.1 @@ -828,8 +799,7 @@ body: | ; POSTWT-NEXT: successors: %bb.5(0x40000000), %bb.7(0x40000000) ; POSTWT-NEXT: {{ $}} ; 
POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_MOV_B32_3]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_7:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_MOV_B32_1]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_1]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_4:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_MOV_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.5 @@ -840,7 +810,7 @@ body: | ; POSTWT-NEXT: S_BRANCH %bb.7 ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: bb.6: - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_5]], implicit-def $scc + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_2]], implicit-def $scc ; POSTWT-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1 @@ -939,10 +909,8 @@ body: | ; POSTWT-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U32_]] ; POSTWT-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_1]], $exec_lo, implicit-def $scc ; POSTWT-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_2]], [[S_XOR_B32_]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[V_CMP_NE_U32_e64_1]], implicit-def $scc - ; POSTWT-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_1]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_3]], [[S_AND_B32_]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[V_CMP_NE_U32_e64_1]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[V_CMP_NE_U32_e64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; POSTWT-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_3]], [[S_AND_SAVEEXEC_B32_]], implicit-def $scc ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.8, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.1 @@ -952,12 +920,10 @@ body: | ; POSTWT-NEXT: 
{{ $}} ; POSTWT-NEXT: [[V_CMP_NE_U32_e64_2:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[S_ADD_U32_]], [[COPY2]], implicit $exec ; POSTWT-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U32_]] - ; POSTWT-NEXT: [[S_XOR_B32_2:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_2]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], [[S_XOR_B32_2]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_3:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[V_CMP_NE_U32_e64_2]], implicit-def $scc - ; POSTWT-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_3]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_3]], [[S_AND_B32_1]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[V_CMP_NE_U32_e64_2]] + ; POSTWT-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_2]], $exec_lo, implicit-def $scc + ; POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], [[S_XOR_B32_1]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_1:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[V_CMP_NE_U32_e64_2]], implicit-def $exec, implicit-def $scc, implicit $exec + ; POSTWT-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_3]], [[S_AND_SAVEEXEC_B32_1]], implicit-def $scc ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.8, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.1 @@ -965,9 +931,8 @@ body: | ; POSTWT-NEXT: bb.7: ; POSTWT-NEXT: successors: %bb.4(0x40000000), %bb.6(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, %21, implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_4:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_MOV_B32_2]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_2]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, %16, implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_2:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_MOV_B32_2]], implicit-def $exec, 
implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.4 @@ -981,8 +946,7 @@ body: | ; POSTWT-NEXT: successors: %bb.5(0x40000000), %bb.7(0x40000000) ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_MOV_B32_3]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_5:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_MOV_B32_1]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_1]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_3:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_MOV_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.5 @@ -993,7 +957,7 @@ body: | ; POSTWT-NEXT: S_BRANCH %bb.7 ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: bb.6: - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_4]], implicit-def $scc + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_2]], implicit-def $scc ; POSTWT-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1 @@ -1084,9 +1048,7 @@ body: | ; POSTWT-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY3]], 1, implicit-def $scc ; POSTWT-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[S_ADD_U32_]], [[COPY]], implicit $exec ; POSTWT-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[V_CMP_NE_U32_e64_]], implicit-def $scc - ; POSTWT-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_1]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[V_CMP_NE_U32_e64_]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[V_CMP_NE_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.10, implicit 
$exec ; POSTWT-NEXT: S_BRANCH %bb.2 @@ -1105,10 +1067,8 @@ body: | ; POSTWT-NEXT: bb.10: ; POSTWT-NEXT: successors: %bb.3(0x40000000), %bb.9(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_B32_]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_2:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_XOR_B32_]], implicit-def $scc - ; POSTWT-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_2]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[S_XOR_B32_]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_1:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.9, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.3 @@ -1127,9 +1087,8 @@ body: | ; POSTWT-NEXT: bb.7: ; POSTWT-NEXT: successors: %bb.4(0x40000000), %bb.6(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, %28, implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_3:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_MOV_B32_2]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_2]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, %20, implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_2:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_MOV_B32_2]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.4 @@ -1142,11 +1101,9 @@ body: | ; POSTWT-NEXT: bb.9: ; POSTWT-NEXT: successors: %bb.1(0x40000000), %bb.8(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_B32_1]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_4:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_MOV_B32_4]], implicit-def $scc - ; POSTWT-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 
[[S_XOR_B32_4]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_3]], [[S_AND_B32_2]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_4]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_1]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_3:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_MOV_B32_4]], implicit-def $exec, implicit-def $scc, implicit $exec + ; POSTWT-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_3]], [[S_AND_SAVEEXEC_B32_3]], implicit-def $scc ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.8, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.1 @@ -1155,8 +1112,7 @@ body: | ; POSTWT-NEXT: successors: %bb.5(0x40000000), %bb.7(0x40000000) ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_MOV_B32_3]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_5:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_MOV_B32_1]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_1]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_4:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_MOV_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.5 @@ -1167,7 +1123,7 @@ body: | ; POSTWT-NEXT: S_BRANCH %bb.7 ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: bb.6: - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_3]], implicit-def $scc + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_2]], implicit-def $scc ; POSTWT-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/WaveTransform/wavetransform-partial-join.mir b/llvm/test/CodeGen/AMDGPU/WaveTransform/wavetransform-partial-join.mir index a8552d79e5420..5ff06967b8dd3 100644 --- a/llvm/test/CodeGen/AMDGPU/WaveTransform/wavetransform-partial-join.mir +++ 
b/llvm/test/CodeGen/AMDGPU/WaveTransform/wavetransform-partial-join.mir @@ -31,8 +31,8 @@ body: | ; POSTWT-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_]], $exec_lo, implicit-def $scc ; POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; POSTWT-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; POSTWT-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[V_CMP_EQ_U32_e64_]] - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; POSTWT-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_AND_SAVEEXEC_B32_]] ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.9, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.1 @@ -43,8 +43,8 @@ body: | ; POSTWT-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[COPY1]], [[S_MOV_B32_]], implicit $exec ; POSTWT-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_1]], $exec_lo, implicit-def $scc ; POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = COPY [[V_CMP_EQ_U32_e64_1]] - ; POSTWT-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = S_OR_B32 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_1]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_1:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_XOR_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; POSTWT-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_SAVEEXEC_B32_]], [[S_AND_SAVEEXEC_B32_1]], implicit-def $scc ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.9, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.3 @@ -53,8 +53,7 @@ body: | ; POSTWT-NEXT: successors: %bb.2(0x40000000), %bb.8(0x40000000) ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[COPY3]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_2:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, 
[[V_CMP_EQ_U32_e64_]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[V_CMP_EQ_U32_e64_]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_2:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.8, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.2 @@ -63,8 +62,8 @@ body: | ; POSTWT-NEXT: successors: %bb.8(0x80000000) ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: [[V_CMP_EQ_U32_e64_2:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[COPY2]], [[S_MOV_B32_]], implicit $exec - ; POSTWT-NEXT: [[S_XOR_B32_3:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_2]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], [[S_XOR_B32_3]], implicit-def $scc + ; POSTWT-NEXT: [[S_XOR_B32_2:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_2]], $exec_lo, implicit-def $scc + ; POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], [[S_XOR_B32_2]], implicit-def $scc ; POSTWT-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = COPY [[V_CMP_EQ_U32_e64_2]] ; POSTWT-NEXT: S_BRANCH %bb.8 ; POSTWT-NEXT: {{ $}} @@ -76,9 +75,8 @@ body: | ; POSTWT-NEXT: bb.8: ; POSTWT-NEXT: successors: %bb.4(0x40000000), %bb.7(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_2]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_4:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_MOV_B32_1]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_1]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_2]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_3:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_MOV_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.4 @@ -91,9 +89,8 @@ body: | ; POSTWT-NEXT: bb.7: ; POSTWT-NEXT: 
successors: %bb.5(0x40000000), %bb.6(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_4]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_5:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_MOV_B32_2]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_2]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_3]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_4:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_MOV_B32_2]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.5 @@ -104,7 +101,7 @@ body: | ; POSTWT-NEXT: S_BRANCH %bb.6 ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: bb.6: - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_5]], implicit-def $scc + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_4]], implicit-def $scc ; POSTWT-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.2 @@ -184,7 +181,7 @@ body: | ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[COPY1]], [[S_MOV_B32_]], implicit $exec ; POSTWT-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_term:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32_term [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.8, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.3 @@ -195,10 +192,8 @@ body: | ; POSTWT-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[COPY2]], [[S_MOV_B32_]], implicit $exec ; POSTWT-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_1]], $exec_lo, implicit-def $scc ; POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = COPY [[V_CMP_EQ_U32_e64_1]] - ; POSTWT-NEXT: 
[[S_XOR_B32_2:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_XOR_B32_1]], implicit-def $scc - ; POSTWT-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_2]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = COPY [[S_AND_B32_]] - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[S_XOR_B32_1]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_XOR_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; POSTWT-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = COPY [[S_AND_SAVEEXEC_B32_]] ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.4 @@ -211,11 +206,9 @@ body: | ; POSTWT-NEXT: bb.8: ; POSTWT-NEXT: successors: %bb.4(0x40000000), %bb.6(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_3:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_]], implicit-def $scc - ; POSTWT-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_3]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = COPY [[S_AND_B32_1]] - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[V_CMP_EQ_U32_e64_]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_term]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_1:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; POSTWT-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = COPY [[S_AND_SAVEEXEC_B32_1]] ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.4 @@ -229,10 +222,8 @@ body: | ; POSTWT-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000) ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_MOV_B32_2]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_4:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_MOV_B32_1]], 
implicit-def $scc - ; POSTWT-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_4]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_3]], [[S_AND_B32_2]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_1]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_2:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_MOV_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; POSTWT-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_3]], [[S_AND_SAVEEXEC_B32_2]], implicit-def $scc ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.5 @@ -316,7 +307,7 @@ body: | ; POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; POSTWT-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; POSTWT-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_term:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32_term [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.10, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.1 @@ -332,9 +323,8 @@ body: | ; POSTWT-NEXT: bb.10: ; POSTWT-NEXT: successors: %bb.2(0x40000000), %bb.9(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[V_CMP_EQ_U32_e64_]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_term]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_term1:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32_term [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.9, 
implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.2 @@ -343,8 +333,8 @@ body: | ; POSTWT-NEXT: successors: %bb.9(0x80000000) ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[COPY2]], [[S_MOV_B32_]], implicit $exec - ; POSTWT-NEXT: [[S_XOR_B32_2:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_1]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_2]], [[S_XOR_B32_2]], implicit-def $scc + ; POSTWT-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_1]], $exec_lo, implicit-def $scc + ; POSTWT-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_2]], [[S_XOR_B32_1]], implicit-def $scc ; POSTWT-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = COPY [[V_CMP_EQ_U32_e64_1]] ; POSTWT-NEXT: S_BRANCH %bb.9 ; POSTWT-NEXT: {{ $}} @@ -356,9 +346,8 @@ body: | ; POSTWT-NEXT: bb.9: ; POSTWT-NEXT: successors: %bb.4(0x40000000), %bb.8(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_1]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_3:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_MOV_B32_2]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_2]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_term1]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_MOV_B32_2]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.8, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.4 @@ -371,9 +360,8 @@ body: | ; POSTWT-NEXT: bb.8: ; POSTWT-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_3]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_4:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_MOV_B32_3]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_3]] + ; POSTWT-NEXT: $exec_lo = 
S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_1:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_MOV_B32_3]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.5 @@ -384,7 +372,7 @@ body: | ; POSTWT-NEXT: S_BRANCH %bb.6 ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: bb.6: - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_4]], implicit-def $scc + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_1]], implicit-def $scc ; POSTWT-NEXT: S_ENDPGM 0 ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: bb.7: @@ -463,8 +451,8 @@ body: | ; POSTWT-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_]], $exec_lo, implicit-def $scc ; POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; POSTWT-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; POSTWT-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[V_CMP_EQ_U32_e64_]] - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; POSTWT-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_AND_SAVEEXEC_B32_]] ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.9, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.1 @@ -475,8 +463,8 @@ body: | ; POSTWT-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[COPY1]], [[S_MOV_B32_]], implicit $exec ; POSTWT-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_1]], $exec_lo, implicit-def $scc ; POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = COPY [[V_CMP_EQ_U32_e64_1]] - ; POSTWT-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = S_OR_B32 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_1]] + ; POSTWT-NEXT: 
[[S_AND_SAVEEXEC_B32_1:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_XOR_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; POSTWT-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_SAVEEXEC_B32_]], [[S_AND_SAVEEXEC_B32_1]], implicit-def $scc ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.9, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.3 @@ -485,8 +473,7 @@ body: | ; POSTWT-NEXT: successors: %bb.2(0x40000000), %bb.8(0x40000000) ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[COPY3]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_2:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[V_CMP_EQ_U32_e64_]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_2:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.8, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.2 @@ -509,9 +496,8 @@ body: | ; POSTWT-NEXT: bb.8: ; POSTWT-NEXT: successors: %bb.4(0x40000000), %bb.7(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_2]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_3:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_MOV_B32_1]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_1]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_2]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_3:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_MOV_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.4 @@ -524,9 +510,8 @@ body: | ; POSTWT-NEXT: bb.7: ; POSTWT-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_3]], implicit-def 
$scc - ; POSTWT-NEXT: [[S_XOR_B32_4:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_MOV_B32_2]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_2]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_3]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_4:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_MOV_B32_2]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.5 @@ -537,7 +522,7 @@ body: | ; POSTWT-NEXT: S_BRANCH %bb.6 ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: bb.6: - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_4]], implicit-def $scc + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_4]], implicit-def $scc ; POSTWT-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.2 @@ -610,7 +595,7 @@ body: | ; POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; POSTWT-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; POSTWT-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_term:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32_term [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.10, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.1 @@ -626,9 +611,8 @@ body: | ; POSTWT-NEXT: bb.10: ; POSTWT-NEXT: successors: %bb.2(0x40000000), %bb.9(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[V_CMP_EQ_U32_e64_]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_term]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_term1:%[0-9]+]]:sreg_32 = 
S_AND_SAVEEXEC_B32_term [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.9, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.2 @@ -651,9 +635,8 @@ body: | ; POSTWT-NEXT: bb.9: ; POSTWT-NEXT: successors: %bb.4(0x40000000), %bb.8(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_1]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_2:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_MOV_B32_2]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_2]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_term1]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_MOV_B32_2]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.8, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.4 @@ -666,9 +649,8 @@ body: | ; POSTWT-NEXT: bb.8: ; POSTWT-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_2]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_3:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_MOV_B32_3]], implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_3]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_1:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_MOV_B32_3]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.5 @@ -679,7 +661,7 @@ body: | ; POSTWT-NEXT: S_BRANCH %bb.6 ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: bb.6: - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_3]], implicit-def $scc + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_1]], 
implicit-def $scc ; POSTWT-NEXT: S_ENDPGM 0 ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: bb.7: @@ -764,7 +746,7 @@ body: | ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[COPY1]], [[S_MOV_B32_]], implicit $exec ; POSTWT-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_term:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32_term [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.3 @@ -784,11 +766,9 @@ body: | ; POSTWT-NEXT: bb.7: ; POSTWT-NEXT: successors: %bb.4(0x40000000), %bb.6(0x40000000) ; POSTWT-NEXT: {{ $}} - ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[V_CMP_EQ_U32_e64_]], implicit-def $scc - ; POSTWT-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_1]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = COPY [[S_AND_B32_]] - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[V_CMP_EQ_U32_e64_]] + ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_AND_SAVEEXEC_B32_term]], implicit-def $scc + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = COPY [[S_AND_SAVEEXEC_B32_]] ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.4 @@ -892,8 +872,8 @@ body: | ; POSTWT-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[COPY2]], [[S_MOV_B32_]], implicit $exec ; POSTWT-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_]], $exec_lo, 
implicit-def $scc ; POSTWT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = COPY [[V_CMP_EQ_U32_e64_]] - ; POSTWT-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = COPY [[V_CMP_EQ_U32_e64_]] - ; POSTWT-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; POSTWT-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = COPY [[S_AND_SAVEEXEC_B32_]] ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.4 @@ -912,10 +892,8 @@ body: | ; POSTWT-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000) ; POSTWT-NEXT: {{ $}} ; POSTWT-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_MOV_B32_2]], implicit-def $scc - ; POSTWT-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_MOV_B32_1]], implicit-def $scc - ; POSTWT-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_1]], $exec_lo, implicit-def $scc - ; POSTWT-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = COPY [[S_AND_B32_]] - ; POSTWT-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_1]] + ; POSTWT-NEXT: [[S_AND_SAVEEXEC_B32_1:%[0-9]+]]:sreg_32 = S_AND_SAVEEXEC_B32 [[S_MOV_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; POSTWT-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = COPY [[S_AND_SAVEEXEC_B32_1]] ; POSTWT-NEXT: SI_WAVE_CF_EDGE implicit-def $scc ; POSTWT-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec ; POSTWT-NEXT: S_BRANCH %bb.5 diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll index cc8a19ed9a9bd..8278b4c660bc0 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll @@ -555,7 +555,8 @@ define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; 
GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB11_2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.private @@ -567,9 +568,8 @@ define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword a0, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword a1, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB11_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB11_4 ; GFX90A-NEXT: .LBB11_3: ; %atomicrmw.global @@ -605,7 +605,8 @@ define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB11_2 ; GFX950-NEXT: .LBB11_1: ; %atomicrmw.private @@ -616,9 +617,8 @@ define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: scratch_store_dwordx2 v0, a[0:1], off ; GFX950-NEXT: .LBB11_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB11_4 ; GFX950-NEXT: .LBB11_3: ; %atomicrmw.global @@ -660,7 +660,8 @@ define void @flat_atomic_xchg_i64_ret_a_v(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: 
s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB12_2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.private @@ -672,9 +673,8 @@ define void @flat_atomic_xchg_i64_ret_a_v(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword a0, v2, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword a1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB12_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB12_4 ; GFX90A-NEXT: .LBB12_3: ; %atomicrmw.global @@ -708,7 +708,8 @@ define void @flat_atomic_xchg_i64_ret_a_v(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB12_2 ; GFX950-NEXT: .LBB12_1: ; %atomicrmw.private @@ -719,9 +720,8 @@ define void @flat_atomic_xchg_i64_ret_a_v(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: scratch_store_dwordx2 v2, a[0:1], off ; GFX950-NEXT: .LBB12_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB12_4 ; GFX950-NEXT: .LBB12_3: ; %atomicrmw.global @@ -756,10 +756,11 @@ define void @flat_atomic_xchg_i64_ret_v_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: 
v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB13_2 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.private @@ -771,9 +772,8 @@ define void @flat_atomic_xchg_i64_ret_v_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB13_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB13_4 ; GFX90A-NEXT: .LBB13_3: ; %atomicrmw.global @@ -804,10 +804,11 @@ define void @flat_atomic_xchg_i64_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB13_2 ; GFX950-NEXT: .LBB13_1: ; %atomicrmw.private @@ -818,9 +819,8 @@ define void @flat_atomic_xchg_i64_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: scratch_store_dwordx2 v0, v[2:3], off ; GFX950-NEXT: .LBB13_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB13_4 ; GFX950-NEXT: .LBB13_3: ; 
%atomicrmw.global @@ -857,10 +857,11 @@ define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB14_2 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.private @@ -872,9 +873,8 @@ define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB14_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB14_4 ; GFX90A-NEXT: .LBB14_3: ; %atomicrmw.global @@ -903,10 +903,11 @@ define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB14_2 ; GFX950-NEXT: .LBB14_1: ; %atomicrmw.private @@ -917,9 +918,8 @@ define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB14_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; 
GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB14_4 ; GFX950-NEXT: .LBB14_3: ; %atomicrmw.global @@ -954,10 +954,11 @@ define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB15_2 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.private @@ -969,9 +970,8 @@ define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB15_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB15_4 ; GFX90A-NEXT: .LBB15_3: ; %atomicrmw.global @@ -1000,10 +1000,11 @@ define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB15_2 ; GFX950-NEXT: .LBB15_1: ; %atomicrmw.private @@ -1014,9 +1015,8 @@ define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 0 ; 
GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB15_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB15_4 ; GFX950-NEXT: .LBB15_3: ; %atomicrmw.global @@ -1051,10 +1051,11 @@ define void @flat_atomic_xchg_i64_ret_av_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB16_2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.private @@ -1066,9 +1067,8 @@ define void @flat_atomic_xchg_i64_ret_av_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB16_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB16_4 ; GFX90A-NEXT: .LBB16_3: ; %atomicrmw.global @@ -1099,10 +1099,11 @@ define void @flat_atomic_xchg_i64_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent 
control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB16_2 ; GFX950-NEXT: .LBB16_1: ; %atomicrmw.private @@ -1113,9 +1114,8 @@ define void @flat_atomic_xchg_i64_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: scratch_store_dwordx2 v0, v[2:3], off ; GFX950-NEXT: .LBB16_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB16_4 ; GFX950-NEXT: .LBB16_3: ; %atomicrmw.global @@ -1157,7 +1157,8 @@ define void @flat_atomic_xchg_i64_ret_a_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB17_2 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.private @@ -1169,9 +1170,8 @@ define void @flat_atomic_xchg_i64_ret_a_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword a0, v2, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword a1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB17_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB17_4 ; GFX90A-NEXT: .LBB17_3: ; %atomicrmw.global @@ -1205,7 +1205,8 @@ define void @flat_atomic_xchg_i64_ret_a_av(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 
s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB17_2 ; GFX950-NEXT: .LBB17_1: ; %atomicrmw.private @@ -1216,9 +1217,8 @@ define void @flat_atomic_xchg_i64_ret_a_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: scratch_store_dwordx2 v2, a[0:1], off ; GFX950-NEXT: .LBB17_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB17_4 ; GFX950-NEXT: .LBB17_3: ; %atomicrmw.global @@ -1253,10 +1253,11 @@ define void @flat_atomic_xchg_i64_ret_v_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB18_2 ; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.private @@ -1268,9 +1269,8 @@ define void @flat_atomic_xchg_i64_ret_v_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB18_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB18_4 ; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.global @@ -1299,10 +1299,11 @@ define void @flat_atomic_xchg_i64_ret_v_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 
+; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB18_2 ; GFX950-NEXT: .LBB18_1: ; %atomicrmw.private @@ -1313,9 +1314,8 @@ define void @flat_atomic_xchg_i64_ret_v_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB18_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB18_4 ; GFX950-NEXT: .LBB18_3: ; %atomicrmw.global @@ -1347,10 +1347,11 @@ define void @flat_atomic_xchg_i64_noret_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB19_2 ; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.private @@ -1359,9 +1360,8 @@ define void @flat_atomic_xchg_i64_noret_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword a1, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: buffer_store_dword a0, v0, s[0:3], 0 offen ; GFX90A-NEXT: .LBB19_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB19_4 ; GFX90A-NEXT: .LBB19_3: ; 
%atomicrmw.global @@ -1387,7 +1387,8 @@ define void @flat_atomic_xchg_i64_noret_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB19_2 ; GFX950-NEXT: .LBB19_1: ; %atomicrmw.private @@ -1396,9 +1397,8 @@ define void @flat_atomic_xchg_i64_noret_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v0, -1, v0, s[0:1] ; GFX950-NEXT: scratch_store_dwordx2 v0, a[0:1], off ; GFX950-NEXT: .LBB19_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB19_4 ; GFX950-NEXT: .LBB19_3: ; %atomicrmw.global @@ -1425,10 +1425,11 @@ define void @flat_atomic_xchg_i64_noret_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB20_2 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.private @@ -1437,9 +1438,8 @@ define void @flat_atomic_xchg_i64_noret_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GFX90A-NEXT: .LBB20_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; 
GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB20_4 ; GFX90A-NEXT: .LBB20_3: ; %atomicrmw.global @@ -1462,10 +1462,11 @@ define void @flat_atomic_xchg_i64_noret_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB20_2 ; GFX950-NEXT: .LBB20_1: ; %atomicrmw.private @@ -1474,9 +1475,8 @@ define void @flat_atomic_xchg_i64_noret_av(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v0, -1, v0, s[0:1] ; GFX950-NEXT: scratch_store_dwordx2 v0, v[2:3], off ; GFX950-NEXT: .LBB20_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB20_4 ; GFX950-NEXT: .LBB20_3: ; %atomicrmw.global @@ -1523,10 +1523,9 @@ define void @flat_atomic_xor_expansion_i32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: .LBB21_2: ; %atomicrmw.end @@ -1559,10 +1558,9 @@ define void @flat_atomic_xor_expansion_i32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; 
GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB21_1 ; GFX950-NEXT: .LBB21_2: ; %atomicrmw.end @@ -1603,10 +1601,9 @@ define void @flat_atomic_xor_expansion_i32_ret_a_v(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: .LBB22_2: ; %atomicrmw.end @@ -1638,10 +1635,9 @@ define void @flat_atomic_xor_expansion_i32_ret_a_v(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB22_1 ; GFX950-NEXT: .LBB22_2: ; %atomicrmw.end @@ -1680,10 +1676,9 @@ define void @flat_atomic_xor_expansion_i32_ret_v_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 
; GFX90A-NEXT: .LBB23_2: ; %atomicrmw.end @@ -1715,10 +1710,9 @@ define void @flat_atomic_xor_expansion_i32_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB23_1 ; GFX950-NEXT: .LBB23_2: ; %atomicrmw.end @@ -1758,10 +1752,9 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.end @@ -1792,10 +1785,9 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB24_1 ; GFX950-NEXT: .LBB24_2: ; %atomicrmw.end @@ -1834,10 +1826,9 @@ define void @flat_atomic_xor_expansion_i32_ret_av_v(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; 
GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 ; GFX90A-NEXT: .LBB25_2: ; %atomicrmw.end @@ -1868,10 +1859,9 @@ define void @flat_atomic_xor_expansion_i32_ret_av_v(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB25_1 ; GFX950-NEXT: .LBB25_2: ; %atomicrmw.end @@ -1910,10 +1900,9 @@ define void @flat_atomic_xor_expansion_i32_ret_av_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: .LBB26_2: ; %atomicrmw.end @@ -1945,10 +1934,9 @@ define void @flat_atomic_xor_expansion_i32_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB26_1 ; GFX950-NEXT: .LBB26_2: ; %atomicrmw.end @@ -1989,10 +1977,9 @@ define void @flat_atomic_xor_expansion_i32_ret_a_av(ptr %ptr) #0 { ; 
GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: .LBB27_2: ; %atomicrmw.end @@ -2024,10 +2011,9 @@ define void @flat_atomic_xor_expansion_i32_ret_a_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB27_1 ; GFX950-NEXT: .LBB27_2: ; %atomicrmw.end @@ -2066,10 +2052,9 @@ define void @flat_atomic_xor_expansion_i32_ret_v_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: .LBB28_2: ; %atomicrmw.end @@ -2100,10 +2085,9 @@ define void @flat_atomic_xor_expansion_i32_ret_v_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; 
GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB28_1 ; GFX950-NEXT: .LBB28_2: ; %atomicrmw.end @@ -2192,10 +2176,9 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 ; GFX90A-NEXT: .LBB29_2: ; %atomicrmw.end @@ -2329,10 +2312,9 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB29_1 ; GFX950-NEXT: .LBB29_2: ; %atomicrmw.end @@ -2427,10 +2409,9 @@ define void @flat_atomic_xor_expansion_i32_noret_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: .LBB30_2: ; %atomicrmw.end @@ -2459,10 +2440,9 @@ define void @flat_atomic_xor_expansion_i32_noret_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: 
v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB30_1 ; GFX950-NEXT: .LBB30_2: ; %atomicrmw.end @@ -2496,10 +2476,9 @@ define void @flat_atomic_xor_expansion_i32_noret_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: .LBB31_2: ; %atomicrmw.end @@ -2527,10 +2506,9 @@ define void @flat_atomic_xor_expansion_i32_noret_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB31_1 ; GFX950-NEXT: .LBB31_2: ; %atomicrmw.end @@ -2562,7 +2540,7 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: s_mov_b64 s[6:7], -1 -; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB32_2 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.private @@ -2579,10 +2557,8 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 { ; 
GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB32_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB32_5 ; GFX90A-NEXT: .LBB32_3: ; %atomicrmw.global @@ -2601,13 +2577,12 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB32_4 ; GFX90A-NEXT: .LBB32_5: ; %atomicrmw.phi @@ -2632,7 +2607,7 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec -; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB32_2 ; GFX950-NEXT: .LBB32_1: ; %atomicrmw.private @@ -2647,10 +2622,8 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB32_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: 
s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB32_5 ; GFX950-NEXT: .LBB32_3: ; %atomicrmw.global @@ -2671,10 +2644,9 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB32_4 ; GFX950-NEXT: .LBB32_5: ; %atomicrmw.phi @@ -2707,7 +2679,7 @@ define void @flat_atomic_xor_expansion_i64_ret_a_v(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: s_mov_b64 s[6:7], -1 -; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB33_2 ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.private @@ -2722,10 +2694,8 @@ define void @flat_atomic_xor_expansion_i64_ret_a_v(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB33_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB33_5 ; GFX90A-NEXT: .LBB33_3: ; %atomicrmw.global @@ -2745,10 +2715,9 @@ define void 
@flat_atomic_xor_expansion_i64_ret_a_v(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB33_4 ; GFX90A-NEXT: .LBB33_5: ; %atomicrmw.phi @@ -2773,7 +2742,7 @@ define void @flat_atomic_xor_expansion_i64_ret_a_v(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec -; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB33_2 ; GFX950-NEXT: .LBB33_1: ; %atomicrmw.private @@ -2786,10 +2755,8 @@ define void @flat_atomic_xor_expansion_i64_ret_a_v(ptr %ptr) #0 { ; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-NEXT: .LBB33_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB33_5 ; GFX950-NEXT: .LBB33_3: ; %atomicrmw.global @@ -2809,10 +2776,9 @@ define void @flat_atomic_xor_expansion_i64_ret_a_v(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: 
s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB33_4 ; GFX950-NEXT: .LBB33_5: ; %atomicrmw.phi @@ -2843,7 +2809,7 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB34_2 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.private @@ -2860,10 +2826,8 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB34_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB34_5 ; GFX90A-NEXT: .LBB34_3: ; %atomicrmw.global @@ -2882,13 +2846,12 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB34_4 ; GFX90A-NEXT: .LBB34_5: ; %atomicrmw.phi @@ -2912,7 +2875,7 @@ define void 
@flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec -; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB34_2 ; GFX950-NEXT: .LBB34_1: ; %atomicrmw.private @@ -2927,10 +2890,8 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB34_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB34_5 ; GFX950-NEXT: .LBB34_3: ; %atomicrmw.global @@ -2951,10 +2912,9 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB34_4 ; GFX950-NEXT: .LBB34_5: ; %atomicrmw.phi @@ -2985,7 +2945,7 @@ define void @flat_atomic_xor_expansion_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB35_2 ; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.private @@ -3000,10 +2960,8 @@ define void 
@flat_atomic_xor_expansion_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB35_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB35_5 ; GFX90A-NEXT: .LBB35_3: ; %atomicrmw.global @@ -3023,10 +2981,9 @@ define void @flat_atomic_xor_expansion_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB35_4 ; GFX90A-NEXT: .LBB35_5: ; %atomicrmw.phi @@ -3050,7 +3007,7 @@ define void @flat_atomic_xor_expansion_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec -; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB35_2 ; GFX950-NEXT: .LBB35_1: ; %atomicrmw.private @@ -3063,10 +3020,8 @@ define void @flat_atomic_xor_expansion_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-NEXT: .LBB35_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX950-NEXT: 
s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB35_5 ; GFX950-NEXT: .LBB35_3: ; %atomicrmw.global @@ -3086,10 +3041,9 @@ define void @flat_atomic_xor_expansion_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB35_4 ; GFX950-NEXT: .LBB35_5: ; %atomicrmw.phi @@ -3120,7 +3074,7 @@ define void @flat_atomic_xor_expansion_i64_ret_av_v(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB36_2 ; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.private @@ -3135,10 +3089,8 @@ define void @flat_atomic_xor_expansion_i64_ret_av_v(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB36_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB36_5 ; GFX90A-NEXT: .LBB36_3: ; %atomicrmw.global @@ -3158,10 +3110,9 @@ define void @flat_atomic_xor_expansion_i64_ret_av_v(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; 
GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB36_4 ; GFX90A-NEXT: .LBB36_5: ; %atomicrmw.phi @@ -3185,7 +3136,7 @@ define void @flat_atomic_xor_expansion_i64_ret_av_v(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec -; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB36_2 ; GFX950-NEXT: .LBB36_1: ; %atomicrmw.private @@ -3198,10 +3149,8 @@ define void @flat_atomic_xor_expansion_i64_ret_av_v(ptr %ptr) #0 { ; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-NEXT: .LBB36_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB36_5 ; GFX950-NEXT: .LBB36_3: ; %atomicrmw.global @@ -3221,10 +3170,9 @@ define void @flat_atomic_xor_expansion_i64_ret_av_v(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB36_4 ; GFX950-NEXT: .LBB36_5: ; %atomicrmw.phi @@ 
-3255,7 +3203,7 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB37_2 ; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.private @@ -3272,10 +3220,8 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB37_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB37_5 ; GFX90A-NEXT: .LBB37_3: ; %atomicrmw.global @@ -3294,13 +3240,12 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB37_4 ; GFX90A-NEXT: .LBB37_5: ; %atomicrmw.phi @@ -3324,7 +3269,7 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec 
-; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB37_2 ; GFX950-NEXT: .LBB37_1: ; %atomicrmw.private @@ -3339,10 +3284,8 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB37_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB37_5 ; GFX950-NEXT: .LBB37_3: ; %atomicrmw.global @@ -3363,10 +3306,9 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB37_4 ; GFX950-NEXT: .LBB37_5: ; %atomicrmw.phi @@ -3399,7 +3341,7 @@ define void @flat_atomic_xor_expansion_i64_ret_a_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: s_mov_b64 s[6:7], -1 -; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB38_2 ; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.private @@ -3414,10 +3356,8 @@ define void @flat_atomic_xor_expansion_i64_ret_a_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword 
v1, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB38_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB38_5 ; GFX90A-NEXT: .LBB38_3: ; %atomicrmw.global @@ -3437,10 +3377,9 @@ define void @flat_atomic_xor_expansion_i64_ret_a_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB38_4 ; GFX90A-NEXT: .LBB38_5: ; %atomicrmw.phi @@ -3465,7 +3404,7 @@ define void @flat_atomic_xor_expansion_i64_ret_a_av(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec -; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB38_2 ; GFX950-NEXT: .LBB38_1: ; %atomicrmw.private @@ -3478,10 +3417,8 @@ define void @flat_atomic_xor_expansion_i64_ret_a_av(ptr %ptr) #0 { ; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-NEXT: .LBB38_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; 
GFX950-NEXT: s_cbranch_execz .LBB38_5 ; GFX950-NEXT: .LBB38_3: ; %atomicrmw.global @@ -3501,10 +3438,9 @@ define void @flat_atomic_xor_expansion_i64_ret_a_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB38_4 ; GFX950-NEXT: .LBB38_5: ; %atomicrmw.phi @@ -3535,7 +3471,7 @@ define void @flat_atomic_xor_expansion_i64_ret_v_av(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB39_2 ; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.private @@ -3550,10 +3486,8 @@ define void @flat_atomic_xor_expansion_i64_ret_v_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB39_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB39_5 ; GFX90A-NEXT: .LBB39_3: ; %atomicrmw.global @@ -3573,10 +3507,9 @@ define void @flat_atomic_xor_expansion_i64_ret_v_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; 
GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB39_4 ; GFX90A-NEXT: .LBB39_5: ; %atomicrmw.phi @@ -3600,7 +3533,7 @@ define void @flat_atomic_xor_expansion_i64_ret_v_av(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec -; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB39_2 ; GFX950-NEXT: .LBB39_1: ; %atomicrmw.private @@ -3613,10 +3546,8 @@ define void @flat_atomic_xor_expansion_i64_ret_v_av(ptr %ptr) #0 { ; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-NEXT: .LBB39_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB39_5 ; GFX950-NEXT: .LBB39_3: ; %atomicrmw.global @@ -3636,10 +3567,9 @@ define void @flat_atomic_xor_expansion_i64_ret_v_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB39_4 ; GFX950-NEXT: .LBB39_5: ; %atomicrmw.phi @@ -3671,7 +3601,7 @@ define void @flat_atomic_xor_expansion_i64_noret_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX90A-NEXT: s_xor_b64 
s[4:5], vcc, exec ; GFX90A-NEXT: s_mov_b64 s[6:7], -1 -; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB40_2 ; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.private @@ -3686,10 +3616,8 @@ define void @flat_atomic_xor_expansion_i64_noret_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB40_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB40_5 ; GFX90A-NEXT: .LBB40_3: ; %atomicrmw.global @@ -3709,10 +3637,9 @@ define void @flat_atomic_xor_expansion_i64_noret_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB40_4 ; GFX90A-NEXT: .LBB40_5: ; %atomicrmw.phi @@ -3734,7 +3661,7 @@ define void @flat_atomic_xor_expansion_i64_noret_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec -; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB40_2 ; GFX950-NEXT: .LBB40_1: ; %atomicrmw.private @@ -3747,10 +3674,8 @@ define void 
@flat_atomic_xor_expansion_i64_noret_a(ptr %ptr) #0 { ; GFX950-NEXT: v_xor_b32_e32 v0, v0, v6 ; GFX950-NEXT: scratch_store_dwordx2 v2, v[0:1], off ; GFX950-NEXT: .LBB40_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB40_5 ; GFX950-NEXT: .LBB40_3: ; %atomicrmw.global @@ -3770,10 +3695,9 @@ define void @flat_atomic_xor_expansion_i64_noret_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB40_4 ; GFX950-NEXT: .LBB40_5: ; %atomicrmw.phi @@ -3799,7 +3723,7 @@ define void @flat_atomic_xor_expansion_i64_noret_av(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB41_2 ; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.private @@ -3814,10 +3738,8 @@ define void @flat_atomic_xor_expansion_i64_noret_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB41_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; 
GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB41_5 ; GFX90A-NEXT: .LBB41_3: ; %atomicrmw.global @@ -3837,10 +3759,9 @@ define void @flat_atomic_xor_expansion_i64_noret_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB41_4 ; GFX90A-NEXT: .LBB41_5: ; %atomicrmw.phi @@ -3861,7 +3782,7 @@ define void @flat_atomic_xor_expansion_i64_noret_av(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec -; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB41_2 ; GFX950-NEXT: .LBB41_1: ; %atomicrmw.private @@ -3874,10 +3795,8 @@ define void @flat_atomic_xor_expansion_i64_noret_av(ptr %ptr) #0 { ; GFX950-NEXT: v_xor_b32_e32 v0, v0, v6 ; GFX950-NEXT: scratch_store_dwordx2 v2, v[0:1], off ; GFX950-NEXT: .LBB41_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB41_5 ; GFX950-NEXT: .LBB41_3: ; %atomicrmw.global @@ -3897,10 +3816,9 @@ define void @flat_atomic_xor_expansion_i64_noret_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 
s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB41_4 ; GFX950-NEXT: .LBB41_5: ; %atomicrmw.phi @@ -4431,7 +4349,8 @@ define void @flat_atomic_xor_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB53_2 ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.private @@ -4448,9 +4367,8 @@ define void @flat_atomic_xor_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB53_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB53_4 ; GFX90A-NEXT: .LBB53_3: ; %atomicrmw.global @@ -4481,7 +4399,8 @@ define void @flat_atomic_xor_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB53_2 ; GFX950-NEXT: .LBB53_1: ; %atomicrmw.private @@ -4496,9 +4415,8 @@ define void @flat_atomic_xor_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], 
off ; GFX950-NEXT: .LBB53_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB53_4 ; GFX950-NEXT: .LBB53_3: ; %atomicrmw.global @@ -4537,7 +4455,8 @@ define void @flat_atomic_xor_i64_ret_a_v(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB54_2 ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.private @@ -4552,9 +4471,8 @@ define void @flat_atomic_xor_i64_ret_a_v(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB54_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB54_4 ; GFX90A-NEXT: .LBB54_3: ; %atomicrmw.global @@ -4583,7 +4501,8 @@ define void @flat_atomic_xor_i64_ret_a_v(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB54_2 ; GFX950-NEXT: .LBB54_1: ; %atomicrmw.private @@ -4596,9 +4515,8 @@ define void @flat_atomic_xor_i64_ret_a_v(ptr %ptr) #0 { ; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2 
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB54_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB54_4 ; GFX950-NEXT: .LBB54_3: ; %atomicrmw.global @@ -4630,10 +4548,11 @@ define void @flat_atomic_xor_i64_ret_v_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB55_2 ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.private @@ -4650,9 +4569,8 @@ define void @flat_atomic_xor_i64_ret_v_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB55_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB55_4 ; GFX90A-NEXT: .LBB55_3: ; %atomicrmw.global @@ -4678,10 +4596,11 @@ define void @flat_atomic_xor_i64_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow 
edge ; GFX950-NEXT: s_cbranch_execz .LBB55_2 ; GFX950-NEXT: .LBB55_1: ; %atomicrmw.private @@ -4696,9 +4615,8 @@ define void @flat_atomic_xor_i64_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB55_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB55_4 ; GFX950-NEXT: .LBB55_3: ; %atomicrmw.global @@ -4732,10 +4650,11 @@ define void @flat_atomic_xor_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB56_2 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.private @@ -4750,9 +4669,8 @@ define void @flat_atomic_xor_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB56_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB56_4 ; GFX90A-NEXT: .LBB56_3: ; %atomicrmw.global @@ -4776,10 +4694,11 @@ define void @flat_atomic_xor_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, 
exec ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB56_2 ; GFX950-NEXT: .LBB56_1: ; %atomicrmw.private @@ -4792,9 +4711,8 @@ define void @flat_atomic_xor_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB56_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB56_4 ; GFX950-NEXT: .LBB56_3: ; %atomicrmw.global @@ -4826,10 +4744,11 @@ define void @flat_atomic_xor_i64_ret_av_v(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB57_2 ; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.private @@ -4844,9 +4763,8 @@ define void @flat_atomic_xor_i64_ret_av_v(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB57_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB57_4 ; GFX90A-NEXT: .LBB57_3: ; %atomicrmw.global @@ -4870,10 
+4788,11 @@ define void @flat_atomic_xor_i64_ret_av_v(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB57_2 ; GFX950-NEXT: .LBB57_1: ; %atomicrmw.private @@ -4886,9 +4805,8 @@ define void @flat_atomic_xor_i64_ret_av_v(ptr %ptr) #0 { ; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB57_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB57_4 ; GFX950-NEXT: .LBB57_3: ; %atomicrmw.global @@ -4920,10 +4838,11 @@ define void @flat_atomic_xor_i64_ret_av_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB58_2 ; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.private @@ -4940,9 +4859,8 @@ define void @flat_atomic_xor_i64_ret_av_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB58_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: 
s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB58_4 ; GFX90A-NEXT: .LBB58_3: ; %atomicrmw.global @@ -4968,10 +4886,11 @@ define void @flat_atomic_xor_i64_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB58_2 ; GFX950-NEXT: .LBB58_1: ; %atomicrmw.private @@ -4986,9 +4905,8 @@ define void @flat_atomic_xor_i64_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB58_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB58_4 ; GFX950-NEXT: .LBB58_3: ; %atomicrmw.global @@ -5027,7 +4945,8 @@ define void @flat_atomic_xor_i64_ret_a_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB59_2 ; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.private @@ -5042,9 +4961,8 @@ define void @flat_atomic_xor_i64_ret_a_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB59_2: -; 
GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB59_4 ; GFX90A-NEXT: .LBB59_3: ; %atomicrmw.global @@ -5073,7 +4991,8 @@ define void @flat_atomic_xor_i64_ret_a_av(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB59_2 ; GFX950-NEXT: .LBB59_1: ; %atomicrmw.private @@ -5086,9 +5005,8 @@ define void @flat_atomic_xor_i64_ret_a_av(ptr %ptr) #0 { ; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB59_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB59_4 ; GFX950-NEXT: .LBB59_3: ; %atomicrmw.global @@ -5120,10 +5038,11 @@ define void @flat_atomic_xor_i64_ret_v_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB60_2 ; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.private @@ -5138,9 +5057,8 @@ define void @flat_atomic_xor_i64_ret_v_av(ptr %ptr) #0 { ; 
GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB60_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB60_4 ; GFX90A-NEXT: .LBB60_3: ; %atomicrmw.global @@ -5164,10 +5082,11 @@ define void @flat_atomic_xor_i64_ret_v_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB60_2 ; GFX950-NEXT: .LBB60_1: ; %atomicrmw.private @@ -5180,9 +5099,8 @@ define void @flat_atomic_xor_i64_ret_v_av(ptr %ptr) #0 { ; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB60_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB60_4 ; GFX950-NEXT: .LBB60_3: ; %atomicrmw.global @@ -5218,7 +5136,8 @@ define void @flat_atomic_xor_i64_noret_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB61_2 ; 
GFX90A-NEXT: .LBB61_1: ; %atomicrmw.private @@ -5233,9 +5152,8 @@ define void @flat_atomic_xor_i64_noret_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB61_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB61_4 ; GFX90A-NEXT: .LBB61_3: ; %atomicrmw.global @@ -5261,7 +5179,8 @@ define void @flat_atomic_xor_i64_noret_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB61_2 ; GFX950-NEXT: .LBB61_1: ; %atomicrmw.private @@ -5274,9 +5193,8 @@ define void @flat_atomic_xor_i64_noret_a(ptr %ptr) #0 { ; GFX950-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-NEXT: .LBB61_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB61_4 ; GFX950-NEXT: .LBB61_3: ; %atomicrmw.global @@ -5303,10 +5221,11 @@ define void @flat_atomic_xor_i64_noret_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_xor_b64 exec, vcc, 
exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB62_2 ; GFX90A-NEXT: .LBB62_1: ; %atomicrmw.private @@ -5321,9 +5240,8 @@ define void @flat_atomic_xor_i64_noret_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB62_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB62_4 ; GFX90A-NEXT: .LBB62_3: ; %atomicrmw.global @@ -5344,10 +5262,11 @@ define void @flat_atomic_xor_i64_noret_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB62_2 ; GFX950-NEXT: .LBB62_1: ; %atomicrmw.private @@ -5360,9 +5279,8 @@ define void @flat_atomic_xor_i64_noret_av(ptr %ptr) #0 { ; GFX950-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-NEXT: .LBB62_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB62_4 ; GFX950-NEXT: .LBB62_3: ; %atomicrmw.global @@ -5618,10 +5536,9 @@ define void @flat_atomic_nand_i32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 
0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB69_1 ; GFX90A-NEXT: .LBB69_2: ; %atomicrmw.end @@ -5652,10 +5569,9 @@ define void @flat_atomic_nand_i32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB69_1 ; GFX950-NEXT: .LBB69_2: ; %atomicrmw.end @@ -5691,10 +5607,9 @@ define void @flat_atomic_nand_i32_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB70_1 ; GFX90A-NEXT: .LBB70_2: ; %atomicrmw.end @@ -5723,10 +5638,9 @@ define void @flat_atomic_nand_i32_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB70_1 ; GFX950-NEXT: .LBB70_2: ; 
%atomicrmw.end @@ -6260,10 +6174,9 @@ define void @flat_atomic_usub_cond_i32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB85_1 ; GFX90A-NEXT: .LBB85_2: ; %atomicrmw.end @@ -6296,10 +6209,9 @@ define void @flat_atomic_usub_cond_i32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB85_1 ; GFX950-NEXT: .LBB85_2: ; %atomicrmw.end @@ -6336,10 +6248,9 @@ define void @flat_atomic_usub_cond_i32_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB86_1 ; GFX90A-NEXT: .LBB86_2: ; %atomicrmw.end @@ -6370,10 +6281,9 @@ define void @flat_atomic_usub_cond_i32_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; 
GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB86_1 ; GFX950-NEXT: .LBB86_2: ; %atomicrmw.end @@ -6408,10 +6318,9 @@ define void @flat_atomic_usub_sat_i32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB87_1 ; GFX90A-NEXT: .LBB87_2: ; %atomicrmw.end @@ -6442,10 +6351,9 @@ define void @flat_atomic_usub_sat_i32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB87_1 ; GFX950-NEXT: .LBB87_2: ; %atomicrmw.end @@ -6480,10 +6388,9 @@ define void @flat_atomic_usub_sat_i32_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB88_1 ; GFX90A-NEXT: .LBB88_2: ; %atomicrmw.end @@ -6512,10 +6419,9 @@ define void @flat_atomic_usub_sat_i32_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 
v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB88_1 ; GFX950-NEXT: .LBB88_2: ; %atomicrmw.end @@ -6550,7 +6456,8 @@ define void @flat_atomic_add_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB89_2 ; GFX90A-NEXT: .LBB89_1: ; %atomicrmw.private @@ -6567,9 +6474,8 @@ define void @flat_atomic_add_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB89_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB89_4 ; GFX90A-NEXT: .LBB89_3: ; %atomicrmw.global @@ -6600,7 +6506,8 @@ define void @flat_atomic_add_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB89_2 ; GFX950-NEXT: .LBB89_1: ; %atomicrmw.private @@ -6614,9 +6521,8 @@ define void @flat_atomic_add_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: 
v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB89_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB89_4 ; GFX950-NEXT: .LBB89_3: ; %atomicrmw.global @@ -6648,10 +6554,11 @@ define void @flat_atomic_add_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB90_2 ; GFX90A-NEXT: .LBB90_1: ; %atomicrmw.private @@ -6666,9 +6573,8 @@ define void @flat_atomic_add_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB90_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB90_4 ; GFX90A-NEXT: .LBB90_3: ; %atomicrmw.global @@ -6692,10 +6598,11 @@ define void @flat_atomic_add_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; 
GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB90_2 ; GFX950-NEXT: .LBB90_1: ; %atomicrmw.private @@ -6707,9 +6614,8 @@ define void @flat_atomic_add_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3] ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB90_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB90_4 ; GFX950-NEXT: .LBB90_3: ; %atomicrmw.global @@ -6744,7 +6650,8 @@ define void @flat_atomic_sub_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB91_2 ; GFX90A-NEXT: .LBB91_1: ; %atomicrmw.private @@ -6761,9 +6668,8 @@ define void @flat_atomic_sub_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB91_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB91_4 ; GFX90A-NEXT: .LBB91_3: ; %atomicrmw.global @@ -6794,7 +6700,8 @@ define void @flat_atomic_sub_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_xor_b64 
s[0:1], vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB91_2 ; GFX950-NEXT: .LBB91_1: ; %atomicrmw.private @@ -6810,9 +6717,8 @@ define void @flat_atomic_sub_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB91_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB91_4 ; GFX950-NEXT: .LBB91_3: ; %atomicrmw.global @@ -6844,10 +6750,11 @@ define void @flat_atomic_sub_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB92_2 ; GFX90A-NEXT: .LBB92_1: ; %atomicrmw.private @@ -6862,9 +6769,8 @@ define void @flat_atomic_sub_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB92_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB92_4 ; GFX90A-NEXT: .LBB92_3: ; %atomicrmw.global @@ -6888,10 +6794,11 @@ define void @flat_atomic_sub_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: 
v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB92_2 ; GFX950-NEXT: .LBB92_1: ; %atomicrmw.private @@ -6905,9 +6812,8 @@ define void @flat_atomic_sub_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: v_subb_co_u32_e64 v3, s[0:1], v1, v3, s[0:1] ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB92_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB92_4 ; GFX950-NEXT: .LBB92_3: ; %atomicrmw.global @@ -6942,7 +6848,8 @@ define void @flat_atomic_and_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB93_2 ; GFX90A-NEXT: .LBB93_1: ; %atomicrmw.private @@ -6959,9 +6866,8 @@ define void @flat_atomic_and_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB93_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB93_4 ; GFX90A-NEXT: 
.LBB93_3: ; %atomicrmw.global @@ -6992,7 +6898,8 @@ define void @flat_atomic_and_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB93_2 ; GFX950-NEXT: .LBB93_1: ; %atomicrmw.private @@ -7007,9 +6914,8 @@ define void @flat_atomic_and_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB93_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB93_4 ; GFX950-NEXT: .LBB93_3: ; %atomicrmw.global @@ -7041,10 +6947,11 @@ define void @flat_atomic_and_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB94_2 ; GFX90A-NEXT: .LBB94_1: ; %atomicrmw.private @@ -7059,9 +6966,8 @@ define void @flat_atomic_and_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB94_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB94_4 ; GFX90A-NEXT: .LBB94_3: ; %atomicrmw.global @@ -7085,10 +6991,11 @@ define void @flat_atomic_and_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB94_2 ; GFX950-NEXT: .LBB94_1: ; %atomicrmw.private @@ -7101,9 +7008,8 @@ define void @flat_atomic_and_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: v_and_b32_e32 v2, v0, v2 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB94_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB94_4 ; GFX950-NEXT: .LBB94_3: ; %atomicrmw.global @@ -7140,7 +7046,7 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: s_mov_b64 s[6:7], -1 -; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB95_2 ; GFX90A-NEXT: .LBB95_1: ; %atomicrmw.private @@ -7159,10 +7065,8 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB95_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; 
GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB95_5 ; GFX90A-NEXT: .LBB95_3: ; %atomicrmw.global @@ -7179,13 +7083,12 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB95_4 ; GFX90A-NEXT: .LBB95_5: ; %atomicrmw.phi @@ -7212,7 +7115,7 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec -; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB95_2 ; GFX950-NEXT: .LBB95_1: ; %atomicrmw.private @@ -7229,10 +7132,8 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB95_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; 
GFX950-NEXT: s_cbranch_execz .LBB95_5 ; GFX950-NEXT: .LBB95_3: ; %atomicrmw.global @@ -7252,10 +7153,9 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB95_4 ; GFX950-NEXT: .LBB95_5: ; %atomicrmw.phi @@ -7287,7 +7187,7 @@ define void @flat_atomic_nand_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB96_2 ; GFX90A-NEXT: .LBB96_1: ; %atomicrmw.private @@ -7304,10 +7204,8 @@ define void @flat_atomic_nand_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB96_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB96_5 ; GFX90A-NEXT: .LBB96_3: ; %atomicrmw.global @@ -7325,10 +7223,9 @@ define void @flat_atomic_nand_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: 
s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB96_4 ; GFX90A-NEXT: .LBB96_5: ; %atomicrmw.phi @@ -7354,7 +7251,7 @@ define void @flat_atomic_nand_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec -; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB96_2 ; GFX950-NEXT: .LBB96_1: ; %atomicrmw.private @@ -7369,10 +7266,8 @@ define void @flat_atomic_nand_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: v_not_b32_e32 v2, v5 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB96_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB96_5 ; GFX950-NEXT: .LBB96_3: ; %atomicrmw.global @@ -7391,10 +7286,9 @@ define void @flat_atomic_nand_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB96_4 ; GFX950-NEXT: .LBB96_5: ; %atomicrmw.phi @@ -7426,7 +7320,8 @@ define void @flat_atomic_or_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 
v2, a0 -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB97_2 ; GFX90A-NEXT: .LBB97_1: ; %atomicrmw.private @@ -7443,9 +7338,8 @@ define void @flat_atomic_or_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB97_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB97_4 ; GFX90A-NEXT: .LBB97_3: ; %atomicrmw.global @@ -7476,7 +7370,8 @@ define void @flat_atomic_or_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB97_2 ; GFX950-NEXT: .LBB97_1: ; %atomicrmw.private @@ -7491,9 +7386,8 @@ define void @flat_atomic_or_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB97_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB97_4 ; GFX950-NEXT: .LBB97_3: ; %atomicrmw.global @@ -7525,10 +7419,11 @@ define void @flat_atomic_or_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: 
v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB98_2 ; GFX90A-NEXT: .LBB98_1: ; %atomicrmw.private @@ -7543,9 +7438,8 @@ define void @flat_atomic_or_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB98_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB98_4 ; GFX90A-NEXT: .LBB98_3: ; %atomicrmw.global @@ -7569,10 +7463,11 @@ define void @flat_atomic_or_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB98_2 ; GFX950-NEXT: .LBB98_1: ; %atomicrmw.private @@ -7585,9 +7480,8 @@ define void @flat_atomic_or_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB98_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; 
GFX950-NEXT: s_cbranch_execz .LBB98_4 ; GFX950-NEXT: .LBB98_3: ; %atomicrmw.global @@ -7622,7 +7516,8 @@ define void @flat_atomic_max_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB99_2 ; GFX90A-NEXT: .LBB99_1: ; %atomicrmw.private @@ -7640,9 +7535,8 @@ define void @flat_atomic_max_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: .LBB99_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB99_4 ; GFX90A-NEXT: .LBB99_3: ; %atomicrmw.global @@ -7673,7 +7567,8 @@ define void @flat_atomic_max_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB99_2 ; GFX950-NEXT: .LBB99_1: ; %atomicrmw.private @@ -7689,9 +7584,8 @@ define void @flat_atomic_max_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[0:1] ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB99_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: 
s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB99_4 ; GFX950-NEXT: .LBB99_3: ; %atomicrmw.global @@ -7723,10 +7617,11 @@ define void @flat_atomic_max_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB100_2 ; GFX90A-NEXT: .LBB100_1: ; %atomicrmw.private @@ -7741,9 +7636,8 @@ define void @flat_atomic_max_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB100_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB100_4 ; GFX90A-NEXT: .LBB100_3: ; %atomicrmw.global @@ -7767,10 +7661,11 @@ define void @flat_atomic_max_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB100_2 ; GFX950-NEXT: .LBB100_1: ; %atomicrmw.private @@ -7785,9 +7680,8 @@ define void @flat_atomic_max_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[0:1] ; GFX950-NEXT: 
scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB100_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB100_4 ; GFX950-NEXT: .LBB100_3: ; %atomicrmw.global @@ -7822,7 +7716,8 @@ define void @flat_atomic_min_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB101_2 ; GFX90A-NEXT: .LBB101_1: ; %atomicrmw.private @@ -7840,9 +7735,8 @@ define void @flat_atomic_min_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: .LBB101_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB101_4 ; GFX90A-NEXT: .LBB101_3: ; %atomicrmw.global @@ -7873,7 +7767,8 @@ define void @flat_atomic_min_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB101_2 ; GFX950-NEXT: .LBB101_1: ; %atomicrmw.private @@ -7889,9 +7784,8 @@ define void @flat_atomic_min_i64_ret_a_a(ptr %ptr) 
#0 { ; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[0:1] ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB101_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB101_4 ; GFX950-NEXT: .LBB101_3: ; %atomicrmw.global @@ -7923,10 +7817,11 @@ define void @flat_atomic_min_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB102_2 ; GFX90A-NEXT: .LBB102_1: ; %atomicrmw.private @@ -7941,9 +7836,8 @@ define void @flat_atomic_min_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB102_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB102_4 ; GFX90A-NEXT: .LBB102_3: ; %atomicrmw.global @@ -7967,10 +7861,11 @@ define void @flat_atomic_min_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: 
s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB102_2 ; GFX950-NEXT: .LBB102_1: ; %atomicrmw.private @@ -7985,9 +7880,8 @@ define void @flat_atomic_min_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[0:1] ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB102_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB102_4 ; GFX950-NEXT: .LBB102_3: ; %atomicrmw.global @@ -8022,7 +7916,8 @@ define void @flat_atomic_umax_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB103_2 ; GFX90A-NEXT: .LBB103_1: ; %atomicrmw.private @@ -8040,9 +7935,8 @@ define void @flat_atomic_umax_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: .LBB103_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB103_4 ; GFX90A-NEXT: .LBB103_3: ; %atomicrmw.global @@ -8073,7 +7967,8 @@ define void @flat_atomic_umax_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: s_xor_b64 
exec, vcc, exec +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB103_2 ; GFX950-NEXT: .LBB103_1: ; %atomicrmw.private @@ -8089,9 +7984,8 @@ define void @flat_atomic_umax_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[0:1] ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB103_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB103_4 ; GFX950-NEXT: .LBB103_3: ; %atomicrmw.global @@ -8123,10 +8017,11 @@ define void @flat_atomic_umax_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB104_2 ; GFX90A-NEXT: .LBB104_1: ; %atomicrmw.private @@ -8141,9 +8036,8 @@ define void @flat_atomic_umax_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB104_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB104_4 ; GFX90A-NEXT: .LBB104_3: ; %atomicrmw.global @@ -8167,10 +8061,11 @@ define void 
@flat_atomic_umax_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB104_2 ; GFX950-NEXT: .LBB104_1: ; %atomicrmw.private @@ -8185,9 +8080,8 @@ define void @flat_atomic_umax_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[0:1] ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB104_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB104_4 ; GFX950-NEXT: .LBB104_3: ; %atomicrmw.global @@ -8222,7 +8116,8 @@ define void @flat_atomic_umin_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB105_2 ; GFX90A-NEXT: .LBB105_1: ; %atomicrmw.private @@ -8240,9 +8135,8 @@ define void @flat_atomic_umin_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: .LBB105_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; 
divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB105_4 ; GFX90A-NEXT: .LBB105_3: ; %atomicrmw.global @@ -8273,7 +8167,8 @@ define void @flat_atomic_umin_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB105_2 ; GFX950-NEXT: .LBB105_1: ; %atomicrmw.private @@ -8289,9 +8184,8 @@ define void @flat_atomic_umin_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[0:1] ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB105_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB105_4 ; GFX950-NEXT: .LBB105_3: ; %atomicrmw.global @@ -8323,10 +8217,11 @@ define void @flat_atomic_umin_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB106_2 ; GFX90A-NEXT: .LBB106_1: ; %atomicrmw.private @@ -8341,9 +8236,8 @@ define void @flat_atomic_umin_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB106_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: 
s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB106_4 ; GFX90A-NEXT: .LBB106_3: ; %atomicrmw.global @@ -8367,10 +8261,11 @@ define void @flat_atomic_umin_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB106_2 ; GFX950-NEXT: .LBB106_1: ; %atomicrmw.private @@ -8385,9 +8280,8 @@ define void @flat_atomic_umin_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[0:1] ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB106_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB106_4 ; GFX950-NEXT: .LBB106_3: ; %atomicrmw.global @@ -8422,7 +8316,8 @@ define void @flat_atomic_uinc_wrap_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB107_2 ; GFX90A-NEXT: .LBB107_1: ; %atomicrmw.private @@ -8442,9 +8337,8 @@ define void @flat_atomic_uinc_wrap_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 
0 offen ; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB107_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB107_4 ; GFX90A-NEXT: .LBB107_3: ; %atomicrmw.global @@ -8475,7 +8369,8 @@ define void @flat_atomic_uinc_wrap_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB107_2 ; GFX950-NEXT: .LBB107_1: ; %atomicrmw.private @@ -8492,9 +8387,8 @@ define void @flat_atomic_uinc_wrap_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1] ; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB107_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB107_4 ; GFX950-NEXT: .LBB107_3: ; %atomicrmw.global @@ -8526,10 +8420,11 @@ define void @flat_atomic_uinc_wrap_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz 
.LBB108_2 ; GFX90A-NEXT: .LBB108_1: ; %atomicrmw.private @@ -8547,9 +8442,8 @@ define void @flat_atomic_uinc_wrap_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB108_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB108_4 ; GFX90A-NEXT: .LBB108_3: ; %atomicrmw.global @@ -8573,10 +8467,11 @@ define void @flat_atomic_uinc_wrap_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB108_2 ; GFX950-NEXT: .LBB108_1: ; %atomicrmw.private @@ -8592,9 +8487,8 @@ define void @flat_atomic_uinc_wrap_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1] ; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB108_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB108_4 ; GFX950-NEXT: .LBB108_3: ; %atomicrmw.global @@ -8629,7 +8523,8 @@ define void @flat_atomic_udec_wrap_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec 
+; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB109_2 ; GFX90A-NEXT: .LBB109_1: ; %atomicrmw.private @@ -8651,9 +8546,8 @@ define void @flat_atomic_udec_wrap_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: .LBB109_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB109_4 ; GFX90A-NEXT: .LBB109_3: ; %atomicrmw.global @@ -8684,7 +8578,8 @@ define void @flat_atomic_udec_wrap_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB109_2 ; GFX950-NEXT: .LBB109_1: ; %atomicrmw.private @@ -8703,9 +8598,8 @@ define void @flat_atomic_udec_wrap_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off ; GFX950-NEXT: .LBB109_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB109_4 ; GFX950-NEXT: .LBB109_3: ; %atomicrmw.global @@ -8737,10 +8631,11 @@ define void @flat_atomic_udec_wrap_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 
v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB110_2 ; GFX90A-NEXT: .LBB110_1: ; %atomicrmw.private @@ -8760,9 +8655,8 @@ define void @flat_atomic_udec_wrap_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB110_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB110_4 ; GFX90A-NEXT: .LBB110_3: ; %atomicrmw.global @@ -8786,10 +8680,11 @@ define void @flat_atomic_udec_wrap_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB110_2 ; GFX950-NEXT: .LBB110_1: ; %atomicrmw.private @@ -8806,9 +8701,8 @@ define void @flat_atomic_udec_wrap_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[0:1] ; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB110_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent 
control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB110_4 ; GFX950-NEXT: .LBB110_3: ; %atomicrmw.global @@ -8845,7 +8739,7 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: s_mov_b64 s[6:7], -1 -; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB111_2 ; GFX90A-NEXT: .LBB111_1: ; %atomicrmw.private @@ -8865,10 +8759,8 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB111_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB111_5 ; GFX90A-NEXT: .LBB111_3: ; %atomicrmw.global @@ -8886,13 +8778,12 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB111_4 ; GFX90A-NEXT: .LBB111_5: ; %atomicrmw.phi @@ -8919,7 +8810,7 @@ define void 
@flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec -; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB111_2 ; GFX950-NEXT: .LBB111_1: ; %atomicrmw.private @@ -8939,10 +8830,8 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v2, v0, v2, s[0:1] ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB111_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB111_5 ; GFX950-NEXT: .LBB111_3: ; %atomicrmw.global @@ -8965,10 +8854,9 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB111_4 ; GFX950-NEXT: .LBB111_5: ; %atomicrmw.phi @@ -9000,7 +8888,7 @@ define void @flat_atomic_usub_cond_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB112_2 ; GFX90A-NEXT: .LBB112_1: ; %atomicrmw.private @@ -9018,10 +8906,8 @@ define void 
@flat_atomic_usub_cond_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB112_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB112_5 ; GFX90A-NEXT: .LBB112_3: ; %atomicrmw.global @@ -9040,10 +8926,9 @@ define void @flat_atomic_usub_cond_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB112_4 ; GFX90A-NEXT: .LBB112_5: ; %atomicrmw.phi @@ -9069,7 +8954,7 @@ define void @flat_atomic_usub_cond_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec -; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB112_2 ; GFX950-NEXT: .LBB112_1: ; %atomicrmw.private @@ -9087,10 +8972,8 @@ define void @flat_atomic_usub_cond_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v2, v0, v2, s[0:1] ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB112_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX950-NEXT: 
s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB112_5 ; GFX950-NEXT: .LBB112_3: ; %atomicrmw.global @@ -9112,10 +8995,9 @@ define void @flat_atomic_usub_cond_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB112_4 ; GFX950-NEXT: .LBB112_5: ; %atomicrmw.phi @@ -9149,7 +9031,7 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: s_mov_b64 s[6:7], -1 -; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB113_2 ; GFX90A-NEXT: .LBB113_1: ; %atomicrmw.private @@ -9168,10 +9050,8 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB113_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB113_5 ; GFX90A-NEXT: .LBB113_3: ; %atomicrmw.global @@ -9188,13 +9068,12 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: 
v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB113_4 ; GFX90A-NEXT: .LBB113_5: ; %atomicrmw.phi @@ -9221,7 +9100,7 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec -; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB113_2 ; GFX950-NEXT: .LBB113_1: ; %atomicrmw.private @@ -9240,10 +9119,8 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[0:1] ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB113_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB113_5 ; GFX950-NEXT: .LBB113_3: ; %atomicrmw.global @@ -9265,10 +9142,9 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 
s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB113_4 ; GFX950-NEXT: .LBB113_5: ; %atomicrmw.phi @@ -9300,7 +9176,7 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB114_2 ; GFX90A-NEXT: .LBB114_1: ; %atomicrmw.private @@ -9317,10 +9193,8 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB114_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB114_5 ; GFX90A-NEXT: .LBB114_3: ; %atomicrmw.global @@ -9338,10 +9212,9 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB114_4 ; GFX90A-NEXT: .LBB114_5: ; %atomicrmw.phi @@ -9367,7 +9240,7 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 
vcc, 1, v0 ; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec -; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB114_2 ; GFX950-NEXT: .LBB114_1: ; %atomicrmw.private @@ -9384,10 +9257,8 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[0:1] ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB114_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB114_5 ; GFX950-NEXT: .LBB114_3: ; %atomicrmw.global @@ -9408,10 +9279,9 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB114_4 ; GFX950-NEXT: .LBB114_5: ; %atomicrmw.phi @@ -9441,12 +9311,13 @@ define void @flat_atomic_fadd_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; 
GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB115_2 ; GFX90A-NEXT: .LBB115_1: ; %atomicrmw.shared @@ -9456,10 +9327,8 @@ define void @flat_atomic_fadd_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: .LBB115_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_and_b64 s[6:7], s[4:5], exec -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB115_7 ; GFX90A-NEXT: .LBB115_3: ; %atomicrmw.check.private @@ -9467,7 +9336,8 @@ define void @flat_atomic_fadd_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB115_5 ; GFX90A-NEXT: .LBB115_4: ; %atomicrmw.private @@ -9479,11 +9349,9 @@ define void @flat_atomic_fadd_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 ; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GFX90A-NEXT: .LBB115_5: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB115_7 ; GFX90A-NEXT: .LBB115_6: ; %atomicrmw.global @@ -9530,10 +9398,11 @@ define void @flat_atomic_fadd_f32_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 
v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB116_2 ; GFX90A-NEXT: .LBB116_1: ; %atomicrmw.shared @@ -9542,10 +9411,8 @@ define void @flat_atomic_fadd_f32_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: ds_add_rtn_f32 v0, v0, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB116_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_and_b64 s[6:7], s[4:5], exec -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB116_7 ; GFX90A-NEXT: .LBB116_3: ; %atomicrmw.check.private @@ -9553,7 +9420,8 @@ define void @flat_atomic_fadd_f32_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB116_5 ; GFX90A-NEXT: .LBB116_4: ; %atomicrmw.private @@ -9564,11 +9432,9 @@ define void @flat_atomic_fadd_f32_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_add_f32_e32 v2, v0, v2 ; GFX90A-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; GFX90A-NEXT: .LBB116_5: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; 
divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB116_7 ; GFX90A-NEXT: .LBB116_6: ; %atomicrmw.global @@ -9619,10 +9485,9 @@ define void @flat_atomic_fsub_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB117_1 ; GFX90A-NEXT: .LBB117_2: ; %atomicrmw.end @@ -9652,10 +9517,9 @@ define void @flat_atomic_fsub_f32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB117_1 ; GFX950-NEXT: .LBB117_2: ; %atomicrmw.end @@ -9690,10 +9554,9 @@ define void @flat_atomic_fsub_f32_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB118_1 ; GFX90A-NEXT: .LBB118_2: ; %atomicrmw.end @@ -9721,10 +9584,9 @@ define void @flat_atomic_fsub_f32_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; 
GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB118_1 ; GFX950-NEXT: .LBB118_2: ; %atomicrmw.end @@ -9761,10 +9623,9 @@ define void @flat_atomic_fmax_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB119_1 ; GFX90A-NEXT: .LBB119_2: ; %atomicrmw.end @@ -9796,10 +9657,9 @@ define void @flat_atomic_fmax_f32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB119_1 ; GFX950-NEXT: .LBB119_2: ; %atomicrmw.end @@ -9836,10 +9696,9 @@ define void @flat_atomic_fmax_f32_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB120_1 ; GFX90A-NEXT: .LBB120_2: ; %atomicrmw.end @@ -9869,10 +9728,9 @@ define void 
@flat_atomic_fmax_f32_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB120_1 ; GFX950-NEXT: .LBB120_2: ; %atomicrmw.end @@ -9909,10 +9767,9 @@ define void @flat_atomic_fmin_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB121_1 ; GFX90A-NEXT: .LBB121_2: ; %atomicrmw.end @@ -9944,10 +9801,9 @@ define void @flat_atomic_fmin_f32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB121_1 ; GFX950-NEXT: .LBB121_2: ; %atomicrmw.end @@ -9984,10 +9840,9 @@ define void @flat_atomic_fmin_f32_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: 
s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB122_1 ; GFX90A-NEXT: .LBB122_2: ; %atomicrmw.end @@ -10017,10 +9872,9 @@ define void @flat_atomic_fmin_f32_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB122_1 ; GFX950-NEXT: .LBB122_2: ; %atomicrmw.end @@ -10058,10 +9912,9 @@ define void @flat_atomic_fmaximum_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB123_1 ; GFX90A-NEXT: .LBB123_2: ; %atomicrmw.end @@ -10092,10 +9945,9 @@ define void @flat_atomic_fmaximum_f32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB123_1 ; GFX950-NEXT: .LBB123_2: ; %atomicrmw.end @@ -10133,10 +9985,9 @@ define void @flat_atomic_fmaximum_f32_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 
1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB124_1 ; GFX90A-NEXT: .LBB124_2: ; %atomicrmw.end @@ -10165,10 +10016,9 @@ define void @flat_atomic_fmaximum_f32_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB124_1 ; GFX950-NEXT: .LBB124_2: ; %atomicrmw.end @@ -10206,10 +10056,9 @@ define void @flat_atomic_fminimum_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB125_1 ; GFX90A-NEXT: .LBB125_2: ; %atomicrmw.end @@ -10240,10 +10089,9 @@ define void @flat_atomic_fminimum_f32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB125_1 ; GFX950-NEXT: .LBB125_2: ; %atomicrmw.end @@ -10281,10 
+10129,9 @@ define void @flat_atomic_fminimum_f32_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB126_1 ; GFX90A-NEXT: .LBB126_2: ; %atomicrmw.end @@ -10313,10 +10160,9 @@ define void @flat_atomic_fminimum_f32_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB126_1 ; GFX950-NEXT: .LBB126_2: ; %atomicrmw.end @@ -10351,7 +10197,8 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB127_2 ; GFX90A-NEXT: .LBB127_1: ; %atomicrmw.shared @@ -10362,10 +10209,8 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: .LBB127_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_and_b64 s[6:7], s[4:5], exec -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: 
s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB127_7 ; GFX90A-NEXT: .LBB127_3: ; %atomicrmw.check.private @@ -10373,7 +10218,8 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB127_5 ; GFX90A-NEXT: .LBB127_4: ; %atomicrmw.private @@ -10388,11 +10234,9 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB127_5: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB127_7 ; GFX90A-NEXT: .LBB127_6: ; %atomicrmw.global @@ -10423,7 +10267,8 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB127_2 ; GFX950-NEXT: .LBB127_1: ; %atomicrmw.shared @@ -10435,10 +10280,8 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: .LBB127_2: -; 
GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_and_b64 s[2:3], s[0:1], exec -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB127_7 ; GFX950-NEXT: .LBB127_3: ; %atomicrmw.check.private @@ -10447,7 +10290,8 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB127_5 ; GFX950-NEXT: .LBB127_4: ; %atomicrmw.private @@ -10461,11 +10305,9 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB127_5: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX950-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[0:1] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB127_7 ; GFX950-NEXT: .LBB127_6: ; %atomicrmw.global @@ -10497,10 +10339,11 @@ define void @flat_atomic_fadd_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: 
s_cbranch_execz .LBB128_2 ; GFX90A-NEXT: .LBB128_1: ; %atomicrmw.shared @@ -10509,10 +10352,8 @@ define void @flat_atomic_fadd_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB128_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_and_b64 s[6:7], s[4:5], exec -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB128_7 ; GFX90A-NEXT: .LBB128_3: ; %atomicrmw.check.private @@ -10520,7 +10361,8 @@ define void @flat_atomic_fadd_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB128_5 ; GFX90A-NEXT: .LBB128_4: ; %atomicrmw.private @@ -10533,11 +10375,9 @@ define void @flat_atomic_fadd_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB128_5: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB128_7 ; GFX90A-NEXT: .LBB128_6: ; %atomicrmw.global @@ -10560,10 +10400,11 @@ define void @flat_atomic_fadd_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: 
v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB128_2 ; GFX950-NEXT: .LBB128_1: ; %atomicrmw.shared @@ -10573,10 +10414,8 @@ define void @flat_atomic_fadd_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX950-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-NEXT: .LBB128_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_and_b64 s[2:3], s[0:1], exec -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB128_7 ; GFX950-NEXT: .LBB128_3: ; %atomicrmw.check.private @@ -10585,7 +10424,8 @@ define void @flat_atomic_fadd_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB128_5 ; GFX950-NEXT: .LBB128_4: ; %atomicrmw.private @@ -10597,11 +10437,9 @@ define void @flat_atomic_fadd_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB128_5: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX950-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[0:1] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; 
GFX950-NEXT: s_cbranch_execz .LBB128_7 ; GFX950-NEXT: .LBB128_6: ; %atomicrmw.global @@ -10637,7 +10475,7 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: s_mov_b64 s[6:7], -1 -; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB129_2 ; GFX90A-NEXT: .LBB129_1: ; %atomicrmw.private @@ -10652,10 +10490,8 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB129_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB129_5 ; GFX90A-NEXT: .LBB129_3: ; %atomicrmw.global @@ -10669,13 +10505,12 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB129_4 ; GFX90A-NEXT: .LBB129_5: ; %atomicrmw.phi @@ -10702,7 +10537,7 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: 
v_cmp_ne_u32_e32 vcc, 1, v0 ; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec -; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB129_2 ; GFX950-NEXT: .LBB129_1: ; %atomicrmw.private @@ -10716,10 +10551,8 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB129_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB129_5 ; GFX950-NEXT: .LBB129_3: ; %atomicrmw.global @@ -10737,10 +10570,9 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB129_4 ; GFX950-NEXT: .LBB129_5: ; %atomicrmw.phi @@ -10772,7 +10604,7 @@ define void @flat_atomic_fsub_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB130_2 ; GFX90A-NEXT: .LBB130_1: ; %atomicrmw.private @@ -10785,10 +10617,8 @@ define void @flat_atomic_fsub_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, 
v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB130_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB130_5 ; GFX90A-NEXT: .LBB130_3: ; %atomicrmw.global @@ -10803,10 +10633,9 @@ define void @flat_atomic_fsub_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB130_4 ; GFX90A-NEXT: .LBB130_5: ; %atomicrmw.phi @@ -10832,7 +10661,7 @@ define void @flat_atomic_fsub_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec -; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB130_2 ; GFX950-NEXT: .LBB130_1: ; %atomicrmw.private @@ -10844,10 +10673,8 @@ define void @flat_atomic_fsub_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[6:7] ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB130_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX950-NEXT: s_and_saveexec_b64 
s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB130_5 ; GFX950-NEXT: .LBB130_3: ; %atomicrmw.global @@ -10864,10 +10691,9 @@ define void @flat_atomic_fsub_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB130_4 ; GFX950-NEXT: .LBB130_5: ; %atomicrmw.phi @@ -10899,7 +10725,8 @@ define void @flat_atomic_fmax_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB131_2 ; GFX90A-NEXT: .LBB131_1: ; %atomicrmw.private @@ -10916,9 +10743,8 @@ define void @flat_atomic_fmax_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB131_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB131_4 ; GFX90A-NEXT: .LBB131_3: ; %atomicrmw.global @@ -10949,7 +10775,8 @@ define void @flat_atomic_fmax_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; 
GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB131_2 ; GFX950-NEXT: .LBB131_1: ; %atomicrmw.private @@ -10965,9 +10792,8 @@ define void @flat_atomic_fmax_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB131_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB131_4 ; GFX950-NEXT: .LBB131_3: ; %atomicrmw.global @@ -10999,10 +10825,11 @@ define void @flat_atomic_fmax_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB132_2 ; GFX90A-NEXT: .LBB132_1: ; %atomicrmw.private @@ -11017,9 +10844,8 @@ define void @flat_atomic_fmax_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB132_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB132_4 ; GFX90A-NEXT: .LBB132_3: ; %atomicrmw.global @@ -11043,10 +10869,11 @@ define void @flat_atomic_fmax_f64_ret_av_av(ptr %ptr) #0 
{ ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB132_2 ; GFX950-NEXT: .LBB132_1: ; %atomicrmw.private @@ -11060,9 +10887,8 @@ define void @flat_atomic_fmax_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] ; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB132_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB132_4 ; GFX950-NEXT: .LBB132_3: ; %atomicrmw.global @@ -11097,7 +10923,8 @@ define void @flat_atomic_fmin_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB133_2 ; GFX90A-NEXT: .LBB133_1: ; %atomicrmw.private @@ -11114,9 +10941,8 @@ define void @flat_atomic_fmin_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB133_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: 
s_cbranch_execz .LBB133_4 ; GFX90A-NEXT: .LBB133_3: ; %atomicrmw.global @@ -11147,7 +10973,8 @@ define void @flat_atomic_fmin_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB133_2 ; GFX950-NEXT: .LBB133_1: ; %atomicrmw.private @@ -11163,9 +10990,8 @@ define void @flat_atomic_fmin_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB133_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB133_4 ; GFX950-NEXT: .LBB133_3: ; %atomicrmw.global @@ -11197,10 +11023,11 @@ define void @flat_atomic_fmin_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_xor_b64 exec, vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB134_2 ; GFX90A-NEXT: .LBB134_1: ; %atomicrmw.private @@ -11215,9 +11042,8 @@ define void @flat_atomic_fmin_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB134_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: 
s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB134_4 ; GFX90A-NEXT: .LBB134_3: ; %atomicrmw.global @@ -11241,10 +11067,11 @@ define void @flat_atomic_fmin_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB134_2 ; GFX950-NEXT: .LBB134_1: ; %atomicrmw.private @@ -11258,9 +11085,8 @@ define void @flat_atomic_fmin_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] ; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB134_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB134_4 ; GFX950-NEXT: .LBB134_3: ; %atomicrmw.global @@ -11297,7 +11123,7 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: s_mov_b64 s[6:7], -1 -; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB135_2 ; GFX90A-NEXT: .LBB135_1: ; %atomicrmw.private @@ -11316,10 +11142,8 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: 
.LBB135_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB135_5 ; GFX90A-NEXT: .LBB135_3: ; %atomicrmw.global @@ -11337,13 +11161,12 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB135_4 ; GFX90A-NEXT: .LBB135_5: ; %atomicrmw.phi @@ -11370,7 +11193,7 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec -; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB135_2 ; GFX950-NEXT: .LBB135_1: ; %atomicrmw.private @@ -11388,10 +11211,8 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[0:1] ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB135_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX950-NEXT: s_mov_b64 exec, vcc +; 
GFX950-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB135_5 ; GFX950-NEXT: .LBB135_3: ; %atomicrmw.global @@ -11414,10 +11235,9 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB135_4 ; GFX950-NEXT: .LBB135_5: ; %atomicrmw.phi @@ -11449,7 +11269,7 @@ define void @flat_atomic_fmaximum_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB136_2 ; GFX90A-NEXT: .LBB136_1: ; %atomicrmw.private @@ -11466,10 +11286,8 @@ define void @flat_atomic_fmaximum_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB136_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB136_5 ; GFX90A-NEXT: .LBB136_3: ; %atomicrmw.global @@ -11488,10 +11306,9 @@ define void @flat_atomic_fmaximum_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; 
GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB136_4 ; GFX90A-NEXT: .LBB136_5: ; %atomicrmw.phi @@ -11517,7 +11334,7 @@ define void @flat_atomic_fmaximum_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec -; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB136_2 ; GFX950-NEXT: .LBB136_1: ; %atomicrmw.private @@ -11534,10 +11351,8 @@ define void @flat_atomic_fmaximum_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[0:1] ; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB136_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB136_5 ; GFX950-NEXT: .LBB136_3: ; %atomicrmw.global @@ -11559,10 +11374,9 @@ define void @flat_atomic_fmaximum_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB136_4 ; GFX950-NEXT: .LBB136_5: ; 
%atomicrmw.phi @@ -11596,7 +11410,7 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec ; GFX90A-NEXT: s_mov_b64 s[6:7], -1 -; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB137_2 ; GFX90A-NEXT: .LBB137_1: ; %atomicrmw.private @@ -11615,10 +11429,8 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB137_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB137_5 ; GFX90A-NEXT: .LBB137_3: ; %atomicrmw.global @@ -11636,13 +11448,12 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB137_4 ; GFX90A-NEXT: .LBB137_5: ; %atomicrmw.phi @@ -11669,7 +11480,7 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; GFX950-NEXT: 
v_accvgpr_read_b32 v6, a0 ; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec -; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB137_2 ; GFX950-NEXT: .LBB137_1: ; %atomicrmw.private @@ -11687,10 +11498,8 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[0:1] ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB137_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB137_5 ; GFX950-NEXT: .LBB137_3: ; %atomicrmw.global @@ -11713,10 +11522,9 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB137_4 ; GFX950-NEXT: .LBB137_5: ; %atomicrmw.phi @@ -11748,7 +11556,7 @@ define void @flat_atomic_fminimum_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB138_2 ; GFX90A-NEXT: .LBB138_1: ; %atomicrmw.private @@ -11765,10 +11573,8 @@ define void @flat_atomic_fminimum_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen 
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB138_2: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc -; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB138_5 ; GFX90A-NEXT: .LBB138_3: ; %atomicrmw.global @@ -11787,10 +11593,9 @@ define void @flat_atomic_fminimum_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB138_4 ; GFX90A-NEXT: .LBB138_5: ; %atomicrmw.phi @@ -11816,7 +11621,7 @@ define void @flat_atomic_fminimum_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec -; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB138_2 ; GFX950-NEXT: .LBB138_1: ; %atomicrmw.private @@ -11833,10 +11638,8 @@ define void @flat_atomic_fminimum_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[0:1] ; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB138_2: -; GFX950-NEXT: s_or_b64 exec, exec, vcc -; GFX950-NEXT: s_xor_b64 s[0:1], exec, vcc -; GFX950-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX950-NEXT: s_mov_b64 exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], 
vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB138_5 ; GFX950-NEXT: .LBB138_3: ; %atomicrmw.global @@ -11858,10 +11661,9 @@ define void @flat_atomic_fminimum_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB138_4 ; GFX950-NEXT: .LBB138_5: ; %atomicrmw.phi @@ -11901,10 +11703,9 @@ define void @flat_atomic_fadd_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB139_1 ; GFX90A-NEXT: .LBB139_2: ; %atomicrmw.end @@ -11955,10 +11756,9 @@ define void @flat_atomic_fadd_v2f16_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB140_1 ; GFX90A-NEXT: .LBB140_2: ; %atomicrmw.end @@ -12006,10 +11806,9 @@ define void @flat_atomic_fsub_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: 
v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB141_1 ; GFX90A-NEXT: .LBB141_2: ; %atomicrmw.end @@ -12040,10 +11839,9 @@ define void @flat_atomic_fsub_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB141_1 ; GFX950-NEXT: .LBB141_2: ; %atomicrmw.end @@ -12078,10 +11876,9 @@ define void @flat_atomic_fsub_v2f16_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB142_1 ; GFX90A-NEXT: .LBB142_2: ; %atomicrmw.end @@ -12110,10 +11907,9 @@ define void @flat_atomic_fsub_v2f16_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB142_1 ; GFX950-NEXT: .LBB142_2: ; 
%atomicrmw.end @@ -12150,10 +11946,9 @@ define void @flat_atomic_fmax_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB143_1 ; GFX90A-NEXT: .LBB143_2: ; %atomicrmw.end @@ -12187,10 +11982,9 @@ define void @flat_atomic_fmax_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB143_1 ; GFX950-NEXT: .LBB143_2: ; %atomicrmw.end @@ -12227,10 +12021,9 @@ define void @flat_atomic_fmax_v2f16_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB144_1 ; GFX90A-NEXT: .LBB144_2: ; %atomicrmw.end @@ -12262,10 +12055,9 @@ define void @flat_atomic_fmax_v2f16_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; 
GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB144_1 ; GFX950-NEXT: .LBB144_2: ; %atomicrmw.end @@ -12302,10 +12094,9 @@ define void @flat_atomic_fmin_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB145_1 ; GFX90A-NEXT: .LBB145_2: ; %atomicrmw.end @@ -12339,10 +12130,9 @@ define void @flat_atomic_fmin_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB145_1 ; GFX950-NEXT: .LBB145_2: ; %atomicrmw.end @@ -12379,10 +12169,9 @@ define void @flat_atomic_fmin_v2f16_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB146_1 ; GFX90A-NEXT: .LBB146_2: ; %atomicrmw.end @@ -12414,10 +12203,9 @@ define void @flat_atomic_fmin_v2f16_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: 
v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB146_1 ; GFX950-NEXT: .LBB146_2: ; %atomicrmw.end @@ -12459,10 +12247,9 @@ define void @flat_atomic_fmaximum_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB147_1 ; GFX90A-NEXT: .LBB147_2: ; %atomicrmw.end @@ -12493,10 +12280,9 @@ define void @flat_atomic_fmaximum_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB147_1 ; GFX950-NEXT: .LBB147_2: ; %atomicrmw.end @@ -12538,10 +12324,9 @@ define void @flat_atomic_fmaximum_v2f16_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; 
GFX90A-NEXT: s_cbranch_execnz .LBB148_1 ; GFX90A-NEXT: .LBB148_2: ; %atomicrmw.end @@ -12570,10 +12355,9 @@ define void @flat_atomic_fmaximum_v2f16_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB148_1 ; GFX950-NEXT: .LBB148_2: ; %atomicrmw.end @@ -12615,10 +12399,9 @@ define void @flat_atomic_fminimum_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB149_1 ; GFX90A-NEXT: .LBB149_2: ; %atomicrmw.end @@ -12649,10 +12432,9 @@ define void @flat_atomic_fminimum_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB149_1 ; GFX950-NEXT: .LBB149_2: ; %atomicrmw.end @@ -12694,10 +12476,9 @@ define void @flat_atomic_fminimum_v2f16_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; 
GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB150_1 ; GFX90A-NEXT: .LBB150_2: ; %atomicrmw.end @@ -12726,10 +12507,9 @@ define void @flat_atomic_fminimum_v2f16_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB150_1 ; GFX950-NEXT: .LBB150_2: ; %atomicrmw.end @@ -12786,10 +12566,9 @@ define void @flat_atomic_fadd_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB151_1 ; GFX90A-NEXT: .LBB151_2: ; %atomicrmw.end @@ -12858,10 +12637,9 @@ define void @flat_atomic_fadd_v2bf16_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB152_1 ; GFX90A-NEXT: .LBB152_2: ; %atomicrmw.end @@ -12927,10 +12705,9 @@ define void 
@flat_atomic_fsub_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB153_1 ; GFX90A-NEXT: .LBB153_2: ; %atomicrmw.end @@ -12967,10 +12744,9 @@ define void @flat_atomic_fsub_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB153_1 ; GFX950-NEXT: .LBB153_2: ; %atomicrmw.end @@ -13023,10 +12799,9 @@ define void @flat_atomic_fsub_v2bf16_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB154_1 ; GFX90A-NEXT: .LBB154_2: ; %atomicrmw.end @@ -13061,10 +12836,9 @@ define void @flat_atomic_fsub_v2bf16_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; 
GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB154_1 ; GFX950-NEXT: .LBB154_2: ; %atomicrmw.end @@ -13117,10 +12891,9 @@ define void @flat_atomic_fmax_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB155_1 ; GFX90A-NEXT: .LBB155_2: ; %atomicrmw.end @@ -13157,10 +12930,9 @@ define void @flat_atomic_fmax_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB155_1 ; GFX950-NEXT: .LBB155_2: ; %atomicrmw.end @@ -13213,10 +12985,9 @@ define void @flat_atomic_fmax_v2bf16_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB156_1 ; GFX90A-NEXT: .LBB156_2: ; %atomicrmw.end @@ -13251,10 +13022,9 @@ define void @flat_atomic_fmax_v2bf16_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: 
v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB156_1 ; GFX950-NEXT: .LBB156_2: ; %atomicrmw.end @@ -13307,10 +13077,9 @@ define void @flat_atomic_fmin_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB157_1 ; GFX90A-NEXT: .LBB157_2: ; %atomicrmw.end @@ -13347,10 +13116,9 @@ define void @flat_atomic_fmin_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB157_1 ; GFX950-NEXT: .LBB157_2: ; %atomicrmw.end @@ -13403,10 +13171,9 @@ define void @flat_atomic_fmin_v2bf16_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB158_1 ; GFX90A-NEXT: 
.LBB158_2: ; %atomicrmw.end @@ -13441,10 +13208,9 @@ define void @flat_atomic_fmin_v2bf16_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB158_1 ; GFX950-NEXT: .LBB158_2: ; %atomicrmw.end @@ -13502,10 +13268,9 @@ define void @flat_atomic_fmaximum_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB159_1 ; GFX90A-NEXT: .LBB159_2: ; %atomicrmw.end @@ -13542,10 +13307,9 @@ define void @flat_atomic_fmaximum_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB159_1 ; GFX950-NEXT: .LBB159_2: ; %atomicrmw.end @@ -13603,10 +13367,9 @@ define void @flat_atomic_fmaximum_v2bf16_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: 
s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB160_1 ; GFX90A-NEXT: .LBB160_2: ; %atomicrmw.end @@ -13641,10 +13404,9 @@ define void @flat_atomic_fmaximum_v2bf16_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB160_1 ; GFX950-NEXT: .LBB160_2: ; %atomicrmw.end @@ -13702,10 +13464,9 @@ define void @flat_atomic_fminimum_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB161_1 ; GFX90A-NEXT: .LBB161_2: ; %atomicrmw.end @@ -13742,10 +13503,9 @@ define void @flat_atomic_fminimum_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB161_1 ; GFX950-NEXT: .LBB161_2: ; %atomicrmw.end @@ -13803,10 +13563,9 @@ define void @flat_atomic_fminimum_v2bf16_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB162_1 ; GFX90A-NEXT: .LBB162_2: ; %atomicrmw.end @@ -13841,10 +13600,9 @@ define void @flat_atomic_fminimum_v2bf16_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB162_1 ; GFX950-NEXT: .LBB162_2: ; %atomicrmw.end @@ -14202,10 +13960,9 @@ define void @flat_atomic_nand_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB171_1 ; GFX90A-NEXT: .LBB171_2: ; %atomicrmw.end @@ -14238,10 +13995,9 @@ define void @flat_atomic_nand_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; 
GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB171_1 ; GFX950-NEXT: .LBB171_2: ; %atomicrmw.end @@ -14278,10 +14034,9 @@ define void @flat_atomic_nand_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB172_1 ; GFX90A-NEXT: .LBB172_2: ; %atomicrmw.end @@ -14311,10 +14066,9 @@ define void @flat_atomic_nand_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB172_1 ; GFX950-NEXT: .LBB172_2: ; %atomicrmw.end @@ -14985,10 +14739,9 @@ define void @flat_atomic_usub_cond_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB189_1 ; GFX90A-NEXT: .LBB189_2: ; %atomicrmw.end @@ -15023,10 +14776,9 @@ define void @flat_atomic_usub_cond_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; 
GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB189_1 ; GFX950-NEXT: .LBB189_2: ; %atomicrmw.end @@ -15064,10 +14816,9 @@ define void @flat_atomic_usub_cond_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB190_1 ; GFX90A-NEXT: .LBB190_2: ; %atomicrmw.end @@ -15099,10 +14850,9 @@ define void @flat_atomic_usub_cond_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB190_1 ; GFX950-NEXT: .LBB190_2: ; %atomicrmw.end @@ -15139,10 +14889,9 @@ define void @flat_atomic_usub_sat_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; 
GFX90A-NEXT: s_cbranch_execnz .LBB191_1 ; GFX90A-NEXT: .LBB191_2: ; %atomicrmw.end @@ -15175,10 +14924,9 @@ define void @flat_atomic_usub_sat_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB191_1 ; GFX950-NEXT: .LBB191_2: ; %atomicrmw.end @@ -15214,10 +14962,9 @@ define void @flat_atomic_usub_sat_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB192_1 ; GFX90A-NEXT: .LBB192_2: ; %atomicrmw.end @@ -15247,10 +14994,9 @@ define void @flat_atomic_usub_sat_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB192_1 ; GFX950-NEXT: .LBB192_2: ; %atomicrmw.end @@ -16001,13 +15747,12 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB201_3 ; GFX90A-NEXT: .LBB201_4: ; %atomicrmw.phi @@ -16066,10 +15811,9 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB201_3 ; GFX950-NEXT: .LBB201_4: ; %atomicrmw.phi @@ -16133,10 +15877,9 @@ define void @flat_atomic_nand_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB202_3 ; GFX90A-NEXT: .LBB202_4: ; %atomicrmw.phi @@ -16190,10 +15933,9 @@ define void @flat_atomic_nand_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: 
v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB202_3 ; GFX950-NEXT: .LBB202_4: ; %atomicrmw.phi @@ -17695,13 +17437,12 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB219_3 ; GFX90A-NEXT: .LBB219_4: ; %atomicrmw.phi @@ -17766,10 +17507,9 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB219_3 ; GFX950-NEXT: .LBB219_4: ; %atomicrmw.phi @@ -17835,10 +17575,9 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] 
op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB220_3 ; GFX90A-NEXT: .LBB220_4: ; %atomicrmw.phi @@ -17898,10 +17637,9 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB220_3 ; GFX950-NEXT: .LBB220_4: ; %atomicrmw.phi @@ -17968,13 +17706,12 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB221_3 ; GFX90A-NEXT: .LBB221_4: ; %atomicrmw.phi @@ -18037,10 +17774,9 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: 
s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB221_3 ; GFX950-NEXT: .LBB221_4: ; %atomicrmw.phi @@ -18104,10 +17840,9 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB222_3 ; GFX90A-NEXT: .LBB222_4: ; %atomicrmw.phi @@ -18165,10 +17900,9 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB222_3 ; GFX950-NEXT: .LBB222_4: ; %atomicrmw.phi @@ -18360,10 +18094,9 @@ define void @flat_atomic_fsub_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB225_1 ; GFX90A-NEXT: .LBB225_2: ; %atomicrmw.end @@ -18395,10 +18128,9 @@ define void 
@flat_atomic_fsub_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB225_1 ; GFX950-NEXT: .LBB225_2: ; %atomicrmw.end @@ -18434,10 +18166,9 @@ define void @flat_atomic_fsub_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB226_1 ; GFX90A-NEXT: .LBB226_2: ; %atomicrmw.end @@ -18466,10 +18197,9 @@ define void @flat_atomic_fsub_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB226_1 ; GFX950-NEXT: .LBB226_2: ; %atomicrmw.end @@ -18508,10 +18238,9 @@ define void @flat_atomic_fmax_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: 
s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB227_1 ; GFX90A-NEXT: .LBB227_2: ; %atomicrmw.end @@ -18545,10 +18274,9 @@ define void @flat_atomic_fmax_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB227_1 ; GFX950-NEXT: .LBB227_2: ; %atomicrmw.end @@ -18587,10 +18315,9 @@ define void @flat_atomic_fmax_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB228_1 ; GFX90A-NEXT: .LBB228_2: ; %atomicrmw.end @@ -18622,10 +18349,9 @@ define void @flat_atomic_fmax_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB228_1 ; GFX950-NEXT: .LBB228_2: ; %atomicrmw.end @@ -18664,10 +18390,9 @@ define void @flat_atomic_fmin_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, 
v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB229_1 ; GFX90A-NEXT: .LBB229_2: ; %atomicrmw.end @@ -18701,10 +18426,9 @@ define void @flat_atomic_fmin_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB229_1 ; GFX950-NEXT: .LBB229_2: ; %atomicrmw.end @@ -18743,10 +18467,9 @@ define void @flat_atomic_fmin_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB230_1 ; GFX90A-NEXT: .LBB230_2: ; %atomicrmw.end @@ -18778,10 +18501,9 @@ define void @flat_atomic_fmin_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent 
control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB230_1 ; GFX950-NEXT: .LBB230_2: ; %atomicrmw.end @@ -18821,10 +18543,9 @@ define void @flat_atomic_fmaximum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB231_1 ; GFX90A-NEXT: .LBB231_2: ; %atomicrmw.end @@ -18857,10 +18578,9 @@ define void @flat_atomic_fmaximum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB231_1 ; GFX950-NEXT: .LBB231_2: ; %atomicrmw.end @@ -18900,10 +18620,9 @@ define void @flat_atomic_fmaximum_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB232_1 ; GFX90A-NEXT: .LBB232_2: ; %atomicrmw.end @@ -18933,10 +18652,9 @@ define void @flat_atomic_fmaximum_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: 
v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB232_1 ; GFX950-NEXT: .LBB232_2: ; %atomicrmw.end @@ -18976,10 +18694,9 @@ define void @flat_atomic_fminimum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB233_1 ; GFX90A-NEXT: .LBB233_2: ; %atomicrmw.end @@ -19012,10 +18729,9 @@ define void @flat_atomic_fminimum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB233_1 ; GFX950-NEXT: .LBB233_2: ; %atomicrmw.end @@ -19055,10 +18771,9 @@ define void @flat_atomic_fminimum_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: 
s_cbranch_execnz .LBB234_1 ; GFX90A-NEXT: .LBB234_2: ; %atomicrmw.end @@ -19088,10 +18803,9 @@ define void @flat_atomic_fminimum_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB234_1 ; GFX950-NEXT: .LBB234_2: ; %atomicrmw.end @@ -19382,13 +19096,12 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB237_3 ; GFX90A-NEXT: .LBB237_4: ; %atomicrmw.phi @@ -19442,10 +19155,9 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB237_3 ; GFX950-NEXT: .LBB237_4: ; %atomicrmw.phi @@ -19502,10 +19214,9 
@@ define void @flat_atomic_fsub_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB238_3 ; GFX90A-NEXT: .LBB238_4: ; %atomicrmw.phi @@ -19554,10 +19265,9 @@ define void @flat_atomic_fsub_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB238_3 ; GFX950-NEXT: .LBB238_4: ; %atomicrmw.phi @@ -19977,13 +19687,12 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB243_3 ; GFX90A-NEXT: .LBB243_4: ; %atomicrmw.phi @@ -20046,10 +19755,9 @@ define void 
@flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB243_3 ; GFX950-NEXT: .LBB243_4: ; %atomicrmw.phi @@ -20114,10 +19822,9 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB244_3 ; GFX90A-NEXT: .LBB244_4: ; %atomicrmw.phi @@ -20176,10 +19883,9 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB244_3 ; GFX950-NEXT: .LBB244_4: ; %atomicrmw.phi @@ -20247,13 +19953,12 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB245_3 ; GFX90A-NEXT: .LBB245_4: ; %atomicrmw.phi @@ -20316,10 +20021,9 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB245_3 ; GFX950-NEXT: .LBB245_4: ; %atomicrmw.phi @@ -20384,10 +20088,9 @@ define void @flat_atomic_fminimum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB246_3 ; GFX90A-NEXT: .LBB246_4: ; %atomicrmw.phi @@ -20446,10 +20149,9 @@ define void @flat_atomic_fminimum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; 
GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB246_3 ; GFX950-NEXT: .LBB246_4: ; %atomicrmw.phi @@ -20491,10 +20193,9 @@ define void @flat_atomic_fadd_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB247_1 ; GFX90A-NEXT: .LBB247_2: ; %atomicrmw.end @@ -20548,10 +20249,9 @@ define void @flat_atomic_fadd_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB248_1 ; GFX90A-NEXT: .LBB248_2: ; %atomicrmw.end @@ -20603,10 +20303,9 @@ define void @flat_atomic_fsub_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB249_1 ; GFX90A-NEXT: .LBB249_2: ; %atomicrmw.end @@ -20639,10 +20338,9 @@ 
define void @flat_atomic_fsub_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB249_1 ; GFX950-NEXT: .LBB249_2: ; %atomicrmw.end @@ -20678,10 +20376,9 @@ define void @flat_atomic_fsub_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB250_1 ; GFX90A-NEXT: .LBB250_2: ; %atomicrmw.end @@ -20711,10 +20408,9 @@ define void @flat_atomic_fsub_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB250_1 ; GFX950-NEXT: .LBB250_2: ; %atomicrmw.end @@ -20753,10 +20449,9 @@ define void @flat_atomic_fmax_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], 
vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB251_1 ; GFX90A-NEXT: .LBB251_2: ; %atomicrmw.end @@ -20792,10 +20487,9 @@ define void @flat_atomic_fmax_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB251_1 ; GFX950-NEXT: .LBB251_2: ; %atomicrmw.end @@ -20834,10 +20528,9 @@ define void @flat_atomic_fmax_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB252_1 ; GFX90A-NEXT: .LBB252_2: ; %atomicrmw.end @@ -20871,10 +20564,9 @@ define void @flat_atomic_fmax_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB252_1 ; GFX950-NEXT: .LBB252_2: ; %atomicrmw.end @@ -20913,10 +20605,9 @@ define void @flat_atomic_fmin_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: 
v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB253_1 ; GFX90A-NEXT: .LBB253_2: ; %atomicrmw.end @@ -20952,10 +20643,9 @@ define void @flat_atomic_fmin_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB253_1 ; GFX950-NEXT: .LBB253_2: ; %atomicrmw.end @@ -20994,10 +20684,9 @@ define void @flat_atomic_fmin_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB254_1 ; GFX90A-NEXT: .LBB254_2: ; %atomicrmw.end @@ -21031,10 +20720,9 @@ define void @flat_atomic_fmin_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, 
vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB254_1 ; GFX950-NEXT: .LBB254_2: ; %atomicrmw.end @@ -21078,10 +20766,9 @@ define void @flat_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB255_1 ; GFX90A-NEXT: .LBB255_2: ; %atomicrmw.end @@ -21114,10 +20801,9 @@ define void @flat_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB255_1 ; GFX950-NEXT: .LBB255_2: ; %atomicrmw.end @@ -21161,10 +20847,9 @@ define void @flat_atomic_fmaximum_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB256_1 ; GFX90A-NEXT: .LBB256_2: ; %atomicrmw.end @@ -21194,10 +20879,9 @@ define void @flat_atomic_fmaximum_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 
0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB256_1 ; GFX950-NEXT: .LBB256_2: ; %atomicrmw.end @@ -21241,10 +20925,9 @@ define void @flat_atomic_fminimum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB257_1 ; GFX90A-NEXT: .LBB257_2: ; %atomicrmw.end @@ -21277,10 +20960,9 @@ define void @flat_atomic_fminimum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB257_1 ; GFX950-NEXT: .LBB257_2: ; %atomicrmw.end @@ -21324,10 +21006,9 @@ define void @flat_atomic_fminimum_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent 
control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB258_1 ; GFX90A-NEXT: .LBB258_2: ; %atomicrmw.end @@ -21357,10 +21038,9 @@ define void @flat_atomic_fminimum_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB258_1 ; GFX950-NEXT: .LBB258_2: ; %atomicrmw.end @@ -21419,10 +21099,9 @@ define void @flat_atomic_fadd_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB259_1 ; GFX90A-NEXT: .LBB259_2: ; %atomicrmw.end @@ -21495,10 +21174,9 @@ define void @flat_atomic_fadd_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB260_1 ; GFX90A-NEXT: .LBB260_2: ; %atomicrmw.end @@ -21568,10 +21246,9 @@ define void @flat_atomic_fsub_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; 
GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB261_1 ; GFX90A-NEXT: .LBB261_2: ; %atomicrmw.end @@ -21610,10 +21287,9 @@ define void @flat_atomic_fsub_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB261_1 ; GFX950-NEXT: .LBB261_2: ; %atomicrmw.end @@ -21668,10 +21344,9 @@ define void @flat_atomic_fsub_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB262_1 ; GFX90A-NEXT: .LBB262_2: ; %atomicrmw.end @@ -21708,10 +21383,9 @@ define void @flat_atomic_fsub_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: 
s_cbranch_execnz .LBB262_1 ; GFX950-NEXT: .LBB262_2: ; %atomicrmw.end @@ -21766,10 +21440,9 @@ define void @flat_atomic_fmax_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB263_1 ; GFX90A-NEXT: .LBB263_2: ; %atomicrmw.end @@ -21808,10 +21481,9 @@ define void @flat_atomic_fmax_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB263_1 ; GFX950-NEXT: .LBB263_2: ; %atomicrmw.end @@ -21866,10 +21538,9 @@ define void @flat_atomic_fmax_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB264_1 ; GFX90A-NEXT: .LBB264_2: ; %atomicrmw.end @@ -21906,10 +21577,9 @@ define void @flat_atomic_fmax_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: 
s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB264_1 ; GFX950-NEXT: .LBB264_2: ; %atomicrmw.end @@ -21964,10 +21634,9 @@ define void @flat_atomic_fmin_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB265_1 ; GFX90A-NEXT: .LBB265_2: ; %atomicrmw.end @@ -22006,10 +21675,9 @@ define void @flat_atomic_fmin_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB265_1 ; GFX950-NEXT: .LBB265_2: ; %atomicrmw.end @@ -22064,10 +21732,9 @@ define void @flat_atomic_fmin_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB266_1 ; GFX90A-NEXT: 
.LBB266_2: ; %atomicrmw.end @@ -22104,10 +21771,9 @@ define void @flat_atomic_fmin_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB266_1 ; GFX950-NEXT: .LBB266_2: ; %atomicrmw.end @@ -22167,10 +21833,9 @@ define void @flat_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB267_1 ; GFX90A-NEXT: .LBB267_2: ; %atomicrmw.end @@ -22209,10 +21874,9 @@ define void @flat_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB267_1 ; GFX950-NEXT: .LBB267_2: ; %atomicrmw.end @@ -22272,10 +21936,9 @@ define void @flat_atomic_fmaximum_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; 
GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB268_1 ; GFX90A-NEXT: .LBB268_2: ; %atomicrmw.end @@ -22312,10 +21975,9 @@ define void @flat_atomic_fmaximum_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB268_1 ; GFX950-NEXT: .LBB268_2: ; %atomicrmw.end @@ -22375,10 +22037,9 @@ define void @flat_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB269_1 ; GFX90A-NEXT: .LBB269_2: ; %atomicrmw.end @@ -22417,10 +22078,9 @@ define void @flat_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB269_1 ; GFX950-NEXT: .LBB269_2: ; %atomicrmw.end @@ -22480,10 
+22140,9 @@ define void @flat_atomic_fminimum_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB270_1 ; GFX90A-NEXT: .LBB270_2: ; %atomicrmw.end @@ -22520,10 +22179,9 @@ define void @flat_atomic_fminimum_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB270_1 ; GFX950-NEXT: .LBB270_2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll index 46592de1d05a3..6e9401596d7e6 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll @@ -988,10 +988,9 @@ define void @global_atomic_xor_expansion_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: .LBB21_2: ; %atomicrmw.end @@ -1024,10 +1023,9 @@ define void 
@global_atomic_xor_expansion_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB21_1 ; GFX950-NEXT: .LBB21_2: ; %atomicrmw.end @@ -1068,10 +1066,9 @@ define void @global_atomic_xor_expansion_i32_ret_a_v(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: .LBB22_2: ; %atomicrmw.end @@ -1103,10 +1100,9 @@ define void @global_atomic_xor_expansion_i32_ret_a_v(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB22_1 ; GFX950-NEXT: .LBB22_2: ; %atomicrmw.end @@ -1145,10 +1141,9 @@ define void @global_atomic_xor_expansion_i32_ret_v_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: 
s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 ; GFX90A-NEXT: .LBB23_2: ; %atomicrmw.end @@ -1180,10 +1175,9 @@ define void @global_atomic_xor_expansion_i32_ret_v_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB23_1 ; GFX950-NEXT: .LBB23_2: ; %atomicrmw.end @@ -1223,10 +1217,9 @@ define void @global_atomic_xor_expansion_i32_ret_av_av(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.end @@ -1257,10 +1250,9 @@ define void @global_atomic_xor_expansion_i32_ret_av_av(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB24_1 ; GFX950-NEXT: .LBB24_2: ; %atomicrmw.end @@ -1299,10 +1291,9 @@ define void 
@global_atomic_xor_expansion_i32_ret_av_v(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 ; GFX90A-NEXT: .LBB25_2: ; %atomicrmw.end @@ -1333,10 +1324,9 @@ define void @global_atomic_xor_expansion_i32_ret_av_v(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB25_1 ; GFX950-NEXT: .LBB25_2: ; %atomicrmw.end @@ -1375,10 +1365,9 @@ define void @global_atomic_xor_expansion_i32_ret_av_a(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: .LBB26_2: ; %atomicrmw.end @@ -1410,10 +1399,9 @@ define void @global_atomic_xor_expansion_i32_ret_av_a(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 
s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB26_1 ; GFX950-NEXT: .LBB26_2: ; %atomicrmw.end @@ -1454,10 +1442,9 @@ define void @global_atomic_xor_expansion_i32_ret_a_av(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: .LBB27_2: ; %atomicrmw.end @@ -1489,10 +1476,9 @@ define void @global_atomic_xor_expansion_i32_ret_a_av(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB27_1 ; GFX950-NEXT: .LBB27_2: ; %atomicrmw.end @@ -1531,10 +1517,9 @@ define void @global_atomic_xor_expansion_i32_ret_v_av(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: .LBB28_2: ; %atomicrmw.end @@ -1565,10 +1550,9 @@ define void 
@global_atomic_xor_expansion_i32_ret_v_av(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB28_1 ; GFX950-NEXT: .LBB28_2: ; %atomicrmw.end @@ -1657,10 +1641,9 @@ define void @global_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr addrspace(1) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 ; GFX90A-NEXT: .LBB29_2: ; %atomicrmw.end @@ -1794,10 +1777,9 @@ define void @global_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr addrspace(1) ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB29_1 ; GFX950-NEXT: .LBB29_2: ; %atomicrmw.end @@ -1892,10 +1874,9 @@ define void @global_atomic_xor_expansion_i32_noret_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: 
s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: .LBB30_2: ; %atomicrmw.end @@ -1924,10 +1905,9 @@ define void @global_atomic_xor_expansion_i32_noret_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB30_1 ; GFX950-NEXT: .LBB30_2: ; %atomicrmw.end @@ -1961,10 +1941,9 @@ define void @global_atomic_xor_expansion_i32_noret_av(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: .LBB31_2: ; %atomicrmw.end @@ -1992,10 +1971,9 @@ define void @global_atomic_xor_expansion_i32_noret_av(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB31_1 ; GFX950-NEXT: .LBB31_2: ; %atomicrmw.end @@ -2037,10 +2015,9 @@ define void 
@global_atomic_xor_expansion_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 ; GFX90A-NEXT: .LBB32_2: ; %atomicrmw.end @@ -2076,10 +2053,9 @@ define void @global_atomic_xor_expansion_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB32_1 ; GFX950-NEXT: .LBB32_2: ; %atomicrmw.end @@ -2123,10 +2099,9 @@ define void @global_atomic_xor_expansion_i64_ret_a_v(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: .LBB33_2: ; %atomicrmw.end @@ -2160,10 +2135,9 @@ define void @global_atomic_xor_expansion_i64_ret_a_v(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 
s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB33_1 ; GFX950-NEXT: .LBB33_2: ; %atomicrmw.end @@ -2203,10 +2177,9 @@ define void @global_atomic_xor_expansion_i64_ret_v_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 ; GFX90A-NEXT: .LBB34_2: ; %atomicrmw.end @@ -2240,10 +2213,9 @@ define void @global_atomic_xor_expansion_i64_ret_v_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB34_1 ; GFX950-NEXT: .LBB34_2: ; %atomicrmw.end @@ -2285,10 +2257,9 @@ define void @global_atomic_xor_expansion_i64_ret_av_av(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; 
divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 ; GFX90A-NEXT: .LBB35_2: ; %atomicrmw.end @@ -2320,10 +2291,9 @@ define void @global_atomic_xor_expansion_i64_ret_av_av(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB35_1 ; GFX950-NEXT: .LBB35_2: ; %atomicrmw.end @@ -2363,10 +2333,9 @@ define void @global_atomic_xor_expansion_i64_ret_av_v(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 ; GFX90A-NEXT: .LBB36_2: ; %atomicrmw.end @@ -2398,10 +2367,9 @@ define void @global_atomic_xor_expansion_i64_ret_av_v(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB36_1 ; GFX950-NEXT: .LBB36_2: ; %atomicrmw.end @@ -2441,10 +2409,9 @@ define void @global_atomic_xor_expansion_i64_ret_av_a(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], 
v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 ; GFX90A-NEXT: .LBB37_2: ; %atomicrmw.end @@ -2478,10 +2445,9 @@ define void @global_atomic_xor_expansion_i64_ret_av_a(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB37_1 ; GFX950-NEXT: .LBB37_2: ; %atomicrmw.end @@ -2525,10 +2491,9 @@ define void @global_atomic_xor_expansion_i64_ret_a_av(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 ; GFX90A-NEXT: .LBB38_2: ; %atomicrmw.end @@ -2562,10 +2527,9 @@ define void @global_atomic_xor_expansion_i64_ret_a_av(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; 
GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB38_1 ; GFX950-NEXT: .LBB38_2: ; %atomicrmw.end @@ -2605,10 +2569,9 @@ define void @global_atomic_xor_expansion_i64_ret_v_av(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 ; GFX90A-NEXT: .LBB39_2: ; %atomicrmw.end @@ -2640,10 +2603,9 @@ define void @global_atomic_xor_expansion_i64_ret_v_av(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB39_1 ; GFX950-NEXT: .LBB39_2: ; %atomicrmw.end @@ -2684,10 +2646,9 @@ define void @global_atomic_xor_expansion_i64_noret_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 ; GFX90A-NEXT: .LBB40_2: ; %atomicrmw.end @@ 
-2718,10 +2679,9 @@ define void @global_atomic_xor_expansion_i64_noret_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB40_1 ; GFX950-NEXT: .LBB40_2: ; %atomicrmw.end @@ -2756,10 +2716,9 @@ define void @global_atomic_xor_expansion_i64_noret_av(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 ; GFX90A-NEXT: .LBB41_2: ; %atomicrmw.end @@ -2788,10 +2747,9 @@ define void @global_atomic_xor_expansion_i64_noret_av(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB41_1 ; GFX950-NEXT: .LBB41_2: ; %atomicrmw.end @@ -3935,10 +3893,9 @@ define void @global_atomic_nand_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], 
exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB69_1 ; GFX90A-NEXT: .LBB69_2: ; %atomicrmw.end @@ -3969,10 +3926,9 @@ define void @global_atomic_nand_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB69_1 ; GFX950-NEXT: .LBB69_2: ; %atomicrmw.end @@ -4008,10 +3964,9 @@ define void @global_atomic_nand_i32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB70_1 ; GFX90A-NEXT: .LBB70_2: ; %atomicrmw.end @@ -4040,10 +3995,9 @@ define void @global_atomic_nand_i32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB70_1 ; GFX950-NEXT: .LBB70_2: ; %atomicrmw.end @@ -4577,10 +4531,9 @@ define 
void @global_atomic_usub_cond_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB85_1 ; GFX90A-NEXT: .LBB85_2: ; %atomicrmw.end @@ -4613,10 +4566,9 @@ define void @global_atomic_usub_cond_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB85_1 ; GFX950-NEXT: .LBB85_2: ; %atomicrmw.end @@ -4653,10 +4605,9 @@ define void @global_atomic_usub_cond_i32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB86_1 ; GFX90A-NEXT: .LBB86_2: ; %atomicrmw.end @@ -4687,10 +4638,9 @@ define void @global_atomic_usub_cond_i32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 
s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB86_1 ; GFX950-NEXT: .LBB86_2: ; %atomicrmw.end @@ -4725,10 +4675,9 @@ define void @global_atomic_usub_sat_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB87_1 ; GFX90A-NEXT: .LBB87_2: ; %atomicrmw.end @@ -4759,10 +4708,9 @@ define void @global_atomic_usub_sat_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB87_1 ; GFX950-NEXT: .LBB87_2: ; %atomicrmw.end @@ -4797,10 +4745,9 @@ define void @global_atomic_usub_sat_i32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB88_1 ; GFX90A-NEXT: .LBB88_2: ; %atomicrmw.end @@ -4829,10 +4776,9 @@ define void @global_atomic_usub_sat_i32_ret_av_av(ptr 
addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB88_1 ; GFX950-NEXT: .LBB88_2: ; %atomicrmw.end @@ -5100,10 +5046,9 @@ define void @global_atomic_nand_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB95_1 ; GFX90A-NEXT: .LBB95_2: ; %atomicrmw.end @@ -5138,10 +5083,9 @@ define void @global_atomic_nand_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB95_1 ; GFX950-NEXT: .LBB95_2: ; %atomicrmw.end @@ -5180,10 +5124,9 @@ define void @global_atomic_nand_i64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: 
s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB96_1 ; GFX90A-NEXT: .LBB96_2: ; %atomicrmw.end @@ -5214,10 +5157,9 @@ define void @global_atomic_nand_i64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB96_1 ; GFX950-NEXT: .LBB96_2: ; %atomicrmw.end @@ -5782,10 +5724,9 @@ define void @global_atomic_usub_cond_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB111_1 ; GFX90A-NEXT: .LBB111_2: ; %atomicrmw.end @@ -5823,10 +5764,9 @@ define void @global_atomic_usub_cond_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB111_1 ; GFX950-NEXT: .LBB111_2: ; %atomicrmw.end @@ -5866,10 +5806,9 @@ define 
void @global_atomic_usub_cond_i64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB112_1 ; GFX90A-NEXT: .LBB112_2: ; %atomicrmw.end @@ -5903,10 +5842,9 @@ define void @global_atomic_usub_cond_i64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB112_1 ; GFX950-NEXT: .LBB112_2: ; %atomicrmw.end @@ -5945,10 +5883,9 @@ define void @global_atomic_usub_sat_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB113_1 ; GFX90A-NEXT: .LBB113_2: ; %atomicrmw.end @@ -5985,10 +5922,9 @@ define void @global_atomic_usub_sat_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 s[2:3], 
exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB113_1 ; GFX950-NEXT: .LBB113_2: ; %atomicrmw.end @@ -6027,10 +5963,9 @@ define void @global_atomic_usub_sat_i64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB114_1 ; GFX90A-NEXT: .LBB114_2: ; %atomicrmw.end @@ -6063,10 +5998,9 @@ define void @global_atomic_usub_sat_i64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB114_1 ; GFX950-NEXT: .LBB114_2: ; %atomicrmw.end @@ -6176,10 +6110,9 @@ define void @global_atomic_fsub_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: 
s_cbranch_execnz .LBB117_1 ; GFX90A-NEXT: .LBB117_2: ; %atomicrmw.end @@ -6209,10 +6142,9 @@ define void @global_atomic_fsub_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB117_1 ; GFX950-NEXT: .LBB117_2: ; %atomicrmw.end @@ -6247,10 +6179,9 @@ define void @global_atomic_fsub_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB118_1 ; GFX90A-NEXT: .LBB118_2: ; %atomicrmw.end @@ -6278,10 +6209,9 @@ define void @global_atomic_fsub_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB118_1 ; GFX950-NEXT: .LBB118_2: ; %atomicrmw.end @@ -6318,10 +6248,9 @@ define void @global_atomic_fmax_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 
s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB119_1 ; GFX90A-NEXT: .LBB119_2: ; %atomicrmw.end @@ -6353,10 +6282,9 @@ define void @global_atomic_fmax_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB119_1 ; GFX950-NEXT: .LBB119_2: ; %atomicrmw.end @@ -6393,10 +6321,9 @@ define void @global_atomic_fmax_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB120_1 ; GFX90A-NEXT: .LBB120_2: ; %atomicrmw.end @@ -6426,10 +6353,9 @@ define void @global_atomic_fmax_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB120_1 ; GFX950-NEXT: .LBB120_2: ; %atomicrmw.end @@ -6466,10 
+6392,9 @@ define void @global_atomic_fmin_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB121_1 ; GFX90A-NEXT: .LBB121_2: ; %atomicrmw.end @@ -6501,10 +6426,9 @@ define void @global_atomic_fmin_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB121_1 ; GFX950-NEXT: .LBB121_2: ; %atomicrmw.end @@ -6541,10 +6465,9 @@ define void @global_atomic_fmin_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB122_1 ; GFX90A-NEXT: .LBB122_2: ; %atomicrmw.end @@ -6574,10 +6497,9 @@ define void @global_atomic_fmin_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: 
s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB122_1 ; GFX950-NEXT: .LBB122_2: ; %atomicrmw.end @@ -6615,10 +6537,9 @@ define void @global_atomic_fmaximum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB123_1 ; GFX90A-NEXT: .LBB123_2: ; %atomicrmw.end @@ -6649,10 +6570,9 @@ define void @global_atomic_fmaximum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB123_1 ; GFX950-NEXT: .LBB123_2: ; %atomicrmw.end @@ -6690,10 +6610,9 @@ define void @global_atomic_fmaximum_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB124_1 ; GFX90A-NEXT: .LBB124_2: ; %atomicrmw.end @@ -6722,10 +6641,9 @@ define void 
@global_atomic_fmaximum_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB124_1 ; GFX950-NEXT: .LBB124_2: ; %atomicrmw.end @@ -6763,10 +6681,9 @@ define void @global_atomic_fminimum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB125_1 ; GFX90A-NEXT: .LBB125_2: ; %atomicrmw.end @@ -6797,10 +6714,9 @@ define void @global_atomic_fminimum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB125_1 ; GFX950-NEXT: .LBB125_2: ; %atomicrmw.end @@ -6838,10 +6754,9 @@ define void @global_atomic_fminimum_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 
s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB126_1 ; GFX90A-NEXT: .LBB126_2: ; %atomicrmw.end @@ -6870,10 +6785,9 @@ define void @global_atomic_fminimum_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB126_1 ; GFX950-NEXT: .LBB126_2: ; %atomicrmw.end @@ -6988,10 +6902,9 @@ define void @global_atomic_fsub_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB129_1 ; GFX90A-NEXT: .LBB129_2: ; %atomicrmw.end @@ -7024,10 +6937,9 @@ define void @global_atomic_fsub_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB129_1 ; GFX950-NEXT: .LBB129_2: ; %atomicrmw.end @@ -7063,10 +6975,9 @@ define void 
@global_atomic_fsub_f64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB130_1 ; GFX90A-NEXT: .LBB130_2: ; %atomicrmw.end @@ -7095,10 +7006,9 @@ define void @global_atomic_fsub_f64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB130_1 ; GFX950-NEXT: .LBB130_2: ; %atomicrmw.end @@ -7288,10 +7198,9 @@ define void @global_atomic_fmaximum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB135_1 ; GFX90A-NEXT: .LBB135_2: ; %atomicrmw.end @@ -7329,10 +7238,9 @@ define void @global_atomic_fmaximum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; 
GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB135_1 ; GFX950-NEXT: .LBB135_2: ; %atomicrmw.end @@ -7372,10 +7280,9 @@ define void @global_atomic_fmaximum_f64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB136_1 ; GFX90A-NEXT: .LBB136_2: ; %atomicrmw.end @@ -7409,10 +7316,9 @@ define void @global_atomic_fmaximum_f64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB136_1 ; GFX950-NEXT: .LBB136_2: ; %atomicrmw.end @@ -7452,10 +7358,9 @@ define void @global_atomic_fminimum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge 
; GFX90A-NEXT: s_cbranch_execnz .LBB137_1 ; GFX90A-NEXT: .LBB137_2: ; %atomicrmw.end @@ -7493,10 +7398,9 @@ define void @global_atomic_fminimum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB137_1 ; GFX950-NEXT: .LBB137_2: ; %atomicrmw.end @@ -7536,10 +7440,9 @@ define void @global_atomic_fminimum_f64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB138_1 ; GFX90A-NEXT: .LBB138_2: ; %atomicrmw.end @@ -7573,10 +7476,9 @@ define void @global_atomic_fminimum_f64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB138_1 ; GFX950-NEXT: .LBB138_2: ; %atomicrmw.end @@ -7686,10 +7588,9 @@ define void @global_atomic_fsub_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 
1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB141_1 ; GFX90A-NEXT: .LBB141_2: ; %atomicrmw.end @@ -7720,10 +7621,9 @@ define void @global_atomic_fsub_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB141_1 ; GFX950-NEXT: .LBB141_2: ; %atomicrmw.end @@ -7758,10 +7658,9 @@ define void @global_atomic_fsub_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB142_1 ; GFX90A-NEXT: .LBB142_2: ; %atomicrmw.end @@ -7790,10 +7689,9 @@ define void @global_atomic_fsub_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: 
s_cbranch_execnz .LBB142_1 ; GFX950-NEXT: .LBB142_2: ; %atomicrmw.end @@ -7830,10 +7728,9 @@ define void @global_atomic_fmax_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB143_1 ; GFX90A-NEXT: .LBB143_2: ; %atomicrmw.end @@ -7867,10 +7764,9 @@ define void @global_atomic_fmax_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB143_1 ; GFX950-NEXT: .LBB143_2: ; %atomicrmw.end @@ -7907,10 +7803,9 @@ define void @global_atomic_fmax_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB144_1 ; GFX90A-NEXT: .LBB144_2: ; %atomicrmw.end @@ -7942,10 +7837,9 @@ define void @global_atomic_fmax_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: 
s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB144_1 ; GFX950-NEXT: .LBB144_2: ; %atomicrmw.end @@ -7982,10 +7876,9 @@ define void @global_atomic_fmin_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB145_1 ; GFX90A-NEXT: .LBB145_2: ; %atomicrmw.end @@ -8019,10 +7912,9 @@ define void @global_atomic_fmin_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB145_1 ; GFX950-NEXT: .LBB145_2: ; %atomicrmw.end @@ -8059,10 +7951,9 @@ define void @global_atomic_fmin_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB146_1 ; GFX90A-NEXT: 
.LBB146_2: ; %atomicrmw.end @@ -8094,10 +7985,9 @@ define void @global_atomic_fmin_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB146_1 ; GFX950-NEXT: .LBB146_2: ; %atomicrmw.end @@ -8139,10 +8029,9 @@ define void @global_atomic_fmaximum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB147_1 ; GFX90A-NEXT: .LBB147_2: ; %atomicrmw.end @@ -8173,10 +8062,9 @@ define void @global_atomic_fmaximum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB147_1 ; GFX950-NEXT: .LBB147_2: ; %atomicrmw.end @@ -8218,10 +8106,9 @@ define void @global_atomic_fmaximum_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; 
GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB148_1 ; GFX90A-NEXT: .LBB148_2: ; %atomicrmw.end @@ -8250,10 +8137,9 @@ define void @global_atomic_fmaximum_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB148_1 ; GFX950-NEXT: .LBB148_2: ; %atomicrmw.end @@ -8295,10 +8181,9 @@ define void @global_atomic_fminimum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB149_1 ; GFX90A-NEXT: .LBB149_2: ; %atomicrmw.end @@ -8329,10 +8214,9 @@ define void @global_atomic_fminimum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB149_1 ; GFX950-NEXT: .LBB149_2: ; %atomicrmw.end @@ -8374,10 +8258,9 
@@ define void @global_atomic_fminimum_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB150_1 ; GFX90A-NEXT: .LBB150_2: ; %atomicrmw.end @@ -8406,10 +8289,9 @@ define void @global_atomic_fminimum_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB150_1 ; GFX950-NEXT: .LBB150_2: ; %atomicrmw.end @@ -8466,10 +8348,9 @@ define void @global_atomic_fadd_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB151_1 ; GFX90A-NEXT: .LBB151_2: ; %atomicrmw.end @@ -8538,10 +8419,9 @@ define void @global_atomic_fadd_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB152_1 ; GFX90A-NEXT: .LBB152_2: ; %atomicrmw.end @@ -8607,10 +8487,9 @@ define void @global_atomic_fsub_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB153_1 ; GFX90A-NEXT: .LBB153_2: ; %atomicrmw.end @@ -8647,10 +8526,9 @@ define void @global_atomic_fsub_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB153_1 ; GFX950-NEXT: .LBB153_2: ; %atomicrmw.end @@ -8703,10 +8581,9 @@ define void @global_atomic_fsub_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB154_1 ; GFX90A-NEXT: .LBB154_2: ; %atomicrmw.end @@ -8741,10 +8618,9 @@ define void 
@global_atomic_fsub_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB154_1 ; GFX950-NEXT: .LBB154_2: ; %atomicrmw.end @@ -8797,10 +8673,9 @@ define void @global_atomic_fmax_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB155_1 ; GFX90A-NEXT: .LBB155_2: ; %atomicrmw.end @@ -8837,10 +8712,9 @@ define void @global_atomic_fmax_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB155_1 ; GFX950-NEXT: .LBB155_2: ; %atomicrmw.end @@ -8893,10 +8767,9 @@ define void @global_atomic_fmax_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], 
vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB156_1 ; GFX90A-NEXT: .LBB156_2: ; %atomicrmw.end @@ -8931,10 +8804,9 @@ define void @global_atomic_fmax_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB156_1 ; GFX950-NEXT: .LBB156_2: ; %atomicrmw.end @@ -8987,10 +8859,9 @@ define void @global_atomic_fmin_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB157_1 ; GFX90A-NEXT: .LBB157_2: ; %atomicrmw.end @@ -9027,10 +8898,9 @@ define void @global_atomic_fmin_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB157_1 ; GFX950-NEXT: .LBB157_2: ; %atomicrmw.end @@ -9083,10 +8953,9 @@ define void @global_atomic_fmin_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; 
GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB158_1 ; GFX90A-NEXT: .LBB158_2: ; %atomicrmw.end @@ -9121,10 +8990,9 @@ define void @global_atomic_fmin_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB158_1 ; GFX950-NEXT: .LBB158_2: ; %atomicrmw.end @@ -9182,10 +9050,9 @@ define void @global_atomic_fmaximum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB159_1 ; GFX90A-NEXT: .LBB159_2: ; %atomicrmw.end @@ -9222,10 +9089,9 @@ define void @global_atomic_fmaximum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; 
GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB159_1 ; GFX950-NEXT: .LBB159_2: ; %atomicrmw.end @@ -9283,10 +9149,9 @@ define void @global_atomic_fmaximum_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB160_1 ; GFX90A-NEXT: .LBB160_2: ; %atomicrmw.end @@ -9321,10 +9186,9 @@ define void @global_atomic_fmaximum_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB160_1 ; GFX950-NEXT: .LBB160_2: ; %atomicrmw.end @@ -9382,10 +9246,9 @@ define void @global_atomic_fminimum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB161_1 ; GFX90A-NEXT: .LBB161_2: ; %atomicrmw.end @@ -9422,10 +9285,9 @@ define void @global_atomic_fminimum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 
; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB161_1 ; GFX950-NEXT: .LBB161_2: ; %atomicrmw.end @@ -9483,10 +9345,9 @@ define void @global_atomic_fminimum_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB162_1 ; GFX90A-NEXT: .LBB162_2: ; %atomicrmw.end @@ -9521,10 +9382,9 @@ define void @global_atomic_fminimum_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX950-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB162_1 ; GFX950-NEXT: .LBB162_2: ; %atomicrmw.end @@ -9865,10 +9725,9 @@ define void @global_atomic_nand_i32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: 
s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB171_1 ; GFX90A-NEXT: .LBB171_2: ; %atomicrmw.end @@ -9900,10 +9759,9 @@ define void @global_atomic_nand_i32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB171_1 ; GFX950-NEXT: .LBB171_2: ; %atomicrmw.end @@ -9940,10 +9798,9 @@ define void @global_atomic_nand_i32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB172_1 ; GFX90A-NEXT: .LBB172_2: ; %atomicrmw.end @@ -9973,10 +9830,9 @@ define void @global_atomic_nand_i32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB172_1 ; GFX950-NEXT: .LBB172_2: ; %atomicrmw.end @@ -10614,10 +10470,9 @@ define void @global_atomic_usub_cond_i32_saddr_ret_a_a(ptr addrspace(1) inreg %p ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; 
GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB189_1 ; GFX90A-NEXT: .LBB189_2: ; %atomicrmw.end @@ -10651,10 +10506,9 @@ define void @global_atomic_usub_cond_i32_saddr_ret_a_a(ptr addrspace(1) inreg %p ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB189_1 ; GFX950-NEXT: .LBB189_2: ; %atomicrmw.end @@ -10692,10 +10546,9 @@ define void @global_atomic_usub_cond_i32_saddr_ret_av_av(ptr addrspace(1) inreg ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB190_1 ; GFX90A-NEXT: .LBB190_2: ; %atomicrmw.end @@ -10727,10 +10580,9 @@ define void @global_atomic_usub_cond_i32_saddr_ret_av_av(ptr addrspace(1) inreg ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; 
GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB190_1 ; GFX950-NEXT: .LBB190_2: ; %atomicrmw.end @@ -10766,10 +10618,9 @@ define void @global_atomic_usub_sat_i32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB191_1 ; GFX90A-NEXT: .LBB191_2: ; %atomicrmw.end @@ -10801,10 +10652,9 @@ define void @global_atomic_usub_sat_i32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB191_1 ; GFX950-NEXT: .LBB191_2: ; %atomicrmw.end @@ -10840,10 +10690,9 @@ define void @global_atomic_usub_sat_i32_saddr_ret_av_av(ptr addrspace(1) inreg % ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB192_1 ; GFX90A-NEXT: .LBB192_2: ; %atomicrmw.end @@ -10873,10 +10722,9 @@ define void @global_atomic_usub_sat_i32_saddr_ret_av_av(ptr addrspace(1) inreg % ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: 
v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB192_1 ; GFX950-NEXT: .LBB192_2: ; %atomicrmw.end @@ -11232,10 +11080,9 @@ define void @global_atomic_nand_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB201_1 ; GFX90A-NEXT: .LBB201_2: ; %atomicrmw.end @@ -11271,10 +11118,9 @@ define void @global_atomic_nand_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB201_1 ; GFX950-NEXT: .LBB201_2: ; %atomicrmw.end @@ -11314,10 +11160,9 @@ define void @global_atomic_nand_i64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], 
vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB202_1 ; GFX90A-NEXT: .LBB202_2: ; %atomicrmw.end @@ -11349,10 +11194,9 @@ define void @global_atomic_nand_i64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB202_1 ; GFX950-NEXT: .LBB202_2: ; %atomicrmw.end @@ -12017,10 +11861,9 @@ define void @global_atomic_usub_cond_i64_saddr_ret_a_a(ptr addrspace(1) inreg %p ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB219_1 ; GFX90A-NEXT: .LBB219_2: ; %atomicrmw.end @@ -12059,10 +11902,9 @@ define void @global_atomic_usub_cond_i64_saddr_ret_a_a(ptr addrspace(1) inreg %p ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB219_1 ; GFX950-NEXT: .LBB219_2: ; %atomicrmw.end @@ -12103,10 +11945,9 @@ define 
void @global_atomic_usub_cond_i64_saddr_ret_av_av(ptr addrspace(1) inreg ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB220_1 ; GFX90A-NEXT: .LBB220_2: ; %atomicrmw.end @@ -12141,10 +11982,9 @@ define void @global_atomic_usub_cond_i64_saddr_ret_av_av(ptr addrspace(1) inreg ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB220_1 ; GFX950-NEXT: .LBB220_2: ; %atomicrmw.end @@ -12184,10 +12024,9 @@ define void @global_atomic_usub_sat_i64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB221_1 ; GFX90A-NEXT: .LBB221_2: ; %atomicrmw.end @@ -12225,10 +12064,9 @@ define void @global_atomic_usub_sat_i64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: 
s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB221_1 ; GFX950-NEXT: .LBB221_2: ; %atomicrmw.end @@ -12268,10 +12106,9 @@ define void @global_atomic_usub_sat_i64_saddr_ret_av_av(ptr addrspace(1) inreg % ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB222_1 ; GFX90A-NEXT: .LBB222_2: ; %atomicrmw.end @@ -12305,10 +12142,9 @@ define void @global_atomic_usub_sat_i64_saddr_ret_av_av(ptr addrspace(1) inreg % ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB222_1 ; GFX950-NEXT: .LBB222_2: ; %atomicrmw.end @@ -12423,10 +12259,9 @@ define void @global_atomic_fsub_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent 
control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB225_1 ; GFX90A-NEXT: .LBB225_2: ; %atomicrmw.end @@ -12457,10 +12292,9 @@ define void @global_atomic_fsub_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB225_1 ; GFX950-NEXT: .LBB225_2: ; %atomicrmw.end @@ -12496,10 +12330,9 @@ define void @global_atomic_fsub_f32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB226_1 ; GFX90A-NEXT: .LBB226_2: ; %atomicrmw.end @@ -12528,10 +12361,9 @@ define void @global_atomic_fsub_f32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB226_1 ; GFX950-NEXT: .LBB226_2: ; %atomicrmw.end @@ -12569,10 +12401,9 @@ define void @global_atomic_fmax_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; 
GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB227_1 ; GFX90A-NEXT: .LBB227_2: ; %atomicrmw.end @@ -12605,10 +12436,9 @@ define void @global_atomic_fmax_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB227_1 ; GFX950-NEXT: .LBB227_2: ; %atomicrmw.end @@ -12646,10 +12476,9 @@ define void @global_atomic_fmax_f32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB228_1 ; GFX90A-NEXT: .LBB228_2: ; %atomicrmw.end @@ -12680,10 +12509,9 @@ define void @global_atomic_fmax_f32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; 
GFX950-NEXT: s_cbranch_execnz .LBB228_1 ; GFX950-NEXT: .LBB228_2: ; %atomicrmw.end @@ -12721,10 +12549,9 @@ define void @global_atomic_fmin_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB229_1 ; GFX90A-NEXT: .LBB229_2: ; %atomicrmw.end @@ -12757,10 +12584,9 @@ define void @global_atomic_fmin_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB229_1 ; GFX950-NEXT: .LBB229_2: ; %atomicrmw.end @@ -12798,10 +12624,9 @@ define void @global_atomic_fmin_f32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB230_1 ; GFX90A-NEXT: .LBB230_2: ; %atomicrmw.end @@ -12832,10 +12657,9 @@ define void @global_atomic_fmin_f32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: 
v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB230_1 ; GFX950-NEXT: .LBB230_2: ; %atomicrmw.end @@ -12874,10 +12698,9 @@ define void @global_atomic_fmaximum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB231_1 ; GFX90A-NEXT: .LBB231_2: ; %atomicrmw.end @@ -12909,10 +12732,9 @@ define void @global_atomic_fmaximum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB231_1 ; GFX950-NEXT: .LBB231_2: ; %atomicrmw.end @@ -12951,10 +12773,9 @@ define void @global_atomic_fmaximum_f32_saddr_ret_av_av(ptr addrspace(1) inreg % ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow 
edge ; GFX90A-NEXT: s_cbranch_execnz .LBB232_1 ; GFX90A-NEXT: .LBB232_2: ; %atomicrmw.end @@ -12984,10 +12805,9 @@ define void @global_atomic_fmaximum_f32_saddr_ret_av_av(ptr addrspace(1) inreg % ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB232_1 ; GFX950-NEXT: .LBB232_2: ; %atomicrmw.end @@ -13026,10 +12846,9 @@ define void @global_atomic_fminimum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB233_1 ; GFX90A-NEXT: .LBB233_2: ; %atomicrmw.end @@ -13061,10 +12880,9 @@ define void @global_atomic_fminimum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB233_1 ; GFX950-NEXT: .LBB233_2: ; %atomicrmw.end @@ -13103,10 +12921,9 @@ define void @global_atomic_fminimum_f32_saddr_ret_av_av(ptr addrspace(1) inreg % ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: 
v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB234_1 ; GFX90A-NEXT: .LBB234_2: ; %atomicrmw.end @@ -13136,10 +12953,9 @@ define void @global_atomic_fminimum_f32_saddr_ret_av_av(ptr addrspace(1) inreg % ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB234_1 ; GFX950-NEXT: .LBB234_2: ; %atomicrmw.end @@ -13258,10 +13074,9 @@ define void @global_atomic_fsub_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB237_1 ; GFX90A-NEXT: .LBB237_2: ; %atomicrmw.end @@ -13295,10 +13110,9 @@ define void @global_atomic_fsub_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; 
divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB237_1 ; GFX950-NEXT: .LBB237_2: ; %atomicrmw.end @@ -13335,10 +13149,9 @@ define void @global_atomic_fsub_f64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB238_1 ; GFX90A-NEXT: .LBB238_2: ; %atomicrmw.end @@ -13368,10 +13181,9 @@ define void @global_atomic_fsub_f64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB238_1 ; GFX950-NEXT: .LBB238_2: ; %atomicrmw.end @@ -13568,10 +13380,9 @@ define void @global_atomic_fmaximum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB243_1 ; GFX90A-NEXT: .LBB243_2: ; %atomicrmw.end @@ -13610,10 +13421,9 @@ define void @global_atomic_fmaximum_f64_saddr_ret_a_a(ptr 
addrspace(1) inreg %pt ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB243_1 ; GFX950-NEXT: .LBB243_2: ; %atomicrmw.end @@ -13654,10 +13464,9 @@ define void @global_atomic_fmaximum_f64_saddr_ret_av_av(ptr addrspace(1) inreg % ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB244_1 ; GFX90A-NEXT: .LBB244_2: ; %atomicrmw.end @@ -13692,10 +13501,9 @@ define void @global_atomic_fmaximum_f64_saddr_ret_av_av(ptr addrspace(1) inreg % ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB244_1 ; GFX950-NEXT: .LBB244_2: ; %atomicrmw.end @@ -13736,10 +13544,9 @@ define void @global_atomic_fminimum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], 
v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB245_1 ; GFX90A-NEXT: .LBB245_2: ; %atomicrmw.end @@ -13778,10 +13585,9 @@ define void @global_atomic_fminimum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB245_1 ; GFX950-NEXT: .LBB245_2: ; %atomicrmw.end @@ -13822,10 +13628,9 @@ define void @global_atomic_fminimum_f64_saddr_ret_av_av(ptr addrspace(1) inreg % ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB246_1 ; GFX90A-NEXT: .LBB246_2: ; %atomicrmw.end @@ -13860,10 +13665,9 @@ define void @global_atomic_fminimum_f64_saddr_ret_av_av(ptr addrspace(1) inreg % ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB246_1 ; 
GFX950-NEXT: .LBB246_2: ; %atomicrmw.end @@ -13978,10 +13782,9 @@ define void @global_atomic_fsub_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB249_1 ; GFX90A-NEXT: .LBB249_2: ; %atomicrmw.end @@ -14013,10 +13816,9 @@ define void @global_atomic_fsub_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB249_1 ; GFX950-NEXT: .LBB249_2: ; %atomicrmw.end @@ -14052,10 +13854,9 @@ define void @global_atomic_fsub_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB250_1 ; GFX90A-NEXT: .LBB250_2: ; %atomicrmw.end @@ -14085,10 +13886,9 @@ define void @global_atomic_fsub_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg %pt ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: 
s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB250_1 ; GFX950-NEXT: .LBB250_2: ; %atomicrmw.end @@ -14126,10 +13926,9 @@ define void @global_atomic_fmax_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB251_1 ; GFX90A-NEXT: .LBB251_2: ; %atomicrmw.end @@ -14164,10 +13963,9 @@ define void @global_atomic_fmax_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB251_1 ; GFX950-NEXT: .LBB251_2: ; %atomicrmw.end @@ -14205,10 +14003,9 @@ define void @global_atomic_fmax_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB252_1 
; GFX90A-NEXT: .LBB252_2: ; %atomicrmw.end @@ -14241,10 +14038,9 @@ define void @global_atomic_fmax_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg %pt ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB252_1 ; GFX950-NEXT: .LBB252_2: ; %atomicrmw.end @@ -14282,10 +14078,9 @@ define void @global_atomic_fmin_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB253_1 ; GFX90A-NEXT: .LBB253_2: ; %atomicrmw.end @@ -14320,10 +14115,9 @@ define void @global_atomic_fmin_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB253_1 ; GFX950-NEXT: .LBB253_2: ; %atomicrmw.end @@ -14361,10 +14155,9 @@ define void @global_atomic_fmin_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: 
s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB254_1 ; GFX90A-NEXT: .LBB254_2: ; %atomicrmw.end @@ -14397,10 +14190,9 @@ define void @global_atomic_fmin_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg %pt ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB254_1 ; GFX950-NEXT: .LBB254_2: ; %atomicrmw.end @@ -14443,10 +14235,9 @@ define void @global_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg % ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB255_1 ; GFX90A-NEXT: .LBB255_2: ; %atomicrmw.end @@ -14478,10 +14269,9 @@ define void @global_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg % ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB255_1 ; GFX950-NEXT: 
.LBB255_2: ; %atomicrmw.end @@ -14524,10 +14314,9 @@ define void @global_atomic_fmaximum_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB256_1 ; GFX90A-NEXT: .LBB256_2: ; %atomicrmw.end @@ -14557,10 +14346,9 @@ define void @global_atomic_fmaximum_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB256_1 ; GFX950-NEXT: .LBB256_2: ; %atomicrmw.end @@ -14603,10 +14391,9 @@ define void @global_atomic_fminimum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg % ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB257_1 ; GFX90A-NEXT: .LBB257_2: ; %atomicrmw.end @@ -14638,10 +14425,9 @@ define void @global_atomic_fminimum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg % ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], 
exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB257_1 ; GFX950-NEXT: .LBB257_2: ; %atomicrmw.end @@ -14684,10 +14470,9 @@ define void @global_atomic_fminimum_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB258_1 ; GFX90A-NEXT: .LBB258_2: ; %atomicrmw.end @@ -14717,10 +14502,9 @@ define void @global_atomic_fminimum_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB258_1 ; GFX950-NEXT: .LBB258_2: ; %atomicrmw.end @@ -14778,10 +14562,9 @@ define void @global_atomic_fadd_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB259_1 ; GFX90A-NEXT: 
.LBB259_2: ; %atomicrmw.end @@ -14852,10 +14635,9 @@ define void @global_atomic_fadd_v2bf16_saddr_ret_av_av(ptr addrspace(1) inreg %p ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB260_1 ; GFX90A-NEXT: .LBB260_2: ; %atomicrmw.end @@ -14923,10 +14705,9 @@ define void @global_atomic_fsub_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB261_1 ; GFX90A-NEXT: .LBB261_2: ; %atomicrmw.end @@ -14964,10 +14745,9 @@ define void @global_atomic_fsub_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB261_1 ; GFX950-NEXT: .LBB261_2: ; %atomicrmw.end @@ -15021,10 +14801,9 @@ define void @global_atomic_fsub_v2bf16_saddr_ret_av_av(ptr addrspace(1) inreg %p ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; 
GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB262_1 ; GFX90A-NEXT: .LBB262_2: ; %atomicrmw.end @@ -15060,10 +14839,9 @@ define void @global_atomic_fsub_v2bf16_saddr_ret_av_av(ptr addrspace(1) inreg %p ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB262_1 ; GFX950-NEXT: .LBB262_2: ; %atomicrmw.end @@ -15117,10 +14895,9 @@ define void @global_atomic_fmax_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB263_1 ; GFX90A-NEXT: .LBB263_2: ; %atomicrmw.end @@ -15158,10 +14935,9 @@ define void @global_atomic_fmax_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB263_1 ; 
GFX950-NEXT: .LBB263_2: ; %atomicrmw.end @@ -15215,10 +14991,9 @@ define void @global_atomic_fmax_v2bf16_saddr_ret_av_av(ptr addrspace(1) inreg %p ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB264_1 ; GFX90A-NEXT: .LBB264_2: ; %atomicrmw.end @@ -15254,10 +15029,9 @@ define void @global_atomic_fmax_v2bf16_saddr_ret_av_av(ptr addrspace(1) inreg %p ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB264_1 ; GFX950-NEXT: .LBB264_2: ; %atomicrmw.end @@ -15311,10 +15085,9 @@ define void @global_atomic_fmin_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB265_1 ; GFX90A-NEXT: .LBB265_2: ; %atomicrmw.end @@ -15352,10 +15125,9 @@ define void @global_atomic_fmin_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: 
s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB265_1 ; GFX950-NEXT: .LBB265_2: ; %atomicrmw.end @@ -15409,10 +15181,9 @@ define void @global_atomic_fmin_v2bf16_saddr_ret_av_av(ptr addrspace(1) inreg %p ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB266_1 ; GFX90A-NEXT: .LBB266_2: ; %atomicrmw.end @@ -15448,10 +15219,9 @@ define void @global_atomic_fmin_v2bf16_saddr_ret_av_av(ptr addrspace(1) inreg %p ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB266_1 ; GFX950-NEXT: .LBB266_2: ; %atomicrmw.end @@ -15510,10 +15280,9 @@ define void @global_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB267_1 
; GFX90A-NEXT: .LBB267_2: ; %atomicrmw.end @@ -15551,10 +15320,9 @@ define void @global_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB267_1 ; GFX950-NEXT: .LBB267_2: ; %atomicrmw.end @@ -15613,10 +15381,9 @@ define void @global_atomic_fmaximum_v2bf16_saddr_ret_av_av(ptr addrspace(1) inre ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB268_1 ; GFX90A-NEXT: .LBB268_2: ; %atomicrmw.end @@ -15652,10 +15419,9 @@ define void @global_atomic_fmaximum_v2bf16_saddr_ret_av_av(ptr addrspace(1) inre ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB268_1 ; GFX950-NEXT: .LBB268_2: ; %atomicrmw.end @@ -15714,10 +15480,9 @@ define void @global_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: 
s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB269_1 ; GFX90A-NEXT: .LBB269_2: ; %atomicrmw.end @@ -15755,10 +15520,9 @@ define void @global_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB269_1 ; GFX950-NEXT: .LBB269_2: ; %atomicrmw.end @@ -15817,10 +15581,9 @@ define void @global_atomic_fminimum_v2bf16_saddr_ret_av_av(ptr addrspace(1) inre ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB270_1 ; GFX90A-NEXT: .LBB270_2: ; %atomicrmw.end @@ -15856,10 +15619,9 @@ define void @global_atomic_fminimum_v2bf16_saddr_ret_av_av(ptr addrspace(1) inre ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX950-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX950-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX950-NEXT: s_mov_b64 exec, vcc ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execnz .LBB270_1 ; GFX950-NEXT: 
.LBB270_2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll index 10e78e09f175b..83f3bcd47cd84 100644 --- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll @@ -149,7 +149,7 @@ attributes #0 = { nounwind } ; GCN-NEXT: dynamic_stack: ; GCN-NEXT: .backend_stack_size: 0x10{{$}} ; GCN-NEXT: .lds_size: 0{{$}} -; SDAG-NEXT: .sgpr_count: 0x26{{$}} +; SDAG-NEXT: .sgpr_count: 0x28{{$}} ; GISEL-NEXT: .sgpr_count: 0x28{{$}} ; GCN: .stack_frame_size_in_bytes: 0x10{{$}} ; SDAG-NEXT: .vgpr_count: 0x2{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll index 68e3d6bd55ba9..8b5d2ac550076 100644 --- a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll @@ -8,7 +8,7 @@ ; GCN-NOT: v_lshrrev_b32 ; GCN-NOT: s_sub_u32 -; GCN: s_xor_b64 exec +; GCN: s_xor_b64 {{s\[[0-9]+:[0-9]+\]}}, vcc, exec ; GCN: s_cbranch_execz [[BB1:.LBB[0-9]+_[0-9]+]] ; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32 glc{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll b/llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll index f1d13eb763120..400c6a0609529 100644 --- a/llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll +++ b/llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll @@ -12,7 +12,7 @@ ; GCN-DAG: v_cmp_lt_i32 ; GCN: s_and_b64 -; GCN: s_mov_b64 exec +; GCN: s_and_saveexec_b64 ; GCN: s_or_b64 exec, exec ; GCN: {{[s|v]}}_cmp_eq_u32 diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll index 4862845d616b2..e6355c91f9e21 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll @@ -14,7 +14,7 @@ define i32 @divergent_lshr_and_cmp(i32 %x) { ; GCN-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 killed [[V_AND_B32_e64_]], killed 
[[S_MOV_B32_1]], implicit $exec ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 killed [[V_CMP_NE_U32_e64_]], killed [[S_MOV_B64_]], implicit-def dead $scc - ; GCN-NEXT: SI_BRCOND %bb.2, killed [[S_XOR_B64_]] + ; GCN-NEXT: SI_BRCOND %bb.2, killed [[S_XOR_B64_]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1.out.true: diff --git a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll index f75bb8c701d4b..08f1edf3736dc 100644 --- a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll +++ b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll @@ -36,10 +36,8 @@ define amdgpu_ps void @main(i32 %0, float %1) { ; ISA-NEXT: s_or_b64 exec, exec, s[4:5] ; ISA-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; ISA-NEXT: s_xor_b64 s[4:5], vcc, exec -; ISA-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; ISA-NEXT: s_and_b64 s[8:9], s[8:9], exec -; ISA-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] -; ISA-NEXT: s_mov_b64 exec, s[4:5] +; ISA-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; ISA-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; ISA-NEXT: s_mov_b64 s[4:5], 0 ; ISA-NEXT: ; divergent control-flow edge ; ISA-NEXT: s_cbranch_execz .LBB0_5 @@ -52,12 +50,10 @@ define amdgpu_ps void @main(i32 %0, float %1) { ; ISA-NEXT: ; %bb.3: ; %endif1 ; ISA-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; ISA-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; ISA-NEXT: s_or_b64 s[2:3], s[2:3], vcc -; ISA-NEXT: s_xor_b64 s[4:5], exec, s[2:3] ; ISA-NEXT: v_mov_b32_e32 v5, v2 ; ISA-NEXT: v_mov_b32_e32 v4, v3 -; ISA-NEXT: s_and_b64 s[4:5], s[4:5], exec -; ISA-NEXT: s_mov_b64 exec, s[2:3] +; ISA-NEXT: s_or_b64 s[2:3], s[2:3], vcc +; ISA-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] ; ISA-NEXT: s_mov_b64 s[2:3], 0 ; ISA-NEXT: ; divergent control-flow edge ; ISA-NEXT: s_cbranch_execz .LBB0_1 @@ -73,13 +69,14 @@ define amdgpu_ps void 
@main(i32 %0, float %1) { ; ISA-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; ISA-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 ; ISA-NEXT: v_mov_b32_e32 v1, 0 -; ISA-NEXT: s_xor_b64 exec, vcc, exec +; ISA-NEXT: s_xor_b64 s[0:1], vcc, exec +; ISA-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; ISA-NEXT: ; divergent control-flow edge ; ISA-NEXT: s_cbranch_execz .LBB0_7 ; ISA-NEXT: .LBB0_6: ; %if1 ; ISA-NEXT: v_sqrt_f32_e32 v1, v0 ; ISA-NEXT: .LBB0_7: ; %endloop -; ISA-NEXT: s_or_b64 exec, exec, vcc +; ISA-NEXT: s_or_b64 exec, exec, s[0:1] ; ISA-NEXT: exp mrt0, v1, v1, v1, v1 done vm ; ISA-NEXT: s_endpgm start: @@ -151,13 +148,11 @@ define amdgpu_ps void @i1_copy_assert(i1 %v4) { ; ISA-NEXT: .LBB1_1: ; %Flow ; ISA-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; ISA-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; ISA-NEXT: s_mov_b32 s4, 1 ; ISA-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3] ; ISA-NEXT: s_xor_b64 s[2:3], vcc, exec -; ISA-NEXT: s_xor_b64 s[6:7], exec, s[2:3] -; ISA-NEXT: s_and_b64 s[6:7], s[6:7], exec -; ISA-NEXT: s_mov_b32 s4, 1 -; ISA-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; ISA-NEXT: s_mov_b64 exec, s[2:3] +; ISA-NEXT: s_and_saveexec_b64 s[2:3], s[2:3] +; ISA-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; ISA-NEXT: ; divergent control-flow edge ; ISA-NEXT: s_cbranch_execz .LBB1_4 ; ISA-NEXT: .LBB1_2: ; %loop diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll index 33b80f161ddbc..dadfd626c3b7e 100644 --- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll @@ -1258,14 +1258,15 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; SI-NEXT: v_and_b32_e32 v4, 1, v4 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; SI-NEXT: s_mov_b32 s38, 0 -; SI-NEXT: s_xor_b64 exec, vcc, exec +; SI-NEXT: s_xor_b64 s[34:35], vcc, exec +; SI-NEXT: s_and_saveexec_b64 s[34:35], s[34:35] ; SI-NEXT: ; divergent control-flow edge ; SI-NEXT: s_cbranch_execz .LBB7_2 ; SI-NEXT: .LBB7_1: 
; %T -; SI-NEXT: s_mov_b32 s34, 0xf000 +; SI-NEXT: s_mov_b32 s40, 0xf000 ; SI-NEXT: s_mov_b32 s39, s38 ; SI-NEXT: s_mov_b64 s[36:37], s[38:39] -; SI-NEXT: s_mov_b32 s39, s34 +; SI-NEXT: s_mov_b32 s39, s40 ; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:2 glc @@ -1307,9 +1308,8 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; SI-NEXT: v_or_b32_e32 v2, v2, v10 ; SI-NEXT: v_or_b32_e32 v1, v8, v11 ; SI-NEXT: .LBB7_2: -; SI-NEXT: s_or_b64 exec, exec, vcc -; SI-NEXT: s_xor_b64 s[34:35], exec, vcc -; SI-NEXT: s_mov_b64 exec, vcc +; SI-NEXT: s_or_b64 exec, exec, s[34:35] +; SI-NEXT: s_and_saveexec_b64 s[34:35], vcc ; SI-NEXT: ; divergent control-flow edge ; SI-NEXT: s_cbranch_execz .LBB7_4 ; SI-NEXT: .LBB7_3: ; %F @@ -1401,7 +1401,8 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX9-NEXT: s_xor_b64 exec, vcc, exec +; GFX9-NEXT: s_xor_b64 s[34:35], vcc, exec +; GFX9-NEXT: s_and_saveexec_b64 s[34:35], s[34:35] ; GFX9-NEXT: ; divergent control-flow edge ; GFX9-NEXT: s_cbranch_execz .LBB7_2 ; GFX9-NEXT: .LBB7_1: ; %T @@ -1411,9 +1412,8 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 ; GFX9-NEXT: .LBB7_2: -; GFX9-NEXT: s_or_b64 exec, exec, vcc -; GFX9-NEXT: s_xor_b64 s[34:35], exec, vcc -; GFX9-NEXT: s_mov_b64 exec, vcc +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_and_saveexec_b64 s[34:35], vcc ; GFX9-NEXT: ; divergent control-flow edge ; GFX9-NEXT: s_cbranch_execz .LBB7_4 ; GFX9-NEXT: .LBB7_3: ; %F @@ -1457,9 +1457,10 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; GFX11-TRUE16-NEXT: scratch_load_u8 v4, off, s32 
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: ; divergent control-flow edge ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2 ; GFX11-TRUE16-NEXT: .LBB7_1: ; %T @@ -1468,10 +1469,8 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v[0:1], off glc dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: .LBB7_2: -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, vcc_lo -; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, vcc_lo +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: ; divergent control-flow edge ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_4 ; GFX11-TRUE16-NEXT: .LBB7_3: ; %F @@ -1506,9 +1505,10 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; GFX11-FAKE16-NEXT: scratch_load_u8 v4, off, s32 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: ; divergent control-flow edge ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2 ; GFX11-FAKE16-NEXT: .LBB7_1: ; %T @@ 
-1517,10 +1517,8 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; GFX11-FAKE16-NEXT: global_load_b128 v[0:3], v[0:1], off glc dlc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: .LBB7_2: -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, vcc_lo -; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, vcc_lo +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: ; divergent control-flow edge ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_4 ; GFX11-FAKE16-NEXT: .LBB7_3: ; %F @@ -1587,14 +1585,15 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; SI-NEXT: v_and_b32_e32 v4, 1, v4 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; SI-NEXT: s_mov_b32 s38, 0 -; SI-NEXT: s_xor_b64 exec, vcc, exec +; SI-NEXT: s_xor_b64 s[34:35], vcc, exec +; SI-NEXT: s_and_saveexec_b64 s[34:35], s[34:35] ; SI-NEXT: ; divergent control-flow edge ; SI-NEXT: s_cbranch_execz .LBB8_2 ; SI-NEXT: .LBB8_1: ; %T -; SI-NEXT: s_mov_b32 s34, 0xf000 +; SI-NEXT: s_mov_b32 s40, 0xf000 ; SI-NEXT: s_mov_b32 s39, s38 ; SI-NEXT: s_mov_b64 s[36:37], s[38:39] -; SI-NEXT: s_mov_b32 s39, s34 +; SI-NEXT: s_mov_b32 s39, s40 ; SI-NEXT: buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:2 glc @@ -1636,9 +1635,8 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; SI-NEXT: v_or_b32_e32 v2, v2, v10 ; SI-NEXT: v_or_b32_e32 v3, v3, v11 ; SI-NEXT: .LBB8_2: -; SI-NEXT: s_or_b64 exec, exec, vcc -; SI-NEXT: s_xor_b64 s[34:35], exec, vcc -; SI-NEXT: s_mov_b64 exec, vcc +; SI-NEXT: s_or_b64 exec, exec, s[34:35] +; SI-NEXT: s_and_saveexec_b64 s[34:35], vcc ; SI-NEXT: ; divergent control-flow edge ; SI-NEXT: s_cbranch_execz .LBB8_4 ; SI-NEXT: .LBB8_3: ; %F @@ -1729,7 
+1727,8 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX9-NEXT: s_xor_b64 exec, vcc, exec +; GFX9-NEXT: s_xor_b64 s[34:35], vcc, exec +; GFX9-NEXT: s_and_saveexec_b64 s[34:35], s[34:35] ; GFX9-NEXT: ; divergent control-flow edge ; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: .LBB8_1: ; %T @@ -1739,9 +1738,8 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 ; GFX9-NEXT: .LBB8_2: -; GFX9-NEXT: s_or_b64 exec, exec, vcc -; GFX9-NEXT: s_xor_b64 s[34:35], exec, vcc -; GFX9-NEXT: s_mov_b64 exec, vcc +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_and_saveexec_b64 s[34:35], vcc ; GFX9-NEXT: ; divergent control-flow edge ; GFX9-NEXT: s_cbranch_execz .LBB8_4 ; GFX9-NEXT: .LBB8_3: ; %F @@ -1783,9 +1781,10 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; GFX11-TRUE16-NEXT: scratch_load_u8 v4, off, s32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: ; divergent control-flow edge ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-TRUE16-NEXT: .LBB8_1: ; %T @@ -1794,10 +1793,8 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v[0:1], off glc dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: .LBB8_2: -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, vcc_lo -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, vcc_lo -; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, vcc_lo +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: ; divergent control-flow edge ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_4 ; GFX11-TRUE16-NEXT: .LBB8_3: ; %F @@ -1832,9 +1829,10 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; GFX11-FAKE16-NEXT: scratch_load_u8 v4, off, s32 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: ; divergent control-flow edge ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-FAKE16-NEXT: .LBB8_1: ; %T @@ -1843,10 +1841,8 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; GFX11-FAKE16-NEXT: global_load_b128 v[0:3], v[0:1], off glc dlc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: .LBB8_2: -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, vcc_lo -; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, vcc_lo +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: ; divergent control-flow edge ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_4 ; GFX11-FAKE16-NEXT: .LBB8_3: ; %F diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-sgpr32-to-vgpr16.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-sgpr32-to-vgpr16.ll index c272492f6fe10..c20181a0b66f4 100644 --- 
a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-sgpr32-to-vgpr16.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-sgpr32-to-vgpr16.ll @@ -33,7 +33,7 @@ define i32 @test_cvt_pk_fp8_f32(float %x, float %y) { ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 7 ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[REG_SEQUENCE]], killed [[S_MOV_B32_2]], implicit $exec ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; CHECK-NEXT: SI_BRCOND %bb.2, killed [[V_CMP_EQ_U32_e64_]] + ; CHECK-NEXT: SI_BRCOND %bb.2, killed [[V_CMP_EQ_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.a2: @@ -90,7 +90,7 @@ define i32 @test_cvt_pk_fp8_f32_e5m3(float %x, float %y) { ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 7 ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[REG_SEQUENCE]], killed [[S_MOV_B32_2]], implicit $exec ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; CHECK-NEXT: SI_BRCOND %bb.2, killed [[V_CMP_EQ_U32_e64_]] + ; CHECK-NEXT: SI_BRCOND %bb.2, killed [[V_CMP_EQ_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.a2: diff --git a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll index b5dabe50c21ad..fdec49ffa9179 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll @@ -20,7 +20,8 @@ define amdgpu_hs void @wwm(i32 inreg %arg, ptr addrspace(8) inreg %buffer) { ; GCN-NEXT: s_mov_b32 s0, 1 ; GCN-NEXT: .LBB0_2: ; %bb602 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, s0, v0 -; GCN-NEXT: s_xor_b64 exec, vcc, exec +; GCN-NEXT: s_xor_b64 s[0:1], vcc, exec +; GCN-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GCN-NEXT: ; divergent control-flow edge ; GCN-NEXT: 
s_cbranch_execz .LBB0_4 ; GCN-NEXT: .LBB0_3: ; %bb49 @@ -74,7 +75,8 @@ define amdgpu_hs void @strict_wwm(i32 inreg %arg, ptr addrspace(8) inreg %buffer ; GCN-NEXT: s_mov_b32 s0, 1 ; GCN-NEXT: .LBB1_2: ; %bb602 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, s0, v0 -; GCN-NEXT: s_xor_b64 exec, vcc, exec +; GCN-NEXT: s_xor_b64 s[0:1], vcc, exec +; GCN-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GCN-NEXT: ; divergent control-flow edge ; GCN-NEXT: s_cbranch_execz .LBB1_4 ; GCN-NEXT: .LBB1_3: ; %bb49 diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index 6b8ff89ede27a..c4fe1944fdc8d 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -1800,10 +1800,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 ; GFX90A-NEXT: .LBB43_2: ; %atomicrmw.end @@ -2128,10 +2127,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX90A-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX90A-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: .LBB50_2: ; %atomicrmw.end diff --git 
a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll index db1482e8a56a4..63d5aa0d4e3b0 100644 --- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -157,7 +157,8 @@ define void @void_func_byval_struct_i8_i32_ptr_value(ptr addrspace(5) byval({ i8 ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block: -; GCN: s_xor_b64 exec +; GCN: s_xor_b64 +; GCN: s_and_saveexec_b64 ; GCN: s_cbranch_exec{{n?z}} ; CI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 glc{{$}} @@ -244,7 +245,8 @@ declare void @func(ptr addrspace(5) nocapture) #0 ; stores in the middle block. ; GCN-LABEL: {{^}}undefined_stack_store_reg: -; GCN: s_xor_b64 exec +; GCN: s_xor_b64 +; GCN: s_and_saveexec_b64 ; GCN: s_cbranch_exec{{n?z}} ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset: ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset: @@ -273,7 +275,8 @@ bb5: } ; GCN-LABEL: {{^}}alloca_ptr_nonentry_block: -; GCN: s_xor_b64 exec +; GCN: s_xor_b64 +; GCN: s_and_saveexec_b64 ; GCN: s_cbranch_exec{{n?z}} ; MUBUF: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 ; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, s32 offset:4 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll index 433c1a8f2b2a6..3ce67eec004b7 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll @@ -48,7 +48,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX908-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[SI_PS_LIVE]], implicit $exec ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; GFX908-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 killed [[V_CNDMASK_B32_e64_]], killed [[S_MOV_B32_]], implicit $exec 
- ; GFX908-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_NE_U32_e64_]] + ; GFX908-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_NE_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GFX908-NEXT: S_BRANCH %bb.1 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1 (%ir-block.2): @@ -80,7 +80,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX908-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 killed [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_2]] ; GFX908-NEXT: early-clobber %0:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec ; GFX908-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_1]], implicit $exec - ; GFX908-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_NE_U32_e64_1]] + ; GFX908-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_NE_U32_e64_1]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GFX908-NEXT: S_BRANCH %bb.2 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2 (%ir-block.25): @@ -107,7 +107,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX90A_GFX942-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[SI_PS_LIVE]], implicit $exec ; GFX90A_GFX942-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; GFX90A_GFX942-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 killed [[V_CNDMASK_B32_e64_]], killed [[S_MOV_B32_]], implicit $exec - ; GFX90A_GFX942-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_NE_U32_e64_]] + ; GFX90A_GFX942-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_NE_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GFX90A_GFX942-NEXT: S_BRANCH %bb.1 ; GFX90A_GFX942-NEXT: {{ $}} ; GFX90A_GFX942-NEXT: bb.1 (%ir-block.2): @@ -139,7 +139,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX90A_GFX942-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 killed [[V_ADD_F32_e64_5]], killed 
[[S_MOV_B32_2]] ; GFX90A_GFX942-NEXT: early-clobber %0:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec ; GFX90A_GFX942-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_1]], implicit $exec - ; GFX90A_GFX942-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_NE_U32_e64_1]] + ; GFX90A_GFX942-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_NE_U32_e64_1]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GFX90A_GFX942-NEXT: S_BRANCH %bb.2 ; GFX90A_GFX942-NEXT: {{ $}} ; GFX90A_GFX942-NEXT: bb.2 (%ir-block.25): @@ -166,7 +166,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX11_GFX12-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[SI_PS_LIVE]], implicit $exec ; GFX11_GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; GFX11_GFX12-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 killed [[V_CNDMASK_B32_e64_]], killed [[S_MOV_B32_]], implicit $exec - ; GFX11_GFX12-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_NE_U32_e64_]] + ; GFX11_GFX12-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_NE_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GFX11_GFX12-NEXT: S_BRANCH %bb.1 ; GFX11_GFX12-NEXT: {{ $}} ; GFX11_GFX12-NEXT: bb.1 (%ir-block.2): @@ -192,7 +192,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_PERMLANEX16_B32_e64_]], 0, 0, implicit $mode, implicit $exec ; GFX11_GFX12-NEXT: early-clobber %0:vgpr_32 = STRICT_WWM killed [[V_ADD_F32_e64_4]], implicit $exec ; GFX11_GFX12-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 killed [[V_MBCNT_LO_U32_B32_e64_]], [[S_MOV_B32_1]], implicit $exec - ; GFX11_GFX12-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_NE_U32_e64_1]] + ; GFX11_GFX12-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_NE_U32_e64_1]], 
implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GFX11_GFX12-NEXT: S_BRANCH %bb.2 ; GFX11_GFX12-NEXT: {{ $}} ; GFX11_GFX12-NEXT: bb.2 (%ir-block.18): diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll index 55728c5d753fa..2e424af30360b 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll @@ -50,7 +50,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX90A-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[SI_PS_LIVE]], implicit $exec ; GFX90A-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; GFX90A-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 killed [[V_CNDMASK_B32_e64_]], killed [[S_MOV_B32_]], implicit $exec - ; GFX90A-NEXT: SI_BRCOND %bb.4, killed [[V_CMP_NE_U32_e64_]] + ; GFX90A-NEXT: SI_BRCOND %bb.4, killed [[V_CMP_NE_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.1 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.1 (%ir-block.2): @@ -87,7 +87,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX90A-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_1]], implicit $exec ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:av_32 = IMPLICIT_DEF ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] - ; GFX90A-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_NE_U32_e64_1]] + ; GFX90A-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_NE_U32_e64_1]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.2 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2 (%ir-block.26): @@ -130,7 +130,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX942-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed 
[[SI_PS_LIVE]], implicit $exec ; GFX942-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; GFX942-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 killed [[V_CNDMASK_B32_e64_]], killed [[S_MOV_B32_]], implicit $exec - ; GFX942-NEXT: SI_BRCOND %bb.4, killed [[V_CMP_NE_U32_e64_]] + ; GFX942-NEXT: SI_BRCOND %bb.4, killed [[V_CMP_NE_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GFX942-NEXT: S_BRANCH %bb.1 ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: bb.1 (%ir-block.2): @@ -167,7 +167,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX942-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_1]], implicit $exec ; GFX942-NEXT: [[DEF2:%[0-9]+]]:av_32 = IMPLICIT_DEF ; GFX942-NEXT: [[COPY9:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] - ; GFX942-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_NE_U32_e64_1]] + ; GFX942-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_NE_U32_e64_1]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GFX942-NEXT: S_BRANCH %bb.2 ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: bb.2 (%ir-block.26): @@ -210,7 +210,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[SI_PS_LIVE]], implicit $exec ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; GFX11-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 killed [[V_CNDMASK_B32_e64_]], killed [[S_MOV_B32_]], implicit $exec - ; GFX11-NEXT: SI_BRCOND %bb.4, killed [[V_CMP_NE_U32_e64_]] + ; GFX11-NEXT: SI_BRCOND %bb.4, killed [[V_CMP_NE_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: bb.1 (%ir-block.2): @@ -248,7 +248,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = 
V_CMP_NE_U32_e64 [[V_MBCNT_LO_U32_B32_e64_]], [[S_MOV_B32_1]], implicit $exec ; GFX11-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] - ; GFX11-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_NE_U32_e64_1]] + ; GFX11-NEXT: SI_BRCOND %bb.3, killed [[V_CMP_NE_U32_e64_1]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.2 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: bb.2 (%ir-block.23): diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll index d6e21ce9e4eaf..ad1fd1a97b9a5 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll @@ -54,7 +54,7 @@ define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(ptr addrspace(1) % ; GFX90A-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[V_CMP_EQ_U64_e64_]], implicit $exec ; GFX90A-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; GFX90A-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 killed [[V_CNDMASK_B32_e64_]], killed [[S_MOV_B32_]], implicit $exec - ; GFX90A-NEXT: SI_BRCOND %bb.1, killed [[V_CMP_NE_U32_e64_]] + ; GFX90A-NEXT: SI_BRCOND %bb.1, killed [[V_CMP_NE_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.2 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2.atomicrmw.end: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll index a7e99a876ac98..01e09886f0bb5 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll @@ -16,7 +16,7 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addr ; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GCN-NEXT: s_xor_b64 s[2:3], vcc, exec -; 
GCN-NEXT: s_mov_b64 exec, s[2:3] +; GCN-NEXT: s_and_saveexec_b64 s[2:3], s[2:3] ; GCN-NEXT: ; divergent control-flow edge ; GCN-NEXT: s_cbranch_execnz .LBB0_2 ; GCN-NEXT: .LBB0_1: diff --git a/llvm/test/CodeGen/AMDGPU/hoist-cond.ll b/llvm/test/CodeGen/AMDGPU/hoist-cond.ll index 2684f5102190c..ae5d5960bb41f 100644 --- a/llvm/test/CodeGen/AMDGPU/hoist-cond.ll +++ b/llvm/test/CodeGen/AMDGPU/hoist-cond.ll @@ -9,7 +9,8 @@ ; CHECK: v_cmp_ne_u32_e32 vcc, 0, v0 ; CHECK: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1, vcc ; CHECK: v_cmp_ne_u32_e32 vcc, 1, {{v[0-9]+}} -; CHECK: s_xor_b64 s[8:9], vcc, exec +; CHECK: s_xor_b64 s[6:7], vcc, exec +; CHECK: s_and_saveexec_b64 s[6:7], s[6:7] ; CHECK: BB0_2: define amdgpu_kernel void @hoist_cond(ptr addrspace(1) nocapture %arg, ptr addrspace(1) noalias nocapture readonly %arg1, i32 %arg3, i32 %arg4) { diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll index 1bc70eac7a837..da9156ff1d32e 100644 --- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll @@ -84,7 +84,8 @@ define amdgpu_kernel void @infinite_loop_ret(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop_ret: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; SI-NEXT: s_xor_b64 exec, vcc, exec +; SI-NEXT: s_xor_b64 s[0:1], vcc, exec +; SI-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; SI-NEXT: ; divergent control-flow edge ; SI-NEXT: s_cbranch_execz .LBB2_3 ; SI-NEXT: .LBB2_1: ; %loop.preheader @@ -316,7 +317,7 @@ define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; SI-NEXT: s_xor_b64 s[0:1], vcc, exec ; SI-NEXT: s_mov_b64 s[6:7], -1 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: s_mov_b64 exec, s[0:1] +; SI-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; SI-NEXT: ; divergent control-flow edge ; SI-NEXT: s_cbranch_execz .LBB6_7 ; SI-NEXT: .LBB6_1: ; %outer_loop.preheader @@ -353,10 +354,8 @@ define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; SI-NEXT: ; 
in Loop: Header=BB6_5 Depth=2 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_xor_b64 s[10:11], exec, vcc -; SI-NEXT: s_and_b64 s[10:11], s[10:11], exec +; SI-NEXT: s_and_saveexec_b64 s[10:11], vcc ; SI-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; SI-NEXT: s_mov_b64 exec, vcc ; SI-NEXT: ; divergent control-flow edge ; SI-NEXT: s_cbranch_execnz .LBB6_5 ; SI-NEXT: s_branch .LBB6_3 @@ -450,10 +449,8 @@ define amdgpu_kernel void @infinite_loop_nest_ret_callbr(ptr addrspace(1) %out) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: s_xor_b64 s[8:9], vcc, exec ; SI-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; SI-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec -; SI-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] ; SI-NEXT: ; divergent control-flow edge ; SI-NEXT: s_cbranch_execnz .LBB7_2 ; SI-NEXT: .LBB7_8: ; Inline asm indirect target diff --git a/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll index 2154759e29f9c..4eee771b3a56c 100644 --- a/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll +++ b/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll @@ -73,7 +73,7 @@ define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) { ; DAGISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec ; DAGISEL-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[COPY1]], [[COPY]], implicit $exec - ; DAGISEL-NEXT: SI_BRCOND %bb.2, killed [[V_CMP_NE_U32_e64_]] + ; DAGISEL-NEXT: SI_BRCOND %bb.2, killed [[V_CMP_NE_U32_e64_]], implicit-def dead $exec, implicit-def dead $vcc_lo, implicit $exec ; DAGISEL-NEXT: 
S_BRANCH %bb.1 ; DAGISEL-NEXT: {{ $}} ; DAGISEL-NEXT: bb.1.if.then: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll index fc63854845120..3f4102e9dbb72 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll @@ -235,38 +235,39 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1 define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %dummy) nounwind { ; GCN-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[0:1], s[10:11] -; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[3:4], s[0:3], 0 addr64 -; GCN-NEXT: buffer_load_dword v3, v[3:4], s[0:3], 0 addr64 offset:8 +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[3:4], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dword v3, v[3:4], s[4:7], 0 addr64 offset:8 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GCN-NEXT: s_mov_b64 s[0:1], 0 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] -; GCN-NEXT: s_xor_b64 exec, vcc, exec +; GCN-NEXT: s_mov_b64 s[2:3], 0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[2:3] +; GCN-NEXT: s_xor_b64 s[2:3], vcc, exec +; GCN-NEXT: s_and_saveexec_b64 s[2:3], s[2:3] ; GCN-NEXT: ; divergent control-flow edge ; GCN-NEXT: s_cbranch_execz .LBB9_2 ; GCN-NEXT: .LBB9_1: ; %bb -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s7, s3 -; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_mov_b32 s10, -1 +; 
GCN-NEXT: s_mov_b32 s11, s7 +; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GCN-NEXT: .LBB9_2: ; %exit -; GCN-NEXT: s_or_b64 exec, exec, vcc +; GCN-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_div_fmas_f32 v0, v1, v2, v3 -; GCN-NEXT: s_mov_b32 s11, s3 -; GCN-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:8 +; GCN-NEXT: s_mov_b32 s3, s7 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 ; GCN-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll index 3daa1337d0abe..da4710d2c9778 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll @@ -18,7 +18,8 @@ define amdgpu_cs float @ds_ordered_swap(ptr addrspace(2) inreg %gds, i32 %value) ; FUNC-LABEL: {{^}}ds_ordered_swap_conditional: ; GCN-SDAG: v_cmp_eq_u32_e32 vcc, 0, v[[VALUE:[0-9]+]] -; GCN-SDAG: s_xor_b64 exec, vcc, exec +; GCN-SDAG: s_xor_b64 [[SDAG_SAVED:s\[[0-9]+:[0-9]+\]]], vcc, exec +; GCN-SDAG: s_and_saveexec_b64 [[SDAG_SAVED]], [[SDAG_SAVED]] ; GCN-GISEL: v_cmp_ne_u32_e32 vcc, 0, v[[VALUE:[0-9]+]] ; GCN-GISEL: s_and_saveexec_b64 s[[SAVED:\[[0-9]+:[0-9]+\]]], vcc ; // We have to use s_cbranch, because ds_ordered_count has side effects with EXEC=0 @@ -30,7 +31,7 @@ define amdgpu_cs float @ds_ordered_swap(ptr addrspace(2) inreg %gds, i32 %value) ; GCN-NEXT: [[BB]]: ; // Wait for expcnt(0) before modifying EXEC ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-SDAG-NEXT: s_or_b64 exec, exec, vcc +; GCN-SDAG-NEXT: 
s_or_b64 exec, exec, [[SDAG_SAVED]] ; GCN-GISEL-NEXT: s_or_b64 exec, exec, s[[SAVED]] define amdgpu_cs float @ds_ordered_swap_conditional(ptr addrspace(2) inreg %gds, i32 %value) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll index 39d686db65052..ea1eeef012903 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll @@ -148,7 +148,8 @@ define amdgpu_cs void @inverse_ballot_branch(i32 inreg %s0_1, i32 inreg %s2, ptr ; SDAG: ; %bb.0: ; %entry ; SDAG-NEXT: v_mov_b32_e32 v2, s0 ; SDAG-NEXT: s_and_b32 s1, exec_lo, s1 -; SDAG-NEXT: s_xor_b32 exec_lo, s1, exec_lo +; SDAG-NEXT: s_xor_b32 s1, s1, exec_lo +; SDAG-NEXT: s_and_saveexec_b32 s1, s1 ; SDAG-NEXT: ; divergent control-flow edge ; SDAG-NEXT: s_cbranch_execz .LBB6_2 ; SDAG-NEXT: .LBB6_1: ; %if diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll index a3dd995ba9e9d..ad2832ed2690f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll @@ -35,7 +35,8 @@ define amdgpu_ps float @test2() #0 { ; CHECK-LABEL: {{^}}test3: ; CHECK-SDAG: s_wqm_b64 exec, exec -; CHECK-SDAG: s_xor_saveexec_b64 +; CHECK-SDAG: s_xor_b64 [[EXEC_COPY0:s\[[0-9]+:[0-9]+\]]], s[0:1], exec +; CHECK-SDAG: s_and_saveexec_b64 [[EXEC_COPY0]], [[EXEC_COPY0]] ; CHECK-GISEL: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec ; CHECK-GISEL: s_wqm_b64 exec, exec ; CHECK-GISEL-DAG: s_mov_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], exec diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sponentry.ll b/llvm/test/CodeGen/AMDGPU/llvm.sponentry.ll index f0993eec290eb..00fb175e694ce 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sponentry.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sponentry.ll @@ -66,23 +66,25 @@ define amdgpu_cs ptr addrspace(5) @sponentry_cs_no_dvgpr(i32 %val) #2 { define amdgpu_cs ptr 
addrspace(5) @sponentry_cs_dvgpr_control_flow(i32 %val, ptr addrspace(5) %ptr) #0 { ; DAGISEL-LABEL: sponentry_cs_dvgpr_control_flow: ; DAGISEL: ; %bb.0: ; %entry -; DAGISEL-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2) ; DAGISEL-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0x42, v0 +; DAGISEL-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2) +; DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; DAGISEL-NEXT: s_cmp_lg_u32 0, s33 ; DAGISEL-NEXT: s_cmovk_i32 s33, 0x1c0 +; DAGISEL-NEXT: s_xor_b32 s0, vcc_lo, exec_lo ; DAGISEL-NEXT: scratch_store_b32 off, v0, s33 scope:SCOPE_SYS ; DAGISEL-NEXT: s_wait_storecnt 0x0 -; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, exec_lo +; DAGISEL-NEXT: s_and_saveexec_b32 s0, s0 ; DAGISEL-NEXT: ; divergent control-flow edge ; DAGISEL-NEXT: s_cbranch_execz .LBB3_2 ; DAGISEL-NEXT: .LBB3_1: ; %if.then -; DAGISEL-NEXT: s_getreg_b32 s0, hwreg(HW_REG_WAVE_HW_ID2, 8, 2) +; DAGISEL-NEXT: s_getreg_b32 s1, hwreg(HW_REG_WAVE_HW_ID2, 8, 2) ; DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; DAGISEL-NEXT: s_cmp_lg_u32 0, s0 -; DAGISEL-NEXT: s_cmovk_i32 s0, 0x1c0 -; DAGISEL-NEXT: v_mov_b32_e32 v1, s0 +; DAGISEL-NEXT: s_cmp_lg_u32 0, s1 +; DAGISEL-NEXT: s_cmovk_i32 s1, 0x1c0 +; DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; DAGISEL-NEXT: .LBB3_2: ; %if.end -; DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, vcc_lo +; DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; DAGISEL-NEXT: v_readfirstlane_b32 s0, v1 ; DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0) @@ -189,13 +191,16 @@ define amdgpu_gfx ptr addrspace(5) @sponentry_gfx(i32 %val, ptr addrspace(5) %pt ; DAGISEL-NEXT: s_wait_storecnt 0x0 ; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 offset:4 scope:SCOPE_SYS ; DAGISEL-NEXT: s_wait_storecnt 0x0 -; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, exec_lo +; DAGISEL-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) +; DAGISEL-NEXT: s_and_saveexec_b32 s0, s0 ; 
DAGISEL-NEXT: ; divergent control-flow edge ; DAGISEL-NEXT: s_cbranch_execz .LBB6_2 ; DAGISEL-NEXT: .LBB6_1: ; %if.then ; DAGISEL-NEXT: v_mov_b32_e32 v1, s32 ; DAGISEL-NEXT: .LBB6_2: ; %if.end -; DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, vcc_lo +; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) +; DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; DAGISEL-NEXT: v_mov_b32_e32 v0, v1 ; DAGISEL-NEXT: s_setpc_b64 s[30:31] @@ -447,13 +452,16 @@ define amdgpu_cs_chain void @sponentry_cs_chain(i32 %val, ptr addrspace(5) %ptr) ; DAGISEL-NEXT: s_wait_storecnt 0x0 ; DAGISEL-NEXT: scratch_store_b32 off, v8, s32 offset:4 scope:SCOPE_SYS ; DAGISEL-NEXT: s_wait_storecnt 0x0 -; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, exec_lo +; DAGISEL-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) +; DAGISEL-NEXT: s_and_saveexec_b32 s0, s0 ; DAGISEL-NEXT: ; divergent control-flow edge ; DAGISEL-NEXT: s_cbranch_execz .LBB10_2 ; DAGISEL-NEXT: .LBB10_1: ; %if.then ; DAGISEL-NEXT: v_mov_b32_e32 v0, s32 ; DAGISEL-NEXT: .LBB10_2: ; %if.end -; DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, vcc_lo +; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) +; DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; DAGISEL-NEXT: scratch_store_b32 v9, v0, off scope:SCOPE_SYS ; DAGISEL-NEXT: s_wait_storecnt 0x0 ; DAGISEL-NEXT: s_alloc_vgpr 0 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index abf43caa613dd..a48b457763247 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -99,9 +99,8 @@ define float @local_atomic_fadd_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX7-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: 
s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB0_1 ; GFX7-NEXT: .LBB0_2: ; %atomicrmw.end @@ -125,9 +124,8 @@ define float @local_atomic_fadd_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX6-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB0_1 ; GFX6-NEXT: .LBB0_2: ; %atomicrmw.end @@ -222,9 +220,8 @@ define float @local_atomic_fadd_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX7-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB1_1 ; GFX7-NEXT: .LBB1_2: ; %atomicrmw.end @@ -249,9 +246,8 @@ define float @local_atomic_fadd_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX6-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB1_1 ; GFX6-NEXT: .LBB1_2: ; %atomicrmw.end @@ -345,10 +341,9 @@ define void @local_atomic_fadd_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX7-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: 
s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB2_1 ; GFX7-NEXT: .LBB2_2: ; %atomicrmw.end @@ -370,10 +365,9 @@ define void @local_atomic_fadd_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX6-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB2_1 ; GFX6-NEXT: .LBB2_2: ; %atomicrmw.end @@ -466,10 +460,9 @@ define void @local_atomic_fadd_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX7-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB3_1 ; GFX7-NEXT: .LBB3_2: ; %atomicrmw.end @@ -492,10 +485,9 @@ define void @local_atomic_fadd_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX6-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB3_1 ; GFX6-NEXT: .LBB3_2: ; %atomicrmw.end @@ -525,6 +517,7 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 +; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e32 v[0:1], 4.0, v[3:4] ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -535,11 +528,9 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v3 -; GFX12-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX12-NEXT: v_cmpx_ne_u32_e32 1, v3 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s0, s0, s1 -; GFX12-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX12-NEXT: ; divergent control-flow edge ; GFX12-NEXT: s_cbranch_execnz .LBB4_1 ; GFX12-NEXT: .LBB4_2: ; %atomicrmw.end @@ -565,6 +556,7 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 +; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f64 v[0:1], v[3:4], 4.0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -573,11 +565,9 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v3 -; GFX11-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 1, v3 ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-NEXT: ; divergent control-flow edge ; GFX11-NEXT: s_cbranch_execnz .LBB4_1 ; GFX11-NEXT: .LBB4_2: ; %atomicrmw.end @@ -603,9 +593,8 
@@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v3 -; GFX10-NEXT: s_xor_b32 s5, exec_lo, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX10-NEXT: s_or_b32 s4, s4, s5 -; GFX10-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX10-NEXT: ; divergent control-flow edge ; GFX10-NEXT: s_cbranch_execnz .LBB4_1 ; GFX10-NEXT: .LBB4_2: ; %atomicrmw.end @@ -638,9 +627,8 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX908-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX908-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX908-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX908-NEXT: s_mov_b64 exec, vcc ; GFX908-NEXT: ; divergent control-flow edge ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: .LBB4_2: ; %atomicrmw.end @@ -665,9 +653,8 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX8-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX8-NEXT: s_mov_b64 exec, vcc ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execnz .LBB4_1 ; GFX8-NEXT: .LBB4_2: ; %atomicrmw.end @@ -692,9 +679,8 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX7-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent 
control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB4_1 ; GFX7-NEXT: .LBB4_2: ; %atomicrmw.end @@ -719,9 +705,8 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX6-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB4_1 ; GFX6-NEXT: .LBB4_2: ; %atomicrmw.end @@ -746,6 +731,7 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 +; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e32 v[0:1], 4.0, v[3:4] ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -756,11 +742,9 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v3 -; GFX12-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX12-NEXT: v_cmpx_ne_u32_e32 1, v3 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s0, s0, s1 -; GFX12-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX12-NEXT: ; divergent control-flow edge ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: .LBB5_2: ; %atomicrmw.end @@ -786,6 +770,7 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 +; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f64 v[0:1], v[3:4], 
4.0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -794,11 +779,9 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v3 -; GFX11-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 1, v3 ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-NEXT: ; divergent control-flow edge ; GFX11-NEXT: s_cbranch_execnz .LBB5_1 ; GFX11-NEXT: .LBB5_2: ; %atomicrmw.end @@ -824,9 +807,8 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v3 -; GFX10-NEXT: s_xor_b32 s5, exec_lo, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX10-NEXT: s_or_b32 s4, s4, s5 -; GFX10-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX10-NEXT: ; divergent control-flow edge ; GFX10-NEXT: s_cbranch_execnz .LBB5_1 ; GFX10-NEXT: .LBB5_2: ; %atomicrmw.end @@ -859,9 +841,8 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX908-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX908-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX908-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX908-NEXT: s_mov_b64 exec, vcc ; GFX908-NEXT: ; divergent control-flow edge ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 ; GFX908-NEXT: .LBB5_2: ; %atomicrmw.end @@ -886,9 +867,8 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX8-NEXT: 
v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX8-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX8-NEXT: s_mov_b64 exec, vcc ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execnz .LBB5_1 ; GFX8-NEXT: .LBB5_2: ; %atomicrmw.end @@ -913,9 +893,8 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX7-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB5_1 ; GFX7-NEXT: .LBB5_2: ; %atomicrmw.end @@ -940,9 +919,8 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX6-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB5_1 ; GFX6-NEXT: .LBB5_2: ; %atomicrmw.end @@ -977,10 +955,9 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v1 ; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX12-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX12-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s0, s0, s1 -; GFX12-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX12-NEXT: ; divergent control-flow edge ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: .LBB6_2: ; %atomicrmw.end @@ -1014,9 
+991,8 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v1 ; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX11-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-NEXT: ; divergent control-flow edge ; GFX11-NEXT: s_cbranch_execnz .LBB6_1 ; GFX11-NEXT: .LBB6_2: ; %atomicrmw.end @@ -1041,9 +1017,8 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-NEXT: s_xor_b32 s5, exec_lo, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX10-NEXT: s_or_b32 s4, s4, s5 -; GFX10-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX10-NEXT: ; divergent control-flow edge ; GFX10-NEXT: s_cbranch_execnz .LBB6_1 ; GFX10-NEXT: .LBB6_2: ; %atomicrmw.end @@ -1074,10 +1049,9 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 ; GFX908-NEXT: v_mov_b32_e32 v1, v3 -; GFX908-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX908-NEXT: v_mov_b32_e32 v2, v4 +; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX908-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX908-NEXT: s_mov_b64 exec, vcc ; GFX908-NEXT: ; divergent control-flow edge ; GFX908-NEXT: s_cbranch_execnz .LBB6_1 ; GFX908-NEXT: .LBB6_2: ; %atomicrmw.end @@ -1100,10 +1074,9 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX8-NEXT: s_or_b64 s[4:5], 
s[4:5], s[6:7] -; GFX8-NEXT: s_mov_b64 exec, vcc ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execnz .LBB6_1 ; GFX8-NEXT: .LBB6_2: ; %atomicrmw.end @@ -1126,10 +1099,9 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX7-NEXT: v_mov_b32_e32 v2, v4 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB6_1 ; GFX7-NEXT: .LBB6_2: ; %atomicrmw.end @@ -1152,10 +1124,9 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 ; GFX6-NEXT: v_mov_b32_e32 v1, v3 -; GFX6-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, v4 +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB6_1 ; GFX6-NEXT: .LBB6_2: ; %atomicrmw.end @@ -1189,10 +1160,9 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v1 ; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX12-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX12-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s0, s0, s1 -; GFX12-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX12-NEXT: ; divergent control-flow edge ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: .LBB7_2: ; %atomicrmw.end @@ -1226,9 +1196,8 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) 
| instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v1 ; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX11-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-NEXT: ; divergent control-flow edge ; GFX11-NEXT: s_cbranch_execnz .LBB7_1 ; GFX11-NEXT: .LBB7_2: ; %atomicrmw.end @@ -1253,9 +1222,8 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-NEXT: s_xor_b32 s5, exec_lo, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX10-NEXT: s_or_b32 s4, s4, s5 -; GFX10-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX10-NEXT: ; divergent control-flow edge ; GFX10-NEXT: s_cbranch_execnz .LBB7_1 ; GFX10-NEXT: .LBB7_2: ; %atomicrmw.end @@ -1286,10 +1254,9 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 ; GFX908-NEXT: v_mov_b32_e32 v1, v3 -; GFX908-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX908-NEXT: v_mov_b32_e32 v2, v4 +; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX908-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX908-NEXT: s_mov_b64 exec, vcc ; GFX908-NEXT: ; divergent control-flow edge ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: .LBB7_2: ; %atomicrmw.end @@ -1312,10 +1279,9 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX8-NEXT: s_mov_b64 exec, vcc ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; 
GFX8-NEXT: .LBB7_2: ; %atomicrmw.end @@ -1338,10 +1304,9 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX7-NEXT: v_mov_b32_e32 v2, v4 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB7_1 ; GFX7-NEXT: .LBB7_2: ; %atomicrmw.end @@ -1365,10 +1330,9 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB7_1 ; GFX6-NEXT: .LBB7_2: ; %atomicrmw.end @@ -1403,6 +1367,7 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 @@ -1418,11 +1383,9 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v4 -; GFX12-TRUE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX12-TRUE16-NEXT: v_cmpx_ne_u32_e32 1, v4 ; 
GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX12-TRUE16-NEXT: ; divergent control-flow edge ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-TRUE16-NEXT: .LBB8_2: ; %atomicrmw.end @@ -1450,6 +1413,7 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 ; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, 4.0, v3 @@ -1466,11 +1430,9 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v4 -; GFX12-FAKE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX12-FAKE16-NEXT: v_cmpx_ne_u32_e32 1, v4 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX12-FAKE16-NEXT: ; divergent control-flow edge ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-FAKE16-NEXT: .LBB8_2: ; %atomicrmw.end @@ -1503,10 +1465,9 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX942-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX942-NEXT: s_mov_b64 exec, vcc ; GFX942-NEXT: ; divergent control-flow edge ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: .LBB8_2: ; %atomicrmw.end @@ -1525,11 +1486,11 
@@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 @@ -1543,11 +1504,9 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v4 -; GFX11-TRUE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 1, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-TRUE16-NEXT: ; divergent control-flow edge ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB8_1 ; GFX11-TRUE16-NEXT: .LBB8_2: ; %atomicrmw.end @@ -1571,6 +1530,7 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, exec_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 ; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, 4.0, v3 @@ -1585,11 +1545,9 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) 
%ptr) nounwind { ; GFX11-FAKE16-NEXT: buffer_gl0_inv ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v4 -; GFX11-FAKE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 1, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-FAKE16-NEXT: ; divergent control-flow edge ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB8_1 ; GFX11-FAKE16-NEXT: .LBB8_2: ; %atomicrmw.end @@ -1622,9 +1580,8 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v4 -; GFX10-NEXT: s_xor_b32 s5, exec_lo, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX10-NEXT: s_or_b32 s4, s4, s5 -; GFX10-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX10-NEXT: ; divergent control-flow edge ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 ; GFX10-NEXT: .LBB8_2: ; %atomicrmw.end @@ -1655,10 +1612,9 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: .LBB8_2: ; %atomicrmw.end @@ -1690,9 +1646,8 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX908-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX908-NEXT: 
v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX908-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX908-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX908-NEXT: s_mov_b64 exec, vcc ; GFX908-NEXT: ; divergent control-flow edge ; GFX908-NEXT: s_cbranch_execnz .LBB8_1 ; GFX908-NEXT: .LBB8_2: ; %atomicrmw.end @@ -1726,9 +1681,8 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX8-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX8-NEXT: s_mov_b64 exec, vcc ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execnz .LBB8_1 ; GFX8-NEXT: .LBB8_2: ; %atomicrmw.end @@ -1763,9 +1717,8 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX7-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 ; GFX7-NEXT: .LBB8_2: ; %atomicrmw.end @@ -1800,9 +1753,8 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX6-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 ; GFX6-NEXT: .LBB8_2: ; %atomicrmw.end @@ -1835,6 +1787,7 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 @@ -1850,11 +1803,9 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v4 -; GFX12-TRUE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX12-TRUE16-NEXT: v_cmpx_ne_u32_e32 1, v4 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX12-TRUE16-NEXT: ; divergent control-flow edge ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-TRUE16-NEXT: .LBB9_2: ; %atomicrmw.end @@ -1884,6 +1835,7 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 ; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, 4.0, v3 @@ -1900,11 +1852,9 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v4 -; GFX12-FAKE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX12-FAKE16-NEXT: v_cmpx_ne_u32_e32 1, v4 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_or_b32 s0, s0, s1 
-; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX12-FAKE16-NEXT: ; divergent control-flow edge ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-FAKE16-NEXT: .LBB9_2: ; %atomicrmw.end @@ -1938,10 +1888,9 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX942-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX942-NEXT: s_mov_b64 exec, vcc ; GFX942-NEXT: ; divergent control-flow edge ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: .LBB9_2: ; %atomicrmw.end @@ -1962,11 +1911,11 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 @@ -1980,11 +1929,9 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v4 -; GFX11-TRUE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 
1, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-TRUE16-NEXT: ; divergent control-flow edge ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB9_1 ; GFX11-TRUE16-NEXT: .LBB9_2: ; %atomicrmw.end @@ -2010,6 +1957,7 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, exec_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 ; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, 4.0, v3 @@ -2024,11 +1972,9 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: buffer_gl0_inv ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v4 -; GFX11-FAKE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 1, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-FAKE16-NEXT: ; divergent control-flow edge ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB9_1 ; GFX11-FAKE16-NEXT: .LBB9_2: ; %atomicrmw.end @@ -2062,9 +2008,8 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v4 -; GFX10-NEXT: s_xor_b32 s5, exec_lo, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX10-NEXT: s_or_b32 s4, s4, s5 -; GFX10-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX10-NEXT: ; divergent control-flow edge ; GFX10-NEXT: s_cbranch_execnz .LBB9_1 ; 
GFX10-NEXT: .LBB9_2: ; %atomicrmw.end @@ -2096,10 +2041,9 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: .LBB9_2: ; %atomicrmw.end @@ -2132,9 +2076,8 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX908-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX908-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX908-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX908-NEXT: s_mov_b64 exec, vcc ; GFX908-NEXT: ; divergent control-flow edge ; GFX908-NEXT: s_cbranch_execnz .LBB9_1 ; GFX908-NEXT: .LBB9_2: ; %atomicrmw.end @@ -2169,9 +2112,8 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX8-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX8-NEXT: s_mov_b64 exec, vcc ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execnz .LBB9_1 ; GFX8-NEXT: .LBB9_2: ; %atomicrmw.end @@ -2207,9 +2149,8 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX7-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: 
s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB9_1 ; GFX7-NEXT: .LBB9_2: ; %atomicrmw.end @@ -2245,9 +2186,8 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX6-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB9_1 ; GFX6-NEXT: .LBB9_2: ; %atomicrmw.end @@ -2295,10 +2235,9 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v2 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-TRUE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX12-TRUE16-NEXT: ; divergent control-flow edge ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-TRUE16-NEXT: .LBB10_2: ; %atomicrmw.end @@ -2342,10 +2281,9 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v2 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-FAKE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX12-FAKE16-NEXT: ; divergent control-flow edge ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-FAKE16-NEXT: .LBB10_2: ; %atomicrmw.end @@ -2377,10 +2315,9 @@ 
define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX942-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX942-NEXT: s_mov_b64 exec, vcc ; GFX942-NEXT: ; divergent control-flow edge ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: .LBB10_2: ; %atomicrmw.end @@ -2398,7 +2335,6 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -2418,9 +2354,8 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-TRUE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-TRUE16-NEXT: ; divergent control-flow edge ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-TRUE16-NEXT: .LBB10_2: ; %atomicrmw.end @@ -2459,9 +2394,8 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v2 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-FAKE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX11-FAKE16-NEXT: s_or_b32 s0, 
s0, s1 -; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-FAKE16-NEXT: ; divergent control-flow edge ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-FAKE16-NEXT: .LBB10_2: ; %atomicrmw.end @@ -2493,9 +2427,8 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-NEXT: s_xor_b32 s5, exec_lo, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX10-NEXT: s_or_b32 s4, s4, s5 -; GFX10-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX10-NEXT: ; divergent control-flow edge ; GFX10-NEXT: s_cbranch_execnz .LBB10_1 ; GFX10-NEXT: .LBB10_2: ; %atomicrmw.end @@ -2525,10 +2458,9 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: .LBB10_2: ; %atomicrmw.end @@ -2558,10 +2490,9 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX908-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX908-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX908-NEXT: v_mov_b32_e32 v2, v4 +; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX908-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX908-NEXT: s_mov_b64 exec, vcc ; GFX908-NEXT: ; divergent control-flow edge ; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: .LBB10_2: ; %atomicrmw.end @@ -2593,10 +2524,9 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; 
GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX8-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX8-NEXT: s_mov_b64 exec, vcc ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: .LBB10_2: ; %atomicrmw.end @@ -2629,10 +2559,9 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX7-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX7-NEXT: v_mov_b32_e32 v2, v4 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: .LBB10_2: ; %atomicrmw.end @@ -2665,10 +2594,9 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX6-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, v4 +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: .LBB10_2: ; %atomicrmw.end @@ -2716,10 +2644,9 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v3 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 -; GFX12-TRUE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; 
GFX12-TRUE16-NEXT: ; divergent control-flow edge ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-TRUE16-NEXT: .LBB11_2: ; %atomicrmw.end @@ -2764,10 +2691,9 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v3 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 -; GFX12-FAKE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX12-FAKE16-NEXT: ; divergent control-flow edge ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-FAKE16-NEXT: .LBB11_2: ; %atomicrmw.end @@ -2800,10 +2726,9 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX942-NEXT: s_xor_b64 s[2:3], exec, vcc ; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX942-NEXT: s_mov_b64 exec, vcc ; GFX942-NEXT: ; divergent control-flow edge ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: .LBB11_2: ; %atomicrmw.end @@ -2823,7 +2748,6 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -2843,9 +2767,8 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: 
v_cmp_ne_u32_e32 vcc_lo, 1, v3 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 -; GFX11-TRUE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-TRUE16-NEXT: ; divergent control-flow edge ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB11_1 ; GFX11-TRUE16-NEXT: .LBB11_2: ; %atomicrmw.end @@ -2885,9 +2808,8 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v3 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 -; GFX11-FAKE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-FAKE16-NEXT: ; divergent control-flow edge ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB11_1 ; GFX11-FAKE16-NEXT: .LBB11_2: ; %atomicrmw.end @@ -2920,9 +2842,8 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v4 -; GFX10-NEXT: s_xor_b32 s5, exec_lo, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX10-NEXT: s_or_b32 s4, s4, s5 -; GFX10-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX10-NEXT: ; divergent control-flow edge ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 ; GFX10-NEXT: .LBB11_2: ; %atomicrmw.end @@ -2953,10 +2874,9 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: 
s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: .LBB11_2: ; %atomicrmw.end @@ -2987,10 +2907,9 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX908-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX908-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX908-NEXT: v_mov_b32_e32 v2, v4 +; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX908-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX908-NEXT: s_mov_b64 exec, vcc ; GFX908-NEXT: ; divergent control-flow edge ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: .LBB11_2: ; %atomicrmw.end @@ -3023,10 +2942,9 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX8-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX8-NEXT: s_mov_b64 exec, vcc ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: .LBB11_2: ; %atomicrmw.end @@ -3060,10 +2978,9 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX7-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX7-NEXT: v_mov_b32_e32 v2, v4 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: .LBB11_2: ; %atomicrmw.end @@ -3097,10 +3014,9 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX6-NEXT: 
v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX6-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, v4 +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: .LBB11_2: ; %atomicrmw.end @@ -3126,6 +3042,7 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v2.l ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 @@ -3137,11 +3054,9 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v2 -; GFX12-TRUE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX12-TRUE16-NEXT: v_cmpx_ne_u32_e32 1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX12-TRUE16-NEXT: ; divergent control-flow edge ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-TRUE16-NEXT: .LBB12_2: ; %atomicrmw.end @@ -3163,6 +3078,7 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_add_f16_e32 v1, 4.0, v2 ; 
GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -3176,11 +3092,9 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v2 -; GFX12-FAKE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX12-FAKE16-NEXT: v_cmpx_ne_u32_e32 1, v2 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX12-FAKE16-NEXT: ; divergent control-flow edge ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-FAKE16-NEXT: .LBB12_2: ; %atomicrmw.end @@ -3206,10 +3120,9 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX942-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX942-NEXT: s_mov_b64 exec, vcc ; GFX942-NEXT: ; divergent control-flow edge ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: .LBB12_2: ; %atomicrmw.end @@ -3227,6 +3140,7 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v2.l ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 @@ -3236,11 +3150,9 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-TRUE16-NEXT: 
v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v2 -; GFX11-TRUE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 1, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-TRUE16-NEXT: ; divergent control-flow edge ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_1 ; GFX11-TRUE16-NEXT: .LBB12_2: ; %atomicrmw.end @@ -3253,11 +3165,11 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, exec_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, 4.0, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -3269,11 +3181,9 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX11-FAKE16-NEXT: buffer_gl0_inv ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v2 -; GFX11-FAKE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 1, v2 ; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-FAKE16-NEXT: ; divergent control-flow edge ; GFX11-FAKE16-NEXT: 
s_cbranch_execnz .LBB12_1 ; GFX11-FAKE16-NEXT: .LBB12_2: ; %atomicrmw.end @@ -3300,9 +3210,8 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v2 -; GFX10-NEXT: s_xor_b32 s5, exec_lo, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX10-NEXT: s_or_b32 s4, s4, s5 -; GFX10-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX10-NEXT: ; divergent control-flow edge ; GFX10-NEXT: s_cbranch_execnz .LBB12_1 ; GFX10-NEXT: .LBB12_2: ; %atomicrmw.end @@ -3326,10 +3235,9 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX90A-NEXT: s_xor_b64 s[8:9], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: .LBB12_2: ; %atomicrmw.end @@ -3354,9 +3262,8 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX908-NEXT: s_xor_b64 s[8:9], exec, vcc +; GFX908-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX908-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX908-NEXT: s_mov_b64 exec, vcc ; GFX908-NEXT: ; divergent control-flow edge ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: .LBB12_2: ; %atomicrmw.end @@ -3382,9 +3289,8 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX8-NEXT: s_xor_b64 s[6:7], exec, vcc +; 
GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX8-NEXT: s_mov_b64 exec, vcc ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: .LBB12_2: ; %atomicrmw.end @@ -3412,9 +3318,8 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX7-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB12_1 ; GFX7-NEXT: .LBB12_2: ; %atomicrmw.end @@ -3443,9 +3348,8 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX6-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: .LBB12_2: ; %atomicrmw.end @@ -3483,10 +3387,9 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v1 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-TRUE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX12-TRUE16-NEXT: ; divergent control-flow edge ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-TRUE16-NEXT: .LBB13_2: ; %atomicrmw.end @@ -3520,10 +3423,9 @@ define void 
@local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-FAKE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX12-FAKE16-NEXT: ; divergent control-flow edge ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-FAKE16-NEXT: .LBB13_2: ; %atomicrmw.end @@ -3548,10 +3450,9 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX942-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX942-NEXT: s_mov_b64 exec, vcc ; GFX942-NEXT: ; divergent control-flow edge ; GFX942-NEXT: s_cbranch_execnz .LBB13_1 ; GFX942-NEXT: .LBB13_2: ; %atomicrmw.end @@ -3579,9 +3480,8 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v1 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-TRUE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-TRUE16-NEXT: ; divergent control-flow edge ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-TRUE16-NEXT: .LBB13_2: ; %atomicrmw.end @@ -3593,7 +3493,6 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 
offset:65534 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -3610,9 +3509,8 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-FAKE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-FAKE16-NEXT: ; divergent control-flow edge ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-FAKE16-NEXT: .LBB13_2: ; %atomicrmw.end @@ -3638,9 +3536,8 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_xor_b32 s5, exec_lo, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX10-NEXT: s_or_b32 s4, s4, s5 -; GFX10-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX10-NEXT: ; divergent control-flow edge ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: .LBB13_2: ; %atomicrmw.end @@ -3663,10 +3560,9 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[8:9], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: .LBB13_2: ; %atomicrmw.end @@ -3689,10 +3585,9 @@ define void 
@local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX908-NEXT: s_xor_b64 s[8:9], exec, vcc ; GFX908-NEXT: v_mov_b32_e32 v1, v2 +; GFX908-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX908-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX908-NEXT: s_mov_b64 exec, vcc ; GFX908-NEXT: ; divergent control-flow edge ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: .LBB13_2: ; %atomicrmw.end @@ -3716,10 +3611,9 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX8-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX8-NEXT: s_mov_b64 exec, vcc ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: .LBB13_2: ; %atomicrmw.end @@ -3745,10 +3639,9 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX7-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: .LBB13_2: ; %atomicrmw.end @@ -3775,10 +3668,9 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX6-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; 
GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: .LBB13_2: ; %atomicrmw.end @@ -3813,6 +3705,7 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -3839,11 +3732,9 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v4 -; GFX12-TRUE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX12-TRUE16-NEXT: v_cmpx_ne_u32_e32 1, v4 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX12-TRUE16-NEXT: ; divergent control-flow edge ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-TRUE16-NEXT: .LBB14_2: ; %atomicrmw.end @@ -3871,6 +3762,7 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -3896,11 +3788,9 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; 
GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v4 -; GFX12-FAKE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX12-FAKE16-NEXT: v_cmpx_ne_u32_e32 1, v4 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX12-FAKE16-NEXT: ; divergent control-flow edge ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-FAKE16-NEXT: .LBB14_2: ; %atomicrmw.end @@ -3941,10 +3831,9 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX942-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX942-NEXT: s_mov_b64 exec, vcc ; GFX942-NEXT: ; divergent control-flow edge ; GFX942-NEXT: s_cbranch_execnz .LBB14_1 ; GFX942-NEXT: .LBB14_2: ; %atomicrmw.end @@ -3968,6 +3857,7 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -3991,11 +3881,9 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v4 -; GFX11-TRUE16-NEXT: s_xor_b32 s1, 
exec_lo, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 1, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-TRUE16-NEXT: ; divergent control-flow edge ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_1 ; GFX11-TRUE16-NEXT: .LBB14_2: ; %atomicrmw.end @@ -4019,6 +3907,7 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, exec_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -4041,11 +3930,9 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: buffer_gl0_inv ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v4 -; GFX11-FAKE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 1, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-FAKE16-NEXT: ; divergent control-flow edge ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB14_1 ; GFX11-FAKE16-NEXT: .LBB14_2: ; %atomicrmw.end @@ -4083,9 +3970,8 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v4 -; GFX10-NEXT: s_xor_b32 s5, exec_lo, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX10-NEXT: s_or_b32 s4, s4, s5 -; GFX10-NEXT: s_mov_b32 
exec_lo, vcc_lo ; GFX10-NEXT: ; divergent control-flow edge ; GFX10-NEXT: s_cbranch_execnz .LBB14_1 ; GFX10-NEXT: .LBB14_2: ; %atomicrmw.end @@ -4122,10 +4008,9 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[8:9], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: .LBB14_2: ; %atomicrmw.end @@ -4163,9 +4048,8 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX908-NEXT: s_xor_b64 s[8:9], exec, vcc +; GFX908-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX908-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX908-NEXT: s_mov_b64 exec, vcc ; GFX908-NEXT: ; divergent control-flow edge ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: .LBB14_2: ; %atomicrmw.end @@ -4205,9 +4089,8 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX8-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX8-NEXT: s_mov_b64 exec, vcc ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: .LBB14_2: ; %atomicrmw.end @@ -4242,9 +4125,8 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX7-NEXT: 
s_xor_b64 s[6:7], exec, vcc +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7-NEXT: .LBB14_2: ; %atomicrmw.end @@ -4279,9 +4161,8 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX6-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB14_1 ; GFX6-NEXT: .LBB14_2: ; %atomicrmw.end @@ -4314,6 +4195,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -4340,11 +4222,9 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v4 -; GFX12-TRUE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX12-TRUE16-NEXT: v_cmpx_ne_u32_e32 1, v4 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX12-TRUE16-NEXT: ; divergent control-flow edge ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-TRUE16-NEXT: .LBB15_2: ; %atomicrmw.end @@ -4374,6 +4254,7 @@ define bfloat 
@local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -4399,11 +4280,9 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v4 -; GFX12-FAKE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX12-FAKE16-NEXT: v_cmpx_ne_u32_e32 1, v4 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX12-FAKE16-NEXT: ; divergent control-flow edge ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-FAKE16-NEXT: .LBB15_2: ; %atomicrmw.end @@ -4445,10 +4324,9 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX942-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX942-NEXT: s_mov_b64 exec, vcc ; GFX942-NEXT: ; divergent control-flow edge ; GFX942-NEXT: s_cbranch_execnz .LBB15_1 ; GFX942-NEXT: .LBB15_2: ; %atomicrmw.end @@ -4474,6 +4352,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -4497,11 +4376,9 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v4 -; GFX11-TRUE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 1, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-TRUE16-NEXT: ; divergent control-flow edge ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 ; GFX11-TRUE16-NEXT: .LBB15_2: ; %atomicrmw.end @@ -4527,6 +4404,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, exec_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -4549,11 +4427,9 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-FAKE16-NEXT: buffer_gl0_inv ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v4 -; GFX11-FAKE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: 
v_cmpx_ne_u32_e32 1, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-FAKE16-NEXT: ; divergent control-flow edge ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_1 ; GFX11-FAKE16-NEXT: .LBB15_2: ; %atomicrmw.end @@ -4592,9 +4468,8 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v4 -; GFX10-NEXT: s_xor_b32 s5, exec_lo, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX10-NEXT: s_or_b32 s4, s4, s5 -; GFX10-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX10-NEXT: ; divergent control-flow edge ; GFX10-NEXT: s_cbranch_execnz .LBB15_1 ; GFX10-NEXT: .LBB15_2: ; %atomicrmw.end @@ -4632,10 +4507,9 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[8:9], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: .LBB15_2: ; %atomicrmw.end @@ -4674,9 +4548,8 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX908-NEXT: s_xor_b64 s[8:9], exec, vcc +; GFX908-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX908-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX908-NEXT: s_mov_b64 exec, vcc ; GFX908-NEXT: ; divergent control-flow edge ; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: .LBB15_2: ; %atomicrmw.end @@ -4717,9 +4590,8 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) 
%ptr) nounwin ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX8-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX8-NEXT: s_mov_b64 exec, vcc ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: .LBB15_2: ; %atomicrmw.end @@ -4755,9 +4627,8 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX7-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB15_1 ; GFX7-NEXT: .LBB15_2: ; %atomicrmw.end @@ -4793,9 +4664,8 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX6-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB15_1 ; GFX6-NEXT: .LBB15_2: ; %atomicrmw.end @@ -4854,10 +4724,9 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v2 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-TRUE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX12-TRUE16-NEXT: ; divergent 
control-flow edge ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-TRUE16-NEXT: .LBB16_2: ; %atomicrmw.end @@ -4910,10 +4779,9 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v2 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-FAKE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX12-FAKE16-NEXT: ; divergent control-flow edge ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-FAKE16-NEXT: .LBB16_2: ; %atomicrmw.end @@ -4953,10 +4821,9 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX942-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX942-NEXT: s_mov_b64 exec, vcc ; GFX942-NEXT: ; divergent control-flow edge ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: .LBB16_2: ; %atomicrmw.end @@ -5004,9 +4871,8 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-TRUE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-TRUE16-NEXT: ; divergent control-flow edge ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB16_1 ; GFX11-TRUE16-NEXT: .LBB16_2: ; %atomicrmw.end @@ -5053,9 +4919,8 @@ define void @local_atomic_fadd_noret_bf16(ptr 
addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v2 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-FAKE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-FAKE16-NEXT: ; divergent control-flow edge ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB16_1 ; GFX11-FAKE16-NEXT: .LBB16_2: ; %atomicrmw.end @@ -5092,9 +4957,8 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-NEXT: s_xor_b32 s5, exec_lo, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX10-NEXT: s_or_b32 s4, s4, s5 -; GFX10-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX10-NEXT: ; divergent control-flow edge ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 ; GFX10-NEXT: .LBB16_2: ; %atomicrmw.end @@ -5130,10 +4994,9 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[8:9], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: .LBB16_2: ; %atomicrmw.end @@ -5169,10 +5032,9 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX908-NEXT: s_xor_b64 s[8:9], exec, vcc ; GFX908-NEXT: v_mov_b32_e32 v3, v4 +; GFX908-NEXT: s_and_saveexec_b64 
s[8:9], vcc ; GFX908-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX908-NEXT: s_mov_b64 exec, vcc ; GFX908-NEXT: ; divergent control-flow edge ; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: .LBB16_2: ; %atomicrmw.end @@ -5210,10 +5072,9 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX8-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX8-NEXT: s_mov_b64 exec, vcc ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: .LBB16_2: ; %atomicrmw.end @@ -5246,10 +5107,9 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX7-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX7-NEXT: v_mov_b32_e32 v2, v4 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: .LBB16_2: ; %atomicrmw.end @@ -5282,10 +5142,9 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX6-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, v4 +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: .LBB16_2: ; %atomicrmw.end @@ -5343,10 +5202,9 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v3 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 -; GFX12-TRUE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX12-TRUE16-NEXT: ; divergent control-flow edge ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-TRUE16-NEXT: .LBB17_2: ; %atomicrmw.end @@ -5400,10 +5258,9 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v3 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 -; GFX12-FAKE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX12-FAKE16-NEXT: ; divergent control-flow edge ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-FAKE16-NEXT: .LBB17_2: ; %atomicrmw.end @@ -5444,10 +5301,9 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX942-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX942-NEXT: s_mov_b64 exec, vcc ; GFX942-NEXT: ; divergent control-flow edge ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: .LBB17_2: ; %atomicrmw.end @@ -5496,9 +5352,8 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v3 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, 
v4 -; GFX11-TRUE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-TRUE16-NEXT: ; divergent control-flow edge ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB17_1 ; GFX11-TRUE16-NEXT: .LBB17_2: ; %atomicrmw.end @@ -5546,9 +5401,8 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v3 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 -; GFX11-FAKE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-FAKE16-NEXT: ; divergent control-flow edge ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB17_1 ; GFX11-FAKE16-NEXT: .LBB17_2: ; %atomicrmw.end @@ -5586,9 +5440,8 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v4 -; GFX10-NEXT: s_xor_b32 s5, exec_lo, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX10-NEXT: s_or_b32 s4, s4, s5 -; GFX10-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX10-NEXT: ; divergent control-flow edge ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 ; GFX10-NEXT: .LBB17_2: ; %atomicrmw.end @@ -5625,10 +5478,9 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[8:9], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; 
GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: .LBB17_2: ; %atomicrmw.end @@ -5665,10 +5517,9 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX908-NEXT: s_xor_b64 s[8:9], exec, vcc ; GFX908-NEXT: v_mov_b32_e32 v3, v4 +; GFX908-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX908-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX908-NEXT: s_mov_b64 exec, vcc ; GFX908-NEXT: ; divergent control-flow edge ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: .LBB17_2: ; %atomicrmw.end @@ -5707,10 +5558,9 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX8-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX8-NEXT: s_mov_b64 exec, vcc ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: .LBB17_2: ; %atomicrmw.end @@ -5744,10 +5594,9 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX7-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX7-NEXT: v_mov_b32_e32 v2, v4 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: .LBB17_2: ; %atomicrmw.end @@ -5781,10 +5630,9 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; 
GFX6-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, v4 +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: .LBB17_2: ; %atomicrmw.end @@ -5809,6 +5657,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX12-TRUE16-NEXT: v_add_f32_e32 v1, 4.0, v1 @@ -5832,11 +5681,9 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v2 -; GFX12-TRUE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX12-TRUE16-NEXT: v_cmpx_ne_u32_e32 1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX12-TRUE16-NEXT: ; divergent control-flow edge ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-TRUE16-NEXT: .LBB18_2: ; %atomicrmw.end @@ -5858,6 +5705,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX12-FAKE16-NEXT: v_add_f32_e32 v1, 4.0, v1 @@ -5880,11 +5728,9 @@ define bfloat 
@local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v2 -; GFX12-FAKE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX12-FAKE16-NEXT: v_cmpx_ne_u32_e32 1, v2 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX12-FAKE16-NEXT: ; divergent control-flow edge ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-FAKE16-NEXT: .LBB18_2: ; %atomicrmw.end @@ -5919,10 +5765,9 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX942-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX942-NEXT: s_mov_b64 exec, vcc ; GFX942-NEXT: ; divergent control-flow edge ; GFX942-NEXT: s_cbranch_execnz .LBB18_1 ; GFX942-NEXT: .LBB18_2: ; %atomicrmw.end @@ -5940,6 +5785,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 4.0, v1 @@ -5960,11 +5806,9 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v2 -; GFX11-TRUE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 1, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-TRUE16-NEXT: ; divergent control-flow edge ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB18_1 ; GFX11-TRUE16-NEXT: .LBB18_2: ; %atomicrmw.end @@ -5982,6 +5826,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, exec_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 4.0, v1 @@ -6001,11 +5846,9 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: buffer_gl0_inv ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v2 -; GFX11-FAKE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 1, v2 ; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-FAKE16-NEXT: ; divergent control-flow edge ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB18_1 ; GFX11-FAKE16-NEXT: .LBB18_2: ; %atomicrmw.end @@ -6038,9 +5881,8 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v2 -; 
GFX10-NEXT: s_xor_b32 s5, exec_lo, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX10-NEXT: s_or_b32 s4, s4, s5 -; GFX10-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX10-NEXT: ; divergent control-flow edge ; GFX10-NEXT: s_cbranch_execnz .LBB18_1 ; GFX10-NEXT: .LBB18_2: ; %atomicrmw.end @@ -6072,10 +5914,9 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX90A-NEXT: s_xor_b64 s[8:9], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: .LBB18_2: ; %atomicrmw.end @@ -6108,9 +5949,8 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX908-NEXT: s_xor_b64 s[8:9], exec, vcc +; GFX908-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX908-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX908-NEXT: s_mov_b64 exec, vcc ; GFX908-NEXT: ; divergent control-flow edge ; GFX908-NEXT: s_cbranch_execnz .LBB18_1 ; GFX908-NEXT: .LBB18_2: ; %atomicrmw.end @@ -6143,9 +5983,8 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX8-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX8-NEXT: s_mov_b64 exec, vcc ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execnz .LBB18_1 ; GFX8-NEXT: .LBB18_2: ; %atomicrmw.end @@ -6173,9 +6012,8 @@ define bfloat 
@local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX7-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB18_1 ; GFX7-NEXT: .LBB18_2: ; %atomicrmw.end @@ -6204,9 +6042,8 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX6-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB18_1 ; GFX6-NEXT: .LBB18_2: ; %atomicrmw.end @@ -6254,10 +6091,9 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v1 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-TRUE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX12-TRUE16-NEXT: ; divergent control-flow edge ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-TRUE16-NEXT: .LBB19_2: ; %atomicrmw.end @@ -6300,10 +6136,9 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-FAKE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s1, vcc_lo ; 
GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX12-FAKE16-NEXT: ; divergent control-flow edge ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-FAKE16-NEXT: .LBB19_2: ; %atomicrmw.end @@ -6337,10 +6172,9 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX942-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX942-NEXT: s_mov_b64 exec, vcc ; GFX942-NEXT: ; divergent control-flow edge ; GFX942-NEXT: s_cbranch_execnz .LBB19_1 ; GFX942-NEXT: .LBB19_2: ; %atomicrmw.end @@ -6378,9 +6212,8 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v1 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-TRUE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-TRUE16-NEXT: ; divergent control-flow edge ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB19_1 ; GFX11-TRUE16-NEXT: .LBB19_2: ; %atomicrmw.end @@ -6417,9 +6250,8 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-FAKE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-FAKE16-NEXT: ; divergent 
control-flow edge ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB19_1 ; GFX11-FAKE16-NEXT: .LBB19_2: ; %atomicrmw.end @@ -6451,9 +6283,8 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_xor_b32 s5, exec_lo, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX10-NEXT: s_or_b32 s4, s4, s5 -; GFX10-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX10-NEXT: ; divergent control-flow edge ; GFX10-NEXT: s_cbranch_execnz .LBB19_1 ; GFX10-NEXT: .LBB19_2: ; %atomicrmw.end @@ -6484,10 +6315,9 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX90A-NEXT: s_xor_b64 s[8:9], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 ; GFX90A-NEXT: .LBB19_2: ; %atomicrmw.end @@ -6518,10 +6348,9 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX908-NEXT: s_xor_b64 s[8:9], exec, vcc ; GFX908-NEXT: v_mov_b32_e32 v1, v2 +; GFX908-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX908-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX908-NEXT: s_mov_b64 exec, vcc ; GFX908-NEXT: ; divergent control-flow edge ; GFX908-NEXT: s_cbranch_execnz .LBB19_1 ; GFX908-NEXT: .LBB19_2: ; %atomicrmw.end @@ -6552,10 +6381,9 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: 
v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX8-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX8-NEXT: s_mov_b64 exec, vcc ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execnz .LBB19_1 ; GFX8-NEXT: .LBB19_2: ; %atomicrmw.end @@ -6581,10 +6409,9 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX7-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: .LBB19_2: ; %atomicrmw.end @@ -6611,10 +6438,9 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX6-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: .LBB19_2: ; %atomicrmw.end @@ -6659,6 +6485,7 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v2, v3, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6667,11 +6494,9 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v3 -; GFX11-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 1, v3 ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-NEXT: ; divergent control-flow edge ; GFX11-NEXT: s_cbranch_execnz .LBB20_1 ; GFX11-NEXT: .LBB20_2: ; %atomicrmw.end @@ -6696,9 +6521,8 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v3 -; GFX10-NEXT: s_xor_b32 s5, exec_lo, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX10-NEXT: s_or_b32 s4, s4, s5 -; GFX10-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX10-NEXT: ; divergent control-flow edge ; GFX10-NEXT: s_cbranch_execnz .LBB20_1 ; GFX10-NEXT: .LBB20_2: ; %atomicrmw.end @@ -6720,10 +6544,9 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 ; GFX90A-NEXT: .LBB20_2: ; %atomicrmw.end @@ -6746,9 +6569,8 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX908-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX908-NEXT: 
s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX908-NEXT: s_mov_b64 exec, vcc ; GFX908-NEXT: ; divergent control-flow edge ; GFX908-NEXT: s_cbranch_execnz .LBB20_1 ; GFX908-NEXT: .LBB20_2: ; %atomicrmw.end @@ -6774,9 +6596,8 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX8-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX8-NEXT: s_mov_b64 exec, vcc ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: .LBB20_2: ; %atomicrmw.end @@ -6813,10 +6634,9 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX7-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: .LBB20_2: ; %atomicrmw.end @@ -6855,10 +6675,9 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX6-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: .LBB20_2: ; %atomicrmw.end @@ -6902,6 +6721,7 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v2, v3, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6910,11 +6730,9 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v3 -; GFX11-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 1, v3 ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-NEXT: ; divergent control-flow edge ; GFX11-NEXT: s_cbranch_execnz .LBB21_1 ; GFX11-NEXT: .LBB21_2: ; %atomicrmw.end @@ -6939,9 +6757,8 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v3 -; GFX10-NEXT: s_xor_b32 s5, exec_lo, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX10-NEXT: s_or_b32 s4, s4, s5 -; GFX10-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX10-NEXT: ; divergent control-flow edge ; GFX10-NEXT: s_cbranch_execnz .LBB21_1 ; GFX10-NEXT: .LBB21_2: ; %atomicrmw.end @@ -6963,10 +6780,9 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; 
GFX90A-NEXT: .LBB21_2: ; %atomicrmw.end @@ -6989,9 +6805,8 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX908-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX908-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX908-NEXT: s_mov_b64 exec, vcc ; GFX908-NEXT: ; divergent control-flow edge ; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: .LBB21_2: ; %atomicrmw.end @@ -7017,9 +6832,8 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX8-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX8-NEXT: s_mov_b64 exec, vcc ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: .LBB21_2: ; %atomicrmw.end @@ -7056,10 +6870,9 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX7-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: .LBB21_2: ; %atomicrmw.end @@ -7099,10 +6912,9 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX6-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX6-NEXT: 
s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB21_1 ; GFX6-NEXT: .LBB21_2: ; %atomicrmw.end @@ -7155,9 +6967,8 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-NEXT: ; divergent control-flow edge ; GFX11-NEXT: s_cbranch_execnz .LBB22_1 ; GFX11-NEXT: .LBB22_2: ; %atomicrmw.end @@ -7181,9 +6992,8 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-NEXT: s_xor_b32 s5, exec_lo, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX10-NEXT: s_or_b32 s4, s4, s5 -; GFX10-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX10-NEXT: ; divergent control-flow edge ; GFX10-NEXT: s_cbranch_execnz .LBB22_1 ; GFX10-NEXT: .LBB22_2: ; %atomicrmw.end @@ -7204,10 +7014,9 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: .LBB22_2: ; %atomicrmw.end @@ -7228,10 +7037,9 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; 
GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX908-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX908-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX908-NEXT: s_mov_b64 exec, vcc ; GFX908-NEXT: ; divergent control-flow edge ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: .LBB22_2: ; %atomicrmw.end @@ -7255,10 +7063,9 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX8-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX8-NEXT: s_mov_b64 exec, vcc ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: .LBB22_2: ; %atomicrmw.end @@ -7294,10 +7101,9 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX7-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB22_1 ; GFX7-NEXT: .LBB22_2: ; %atomicrmw.end @@ -7333,10 +7139,9 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX6-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; 
GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB22_1 ; GFX6-NEXT: .LBB22_2: ; %atomicrmw.end @@ -7385,9 +7190,8 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-NEXT: ; divergent control-flow edge ; GFX11-NEXT: s_cbranch_execnz .LBB23_1 ; GFX11-NEXT: .LBB23_2: ; %atomicrmw.end @@ -7411,9 +7215,8 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-NEXT: s_xor_b32 s5, exec_lo, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX10-NEXT: s_or_b32 s4, s4, s5 -; GFX10-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX10-NEXT: ; divergent control-flow edge ; GFX10-NEXT: s_cbranch_execnz .LBB23_1 ; GFX10-NEXT: .LBB23_2: ; %atomicrmw.end @@ -7434,10 +7237,9 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 ; GFX90A-NEXT: .LBB23_2: ; %atomicrmw.end @@ -7458,10 +7260,9 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: v_cndmask_b32_e64 v2, 
0, 1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX908-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX908-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX908-NEXT: s_mov_b64 exec, vcc ; GFX908-NEXT: ; divergent control-flow edge ; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: .LBB23_2: ; %atomicrmw.end @@ -7485,10 +7286,9 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX8-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX8-NEXT: s_mov_b64 exec, vcc ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: .LBB23_2: ; %atomicrmw.end @@ -7524,10 +7324,9 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX7-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB23_1 ; GFX7-NEXT: .LBB23_2: ; %atomicrmw.end @@ -7564,10 +7363,9 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX6-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; 
GFX6-NEXT: s_cbranch_execnz .LBB23_1 ; GFX6-NEXT: .LBB23_2: ; %atomicrmw.end @@ -7616,6 +7414,7 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v3 @@ -7642,11 +7441,9 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v4 -; GFX11-TRUE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 1, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-TRUE16-NEXT: ; divergent control-flow edge ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB24_1 ; GFX11-TRUE16-NEXT: .LBB24_2: ; %atomicrmw.end @@ -7684,8 +7481,9 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; 
GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 @@ -7693,11 +7491,9 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-FAKE16-NEXT: buffer_gl0_inv ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v4 -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 1, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s0 -; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-FAKE16-NEXT: ; divergent control-flow edge ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB24_1 ; GFX11-FAKE16-NEXT: .LBB24_2: ; %atomicrmw.end @@ -7739,9 +7535,8 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v4 -; GFX10-NEXT: s_xor_b32 s4, exec_lo, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX10-NEXT: s_or_b32 s5, s5, s4 -; GFX10-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX10-NEXT: ; divergent control-flow edge ; GFX10-NEXT: s_cbranch_execnz .LBB24_1 ; GFX10-NEXT: .LBB24_2: ; %atomicrmw.end @@ -7781,10 +7576,9 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v4 ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.end @@ -7825,9 +7619,8 @@ define <2 x bfloat> 
@local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX908-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX908-NEXT: s_xor_b64 s[4:5], exec, vcc +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX908-NEXT: s_mov_b64 exec, vcc ; GFX908-NEXT: ; divergent control-flow edge ; GFX908-NEXT: s_cbranch_execnz .LBB24_1 ; GFX908-NEXT: .LBB24_2: ; %atomicrmw.end @@ -7870,9 +7663,8 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX8-NEXT: s_xor_b64 s[4:5], exec, vcc +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX8-NEXT: s_mov_b64 exec, vcc ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execnz .LBB24_1 ; GFX8-NEXT: .LBB24_2: ; %atomicrmw.end @@ -7911,12 +7703,11 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX7-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB24_1 ; GFX7-NEXT: .LBB24_2: ; %atomicrmw.end @@ -7958,12 +7749,11 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX6-NEXT: s_xor_b64 
s[6:7], exec, vcc ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB24_1 ; GFX6-NEXT: .LBB24_2: ; %atomicrmw.end @@ -8011,6 +7801,7 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v3 @@ -8037,11 +7828,9 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v4 -; GFX11-TRUE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 1, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-TRUE16-NEXT: ; divergent control-flow edge ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_1 ; GFX11-TRUE16-NEXT: .LBB25_2: ; %atomicrmw.end @@ -8079,8 +7868,9 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 @@ -8088,11 +7878,9 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-FAKE16-NEXT: buffer_gl0_inv ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v4 -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 1, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s0 -; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-FAKE16-NEXT: ; divergent control-flow edge ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_1 ; GFX11-FAKE16-NEXT: .LBB25_2: ; %atomicrmw.end @@ -8134,9 +7922,8 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v4 -; GFX10-NEXT: s_xor_b32 s4, exec_lo, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX10-NEXT: s_or_b32 s5, s5, s4 -; GFX10-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX10-NEXT: ; divergent control-flow edge ; GFX10-NEXT: s_cbranch_execnz .LBB25_1 ; GFX10-NEXT: .LBB25_2: ; %atomicrmw.end @@ -8176,10 +7963,9 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v4 ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: 
v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 ; GFX90A-NEXT: .LBB25_2: ; %atomicrmw.end @@ -8220,9 +8006,8 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX908-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX908-NEXT: s_xor_b64 s[4:5], exec, vcc +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX908-NEXT: s_mov_b64 exec, vcc ; GFX908-NEXT: ; divergent control-flow edge ; GFX908-NEXT: s_cbranch_execnz .LBB25_1 ; GFX908-NEXT: .LBB25_2: ; %atomicrmw.end @@ -8265,9 +8050,8 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX8-NEXT: s_xor_b64 s[4:5], exec, vcc +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX8-NEXT: s_mov_b64 exec, vcc ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execnz .LBB25_1 ; GFX8-NEXT: .LBB25_2: ; %atomicrmw.end @@ -8306,12 +8090,11 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX7-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB25_1 ; GFX7-NEXT: 
.LBB25_2: ; %atomicrmw.end @@ -8354,12 +8137,11 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX6-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB25_1 ; GFX6-NEXT: .LBB25_2: ; %atomicrmw.end @@ -8434,9 +8216,8 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v3 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 -; GFX11-TRUE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-TRUE16-NEXT: ; divergent control-flow edge ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 ; GFX11-TRUE16-NEXT: .LBB26_2: ; %atomicrmw.end @@ -8483,9 +8264,8 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v3 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, vcc_lo +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s0 -; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-FAKE16-NEXT: ; divergent control-flow edge ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_1 ; GFX11-FAKE16-NEXT: .LBB26_2: ; %atomicrmw.end @@ -8526,9 +8306,8 @@ 
define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v4 -; GFX10-NEXT: s_xor_b32 s4, exec_lo, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX10-NEXT: s_or_b32 s5, s5, s4 -; GFX10-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX10-NEXT: ; divergent control-flow edge ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 ; GFX10-NEXT: .LBB26_2: ; %atomicrmw.end @@ -8567,10 +8346,9 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: .LBB26_2: ; %atomicrmw.end @@ -8609,10 +8387,9 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX908-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX908-NEXT: v_mov_b32_e32 v3, v4 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX908-NEXT: s_mov_b64 exec, vcc ; GFX908-NEXT: ; divergent control-flow edge ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: .LBB26_2: ; %atomicrmw.end @@ -8653,10 +8430,9 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX8-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], 
vcc ; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX8-NEXT: s_mov_b64 exec, vcc ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: .LBB26_2: ; %atomicrmw.end @@ -8694,12 +8470,11 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX7-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB26_1 ; GFX7-NEXT: .LBB26_2: ; %atomicrmw.end @@ -8737,12 +8512,11 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX6-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB26_1 ; GFX6-NEXT: .LBB26_2: ; %atomicrmw.end @@ -8812,9 +8586,8 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v3 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 -; GFX11-TRUE16-NEXT: s_xor_b32 s1, exec_lo, vcc_lo +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX11-TRUE16-NEXT: 
s_or_b32 s0, s0, s1 -; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-TRUE16-NEXT: ; divergent control-flow edge ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-TRUE16-NEXT: .LBB27_2: ; %atomicrmw.end @@ -8861,9 +8634,8 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v3 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, vcc_lo +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s0 -; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX11-FAKE16-NEXT: ; divergent control-flow edge ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-FAKE16-NEXT: .LBB27_2: ; %atomicrmw.end @@ -8904,9 +8676,8 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v4 -; GFX10-NEXT: s_xor_b32 s4, exec_lo, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX10-NEXT: s_or_b32 s5, s5, s4 -; GFX10-NEXT: s_mov_b32 exec_lo, vcc_lo ; GFX10-NEXT: ; divergent control-flow edge ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 ; GFX10-NEXT: .LBB27_2: ; %atomicrmw.end @@ -8945,10 +8716,9 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX90A-NEXT: s_mov_b64 exec, vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: .LBB27_2: ; %atomicrmw.end @@ -8987,10 +8757,9 @@ define void 
@local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX908-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX908-NEXT: v_mov_b32_e32 v3, v4 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX908-NEXT: s_mov_b64 exec, vcc ; GFX908-NEXT: ; divergent control-flow edge ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: .LBB27_2: ; %atomicrmw.end @@ -9031,10 +8800,9 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 1, v3 -; GFX8-NEXT: s_xor_b64 s[4:5], exec, vcc ; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] -; GFX8-NEXT: s_mov_b64 exec, vcc ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: .LBB27_2: ; %atomicrmw.end @@ -9072,12 +8840,11 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX7-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB27_1 ; GFX7-NEXT: .LBB27_2: ; %atomicrmw.end @@ -9116,12 +8883,11 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 
1, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX6-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB27_1 ; GFX6-NEXT: .LBB27_2: ; %atomicrmw.end @@ -9148,9 +8914,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_co_i32 s1, s3, 4 -; GFX12-NEXT: s_xor_b32 s3, s0, exec_lo +; GFX12-NEXT: s_xor_b32 s0, s0, exec_lo ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 exec_lo, s3 +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: ; divergent control-flow edge ; GFX12-NEXT: s_cbranch_execz .LBB28_2 ; GFX12-NEXT: .LBB28_1: @@ -9166,22 +8932,22 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: .LBB28_2: ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_mov_b32 s6, exec_lo ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-NEXT: v_mbcnt_lo_u32_b32 v3, s3, 0 +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v3, s6, 0 ; GFX12-NEXT: v_readfirstlane_b32 s0, v2 ; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 -; GFX12-NEXT: s_xor_b32 s6, vcc_lo, exec_lo +; GFX12-NEXT: s_xor_b32 s3, vcc_lo, exec_lo ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-NEXT: s_mov_b32 exec_lo, s6 +; GFX12-NEXT: s_and_saveexec_b32 s3, s3 ; GFX12-NEXT: ; divergent control-flow edge ; GFX12-NEXT: s_cbranch_execz .LBB28_4 ; GFX12-NEXT: .LBB28_3: -; GFX12-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX12-NEXT: s_bcnt1_i32_b32 s6, s6 ; 
GFX12-NEXT: s_lshl_b32 s1, s1, 4 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v2, s3 +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mul_f32 v2, 0x42280000, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -9189,7 +8955,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: .LBB28_4: -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, vcc_lo +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 ; GFX12-NEXT: s_mov_b32 s1, exec_lo @@ -9220,8 +8987,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX12-NEXT: s_xor_b32 s1, exec_lo, vcc_lo -; GFX12-NEXT: s_mov_b32 exec_lo, vcc_lo +; GFX12-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX12-NEXT: ; divergent control-flow edge ; GFX12-NEXT: s_cbranch_execz .LBB28_8 ; GFX12-NEXT: .LBB28_7: @@ -9257,9 +9023,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_add_i32 s3, s3, 4 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX942-NEXT: s_xor_b64 s[8:9], s[0:1], exec +; GFX942-NEXT: s_xor_b64 s[0:1], s[0:1], exec ; GFX942-NEXT: ; implicit-def: $vgpr2 -; GFX942-NEXT: s_mov_b64 exec, s[8:9] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: ; divergent control-flow edge ; GFX942-NEXT: s_cbranch_execz .LBB28_2 ; GFX942-NEXT: .LBB28_1: @@ -9272,30 +9038,30 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: .LBB28_2: ; GFX942-NEXT: s_or_b64 exec, 
exec, s[0:1] -; GFX942-NEXT: s_mov_b64 s[0:1], exec -; GFX942-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 -; GFX942-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 +; GFX942-NEXT: s_mov_b64 s[6:7], exec +; GFX942-NEXT: v_mbcnt_lo_u32_b32 v3, s6, 0 +; GFX942-NEXT: v_mbcnt_hi_u32_b32 v3, s7, v3 ; GFX942-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX942-NEXT: v_readfirstlane_b32 s6, v2 -; GFX942-NEXT: s_xor_b64 s[8:9], vcc, exec -; GFX942-NEXT: s_mov_b64 exec, s[8:9] +; GFX942-NEXT: v_readfirstlane_b32 s8, v2 +; GFX942-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: ; divergent control-flow edge ; GFX942-NEXT: s_cbranch_execz .LBB28_4 ; GFX942-NEXT: .LBB28_3: -; GFX942-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX942-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 -; GFX942-NEXT: s_lshl_b32 s0, s3, 4 +; GFX942-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX942-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 +; GFX942-NEXT: s_lshl_b32 s3, s3, 4 ; GFX942-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 -; GFX942-NEXT: v_mov_b32_e32 v3, s0 +; GFX942-NEXT: v_mov_b32_e32 v3, s3 ; GFX942-NEXT: ds_add_f32 v3, v2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: .LBB28_4: -; GFX942-NEXT: s_or_b64 exec, exec, vcc +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX942-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 -; GFX942-NEXT: v_add_f32_e32 v0, s6, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, s6 +; GFX942-NEXT: v_add_f32_e32 v0, s8, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, s8 ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX942-NEXT: s_mov_b64 s[0:1], exec ; GFX942-NEXT: v_bfrev_b32_e32 v1, 1 @@ -9316,11 +9082,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX942-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX942-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX942-NEXT: s_xor_b64 s[0:1], exec, vcc ; GFX942-NEXT: ; implicit-def: $vgpr3 -; GFX942-NEXT: 
s_nop 0 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GFX942-NEXT: s_mov_b64 exec, vcc +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: ; divergent control-flow edge ; GFX942-NEXT: s_cbranch_execz .LBB28_8 ; GFX942-NEXT: .LBB28_7: @@ -9351,9 +9116,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_i32 s1, s3, 4 -; GFX11-NEXT: s_xor_b32 s3, s0, exec_lo +; GFX11-NEXT: s_xor_b32 s0, s0, exec_lo ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mov_b32 exec_lo, s3 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: ; divergent control-flow edge ; GFX11-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-NEXT: .LBB28_1: @@ -9369,27 +9134,27 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX11-NEXT: .LBB28_2: ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: s_mov_b32 s3, exec_lo +; GFX11-NEXT: s_mov_b32 s6, exec_lo ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX11-NEXT: v_mbcnt_lo_u32_b32 v3, s3, 0 +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v3, s6, 0 ; GFX11-NEXT: v_readfirstlane_b32 s0, v2 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 -; GFX11-NEXT: s_xor_b32 s6, vcc_lo, exec_lo +; GFX11-NEXT: s_xor_b32 s3, vcc_lo, exec_lo ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mov_b32 exec_lo, s6 +; GFX11-NEXT: s_and_saveexec_b32 s3, s3 ; GFX11-NEXT: ; divergent control-flow edge ; GFX11-NEXT: s_cbranch_execz .LBB28_4 ; GFX11-NEXT: .LBB28_3: -; GFX11-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX11-NEXT: s_bcnt1_i32_b32 s6, s6 ; GFX11-NEXT: s_lshl_b32 s1, s1, 4 -; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, s3 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mul_f32 v2, 
0x42280000, v2 ; GFX11-NEXT: ds_add_f32 v3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: .LBB28_4: -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, vcc_lo +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 @@ -9418,8 +9183,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo -; GFX11-NEXT: s_xor_b32 s0, exec_lo, vcc_lo -; GFX11-NEXT: s_mov_b32 exec_lo, vcc_lo +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-NEXT: ; divergent control-flow edge ; GFX11-NEXT: s_cbranch_execz .LBB28_8 ; GFX11-NEXT: .LBB28_7: @@ -9449,8 +9213,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_i32 s1, s3, 4 -; GFX10-NEXT: s_xor_b32 s3, s0, exec_lo -; GFX10-NEXT: s_mov_b32 exec_lo, s3 +; GFX10-NEXT: s_xor_b32 s0, s0, exec_lo +; GFX10-NEXT: s_and_saveexec_b32 s0, s0 ; GFX10-NEXT: ; divergent control-flow edge ; GFX10-NEXT: s_cbranch_execz .LBB28_2 ; GFX10-NEXT: .LBB28_1: @@ -9465,19 +9229,19 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX10-NEXT: .LBB28_2: ; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_mov_b32 s3, exec_lo +; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX10-NEXT: v_mbcnt_lo_u32_b32 v3, s3, 0 +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v3, s6, 0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v2 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 -; GFX10-NEXT: s_xor_b32 s6, vcc_lo, exec_lo -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX10-NEXT: s_and_saveexec_b32 
s3, s3 ; GFX10-NEXT: ; divergent control-flow edge ; GFX10-NEXT: s_cbranch_execz .LBB28_4 ; GFX10-NEXT: .LBB28_3: -; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX10-NEXT: s_bcnt1_i32_b32 s6, s6 ; GFX10-NEXT: s_lshl_b32 s1, s1, 4 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, s3 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9486,7 +9250,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: .LBB28_4: ; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 ; GFX10-NEXT: v_bfrev_b32_e32 v1, 1 @@ -9510,8 +9274,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX10-NEXT: ; implicit-def: $vgpr3 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo -; GFX10-NEXT: s_xor_b32 s0, exec_lo, vcc_lo -; GFX10-NEXT: s_mov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX10-NEXT: ; divergent control-flow edge ; GFX10-NEXT: s_cbranch_execz .LBB28_8 ; GFX10-NEXT: .LBB28_7: @@ -9543,9 +9306,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_add_i32 s3, s3, 4 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_xor_b64 s[8:9], s[0:1], exec +; GFX90A-NEXT: s_xor_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: s_mov_b64 exec, s[8:9] +; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB28_2 ; GFX90A-NEXT: .LBB28_1: @@ -9558,30 +9321,30 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX90A-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB28_2: ; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 +; GFX90A-NEXT: s_mov_b64 s[6:7], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v3, s6, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v3, s7, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX90A-NEXT: v_readfirstlane_b32 s6, v2 -; GFX90A-NEXT: s_xor_b64 s[8:9], vcc, exec -; GFX90A-NEXT: s_mov_b64 exec, s[8:9] +; GFX90A-NEXT: v_readfirstlane_b32 s8, v2 +; GFX90A-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB28_4 ; GFX90A-NEXT: .LBB28_3: -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 -; GFX90A-NEXT: s_lshl_b32 s0, s3, 4 +; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 +; GFX90A-NEXT: s_lshl_b32 s3, s3, 4 ; GFX90A-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, s0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s3 ; GFX90A-NEXT: ds_add_f32 v3, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB28_4: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 -; GFX90A-NEXT: v_add_f32_e32 v0, s6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NEXT: v_add_f32_e32 v0, s8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX90A-NEXT: s_mov_b64 s[0:1], exec ; GFX90A-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc @@ -9603,9 +9366,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GFX90A-NEXT: 
s_xor_b64 s[0:1], exec, vcc ; GFX90A-NEXT: ; implicit-def: $vgpr3 -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB28_8 ; GFX90A-NEXT: .LBB28_7: @@ -9635,9 +9397,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_add_i32 s3, s3, 4 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX908-NEXT: s_xor_b64 s[8:9], s[0:1], exec +; GFX908-NEXT: s_xor_b64 s[0:1], s[0:1], exec ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: s_mov_b64 exec, s[8:9] +; GFX908-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX908-NEXT: ; divergent control-flow edge ; GFX908-NEXT: s_cbranch_execz .LBB28_2 ; GFX908-NEXT: .LBB28_1: @@ -9650,30 +9412,30 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: .LBB28_2: ; GFX908-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX908-NEXT: s_mov_b64 s[0:1], exec -; GFX908-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 -; GFX908-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 +; GFX908-NEXT: s_mov_b64 s[6:7], exec +; GFX908-NEXT: v_mbcnt_lo_u32_b32 v3, s6, 0 +; GFX908-NEXT: v_mbcnt_hi_u32_b32 v3, s7, v3 ; GFX908-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX908-NEXT: v_readfirstlane_b32 s6, v2 -; GFX908-NEXT: s_xor_b64 s[8:9], vcc, exec -; GFX908-NEXT: s_mov_b64 exec, s[8:9] +; GFX908-NEXT: v_readfirstlane_b32 s8, v2 +; GFX908-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX908-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX908-NEXT: ; divergent control-flow edge ; GFX908-NEXT: s_cbranch_execz .LBB28_4 ; GFX908-NEXT: .LBB28_3: -; GFX908-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 -; GFX908-NEXT: s_lshl_b32 s0, s3, 4 +; GFX908-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 +; GFX908-NEXT: s_lshl_b32 s3, s3, 4 ; 
GFX908-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 -; GFX908-NEXT: v_mov_b32_e32 v3, s0 +; GFX908-NEXT: v_mov_b32_e32 v3, s3 ; GFX908-NEXT: ds_add_f32 v3, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: .LBB28_4: -; GFX908-NEXT: s_or_b64 exec, exec, vcc +; GFX908-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX908-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 -; GFX908-NEXT: v_add_f32_e32 v0, s6, v0 -; GFX908-NEXT: v_mov_b32_e32 v2, s6 +; GFX908-NEXT: v_add_f32_e32 v0, s8, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, s8 ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX908-NEXT: s_mov_b64 s[0:1], exec ; GFX908-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc @@ -9695,9 +9457,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX908-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GFX908-NEXT: s_xor_b64 s[0:1], exec, vcc ; GFX908-NEXT: ; implicit-def: $vgpr3 -; GFX908-NEXT: s_mov_b64 exec, vcc +; GFX908-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX908-NEXT: ; divergent control-flow edge ; GFX908-NEXT: s_cbranch_execz .LBB28_8 ; GFX908-NEXT: .LBB28_7: @@ -9727,10 +9488,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_i32 s3, s3, 4 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_xor_b64 s[8:9], s[0:1], exec +; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], exec ; GFX8-NEXT: ; implicit-def: $vgpr2 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_mov_b64 exec, s[8:9] +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execz .LBB28_2 ; GFX8-NEXT: .LBB28_1: @@ -9743,30 +9504,30 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB28_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; 
GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, s6, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX8-NEXT: v_readfirstlane_b32 s6, v2 -; GFX8-NEXT: s_xor_b64 s[8:9], vcc, exec -; GFX8-NEXT: s_mov_b64 exec, s[8:9] +; GFX8-NEXT: v_readfirstlane_b32 s8, v2 +; GFX8-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execz .LBB28_4 ; GFX8-NEXT: .LBB28_3: -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 -; GFX8-NEXT: s_lshl_b32 s0, s3, 4 +; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 +; GFX8-NEXT: s_lshl_b32 s3, s3, 4 ; GFX8-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: ds_add_f32 v3, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB28_4: -; GFX8-NEXT: s_or_b64 exec, exec, vcc +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 -; GFX8-NEXT: v_add_f32_e32 v0, s6, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_add_f32_e32 v0, s8, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc @@ -9788,9 +9549,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, vcc ; GFX8-NEXT: ; implicit-def: $vgpr3 -; GFX8-NEXT: s_mov_b64 exec, vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execz 
.LBB28_8 ; GFX8-NEXT: .LBB28_7: @@ -9815,29 +9575,29 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-LABEL: local_ds_fadd: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX7-NEXT: s_mov_b64 s[0:1], exec -; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 -; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 -; GFX7-NEXT: v_cmp_ne_u32_e64 s[12:13], 0, v0 +; GFX7-NEXT: s_mov_b64 s[14:15], exec +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s14, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s15, v0 +; GFX7-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_i32 s7, s7, 4 -; GFX7-NEXT: s_mov_b64 s[2:3], 0 -; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: s_xor_b64 s[14:15], s[12:13], exec +; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], exec ; GFX7-NEXT: s_mov_b64 s[8:9], -1 -; GFX7-NEXT: s_mov_b64 s[10:11], 0 ; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: ; implicit-def: $vgpr2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_mov_b64 exec, s[14:15] +; GFX7-NEXT: s_mov_b64 s[2:3], 0 +; GFX7-NEXT: s_and_saveexec_b64 s[12:13], s[0:1] +; GFX7-NEXT: s_mov_b64 s[10:11], 0 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execz .LBB28_3 ; GFX7-NEXT: .LBB28_1: -; GFX7-NEXT: s_lshl_b32 s14, s7, 3 -; GFX7-NEXT: v_mov_b32_e32 v1, s14 +; GFX7-NEXT: s_lshl_b32 s0, s7, 3 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: ds_read_b32 v2, v1 -; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[14:15] ; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 ; GFX7-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX7-NEXT: s_and_b64 s[0:1], s[8:9], exec @@ -9851,9 +9611,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 ; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] ; GFX7-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v4 -; GFX7-NEXT: s_xor_b64 s[14:15], exec, 
s[0:1] -; GFX7-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] -; GFX7-NEXT: s_mov_b64 exec, s[0:1] +; GFX7-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX7-NEXT: s_or_b64 s[12:13], s[12:13], s[0:1] ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB28_2 ; GFX7-NEXT: .LBB28_3: @@ -9863,10 +9622,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX7-NEXT: s_xor_b64 s[14:15], vcc, exec ; GFX7-NEXT: v_readfirstlane_b32 s12, v2 -; GFX7-NEXT: s_or_b64 s[10:11], s[10:11], vcc -; GFX7-NEXT: s_mov_b64 exec, s[14:15] +; GFX7-NEXT: s_xor_b64 s[14:15], vcc, exec +; GFX7-NEXT: s_and_saveexec_b64 s[14:15], s[14:15] +; GFX7-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execz .LBB28_6 ; GFX7-NEXT: .LBB28_4: @@ -9886,10 +9645,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX7-NEXT: s_xor_b64 s[0:1], exec, vcc ; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7-NEXT: s_or_b64 s[10:11], s[10:11], s[0:1] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB28_5 ; GFX7-NEXT: .LBB28_6: @@ -9920,11 +9678,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 ; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v2, exec_hi, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX7-NEXT: s_xor_b64 s[0:1], exec, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GFX7-NEXT: s_or_b64 s[2:3], s[2:3], s[0:1] ; GFX7-NEXT: ; implicit-def: $vgpr3 -; GFX7-NEXT: s_mov_b64 exec, vcc +; GFX7-NEXT: s_and_saveexec_b64 s[0:1], vcc 
+; GFX7-NEXT: s_or_b64 s[2:3], s[2:3], s[0:1] ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execz .LBB28_11 ; GFX7-NEXT: .LBB28_9: @@ -9942,9 +9699,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v5 -; GFX7-NEXT: s_xor_b64 s[0:1], exec, vcc +; GFX7-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7-NEXT: s_or_b64 s[2:3], s[2:3], s[0:1] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB28_10 ; GFX7-NEXT: .LBB28_11: @@ -9963,30 +9719,30 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX6-LABEL: local_ds_fadd: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s12, s[4:5], 0x3 -; GFX6-NEXT: s_mov_b64 s[0:1], exec -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 -; GFX6-NEXT: v_cmp_ne_u32_e64 s[10:11], 0, v0 +; GFX6-NEXT: s_load_dword s14, s[4:5], 0x3 +; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s12, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s13, v0 +; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s12, s12, 4 -; GFX6-NEXT: s_mov_b64 s[2:3], 0 -; GFX6-NEXT: s_mov_b64 s[2:3], 0 +; GFX6-NEXT: s_add_i32 s14, s14, 4 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX6-NEXT: s_xor_b64 s[14:15], s[10:11], exec +; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], exec ; GFX6-NEXT: s_mov_b64 s[6:7], -1 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b64 s[2:3], 0 ; GFX6-NEXT: ; implicit-def: $vgpr2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_mov_b64 exec, s[14:15] +; GFX6-NEXT: s_mov_b64 s[2:3], 0 +; GFX6-NEXT: s_and_saveexec_b64 s[10:11], s[0:1] +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_mov_b64 s[2:3], 0 ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execz 
.LBB28_3 ; GFX6-NEXT: .LBB28_1: -; GFX6-NEXT: s_lshl_b32 s13, s12, 3 -; GFX6-NEXT: v_mov_b32_e32 v1, s13 +; GFX6-NEXT: s_lshl_b32 s0, s14, 3 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_read_b32 v2, v1 -; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[12:13] ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX6-NEXT: s_and_b64 s[0:1], s[6:7], exec @@ -10000,9 +9756,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v4 -; GFX6-NEXT: s_xor_b64 s[14:15], exec, s[0:1] -; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GFX6-NEXT: s_mov_b64 exec, s[0:1] +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[0:1] ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB28_2 ; GFX6-NEXT: .LBB28_3: @@ -10013,14 +9768,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX6-NEXT: s_xor_b64 s[14:15], vcc, exec ; GFX6-NEXT: v_readfirstlane_b32 s11, v2 -; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], vcc -; GFX6-NEXT: s_mov_b64 exec, s[14:15] +; GFX6-NEXT: s_xor_b64 s[12:13], vcc, exec +; GFX6-NEXT: s_and_saveexec_b64 s[12:13], s[12:13] +; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execz .LBB28_6 ; GFX6-NEXT: .LBB28_4: -; GFX6-NEXT: s_lshl_b32 s12, s12, 4 +; GFX6-NEXT: s_lshl_b32 s12, s14, 4 ; GFX6-NEXT: v_mov_b32_e32 v2, s12 ; GFX6-NEXT: ds_read_b32 v4, v2 ; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -10036,10 +9791,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: 
v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX6-NEXT: s_xor_b64 s[0:1], exec, vcc ; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[0:1] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB28_5 ; GFX6-NEXT: .LBB28_6: @@ -10072,11 +9826,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v2, exec_hi, v2 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX6-NEXT: s_xor_b64 s[0:1], exec, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[0:1] ; GFX6-NEXT: ; implicit-def: $vgpr3 -; GFX6-NEXT: s_mov_b64 exec, vcc +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[0:1] ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execz .LBB28_11 ; GFX6-NEXT: .LBB28_9: @@ -10094,9 +9847,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v5 -; GFX6-NEXT: s_xor_b64 s[0:1], exec, vcc +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[0:1] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB28_10 ; GFX6-NEXT: .LBB28_11: @@ -10136,9 +9888,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_co_i32 s1, s3, 4 -; GFX12-NEXT: s_xor_b32 s3, s0, exec_lo +; GFX12-NEXT: s_xor_b32 s0, s0, exec_lo ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 exec_lo, s3 +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: ; divergent control-flow 
edge ; GFX12-NEXT: s_cbranch_execz .LBB29_2 ; GFX12-NEXT: .LBB29_1: @@ -10152,29 +9904,30 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: .LBB29_2: ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_mov_b32 s6, exec_lo ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-NEXT: v_mbcnt_lo_u32_b32 v3, s3, 0 +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v3, s6, 0 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s0, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 -; GFX12-NEXT: s_xor_b32 s6, vcc_lo, exec_lo +; GFX12-NEXT: s_xor_b32 s3, vcc_lo, exec_lo ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-NEXT: s_mov_b32 exec_lo, s6 +; GFX12-NEXT: s_and_saveexec_b32 s3, s3 ; GFX12-NEXT: ; divergent control-flow edge ; GFX12-NEXT: s_cbranch_execz .LBB29_4 ; GFX12-NEXT: .LBB29_3: -; GFX12-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX12-NEXT: s_bcnt1_i32_b32 s6, s6 ; GFX12-NEXT: s_lshl_b32 s1, s1, 4 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v2, s3 +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mul_f32 v2, 0x42280000, v2 ; GFX12-NEXT: ds_add_f32 v3, v2 ; GFX12-NEXT: .LBB29_4: -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, vcc_lo +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 ; GFX12-NEXT: s_mov_b32 s1, exec_lo @@ -10205,8 +9958,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX12-NEXT: s_xor_b32 s1, exec_lo, vcc_lo -; 
GFX12-NEXT: s_mov_b32 exec_lo, vcc_lo +; GFX12-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX12-NEXT: ; divergent control-flow edge ; GFX12-NEXT: s_cbranch_execz .LBB29_8 ; GFX12-NEXT: .LBB29_7: @@ -10240,9 +9992,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_add_i32 s3, s3, 4 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX942-NEXT: s_xor_b64 s[8:9], s[0:1], exec +; GFX942-NEXT: s_xor_b64 s[0:1], s[0:1], exec ; GFX942-NEXT: ; implicit-def: $vgpr2 -; GFX942-NEXT: s_mov_b64 exec, s[8:9] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: ; divergent control-flow edge ; GFX942-NEXT: s_cbranch_execz .LBB29_2 ; GFX942-NEXT: .LBB29_1: @@ -10254,30 +10006,30 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX942-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX942-NEXT: .LBB29_2: ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: s_mov_b64 s[0:1], exec -; GFX942-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 -; GFX942-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 +; GFX942-NEXT: s_mov_b64 s[6:7], exec +; GFX942-NEXT: v_mbcnt_lo_u32_b32 v3, s6, 0 +; GFX942-NEXT: v_mbcnt_hi_u32_b32 v3, s7, v3 ; GFX942-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_readfirstlane_b32 s6, v2 -; GFX942-NEXT: s_xor_b64 s[8:9], vcc, exec -; GFX942-NEXT: s_mov_b64 exec, s[8:9] +; GFX942-NEXT: v_readfirstlane_b32 s8, v2 +; GFX942-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: ; divergent control-flow edge ; GFX942-NEXT: s_cbranch_execz .LBB29_4 ; GFX942-NEXT: .LBB29_3: -; GFX942-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX942-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 -; GFX942-NEXT: s_lshl_b32 s0, s3, 4 +; GFX942-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX942-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 +; GFX942-NEXT: s_lshl_b32 s3, s3, 4 ; GFX942-NEXT: v_mul_f32_e32 v2, 
0x42280000, v2 -; GFX942-NEXT: v_mov_b32_e32 v3, s0 +; GFX942-NEXT: v_mov_b32_e32 v3, s3 ; GFX942-NEXT: ds_add_f32 v3, v2 ; GFX942-NEXT: .LBB29_4: -; GFX942-NEXT: s_or_b64 exec, exec, vcc +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX942-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 -; GFX942-NEXT: v_add_f32_e32 v0, s6, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, s6 +; GFX942-NEXT: v_add_f32_e32 v0, s8, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, s8 ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX942-NEXT: s_mov_b64 s[0:1], exec ; GFX942-NEXT: v_bfrev_b32_e32 v1, 1 @@ -10298,11 +10050,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX942-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX942-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX942-NEXT: s_xor_b64 s[0:1], exec, vcc ; GFX942-NEXT: ; implicit-def: $vgpr3 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GFX942-NEXT: s_mov_b64 exec, vcc +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: ; divergent control-flow edge ; GFX942-NEXT: s_cbranch_execz .LBB29_8 ; GFX942-NEXT: .LBB29_7: @@ -10332,9 +10083,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_i32 s1, s3, 4 -; GFX11-NEXT: s_xor_b32 s3, s0, exec_lo +; GFX11-NEXT: s_xor_b32 s0, s0, exec_lo ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mov_b32 exec_lo, s3 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: ; divergent control-flow edge ; GFX11-NEXT: s_cbranch_execz .LBB29_2 ; GFX11-NEXT: .LBB29_1: @@ -10348,26 +10099,26 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: .LBB29_2: ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | 
instid1(VALU_DEP_2) -; GFX11-NEXT: s_mov_b32 s3, exec_lo +; GFX11-NEXT: s_mov_b32 s6, exec_lo ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX11-NEXT: v_mbcnt_lo_u32_b32 v3, s3, 0 +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v3, s6, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s0, v2 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 -; GFX11-NEXT: s_xor_b32 s6, vcc_lo, exec_lo +; GFX11-NEXT: s_xor_b32 s3, vcc_lo, exec_lo ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mov_b32 exec_lo, s6 +; GFX11-NEXT: s_and_saveexec_b32 s3, s3 ; GFX11-NEXT: ; divergent control-flow edge ; GFX11-NEXT: s_cbranch_execz .LBB29_4 ; GFX11-NEXT: .LBB29_3: -; GFX11-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX11-NEXT: s_bcnt1_i32_b32 s6, s6 ; GFX11-NEXT: s_lshl_b32 s1, s1, 4 -; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, s3 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mul_f32 v2, 0x42280000, v2 ; GFX11-NEXT: ds_add_f32 v3, v2 ; GFX11-NEXT: .LBB29_4: -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, vcc_lo +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 @@ -10396,8 +10147,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo -; GFX11-NEXT: s_xor_b32 s0, exec_lo, vcc_lo -; GFX11-NEXT: s_mov_b32 exec_lo, vcc_lo +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-NEXT: ; divergent control-flow edge ; GFX11-NEXT: s_cbranch_execz .LBB29_8 ; GFX11-NEXT: .LBB29_7: @@ -10425,8 +10175,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_i32 s1, s3, 4 -; GFX10-NEXT: s_xor_b32 s3, 
s0, exec_lo -; GFX10-NEXT: s_mov_b32 exec_lo, s3 +; GFX10-NEXT: s_xor_b32 s0, s0, exec_lo +; GFX10-NEXT: s_and_saveexec_b32 s0, s0 ; GFX10-NEXT: ; divergent control-flow edge ; GFX10-NEXT: s_cbranch_execz .LBB29_2 ; GFX10-NEXT: .LBB29_1: @@ -10439,26 +10189,26 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX10-NEXT: .LBB29_2: ; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_mov_b32 s3, exec_lo +; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX10-NEXT: v_mbcnt_lo_u32_b32 v3, s3, 0 +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v3, s6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s0, v2 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 -; GFX10-NEXT: s_xor_b32 s6, vcc_lo, exec_lo -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX10-NEXT: s_and_saveexec_b32 s3, s3 ; GFX10-NEXT: ; divergent control-flow edge ; GFX10-NEXT: s_cbranch_execz .LBB29_4 ; GFX10-NEXT: .LBB29_3: -; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX10-NEXT: s_bcnt1_i32_b32 s6, s6 ; GFX10-NEXT: s_lshl_b32 s1, s1, 4 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, s3 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX10-NEXT: ds_add_f32 v3, v2 ; GFX10-NEXT: .LBB29_4: ; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 ; GFX10-NEXT: v_bfrev_b32_e32 v1, 1 @@ -10482,8 +10232,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX10-NEXT: ; implicit-def: $vgpr3 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo -; GFX10-NEXT: s_xor_b32 s0, exec_lo, vcc_lo -; GFX10-NEXT: s_mov_b32 exec_lo, vcc_lo +; 
GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX10-NEXT: ; divergent control-flow edge ; GFX10-NEXT: s_cbranch_execz .LBB29_8 ; GFX10-NEXT: .LBB29_7: @@ -10513,9 +10262,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_add_i32 s3, s3, 4 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_xor_b64 s[8:9], s[0:1], exec +; GFX90A-NEXT: s_xor_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: s_mov_b64 exec, s[8:9] +; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB29_2 ; GFX90A-NEXT: .LBB29_1: @@ -10527,30 +10276,30 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX90A-NEXT: .LBB29_2: ; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 +; GFX90A-NEXT: s_mov_b64 s[6:7], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v3, s6, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v3, s7, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_readfirstlane_b32 s6, v2 -; GFX90A-NEXT: s_xor_b64 s[8:9], vcc, exec -; GFX90A-NEXT: s_mov_b64 exec, s[8:9] +; GFX90A-NEXT: v_readfirstlane_b32 s8, v2 +; GFX90A-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB29_4 ; GFX90A-NEXT: .LBB29_3: -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 -; GFX90A-NEXT: s_lshl_b32 s0, s3, 4 +; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 +; GFX90A-NEXT: s_lshl_b32 s3, s3, 4 ; GFX90A-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 -; GFX90A-NEXT: 
v_mov_b32_e32 v3, s0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s3 ; GFX90A-NEXT: ds_add_f32 v3, v2 ; GFX90A-NEXT: .LBB29_4: -; GFX90A-NEXT: s_or_b64 exec, exec, vcc +; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 -; GFX90A-NEXT: v_add_f32_e32 v0, s6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NEXT: v_add_f32_e32 v0, s8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX90A-NEXT: s_mov_b64 s[0:1], exec ; GFX90A-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc @@ -10572,9 +10321,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GFX90A-NEXT: s_xor_b64 s[0:1], exec, vcc ; GFX90A-NEXT: ; implicit-def: $vgpr3 -; GFX90A-NEXT: s_mov_b64 exec, vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX90A-NEXT: ; divergent control-flow edge ; GFX90A-NEXT: s_cbranch_execz .LBB29_8 ; GFX90A-NEXT: .LBB29_7: @@ -10603,9 +10351,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_add_i32 s3, s3, 4 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX908-NEXT: s_xor_b64 s[8:9], s[0:1], exec +; GFX908-NEXT: s_xor_b64 s[0:1], s[0:1], exec ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: s_mov_b64 exec, s[8:9] +; GFX908-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX908-NEXT: ; divergent control-flow edge ; GFX908-NEXT: s_cbranch_execz .LBB29_2 ; GFX908-NEXT: .LBB29_1: @@ -10617,30 +10365,30 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX908-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX908-NEXT: .LBB29_2: ; GFX908-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX908-NEXT: s_mov_b64 s[0:1], exec -; GFX908-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 -; GFX908-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 +; 
GFX908-NEXT: s_mov_b64 s[6:7], exec +; GFX908-NEXT: v_mbcnt_lo_u32_b32 v3, s6, 0 +; GFX908-NEXT: v_mbcnt_hi_u32_b32 v3, s7, v3 ; GFX908-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_readfirstlane_b32 s6, v2 -; GFX908-NEXT: s_xor_b64 s[8:9], vcc, exec -; GFX908-NEXT: s_mov_b64 exec, s[8:9] +; GFX908-NEXT: v_readfirstlane_b32 s8, v2 +; GFX908-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX908-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX908-NEXT: ; divergent control-flow edge ; GFX908-NEXT: s_cbranch_execz .LBB29_4 ; GFX908-NEXT: .LBB29_3: -; GFX908-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 -; GFX908-NEXT: s_lshl_b32 s0, s3, 4 +; GFX908-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 +; GFX908-NEXT: s_lshl_b32 s3, s3, 4 ; GFX908-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 -; GFX908-NEXT: v_mov_b32_e32 v3, s0 +; GFX908-NEXT: v_mov_b32_e32 v3, s3 ; GFX908-NEXT: ds_add_f32 v3, v2 ; GFX908-NEXT: .LBB29_4: -; GFX908-NEXT: s_or_b64 exec, exec, vcc +; GFX908-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX908-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 -; GFX908-NEXT: v_add_f32_e32 v0, s6, v0 -; GFX908-NEXT: v_mov_b32_e32 v2, s6 +; GFX908-NEXT: v_add_f32_e32 v0, s8, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, s8 ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX908-NEXT: s_mov_b64 s[0:1], exec ; GFX908-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc @@ -10662,9 +10410,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX908-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GFX908-NEXT: s_xor_b64 s[0:1], exec, vcc ; GFX908-NEXT: ; implicit-def: $vgpr3 -; GFX908-NEXT: s_mov_b64 exec, vcc +; GFX908-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX908-NEXT: ; divergent control-flow edge ; GFX908-NEXT: 
s_cbranch_execz .LBB29_8 ; GFX908-NEXT: .LBB29_7: @@ -10693,10 +10440,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_i32 s3, s3, 4 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_xor_b64 s[8:9], s[0:1], exec +; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], exec ; GFX8-NEXT: ; implicit-def: $vgpr2 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_mov_b64 exec, s[8:9] +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execz .LBB29_2 ; GFX8-NEXT: .LBB29_1: @@ -10708,30 +10455,30 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX8-NEXT: .LBB29_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, s6, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s6, v2 -; GFX8-NEXT: s_xor_b64 s[8:9], vcc, exec -; GFX8-NEXT: s_mov_b64 exec, s[8:9] +; GFX8-NEXT: v_readfirstlane_b32 s8, v2 +; GFX8-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execz .LBB29_4 ; GFX8-NEXT: .LBB29_3: -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 -; GFX8-NEXT: s_lshl_b32 s0, s3, 4 +; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 +; GFX8-NEXT: s_lshl_b32 s3, s3, 4 ; GFX8-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: ds_add_f32 v3, v2 ; GFX8-NEXT: .LBB29_4: -; GFX8-NEXT: s_or_b64 exec, exec, vcc +; 
GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 -; GFX8-NEXT: v_add_f32_e32 v0, s6, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_add_f32_e32 v0, s8, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc @@ -10753,9 +10500,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, vcc ; GFX8-NEXT: ; implicit-def: $vgpr3 -; GFX8-NEXT: s_mov_b64 exec, vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execz .LBB29_8 ; GFX8-NEXT: .LBB29_7: @@ -10779,29 +10525,29 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-LABEL: local_ds_fadd_one_as: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX7-NEXT: s_mov_b64 s[0:1], exec -; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 -; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 -; GFX7-NEXT: v_cmp_ne_u32_e64 s[12:13], 0, v0 +; GFX7-NEXT: s_mov_b64 s[14:15], exec +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s14, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s15, v0 +; GFX7-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_i32 s7, s7, 4 -; GFX7-NEXT: s_mov_b64 s[2:3], 0 -; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: s_xor_b64 s[14:15], s[12:13], exec +; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], exec ; GFX7-NEXT: s_mov_b64 s[8:9], -1 -; GFX7-NEXT: s_mov_b64 s[10:11], 0 ; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: ; implicit-def: $vgpr2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_mov_b64 exec, s[14:15] +; GFX7-NEXT: s_mov_b64 s[2:3], 0 +; GFX7-NEXT: 
s_and_saveexec_b64 s[12:13], s[0:1] +; GFX7-NEXT: s_mov_b64 s[10:11], 0 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execz .LBB29_3 ; GFX7-NEXT: .LBB29_1: -; GFX7-NEXT: s_lshl_b32 s14, s7, 3 -; GFX7-NEXT: v_mov_b32_e32 v1, s14 +; GFX7-NEXT: s_lshl_b32 s0, s7, 3 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: ds_read_b32 v2, v1 -; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[14:15] ; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 ; GFX7-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX7-NEXT: s_and_b64 s[0:1], s[8:9], exec @@ -10815,9 +10561,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 ; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] ; GFX7-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v4 -; GFX7-NEXT: s_xor_b64 s[14:15], exec, s[0:1] -; GFX7-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] -; GFX7-NEXT: s_mov_b64 exec, s[0:1] +; GFX7-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX7-NEXT: s_or_b64 s[12:13], s[12:13], s[0:1] ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB29_2 ; GFX7-NEXT: .LBB29_3: @@ -10827,10 +10572,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX7-NEXT: s_xor_b64 s[14:15], vcc, exec ; GFX7-NEXT: v_readfirstlane_b32 s12, v2 -; GFX7-NEXT: s_or_b64 s[10:11], s[10:11], vcc -; GFX7-NEXT: s_mov_b64 exec, s[14:15] +; GFX7-NEXT: s_xor_b64 s[14:15], vcc, exec +; GFX7-NEXT: s_and_saveexec_b64 s[14:15], s[14:15] +; GFX7-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execz .LBB29_6 ; GFX7-NEXT: .LBB29_4: @@ -10850,10 +10595,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; 
GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX7-NEXT: s_xor_b64 s[0:1], exec, vcc ; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7-NEXT: s_or_b64 s[10:11], s[10:11], s[0:1] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB29_5 ; GFX7-NEXT: .LBB29_6: @@ -10884,11 +10628,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 ; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v2, exec_hi, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX7-NEXT: s_xor_b64 s[0:1], exec, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GFX7-NEXT: s_or_b64 s[2:3], s[2:3], s[0:1] ; GFX7-NEXT: ; implicit-def: $vgpr3 -; GFX7-NEXT: s_mov_b64 exec, vcc +; GFX7-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7-NEXT: s_or_b64 s[2:3], s[2:3], s[0:1] ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execz .LBB29_11 ; GFX7-NEXT: .LBB29_9: @@ -10906,9 +10649,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v5 -; GFX7-NEXT: s_xor_b64 s[0:1], exec, vcc +; GFX7-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7-NEXT: s_or_b64 s[2:3], s[2:3], s[0:1] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB29_10 ; GFX7-NEXT: .LBB29_11: @@ -10927,30 +10669,30 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX6-LABEL: local_ds_fadd_one_as: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s12, s[4:5], 0x3 -; GFX6-NEXT: s_mov_b64 s[0:1], exec -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 -; GFX6-NEXT: v_cmp_ne_u32_e64 s[10:11], 0, v0 +; GFX6-NEXT: s_load_dword s14, s[4:5], 0x3 +; GFX6-NEXT: 
s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s12, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s13, v0 +; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s12, s12, 4 -; GFX6-NEXT: s_mov_b64 s[2:3], 0 -; GFX6-NEXT: s_mov_b64 s[2:3], 0 +; GFX6-NEXT: s_add_i32 s14, s14, 4 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX6-NEXT: s_xor_b64 s[14:15], s[10:11], exec +; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], exec ; GFX6-NEXT: s_mov_b64 s[6:7], -1 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b64 s[2:3], 0 ; GFX6-NEXT: ; implicit-def: $vgpr2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_mov_b64 exec, s[14:15] +; GFX6-NEXT: s_mov_b64 s[2:3], 0 +; GFX6-NEXT: s_and_saveexec_b64 s[10:11], s[0:1] +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_mov_b64 s[2:3], 0 ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execz .LBB29_3 ; GFX6-NEXT: .LBB29_1: -; GFX6-NEXT: s_lshl_b32 s13, s12, 3 -; GFX6-NEXT: v_mov_b32_e32 v1, s13 +; GFX6-NEXT: s_lshl_b32 s0, s14, 3 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_read_b32 v2, v1 -; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[12:13] ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX6-NEXT: s_and_b64 s[0:1], s[6:7], exec @@ -10964,9 +10706,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v4 -; GFX6-NEXT: s_xor_b64 s[14:15], exec, s[0:1] -; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GFX6-NEXT: s_mov_b64 exec, s[0:1] +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[0:1] ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB29_2 ; GFX6-NEXT: .LBB29_3: @@ -10977,14 +10718,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr 
addrspace(1) %out, ptr addrs ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX6-NEXT: s_xor_b64 s[14:15], vcc, exec ; GFX6-NEXT: v_readfirstlane_b32 s11, v2 -; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], vcc -; GFX6-NEXT: s_mov_b64 exec, s[14:15] +; GFX6-NEXT: s_xor_b64 s[12:13], vcc, exec +; GFX6-NEXT: s_and_saveexec_b64 s[12:13], s[12:13] +; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execz .LBB29_6 ; GFX6-NEXT: .LBB29_4: -; GFX6-NEXT: s_lshl_b32 s12, s12, 4 +; GFX6-NEXT: s_lshl_b32 s12, s14, 4 ; GFX6-NEXT: v_mov_b32_e32 v2, s12 ; GFX6-NEXT: ds_read_b32 v4, v2 ; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -11000,10 +10741,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 -; GFX6-NEXT: s_xor_b64 s[0:1], exec, vcc ; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[0:1] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB29_5 ; GFX6-NEXT: .LBB29_6: @@ -11036,11 +10776,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v2, exec_hi, v2 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX6-NEXT: s_xor_b64 s[0:1], exec, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[0:1] ; GFX6-NEXT: ; implicit-def: $vgpr3 -; GFX6-NEXT: s_mov_b64 exec, vcc +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[0:1] ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execz .LBB29_11 ; GFX6-NEXT: .LBB29_9: @@ -11058,9 +10797,8 @@ define amdgpu_kernel void 
@local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v5 -; GFX6-NEXT: s_xor_b64 s[0:1], exec, vcc +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[0:1] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB29_10 ; GFX6-NEXT: .LBB29_11: @@ -11172,9 +10910,8 @@ define float @local_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX7-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: s_cbranch_execnz .LBB30_1 ; GFX7-NEXT: .LBB30_2: ; %atomicrmw.end @@ -11198,9 +10935,8 @@ define float @local_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX6-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB30_1 ; GFX6-NEXT: .LBB30_2: ; %atomicrmw.end @@ -11294,10 +11030,9 @@ define void @local_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX7-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 exec, vcc ; GFX7-NEXT: ; divergent control-flow edge ; GFX7-NEXT: 
s_cbranch_execnz .LBB31_1 ; GFX7-NEXT: .LBB31_2: ; %atomicrmw.end @@ -11319,10 +11054,9 @@ define void @local_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GFX6-NEXT: s_xor_b64 s[6:7], exec, vcc ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, vcc ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execnz .LBB31_1 ; GFX6-NEXT: .LBB31_2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/loop_break.ll b/llvm/test/CodeGen/AMDGPU/loop_break.ll index 0d5f28171f2ef..ac8a2c2b39926 100644 --- a/llvm/test/CodeGen/AMDGPU/loop_break.ll +++ b/llvm/test/CodeGen/AMDGPU/loop_break.ll @@ -49,10 +49,8 @@ define amdgpu_kernel void @break_loop(i32 %arg) #0 { ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1 -; GCN-NEXT: s_xor_b64 s[6:7], exec, vcc -; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GCN-NEXT: s_mov_b64 exec, vcc ; GCN-NEXT: ; divergent control-flow edge ; GCN-NEXT: s_cbranch_execnz .LBB0_1 ; GCN-NEXT: .LBB0_3: ; %bb9 @@ -131,10 +129,8 @@ define amdgpu_kernel void @undef_phi_cond_break_loop(i32 %arg) #0 { ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 -; GCN-NEXT: s_xor_b64 s[6:7], exec, vcc -; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GCN-NEXT: s_mov_b64 exec, vcc ; GCN-NEXT: ; divergent control-flow edge ; GCN-NEXT: s_cbranch_execnz .LBB1_1 ; GCN-NEXT: .LBB1_4: ; %bb9 @@ -229,10 +225,8 @@ define amdgpu_kernel void @constexpr_phi_cond_break_loop(i32 %arg) #0 { ; GCN-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v2 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GCN-NEXT: s_xor_b64 s[6:7], exec, vcc -; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GCN-NEXT: s_mov_b64 exec, vcc ; GCN-NEXT: ; divergent control-flow edge ; GCN-NEXT: s_cbranch_execnz .LBB2_1 ; GCN-NEXT: .LBB2_4: ; %bb9 @@ -324,10 +318,8 @@ define amdgpu_kernel void @true_phi_cond_break_loop(i32 %arg) #0 { ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GCN-NEXT: s_xor_b64 s[6:7], exec, vcc -; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GCN-NEXT: s_mov_b64 exec, vcc ; GCN-NEXT: ; divergent control-flow edge ; GCN-NEXT: s_cbranch_execnz .LBB3_1 ; GCN-NEXT: .LBB3_4: ; %bb9 @@ -417,10 +409,8 @@ define amdgpu_kernel void @false_phi_cond_break_loop(i32 %arg) #0 { ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GCN-NEXT: s_xor_b64 s[6:7], exec, vcc -; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GCN-NEXT: s_mov_b64 exec, vcc ; GCN-NEXT: ; divergent control-flow edge ; GCN-NEXT: s_cbranch_execnz .LBB4_1 ; GCN-NEXT: .LBB4_4: ; %bb9 @@ -513,10 +503,8 @@ define amdgpu_kernel void @invert_true_phi_cond_break_loop(i32 %arg) #0 { ; GCN-NEXT: ; in Loop: Header=BB5_1 Depth=1 ; GCN-NEXT: s_add_i32 s4, s4, 1 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_xor_b64 s[6:7], exec, vcc -; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GCN-NEXT: s_mov_b64 exec, vcc ; GCN-NEXT: ; divergent control-flow edge ; GCN-NEXT: s_cbranch_execnz .LBB5_1 ; GCN-NEXT: .LBB5_4: ; %bb9 
diff --git a/llvm/test/CodeGen/AMDGPU/mad_int24.ll b/llvm/test/CodeGen/AMDGPU/mad_int24.ll index 0dcd85fe6ec92..f9ff3e757d18b 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_int24.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_int24.ll @@ -176,9 +176,8 @@ define void @mad24_destroyed_knownbits_2(i32 %arg, i32 %arg1, i32 %arg2, ptr add ; GCN-NEXT: v_mad_i32_i24 v0, v0, v5, v0 ; GCN-NEXT: v_mov_b32_e32 v5, v2 ; GCN-NEXT: s_xor_b64 s[6:7], vcc, exec -; GCN-NEXT: s_xor_b64 s[8:9], exec, s[6:7] -; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] +; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GCN-NEXT: ; divergent control-flow edge ; GCN-NEXT: s_cbranch_execnz .LBB3_1 ; GCN-NEXT: .LBB3_2: ; %bb5 @@ -198,17 +197,16 @@ define void @mad24_destroyed_knownbits_2(i32 %arg, i32 %arg1, i32 %arg2, ptr add ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: .LBB3_1: ; %bb6 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_add_u32_e32 v1, vcc, -1, v1 ; VI-NEXT: v_mad_i32_i24 v0, v0, v5, v5 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; VI-NEXT: v_mad_i32_i24 v5, v0, v5, v0 -; VI-NEXT: s_xor_b64 s[6:7], vcc, exec +; VI-NEXT: v_add_u32_e32 v1, vcc, -1, v1 ; VI-NEXT: v_mad_i32_i24 v0, v5, v0, v5 -; VI-NEXT: s_xor_b64 s[8:9], exec, s[6:7] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; VI-NEXT: v_mad_i32_i24 v0, v0, v5, v0 ; VI-NEXT: v_mov_b32_e32 v5, v2 -; VI-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; VI-NEXT: s_mov_b64 exec, s[6:7] +; VI-NEXT: s_xor_b64 s[6:7], vcc, exec +; VI-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] +; VI-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; VI-NEXT: ; divergent control-flow edge ; VI-NEXT: s_cbranch_execnz .LBB3_1 ; VI-NEXT: .LBB3_2: ; %bb5 diff --git a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll index 27678fd4ed172..229b079f223ba 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll @@ -1541,9 +1541,8 @@ define void 
@mad24_known_bits_destroyed(i32 %arg, <4 x i32> %arg1, <4 x i32> %ar ; GCN-NEXT: buffer_store_dword v5, v[16:17], s[4:7], 0 addr64 ; GCN-NEXT: buffer_store_dwordx4 v[5:8], v[18:19], s[4:7], 0 addr64 ; GCN-NEXT: s_xor_b64 s[10:11], vcc, exec -; GCN-NEXT: s_xor_b64 s[12:13], exec, s[10:11] -; GCN-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GCN-NEXT: s_mov_b64 exec, s[10:11] +; GCN-NEXT: s_and_saveexec_b64 s[10:11], s[10:11] +; GCN-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] ; GCN-NEXT: ; divergent control-flow edge ; GCN-NEXT: s_cbranch_execnz .LBB9_1 ; GCN-NEXT: .LBB9_2: ; %bb18 @@ -1563,21 +1562,20 @@ define void @mad24_known_bits_destroyed(i32 %arg, <4 x i32> %arg1, <4 x i32> %ar ; GFX8-NEXT: .LBB9_1: ; %bb19 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, -1, v15 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 -; GFX8-NEXT: s_xor_b64 s[6:7], vcc, exec ; GFX8-NEXT: v_mad_u32_u24 v4, v5, v0, v14 ; GFX8-NEXT: v_mad_u32_u24 v6, v6, v1, v10 ; GFX8-NEXT: v_mad_u32_u24 v7, v7, v2, v11 ; GFX8-NEXT: v_mad_u32_u24 v8, v8, v3, v12 -; GFX8-NEXT: s_xor_b64 s[8:9], exec, s[6:7] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 ; GFX8-NEXT: v_mad_u32_u24 v5, v4, v0, v14 ; GFX8-NEXT: v_mad_u32_u24 v6, v6, v1, v10 ; GFX8-NEXT: v_mad_u32_u24 v7, v7, v2, v11 ; GFX8-NEXT: v_mad_u32_u24 v8, v8, v3, v12 -; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX8-NEXT: s_xor_b64 s[6:7], vcc, exec ; GFX8-NEXT: flat_store_dword v[16:17], v5 ; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[5:8] -; GFX8-NEXT: s_mov_b64 exec, s[6:7] +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execnz .LBB9_1 ; GFX8-NEXT: .LBB9_2: ; %bb18 diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll index d937f85da3a8e..4e047ea54bafd 100644 --- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll +++ 
b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll @@ -5416,7 +5416,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1] ; CHECK-NEXT: s_xor_b32 s6, vcc_lo, exec_lo -; CHECK-NEXT: s_mov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_and_saveexec_b32 s7, vcc_lo ; CHECK-NEXT: ; divergent control-flow edge ; CHECK-NEXT: s_cbranch_execz .LBB5_3 ; CHECK-NEXT: .LBB5_1: ; %memmove_bwd_loop.preheader @@ -5490,9 +5490,8 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] ; CHECK-NEXT: s_cbranch_scc0 .LBB5_2 ; CHECK-NEXT: .LBB5_3: -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 -; CHECK-NEXT: s_mov_b32 exec_lo, s6 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; CHECK-NEXT: s_and_saveexec_b32 s6, s6 ; CHECK-NEXT: ; divergent control-flow edge ; CHECK-NEXT: s_cbranch_execz .LBB5_6 ; CHECK-NEXT: .LBB5_4: ; %memmove_fwd_loop.preheader @@ -5561,7 +5560,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] ; CHECK-NEXT: s_cbranch_scc1 .LBB5_5 ; CHECK-NEXT: .LBB5_6: ; %memmove_done -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] ; @@ -5581,7 +5580,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1] ; ALIGNED-NEXT: s_xor_b32 s6, vcc_lo, exec_lo -; ALIGNED-NEXT: s_mov_b32 exec_lo, vcc_lo +; ALIGNED-NEXT: s_and_saveexec_b32 s7, vcc_lo ; ALIGNED-NEXT: ; divergent control-flow edge ; ALIGNED-NEXT: s_cbranch_execz .LBB5_3 ; ALIGNED-NEXT: .LBB5_1: ; %memmove_bwd_loop.preheader @@ 
-6221,9 +6220,8 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: flat_store_byte v[20:21], v0 offset:46 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB5_2 ; ALIGNED-NEXT: .LBB5_3: -; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; ALIGNED-NEXT: s_xor_b32 s7, exec_lo, s6 -; ALIGNED-NEXT: s_mov_b32 exec_lo, s6 +; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; ALIGNED-NEXT: s_and_saveexec_b32 s6, s6 ; ALIGNED-NEXT: ; divergent control-flow edge ; ALIGNED-NEXT: s_cbranch_execz .LBB5_6 ; ALIGNED-NEXT: .LBB5_4: ; %memmove_fwd_loop.preheader @@ -6858,7 +6856,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: flat_store_byte v[20:21], v4 offset:46 ; ALIGNED-NEXT: s_cbranch_scc1 .LBB5_5 ; ALIGNED-NEXT: .LBB5_6: ; %memmove_done -; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; ALIGNED-NEXT: s_clause 0xa ; 44-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 @@ -6879,7 +6877,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; UNROLL3-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1] ; UNROLL3-NEXT: s_xor_b32 s6, vcc_lo, exec_lo -; UNROLL3-NEXT: s_mov_b32 exec_lo, vcc_lo +; UNROLL3-NEXT: s_and_saveexec_b32 s7, vcc_lo ; UNROLL3-NEXT: ; divergent control-flow edge ; UNROLL3-NEXT: s_cbranch_execz .LBB5_3 ; UNROLL3-NEXT: .LBB5_1: ; %memmove_bwd_residual @@ -6917,9 +6915,8 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], 0 ; UNROLL3-NEXT: s_cbranch_scc0 .LBB5_2 ; UNROLL3-NEXT: .LBB5_3: -; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; UNROLL3-NEXT: s_xor_b32 s7, exec_lo, s6 -; UNROLL3-NEXT: s_mov_b32 exec_lo, s6 +; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; UNROLL3-NEXT: s_and_saveexec_b32 s6, s6 ; 
UNROLL3-NEXT: ; divergent control-flow edge ; UNROLL3-NEXT: s_cbranch_execz .LBB5_7 ; UNROLL3-NEXT: .LBB5_4: ; %memmove_fwd_loop.preheader @@ -6957,7 +6954,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[2:5] offset:2032 ; UNROLL3-NEXT: .LBB5_7: ; %memmove_done -; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; UNROLL3-NEXT: s_waitcnt lgkmcnt(0) ; UNROLL3-NEXT: s_setpc_b64 s[30:31] entry: @@ -6971,7 +6968,7 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1] ; CHECK-NEXT: s_xor_b32 s6, vcc_lo, exec_lo -; CHECK-NEXT: s_mov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_and_saveexec_b32 s7, vcc_lo ; CHECK-NEXT: ; divergent control-flow edge ; CHECK-NEXT: s_cbranch_execz .LBB6_3 ; CHECK-NEXT: .LBB6_1: ; %memmove_bwd_loop.preheader @@ -7041,9 +7038,8 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB6_2 ; CHECK-NEXT: .LBB6_3: -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 -; CHECK-NEXT: s_mov_b32 exec_lo, s6 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; CHECK-NEXT: s_and_saveexec_b32 s6, s6 ; CHECK-NEXT: ; divergent control-flow edge ; CHECK-NEXT: s_cbranch_execz .LBB6_6 ; CHECK-NEXT: .LBB6_4: ; %memmove_fwd_loop.preheader @@ -7108,7 +7104,7 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB6_5 ; CHECK-NEXT: .LBB6_6: ; %memmove_done -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_setpc_b64 s[30:31] ; ; ALIGNED-LABEL: memmove_p1_p1_sz2048: @@ -7126,7 +7122,7 @@ define 
void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1] ; ALIGNED-NEXT: s_xor_b32 s6, vcc_lo, exec_lo -; ALIGNED-NEXT: s_mov_b32 exec_lo, vcc_lo +; ALIGNED-NEXT: s_and_saveexec_b32 s7, vcc_lo ; ALIGNED-NEXT: ; divergent control-flow edge ; ALIGNED-NEXT: s_cbranch_execz .LBB6_3 ; ALIGNED-NEXT: .LBB6_1: ; %memmove_bwd_loop.preheader @@ -7757,9 +7753,8 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v13, null, -1, v13, vcc_lo ; ALIGNED-NEXT: s_cbranch_scc0 .LBB6_2 ; ALIGNED-NEXT: .LBB6_3: -; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; ALIGNED-NEXT: s_xor_b32 s7, exec_lo, s6 -; ALIGNED-NEXT: s_mov_b32 exec_lo, s6 +; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; ALIGNED-NEXT: s_and_saveexec_b32 s6, s6 ; ALIGNED-NEXT: ; divergent control-flow edge ; ALIGNED-NEXT: s_cbranch_execz .LBB6_6 ; ALIGNED-NEXT: .LBB6_4: ; %memmove_fwd_loop.preheader @@ -8385,7 +8380,7 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; ALIGNED-NEXT: s_cbranch_scc1 .LBB6_5 ; ALIGNED-NEXT: .LBB6_6: ; %memmove_done -; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; ALIGNED-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 @@ -8405,7 +8400,7 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; UNROLL3-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1] ; UNROLL3-NEXT: s_xor_b32 s6, vcc_lo, exec_lo -; UNROLL3-NEXT: s_mov_b32 exec_lo, vcc_lo +; UNROLL3-NEXT: s_and_saveexec_b32 s7, vcc_lo ; UNROLL3-NEXT: ; divergent control-flow edge ; UNROLL3-NEXT: 
s_cbranch_execz .LBB6_3 ; UNROLL3-NEXT: .LBB6_1: ; %memmove_bwd_residual @@ -8443,9 +8438,8 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], 0 ; UNROLL3-NEXT: s_cbranch_scc0 .LBB6_2 ; UNROLL3-NEXT: .LBB6_3: -; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; UNROLL3-NEXT: s_xor_b32 s7, exec_lo, s6 -; UNROLL3-NEXT: s_mov_b32 exec_lo, s6 +; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; UNROLL3-NEXT: s_and_saveexec_b32 s6, s6 ; UNROLL3-NEXT: ; divergent control-flow edge ; UNROLL3-NEXT: s_cbranch_execz .LBB6_7 ; UNROLL3-NEXT: .LBB6_4: ; %memmove_fwd_loop.preheader @@ -8483,7 +8477,7 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) ; UNROLL3-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:2032 ; UNROLL3-NEXT: .LBB6_7: ; %memmove_done -; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; UNROLL3-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 2048, i1 false) @@ -8496,7 +8490,7 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1] ; CHECK-NEXT: s_xor_b32 s6, vcc_lo, exec_lo -; CHECK-NEXT: s_mov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_and_saveexec_b32 s7, vcc_lo ; CHECK-NEXT: ; divergent control-flow edge ; CHECK-NEXT: s_cbranch_execz .LBB7_3 ; CHECK-NEXT: .LBB7_1: ; %memmove_bwd_loop.preheader @@ -8566,9 +8560,8 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] ; CHECK-NEXT: s_cbranch_scc0 .LBB7_2 ; CHECK-NEXT: .LBB7_3: -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 -; CHECK-NEXT: s_mov_b32 exec_lo, s6 +; 
CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; CHECK-NEXT: s_and_saveexec_b32 s6, s6 ; CHECK-NEXT: ; divergent control-flow edge ; CHECK-NEXT: s_cbranch_execz .LBB7_6 ; CHECK-NEXT: .LBB7_4: ; %memmove_fwd_loop.preheader @@ -8635,7 +8628,7 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] ; CHECK-NEXT: s_cbranch_scc1 .LBB7_5 ; CHECK-NEXT: .LBB7_6: ; %memmove_done -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] ; @@ -8644,7 +8637,7 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; ALIGNED-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1] ; ALIGNED-NEXT: s_xor_b32 s6, vcc_lo, exec_lo -; ALIGNED-NEXT: s_mov_b32 exec_lo, vcc_lo +; ALIGNED-NEXT: s_and_saveexec_b32 s7, vcc_lo ; ALIGNED-NEXT: ; divergent control-flow edge ; ALIGNED-NEXT: s_cbranch_execz .LBB7_3 ; ALIGNED-NEXT: .LBB7_1: ; %memmove_bwd_loop.preheader @@ -9146,9 +9139,8 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: flat_store_byte v[98:99], v26 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB7_2 ; ALIGNED-NEXT: .LBB7_3: -; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; ALIGNED-NEXT: s_xor_b32 s7, exec_lo, s6 -; ALIGNED-NEXT: s_mov_b32 exec_lo, s6 +; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; ALIGNED-NEXT: s_and_saveexec_b32 s6, s6 ; ALIGNED-NEXT: ; divergent control-flow edge ; ALIGNED-NEXT: s_cbranch_execz .LBB7_6 ; ALIGNED-NEXT: .LBB7_4: ; %memmove_fwd_loop.preheader @@ -9648,7 +9640,7 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: flat_store_byte v[98:99], v82 ; ALIGNED-NEXT: s_cbranch_scc1 .LBB7_5 ; ALIGNED-NEXT: .LBB7_6: ; %memmove_done -; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, 
s6 ; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -9657,7 +9649,7 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; UNROLL3-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1] ; UNROLL3-NEXT: s_xor_b32 s6, vcc_lo, exec_lo -; UNROLL3-NEXT: s_mov_b32 exec_lo, vcc_lo +; UNROLL3-NEXT: s_and_saveexec_b32 s7, vcc_lo ; UNROLL3-NEXT: ; divergent control-flow edge ; UNROLL3-NEXT: s_cbranch_execz .LBB7_3 ; UNROLL3-NEXT: .LBB7_1: ; %memmove_bwd_residual @@ -9694,9 +9686,8 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], 0 ; UNROLL3-NEXT: s_cbranch_scc0 .LBB7_2 ; UNROLL3-NEXT: .LBB7_3: -; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; UNROLL3-NEXT: s_xor_b32 s7, exec_lo, s6 -; UNROLL3-NEXT: s_mov_b32 exec_lo, s6 +; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; UNROLL3-NEXT: s_and_saveexec_b32 s6, s6 ; UNROLL3-NEXT: ; divergent control-flow edge ; UNROLL3-NEXT: s_cbranch_execz .LBB7_7 ; UNROLL3-NEXT: .LBB7_4: ; %memmove_fwd_loop.preheader @@ -9735,7 +9726,7 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) ; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:2032 ; UNROLL3-NEXT: .LBB7_7: ; %memmove_done -; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; UNROLL3-NEXT: s_waitcnt lgkmcnt(0) ; UNROLL3-NEXT: s_setpc_b64 s[30:31] entry: @@ -9749,7 +9740,7 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v0 ; CHECK-NEXT: s_xor_b32 s6, vcc_lo, exec_lo -; CHECK-NEXT: s_mov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_and_saveexec_b32 s7, vcc_lo ; CHECK-NEXT: ; divergent control-flow edge ; CHECK-NEXT: s_cbranch_execz .LBB8_3 ; CHECK-NEXT: .LBB8_1: ; 
%memmove_bwd_loop.preheader @@ -9956,9 +9947,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB8_2 ; CHECK-NEXT: .LBB8_3: -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 -; CHECK-NEXT: s_mov_b32 exec_lo, s6 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; CHECK-NEXT: s_and_saveexec_b32 s6, s6 ; CHECK-NEXT: ; divergent control-flow edge ; CHECK-NEXT: s_cbranch_execz .LBB8_6 ; CHECK-NEXT: .LBB8_4: ; %memmove_fwd_loop.preheader @@ -10164,7 +10154,7 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB8_5 ; CHECK-NEXT: .LBB8_6: ; %memmove_done -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_setpc_b64 s[30:31] ; ; ALIGNED-LABEL: memmove_p5_p5_sz2048: @@ -10218,7 +10208,7 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v0 ; ALIGNED-NEXT: s_xor_b32 s6, vcc_lo, exec_lo -; ALIGNED-NEXT: s_mov_b32 exec_lo, vcc_lo +; ALIGNED-NEXT: s_and_saveexec_b32 s7, vcc_lo ; ALIGNED-NEXT: ; divergent control-flow edge ; ALIGNED-NEXT: s_cbranch_execz .LBB8_3 ; ALIGNED-NEXT: .LBB8_1: ; %memmove_bwd_loop.preheader @@ -11280,9 +11270,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0xffffff00, v0 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB8_2 ; ALIGNED-NEXT: .LBB8_3: -; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; ALIGNED-NEXT: s_xor_b32 s7, exec_lo, s6 -; ALIGNED-NEXT: s_mov_b32 exec_lo, s6 +; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; ALIGNED-NEXT: s_and_saveexec_b32 s6, s6 ; ALIGNED-NEXT: ; divergent control-flow edge ; ALIGNED-NEXT: s_cbranch_execz .LBB8_6 ; 
ALIGNED-NEXT: .LBB8_4: ; %memmove_fwd_loop.preheader @@ -12343,7 +12332,7 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x100, v0 ; ALIGNED-NEXT: s_cbranch_scc1 .LBB8_5 ; ALIGNED-NEXT: .LBB8_6: ; %memmove_done -; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; ALIGNED-NEXT: s_clause 0x2d ; 184-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v124, off, s[0:3], s32 offset:4 @@ -12399,7 +12388,7 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; UNROLL3-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v0 ; UNROLL3-NEXT: s_xor_b32 s6, vcc_lo, exec_lo -; UNROLL3-NEXT: s_mov_b32 exec_lo, vcc_lo +; UNROLL3-NEXT: s_and_saveexec_b32 s7, vcc_lo ; UNROLL3-NEXT: ; divergent control-flow edge ; UNROLL3-NEXT: s_cbranch_execz .LBB8_3 ; UNROLL3-NEXT: .LBB8_1: ; %memmove_bwd_residual @@ -12477,9 +12466,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], 0 ; UNROLL3-NEXT: s_cbranch_scc0 .LBB8_2 ; UNROLL3-NEXT: .LBB8_3: -; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; UNROLL3-NEXT: s_xor_b32 s7, exec_lo, s6 -; UNROLL3-NEXT: s_mov_b32 exec_lo, s6 +; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; UNROLL3-NEXT: s_and_saveexec_b32 s6, s6 ; UNROLL3-NEXT: ; divergent control-flow edge ; UNROLL3-NEXT: s_cbranch_execz .LBB8_7 ; UNROLL3-NEXT: .LBB8_4: ; %memmove_fwd_loop.preheader @@ -12559,7 +12547,7 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) ; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2032 ; UNROLL3-NEXT: .LBB8_7: ; %memmove_done -; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; UNROLL3-NEXT: s_setpc_b64 s[30:31] entry: tail 
call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 2048, i1 false) @@ -12574,7 +12562,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: v_cndmask_b32_e32 v3, -1, v0, vcc_lo ; CHECK-NEXT: v_cmp_lt_u32_e32 vcc_lo, v2, v3 ; CHECK-NEXT: s_xor_b32 s6, vcc_lo, exec_lo -; CHECK-NEXT: s_mov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_and_saveexec_b32 s7, vcc_lo ; CHECK-NEXT: ; divergent control-flow edge ; CHECK-NEXT: s_cbranch_execz .LBB9_3 ; CHECK-NEXT: .LBB9_1: ; %memmove_bwd_loop.preheader @@ -12686,9 +12674,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] ; CHECK-NEXT: s_cbranch_scc0 .LBB9_2 ; CHECK-NEXT: .LBB9_3: -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 -; CHECK-NEXT: s_mov_b32 exec_lo, s6 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; CHECK-NEXT: s_and_saveexec_b32 s6, s6 ; CHECK-NEXT: ; divergent control-flow edge ; CHECK-NEXT: s_cbranch_execz .LBB9_6 ; CHECK-NEXT: .LBB9_4: ; %memmove_fwd_loop.preheader @@ -12793,7 +12780,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] ; CHECK-NEXT: s_cbranch_scc1 .LBB9_5 ; CHECK-NEXT: .LBB9_6: ; %memmove_done -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] ; @@ -12852,7 +12839,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_cndmask_b32_e32 v3, -1, v55, vcc_lo ; ALIGNED-NEXT: v_cmp_lt_u32_e32 vcc_lo, v2, v3 ; ALIGNED-NEXT: s_xor_b32 s6, vcc_lo, exec_lo -; ALIGNED-NEXT: s_mov_b32 exec_lo, vcc_lo +; ALIGNED-NEXT: s_and_saveexec_b32 s7, vcc_lo ; ALIGNED-NEXT: ; divergent control-flow edge ; ALIGNED-NEXT: s_cbranch_execz 
.LBB9_3 ; ALIGNED-NEXT: .LBB9_1: ; %memmove_bwd_loop.preheader @@ -14365,9 +14352,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:3 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB9_2 ; ALIGNED-NEXT: .LBB9_3: -; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; ALIGNED-NEXT: s_xor_b32 s7, exec_lo, s6 -; ALIGNED-NEXT: s_mov_b32 exec_lo, s6 +; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; ALIGNED-NEXT: s_and_saveexec_b32 s6, s6 ; ALIGNED-NEXT: ; divergent control-flow edge ; ALIGNED-NEXT: s_cbranch_execz .LBB9_6 ; ALIGNED-NEXT: .LBB9_4: ; %memmove_fwd_loop.preheader @@ -15874,7 +15860,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:3 ; ALIGNED-NEXT: s_cbranch_scc1 .LBB9_5 ; ALIGNED-NEXT: .LBB9_6: ; %memmove_done -; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; ALIGNED-NEXT: s_clause 0x2d ; 184-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v124, off, s[0:3], s32 offset:4 @@ -15932,7 +15918,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; UNROLL3-NEXT: v_cndmask_b32_e32 v3, -1, v0, vcc_lo ; UNROLL3-NEXT: v_cmp_lt_u32_e32 vcc_lo, v2, v3 ; UNROLL3-NEXT: s_xor_b32 s6, vcc_lo, exec_lo -; UNROLL3-NEXT: s_mov_b32 exec_lo, vcc_lo +; UNROLL3-NEXT: s_and_saveexec_b32 s7, vcc_lo ; UNROLL3-NEXT: ; divergent control-flow edge ; UNROLL3-NEXT: s_cbranch_execz .LBB9_3 ; UNROLL3-NEXT: .LBB9_1: ; %memmove_bwd_residual @@ -15985,9 +15971,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; UNROLL3-NEXT: s_cbranch_scc0 .LBB9_2 ; UNROLL3-NEXT: .LBB9_3: ; UNROLL3-NEXT: s_inst_prefetch 0x2 -; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; UNROLL3-NEXT: s_xor_b32 s7, exec_lo, s6 -; UNROLL3-NEXT: s_mov_b32 exec_lo, s6 +; UNROLL3-NEXT: s_or_b32 exec_lo, 
exec_lo, s7 +; UNROLL3-NEXT: s_and_saveexec_b32 s6, s6 ; UNROLL3-NEXT: ; divergent control-flow edge ; UNROLL3-NEXT: s_cbranch_execz .LBB9_7 ; UNROLL3-NEXT: .LBB9_4: ; %memmove_fwd_loop.preheader @@ -16041,7 +16026,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) ; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:2032 ; UNROLL3-NEXT: .LBB9_7: ; %memmove_done -; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; UNROLL3-NEXT: s_waitcnt lgkmcnt(0) ; UNROLL3-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/mixed-wave32-wave64.ll b/llvm/test/CodeGen/AMDGPU/mixed-wave32-wave64.ll index ea4dcfc72cb37..49c4011257871 100644 --- a/llvm/test/CodeGen/AMDGPU/mixed-wave32-wave64.ll +++ b/llvm/test/CodeGen/AMDGPU/mixed-wave32-wave64.ll @@ -9,7 +9,8 @@ define amdgpu_hs void @_amdgpu_hs_main() #0 { ; GCN-LABEL: _amdgpu_ps_main: ; GCN: v_cmp_ngt_f32_e32 vcc, 0.5, v0 -; GCN: s_xor_b64 exec +; GCN: s_xor_b64 +; GCN: s_and_saveexec_b64 define amdgpu_ps void @_amdgpu_ps_main(i32 %arg) local_unnamed_addr #1 { .entry: diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll index 30be4f93431d9..90050fcadddf0 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll @@ -26,7 +26,7 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1 ; GCN-NEXT: s_xor_b64 s[8:9], vcc, exec ; GCN-NEXT: s_mov_b64 s[6:7], -1 ; GCN-NEXT: s_mov_b64 s[2:3], 0 -; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: s_and_saveexec_b64 s[8:9], s[8:9] ; GCN-NEXT: ; divergent control-flow edge ; GCN-NEXT: s_cbranch_execz .LBB0_4 ; GCN-NEXT: .LBB0_1: ; %atomic @@ -47,10 +47,9 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; 
GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_xor_b64 s[6:7], exec, vcc ; GCN-NEXT: v_mov_b32_e32 v5, v3 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] -; GCN-NEXT: s_mov_b64 exec, vcc ; GCN-NEXT: ; divergent control-flow edge ; GCN-NEXT: s_cbranch_execnz .LBB0_2 ; GCN-NEXT: .LBB0_3: ; %atomicrmw.end @@ -88,10 +87,10 @@ define amdgpu_kernel void @atomic_max_i32_noret(ptr addrspace(1) %out, ptr addrs ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 -; GCN-NEXT: s_xor_b64 s[0:1], s[6:7], exec +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: s_xor_b64 s[0:1], vcc, exec ; GCN-NEXT: s_mov_b64 s[8:9], -1 -; GCN-NEXT: s_mov_b64 exec, s[0:1] +; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GCN-NEXT: ; divergent control-flow edge ; GCN-NEXT: s_cbranch_execz .LBB1_3 ; GCN-NEXT: .LBB1_1: ; %atomic @@ -113,10 +112,9 @@ define amdgpu_kernel void @atomic_max_i32_noret(ptr addrspace(1) %out, ptr addrs ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_xor_b64 s[8:9], exec, vcc ; GCN-NEXT: v_mov_b32_e32 v4, v5 +; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GCN-NEXT: s_mov_b64 exec, vcc ; GCN-NEXT: ; divergent control-flow edge ; GCN-NEXT: s_cbranch_execnz .LBB1_2 ; GCN-NEXT: .LBB1_3: ; %exit diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll index d63e35c912338..7fc6a1a078b00 100644 --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -23,9 +23,8 @@ define void @lsr_order_mul24_0(i32 %arg, i32 %arg2, i32 %arg6, i32 %arg13, i32 % ; GFX9-NEXT: v_sub_u32_e32 v5, v4, v5 ; GFX9-NEXT: v_add_u32_e32 v5, v5, v0 ; 
GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v5, v3 -; GFX9-NEXT: s_xor_b64 s[6:7], exec, vcc +; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX9-NEXT: s_mov_b64 exec, vcc ; GFX9-NEXT: ; divergent control-flow edge ; GFX9-NEXT: s_cbranch_execnz .LBB0_1 ; GFX9-NEXT: .LBB0_2: ; %.loopexit @@ -57,11 +56,11 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(3) ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v5, 1, v18 -; GFX9-NEXT: v_cmp_ge_u32_e64 s[8:9], v0, v1 +; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GFX9-NEXT: s_xor_b64 s[6:7], s[8:9], exec +; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], exec ; GFX9-NEXT: s_mov_b64 s[4:5], -1 -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] ; GFX9-NEXT: ; divergent control-flow edge ; GFX9-NEXT: s_cbranch_execz .LBB1_3 ; GFX9-NEXT: .LBB1_1: ; %bb19 @@ -100,13 +99,12 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(3) ; GFX9-NEXT: v_addc_co_u32_e64 v19, s[4:5], v11, v19, s[4:5] ; GFX9-NEXT: global_load_dword v3, v[18:19], off ; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1 -; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc ; GFX9-NEXT: ds_write_b32 v8, v3 ; GFX9-NEXT: v_add_u32_e32 v8, v8, v9 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[4:5] ; GFX9-NEXT: ; divergent control-flow edge ; GFX9-NEXT: s_cbranch_execnz .LBB1_2 ; GFX9-NEXT: .LBB1_3: ; %.loopexit diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll index 92e0c070e78c3..da5ef8a913f0c 100644 --- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll +++ 
b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll @@ -64,7 +64,7 @@ ; First divergent branch ; GCN: v_cmp_lt_i32_e{{32|64}} ; GCN: s_xor_b64 -; GCN: s_mov_b64 exec +; GCN: s_and_saveexec_b64 ; GCN: ; divergent control-flow edge ; GCN: s_cbranch_execz @@ -74,8 +74,7 @@ ; Second reconverge ; GCN: s_or_b64 exec, exec -; GCN: s_xor_b64 -; GCN: s_mov_b64 exec +; GCN: s_and_saveexec_b64 ; GCN: ; divergent control-flow edge ; GCN: s_cbranch_execz @@ -84,8 +83,7 @@ ; Third reconverge ; GCN: s_or_b64 exec, exec -; GCN: s_xor_b64 -; GCN: s_mov_b64 exec +; GCN: s_and_saveexec_b64 ; GCN: ; divergent control-flow edge ; GCN: s_cbranch_execz @@ -94,8 +92,7 @@ ; Fourth reconverge ; GCN: s_or_b64 exec, exec -; GCN: s_xor_b64 -; GCN: s_mov_b64 exec +; GCN: s_and_saveexec_b64 ; GCN: ; divergent control-flow edge ; GCN: s_cbranch_execz @@ -370,7 +367,7 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 ; GCN: v_cmp_{{eq|ne}}_u32_e{{32|64}} {{vcc|s\[[0-9]+:[0-9]+\]}}, {{3|7}}, v0 ; GCN: s_xor_b64 -; GCN: s_mov_b64 exec +; GCN: s_and_saveexec_b64 ; GCN: ; divergent control-flow edge ; GCN: s_cbranch_execz @@ -379,8 +376,7 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 ; GCN: v_mov_b32_e32 v0, 1.0 ; GCN: s_or_b64 exec, exec -; GCN: s_xor_b64 -; GCN: s_mov_b64 exec +; GCN: s_and_saveexec_b64 ; GCN: ; divergent control-flow edge ; GCN: s_cbranch_execz diff --git a/llvm/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll b/llvm/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll index af35caf7065d1..2187cde5e243d 100644 --- a/llvm/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll @@ -10,26 +10,27 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @reg_coalescer_breaks_dead(ptr addrspace(1) nocapture readonly %arg, i32 %arg1, i32 %arg2, i32 %arg3, i1 %c0) #1 { ; GFX6-LABEL: reg_coalescer_breaks_dead: ; GFX6: ; %bb.0: ; %bb -; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX6-NEXT: 
v_mov_b32_e32 v1, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 -; GFX6-NEXT: s_xor_b64 exec, vcc, exec +; GFX6-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execz .LBB0_2 ; GFX6-NEXT: .LBB0_1: ; %bb3 -; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s1, s0, 31 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX6-NEXT: s_add_u32 s0, s2, s0 -; GFX6-NEXT: s_addc_u32 s1, s3, s1 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_ashr_i32 s3, s2, 31 +; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 3 +; GFX6-NEXT: s_add_u32 s2, s6, s2 +; GFX6-NEXT: s_addc_u32 s3, s7, s3 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v2, s3 ; GFX6-NEXT: .LBB0_2: ; %bb4 -; GFX6-NEXT: s_or_b64 exec, exec, vcc +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX6-NEXT: s_load_dword s0, s[4:5], 0xe ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bitcmp0_b32 s0, 0 @@ -41,26 +42,27 @@ define amdgpu_kernel void @reg_coalescer_breaks_dead(ptr addrspace(1) nocapture ; ; GFX8-LABEL: reg_coalescer_breaks_dead: ; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_xor_b64 exec, vcc, exec +; GFX8-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX8-NEXT: ; divergent control-flow edge ; GFX8-NEXT: s_cbranch_execz .LBB0_2 ; GFX8-NEXT: .LBB0_1: ; %bb3 -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8-NEXT: 
s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_ashr_i32 s1, s0, 31 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s2, s0 -; GFX8-NEXT: s_addc_u32 s1, s3, s1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX8-NEXT: s_ashr_i32 s3, s2, 31 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 3 +; GFX8-NEXT: s_add_u32 s2, s6, s2 +; GFX8-NEXT: s_addc_u32 s3, s7, s3 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: .LBB0_2: ; %bb4 -; GFX8-NEXT: s_or_b64 exec, exec, vcc +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x38 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bitcmp0_b32 s0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/ret_jump.ll b/llvm/test/CodeGen/AMDGPU/ret_jump.ll index cd0347f2f1567..4a9b7c24590b3 100644 --- a/llvm/test/CodeGen/AMDGPU/ret_jump.ll +++ b/llvm/test/CodeGen/AMDGPU/ret_jump.ll @@ -10,8 +10,8 @@ ; GCN: ; %else -; GCN: v_cmp_gt_f32_e64 [[SAVE:s\[[0-9]+:[0-9]+\]]] -; GCN: s_xor_b64 exec, [[SAVE]], exec +; GCN: v_cmp_gt_f32_e32 vcc +; GCN: s_xor_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc, exec ; GCN: s_cbranch_execz [[RET_BB]] ; GCN: .LBB{{[0-9]+}}_{{[0-9]+}}: ; %unreachable.bb @@ -59,8 +59,7 @@ ret.bb: ; preds = %else, %main_body ; GCN: v_cmp_gt_f32_e32 vcc ; GCN: s_xor_b64 [[SAVE1:s\[[0-9]+:[0-9]+\]]], vcc, exec -; GCN: s_mov_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], [[SAVE1]] -; GCN: s_mov_b64 exec, vcc +; GCN: s_and_saveexec_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], vcc ; GCN: s_cbranch_execz ; GCN: [[RET_BB2]]: ; %ret.bb @@ -68,8 +67,7 @@ ret.bb: ; preds = %else, %main_body ; GCN: {{buffer|flat}}_store_dword ; GCN: s_or_b64 exec, exec, [[SAVE2]] -; GCN: s_xor_b64 [[SAVE3:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE1]] -; GCN: s_mov_b64 exec, [[SAVE1]] +; GCN: s_and_saveexec_b64 [[SAVE1]], [[SAVE1]] ; GCN: 
s_cbranch_execz [[UNIFIED_RET:.LBB[0-9]+_[0-9]+]] ; GCN: .LBB{{[0-9]+}}_{{[0-9]+}}: ; %unreachable.bb @@ -77,7 +75,7 @@ ret.bb: ; preds = %else, %main_body ; GCN: ; divergent unreachable ; GCN: [[UNIFIED_RET]]: ; %UnifiedReturnBlock -; GCN: s_or_b64 exec, exec, [[SAVE3]] +; GCN: s_or_b64 exec, exec, [[SAVE1]] ; GCN: s_waitcnt define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, ptr addrspace(4) inreg %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, i32 inreg %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index ebf940c7b682b..2b9b1fdc3a70e 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -407,15 +407,15 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 63, v[2:3] -; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], vcc -; GCN-IR-NEXT: s_or_b64 s[6:7], s[8:9], s[4:5] +; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], vcc +; GCN-IR-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; GCN-IR-NEXT: v_mov_b32_e32 v12, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v13, v11 -; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v7, 0, s[8:9] -; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v6, 0, s[8:9] -; GCN-IR-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v7, 0, s[6:7] +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v6, 0, s[6:7] +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], exec ; GCN-IR-NEXT: s_mov_b64 s[8:9], -1 
-; GCN-IR-NEXT: s_mov_b64 exec, s[4:5] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: ; divergent control-flow edge ; GCN-IR-NEXT: s_cbranch_execz .LBB1_5 ; GCN-IR-NEXT: .LBB1_1: ; %udiv-bb1 @@ -423,10 +423,11 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[6:7], v2 +; GCN-IR-NEXT: s_and_b64 s[4:5], exec, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_b64 s[4:5], exec, vcc -; GCN-IR-NEXT: s_xor_b64 exec, s[4:5], exec +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GCN-IR-NEXT: ; divergent control-flow edge ; GCN-IR-NEXT: s_cbranch_execz .LBB1_4 ; GCN-IR-NEXT: .LBB1_2: ; %udiv-preheader @@ -458,13 +459,12 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc ; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, 1, v16 ; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-IR-NEXT: s_and_b64 s[8:9], exec, vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], s[8:9], exec ; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 -; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[8:9] +; GCN-IR-NEXT: s_and_b64 s[8:9], exec, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11] -; GCN-IR-NEXT: s_mov_b64 exec, s[8:9] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[8:9], exec +; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[8:9] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GCN-IR-NEXT: ; divergent control-flow edge ; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 ; GCN-IR-NEXT: .LBB1_4: ; %udiv-loop-exit @@ -1475,13 +1475,13 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], 63, v[2:3] ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v10 ; GCN-IR-NEXT: 
v_cndmask_b32_e64 v4, 24, 0, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], exec ; GCN-IR-NEXT: s_mov_b64 s[8:9], -1 -; GCN-IR-NEXT: s_mov_b64 exec, s[4:5] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: ; divergent control-flow edge ; GCN-IR-NEXT: s_cbranch_execz .LBB11_5 ; GCN-IR-NEXT: .LBB11_1: ; %udiv-bb1 @@ -1489,10 +1489,11 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], 24, v2 +; GCN-IR-NEXT: s_and_b64 s[4:5], exec, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_b64 s[4:5], exec, vcc -; GCN-IR-NEXT: s_xor_b64 exec, s[4:5], exec +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GCN-IR-NEXT: ; divergent control-flow edge ; GCN-IR-NEXT: s_cbranch_execz .LBB11_4 ; GCN-IR-NEXT: .LBB11_2: ; %udiv-preheader @@ -1523,13 +1524,12 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v14 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-IR-NEXT: s_and_b64 s[8:9], exec, vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], s[8:9], exec ; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 -; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[8:9] +; GCN-IR-NEXT: s_and_b64 s[8:9], exec, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11] -; GCN-IR-NEXT: s_mov_b64 exec, s[8:9] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[8:9], exec +; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[8:9] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GCN-IR-NEXT: ; divergent control-flow edge ; GCN-IR-NEXT: s_cbranch_execnz .LBB11_3 ; GCN-IR-NEXT: .LBB11_4: ; %udiv-loop-exit @@ -1669,15 +1669,15 @@ 
define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], 63, v[2:3] -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0x8000 -; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v10 +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], exec ; GCN-IR-NEXT: s_mov_b64 s[8:9], -1 -; GCN-IR-NEXT: s_mov_b64 exec, s[4:5] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: ; divergent control-flow edge ; GCN-IR-NEXT: s_cbranch_execz .LBB12_5 ; GCN-IR-NEXT: .LBB12_1: ; %udiv-bb1 @@ -1686,10 +1686,11 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0x8000 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[10:11], v2 +; GCN-IR-NEXT: s_and_b64 s[4:5], exec, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_b64 s[4:5], exec, vcc -; GCN-IR-NEXT: s_xor_b64 exec, s[4:5], exec +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GCN-IR-NEXT: ; divergent control-flow edge ; GCN-IR-NEXT: s_cbranch_execz .LBB12_4 ; GCN-IR-NEXT: .LBB12_2: ; %udiv-preheader @@ -1720,13 +1721,12 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v14 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-IR-NEXT: s_and_b64 s[8:9], exec, vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], s[8:9], exec ; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 -; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[8:9] +; GCN-IR-NEXT: s_and_b64 s[8:9], 
exec, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11] -; GCN-IR-NEXT: s_mov_b64 exec, s[8:9] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[8:9], exec +; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[8:9] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GCN-IR-NEXT: ; divergent control-flow edge ; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3 ; GCN-IR-NEXT: .LBB12_4: ; %udiv-loop-exit @@ -1780,7 +1780,7 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[6:7], exec ; GCN-IR-NEXT: s_mov_b64 s[8:9], -1 -; GCN-IR-NEXT: s_mov_b64 exec, s[4:5] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: ; divergent control-flow edge ; GCN-IR-NEXT: s_cbranch_execz .LBB13_5 ; GCN-IR-NEXT: .LBB13_1: ; %udiv-bb1 @@ -1788,10 +1788,11 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v0, s[4:5], 63, v0 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], v0 +; GCN-IR-NEXT: s_and_b64 s[4:5], exec, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 -; GCN-IR-NEXT: s_and_b64 s[4:5], exec, vcc -; GCN-IR-NEXT: s_xor_b64 exec, s[4:5], exec +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GCN-IR-NEXT: ; divergent control-flow edge ; GCN-IR-NEXT: s_cbranch_execz .LBB13_4 ; GCN-IR-NEXT: .LBB13_2: ; %udiv-preheader @@ -1819,14 +1820,13 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v10 ; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-IR-NEXT: s_and_b64 s[8:9], exec, vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], s[8:9], exec ; GCN-IR-NEXT: v_or_b32_e32 v1, v7, v1 ; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 -; GCN-IR-NEXT: s_xor_b64 s[12:13], exec, s[8:9] +; GCN-IR-NEXT: s_and_b64 s[8:9], exec, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 -; 
GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[12:13] -; GCN-IR-NEXT: s_mov_b64 exec, s[8:9] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[8:9], exec +; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[8:9] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GCN-IR-NEXT: ; divergent control-flow edge ; GCN-IR-NEXT: s_cbranch_execnz .LBB13_3 ; GCN-IR-NEXT: .LBB13_4: ; %udiv-loop-exit diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll index 316c27dfad00e..6f50c0756d1a5 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll @@ -9,7 +9,7 @@ ; GCN-LABEL: {{^}}annotate_unreachable: -; GCN: s_xor_b64 exec +; GCN: s_and_saveexec_b64 ; GCN-NOT: s_endpgm ; GCN: .Lfunc_end0 define amdgpu_kernel void @annotate_unreachable(ptr addrspace(1) noalias nocapture readonly %arg, i1 %c0) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll index 23a33ab95b1f1..1f10f15ddcfab 100644 --- a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll +++ b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll @@ -2,7 +2,7 @@ ; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator: ; GCN: v_cmp_ne_u32 -; GCN: s_xor_b64 exec +; GCN: s_and_saveexec_b64 ; GCN: s_cbranch_execz .LBB0_{{[0-9]+}} ; GCN: .LBB0_{{[0-9]+}}: ; %unreachable @@ -28,7 +28,7 @@ ret: ; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator_swap_block_order: ; GCN: v_cmp_eq_u32 -; GCN: s_xor_b64 exec +; GCN: s_and_saveexec_b64 ; GCN: s_cbranch_execz .LBB1_{{[0-9]+}} ; GCN: .LBB1_{{[0-9]+}}: ; %unreachable diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll index a413de265a11c..0c5feab2fa0bf 100644 --- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll +++ 
b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll @@ -18,7 +18,8 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %if.else ; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, 9, v0 -; CHECK-NEXT: s_xor_b64 exec, vcc, exec +; CHECK-NEXT: s_xor_b64 s[2:3], vcc, exec +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], s[2:3] ; CHECK-NEXT: ; divergent control-flow edge ; CHECK-NEXT: s_cbranch_execz .LBB0_5 ; CHECK-NEXT: .LBB0_2: ; %if.then diff --git a/llvm/test/CodeGen/AMDGPU/skip-branch-trap.ll b/llvm/test/CodeGen/AMDGPU/skip-branch-trap.ll index 94c3628ec1b43..70c5200ab241c 100644 --- a/llvm/test/CodeGen/AMDGPU/skip-branch-trap.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-branch-trap.ll @@ -5,7 +5,7 @@ ; An s_cbranch_execnz is required to avoid trapping if all lanes are 0 ; GCN-LABEL: {{^}}trap_divergent_branch: ; GCN: v_cmp_ne_u32 -; GCN: s_xor_b64 exec +; GCN: s_and_saveexec_b64 ; GCN: s_cbranch_execnz [[TRAP:.LBB[0-9]+_[0-9]+]] ; GCN: .LBB{{[0-9]+}}_{{[0-9]+}}: ; %end ; GCN-NEXT: s_endpgm @@ -29,7 +29,7 @@ end: ; GCN-LABEL: {{^}}debugtrap_divergent_branch: ; GCN: v_cmp_ne_u32 -; GCN: s_xor_b64 exec +; GCN: s_and_saveexec_b64 ; GCN: s_cbranch_execnz [[TRAP:.LBB[0-9]+_[0-9]+]] ; GCN: .LBB{{[0-9]+}}_{{[0-9]+}}: ; %end ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index b66b50e18a42b..d8d052fb4d9d3 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -9744,17 +9744,17 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 ; GFX6-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s42, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s43, 
0xe8f000 ; GFX6-NEXT: s_add_u32 s40, s40, s11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; GFX6-NEXT: s_addc_u32 s41, s41, 0 +; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, -1, v0 ; GFX6-NEXT: v_mov_b32_e32 v6, 0 -; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b64 s[4:5], exec ; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) lgkmcnt(0) @@ -9894,11 +9894,11 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 13, v0 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 16, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, 1 -; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: buffer_store_dword v7, v4, s[40:43], 0 offen ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ; def s[4:11] ; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -9917,6 +9917,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[0:1] ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_xor_b64 s[2:3], vcc, exec ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ; def s[8:15] ; GFX6-NEXT: ;;#ASMEND @@ -9927,19 +9928,19 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: ; def s[24:31] ; GFX6-NEXT: ;;#ASMEND ; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ; def s[0:3] +; GFX6-NEXT: ; def s[4:7] ; GFX6-NEXT: ;;#ASMEND ; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ; def s[4:5] +; GFX6-NEXT: ; def s[34:35] ; GFX6-NEXT: ;;#ASMEND ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ; def s33 ; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: s_xor_b64 exec, vcc, exec +; GFX6-NEXT: s_and_saveexec_b64 vcc, s[2:3] ; GFX6-NEXT: ; divergent control-flow edge ; GFX6-NEXT: s_cbranch_execz .LBB1_2 ; GFX6-NEXT: 
.LBB1_1: ; %bb0 -; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -9951,18 +9952,18 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_writelane_b32 v4, s13, 5 ; GFX6-NEXT: v_writelane_b32 v4, s14, 6 ; GFX6-NEXT: v_writelane_b32 v4, s15, 7 -; GFX6-NEXT: s_mov_b32 s34, 0x85000 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s2, 0x85000 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: s_mov_b64 exec, s[0:1] +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_mov_b32 s34, 0x84800 +; GFX6-NEXT: s_mov_b32 s2, 0x84800 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s2 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readlane_b32 s8, v4, 0 ; GFX6-NEXT: v_readlane_b32 s9, v4, 1 @@ -9974,8 +9975,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_readlane_b32 s15, v4, 7 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: s_mov_b64 exec, s[0:1] +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -9987,18 +9988,18 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: 
v_writelane_b32 v4, s21, 5 ; GFX6-NEXT: v_writelane_b32 v4, s22, 6 ; GFX6-NEXT: v_writelane_b32 v4, s23, 7 -; GFX6-NEXT: s_mov_b32 s34, 0x85800 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s2, 0x85800 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: s_mov_b64 exec, s[0:1] +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_mov_b32 s34, 0x85000 +; GFX6-NEXT: s_mov_b32 s2, 0x85000 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s2 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readlane_b32 s16, v4, 0 ; GFX6-NEXT: v_readlane_b32 s17, v4, 1 @@ -10010,8 +10011,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_readlane_b32 s23, v4, 7 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: s_mov_b64 exec, s[0:1] +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10023,18 +10024,18 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_writelane_b32 v4, s29, 5 ; GFX6-NEXT: v_writelane_b32 v4, s30, 6 ; GFX6-NEXT: v_writelane_b32 v4, s31, 7 -; GFX6-NEXT: s_mov_b32 s34, 0x86000 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s2, 0x86000 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s2 ; 4-byte 
Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: s_mov_b64 exec, s[0:1] +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_mov_b32 s34, 0x85800 +; GFX6-NEXT: s_mov_b32 s2, 0x85800 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s2 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readlane_b32 s24, v4, 0 ; GFX6-NEXT: v_readlane_b32 s25, v4, 1 @@ -10046,22 +10047,21 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_readlane_b32 s31, v4, 7 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: s_mov_b64 exec, s[0:1] +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v4, s0, 0 -; GFX6-NEXT: v_writelane_b32 v4, s1, 1 -; GFX6-NEXT: v_writelane_b32 v4, s2, 2 -; GFX6-NEXT: v_writelane_b32 v4, s3, 3 -; GFX6-NEXT: s_mov_b32 s34, 0x86800 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill +; GFX6-NEXT: v_writelane_b32 v4, s4, 0 +; GFX6-NEXT: v_writelane_b32 v4, s5, 1 +; GFX6-NEXT: v_writelane_b32 v4, s6, 2 +; GFX6-NEXT: v_writelane_b32 v4, s7, 3 +; GFX6-NEXT: s_mov_b32 s2, 0x86800 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX6-NEXT: s_mov_b64 
exec, s[0:1] ; GFX6-NEXT: s_mov_b64 s[36:37], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 @@ -10350,8 +10350,9 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: v_lshl_add_u32 v4, v0, 13, v4 ; GFX9-FLATSCR-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-FLATSCR-NEXT: v_lshl_add_u32 v4, v0, 13, v4 +; GFX9-FLATSCR-NEXT: s_xor_b64 s[34:35], vcc, exec ; GFX9-FLATSCR-NEXT: scratch_store_dword v4, v6, off ; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ; def s[0:7] @@ -10369,17 +10370,17 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: ; def s[40:43] ; GFX9-FLATSCR-NEXT: ;;#ASMEND ; GFX9-FLATSCR-NEXT: ;;#ASMSTART -; GFX9-FLATSCR-NEXT: ; def s[34:35] +; GFX9-FLATSCR-NEXT: ; def s[38:39] ; GFX9-FLATSCR-NEXT: ;;#ASMEND ; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ; def s33 ; GFX9-FLATSCR-NEXT: ;;#ASMEND -; GFX9-FLATSCR-NEXT: s_xor_b64 exec, vcc, exec +; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[34:35], s[34:35] ; GFX9-FLATSCR-NEXT: ; divergent control-flow edge ; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-FLATSCR-NEXT: .LBB1_1: ; %bb0 ; GFX9-FLATSCR-NEXT: ;;#ASMSTART -; GFX9-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[34:35] +; GFX9-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[38:39] ; GFX9-FLATSCR-NEXT: ;;#ASMEND ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20c0 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s0 ; 16-byte Folded Spill @@ -10404,7 +10405,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ;;#ASMEND ; GFX9-FLATSCR-NEXT: .LBB1_2: ; %ret -; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, 
vcc +; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20f0 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20d0 @@ -10492,8 +10493,9 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[39:42], v5, s[38:39] offset:16 ; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] ; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLATSCR-NEXT: v_lshl_add_u32 v4, v0, 13, 16 ; GFX10-FLATSCR-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-FLATSCR-NEXT: v_lshl_add_u32 v4, v0, 13, 16 +; GFX10-FLATSCR-NEXT: s_xor_b32 s33, vcc_lo, exec_lo ; GFX10-FLATSCR-NEXT: scratch_store_dword v4, v6, off ; GFX10-FLATSCR-NEXT: ;;#ASMSTART ; GFX10-FLATSCR-NEXT: ; def s[0:7] @@ -10514,9 +10516,9 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX10-FLATSCR-NEXT: ; def s[34:35] ; GFX10-FLATSCR-NEXT: ;;#ASMEND ; GFX10-FLATSCR-NEXT: ;;#ASMSTART -; GFX10-FLATSCR-NEXT: ; def s33 +; GFX10-FLATSCR-NEXT: ; def s38 ; GFX10-FLATSCR-NEXT: ;;#ASMEND -; GFX10-FLATSCR-NEXT: s_xor_b32 exec_lo, vcc_lo, exec_lo +; GFX10-FLATSCR-NEXT: s_and_saveexec_b32 s33, s33 ; GFX10-FLATSCR-NEXT: ; divergent control-flow edge ; GFX10-FLATSCR-NEXT: s_cbranch_execz .LBB1_2 ; GFX10-FLATSCR-NEXT: .LBB1_1: ; %bb0 @@ -10653,7 +10655,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX10-FLATSCR-NEXT: ;;#ASMSTART ; GFX10-FLATSCR-NEXT: ;;#ASMEND ; GFX10-FLATSCR-NEXT: .LBB1_2: ; %ret -; GFX10-FLATSCR-NEXT: s_or_b32 exec_lo, exec_lo, vcc_lo +; GFX10-FLATSCR-NEXT: s_or_b32 exec_lo, exec_lo, s33 ; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[63:66], s[36:37] offset:112 ; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[59:62], s[36:37] offset:96 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index 
06f0e4e6ea80b..47731e2a7e8c0 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -388,14 +388,14 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 63, v[4:5] -; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], vcc -; GCN-IR-NEXT: s_or_b64 s[6:7], s[8:9], s[4:5] +; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], vcc +; GCN-IR-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; GCN-IR-NEXT: v_mov_b32_e32 v13, v12 -; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v1, 0, s[8:9] -; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v0, 0, s[8:9] -; GCN-IR-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v1, 0, s[6:7] +; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v0, 0, s[6:7] +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], exec ; GCN-IR-NEXT: s_mov_b64 s[8:9], -1 -; GCN-IR-NEXT: s_mov_b64 exec, s[4:5] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: ; divergent control-flow edge ; GCN-IR-NEXT: s_cbranch_execz .LBB1_5 ; GCN-IR-NEXT: .LBB1_1: ; %udiv-bb1 @@ -403,10 +403,11 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 +; GCN-IR-NEXT: s_and_b64 s[4:5], exec, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: s_and_b64 s[4:5], exec, vcc -; GCN-IR-NEXT: s_xor_b64 exec, s[4:5], exec +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GCN-IR-NEXT: ; divergent control-flow edge ; GCN-IR-NEXT: s_cbranch_execz .LBB1_4 ; GCN-IR-NEXT: .LBB1_2: ; %udiv-preheader @@ -438,13 +439,12 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, v9, v11, vcc ; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, 1, v16 ; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-IR-NEXT: s_and_b64 s[8:9], exec, vcc -; 
GCN-IR-NEXT: s_xor_b64 s[8:9], s[8:9], exec ; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 -; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[8:9] +; GCN-IR-NEXT: s_and_b64 s[8:9], exec, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11] -; GCN-IR-NEXT: s_mov_b64 exec, s[8:9] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[8:9], exec +; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[8:9] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GCN-IR-NEXT: ; divergent control-flow edge ; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 ; GCN-IR-NEXT: .LBB1_4: ; %udiv-loop-exit @@ -1627,12 +1627,12 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], 63, v[2:3] ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, 24, 0, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], exec ; GCN-IR-NEXT: s_mov_b64 s[8:9], -1 -; GCN-IR-NEXT: s_mov_b64 exec, s[4:5] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: ; divergent control-flow edge ; GCN-IR-NEXT: s_cbranch_execz .LBB11_5 ; GCN-IR-NEXT: .LBB11_1: ; %udiv-bb1 @@ -1640,10 +1640,11 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], 24, v2 +; GCN-IR-NEXT: s_and_b64 s[4:5], exec, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_b64 s[4:5], exec, vcc -; GCN-IR-NEXT: s_xor_b64 exec, s[4:5], exec +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GCN-IR-NEXT: ; divergent control-flow edge ; GCN-IR-NEXT: s_cbranch_execz .LBB11_4 ; GCN-IR-NEXT: .LBB11_2: ; %udiv-preheader @@ -1674,13 +1675,12 @@ define i64 
@v_test_srem_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v12 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-IR-NEXT: s_and_b64 s[8:9], exec, vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], s[8:9], exec ; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 -; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[8:9] +; GCN-IR-NEXT: s_and_b64 s[8:9], exec, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11] -; GCN-IR-NEXT: s_mov_b64 exec, s[8:9] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[8:9], exec +; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[8:9] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GCN-IR-NEXT: ; divergent control-flow edge ; GCN-IR-NEXT: s_cbranch_execnz .LBB11_3 ; GCN-IR-NEXT: .LBB11_4: ; %udiv-loop-exit @@ -1819,14 +1819,14 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], 63, v[2:3] -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0x8000 -; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], exec ; GCN-IR-NEXT: s_mov_b64 s[8:9], -1 -; GCN-IR-NEXT: s_mov_b64 exec, s[4:5] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: ; divergent control-flow edge ; GCN-IR-NEXT: s_cbranch_execz .LBB12_5 ; GCN-IR-NEXT: .LBB12_1: ; %udiv-bb1 @@ -1835,10 +1835,11 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0x8000 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[10:11], v2 +; GCN-IR-NEXT: s_and_b64 s[4:5], exec, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 
-; GCN-IR-NEXT: s_and_b64 s[4:5], exec, vcc -; GCN-IR-NEXT: s_xor_b64 exec, s[4:5], exec +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GCN-IR-NEXT: ; divergent control-flow edge ; GCN-IR-NEXT: s_cbranch_execz .LBB12_4 ; GCN-IR-NEXT: .LBB12_2: ; %udiv-preheader @@ -1869,13 +1870,12 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v12 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-IR-NEXT: s_and_b64 s[8:9], exec, vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], s[8:9], exec ; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 -; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[8:9] +; GCN-IR-NEXT: s_and_b64 s[8:9], exec, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11] -; GCN-IR-NEXT: s_mov_b64 exec, s[8:9] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[8:9], exec +; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[8:9] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GCN-IR-NEXT: ; divergent control-flow edge ; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3 ; GCN-IR-NEXT: .LBB12_4: ; %udiv-loop-exit @@ -1935,7 +1935,7 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v0, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[6:7], exec ; GCN-IR-NEXT: s_mov_b64 s[8:9], -1 -; GCN-IR-NEXT: s_mov_b64 exec, s[4:5] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: ; divergent control-flow edge ; GCN-IR-NEXT: s_cbranch_execz .LBB13_5 ; GCN-IR-NEXT: .LBB13_1: ; %udiv-bb1 @@ -1943,10 +1943,11 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2 +; GCN-IR-NEXT: s_and_b64 s[4:5], exec, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_b64 s[4:5], exec, vcc -; GCN-IR-NEXT: s_xor_b64 exec, s[4:5], exec +; 
GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GCN-IR-NEXT: ; divergent control-flow edge ; GCN-IR-NEXT: s_cbranch_execz .LBB13_4 ; GCN-IR-NEXT: .LBB13_2: ; %udiv-preheader @@ -1974,14 +1975,13 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v12 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-IR-NEXT: s_and_b64 s[8:9], exec, vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], s[8:9], exec ; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 -; GCN-IR-NEXT: s_xor_b64 s[12:13], exec, s[8:9] +; GCN-IR-NEXT: s_and_b64 s[8:9], exec, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[12:13] -; GCN-IR-NEXT: s_mov_b64 exec, s[8:9] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[8:9], exec +; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[8:9] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GCN-IR-NEXT: ; divergent control-flow edge ; GCN-IR-NEXT: s_cbranch_execnz .LBB13_3 ; GCN-IR-NEXT: .LBB13_4: ; %udiv-loop-exit diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll index a0c7bbf245b7a..4cb22be919a37 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -356,11 +356,9 @@ define i32 @needs_align1024_stack_args_used_inside_loop(ptr addrspace(5) nocaptu ; GCN-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s6, v0 -; GCN-NEXT: s_xor_b64 s[8:9], exec, vcc -; GCN-NEXT: s_and_b64 s[8:9], s[8:9], exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GCN-NEXT: s_mov_b64 exec, vcc ; GCN-NEXT: ; divergent control-flow edge ; GCN-NEXT: s_cbranch_execz .LBB10_3 ; GCN-NEXT: .LBB10_2: ; %loop_end diff --git a/llvm/test/CodeGen/AMDGPU/uniform-phi-with-undef.ll 
b/llvm/test/CodeGen/AMDGPU/uniform-phi-with-undef.ll index 6e8d4e2af4e88..80630c2043a00 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-phi-with-undef.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-phi-with-undef.ll @@ -15,8 +15,8 @@ define amdgpu_ps float @uniform_phi_with_undef(float inreg %c, float %v, i32 %x, ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_xor_b32 s1, s1, s2 ; GCN-NEXT: s_and_b32 s1, exec_lo, s1 -; GCN-NEXT: s_xor_b32 s2, s1, exec_lo -; GCN-NEXT: s_mov_b32 exec_lo, s2 +; GCN-NEXT: s_xor_b32 s1, s1, exec_lo +; GCN-NEXT: s_and_saveexec_b32 s1, s1 ; GCN-NEXT: ; divergent control-flow edge ; GCN-NEXT: s_cbranch_execz .LBB0_2 ; GCN-NEXT: .LBB0_1: ; %if diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll index 1b42e69665ba7..3004407b905c2 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll @@ -6,15 +6,15 @@ define amdgpu_ps float @else1(i32 %z, float %v) #0 { ; SI-LABEL: else1: ; SI: ; %bb.0: ; %main_body ; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0 -; SI-NEXT: s_xor_b32 exec_lo, vcc_lo, exec_lo +; SI-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; SI-NEXT: s_and_saveexec_b32 s0, s0 ; SI-NEXT: ; divergent control-flow edge ; SI-NEXT: s_cbranch_execz .LBB0_2 ; SI-NEXT: .LBB0_1: ; %if ; SI-NEXT: v_add_f32_e32 v0, v1, v1 ; SI-NEXT: .LBB0_2: -; SI-NEXT: s_or_b32 exec_lo, exec_lo, vcc_lo -; SI-NEXT: s_xor_b32 s0, exec_lo, vcc_lo -; SI-NEXT: s_mov_b32 exec_lo, vcc_lo +; SI-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo ; SI-NEXT: ; divergent control-flow edge ; SI-NEXT: s_cbranch_execz .LBB0_4 ; SI-NEXT: .LBB0_3: ; %else @@ -45,16 +45,16 @@ define amdgpu_ps float @else2(i32 %z, float %v) #0 { ; SI-LABEL: else2: ; SI: ; %bb.0: ; %main_body ; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0 -; SI-NEXT: s_xor_b32 exec_lo, vcc_lo, exec_lo +; SI-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; SI-NEXT: s_and_saveexec_b32 s0, s0 ; SI-NEXT: ; divergent control-flow edge ; 
SI-NEXT: s_cbranch_execz .LBB1_2 ; SI-NEXT: .LBB1_1: ; %if ; SI-NEXT: v_add_f32_e32 v1, v1, v1 ; SI-NEXT: v_mov_b32_e32 v0, v1 ; SI-NEXT: .LBB1_2: -; SI-NEXT: s_or_b32 exec_lo, exec_lo, vcc_lo -; SI-NEXT: s_xor_b32 s0, exec_lo, vcc_lo -; SI-NEXT: s_mov_b32 exec_lo, vcc_lo +; SI-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo ; SI-NEXT: ; divergent control-flow edge ; SI-NEXT: s_cbranch_execz .LBB1_4 ; SI-NEXT: .LBB1_3: ; %else @@ -102,10 +102,8 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 { ; SI-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; SI-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v3 -; SI-NEXT: s_xor_b32 s3, vcc_lo, exec_lo -; SI-NEXT: s_xor_b32 s2, exec_lo, s3 -; SI-NEXT: s_and_b32 s2, s2, exec_lo -; SI-NEXT: s_mov_b32 exec_lo, s3 +; SI-NEXT: s_xor_b32 s2, vcc_lo, exec_lo +; SI-NEXT: s_and_saveexec_b32 s2, s2 ; SI-NEXT: ; divergent control-flow edge ; SI-NEXT: s_cbranch_execz .LBB2_4 ; SI-NEXT: .LBB2_3: ; %if @@ -114,9 +112,7 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 { ; SI-NEXT: v_add_nc_u32_e32 v4, 1, v2 ; SI-NEXT: .LBB2_4: ; in Loop: Header=BB2_2 Depth=1 ; SI-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; SI-NEXT: s_xor_b32 s2, exec_lo, vcc_lo -; SI-NEXT: s_and_b32 s2, s2, exec_lo -; SI-NEXT: s_mov_b32 exec_lo, vcc_lo +; SI-NEXT: s_and_saveexec_b32 s2, vcc_lo ; SI-NEXT: ; divergent control-flow edge ; SI-NEXT: s_cbranch_execz .LBB2_1 ; SI-NEXT: .LBB2_5: ; %else @@ -173,22 +169,23 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: v_mov_b32_e32 v0, v1 -; SI-NEXT: v_cmp_gt_i32_e64 s4, 6, v6 ; SI-NEXT: s_mov_b32 s15, 0x31c16000 +; SI-NEXT: v_cmp_gt_i32_e64 s4, 6, v6 +; SI-NEXT: v_mov_b32_e32 v0, v1 ; SI-NEXT: s_add_u32 s12, s12, s1 ; SI-NEXT: s_addc_u32 s13, s13, 0 ; 
SI-NEXT: s_mov_b32 s32, 0 -; SI-NEXT: s_xor_b32 exec_lo, s4, exec_lo +; SI-NEXT: s_xor_b32 s0, s4, exec_lo +; SI-NEXT: s_and_saveexec_b32 s5, s0 ; SI-NEXT: ; divergent control-flow edge ; SI-NEXT: s_cbranch_execz .LBB3_4 ; SI-NEXT: .LBB3_1: ; %if -; SI-NEXT: s_mov_b32 s5, exec_lo +; SI-NEXT: s_mov_b32 s8, exec_lo ; SI-NEXT: .LBB3_2: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_readfirstlane_b32 s6, v2 ; SI-NEXT: v_readfirstlane_b32 s7, v3 ; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[6:7], v[2:3] -; SI-NEXT: s_and_saveexec_b32 s8, vcc_lo +; SI-NEXT: s_and_saveexec_b32 s9, vcc_lo ; SI-NEXT: s_mov_b64 s[0:1], s[12:13] ; SI-NEXT: s_mov_b64 s[2:3], s[14:15] ; SI-NEXT: s_swappc_b64 s[30:31], s[6:7] @@ -196,14 +193,13 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: ; implicit-def: $vgpr2_vgpr3 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s8 +; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s9 ; SI-NEXT: s_cbranch_execnz .LBB3_2 ; SI-NEXT: ; %bb.3: -; SI-NEXT: s_mov_b32 exec_lo, s5 +; SI-NEXT: s_mov_b32 exec_lo, s8 ; SI-NEXT: .LBB3_4: -; SI-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; SI-NEXT: s_xor_b32 s6, exec_lo, s4 -; SI-NEXT: s_mov_b32 exec_lo, s4 +; SI-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; SI-NEXT: s_and_saveexec_b32 s6, s4 ; SI-NEXT: ; divergent control-flow edge ; SI-NEXT: s_cbranch_execz .LBB3_8 ; SI-NEXT: .LBB3_5: ; %else @@ -250,37 +246,37 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI: ; %bb.0: ; %main_body ; SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: v_mov_b32_e32 v40, v1 ; SI-NEXT: v_cmp_gt_i32_e64 s4, 6, v0 +; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_mov_b32 s15, 0x31c16000 +; SI-NEXT: v_mov_b32_e32 v40, v1 ; SI-NEXT: s_add_u32 s12, s12, s1 ; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_xor_b32 s0, s4, exec_lo ; SI-NEXT: s_mov_b32 
s32, 0 -; SI-NEXT: s_xor_b32 exec_lo, s4, exec_lo +; SI-NEXT: s_and_saveexec_b32 s5, s0 ; SI-NEXT: ; divergent control-flow edge ; SI-NEXT: s_cbranch_execz .LBB4_4 ; SI-NEXT: .LBB4_1: ; %if -; SI-NEXT: s_mov_b32 s5, exec_lo +; SI-NEXT: s_mov_b32 s8, exec_lo ; SI-NEXT: .LBB4_2: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_readfirstlane_b32 s6, v2 ; SI-NEXT: v_readfirstlane_b32 s7, v3 ; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[6:7], v[2:3] -; SI-NEXT: s_and_saveexec_b32 s8, vcc_lo +; SI-NEXT: s_and_saveexec_b32 s9, vcc_lo ; SI-NEXT: v_mov_b32_e32 v0, v40 ; SI-NEXT: s_mov_b64 s[0:1], s[12:13] ; SI-NEXT: s_mov_b64 s[2:3], s[14:15] ; SI-NEXT: s_swappc_b64 s[30:31], s[6:7] ; SI-NEXT: ; implicit-def: $vgpr2_vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s8 +; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s9 ; SI-NEXT: s_cbranch_execnz .LBB4_2 ; SI-NEXT: ; %bb.3: -; SI-NEXT: s_mov_b32 exec_lo, s5 +; SI-NEXT: s_mov_b32 exec_lo, s8 ; SI-NEXT: .LBB4_4: -; SI-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; SI-NEXT: s_xor_b32 s6, exec_lo, s4 -; SI-NEXT: s_mov_b32 exec_lo, s4 +; SI-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; SI-NEXT: s_and_saveexec_b32 s6, s4 ; SI-NEXT: ; divergent control-flow edge ; SI-NEXT: s_cbranch_execz .LBB4_8 ; SI-NEXT: .LBB4_5: ; %else diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll index 0572383bfcd1b..7cd5926f8df22 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll @@ -82,6 +82,7 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:336 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_xor_b32 s0, vcc_lo, exec_lo ; CHECK-NEXT: scratch_store_b32 off, v3, off offset:12 ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v3, v[0:1], off 
offset:448 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 @@ -92,7 +93,7 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp ; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:720 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: scratch_store_b32 off, v3, off offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: s_xor_b32 exec_lo, vcc_lo, exec_lo +; CHECK-NEXT: s_and_saveexec_b32 s0, s0 ; CHECK-NEXT: ; divergent control-flow edge ; CHECK-NEXT: s_cbranch_execz .LBB1_2 ; CHECK-NEXT: .LBB1_1: ; %.true @@ -147,10 +148,8 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 ; CHECK-NEXT: .LBB1_2: -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, vcc_lo -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_xor_b32 s0, exec_lo, vcc_lo -; CHECK-NEXT: s_mov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; CHECK-NEXT: s_and_saveexec_b32 s0, vcc_lo ; CHECK-NEXT: ; divergent control-flow edge ; CHECK-NEXT: s_cbranch_execz .LBB1_4 ; CHECK-NEXT: .LBB1_3: ; %.false diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-wbl2.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-wbl2.ll index 0987a17587162..97f82c70fd86a 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-wbl2.ll +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-wbl2.ll @@ -12,8 +12,9 @@ define void @global_store_different_block(ptr addrspace(1) %data_ptr, ptr addrsp ; GFX950-NEXT: global_store_dword v[0:1], v5, off ; GFX950-NEXT: v_and_b32_e32 v0, 1, v4 ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX950-NEXT: s_xor_b64 s[0:1], vcc, exec ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_xor_b64 exec, vcc, exec +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: ; divergent control-flow edge ; GFX950-NEXT: s_cbranch_execz .LBB0_2 ; GFX950-NEXT: .LBB0_1: ; %do_atomic @@ -22,7 +23,7 @@ define void @global_store_different_block(ptr addrspace(1) 
%data_ptr, ptr addrsp ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_swap_x2 v[2:3], v[0:1], off ; GFX950-NEXT: .LBB0_2: ; %exit -; GFX950-NEXT: s_or_b64 exec, exec, vcc +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] entry: