From 1e0899794dd432c6c3c7bbba6940e8640e6a201c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 19 Jun 2026 07:24:21 +0100 Subject: [PATCH 001/149] [DAG] visitEXTRACT_SUBVECTOR - Fold EXTRACT_SUBVECTOR(EXTRACT_SUBVECTOR(X,C1),C0) with nonzero indices (#204533) Removed equivalent fold from x86 and added generic DAG fold to replace it - net zero test changes Refactored version of #200935 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 16 ++++++++-------- llvm/lib/Target/X86/X86ISelLowering.cpp | 8 -------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 5a4ae64cb98af..1409c7b683069 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -27656,18 +27656,18 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) { return NarrowLoad; // Combine an extract of an extract into a single extract_subvector. - // ext (ext X, C), 0 --> ext X, C - if (ExtIdx == 0 && V.getOpcode() == ISD::EXTRACT_SUBVECTOR && V.hasOneUse()) { + // ext (ext X, C1), C2 --> ext X, C1 + C2 + if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR && V.hasOneUse()) { // Both indices must have the same scaling factor and C has to be a // multiple of the new result type's known minimum vector length. 
+ uint64_t InnerExtIdx = V.getConstantOperandVal(1); + uint64_t NewExtIdx = InnerExtIdx + ExtIdx; if (V.getValueType().isScalableVector() == NVT.isScalableVector() && - V.getConstantOperandVal(1) % NVT.getVectorMinNumElements() == 0 && + NewExtIdx % NVT.getVectorMinNumElements() == 0 && TLI.isExtractSubvectorCheap(NVT, V.getOperand(0).getValueType(), - V.getConstantOperandVal(1)) && - TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NVT)) { - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, V.getOperand(0), - V.getOperand(1)); - } + NewExtIdx) && + TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NVT)) + return DAG.getExtractSubvector(DL, NVT, V.getOperand(0), NewExtIdx); } // ty1 extract_vector(ty2 splat(V))) -> ty1 splat(V) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index b9a65e2671aa9..a5470d9735dba 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -62023,14 +62023,6 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, if (InVec.getOpcode() == ISD::BUILD_VECTOR) return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts)); - // EXTRACT_SUBVECTOR(EXTRACT_SUBVECTOR(V,C1)),C2) - EXTRACT_SUBVECTOR(V,C1+C2) - if (IdxVal != 0 && InVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && - InVec.hasOneUse() && TLI.isTypeLegal(VT) && - TLI.isTypeLegal(InVec.getOperand(0).getValueType())) { - unsigned NewIdx = IdxVal + InVec.getConstantOperandVal(1); - return extractSubVector(InVec.getOperand(0), NewIdx, DAG, DL, SizeInBits); - } - // EXTRACT_SUBVECTOR(INSERT_SUBVECTOR(SRC,SUB,C1),C2) // --> INSERT_SUBVECTOR(EXTRACT_SUBVECTOR(SRC,C2),SUB,C1-C2) // iff SUB is entirely contained in the extraction. 
From 4b2a02d47e31da4764aa6e204e6c502ccc69e201 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 19 Jun 2026 07:27:27 +0100 Subject: [PATCH 002/149] [X86] Replace X86 specific PDEP/PEXT handling with generic intrinsics (#204144) * Remove X86ISD::PDEP/PEXT and use ISD::PDEP/PEXT instead * AutoUpgrade x86 pdep/pext intrinsics to llvm.pdep/pext generics * Move X86 DAG knownbits/demandedbits handling to generic (unchanged) * Move X86 InstCombine folds to generic (unchanged) * Add memory sanitizer handling for generic pdep/pext intrinsics * Updated clang builtins to emit generics Fixes #204537 --- clang/lib/CodeGen/TargetBuiltins/X86.cpp | 10 +++ clang/test/CodeGen/X86/bmi2-builtins.c | 8 +- llvm/include/llvm/IR/IntrinsicsX86.td | 12 --- llvm/lib/Analysis/ConstantFolding.cpp | 10 +++ llvm/lib/Analysis/InstructionSimplify.cpp | 14 +++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 ++ .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 18 ++++ .../CodeGen/SelectionDAG/TargetLowering.cpp | 28 ++++++ llvm/lib/IR/AutoUpgrade.cpp | 8 ++ llvm/lib/Target/X86/X86ISelLowering.cpp | 50 +---------- .../Target/X86/X86InstCombineIntrinsic.cpp | 88 ------------------- llvm/lib/Target/X86/X86InstrFragments.td | 4 - llvm/lib/Target/X86/X86InstrMisc.td | 54 ++---------- llvm/lib/Target/X86/X86IntrinsicsInfo.h | 4 - .../InstCombine/InstCombineCalls.cpp | 34 +++++++ .../Instrumentation/MemorySanitizer.cpp | 29 +++++- llvm/test/CodeGen/X86/bmi2.ll | 23 +++-- .../Instrumentation/MemorySanitizer/bmi.ll | 16 ++-- .../Instrumentation/MemorySanitizer/pdep.ll | 35 +++++--- .../Instrumentation/MemorySanitizer/pext.ll | 35 +++++--- llvm/test/Transforms/InstCombine/pdep.ll | 30 +++---- llvm/test/Transforms/InstCombine/pext.ll | 30 +++---- 22 files changed, 259 insertions(+), 287 deletions(-) diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp index acfeb9967cd2f..50125a71fcd5f 100644 --- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp +++ 
b/clang/lib/CodeGen/TargetBuiltins/X86.cpp @@ -976,6 +976,16 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, Function *F = CGM.getIntrinsic(Intrinsic::cttz, Ops[0]->getType()); return Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)}); } + case X86::BI__builtin_ia32_pdep_si: + case X86::BI__builtin_ia32_pdep_di: { + Function *F = CGM.getIntrinsic(Intrinsic::pdep, Ops[0]->getType()); + return Builder.CreateCall(F, Ops); + } + case X86::BI__builtin_ia32_pext_si: + case X86::BI__builtin_ia32_pext_di: { + Function *F = CGM.getIntrinsic(Intrinsic::pext, Ops[0]->getType()); + return Builder.CreateCall(F, Ops); + } case X86::BI__builtin_ia32_undef128: case X86::BI__builtin_ia32_undef256: case X86::BI__builtin_ia32_undef512: diff --git a/clang/test/CodeGen/X86/bmi2-builtins.c b/clang/test/CodeGen/X86/bmi2-builtins.c index 1b2cb9048adb2..c83cc43d9fc3f 100644 --- a/clang/test/CodeGen/X86/bmi2-builtins.c +++ b/clang/test/CodeGen/X86/bmi2-builtins.c @@ -17,12 +17,12 @@ unsigned int test_bzhi_u32(unsigned int __X, unsigned int __Y) { } unsigned int test_pdep_u32(unsigned int __X, unsigned int __Y) { - // CHECK: @llvm.x86.bmi.pdep.32 + // CHECK: @llvm.pdep.i32 return _pdep_u32(__X, __Y); } unsigned int test_pext_u32(unsigned int __X, unsigned int __Y) { - // CHECK: @llvm.x86.bmi.pext.32 + // CHECK: @llvm.pext.i32 return _pext_u32(__X, __Y); } @@ -41,12 +41,12 @@ unsigned long long test_bzhi_u64(unsigned long long __X, unsigned long long __Y) } unsigned long long test_pdep_u64(unsigned long long __X, unsigned long long __Y) { - // CHECK: @llvm.x86.bmi.pdep.64 + // CHECK: @llvm.pdep.i64 return _pdep_u64(__X, __Y); } unsigned long long test_pext_u64(unsigned long long __X, unsigned long long __Y) { - // CHECK: @llvm.x86.bmi.pext.64 + // CHECK: @llvm.pext.i64 return _pext_u64(__X, __Y); } diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index b75a0485d6263..5c7785731111c 100644 --- 
a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -2575,18 +2575,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_bmi_bzhi_64 : ClangBuiltin<"__builtin_ia32_bzhi_di">, DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; - def int_x86_bmi_pdep_32 : ClangBuiltin<"__builtin_ia32_pdep_si">, - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; - def int_x86_bmi_pdep_64 : ClangBuiltin<"__builtin_ia32_pdep_di">, - DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], - [IntrNoMem]>; - def int_x86_bmi_pext_32 : ClangBuiltin<"__builtin_ia32_pext_si">, - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; - def int_x86_bmi_pext_64 : ClangBuiltin<"__builtin_ia32_pext_di">, - DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], - [IntrNoMem]>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 3fe78d6c4322d..f18b7a0b66a21 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -1756,6 +1756,8 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) { case Intrinsic::fshl: case Intrinsic::fshr: case Intrinsic::clmul: + case Intrinsic::pdep: + case Intrinsic::pext: case Intrinsic::launder_invariant_group: case Intrinsic::strip_invariant_group: case Intrinsic::masked_load: @@ -3904,6 +3906,14 @@ static Constant *ConstantFoldIntrinsicCall2(Intrinsic::ID IntrinsicID, Type *Ty, if (!C0 || !C1) return Constant::getNullValue(Ty); return ConstantInt::get(Ty, APIntOps::clmul(*C0, *C1)); + case Intrinsic::pdep: + if (!C0 || !C1) + return Constant::getNullValue(Ty); + return ConstantInt::get(Ty, APIntOps::expandBits(*C0, *C1)); + case Intrinsic::pext: + if (!C0 || !C1) + return Constant::getNullValue(Ty); + return 
ConstantInt::get(Ty, APIntOps::compressBits(*C0, *C1)); case Intrinsic::amdgcn_wave_reduce_umin: case Intrinsic::amdgcn_wave_reduce_umax: case Intrinsic::amdgcn_wave_reduce_max: diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 7698d0d772a94..3b20592bcaed2 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -6930,6 +6930,20 @@ Value *llvm::simplifyBinaryIntrinsic(Intrinsic::ID IID, Type *ReturnType, return Constant::getNullValue(ReturnType); break; } + case Intrinsic::pdep: { + if (match(Op1, m_Zero())) + return Constant::getNullValue(ReturnType); + if (match(Op1, m_AllOnes())) + return Op0; + break; + } + case Intrinsic::pext: { + if (match(Op1, m_Zero())) + return Constant::getNullValue(ReturnType); + if (match(Op1, m_AllOnes())) + return Op0; + break; + } case Intrinsic::ptrmask: { // NOTE: We can't apply this simplifications based on the value of Op1 // because we need to preserve provenance. 
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 1409c7b683069..4fdef7d4afb5d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12245,12 +12245,18 @@ SDValue DAGCombiner::visitPDEP(SDNode *N) { // pdep(x, 0) -> 0 if (isNullOrNullSplat(N1)) return DAG.getConstant(0, DL, VT); + // pdep(x, -1) -> x (all positions selected, bits deposited at identity) if (isAllOnesOrAllOnesSplat(N1)) return N0; + // fold pdep(c1, c2) -> expandBits(c1, c2) if (SDValue C = DAG.FoldConstantArithmetic(ISD::PDEP, DL, VT, {N0, N1})) return C; + + if (SimplifyDemandedBits(SDValue(N, 0))) + return SDValue(N, 0); + return SDValue(); } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index b32c16fe4300f..44120cceed2a3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -3952,6 +3952,24 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, Known.Zero.setBitsFrom(1); break; } + case ISD::PDEP: { + Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + // Zeros are retained from the mask operand. But not ones. + Known.One.clearAllBits(); + // The result will have at least as many trailing zeros as the non-mask + // operand since bits can only map to the same or higher bit position. + Known.Zero.setLowBits(Known2.countMinTrailingZeros()); + break; + } + case ISD::PEXT: { + Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + // The result has as many leading zeros as the number of zeroes in the mask. 
+ unsigned Count = Known.Zero.popcount(); + Known.Zero = APInt::getHighBitsSet(BitWidth, Count); + Known.One.clearAllBits(); + break; + } case ISD::CLMUL: { Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 5ba36495ba4f6..5772ef37ec762 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -2463,6 +2463,34 @@ bool TargetLowering::SimplifyDemandedBits( Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth); break; } + case ISD::PDEP: { + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + + unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero(); + APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ); + + // If the demanded bits has leading zeroes, we don't demand those from the + // mask. + if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1)) + return true; + + // The number of possible 1s in the mask determines the number of LSBs of + // operand 0 used. Undemanded bits from the mask don't matter so filter + // them before counting. + KnownBits Known2; + uint64_t Count = (~Known.Zero & LoMask).popcount(); + APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count)); + if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1)) + return true; + + // Zeroes are retained from the mask, but not ones. + Known.One.clearAllBits(); + // The result will have at least as many trailing zeros as the non-mask + // operand since bits can only map to the same or higher bit position. 
+ Known.Zero.setLowBits(Known2.countMinTrailingZeros()); + break; + } case ISD::SIGN_EXTEND_INREG: { SDValue Op0 = Op.getOperand(0); EVT ExVT = cast(Op.getOperand(1))->getVT(); diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 4d353c95b8930..3a823f906b012 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -533,6 +533,10 @@ static bool shouldUpgradeX86Intrinsic(Function *F, StringRef Name) { Name.starts_with("vpcom") || // Added in 3.2, Updated in 9.0 Name.starts_with("vprot")); // Added in 8.0 + if (Name.consume_front("bmi.")) + return (Name.starts_with("pdep.") || // Added in 23.0 + Name.starts_with("pext.")); // Added in 23.0 + return (Name == "addcarry.u32" || // Added in 8.0 Name == "addcarry.u64" || // Added in 8.0 Name == "addcarryx.u32" || // Added in 8.0 @@ -4618,6 +4622,10 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, } else if (Name.starts_with("avx512.mask.") && upgradeAVX512MaskToSelect(Name, Builder, *CI, Rep)) { // Rep will be updated by the call in the condition. + } else if (Name.starts_with("bmi.pdep.")) { + Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::pdep); + } else if (Name.starts_with("bmi.pext.")) { + Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::pext); } else reportFatalUsageErrorWithCI("Unexpected intrinsic", CI); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a5470d9735dba..e9ba1c05df361 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -39748,25 +39748,6 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known.One.clearAllBits(); break; } - case X86ISD::PDEP: { - KnownBits Known2; - Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); - Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); - // Zeros are retained from the mask operand. But not ones. 
- Known.One.clearAllBits(); - // The result will have at least as many trailing zeros as the non-mask - // operand since bits can only map to the same or higher bit position. - Known.Zero.setLowBits(Known2.countMinTrailingZeros()); - break; - } - case X86ISD::PEXT: { - Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); - // The result has as many leading zeros as the number of zeroes in the mask. - unsigned Count = Known.Zero.popcount(); - Known.Zero = APInt::getHighBitsSet(BitWidth, Count); - Known.One.clearAllBits(); - break; - } case X86ISD::VTRUNC: case X86ISD::VTRUNCS: case X86ISD::VTRUNCUS: @@ -46015,34 +45996,6 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( break; } - case X86ISD::PDEP: { - SDValue Op0 = Op.getOperand(0); - SDValue Op1 = Op.getOperand(1); - - unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero(); - APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ); - - // If the demanded bits has leading zeroes, we don't demand those from the - // mask. - if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1)) - return true; - - // The number of possible 1s in the mask determines the number of LSBs of - // operand 0 used. Undemanded bits from the mask don't matter so filter - // them before counting. - KnownBits Known2; - uint64_t Count = (~Known.Zero & LoMask).popcount(); - APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count)); - if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1)) - return true; - - // Zeroes are retained from the mask, but not ones. - Known.One.clearAllBits(); - // The result will have at least as many trailing zeros as the non-mask - // operand since bits can only map to the same or higher bit position. 
- Known.Zero.setLowBits(Known2.countMinTrailingZeros()); - return false; - } case X86ISD::VPMADD52L: case X86ISD::VPMADD52H: { KnownBits KnownOp0, KnownOp1, KnownOp2; @@ -63415,8 +63368,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG); case X86ISD::BEXTR: case X86ISD::BEXTRI: - case X86ISD::BZHI: - case X86ISD::PDEP: return combineBMI(N, DAG, DCI); + case X86ISD::BZHI: return combineBMI(N, DAG, DCI); case X86ISD::PCLMULQDQ: return combinePCLMULQDQ(N, DAG, DCI); case ISD::INTRINSIC_WO_CHAIN: return combineINTRINSIC_WO_CHAIN(N, DAG, DCI); case ISD::INTRINSIC_W_CHAIN: return combineINTRINSIC_W_CHAIN(N, DAG, DCI); diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp index 4999581489e82..ad1c171428671 100644 --- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp +++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp @@ -2259,94 +2259,6 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { // TODO should we convert this to an AND if the RHS is constant? } break; - case Intrinsic::x86_bmi_pext_32: - case Intrinsic::x86_bmi_pext_64: - if (auto *MaskC = dyn_cast(II.getArgOperand(1))) { - if (MaskC->isNullValue()) { - return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); - } - if (MaskC->isAllOnesValue()) { - return IC.replaceInstUsesWith(II, II.getArgOperand(0)); - } - - unsigned MaskIdx, MaskLen; - if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) { - // any single contingous sequence of 1s anywhere in the mask simply - // describes a subset of the input bits shifted to the appropriate - // position. Replace with the straight forward IR. 
- Value *Input = II.getArgOperand(0); - Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1)); - Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx); - Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt); - return IC.replaceInstUsesWith(II, Shifted); - } - - if (auto *SrcC = dyn_cast(II.getArgOperand(0))) { - uint64_t Src = SrcC->getZExtValue(); - uint64_t Mask = MaskC->getZExtValue(); - uint64_t Result = 0; - uint64_t BitToSet = 1; - - while (Mask) { - // Isolate lowest set bit. - uint64_t BitToTest = Mask & -Mask; - if (BitToTest & Src) - Result |= BitToSet; - - BitToSet <<= 1; - // Clear lowest set bit. - Mask &= Mask - 1; - } - - return IC.replaceInstUsesWith(II, - ConstantInt::get(II.getType(), Result)); - } - } - break; - case Intrinsic::x86_bmi_pdep_32: - case Intrinsic::x86_bmi_pdep_64: - if (auto *MaskC = dyn_cast(II.getArgOperand(1))) { - if (MaskC->isNullValue()) { - return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); - } - if (MaskC->isAllOnesValue()) { - return IC.replaceInstUsesWith(II, II.getArgOperand(0)); - } - - unsigned MaskIdx, MaskLen; - if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) { - // any single contingous sequence of 1s anywhere in the mask simply - // describes a subset of the input bits shifted to the appropriate - // position. Replace with the straight forward IR. - Value *Input = II.getArgOperand(0); - Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx); - Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt); - Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1)); - return IC.replaceInstUsesWith(II, Masked); - } - - if (auto *SrcC = dyn_cast(II.getArgOperand(0))) { - uint64_t Src = SrcC->getZExtValue(); - uint64_t Mask = MaskC->getZExtValue(); - uint64_t Result = 0; - uint64_t BitToTest = 1; - - while (Mask) { - // Isolate lowest set bit. 
- uint64_t BitToSet = Mask & -Mask; - if (BitToTest & Src) - Result |= BitToSet; - - BitToTest <<= 1; - // Clear lowest set bit; - Mask &= Mask - 1; - } - - return IC.replaceInstUsesWith(II, - ConstantInt::get(II.getType(), Result)); - } - } - break; case Intrinsic::x86_sse_cvtss2si: case Intrinsic::x86_sse_cvtss2si64: diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td index 9316360c5e02a..923b968382866 100644 --- a/llvm/lib/Target/X86/X86InstrFragments.td +++ b/llvm/lib/Target/X86/X86InstrFragments.td @@ -424,10 +424,6 @@ def X86bextri : SDNode<"X86ISD::BEXTRI", SDTIntBinOp>; // Zero High Bits Starting with Specified Bit Position. def X86bzhi : SDNode<"X86ISD::BZHI", SDTIntBinOp>; -// Parallel extract and deposit. -def X86pdep : SDNode<"X86ISD::PDEP", SDTIntBinOp>; -def X86pext : SDNode<"X86ISD::PEXT", SDTIntBinOp>; - // X86-specific multiply by immediate. def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>; diff --git a/llvm/lib/Target/X86/X86InstrMisc.td b/llvm/lib/Target/X86/X86InstrMisc.td index 613a431fe365a..c6acaa697fdc7 100644 --- a/llvm/lib/Target/X86/X86InstrMisc.td +++ b/llvm/lib/Target/X86/X86InstrMisc.td @@ -1391,55 +1391,17 @@ multiclass PdepPext, XD, VEX; - defm PDEP64 : PdepPext<"pdep", Xi64, X86pdep>, XD, REX_W, VEX; - defm PEXT32 : PdepPext<"pext", Xi32, X86pext>, XS, VEX; - defm PEXT64 : PdepPext<"pext", Xi64, X86pext>, XS, REX_W, VEX; + defm PDEP32 : PdepPext<"pdep", Xi32, pdep>, XD, VEX; + defm PDEP64 : PdepPext<"pdep", Xi64, pdep>, XD, REX_W, VEX; + defm PEXT32 : PdepPext<"pext", Xi32, pext>, XS, VEX; + defm PEXT64 : PdepPext<"pext", Xi64, pext>, XS, REX_W, VEX; } let Predicates = [HasBMI2, HasEGPR] in { - defm PDEP32 : PdepPext<"pdep", Xi32, X86pdep, "_EVEX">, XD, EVEX; - defm PDEP64 : PdepPext<"pdep", Xi64, X86pdep, "_EVEX">, XD, REX_W, EVEX; - defm PEXT32 : PdepPext<"pext", Xi32, X86pext, "_EVEX">, XS, EVEX; - defm PEXT64 : PdepPext<"pext", Xi64, X86pext, "_EVEX">, XS, REX_W, EVEX; -} - 
-let Predicates = [HasBMI2, NoEGPR] in { - def : Pat<(i32 (pext GR32:$src, GR32:$mask)), - (PEXT32rr GR32:$src, GR32:$mask)>; - def : Pat<(i32 (pext GR32:$src, (loadi32 addr:$mask))), - (PEXT32rm GR32:$src, i32mem:$mask)>; - def : Pat<(i64 (pext GR64:$src, GR64:$mask)), - (PEXT64rr GR64:$src, GR64:$mask)>; - def : Pat<(i64 (pext GR64:$src, (loadi64 addr:$mask))), - (PEXT64rm GR64:$src, i64mem:$mask)>; - def : Pat<(i32 (pdep GR32:$src, GR32:$mask)), - (PDEP32rr GR32:$src, GR32:$mask)>; - def : Pat<(i32 (pdep GR32:$src, (loadi32 addr:$mask))), - (PDEP32rm GR32:$src, i32mem:$mask)>; - def : Pat<(i64 (pdep GR64:$src, GR64:$mask)), - (PDEP64rr GR64:$src, GR64:$mask)>; - def : Pat<(i64 (pdep GR64:$src, (loadi64 addr:$mask))), - (PDEP64rm GR64:$src, i64mem:$mask)>; -} - -let Predicates = [HasBMI2, HasEGPR] in { - def : Pat<(i32 (pext GR32:$src, GR32:$mask)), - (PEXT32rr_EVEX GR32:$src, GR32:$mask)>; - def : Pat<(i32 (pext GR32:$src, (loadi32 addr:$mask))), - (PEXT32rm_EVEX GR32:$src, i32mem:$mask)>; - def : Pat<(i64 (pext GR64:$src, GR64:$mask)), - (PEXT64rr_EVEX GR64:$src, GR64:$mask)>; - def : Pat<(i64 (pext GR64:$src, (loadi64 addr:$mask))), - (PEXT64rm_EVEX GR64:$src, i64mem:$mask)>; - def : Pat<(i32 (pdep GR32:$src, GR32:$mask)), - (PDEP32rr_EVEX GR32:$src, GR32:$mask)>; - def : Pat<(i32 (pdep GR32:$src, (loadi32 addr:$mask))), - (PDEP32rm_EVEX GR32:$src, i32mem:$mask)>; - def : Pat<(i64 (pdep GR64:$src, GR64:$mask)), - (PDEP64rr_EVEX GR64:$src, GR64:$mask)>; - def : Pat<(i64 (pdep GR64:$src, (loadi64 addr:$mask))), - (PDEP64rm_EVEX GR64:$src, i64mem:$mask)>; + defm PDEP32 : PdepPext<"pdep", Xi32, pdep, "_EVEX">, XD, EVEX; + defm PDEP64 : PdepPext<"pdep", Xi64, pdep, "_EVEX">, XD, REX_W, EVEX; + defm PEXT32 : PdepPext<"pext", Xi32, pext, "_EVEX">, XS, EVEX; + defm PEXT64 : PdepPext<"pext", Xi64, pext, "_EVEX">, XS, REX_W, EVEX; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h 
b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 9e32ca23dafe2..a6b0db0230cf3 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -1837,10 +1837,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(bmi_bextr_64, INTR_TYPE_2OP, X86ISD::BEXTR, 0), X86_INTRINSIC_DATA(bmi_bzhi_32, INTR_TYPE_2OP, X86ISD::BZHI, 0), X86_INTRINSIC_DATA(bmi_bzhi_64, INTR_TYPE_2OP, X86ISD::BZHI, 0), - X86_INTRINSIC_DATA(bmi_pdep_32, INTR_TYPE_2OP, X86ISD::PDEP, 0), - X86_INTRINSIC_DATA(bmi_pdep_64, INTR_TYPE_2OP, X86ISD::PDEP, 0), - X86_INTRINSIC_DATA(bmi_pext_32, INTR_TYPE_2OP, X86ISD::PEXT, 0), - X86_INTRINSIC_DATA(bmi_pext_64, INTR_TYPE_2OP, X86ISD::PEXT, 0), X86_INTRINSIC_DATA(fma_vfmaddsub_pd, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(fma_vfmaddsub_pd_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 3cd7515eb7670..1df156053e302 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -2660,6 +2660,40 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { return &CI; break; } + case Intrinsic::pdep: { + const APInt *MaskC; + if (match(II->getArgOperand(1), m_APInt(MaskC))) { + unsigned MaskIdx, MaskLen; + if (MaskC->isShiftedMask(MaskIdx, MaskLen)) { + // any single contiguous sequence of 1s anywhere in the mask simply + // describes a subset of the input bits shifted to the appropriate + // position. Replace with the straight forward IR. 
+ Value *Input = II->getArgOperand(0); + Value *ShiftAmt = ConstantInt::get(II->getType(), MaskIdx); + Value *Shifted = Builder.CreateShl(Input, ShiftAmt); + Value *Masked = Builder.CreateAnd(Shifted, II->getArgOperand(1)); + return replaceInstUsesWith(*II, Masked); + } + } + break; + } + case Intrinsic::pext: { + const APInt *MaskC; + if (match(II->getArgOperand(1), m_APInt(MaskC))) { + unsigned MaskIdx, MaskLen; + if (MaskC->isShiftedMask(MaskIdx, MaskLen)) { + // any single contiguous sequence of 1s anywhere in the mask simply + // describes a subset of the input bits shifted to the appropriate + // position. Replace with the straight forward IR. + Value *Input = II->getArgOperand(0); + Value *Masked = Builder.CreateAnd(Input, II->getArgOperand(1)); + Value *ShiftAmt = ConstantInt::get(II->getType(), MaskIdx); + Value *Shifted = Builder.CreateLShr(Masked, ShiftAmt); + return replaceInstUsesWith(*II, Shifted); + } + } + break; + } case Intrinsic::ptrmask: { unsigned BitWidth = DL.getPointerTypeSizeInBits(II->getType()); KnownBits Known(BitWidth); diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index bbc9f5d1b7506..f37e21f2c6dbb 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -3333,6 +3333,26 @@ struct MemorySanitizerVisitor : public InstVisitor { setOriginForNaryOp(I); } + // Instrument parallel bits deposit/extract (pdep/pext) intrinsics. + // All of these intrinsics are Z = I(X, Y) + // where the types of all operands and the result match. + // The following instrumentation happens to work for all of them: + // Sz = I(Sx, Y) | (sext (Sy != 0)) + void handlePackedBits(IntrinsicInst &I) { + IRBuilder<> IRB(&I); + Type *ShadowTy = getShadowTy(&I); + + // If any bit of the mask operand is poisoned, then the whole thing is. 
+ Value *SMask = getShadow(&I, 1); + SMask = IRB.CreateSExt(IRB.CreateICmpNE(SMask, getCleanShadow(ShadowTy)), + ShadowTy); + // Apply the same intrinsic to the shadow of the first operand. + Value *S = IRB.CreateIntrinsic(I.getIntrinsicID(), ShadowTy, + {getShadow(&I, 0), I.getOperand(1)}); + setShadow(&I, IRB.CreateOr(SMask, S)); + setOriginForNaryOp(I); + } + /// Instrument llvm.memmove /// /// At this point we don't know if llvm.memmove will be inlined or not. @@ -5873,6 +5893,11 @@ struct MemorySanitizerVisitor : public InstVisitor { handleFunnelShift(I); break; + case Intrinsic::pdep: + case Intrinsic::pext: + handlePackedBits(I); + break; + case Intrinsic::is_constant: // The result of llvm.is.constant() is always defined. setShadow(&I, getCleanShadow(&I)); @@ -6503,10 +6528,6 @@ struct MemorySanitizerVisitor : public InstVisitor { case Intrinsic::x86_bmi_bextr_64: case Intrinsic::x86_bmi_bzhi_32: case Intrinsic::x86_bmi_bzhi_64: - case Intrinsic::x86_bmi_pdep_32: - case Intrinsic::x86_bmi_pdep_64: - case Intrinsic::x86_bmi_pext_32: - case Intrinsic::x86_bmi_pext_64: handleBmiIntrinsic(I); break; diff --git a/llvm/test/CodeGen/X86/bmi2.ll b/llvm/test/CodeGen/X86/bmi2.ll index cabeebb0c3f36..41585bde9a696 100644 --- a/llvm/test/CodeGen/X86/bmi2.ll +++ b/llvm/test/CodeGen/X86/bmi2.ll @@ -128,7 +128,7 @@ define i32 @pdep32_load(i32 %x, ptr %y) { define i32 @pdep32_anyext(i16 %x) { ; X86-LABEL: pdep32_anyext: ; X86: # %bb.0: -; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl $-1431655766, %ecx # imm = 0xAAAAAAAA ; X86-NEXT: pdepl %ecx, %eax, %eax ; X86-NEXT: retl @@ -178,7 +178,7 @@ define i32 @pdep32_demandedbits(i32 %x) { define i32 @pdep32_demandedbits2(i32 %x, i32 %y) { ; X86-LABEL: pdep32_demandedbits2: ; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: pdepl {{[0-9]+}}(%esp), %eax, %eax ; X86-NEXT: andl $128, %eax ; X86-NEXT: retl @@ -203,9 +203,8 
@@ define i32 @pdep32_demandedbits2(i32 %x, i32 %y) { define i32 @pdep32_demandedbits_mask(i32 %x, i16 %y) { ; X86-LABEL: pdep32_demandedbits_mask: ; X86: # %bb.0: -; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: pdepl %eax, %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: pdepl {{[0-9]+}}(%esp), %eax, %eax ; X86-NEXT: andl $32768, %eax # imm = 0x8000 ; X86-NEXT: retl ; @@ -230,9 +229,8 @@ define i32 @pdep32_demandedbits_mask(i32 %x, i16 %y) { define i32 @pdep32_demandedbits_mask2(i32 %x, i16 %y) { ; X86-LABEL: pdep32_demandedbits_mask2: ; X86: # %bb.0: -; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: pdepl %eax, %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: pdepl {{[0-9]+}}(%esp), %eax, %eax ; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: retl ; @@ -285,22 +283,23 @@ define i32 @pdep32_knownbits(i32 %x) { define i32 @pdep32_knownbits2(i32 %x, i32 %y) { ; X86-LABEL: pdep32_knownbits2: ; X86: # %bb.0: -; X86-NEXT: movl $-256, %eax -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $8, %eax ; X86-NEXT: pdepl {{[0-9]+}}(%esp), %eax, %eax ; X86-NEXT: imull %eax, %eax ; X86-NEXT: retl ; ; X64-LABEL: pdep32_knownbits2: ; X64: # %bb.0: -; X64-NEXT: andl $-256, %edi +; X64-NEXT: andl $16776960, %edi # imm = 0xFFFF00 ; X64-NEXT: pdepl %esi, %edi, %eax ; X64-NEXT: imull %eax, %eax ; X64-NEXT: retq ; ; EGPR-LABEL: pdep32_knownbits2: ; EGPR: # %bb.0: -; EGPR-NEXT: andl $-256, %edi # encoding: [0x81,0xe7,0x00,0xff,0xff,0xff] +; EGPR-NEXT: andl $16776960, %edi # encoding: [0x81,0xe7,0x00,0xff,0xff,0x00] +; EGPR-NEXT: # imm = 0xFFFF00 ; EGPR-NEXT: pdepl %esi, %edi, %eax # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x43,0xf5,0xc6] ; EGPR-NEXT: imull %eax, %eax # encoding: [0x0f,0xaf,0xc0] ; EGPR-NEXT: retq # encoding: [0xc3] diff --git a/llvm/test/Instrumentation/MemorySanitizer/bmi.ll 
b/llvm/test/Instrumentation/MemorySanitizer/bmi.ll index 46bec2956c73c..208546ec56246 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/bmi.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/bmi.ll @@ -110,9 +110,9 @@ define i32 @Test_pdep_32(i32 %a, i32 %b) sanitize_memory { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP0]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i32 -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.x86.bmi.pdep.32(i32 [[TMP1]], i32 [[B]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.pdep.i32(i32 [[TMP1]], i32 [[B]]) ; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[C:%.*]] = tail call i32 @llvm.x86.bmi.pdep.32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: [[C:%.*]] = call i32 @llvm.pdep.i32(i32 [[A]], i32 [[B]]) ; CHECK-NEXT: store i32 [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[C]] ; @@ -131,9 +131,9 @@ define i64 @Test_pdep_64(i64 %a, i64 %b) sanitize_memory { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[TMP0]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.x86.bmi.pdep.64(i64 [[TMP1]], i64 [[B]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.pdep.i64(i64 [[TMP1]], i64 [[B]]) ; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[C:%.*]] = tail call i64 @llvm.x86.bmi.pdep.64(i64 [[A]], i64 [[B]]) +; CHECK-NEXT: [[C:%.*]] = call i64 @llvm.pdep.i64(i64 [[A]], i64 [[B]]) ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[C]] ; @@ -152,9 +152,9 @@ define i32 @Test_pext_32(i32 %a, i32 %b) sanitize_memory { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP0]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i32 -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.x86.bmi.pext.32(i32 [[TMP1]], i32 [[B]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.pext.i32(i32 [[TMP1]], i32 [[B]]) ; 
CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[C:%.*]] = tail call i32 @llvm.x86.bmi.pext.32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: [[C:%.*]] = call i32 @llvm.pext.i32(i32 [[A]], i32 [[B]]) ; CHECK-NEXT: store i32 [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[C]] ; @@ -173,9 +173,9 @@ define i64 @Test_pext_64(i64 %a, i64 %b) sanitize_memory { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[TMP0]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.x86.bmi.pext.64(i64 [[TMP1]], i64 [[B]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.pext.i64(i64 [[TMP1]], i64 [[B]]) ; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[C:%.*]] = tail call i64 @llvm.x86.bmi.pext.64(i64 [[A]], i64 [[B]]) +; CHECK-NEXT: [[C:%.*]] = call i64 @llvm.pext.i64(i64 [[A]], i64 [[B]]) ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[C]] ; diff --git a/llvm/test/Instrumentation/MemorySanitizer/pdep.ll b/llvm/test/Instrumentation/MemorySanitizer/pdep.ll index 5a94f6abfa773..f323f386d0f50 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/pdep.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/pdep.ll @@ -7,10 +7,13 @@ target triple = "x86_64-unknown-linux-gnu" define i8 @Test_pdep_8(i8 %a, i8 %b) sanitize_memory { ; CHECK-LABEL: define i8 @Test_pdep_8( ; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP6:%.*]] = or i8 [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[TMP7]] to i8 +; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.pdep.i8(i8 
[[TMP2]], i8 [[B]]) +; CHECK-NEXT: [[TMP6:%.*]] = or i8 [[TMP4]], [[TMP5]] ; CHECK-NEXT: [[C:%.*]] = tail call i8 @llvm.pdep.i8(i8 [[A]], i8 [[B]]) ; CHECK-NEXT: store i8 [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i8 [[C]] @@ -23,10 +26,13 @@ define i8 @Test_pdep_8(i8 %a, i8 %b) sanitize_memory { define i16 @Test_pdep_16(i16 %a, i16 %b) sanitize_memory { ; CHECK-LABEL: define i16 @Test_pdep_16( ; CHECK-SAME: i16 [[A:%.*]], i16 [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP6:%.*]] = or i16 [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i16 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[TMP7]] to i16 +; CHECK-NEXT: [[TMP5:%.*]] = call i16 @llvm.pdep.i16(i16 [[TMP2]], i16 [[B]]) +; CHECK-NEXT: [[TMP6:%.*]] = or i16 [[TMP4]], [[TMP5]] ; CHECK-NEXT: [[C:%.*]] = tail call i16 @llvm.pdep.i16(i16 [[A]], i16 [[B]]) ; CHECK-NEXT: store i16 [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i16 [[C]] @@ -39,10 +45,13 @@ define i16 @Test_pdep_16(i16 %a, i16 %b) sanitize_memory { define i32 @Test_pdep_32(i32 %a, i32 %b) sanitize_memory { ; CHECK-LABEL: define i32 @Test_pdep_32( ; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP6:%.*]] = or i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i32 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[TMP7]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.pdep.i32(i32 [[TMP2]], i32 [[B]]) +; 
CHECK-NEXT: [[TMP6:%.*]] = or i32 [[TMP4]], [[TMP5]] ; CHECK-NEXT: [[C:%.*]] = tail call i32 @llvm.pdep.i32(i32 [[A]], i32 [[B]]) ; CHECK-NEXT: store i32 [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[C]] @@ -55,10 +64,13 @@ define i32 @Test_pdep_32(i32 %a, i32 %b) sanitize_memory { define i64 @Test_pdep_64(i64 %a, i64 %b) sanitize_memory { ; CHECK-LABEL: define i64 @Test_pdep_64( ; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[TMP7]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.pdep.i64(i64 [[TMP2]], i64 [[B]]) +; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP4]], [[TMP5]] ; CHECK-NEXT: [[C:%.*]] = tail call i64 @llvm.pdep.i64(i64 [[A]], i64 [[B]]) ; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[C]] @@ -70,10 +82,13 @@ define i64 @Test_pdep_64(i64 %a, i64 %b) sanitize_memory { define i128 @Test_pdep_128(i128 %a, i128 %b) sanitize_memory { ; CHECK-LABEL: define i128 @Test_pdep_128( ; CHECK-SAME: i128 [[A:%.*]], i128 [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP2:%.*]] = load i128, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i128, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i128, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[TMP7]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = call i128 @llvm.pdep.i128(i128 [[TMP2]], i128 [[B]]) 
+; CHECK-NEXT: [[TMP6:%.*]] = or i128 [[TMP4]], [[TMP5]] ; CHECK-NEXT: [[C:%.*]] = tail call i128 @llvm.pdep.i128(i128 [[A]], i128 [[B]]) ; CHECK-NEXT: store i128 [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i128 [[C]] diff --git a/llvm/test/Instrumentation/MemorySanitizer/pext.ll b/llvm/test/Instrumentation/MemorySanitizer/pext.ll index 72c4834998446..2caf6a47ac93b 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/pext.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/pext.ll @@ -7,10 +7,13 @@ target triple = "x86_64-unknown-linux-gnu" define i8 @Test_pext_8(i8 %a, i8 %b) sanitize_memory { ; CHECK-LABEL: define i8 @Test_pext_8( ; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP6:%.*]] = or i8 [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[TMP7]] to i8 +; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.pext.i8(i8 [[TMP2]], i8 [[B]]) +; CHECK-NEXT: [[TMP6:%.*]] = or i8 [[TMP4]], [[TMP5]] ; CHECK-NEXT: [[C:%.*]] = tail call i8 @llvm.pext.i8(i8 [[A]], i8 [[B]]) ; CHECK-NEXT: store i8 [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i8 [[C]] @@ -23,10 +26,13 @@ define i8 @Test_pext_8(i8 %a, i8 %b) sanitize_memory { define i16 @Test_pext_16(i16 %a, i16 %b) sanitize_memory { ; CHECK-LABEL: define i16 @Test_pext_16( ; CHECK-SAME: i16 [[A:%.*]], i16 [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: 
[[TMP6:%.*]] = or i16 [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i16 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[TMP7]] to i16 +; CHECK-NEXT: [[TMP5:%.*]] = call i16 @llvm.pext.i16(i16 [[TMP2]], i16 [[B]]) +; CHECK-NEXT: [[TMP6:%.*]] = or i16 [[TMP4]], [[TMP5]] ; CHECK-NEXT: [[C:%.*]] = tail call i16 @llvm.pext.i16(i16 [[A]], i16 [[B]]) ; CHECK-NEXT: store i16 [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i16 [[C]] @@ -39,10 +45,13 @@ define i16 @Test_pext_16(i16 %a, i16 %b) sanitize_memory { define i32 @Test_pext_32(i32 %a, i32 %b) sanitize_memory { ; CHECK-LABEL: define i32 @Test_pext_32( ; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP6:%.*]] = or i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i32 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[TMP7]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.pext.i32(i32 [[TMP2]], i32 [[B]]) +; CHECK-NEXT: [[TMP6:%.*]] = or i32 [[TMP4]], [[TMP5]] ; CHECK-NEXT: [[C:%.*]] = tail call i32 @llvm.pext.i32(i32 [[A]], i32 [[B]]) ; CHECK-NEXT: store i32 [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[C]] @@ -55,10 +64,13 @@ define i32 @Test_pext_32(i32 %a, i32 %b) sanitize_memory { define i64 @Test_pext_64(i64 %a, i64 %b) sanitize_memory { ; CHECK-LABEL: define i64 @Test_pext_64( ; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP6:%.*]] = or 
i64 [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[TMP7]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.pext.i64(i64 [[TMP2]], i64 [[B]]) +; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP4]], [[TMP5]] ; CHECK-NEXT: [[C:%.*]] = tail call i64 @llvm.pext.i64(i64 [[A]], i64 [[B]]) ; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[C]] @@ -70,10 +82,13 @@ define i64 @Test_pext_64(i64 %a, i64 %b) sanitize_memory { define i128 @Test_pext_128(i128 %a, i128 %b) sanitize_memory { ; CHECK-LABEL: define i128 @Test_pext_128( ; CHECK-SAME: i128 [[A:%.*]], i128 [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP2:%.*]] = load i128, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i128, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i128, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[TMP7]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = call i128 @llvm.pext.i128(i128 [[TMP2]], i128 [[B]]) +; CHECK-NEXT: [[TMP6:%.*]] = or i128 [[TMP4]], [[TMP5]] ; CHECK-NEXT: [[C:%.*]] = tail call i128 @llvm.pext.i128(i128 [[A]], i128 [[B]]) ; CHECK-NEXT: store i128 [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i128 [[C]] diff --git a/llvm/test/Transforms/InstCombine/pdep.ll b/llvm/test/Transforms/InstCombine/pdep.ll index ceb4d1f97b6b0..b726e87a6168c 100644 --- a/llvm/test/Transforms/InstCombine/pdep.ll +++ b/llvm/test/Transforms/InstCombine/pdep.ll @@ -3,8 +3,7 @@ define i32 @test_pdep_32_zero_mask(i32 %x) nounwind readnone { ; CHECK-LABEL: @test_pdep_32_zero_mask( -; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.pdep.i32(i32 [[X:%.*]], i32 0) -; CHECK-NEXT: ret i32 [[TMP1]] +; CHECK-NEXT: ret i32 0 ; %1 = tail call i32 @llvm.pdep.i32(i32 %x, i32 0) ret 
i32 %1 @@ -12,8 +11,7 @@ define i32 @test_pdep_32_zero_mask(i32 %x) nounwind readnone { define i64 @test_pdep_64_zero_mask(i64 %x) nounwind readnone { ; CHECK-LABEL: @test_pdep_64_zero_mask( -; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.pdep.i64(i64 [[X:%.*]], i64 0) -; CHECK-NEXT: ret i64 [[TMP1]] +; CHECK-NEXT: ret i64 0 ; %1 = tail call i64 @llvm.pdep.i64(i64 %x, i64 0) ret i64 %1 @@ -21,8 +19,7 @@ define i64 @test_pdep_64_zero_mask(i64 %x) nounwind readnone { define i32 @test_pdep_32_allones_mask(i32 %x) nounwind readnone { ; CHECK-LABEL: @test_pdep_32_allones_mask( -; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.pdep.i32(i32 [[X:%.*]], i32 -1) -; CHECK-NEXT: ret i32 [[TMP1]] +; CHECK-NEXT: ret i32 [[TMP1:%.*]] ; %1 = tail call i32 @llvm.pdep.i32(i32 %x, i32 -1) ret i32 %1 @@ -30,8 +27,7 @@ define i32 @test_pdep_32_allones_mask(i32 %x) nounwind readnone { define i64 @test_pdep_64_allones_mask(i64 %x) nounwind readnone { ; CHECK-LABEL: @test_pdep_64_allones_mask( -; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.pdep.i64(i64 [[X:%.*]], i64 -1) -; CHECK-NEXT: ret i64 [[TMP1]] +; CHECK-NEXT: ret i64 [[TMP1:%.*]] ; %1 = tail call i64 @llvm.pdep.i64(i64 %x, i64 -1) ret i64 %1 @@ -39,7 +35,8 @@ define i64 @test_pdep_64_allones_mask(i64 %x) nounwind readnone { define i32 @test_pdep_32_shifted_mask(i32 %x) nounwind readnone { ; CHECK-LABEL: @test_pdep_32_shifted_mask( -; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.pdep.i32(i32 [[X:%.*]], i32 12) +; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[X:%.*]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP2]], 12 ; CHECK-NEXT: ret i32 [[TMP1]] ; %1 = tail call i32 @llvm.pdep.i32(i32 %x, i32 12) @@ -48,7 +45,8 @@ define i32 @test_pdep_32_shifted_mask(i32 %x) nounwind readnone { define i64 @test_pdep_64_shifted_mask(i64 %x) nounwind readnone { ; CHECK-LABEL: @test_pdep_64_shifted_mask( -; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.pdep.i64(i64 [[X:%.*]], i64 12) +; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[X:%.*]], 2 +; 
CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP2]], 12 ; CHECK-NEXT: ret i64 [[TMP1]] ; %1 = tail call i64 @llvm.pdep.i64(i64 %x, i64 12) @@ -57,8 +55,7 @@ define i64 @test_pdep_64_shifted_mask(i64 %x) nounwind readnone { define i32 @test_pdep_32_constant_fold() nounwind readnone { ; CHECK-LABEL: @test_pdep_32_constant_fold( -; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.pdep.i32(i32 1985229328, i32 -252645136) -; CHECK-NEXT: ret i32 [[TMP1]] +; CHECK-NEXT: ret i32 807407616 ; %1 = tail call i32 @llvm.pdep.i32(i32 1985229328, i32 4042322160) ret i32 %1 @@ -66,8 +63,7 @@ define i32 @test_pdep_32_constant_fold() nounwind readnone { define i64 @test_pdep_64_constant_fold() nounwind readnone { ; CHECK-LABEL: @test_pdep_64_constant_fold( -; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.pdep.i64(i64 8526495043095935640, i64 -1085102592571150096) -; CHECK-NEXT: ret i64 [[TMP1]] +; CHECK-NEXT: ret i64 -1089641583808049024 ; %1 = tail call i64 @llvm.pdep.i64(i64 8526495043095935640, i64 -1085102592571150096) ret i64 %1 @@ -75,8 +71,7 @@ define i64 @test_pdep_64_constant_fold() nounwind readnone { define i32 @test_pdep_32_constant_fold_2() nounwind readnone { ; CHECK-LABEL: @test_pdep_32_constant_fold_2( -; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.pdep.i32(i32 1985229328, i32 -16776961) -; CHECK-NEXT: ret i32 [[TMP1]] +; CHECK-NEXT: ret i32 838860816 ; %1 = tail call i32 @llvm.pdep.i32(i32 1985229328, i32 4278190335) ret i32 %1 @@ -84,8 +79,7 @@ define i32 @test_pdep_32_constant_fold_2() nounwind readnone { define i64 @test_pdep_64_constant_fold_2() nounwind readnone { ; CHECK-LABEL: @test_pdep_64_constant_fold_2( -; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.pdep.i64(i64 8526495043095935640, i64 -72056498804490496) -; CHECK-NEXT: ret i64 [[TMP1]] +; CHECK-NEXT: ret i64 -144114243170822144 ; %1 = tail call i64 @llvm.pdep.i64(i64 8526495043095935640, i64 -72056498804490496) ret i64 %1 diff --git a/llvm/test/Transforms/InstCombine/pext.ll 
b/llvm/test/Transforms/InstCombine/pext.ll index 52baa9a171c62..0f13f3f542023 100644 --- a/llvm/test/Transforms/InstCombine/pext.ll +++ b/llvm/test/Transforms/InstCombine/pext.ll @@ -3,8 +3,7 @@ define i32 @test_pext_32_zero_mask(i32 %x) nounwind readnone { ; CHECK-LABEL: @test_pext_32_zero_mask( -; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.pext.i32(i32 [[X:%.*]], i32 0) -; CHECK-NEXT: ret i32 [[TMP1]] +; CHECK-NEXT: ret i32 0 ; %1 = tail call i32 @llvm.pext.i32(i32 %x, i32 0) ret i32 %1 @@ -12,8 +11,7 @@ define i32 @test_pext_32_zero_mask(i32 %x) nounwind readnone { define i64 @test_pext_64_zero_mask(i64 %x) nounwind readnone { ; CHECK-LABEL: @test_pext_64_zero_mask( -; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.pext.i64(i64 [[X:%.*]], i64 0) -; CHECK-NEXT: ret i64 [[TMP1]] +; CHECK-NEXT: ret i64 0 ; %1 = tail call i64 @llvm.pext.i64(i64 %x, i64 0) ret i64 %1 @@ -21,8 +19,7 @@ define i64 @test_pext_64_zero_mask(i64 %x) nounwind readnone { define i32 @test_pext_32_allones_mask(i32 %x) nounwind readnone { ; CHECK-LABEL: @test_pext_32_allones_mask( -; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.pext.i32(i32 [[X:%.*]], i32 -1) -; CHECK-NEXT: ret i32 [[TMP1]] +; CHECK-NEXT: ret i32 [[TMP1:%.*]] ; %1 = tail call i32 @llvm.pext.i32(i32 %x, i32 -1) ret i32 %1 @@ -30,8 +27,7 @@ define i32 @test_pext_32_allones_mask(i32 %x) nounwind readnone { define i64 @test_pext_64_allones_mask(i64 %x) nounwind readnone { ; CHECK-LABEL: @test_pext_64_allones_mask( -; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.pext.i64(i64 [[X:%.*]], i64 -1) -; CHECK-NEXT: ret i64 [[TMP1]] +; CHECK-NEXT: ret i64 [[TMP1:%.*]] ; %1 = tail call i64 @llvm.pext.i64(i64 %x, i64 -1) ret i64 %1 @@ -39,7 +35,8 @@ define i64 @test_pext_64_allones_mask(i64 %x) nounwind readnone { define i32 @test_pext_32_shifted_mask(i32 %x) nounwind readnone { ; CHECK-LABEL: @test_pext_32_shifted_mask( -; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.pext.i32(i32 [[X:%.*]], i32 6) +; CHECK-NEXT: [[TMP2:%.*]] = lshr 
i32 [[X:%.*]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP2]], 3 ; CHECK-NEXT: ret i32 [[TMP1]] ; %1 = tail call i32 @llvm.pext.i32(i32 %x, i32 6) @@ -48,7 +45,8 @@ define i32 @test_pext_32_shifted_mask(i32 %x) nounwind readnone { define i64 @test_pext_64_shifted_mask(i64 %x) nounwind readnone { ; CHECK-LABEL: @test_pext_64_shifted_mask( -; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.pext.i64(i64 [[X:%.*]], i64 6) +; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[X:%.*]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP2]], 3 ; CHECK-NEXT: ret i64 [[TMP1]] ; %1 = tail call i64 @llvm.pext.i64(i64 %x, i64 6) @@ -58,8 +56,7 @@ define i64 @test_pext_64_shifted_mask(i64 %x) nounwind readnone { define i32 @test_pext_32_constant_fold() nounwind readnone { ; CHECK-LABEL: @test_pext_32_constant_fold( -; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.pext.i32(i32 1985229328, i32 -252645136) -; CHECK-NEXT: ret i32 [[TMP1]] +; CHECK-NEXT: ret i32 30001 ; %1 = tail call i32 @llvm.pext.i32(i32 1985229328, i32 4042322160) ret i32 %1 @@ -67,8 +64,7 @@ define i32 @test_pext_32_constant_fold() nounwind readnone { define i64 @test_pext_64_constant_fold() nounwind readnone { ; CHECK-LABEL: @test_pext_64_constant_fold( -; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.pext.i64(i64 8526495043095935640, i64 -1085102592571150096) -; CHECK-NEXT: ret i64 [[TMP1]] +; CHECK-NEXT: ret i64 1966210489 ; %1 = tail call i64 @llvm.pext.i64(i64 8526495043095935640, i64 -1085102592571150096) ret i64 %1 @@ -76,8 +72,7 @@ define i64 @test_pext_64_constant_fold() nounwind readnone { define i32 @test_pext_32_constant_fold_2() nounwind readnone { ; CHECK-LABEL: @test_pext_32_constant_fold_2( -; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.pext.i32(i32 1985229328, i32 -16776961) -; CHECK-NEXT: ret i32 [[TMP1]] +; CHECK-NEXT: ret i32 30224 ; %1 = tail call i32 @llvm.pext.i32(i32 1985229328, i32 4278190335) ret i32 %1 @@ -85,8 +80,7 @@ define i32 @test_pext_32_constant_fold_2() nounwind readnone { define i64 
@test_pext_64_constant_fold_2() nounwind readnone { ; CHECK-LABEL: @test_pext_64_constant_fold_2( -; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.pext.i64(i64 8526495043095935640, i64 -72056498804490496) -; CHECK-NEXT: ret i64 [[TMP1]] +; CHECK-NEXT: ret i64 1980816570 ; %1 = tail call i64 @llvm.pext.i64(i64 8526495043095935640, i64 -72056498804490496) ret i64 %1 From f0134cc7a5a56b53dfbe2887cd759806845c8797 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 19 Jun 2026 08:29:03 +0200 Subject: [PATCH 003/149] AMDGPU: Add subtarget feature for controllable xnack modes (#204523) This replaces the previously removed xnack-any-only feature, with the inversion xnack-on-off-modes. All pre-gfx12.5 xnack targets support the controllable mode. Ignore explicitly set xnack settings the same way as is done for xnack requests on other unsupported targets. --- clang/lib/Basic/TargetID.cpp | 3 +- clang/lib/Driver/ToolChains/AMDGPU.cpp | 7 +-- clang/test/Driver/invalid-target-id.cl | 21 +++++++++ .../llvm/TargetParser/AMDGPUTargetParser.def | 44 +++++++++---------- .../llvm/TargetParser/AMDGPUTargetParser.h | 4 +- llvm/lib/Target/AMDGPU/AMDGPU.td | 20 ++++++--- .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 1 + llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 4 +- .../MCTargetDesc/AMDGPUTargetStreamer.h | 10 +---- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 24 +++++----- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 3 +- .../AMDGPU/target-id-xnack-always-on.ll | 22 ++++++++++ 12 files changed, 105 insertions(+), 58 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/target-id-xnack-always-on.ll diff --git a/clang/lib/Basic/TargetID.cpp b/clang/lib/Basic/TargetID.cpp index 6d9ba55ccd9d7..67f429607ef27 100644 --- a/clang/lib/Basic/TargetID.cpp +++ b/clang/lib/Basic/TargetID.cpp @@ -32,7 +32,8 @@ getAllPossibleAMDGPUTargetIDFeatures(const llvm::Triple &T, : llvm::AMDGPU::getArchAttrR600(ProcKind); if (Features & llvm::AMDGPU::FEATURE_SRAMECC) Ret.push_back("sramecc"); - if 
(Features & llvm::AMDGPU::FEATURE_XNACK) + // Only allow xnack in target ID if the processor supports on/off modes. + if (Features & llvm::AMDGPU::FEATURE_XNACK_ON_OFF_MODES) Ret.push_back("xnack"); return Ret; } diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index ddc26604a8006..b57579f135b36 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -1130,9 +1130,10 @@ static bool isXnackAvailable(const llvm::Triple &TT, llvm::StringRef TargetID) { auto Features = TT.isAMDGCN() ? llvm::AMDGPU::getArchAttrAMDGCN(ProcKind) : llvm::AMDGPU::getArchAttrR600(ProcKind); - // If processor has xnack always on, Address sanitizer is supported - bool XnackAvailable = (Features & llvm::AMDGPU::FEATURE_XNACK_ALWAYS); - if (XnackAvailable) + // If processor has xnack but doesn't support on/off modes, xnack is always on + bool XnackAlwaysOn = (Features & llvm::AMDGPU::FEATURE_XNACK) && + !(Features & llvm::AMDGPU::FEATURE_XNACK_ON_OFF_MODES); + if (XnackAlwaysOn) return true; // Otherwise, check if xnack+ is explicitly enabled in the target ID diff --git a/clang/test/Driver/invalid-target-id.cl b/clang/test/Driver/invalid-target-id.cl index 4f6f140437885..f93e618e460be 100644 --- a/clang/test/Driver/invalid-target-id.cl +++ b/clang/test/Driver/invalid-target-id.cl @@ -39,3 +39,24 @@ // RUN: %s 2>&1 | FileCheck -check-prefix=NOCOLON %s // NOCOLON: error: invalid target ID 'gfx900+xnack' + +// gfx1250 and gfx12-5-generic do not support xnack on/off modes +// RUN: not %clang -target amdgcn-amd-amdhsa \ +// RUN: -mcpu=gfx1250:xnack+ -nostdlib \ +// RUN: %s 2>&1 | FileCheck -check-prefix=XNACK-MODE-GFX1250 %s + +// RUN: not %clang -target amdgcn-amd-amdhsa \ +// RUN: -mcpu=gfx1250:xnack- -nostdlib \ +// RUN: %s 2>&1 | FileCheck -check-prefix=XNACK-MODE-GFX1250 %s + +// XNACK-MODE-GFX1250: error: invalid target ID 'gfx1250:xnack{{[+-]}}' + +// RUN: not %clang -target amdgcn-amd-amdhsa \ +// RUN: 
-mcpu=gfx12-5-generic:xnack+ -nostdlib \ +// RUN: %s 2>&1 | FileCheck -check-prefix=XNACK-MODE-GFX125 %s + +// RUN: not %clang -target amdgcn-amd-amdhsa \ +// RUN: -mcpu=gfx12-5-generic:xnack- -nostdlib \ +// RUN: %s 2>&1 | FileCheck -check-prefix=XNACK-MODE-GFX125 %s + +// XNACK-MODE-GFX125: error: invalid target ID 'gfx12-5-generic:xnack{{[+-]}}' diff --git a/llvm/include/llvm/TargetParser/AMDGPUTargetParser.def b/llvm/include/llvm/TargetParser/AMDGPUTargetParser.def index d15fc01f30019..dcc0c28b1ee74 100644 --- a/llvm/include/llvm/TargetParser/AMDGPUTargetParser.def +++ b/llvm/include/llvm/TargetParser/AMDGPUTargetParser.def @@ -76,7 +76,7 @@ AMDGCN_GPU_ALIAS("mullins", GK_GFX703) AMDGCN_GPU ("gfx704", GK_GFX704, ( 7, 0, 4), FEATURE_NONE) AMDGCN_GPU_ALIAS("bonaire", GK_GFX704) AMDGCN_GPU ("gfx705", GK_GFX705, ( 7, 0, 5), FEATURE_NONE) -AMDGCN_GPU ("gfx801", GK_GFX801, ( 8, 0, 1), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK) +AMDGCN_GPU ("gfx801", GK_GFX801, ( 8, 0, 1), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES) AMDGCN_GPU_ALIAS("carrizo", GK_GFX801) AMDGCN_GPU ("gfx802", GK_GFX802, ( 8, 0, 2), FEATURE_FAST_DENORMAL_F32) AMDGCN_GPU_ALIAS("iceland", GK_GFX802) @@ -87,22 +87,22 @@ AMDGCN_GPU_ALIAS("polaris10", GK_GFX803) AMDGCN_GPU_ALIAS("polaris11", GK_GFX803) AMDGCN_GPU ("gfx805", GK_GFX805, ( 8, 0, 5), FEATURE_FAST_DENORMAL_F32) AMDGCN_GPU_ALIAS("tongapro", GK_GFX805) -AMDGCN_GPU ("gfx810", GK_GFX810, ( 8, 1, 0), FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK) +AMDGCN_GPU ("gfx810", GK_GFX810, ( 8, 1, 0), FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES) AMDGCN_GPU_ALIAS("stoney", GK_GFX810) -AMDGCN_GPU ("gfx900", GK_GFX900, ( 9, 0, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK) -AMDGCN_GPU ("gfx902", GK_GFX902, ( 9, 0, 2), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK) -AMDGCN_GPU ("gfx904", GK_GFX904, ( 9, 0, 4), 
FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK) -AMDGCN_GPU ("gfx906", GK_GFX906, ( 9, 0, 6), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC) -AMDGCN_GPU ("gfx908", GK_GFX908, ( 9, 0, 8), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC) -AMDGCN_GPU ("gfx909", GK_GFX909, ( 9, 0, 9), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK) -AMDGCN_GPU ("gfx90a", GK_GFX90A, ( 9, 0, 10), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC) -AMDGCN_GPU ("gfx90c", GK_GFX90C, ( 9, 0, 12), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK) -AMDGCN_GPU ("gfx942", GK_GFX942, ( 9, 4, 2), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC) -AMDGCN_GPU ("gfx950", GK_GFX950, ( 9, 5, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC) -AMDGCN_GPU ("gfx1010", GK_GFX1010, (10, 1, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP) -AMDGCN_GPU ("gfx1011", GK_GFX1011, (10, 1, 1), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP) -AMDGCN_GPU ("gfx1012", GK_GFX1012, (10, 1, 2), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP) -AMDGCN_GPU ("gfx1013", GK_GFX1013, (10, 1, 3), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP) +AMDGCN_GPU ("gfx900", GK_GFX900, ( 9, 0, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES) +AMDGCN_GPU ("gfx902", GK_GFX902, ( 9, 0, 2), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES) +AMDGCN_GPU ("gfx904", GK_GFX904, ( 9, 0, 4), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES) +AMDGCN_GPU ("gfx906", GK_GFX906, ( 9, 0, 6), 
FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES|FEATURE_SRAMECC) +AMDGCN_GPU ("gfx908", GK_GFX908, ( 9, 0, 8), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES|FEATURE_SRAMECC) +AMDGCN_GPU ("gfx909", GK_GFX909, ( 9, 0, 9), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES) +AMDGCN_GPU ("gfx90a", GK_GFX90A, ( 9, 0, 10), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES|FEATURE_SRAMECC) +AMDGCN_GPU ("gfx90c", GK_GFX90C, ( 9, 0, 12), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES) +AMDGCN_GPU ("gfx942", GK_GFX942, ( 9, 4, 2), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES|FEATURE_SRAMECC) +AMDGCN_GPU ("gfx950", GK_GFX950, ( 9, 5, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES|FEATURE_SRAMECC) +AMDGCN_GPU ("gfx1010", GK_GFX1010, (10, 1, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES|FEATURE_WGP) +AMDGCN_GPU ("gfx1011", GK_GFX1011, (10, 1, 1), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES|FEATURE_WGP) +AMDGCN_GPU ("gfx1012", GK_GFX1012, (10, 1, 2), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES|FEATURE_WGP) +AMDGCN_GPU ("gfx1013", GK_GFX1013, (10, 1, 3), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES|FEATURE_WGP) AMDGCN_GPU ("gfx1030", GK_GFX1030, (10, 3, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP) AMDGCN_GPU ("gfx1031", GK_GFX1031, (10, 3, 1), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP) AMDGCN_GPU ("gfx1032", GK_GFX1032, (10, 3, 2), 
FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP) @@ -123,8 +123,8 @@ AMDGCN_GPU ("gfx1171", GK_GFX1171, (11, 7, 1), FEATURE_FAST_FMA_F32|FEAT AMDGCN_GPU ("gfx1172", GK_GFX1172, (11, 7, 2), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP) AMDGCN_GPU ("gfx1200", GK_GFX1200, (12, 0, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP) AMDGCN_GPU ("gfx1201", GK_GFX1201, (12, 0, 1), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP) -AMDGCN_GPU ("gfx1250", GK_GFX1250, (12, 5, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK_ALWAYS|FEATURE_SRAMECC) -AMDGCN_GPU ("gfx1251", GK_GFX1251, (12, 5, 1), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK_ALWAYS|FEATURE_SRAMECC) +AMDGCN_GPU ("gfx1250", GK_GFX1250, (12, 5, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_SRAMECC) +AMDGCN_GPU ("gfx1251", GK_GFX1251, (12, 5, 1), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_SRAMECC) AMDGCN_GPU ("gfx1310", GK_GFX1310, (13, 1, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP) // Generic targets return the lowest common denominator @@ -140,13 +140,13 @@ AMDGCN_GPU ("gfx1310", GK_GFX1310, (13, 1, 0), FEATURE_FAST_FMA_F32|FEAT // // TODO: Split up this API depending on its caller so // generic target handling is more obvious and less risky. 
-AMDGCN_GPU ("gfx9-generic", GK_GFX9_GENERIC, ( 9, 0, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK) -AMDGCN_GPU ("gfx10-1-generic", GK_GFX10_1_GENERIC, (10, 1, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP) +AMDGCN_GPU ("gfx9-generic", GK_GFX9_GENERIC, ( 9, 0, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES) +AMDGCN_GPU ("gfx10-1-generic", GK_GFX10_1_GENERIC, (10, 1, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES|FEATURE_WGP) AMDGCN_GPU ("gfx10-3-generic", GK_GFX10_3_GENERIC, (10, 3, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP) AMDGCN_GPU ("gfx11-generic", GK_GFX11_GENERIC, (11, 0, 3), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP) AMDGCN_GPU ("gfx12-generic", GK_GFX12_GENERIC, (12, 0, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP) -AMDGCN_GPU ("gfx9-4-generic", GK_GFX9_4_GENERIC, ( 9, 4, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC) -AMDGCN_GPU ("gfx12-5-generic", GK_GFX12_5_GENERIC, (12, 5, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK_ALWAYS) +AMDGCN_GPU ("gfx9-4-generic", GK_GFX9_4_GENERIC, ( 9, 4, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES|FEATURE_SRAMECC) +AMDGCN_GPU ("gfx12-5-generic", GK_GFX12_5_GENERIC, (12, 5, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK) #undef AMDGCN_GPU #undef AMDGCN_GPU_ALIAS diff --git a/llvm/include/llvm/TargetParser/AMDGPUTargetParser.h b/llvm/include/llvm/TargetParser/AMDGPUTargetParser.h index 7c192b36b6ec8..1288f4cd69ff0 100644 --- a/llvm/include/llvm/TargetParser/AMDGPUTargetParser.h +++ b/llvm/include/llvm/TargetParser/AMDGPUTargetParser.h @@ -72,8 +72,8 @@ enum ArchFeatureKind : uint32_t { // WGP mode is supported. 
FEATURE_WGP = 1 << 9, - // Xnack is available by default - FEATURE_XNACK_ALWAYS = 1 << 10 + // Xnack on/off modes are supported. + FEATURE_XNACK_ON_OFF_MODES = 1 << 10 }; enum FeatureError : uint32_t { diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 588f63aeffcb9..2abb9c0154947 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -207,6 +207,13 @@ def FeatureSupportsXNACK : SubtargetFeature<"xnack-support", "Hardware supports XNACK" >; +defm XNACKOnOffModes : AMDGPUSubtargetFeature<"xnack-on-off-modes", + "Target supports XNACK on/off modes", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/0, + [FeatureSupportsXNACK] +>; + // XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support // XNACK. The current default kernel driver setting is: // - graphics ring: XNACK disabled @@ -217,7 +224,8 @@ def FeatureSupportsXNACK : SubtargetFeature<"xnack-support", def FeatureXNACK : SubtargetFeature<"xnack", "EnableXNACK", "true", - "Enable XNACK support" + "Enable XNACK support", + [FeatureSupportsXNACK] >; def FeatureTgSplit : SubtargetFeature<"tgsplit", @@ -1503,7 +1511,7 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16, - FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK, + FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureXNACKOnOffModes, FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, FeatureUnalignedDSAccess, FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder, FeatureVMemToLDSLoad, @@ -1683,7 +1691,7 @@ def FeatureISAVersion8_0_1 : FeatureSet< !listconcat(FeatureISAVersion8_0_Common.Features, [FeatureFastFMAF32, FeatureHalfRate64Ops, - 
FeatureSupportsXNACK])>; + FeatureXNACKOnOffModes])>; def FeatureISAVersion8_0_2 : FeatureSet< !listconcat(FeatureISAVersion8_0_Common.Features, @@ -1700,7 +1708,7 @@ def FeatureISAVersion8_0_5 : FeatureSet< def FeatureISAVersion8_1_0 : FeatureSet< [FeatureVolcanicIslands, FeatureLDSBankCount16, - FeatureSupportsXNACK, + FeatureXNACKOnOffModes, FeatureImageStoreD16Bug, FeatureImageGather4D16Bug]>; @@ -1895,7 +1903,7 @@ def FeatureISAVersion10_1_Common : FeatureSet< FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureLDSMisalignedBug, - FeatureSupportsXNACK, + FeatureXNACKOnOffModes, // gfx101x bugs FeatureVcmpxPermlaneHazard, FeatureVMEMtoScalarWriteHazard, @@ -2201,7 +2209,6 @@ def FeatureISAVersion12_50_Common : FeatureSet< FeatureSetPrioIncWgInst, FeatureSWakeupBarrier, Feature45BitNumRecordsBufferResource, - FeatureSupportsXNACK, FeatureXNACK, FeatureClusters, FeatureD16Writes32BitVgpr, @@ -2268,6 +2275,7 @@ def FeatureISAVersion12_5_Generic: FeatureSet< [FeatureAddressableLocalMemorySize327680, FeatureSetregVGPRMSBFixup, FeatureRequiresCOV6, + FeatureSupportsXNACK, FeatureGFX125xLowestRateWMMA, FeatureTransCoexecutionHazard, FeatureWMMACoexecutionHazards])>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index ce7e22436f33f..03a046bcb9142 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -293,6 +293,7 @@ const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = { // Property of the kernel/environment which can't actually differ. 
AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK, + AMDGPU::FeatureXNACKOnOffModes, AMDGPU::FeatureSupportsXNACK, AMDGPU::FeatureTrapHandler, // The default assumption needs to be ecc is enabled, but no directly diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index 14de6753d42e4..55edfc2ea52d2 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -157,8 +157,6 @@ GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, assert(llvm::isPowerOf2_32(InstCacheLineSize) && "InstCacheLineSize must be a power of 2"); - TargetID.setTargetIDFromFeaturesString(FS); - LLVM_DEBUG(dbgs() << "xnack setting for subtarget: " << TargetID.getXnackSetting() << '\n'); LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: " @@ -182,7 +180,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, : // clang-format off AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS), AMDGPUSubtarget(TT), - TargetID(*this), + TargetID(*this, FS), InstrItins(getInstrItineraryForCPU(GPU)), BufferOOBRelaxed(BufferOOBRelaxed), TBufferOOBRelaxed(TBufferOOBRelaxed), diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index ca1fe3ccf3da1..dc9636c6c2105 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -139,15 +139,9 @@ class AMDGPUTargetStreamer : public MCTargetStreamer { std::optional &getTargetID() { return TargetID; } - void initializeTargetID(const MCSubtargetInfo &STI) { - assert(TargetID == std::nullopt && "TargetID can only be initialized once"); - TargetID.emplace(STI); - } void initializeTargetID(const MCSubtargetInfo &STI, StringRef FeatureString) { - initializeTargetID(STI); - - assert(getTargetID() != std::nullopt && "TargetID is None"); - 
getTargetID()->setTargetIDFromFeaturesString(FeatureString); + assert(TargetID == std::nullopt && "TargetID can only be initialized once"); + TargetID.emplace(STI, FeatureString); } }; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index cfa9a59d3ded2..e1e83ece32ad0 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1099,20 +1099,19 @@ VOPD::InstInfo getVOPDInstInfo(unsigned VOPDOpcode, namespace IsaInfo { -AMDGPUTargetID::AMDGPUTargetID(const MCSubtargetInfo &STI) - : STI(STI), XnackSetting(TargetIDSetting::Any), - SramEccSetting(TargetIDSetting::Any) { - if (!STI.getFeatureBits().test(FeatureSupportsXNACK)) - XnackSetting = TargetIDSetting::Unsupported; - if (!STI.getFeatureBits().test(FeatureSupportsSRAMECC)) - SramEccSetting = TargetIDSetting::Unsupported; -} +AMDGPUTargetID::AMDGPUTargetID(const MCSubtargetInfo &STI, + StringRef FeatureString) + : STI(STI), XnackSetting(STI.getFeatureBits().test(FeatureSupportsXNACK) + ? TargetIDSetting::Any + : TargetIDSetting::Unsupported), + SramEccSetting(STI.getFeatureBits().test(FeatureSupportsSRAMECC) + ? TargetIDSetting::Any + : TargetIDSetting::Unsupported) { -void AMDGPUTargetID::setTargetIDFromFeaturesString(StringRef FS) { // Check if xnack or sramecc is explicitly enabled or disabled. In the // absence of the target features we assume we must generate code that can run // in any environment. - SubtargetFeatures Features(FS); + SubtargetFeatures Features(FeatureString); std::optional XnackRequested; std::optional SramEccRequested; @@ -1127,7 +1126,10 @@ void AMDGPUTargetID::setTargetIDFromFeaturesString(StringRef FS) { SramEccRequested = false; } - bool XnackSupported = isXnackSupported(); + // Only allow changing xnack setting if the target supports on/off modes. + // Targets without on/off mode support keep their initial setting (Any). 
+ + bool XnackSupported = STI.getFeatureBits().test(FeatureXNACKOnOffModes); bool SramEccSupported = isSramEccSupported(); if (XnackRequested) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 11c393a623d20..6c771b3460662 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -160,7 +160,7 @@ class AMDGPUTargetID { TargetIDSetting SramEccSetting; public: - explicit AMDGPUTargetID(const MCSubtargetInfo &STI); + explicit AMDGPUTargetID(const MCSubtargetInfo &STI, StringRef FeatureString); ~AMDGPUTargetID() = default; /// \return True if the current xnack setting is not "Unsupported". @@ -217,7 +217,6 @@ class AMDGPUTargetID { SramEccSetting = NewSramEccSetting; } - void setTargetIDFromFeaturesString(StringRef FS); void setTargetIDFromTargetIDStream(StringRef TargetID); /// Write string representation to \p OS diff --git a/llvm/test/CodeGen/AMDGPU/target-id-xnack-always-on.ll b/llvm/test/CodeGen/AMDGPU/target-id-xnack-always-on.ll new file mode 100644 index 0000000000000..13d13c875b8aa --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/target-id-xnack-always-on.ll @@ -0,0 +1,22 @@ +; gfx1250, gfx1251, and gfx12-5-generic have xnack always on because they don't +; support on/off modes (no FeatureXNACKOnOffModes). The target ID should not +; include xnack modifiers regardless of -mattr settings. 
+ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1251 < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx12-5-generic < %s | FileCheck --check-prefix=CHECK %s + +; Even with -mattr=+xnack or -mattr=-xnack, the target ID doesn't change +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+xnack < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-xnack < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1251 -mattr=+xnack < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1251 -mattr=-xnack < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx12-5-generic -mattr=+xnack < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx12-5-generic -mattr=-xnack < %s | FileCheck %s + +; CHECK: .amdgcn_target "amdgcn-amd-amdhsa--gfx{{1250|1251|12-5-generic}}" + +define void @func0() { +entry: + ret void +} From b65d5100d712e49abe85b337b03663ec1dcbd266 Mon Sep 17 00:00:00 2001 From: Kevin Sala Penades Date: Thu, 18 Jun 2026 23:43:38 -0700 Subject: [PATCH 004/149] [offload][OpenMP] Fix record replay when no memory is used (#201771) Programs that do not use any memory (e.g., no mappings) were failing because we were trying to execute zero size transfers. This commit adds handling for this case. 
--- offload/libomptarget/omptarget.cpp | 30 +++++++++------ .../common/src/RecordReplay.cpp | 37 +++++++++---------- .../record-replay-empty-memory.cpp | 26 +++++++++++++ .../kernelreplay/llvm-omp-kernel-replay.cpp | 3 +- 4 files changed, 64 insertions(+), 32 deletions(-) create mode 100644 offload/test/tools/omp-kernel-replay/record-replay-empty-memory.cpp diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp index d18b8e38b7808..84b7554253d20 100644 --- a/offload/libomptarget/omptarget.cpp +++ b/offload/libomptarget/omptarget.cpp @@ -2440,6 +2440,7 @@ int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr, // Initialize the device memory of each global. for (int32_t I = 0; I < NumGlobals; ++I) { assert(Globals[I].AuxAddr && "Global has no AuxAddr."); + assert(Globals[I].Size && "Global has Size zero."); // Initialize the value of the global in the device. int Ret = Device.submitData(Symbols[I + 1].DevPtr, Globals[I].AuxAddr, @@ -2450,25 +2451,30 @@ int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr, } } - // Reuse a previous device allocation or allocate a new device buffer. + // Reuse a previous device allocation or allocate a new device buffer. Do not + // allocate anything if the size is zero. void *&TgtPtr = ReuseDeviceAlloc; - if (!TgtPtr) + if (!TgtPtr && DeviceMemorySize) { TgtPtr = Device.allocData(DeviceMemorySize, /*HstPtr=*/nullptr, TARGET_ALLOC_DEFAULT); - if (!TgtPtr) { - REPORT() << "Failed to allocate device memory."; - return OFFLOAD_FAIL; + if (!TgtPtr) { + REPORT() << "Failed to allocate device memory."; + return OFFLOAD_FAIL; + } } // Save the device allocation for future replays of the same kernel. if (ReplayOutcome) ReplayOutcome->ReplayDeviceAlloc = TgtPtr; - int Ret = - Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo); - if (Ret != OFFLOAD_SUCCESS) { - REPORT() << "Failed to submit data to a global."; - return OFFLOAD_FAIL; + // Initialize the device memory. 
+ if (DeviceMemorySize) { + int Ret = + Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo); + if (Ret != OFFLOAD_SUCCESS) { + REPORT() << "Failed to submit data to the device memory."; + return OFFLOAD_FAIL; + } } KernelArgsTy KernelArgs{}; @@ -2487,8 +2493,8 @@ int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr, KernelExtraArgsTy KernelExtraArgs{}; KernelExtraArgs.ReplayOutcome = ReplayOutcome; - Ret = Device.launchKernel(Symbols[0].DevPtr, TgtArgs, TgtOffsets, KernelArgs, - &KernelExtraArgs, AsyncInfo); + int Ret = Device.launchKernel(Symbols[0].DevPtr, TgtArgs, TgtOffsets, + KernelArgs, &KernelExtraArgs, AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { REPORT() << "Failed to launch kernel replay."; return OFFLOAD_FAIL; diff --git a/offload/plugins-nextgen/common/src/RecordReplay.cpp b/offload/plugins-nextgen/common/src/RecordReplay.cpp index 7cfd39288307b..bd93f79bb8bad 100644 --- a/offload/plugins-nextgen/common/src/RecordReplay.cpp +++ b/offload/plugins-nextgen/common/src/RecordReplay.cpp @@ -338,23 +338,24 @@ Error NativeRecordReplayTy::recordSnapshot(StringRef Filename) { uint64_t RecordSize = CurrentSize; AllocationLock.unlock(); - ErrorOr> DeviceMemoryMB = - WritableMemoryBuffer::getNewUninitMemBuffer(RecordSize); - if (!DeviceMemoryMB) - return Plugin::error(ErrorCode::OUT_OF_RESOURCES, - "creating MemoryBuffer for device memory"); - - if (auto Err = Device.dataRetrieve(DeviceMemoryMB.get()->getBufferStart(), - StartAddr, RecordSize, nullptr)) - return Err; - - StringRef DeviceMemory(DeviceMemoryMB.get()->getBufferStart(), RecordSize); + std::unique_ptr DeviceMB; + if (RecordSize) { + DeviceMB = WritableMemoryBuffer::getNewUninitMemBuffer(RecordSize); + if (!DeviceMB) + return Plugin::error(ErrorCode::OUT_OF_RESOURCES, + "creating MemoryBuffer for device memory"); + + if (auto Err = Device.dataRetrieve(DeviceMB->getBufferStart(), StartAddr, + RecordSize, nullptr)) + return Err; + } std::error_code EC; raw_fd_ostream OS(Filename, 
EC); if (EC) return Plugin::error(ErrorCode::HOST_IO, "saving memory snapshot file"); - OS << DeviceMemory; + if (DeviceMB) + OS.write(DeviceMB->getBufferStart(), RecordSize); OS.close(); return Plugin::success(); } @@ -389,13 +390,12 @@ Error NativeRecordReplayTy::recordGlobals(StringRef Filename) { NumGlobals++; } - ErrorOr> GlobalsMB = - WritableMemoryBuffer::getNewUninitMemBuffer(TotalSize); + auto GlobalsMB = WritableMemoryBuffer::getNewUninitMemBuffer(TotalSize); if (!GlobalsMB) return Plugin::error(ErrorCode::OUT_OF_RESOURCES, "creating MemoryBuffer for globals memory"); - void *BufferPtr = GlobalsMB.get()->getBufferStart(); + void *BufferPtr = GlobalsMB->getBufferStart(); *((uint32_t *)(BufferPtr)) = NumGlobals; BufferPtr = utils::advancePtr(BufferPtr, sizeof(uint32_t)); @@ -418,16 +418,15 @@ Error NativeRecordReplayTy::recordGlobals(StringRef Filename) { return Err; BufferPtr = utils::advancePtr(BufferPtr, Global.Size); } - assert(BufferPtr == GlobalsMB->get()->getBufferEnd() && + assert(BufferPtr == GlobalsMB->getBufferEnd() && "Buffer over or under-filled."); assert(TotalSize == (uint64_t)utils::getPtrDiff( - BufferPtr, GlobalsMB->get()->getBufferStart()) && + BufferPtr, GlobalsMB->getBufferStart()) && "Buffer size mismatch."); - StringRef GlobalsMemory(GlobalsMB.get()->getBufferStart(), TotalSize); std::error_code EC; raw_fd_ostream OS(Filename, EC); - OS << GlobalsMemory; + OS.write(GlobalsMB->getBufferStart(), TotalSize); OS.close(); return Plugin::success(); } diff --git a/offload/test/tools/omp-kernel-replay/record-replay-empty-memory.cpp b/offload/test/tools/omp-kernel-replay/record-replay-empty-memory.cpp new file mode 100644 index 0000000000000..0705c6d66ac8e --- /dev/null +++ b/offload/test/tools/omp-kernel-replay/record-replay-empty-memory.cpp @@ -0,0 +1,26 @@ +// clang-format off +// RUN: %libomptarget-compilexx-generic +// RUN: rm -rf %t.testdir +// RUN: mkdir -p %t.testdir +// RUN: env LIBOMPTARGET_RECORD=1 
LIBOMPTARGET_RECORD_MEMSIZE=536870912 LIBOMPTARGET_RECORD_DIR=%t.testdir %libomptarget-run-generic 2>&1 | %fcheck-generic +// RUN: ls -t %t.testdir/*.json | sed -n '1p' | grep . | xargs -I {} %omp-kernel-replay --verify {} +// clang-format on + +// REQUIRES: gpu + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: x86_64-unknown-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: intelgpu + +#include +#include + +int main() { +#pragma omp target teams num_teams(256) + { + } + + // CHECK: PASS + printf("PASS\n"); +} diff --git a/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp b/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp index a5bda7a0f0444..4335002fd8c77 100644 --- a/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp +++ b/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp @@ -130,7 +130,8 @@ Error verifyReplayOutput(StringRef RecordOutputFilename, if (!ReplayOutputBufferOrErr) return createErr("failed to read the kernel replay output file"); - // Compare record and replay outputs to verify they match. + // Compare record and replay outputs to verify they match. If they are both + // empty, the verification is successful. StringRef RecordOutput = RecordOutputBufferOrErr.get()->getBuffer(); StringRef ReplayOutput = ReplayOutputBufferOrErr.get()->getBuffer(); if (RecordOutput != ReplayOutput) From 9c50867e78707c7ad9b46b6c2c71ef45ac124bbb Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Fri, 19 Jun 2026 16:45:03 +1000 Subject: [PATCH 005/149] [ORC][examples] Add a new example showing basic symbolAliases usage. (#204733) LLJITWithSymbolAliases shows how the symbolAliases function can be used to introduce aliases for both JIT'd and precompiled symbols. 
--- llvm/examples/OrcV2Examples/CMakeLists.txt | 1 + .../LLJITWithSymbolAliases/CMakeLists.txt | 12 +++ .../LLJITWithSymbolAliases.cpp | 85 +++++++++++++++++++ 3 files changed, 98 insertions(+) create mode 100644 llvm/examples/OrcV2Examples/LLJITWithSymbolAliases/CMakeLists.txt create mode 100644 llvm/examples/OrcV2Examples/LLJITWithSymbolAliases/LLJITWithSymbolAliases.cpp diff --git a/llvm/examples/OrcV2Examples/CMakeLists.txt b/llvm/examples/OrcV2Examples/CMakeLists.txt index f1189e4ef96ca..e365565a6f9c2 100644 --- a/llvm/examples/OrcV2Examples/CMakeLists.txt +++ b/llvm/examples/OrcV2Examples/CMakeLists.txt @@ -8,6 +8,7 @@ add_subdirectory(LLJITWithLazyReexports) add_subdirectory(LLJITWithObjectCache) add_subdirectory(LLJITWithObjectLinkingLayerPlugin) add_subdirectory(LLJITWithOptimizingIRTransform) +add_subdirectory(LLJITWithSymbolAliases) add_subdirectory(LLJITWithThinLTOSummaries) add_subdirectory(OrcV2CBindingsAddObjectFile) add_subdirectory(OrcV2CBindingsBasicUsage) diff --git a/llvm/examples/OrcV2Examples/LLJITWithSymbolAliases/CMakeLists.txt b/llvm/examples/OrcV2Examples/LLJITWithSymbolAliases/CMakeLists.txt new file mode 100644 index 0000000000000..d821eddf6560e --- /dev/null +++ b/llvm/examples/OrcV2Examples/LLJITWithSymbolAliases/CMakeLists.txt @@ -0,0 +1,12 @@ +set(LLVM_LINK_COMPONENTS + Core + ExecutionEngine + IRReader + OrcJIT + Support + nativecodegen + ) + +add_llvm_example(LLJITWithSymbolAliases + LLJITWithSymbolAliases.cpp + ) diff --git a/llvm/examples/OrcV2Examples/LLJITWithSymbolAliases/LLJITWithSymbolAliases.cpp b/llvm/examples/OrcV2Examples/LLJITWithSymbolAliases/LLJITWithSymbolAliases.cpp new file mode 100644 index 0000000000000..50d9e58554b83 --- /dev/null +++ b/llvm/examples/OrcV2Examples/LLJITWithSymbolAliases/LLJITWithSymbolAliases.cpp @@ -0,0 +1,85 @@ +//===-- LLJITWithSymbolAliases.cpp - Symbol aliases with LLJIT ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This example demonstrates how to use the symbolAliases utility to define +// alternate names for symbols already present in a JITDylib. We define two +// aliases: +// +// - "aliased_foo" as an alias for "foo", a function defined in a JIT'd IR +// module. +// - "aliased_bar" as an alias for "bar", a precompiled function added to +// the JITDylib via absoluteSymbols. +// +// We then look up both aliases and call them to confirm that they resolve to +// the original definitions. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/Orc/LLJIT.h" +#include "llvm/Support/InitLLVM.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Support/raw_ostream.h" + +#include "../ExampleModules.h" + +using namespace llvm; +using namespace llvm::orc; + +ExitOnError ExitOnErr; + +// IR module containing the simplest possible function: foo returns 42. +const llvm::StringRef FooMod = + R"( + define i32 @foo() { + entry: + ret i32 42 + } +)"; + +// Precompiled function that we will expose to the JIT via absoluteSymbols. +static int bar() { return 7; } + +int main(int argc, char *argv[]) { + // Initialize LLVM. + InitLLVM X(argc, argv); + + InitializeNativeTarget(); + InitializeNativeTargetAsmPrinter(); + + cl::ParseCommandLineOptions(argc, argv, "LLJITWithSymbolAliases"); + ExitOnErr.setBanner(std::string(argv[0]) + ": "); + + // Create an LLJIT instance and add the IR module containing 'foo'. + auto J = ExitOnErr(LLJITBuilder().create()); + ExitOnErr(J->addIRModule(ExitOnErr(parseExampleModule(FooMod, "foo-mod")))); + + // Add the precompiled 'bar' function as an absolute symbol. 
+ auto &JD = J->getMainJITDylib(); + ExitOnErr(JD.define(absoluteSymbols( + {{J->mangleAndIntern("bar"), + {ExecutorAddr::fromPtr(&bar), + JITSymbolFlags::Exported | JITSymbolFlags::Callable}}}))); + + // Define aliases: 'aliased_foo' -> 'foo' and 'aliased_bar' -> 'bar'. + ExitOnErr(JD.define(symbolAliases( + {{J->mangleAndIntern("aliased_foo"), + {J->mangleAndIntern("foo"), + JITSymbolFlags::Exported | JITSymbolFlags::Callable}}, + {J->mangleAndIntern("aliased_bar"), + {J->mangleAndIntern("bar"), + JITSymbolFlags::Exported | JITSymbolFlags::Callable}}}))); + + // Look up the aliases and call them. + auto AliasedFoo = ExitOnErr(J->lookup("aliased_foo")).toPtr(); + auto AliasedBar = ExitOnErr(J->lookup("aliased_bar")).toPtr(); + + outs() << "aliased_foo() = " << AliasedFoo() << "\n" + << "aliased_bar() = " << AliasedBar() << "\n"; + + return 0; +} From 0ad5d54be53f34f71e25575964785670720a5f4c Mon Sep 17 00:00:00 2001 From: Cullen Rhodes Date: Fri, 19 Jun 2026 08:00:45 +0100 Subject: [PATCH 006/149] [GlobalISel] TableGen memcpy-like prelegalizer combines (#203235) This removes the corresponding handwritten C++ combine handling from the AArch64 prelegalizer combiners. 
Assisted-by: codex --- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 12 +++--- .../include/llvm/Target/GlobalISel/Combine.td | 40 +++++++++++++++++++ .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 19 +++++++-- llvm/lib/Target/AArch64/AArch64Combine.td | 16 +++++++- .../AArch64/GISel/AArch64GlobalISelUtils.cpp | 30 ++++++++------ .../AArch64/GISel/AArch64GlobalISelUtils.h | 9 +++-- .../GISel/AArch64O0PreLegalizerCombiner.cpp | 20 ---------- .../GISel/AArch64PreLegalizerCombiner.cpp | 21 ---------- .../Target/Mips/MipsPreLegalizerCombiner.cpp | 2 +- 9 files changed, 100 insertions(+), 69 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index a04ff991b2cf8..878cf28ae239e 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -358,6 +358,12 @@ class CombinerHelper { /// $whatever = COPY $addr LLVM_ABI bool tryCombineMemCpyFamily(MachineInstr &MI, unsigned MaxLen = 0) const; + LLVM_ABI bool matchCombineMemCpyFamily(MachineInstr &MI, + MemCpyFamilyLoweringInfo &MatchInfo, + unsigned MaxLen = 0) const; + LLVM_ABI void + applyCombineMemCpyFamily(MachineInstr &MI, + MemCpyFamilyLoweringInfo &MatchInfo) const; LLVM_ABI bool matchPtrAddImmedChain(MachineInstr &MI, PtrAddChain &MatchInfo) const; @@ -860,12 +866,6 @@ class CombinerHelper { /// combine functions. Returns true if changed. LLVM_ABI bool tryCombine(MachineInstr &MI) const; - /// Emit loads and stores that perform the given memcpy. 
- /// Assumes \p MI is a G_MEMCPY_INLINE or a G_MEMSET_INLINE - /// TODO: implement dynamically sized inline memcpy, - /// and rename: s/bool tryEmit/void emit/ - LLVM_ABI bool tryEmitMemcpyInlineFamily(MachineInstr &MI) const; - /// Match: /// (G_UMULO x, 2) -> (G_UADDO x, x) /// (G_SMULO x, 2) -> (G_SADDO x, x) diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index d565bfb6696e6..1b0602af68367 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -296,6 +296,46 @@ def combine_indexed_load_store : GICombineRule< [{ return Helper.matchCombineIndexedLoadStore(*${root}, ${matchinfo}); }]), (apply [{ Helper.applyCombineIndexedLoadStore(*${root}, ${matchinfo}); }])>; +def memcpy_family_matchinfo : GIDefMatchData<"MemCpyFamilyLoweringInfo">; +def combine_memcpy_inline : GICombineRule< + (defs root:$root, memcpy_family_matchinfo:$matchinfo), + (match (G_MEMCPY_INLINE $dst_addr, $src_addr, $size):$root, + [{ return Helper.matchCombineMemCpyFamily(*${root}, ${matchinfo}); }]), + (apply [{ Helper.applyCombineMemCpyFamily(*${root}, ${matchinfo}); }])>; + +def combine_memset_inline : GICombineRule< + (defs root:$root, memcpy_family_matchinfo:$matchinfo), + (match (G_MEMSET_INLINE $dst_addr, $value, $size):$root, + [{ return Helper.matchCombineMemCpyFamily(*${root}, ${matchinfo}); }]), + (apply [{ Helper.applyCombineMemCpyFamily(*${root}, ${matchinfo}); }])>; + +def combine_memcpy : GICombineRule< + (defs root:$root, memcpy_family_matchinfo:$matchinfo), + (match (G_MEMCPY $dst_addr, $src_addr, $size, $tailcall):$root, + [{ return Helper.matchCombineMemCpyFamily(*${root}, ${matchinfo}, + CInfo.EnableOpt ? 
0 : 32); }]), + (apply [{ Helper.applyCombineMemCpyFamily(*${root}, ${matchinfo}); }])>; + +def combine_memmove : GICombineRule< + (defs root:$root, memcpy_family_matchinfo:$matchinfo), + (match (G_MEMMOVE $dst_addr, $src_addr, $size, $tailcall):$root, + [{ return Helper.matchCombineMemCpyFamily(*${root}, ${matchinfo}, + CInfo.EnableOpt ? 0 : 32); }]), + (apply [{ Helper.applyCombineMemCpyFamily(*${root}, ${matchinfo}); }])>; + +def combine_memset : GICombineRule< + (defs root:$root, memcpy_family_matchinfo:$matchinfo), + (match (G_MEMSET $dst_addr, $value, $size, $tailcall):$root, + [{ return Helper.matchCombineMemCpyFamily(*${root}, ${matchinfo}, + CInfo.EnableOpt ? 0 : 32); }]), + (apply [{ Helper.applyCombineMemCpyFamily(*${root}, ${matchinfo}); }])>; + +def memcpy_family_combines : GICombineGroup<[combine_memcpy_inline, + combine_memset_inline, + combine_memcpy, + combine_memmove, + combine_memset]>; + def opt_brcond_by_inverting_cond_matchdata : GIDefMatchData<"MachineInstr *">; def opt_brcond_by_inverting_cond : GICombineRule< (defs root:$root, opt_brcond_by_inverting_cond_matchdata:$matchinfo), diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 8ea8efcb3758a..8853a941e137f 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -1721,12 +1721,25 @@ void CombinerHelper::applyOptBrCondByInvertingCond( Observer.changedInstr(*BrCond); } -bool CombinerHelper::tryEmitMemcpyInlineFamily(MachineInstr &MI) const { +bool CombinerHelper::matchCombineMemCpyFamily( + MachineInstr &MI, MemCpyFamilyLoweringInfo &MatchInfo, + unsigned MaxLen) const { + auto &[Dst, Src, KnownLen, Alignment, DstAlignCanChange, MemOps] = MatchInfo; + return canLowerMemCpyFamily(MI, MRI, MaxLen, Dst, Src, KnownLen, Alignment, + DstAlignCanChange, MemOps); +} + +void CombinerHelper::applyCombineMemCpyFamily( + MachineInstr &MI, MemCpyFamilyLoweringInfo &MatchInfo) const { 
+ auto &[Dst, Src, KnownLen, Alignment, DstAlignCanChange, MemOps] = MatchInfo; MachineIRBuilder HelperBuilder(MI); GISelObserverWrapper DummyObserver; LegalizerHelper Helper(HelperBuilder.getMF(), DummyObserver, HelperBuilder); - return Helper.lowerMemCpyFamily(MI) == - LegalizerHelper::LegalizeResult::Legalized; + bool Changed = Helper.lowerMemCpyFamily(MI, Dst, Src, KnownLen, Alignment, + DstAlignCanChange, MemOps) == + LegalizerHelper::LegalizeResult::Legalized; + assert(Changed && "expected memcpy-family instruction to lower"); + (void)Changed; } bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI, diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index a9c447336cd5e..82b9cec960d89 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -68,6 +68,14 @@ def simplify_uaddo : GICombineRule< (apply [{ applySimplifyUADDO(*${root}, MRI, B, Observer, Helper, ${matchinfo}); }])>; +def emit_bzero : GICombineRule< + (defs root:$root), + (match (G_MEMSET $dst_addr, $value, $size, $tailcall):$root, + [{ return llvm::AArch64GISelUtils::matchEmitBZero(*${root}, MRI, + Libcalls, + CInfo.EnableMinSize); }]), + (apply [{ llvm::AArch64GISelUtils::applyEmitBZero(*${root}, B); }])>; + def AArch64PreLegalizerCombiner: GICombiner< "AArch64PreLegalizerCombinerImpl", [all_combines, select_zero_true, @@ -82,13 +90,17 @@ def AArch64PreLegalizerCombiner: GICombiner< push_sub_through_sext, push_add_through_sext, push_mul_through_sext, - simplify_uaddo]> { + simplify_uaddo, + memcpy_family_combines, + emit_bzero]> { let CombineAllMethodName = "tryCombineAllImpl"; } def AArch64O0PreLegalizerCombiner: GICombiner< "AArch64O0PreLegalizerCombinerImpl", [optnone_combines, - combine_shuffle_vector]> { + combine_shuffle_vector, + memcpy_family_combines, + emit_bzero]> { let CombineAllMethodName = "tryCombineAllImpl"; } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp 
b/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp index 74cb5e9bb0729..d0ef3ae72b02d 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp @@ -60,15 +60,27 @@ bool AArch64GISelUtils::isCMN(const MachineInstr *MaybeSub, return MaybeZero && MaybeZero->Value.getZExtValue() == 0; } -bool AArch64GISelUtils::tryEmitBZero(MachineInstr &MI, - MachineIRBuilder &MIRBuilder, - const LibcallLoweringInfo &Libcalls, - bool MinSize) { +void AArch64GISelUtils::applyEmitBZero(MachineInstr &MI, + MachineIRBuilder &MIRBuilder) { + assert(MI.getOpcode() == TargetOpcode::G_MEMSET); + + MIRBuilder.setInstrAndDebugLoc(MI); + MIRBuilder + .buildInstr(TargetOpcode::G_BZERO, {}, + {MI.getOperand(0), MI.getOperand(2)}) + .addImm(MI.getOperand(3).getImm()) + .addMemOperand(*MI.memoperands_begin()); + MI.eraseFromParent(); +} + +bool AArch64GISelUtils::matchEmitBZero(const MachineInstr &MI, + const MachineRegisterInfo &MRI, + const LibcallLoweringInfo &Libcalls, + bool MinSize) { assert(MI.getOpcode() == TargetOpcode::G_MEMSET); if (Libcalls.getLibcallImpl(RTLIB::BZERO) == RTLIB::Unsupported) return false; - MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); auto Zero = getIConstantVRegValWithLookThrough(MI.getOperand(1).getReg(), MRI); if (!Zero || Zero->Value.getSExtValue() != 0) @@ -86,14 +98,6 @@ bool AArch64GISelUtils::tryEmitBZero(MachineInstr &MI, return false; } } - - MIRBuilder.setInstrAndDebugLoc(MI); - MIRBuilder - .buildInstr(TargetOpcode::G_BZERO, {}, - {MI.getOperand(0), MI.getOperand(2)}) - .addImm(MI.getOperand(3).getImm()) - .addMemOperand(*MI.memoperands_begin()); - MI.eraseFromParent(); return true; } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.h b/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.h index fdb7524fb0ed6..0fd955a643fb5 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.h +++ b/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.h @@ 
-44,9 +44,12 @@ bool isCMN(const MachineInstr *MaybeSub, const CmpInst::Predicate &Pred, /// /// \note This only applies on Darwin. /// -/// \returns true if \p MI was replaced with a G_BZERO. -bool tryEmitBZero(MachineInstr &MI, MachineIRBuilder &MIRBuilder, - const LibcallLoweringInfo &Libcalls, bool MinSize); +/// \returns true if \p MI can be replaced with a G_BZERO. +bool matchEmitBZero(const MachineInstr &MI, const MachineRegisterInfo &MRI, + const LibcallLoweringInfo &Libcalls, bool MinSize); +/// +/// Replace \p MI with a G_BZERO. +void applyEmitBZero(MachineInstr &MI, MachineIRBuilder &MIRBuilder); /// Analyze a ptrauth discriminator value to try to find the constant integer /// and address parts, cracking a ptrauth_blend intrinsic if there is one. diff --git a/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp index 0846bd7994a31..74e8c682df482 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp @@ -92,26 +92,6 @@ bool AArch64O0PreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const { if (tryCombineAllImpl(MI)) return true; - unsigned Opc = MI.getOpcode(); - switch (Opc) { - case TargetOpcode::G_MEMCPY_INLINE: - case TargetOpcode::G_MEMSET_INLINE: - return Helper.tryEmitMemcpyInlineFamily(MI); - case TargetOpcode::G_MEMCPY: - case TargetOpcode::G_MEMMOVE: - case TargetOpcode::G_MEMSET: { - // At -O0 set a maxlen of 32 to inline; - unsigned MaxLen = 32; - // Try to inline memcpy type calls if optimizations are enabled. 
- if (Helper.tryCombineMemCpyFamily(MI, MaxLen)) - return true; - if (Opc == TargetOpcode::G_MEMSET) - return llvm::AArch64GISelUtils::tryEmitBZero(MI, B, Libcalls, - CInfo.EnableMinSize); - return false; - } - } - return false; } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp index 9834ea8ce5df9..91c75a06c84cd 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp @@ -789,27 +789,6 @@ bool AArch64PreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const { if (tryCombineAllImpl(MI)) return true; - unsigned Opc = MI.getOpcode(); - switch (Opc) { - case TargetOpcode::G_MEMCPY_INLINE: - case TargetOpcode::G_MEMSET_INLINE: - return Helper.tryEmitMemcpyInlineFamily(MI); - case TargetOpcode::G_MEMCPY: - case TargetOpcode::G_MEMMOVE: - case TargetOpcode::G_MEMSET: { - // If we're at -O0 set a maxlen of 32 to inline, otherwise let the other - // heuristics decide. - unsigned MaxLen = CInfo.EnableOpt ? 0 : 32; - // Try to inline memcpy type calls if optimizations are enabled. 
- if (Helper.tryCombineMemCpyFamily(MI, MaxLen)) - return true; - if (Opc == TargetOpcode::G_MEMSET) - return llvm::AArch64GISelUtils::tryEmitBZero(MI, B, Libcalls, - CInfo.EnableMinSize); - return false; - } - } - return false; } diff --git a/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp b/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp index dd9b76d9c9ecc..070557e8d9b03 100644 --- a/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp @@ -59,7 +59,7 @@ class MipsPreLegalizerCombinerImpl : public Combiner { return false; case TargetOpcode::G_MEMCPY_INLINE: case TargetOpcode::G_MEMSET_INLINE: - return Helper.tryEmitMemcpyInlineFamily(MI); + return Helper.tryCombineMemCpyFamily(MI); case TargetOpcode::G_LOAD: case TargetOpcode::G_SEXTLOAD: case TargetOpcode::G_ZEXTLOAD: { From d87e513d454714d08b6deeb055bf6f6cc959b450 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Fri, 19 Jun 2026 09:11:55 +0200 Subject: [PATCH 007/149] [clang][test] Use #marker in enable_if tests (#204624) I just wasted way too long trying to figure out why my newly added RUN lines were randomly broken or not. Stop using absolute line numbers. 
--- clang/test/Sema/enable_if.c | 49 +++++++++++++----- clang/test/SemaCXX/enable_if.cpp | 88 ++++++++++++++++++++++---------- 2 files changed, 96 insertions(+), 41 deletions(-) diff --git a/clang/test/Sema/enable_if.c b/clang/test/Sema/enable_if.c index 3ef8310a2fef7..80f8cce5918ed 100644 --- a/clang/test/Sema/enable_if.c +++ b/clang/test/Sema/enable_if.c @@ -112,29 +112,50 @@ void f(int n) __attribute__((enable_if())); // expected-error{{'enable_if' attr void f(int n) __attribute__((enable_if(unresolvedid, "chosen when 'unresolvedid' is non-zero"))); // expected-error{{use of undeclared identifier 'unresolvedid'}} int global; -void f(int n) __attribute__((enable_if(global == 0, "chosen when 'global' is zero"))); // expected-error{{'enable_if' attribute expression never produces a constant expression}} // expected-note{{subexpression not valid in a constant expression}} +void f(int n) __attribute__((enable_if(global == 0, "chosen when 'global' is zero"))); // expected-error{{'enable_if' attribute expression never produces a constant expression}} \ + // expected-note{{subexpression not valid in a constant expression}} enum { cst = 7 }; void return_cst(void) __attribute__((overloadable)) __attribute__((enable_if(cst == 7, "chosen when 'cst' is 7"))); void test_return_cst(void) { return_cst(); } -void f2(void) __attribute__((overloadable)) __attribute__((enable_if(1, "always chosen"))); -void f2(void) __attribute__((overloadable)) __attribute__((enable_if(0, "never chosen"))); -void f2(void) __attribute__((overloadable)) __attribute__((enable_if(TRUE, "always chosen #2"))); +void f2(void) __attribute__((overloadable)) __attribute__((enable_if(1, "always chosen"))); // #f2_1 +void f2(void) __attribute__((overloadable)) __attribute__((enable_if(0, "never chosen"))); // #f2_2 +void f2(void) __attribute__((overloadable)) __attribute__((enable_if(TRUE, "always chosen #2"))); // #f2_3 void test6(void) { - void (*p1)(void) = &f2; // expected-error{{initializing 'void 
(*)(void)' with an expression of incompatible type ''}} expected-note@121{{candidate function}} expected-note@122{{candidate function made ineligible by enable_if}} expected-note@123{{candidate function}} - void (*p2)(void) = f2; // expected-error{{initializing 'void (*)(void)' with an expression of incompatible type ''}} expected-note@121{{candidate function}} expected-note@122{{candidate function made ineligible by enable_if}} expected-note@123{{candidate function}} - void *p3 = (void*)&f2; // expected-error{{address of overloaded function 'f2' is ambiguous}} expected-note@121{{candidate function}} expected-note@122{{candidate function made ineligible by enable_if}} expected-note@123{{candidate function}} - void *p4 = (void*)f2; // expected-error{{address of overloaded function 'f2' is ambiguous}} expected-note@121{{candidate function}} expected-note@122{{candidate function made ineligible by enable_if}} expected-note@123{{candidate function}} + void (*p1)(void) = &f2; // expected-error {{initializing 'void (*)(void)' with an expression of incompatible type ''}} \ + // expected-note@#f2_1 {{candidate function}} \ + // expected-note@#f2_2 {{candidate function made ineligible by enable_if}} \ + // expected-note@#f2_3 {{candidate function}} + void (*p2)(void) = f2; // expected-error {{initializing 'void (*)(void)' with an expression of incompatible type ''}} \ + // expected-note@#f2_1 {{candidate function}} \ + // expected-note@#f2_2 {{candidate function made ineligible by enable_if}} \ + // expected-note@#f2_3 {{candidate function}} + void *p3 = (void*)&f2; // expected-error {{address of overloaded function 'f2' is ambiguous}} \ + // expected-note@#f2_1 {{candidate function}} \ + // expected-note@#f2_2 {{candidate function made ineligible by enable_if}} \ + // expected-note@#f2_3 {{candidate function}} + void *p4 = (void*)f2; // expected-error {{address of overloaded function 'f2' is ambiguous}} \ + // expected-note@#f2_1 {{candidate function}} \ + // 
expected-note@#f2_2 {{candidate function made ineligible by enable_if}} \ + // expected-note@#f2_3 {{candidate function}} } -void f3(int m) __attribute__((overloadable)) __attribute__((enable_if(m >= 0, "positive"))); -void f3(int m) __attribute__((overloadable)) __attribute__((enable_if(m < 0, "negative"))); +void f3(int m) __attribute__((overloadable)) __attribute__((enable_if(m >= 0, "positive"))); // #f3_1 +void f3(int m) __attribute__((overloadable)) __attribute__((enable_if(m < 0, "negative"))); // #f3_2 void test7(void) { - void (*p1)(int) = &f3; // expected-error{{initializing 'void (*)(int)' with an expression of incompatible type ''}} expected-note@131{{candidate function made ineligible by enable_if}} expected-note@132{{candidate function made ineligible by enable_if}} - void (*p2)(int) = f3; // expected-error{{initializing 'void (*)(int)' with an expression of incompatible type ''}} expected-note@131{{candidate function made ineligible by enable_if}} expected-note@132{{candidate function made ineligible by enable_if}} - void *p3 = (void*)&f3; // expected-error{{address of overloaded function 'f3' does not match required type 'void'}} expected-note@131{{candidate function made ineligible by enable_if}} expected-note@132{{candidate function made ineligible by enable_if}} - void *p4 = (void*)f3; // expected-error{{address of overloaded function 'f3' does not match required type 'void'}} expected-note@131{{candidate function made ineligible by enable_if}} expected-note@132{{candidate function made ineligible by enable_if}} + void (*p1)(int) = &f3; // expected-error {{initializing 'void (*)(int)' with an expression of incompatible type ''}} \ + // expected-note@#f3_1 {{candidate function made ineligible by enable_if}} \ + // expected-note@#f3_2 {{candidate function made ineligible by enable_if}} + void (*p2)(int) = f3; // expected-error {{initializing 'void (*)(int)' with an expression of incompatible type ''}} \ + // expected-note@#f3_1 {{candidate function 
made ineligible by enable_if}} \ + // expected-note@#f3_2 {{candidate function made ineligible by enable_if}} + void *p3 = (void*)&f3; // expected-error {{address of overloaded function 'f3' does not match required type 'void'}} \ + // expected-note@#f3_1 {{candidate function made ineligible by enable_if}} \ + // expected-note@#f3_2 {{candidate function made ineligible by enable_if}} + void *p4 = (void*)f3; // expected-error {{address of overloaded function 'f3' does not match required type 'void'}} \ + // expected-note@#f3_1 {{candidate function made ineligible by enable_if}} \ + // expected-note@#f3_2 {{candidate function made ineligible by enable_if}} } void f4(int m) __attribute__((enable_if(0, ""))); diff --git a/clang/test/SemaCXX/enable_if.cpp b/clang/test/SemaCXX/enable_if.cpp index 4b0a253d89fed..9b35bf2ac0c8d 100644 --- a/clang/test/SemaCXX/enable_if.cpp +++ b/clang/test/SemaCXX/enable_if.cpp @@ -1,5 +1,6 @@ // RUN: %clang_cc1 -std=c++11 -verify %s // RUN: %clang_cc1 -std=c++2a -verify %s + typedef int (*fp)(int); int surrogate(int); struct Incomplete; // expected-note{{forward declaration of 'Incomplete'}} \ @@ -118,7 +119,8 @@ template class C { int fn3(bool b) __attribute__((enable_if(b, ""))); // FIXME: This test should net 0 error messages. 
template void test3() { - fn3(sizeof(T) == 1); // expected-error{{no matching function for call to 'fn3'}} expected-note@-2{{candidate disabled}} + fn3(sizeof(T) == 1); // expected-error{{no matching function for call to 'fn3'}} \ + // expected-note@-2{{candidate disabled}} } template @@ -138,7 +140,8 @@ void test4() { void h(int); template void outer() { void local_function() __attribute__((enable_if(::h(T()), ""))); - local_function(); // expected-error{{no matching function for call to 'local_function'}} expected-note@-1{{candidate disabled}} + local_function(); // expected-error{{no matching function for call to 'local_function'}} \ + // expected-note@-1{{candidate disabled}} }; namespace PR20988 { @@ -160,7 +163,8 @@ namespace PR20988 { int fn3(bool b) __attribute__((enable_if(b, ""))); // FIXME: This test should net 0 error messages. template void test3() { - fn3(sizeof(T) == 1); // expected-error{{no matching function for call to 'fn3'}} expected-note@-2{{candidate disabled}} + fn3(sizeof(T) == 1); // expected-error {{no matching function for call to 'fn3'}} \ + // expected-note@-2 {{candidate disabled}} } } @@ -188,14 +192,22 @@ namespace FnPtrs { a = &ovlBar; } - int ovlConflict(int m) __attribute__((enable_if(true, ""))); - int ovlConflict(int m) __attribute__((enable_if(1, ""))); + int ovlConflict(int m) __attribute__((enable_if(true, ""))); // #ovl_1 + int ovlConflict(int m) __attribute__((enable_if(1, ""))); // #ovl_2 void test3() { - int (*p)(int) = ovlConflict; // expected-error{{address of overloaded function 'ovlConflict' is ambiguous}} expected-note@191{{candidate function}} expected-note@192{{candidate function}} - int (*p2)(int) = &ovlConflict; // expected-error{{address of overloaded function 'ovlConflict' is ambiguous}} expected-note@191{{candidate function}} expected-note@192{{candidate function}} + int (*p)(int) = ovlConflict; // expected-error {{address of overloaded function 'ovlConflict' is ambiguous}} \ + // expected-note@#ovl_1 
{{candidate function}} \ + // expected-note@#ovl_2 {{candidate function}} + int (*p2)(int) = &ovlConflict; // expected-error {{address of overloaded function 'ovlConflict' is ambiguous}} \ + // expected-note@#ovl_1 {{candidate function}} \ + // expected-note@#ovl_2 {{candidate function}} int (*a)(int); - a = ovlConflict; // expected-error{{assigning to 'int (*)(int)' from incompatible type ''}} expected-note@191{{candidate function}} expected-note@192{{candidate function}} - a = &ovlConflict; // expected-error{{assigning to 'int (*)(int)' from incompatible type ''}} expected-note@191{{candidate function}} expected-note@192{{candidate function}} + a = ovlConflict; // expected-error {{assigning to 'int (*)(int)' from incompatible type ''}} \ + // expected-note@#ovl_1 {{candidate function}} \ + // expected-note@#ovl_2 {{candidate function}} + a = &ovlConflict; // expected-error {{assigning to 'int (*)(int)' from incompatible type ''}} \ + // expected-note@#ovl_1 {{candidate function}} \ + // expected-note@#ovl_2 {{candidate function}} } template @@ -211,37 +223,59 @@ namespace FnPtrs { } template - T templatedBar(T m) __attribute__((enable_if(m > 0, ""))) { return T(); } + T templatedBar(T m) __attribute__((enable_if(m > 0, ""))) { return T(); } // #tbar void test5() { - int (*p)(int) = templatedBar; // expected-error{{address of overloaded function 'templatedBar' does not match required type 'int (int)'}} expected-note@214{{candidate function made ineligible by enable_if}} - int (*p2)(int) = &templatedBar; // expected-error{{address of overloaded function 'templatedBar' does not match required type 'int (int)'}} expected-note@214{{candidate function made ineligible by enable_if}} + int (*p)(int) = templatedBar; // expected-error {{address of overloaded function 'templatedBar' does not match required type 'int (int)'}} \ + // expected-note@#tbar {{candidate function made ineligible by enable_if}} + int (*p2)(int) = &templatedBar; // expected-error {{address of 
overloaded function 'templatedBar' does not match required type 'int (int)'}} \ + // expected-note@#tbar {{candidate function made ineligible by enable_if}} int (*a)(int); - a = templatedBar; // expected-error{{assigning to 'int (*)(int)' from incompatible type ''}} expected-note@214{{candidate function made ineligible by enable_if}} - a = &templatedBar; // expected-error{{assigning to 'int (*)(int)' from incompatible type ''}} expected-note@214{{candidate function made ineligible by enable_if}} + a = templatedBar; // expected-error {{assigning to 'int (*)(int)' from incompatible type ''}} \ + // expected-note@#tbar {{candidate function made ineligible by enable_if}} + a = &templatedBar; // expected-error {{assigning to 'int (*)(int)' from incompatible type ''}} \ + // expected-note@#tbar {{candidate function made ineligible by enable_if}} } template - T templatedConflict(T m) __attribute__((enable_if(false, ""))) { return T(); } + T templatedConflict(T m) __attribute__((enable_if(false, ""))) { return T(); } // #conflict1 template - T templatedConflict(T m) __attribute__((enable_if(true, ""))) { return T(); } + T templatedConflict(T m) __attribute__((enable_if(true, ""))) { return T(); } // #conflict2 template - T templatedConflict(T m) __attribute__((enable_if(1, ""))) { return T(); } + T templatedConflict(T m) __attribute__((enable_if(1, ""))) { return T(); } // #conflict3 void test6() { - int (*p)(int) = templatedConflict; // expected-error{{address of overloaded function 'templatedConflict' is ambiguous}} expected-note@224{{candidate function made ineligible by enable_if}} expected-note@226{{candidate function}} expected-note@228{{candidate function}} - int (*p0)(int) = &templatedConflict; // expected-error{{address of overloaded function 'templatedConflict' is ambiguous}} expected-note@224{{candidate function made ineligible by enable_if}} expected-note@226{{candidate function}} expected-note@228{{candidate function}} + int (*p)(int) = templatedConflict; // 
expected-error {{address of overloaded function 'templatedConflict' is ambiguous}} \ + // expected-note@#conflict1 {{candidate function made ineligible by enable_if}} \ + // expected-note@#conflict2 {{candidate function}} \ + // expected-note@#conflict3 {{candidate function}} + int (*p0)(int) = &templatedConflict; // expected-error {{address of overloaded function 'templatedConflict' is ambiguous}} \ + // expected-note@#conflict1 {{candidate function made ineligible by enable_if}} \ + // expected-note@#conflict2 {{candidate function}} \ + // expected-note@#conflict3 {{candidate function}} int (*a)(int); - a = templatedConflict; // expected-error{{assigning to 'int (*)(int)' from incompatible type ''}} expected-note@226{{candidate function}} expected-note@228{{candidate function}} - a = &templatedConflict; // expected-error{{assigning to 'int (*)(int)' from incompatible type ''}} expected-note@226{{candidate function}} expected-note@228{{candidate function}} + a = templatedConflict; // expected-error {{assigning to 'int (*)(int)' from incompatible type ''}} \ + // expected-note@#conflict2 {{candidate function}} \ + // expected-note@#conflict3 {{candidate function}} + a = &templatedConflict; // expected-error {{assigning to 'int (*)(int)' from incompatible type ''}} \ + // expected-note@#conflict2 {{candidate function}} \ + // expected-note@#conflict3 {{candidate function}} } - int ovlNoCandidate(int m) __attribute__((enable_if(false, ""))); - int ovlNoCandidate(int m) __attribute__((enable_if(0, ""))); + int ovlNoCandidate(int m) __attribute__((enable_if(false, ""))); // #ovlno1 + int ovlNoCandidate(int m) __attribute__((enable_if(0, ""))); // #ovlno2 void test7() { - int (*p)(int) = ovlNoCandidate; // expected-error{{address of overloaded function 'ovlNoCandidate' does not match required type}} expected-note@237{{made ineligible by enable_if}} expected-note@238{{made ineligible by enable_if}} - int (*p2)(int) = &ovlNoCandidate; // expected-error{{address of 
overloaded function 'ovlNoCandidate' does not match required type}} expected-note@237{{made ineligible by enable_if}} expected-note@238{{made ineligible by enable_if}} + int (*p)(int) = ovlNoCandidate; // expected-error {{address of overloaded function 'ovlNoCandidate' does not match required type}} \ + // expected-note@#ovlno1 {{made ineligible by enable_if}} \ + // expected-note@#ovlno2 {{made ineligible by enable_if}} + int (*p2)(int) = &ovlNoCandidate; // expected-error {{address of overloaded function 'ovlNoCandidate' does not match required type}} \ + // expected-note@#ovlno1 {{made ineligible by enable_if}} \ + // expected-note@#ovlno2 {{made ineligible by enable_if}} int (*a)(int); - a = ovlNoCandidate; // expected-error{{assigning to 'int (*)(int)' from incompatible type ''}} expected-note@237{{made ineligible by enable_if}} expected-note@238{{made ineligible by enable_if}} - a = &ovlNoCandidate; // expected-error{{assigning to 'int (*)(int)' from incompatible type ''}} expected-note@237{{made ineligible by enable_if}} expected-note@238{{made ineligible by enable_if}} + a = ovlNoCandidate; // expected-error {{assigning to 'int (*)(int)' from incompatible type ''}} \ + // expected-note@#ovlno1 {{made ineligible by enable_if}} \ + // expected-note@#ovlno2 {{made ineligible by enable_if}} + a = &ovlNoCandidate; // expected-error {{assigning to 'int (*)(int)' from incompatible type ''}} \ + // expected-note@#ovlno1 {{made ineligible by enable_if}} \ + // expected-note@#ovlno2 {{made ineligible by enable_if}} } int noOvlNoCandidate(int m) __attribute__((enable_if(false, ""))); From 7d122c329944a6be4ed93929c99f5e8fc75b6892 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Horv=C3=A1th?= Date: Fri, 19 Jun 2026 08:23:23 +0100 Subject: [PATCH 008/149] [LifetimeSafety] Propagate loans through pointer inc/dec and compound assignment (#204477) --- .../LifetimeSafety/FactsGenerator.cpp | 26 +++++++++- .../Sema/LifetimeSafety/dangling-global.cpp | 49 
++++++++++++++++++- clang/test/Sema/LifetimeSafety/safety-c.c | 22 +++++++++ 3 files changed, 95 insertions(+), 2 deletions(-) diff --git a/clang/lib/Analysis/LifetimeSafety/FactsGenerator.cpp b/clang/lib/Analysis/LifetimeSafety/FactsGenerator.cpp index 545836cd76fb9..4b5a776b2bae7 100644 --- a/clang/lib/Analysis/LifetimeSafety/FactsGenerator.cpp +++ b/clang/lib/Analysis/LifetimeSafety/FactsGenerator.cpp @@ -372,6 +372,21 @@ void FactsGenerator::VisitUnaryOperator(const UnaryOperator *UO) { killAndFlowOrigin(*UO, *SubExpr); return; } + case UO_PreInc: + case UO_PostInc: + case UO_PreDec: + case UO_PostDec: { + // Inc/dec keeps a pointer in the same allocation, so the result carries the + // operand's loans. Peel the operand's storage origin when the *result* is a + // prvalue (post-inc/dec, or any form in C) -- the inverse of + // getRValueOrigins, which peels when its own argument is a glvalue. + if (!UO->getType()->isPointerType()) + return; + OriginList *SubList = getOriginsList(*UO->getSubExpr()); + flow(getOriginsList(*UO), + UO->isGLValue() ? SubList : SubList->peelOuterOrigin(), /*Kill=*/true); + return; + } default: return; } @@ -472,8 +487,17 @@ void FactsGenerator::VisitBinaryOperator(const BinaryOperator *BO) { killAndFlowOrigin(*BO, *BO->getRHS()); return; } - if (BO->isCompoundAssignmentOp()) + if (BO->isCompoundAssignmentOp()) { + // A pointer compound additive assignment (`p += n`) carries the LHS's loans + // like inc/dec above; in C the result is a prvalue, so peel its outer + // (storage) origin. + if (BO->getType()->isPointerType()) { + OriginList *LHSList = getOriginsList(*BO->getLHS()); + flow(getOriginsList(*BO), IsCMode ? 
LHSList->peelOuterOrigin() : LHSList, + /*Kill=*/true); + } return; + } if (BO->getType()->isPointerType() && BO->isAdditiveOp()) handlePointerArithmetic(BO); handleUse(BO->getRHS()); diff --git a/clang/test/Sema/LifetimeSafety/dangling-global.cpp b/clang/test/Sema/LifetimeSafety/dangling-global.cpp index 8a96cbced43b4..8d464b0dbe554 100644 --- a/clang/test/Sema/LifetimeSafety/dangling-global.cpp +++ b/clang/test/Sema/LifetimeSafety/dangling-global.cpp @@ -1,6 +1,6 @@ // RUN: %clang_cc1 -fsyntax-only -Wlifetime-safety -Wno-dangling -verify %s -int *global; // expected-note 4 {{this global dangles}} +int *global; // expected-note 10 {{this global dangles}} int *global_backup; // expected-note {{this global dangles}} struct ObjWithStaticField { @@ -70,3 +70,50 @@ void conditional_no_escape(int c) { global = nullptr; // no-warning (void)local; } + +// Pointer compound assignment and increment/decrement keep the pointer in the +// same allocation, so the result carries the borrow. +void via_compound_add() { + int local[10]; + int *p = local; // expected-warning {{stack memory associated with local variable 'local' escapes to the global variable 'global' which will dangle}} + global = (p += 1); +} + +void via_compound_sub() { + int local[10]; + int *p = local + 5; // expected-warning {{stack memory associated with local variable 'local' escapes to the global variable 'global' which will dangle}} + global = (p -= 1); +} + +void via_preinc() { + int local[10]; + int *p = local; // expected-warning {{stack memory associated with local variable 'local' escapes to the global variable 'global' which will dangle}} + global = ++p; +} + +void via_postinc() { + int local[10]; + int *p = local; // expected-warning {{stack memory associated with local variable 'local' escapes to the global variable 'global' which will dangle}} + global = p++; +} + +void via_predec() { + int local[10]; + int *p = local + 5; // expected-warning {{stack memory associated with local variable 'local' 
escapes to the global variable 'global' which will dangle}} + global = --p; +} + +void via_postdec() { + int local[10]; + int *p = local + 5; // expected-warning {{stack memory associated with local variable 'local' escapes to the global variable 'global' which will dangle}} + global = p--; +} + +// Negative: arithmetic on a pointer into long-lived storage stays silent. +void ok_global_storage() { + static int s[10]; + int *p = s; + p += 1; + ++p; + global = (p -= 1); // no-warning +} diff --git a/clang/test/Sema/LifetimeSafety/safety-c.c b/clang/test/Sema/LifetimeSafety/safety-c.c index 95c8cf7bb00c7..13b92a8d81db4 100644 --- a/clang/test/Sema/LifetimeSafety/safety-c.c +++ b/clang/test/Sema/LifetimeSafety/safety-c.c @@ -179,3 +179,25 @@ int *atomic_pointer_declref(void) { _Atomic(int *) p = &value; return p; } + +// In C, a pointer compound assignment is a prvalue; its result still carries +// the LHS pointer's loans. +void compound_assign_prvalue(void) { + int *p; + { + int local[10]; + int *q = local; // expected-warning {{local variable 'local' does not live long enough}} + p = (q += 1); + } // expected-note {{destroyed here}} + (void)*p; // expected-note {{later used here}} +} + +void preincrement_prvalue(void) { + int *p; + { + int local[10]; + int *q = local; // expected-warning {{local variable 'local' does not live long enough}} + p = ++q; + } // expected-note {{destroyed here}} + (void)*p; // expected-note {{later used here}} +} From 6352a584c409a4ad44804f06e06260be587866d3 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 19 Jun 2026 09:47:40 +0200 Subject: [PATCH 009/149] [mlir][IR] Fix typo in code example of DenseTypedElementsAttr (#204739) There was a typo in the type-first syntax code example. 
--- mlir/include/mlir/IR/BuiltinAttributes.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/include/mlir/IR/BuiltinAttributes.td b/mlir/include/mlir/IR/BuiltinAttributes.td index 6165a24c0d34f..f238137734d56 100644 --- a/mlir/include/mlir/IR/BuiltinAttributes.td +++ b/mlir/include/mlir/IR/BuiltinAttributes.td @@ -271,7 +271,7 @@ def Builtin_DenseTypedElementsAttr : Builtin_Attr< dense : 10 : i32> // Type-first syntax: A tensor of 2 float32 elements. - dense : [10.0, 11.0]> + dense : [10.0 : f32, 11.0 : f32]> ``` Note: The literal-first syntax is supported only for complex, float, index, From 3b1a922a6d02705ef30b1527e34af7b6208dc02f Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Fri, 19 Jun 2026 08:53:50 +0100 Subject: [PATCH 010/149] [VPlan] Extend licm to sink replicate stores (#191026) Follow up on hoisting replicate loads in VPlan-licm to also sink replicate stores. --- .../Transforms/Vectorize/VPlanTransforms.cpp | 69 +++++++++++++------ llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 2 - .../LoopVectorize/AArch64/reg-usage.ll | 7 +- .../VPlan/interleave-and-scalarize-only.ll | 7 +- .../LoopVectorize/X86/cost-model.ll | 10 +-- .../X86/invariant-store-vectorization.ll | 6 +- .../Transforms/LoopVectorize/X86/pr36524.ll | 12 ++-- .../LoopVectorize/X86/uniform_mem_op.ll | 4 +- ...first-order-recurrence-with-uniform-ops.ll | 12 ++-- .../LoopVectorize/hoist-predicated-loads.ll | 8 +-- .../LoopVectorize/if-pred-stores.ll | 18 ++--- .../invariant-store-vectorization-2.ll | 18 ++--- .../invariant-store-vectorization.ll | 10 +-- .../test/Transforms/LoopVectorize/metadata.ll | 40 +++++------ ...pr47343-expander-lcssa-after-cfg-update.ll | 6 +- .../pr59319-loop-access-info-invalidation.ll | 8 +-- .../reduction-with-invariant-store.ll | 4 +- .../Transforms/LoopVectorize/runtime-check.ll | 2 +- .../LoopVectorize/single-scalar-cast-minbw.ll | 4 +- .../LoopVectorize/skeleton-lcssa-crash.ll | 6 +- 20 files changed, 140 insertions(+), 113 
deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index f064fb7a756fa..bfad5d02d1767 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -153,10 +153,10 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( /// Helper for extra no-alias checks via known-safe recipe and SCEV. class SinkStoreInfo { - const SmallPtrSetImpl &ExcludeRecipes; + SmallPtrSet ExcludeRecipes; VPReplicateRecipe &GroupLeader; - PredicatedScalarEvolution &PSE; - const Loop &L; + PredicatedScalarEvolution *PSE = nullptr; + const Loop *L = nullptr; // Return true if \p A and \p B are known to not alias for all VFs in the // plan, checked via the distance between the accesses @@ -165,15 +165,18 @@ class SinkStoreInfo { B->getOpcode() != Instruction::Store) return false; + if (!PSE || !L) + return A == B; + VPValue *AddrA = A->getOperand(1); - const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, PSE, &L); + const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, *PSE, L); VPValue *AddrB = B->getOperand(1); - const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, PSE, &L); + const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, *PSE, L); if (isa(SCEVA) || isa(SCEVB)) return false; const APInt *Distance; - ScalarEvolution &SE = *PSE.getSE(); + ScalarEvolution &SE = *PSE->getSE(); if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance))) return false; @@ -197,18 +200,20 @@ class SinkStoreInfo { } public: - SinkStoreInfo(const SmallPtrSetImpl &ExcludeRecipes, + SinkStoreInfo(ArrayRef ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L) - : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE), - L(L) {} + : ExcludeRecipes(ExcludeRecipes.begin(), ExcludeRecipes.end()), + GroupLeader(GroupLeader), PSE(&PSE), L(&L) {} + + SinkStoreInfo(VPReplicateRecipe &GroupLeader) : 
GroupLeader(GroupLeader) {} /// Return true if \p R should be skipped during alias checking, either /// because it's in the exclude set or because no-alias can be proven via /// SCEV. bool shouldSkip(VPRecipeBase &R) const { auto *Store = dyn_cast(&R); - return ExcludeRecipes.contains(&R) || + return ExcludeRecipes.contains(Store) || (Store && isNoAliasViaDistance(Store, &GroupLeader)); } }; @@ -2547,15 +2552,26 @@ void VPlanTransforms::cse(VPlan &Plan) { } /// Return true if we do not know how to (mechanically) hoist or sink a -/// non-memory or memory recipe \p R out of a loop region. +/// non-memory or memory recipe \p R out of a loop region. When sinking, passing +/// \p Sinking = true ensures that assumes aren't sunk. static bool cannotHoistOrSinkRecipe(VPRecipeBase &R, VPBasicBlock *FirstBB, - VPBasicBlock *LastBB) { - if (!isa(R) || !R.mayReadFromMemory()) - return vputils::cannotHoistOrSinkRecipe(R); + VPBasicBlock *LastBB, + bool Sinking = false) { + if (!isa(R) || !R.mayReadOrWriteMemory() || + match(&R, m_Intrinsic())) + return vputils::cannotHoistOrSinkRecipe(R, Sinking); - // Check that the load doesn't alias with stores between FirstBB and LastBB. + // Check that the memory operation doesn't alias between FirstBB and LastBB. auto MemLoc = vputils::getMemoryLocation(R); - return !MemLoc || !canHoistOrSinkWithNoAliasCheck(*MemLoc, FirstBB, LastBB); + + // TODO: Could make use of SinkStoreInfo::isNoAliasViaDistance by collecting + // stores upfront, and constructing a full SinkStoreInfo. + auto SinkInfo = + Sinking ? std::make_optional(SinkStoreInfo(cast(R))) + : std::nullopt; + + return !MemLoc || + !canHoistOrSinkWithNoAliasCheck(*MemLoc, FirstBB, LastBB, SinkInfo); } /// Move loop-invariant recipes out of the vector loop region in \p Plan. 
@@ -2594,7 +2610,9 @@ static void licm(VPlan &Plan) { LoopRegion->getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(POT)) { for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) { - if (vputils::cannotHoistOrSinkRecipe(R, /*Sinking=*/true)) + if (cannotHoistOrSinkRecipe(R, LoopRegion->getEntryBasicBlock(), + LoopRegion->getExitingBasicBlock(), + /*Sinking=*/true)) continue; if (auto *RepR = dyn_cast(&R)) { @@ -2607,8 +2625,20 @@ static void licm(VPlan &Plan) { // non-single-scalar replicates correctly. if (!RepR->isSingleScalar()) continue; + + // The pointer operand of stores must be loop-invariant. + if (RepR->getOpcode() == Instruction::Store && + !RepR->getOperand(1)->isDefinedOutsideLoopRegions()) + continue; } + [[maybe_unused]] auto *RepR = dyn_cast(&R); + assert((!R.mayWriteToMemory() || + (RepR && RepR->getOpcode() == Instruction::Store && + RepR->getOperand(1)->isDefinedOutsideLoopRegions())) && + "The only recipes that may write to memory are expected to be " + "stores with invariant pointer-operand"); + // TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to // support recipes with multiple defined values (e.g., interleaved loads). auto *Def = cast(&R); @@ -5229,12 +5259,9 @@ canSinkStoreWithNoAliasCheck(ArrayRef StoresToSink, // When sinking a group of stores, all members of the group alias each other. // Skip them during the alias checks. 
- SmallPtrSet StoresToSinkSet(StoresToSink.begin(), - StoresToSink.end()); - VPBasicBlock *FirstBB = StoresToSink.front()->getParent(); VPBasicBlock *LastBB = StoresToSink.back()->getParent(); - SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], PSE, L); + SinkStoreInfo SinkInfo(StoresToSink, *StoresToSink[0], PSE, L); return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 32da50aa92aa0..78cf539642b41 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -527,8 +527,6 @@ bool vputils::cannotHoistOrSinkRecipe(const VPRecipeBase &R, bool Sinking) { // would destroy information. if (match(&R, m_Intrinsic())) return Sinking; - // TODO: Relax checks in the future, e.g. we could also hoist reads, if their - // memory location is not modified in the vector loop. if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi()) return true; // Allocas cannot be hoisted. 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll index 955178d57af37..94bb704d729aa 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll @@ -14,12 +14,11 @@ define void @get_invariant_reg_usage(ptr %z) { ; CHECK-LABEL: LV: Checking a loop in 'get_invariant_reg_usage' -; CHECK: LV(REG): VF = 16 -; CHECK-NEXT: LV(REG): Found max usage: 2 item +; CHECK: LV(REG): VF = vscale x 16 +; CHECK-NEXT: LV(REG): Found max usage: 1 item ; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers -; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 1 registers ; CHECK-NEXT: LV(REG): Found invariant usage: 1 item -; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 3 registers +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers L.entry: %0 = load i128, ptr %z, align 16 diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/interleave-and-scalarize-only.ll b/llvm/test/Transforms/LoopVectorize/VPlan/interleave-and-scalarize-only.ll index 9ae22eff46f37..c602433a89016 100644 --- a/llvm/test/Transforms/LoopVectorize/VPlan/interleave-and-scalarize-only.ll +++ b/llvm/test/Transforms/LoopVectorize/VPlan/interleave-and-scalarize-only.ll @@ -308,13 +308,14 @@ define void @scalarize_ptrtoint(ptr %src, ptr %dst) { ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr ptr, ptr %src, i64 [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP3]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP12]], label %middle.block, label %vector.body +; CHECK: middle.block: ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP5]] to i64 ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP7]], 10 ; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: store ptr [[TMP11]], 
ptr %dst, align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP12]], label %middle.block, label %vector.body entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll index 938c7fa7dcc09..8313a8a726b16 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll @@ -507,15 +507,15 @@ define void @multi_exit(ptr %dst, ptr %src.1, ptr %src.2, i64 %A, i64 %B) #0 { ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[SRC_1]], i32 [[TMP24]] ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[TMP25]], i64 2 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP26]], align 8, !alias.scope [[META9:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP27:%.*]] = icmp eq <2 x i64> [[WIDE_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP28:%.*]] = and <2 x i1> [[BROADCAST_SPLAT]], [[TMP27]] ; CHECK-NEXT: [[TMP29:%.*]] = zext <2 x i1> [[TMP28]] to <2 x i8> ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <2 x i8> [[TMP29]], i64 1 -; CHECK-NEXT: store i8 [[TMP30]], ptr [[DST]], align 1, !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] -; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: store i8 [[TMP30]], ptr [[DST]], align 1, !alias.scope [[META12:![0-9]+]], !noalias [[META14:![0-9]+]] ; CHECK-NEXT: br label %[[SCALAR_PH]] ; CHECK: 
[[SCALAR_PH]]: ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll index c36ed48920446..eb4856758e193 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll @@ -49,11 +49,11 @@ define i32 @inv_val_store_to_inv_address_with_reduction(ptr %a, i64 %n, ptr %b) ; CHECK-NEXT: [[TMP6]] = add <16 x i32> [[VEC_PHI4]], [[WIDE_LOAD7]] ; CHECK-NEXT: [[TMP7]] = add <16 x i32> [[VEC_PHI5]], [[WIDE_LOAD8]] ; CHECK-NEXT: [[TMP8]] = add <16 x i32> [[VEC_PHI6]], [[WIDE_LOAD9]] -; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope [[META6:![0-9]+]], !noalias [[META0]] ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP6]], [[TMP5]] ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <16 x i32> [[TMP7]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <16 x i32> [[TMP8]], [[BIN_RDX10]] @@ -76,11 +76,11 @@ define i32 @inv_val_store_to_inv_address_with_reduction(ptr %a, i64 %n, ptr %b) ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX14]] ; CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <8 x i32>, ptr [[TMP12]], align 8, !alias.scope [[META0]] ; CHECK-NEXT: [[TMP13]] = add <8 x i32> [[VEC_PHI15]], [[WIDE_LOAD16]] -; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope [[META3]], !noalias [[META0]] ; CHECK-NEXT: [[INDEX_NEXT17]] = add nuw i64 [[INDEX14]], 8 ; 
CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT17]], [[N_VEC13]] ; CHECK-NEXT: br i1 [[TMP14]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope [[META6]], !noalias [[META0]] ; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP13]]) ; CHECK-NEXT: [[CMP_N18:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC13]] ; CHECK-NEXT: br i1 [[CMP_N18]], label %[[FOR_END]], label %[[VEC_EPILOG_SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr36524.ll b/llvm/test/Transforms/LoopVectorize/X86/pr36524.ll index 1396029b20c9f..81e3fda7f44b2 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr36524.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr36524.ll @@ -19,17 +19,17 @@ define void @foo(ptr %ptr, ptr %ptr.2) { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 2, [[INDEX]] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32 -; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], 3 -; CHECK-NEXT: store i32 [[TMP4]], ptr [[PTR_2]], align 4, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[INDEX]] -; CHECK-NEXT: store <4 x i64> [[VEC_IND]], ptr [[TMP6]], align 8, !alias.scope [[META3]] +; CHECK-NEXT: store <4 x i64> [[VEC_IND]], ptr [[TMP6]], align 8, !alias.scope [[META0:![0-9]+]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 80 -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[TMP2:%.*]] = add i64 2, [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 3 +; CHECK-NEXT: store i32 [[TMP4]], ptr [[PTR_2]], align 4, !alias.scope [[META6:![0-9]+]], !noalias [[META0]] ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll index 7d2efed7c2896..9ac27113d3eb9 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll @@ -279,11 +279,11 @@ define void @uniform_copy(ptr %A, ptr %B) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: store i32 [[TMP0]], ptr [[B]], align 4, !alias.scope [[META15:![0-9]+]], !noalias [[META12]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; CHECK-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: store i32 [[TMP0]], ptr [[B]], align 4, !alias.scope [[META16:![0-9]+]], !noalias [[META12]] ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll index 347264121fd8d..251b2f32b407a 100644 --- 
a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll @@ -261,12 +261,12 @@ define i32 @uniform_widened_recurrence_resume(ptr %src, ptr %dst, i64 %n) { ; UNROLL-NO-IC-NEXT: br label %[[VECTOR_BODY:.*]] ; UNROLL-NO-IC: [[VECTOR_BODY]]: ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; UNROLL-NO-IC-NEXT: store i8 0, ptr [[DST]], align 1, !alias.scope [[META9:![0-9]+]], !noalias [[META6]] ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-IC-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; UNROLL-NO-IC: [[MIDDLE_BLOCK]]: ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> +; UNROLL-NO-IC-NEXT: store i8 0, ptr [[DST]], align 1, !alias.scope [[META10:![0-9]+]], !noalias [[META6]] ; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <4 x i32> ; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = trunc <4 x i32> [[TMP5]] to <4 x i8> ; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = or <4 x i32> splat (i32 1), [[TMP4]] @@ -320,11 +320,11 @@ define i32 @uniform_widened_recurrence_resume(ptr %src, ptr %dst, i64 %n) { ; UNROLL-NO-VF-NEXT: br label %[[VECTOR_BODY:.*]] ; UNROLL-NO-VF: [[VECTOR_BODY]]: ; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: store i8 0, ptr [[DST]], align 1, !alias.scope [[META9:![0-9]+]], !noalias [[META6]] ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; 
UNROLL-NO-VF-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; UNROLL-NO-VF: [[MIDDLE_BLOCK]]: +; UNROLL-NO-VF-NEXT: store i8 0, ptr [[DST]], align 1, !alias.scope [[META10:![0-9]+]], !noalias [[META6]] ; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP2]] to i8 ; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = or i32 1, [[TMP1]] ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -378,12 +378,12 @@ define i32 @uniform_widened_recurrence_resume(ptr %src, ptr %dst, i64 %n) { ; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; SINK-AFTER-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[TMP2]], %[[VECTOR_BODY]] ] ; SINK-AFTER-NEXT: [[VECTOR_RECUR2:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[BROADCAST_SPLAT]], %[[VECTOR_BODY]] ] -; SINK-AFTER-NEXT: store i8 0, ptr [[DST]], align 1, !alias.scope [[META9:![0-9]+]], !noalias [[META6]] ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; SINK-AFTER-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SINK-AFTER-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; SINK-AFTER-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; SINK-AFTER: [[MIDDLE_BLOCK]]: ; SINK-AFTER-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR2]], <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> +; SINK-AFTER-NEXT: store i8 0, ptr [[DST]], align 1, !alias.scope [[META10:![0-9]+]], !noalias [[META6]] ; SINK-AFTER-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP2]], <4 x i32> ; SINK-AFTER-NEXT: [[TMP6:%.*]] = trunc <4 x i32> [[TMP5]] to <4 x i8> ; SINK-AFTER-NEXT: [[TMP7:%.*]] = or <4 x i32> splat (i32 1), [[TMP4]] diff --git 
a/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads.ll b/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads.ll index 00dd8f5d7e2b2..0fa01a4904079 100644 --- a/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads.ll +++ b/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads.ll @@ -916,11 +916,11 @@ define void @hoist_predicated_load_with_chained_geps1(ptr %dst, ptr %src, i1 %co ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr [11 x i16], ptr [[SRC]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i64 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[TMP21]], align 2, !alias.scope [[META68:![0-9]+]] -; CHECK-NEXT: store i16 [[TMP3]], ptr [[DST]], align 2, !alias.scope [[META71:![0-9]+]], !noalias [[META68]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 -; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP73:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP71:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: store i16 [[TMP3]], ptr [[DST]], align 2, !alias.scope [[META72:![0-9]+]], !noalias [[META68]] ; CHECK-NEXT: br label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; @@ -975,11 +975,11 @@ define void @hoist_predicated_load_with_chained_geps2(ptr %dst, ptr %src, i1 %co ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr [11 x i16], ptr [[SRC]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP4]], i64 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[TMP21]], align 2, !alias.scope [[META75:![0-9]+]] -; CHECK-NEXT: store i16 [[TMP3]], ptr [[DST]], align 2, !alias.scope [[META78:![0-9]+]], !noalias [[META75]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 -; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP80:![0-9]+]] +; 
CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP78:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: store i16 [[TMP3]], ptr [[DST]], align 2, !alias.scope [[META79:![0-9]+]], !noalias [[META75]] ; CHECK-NEXT: br label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll index 7b80a460a8bea..36837042a420a 100644 --- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll +++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll @@ -1038,12 +1038,12 @@ define void @hoistable_predicated_store(ptr %A, ptr %B, ptr %C, ptr %D) { ; UNROLL-NEXT: br label %[[VECTOR_BODY:.*]] ; UNROLL: [[VECTOR_BODY]]: ; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; UNROLL-NEXT: store i32 0, ptr [[C]], align 4, !alias.scope [[META19:![0-9]+]], !noalias [[META21:![0-9]+]] -; UNROLL-NEXT: store i32 [[TMP0]], ptr [[B]], align 4, !alias.scope [[META24:![0-9]+]], !noalias [[META25:![0-9]+]] ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; UNROLL-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 -; UNROLL-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; UNROLL-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; UNROLL: [[MIDDLE_BLOCK]]: +; UNROLL-NEXT: store i32 0, ptr [[C]], align 4, !alias.scope [[META20:![0-9]+]], !noalias [[META22:![0-9]+]] +; UNROLL-NEXT: store i32 [[TMP0]], ptr [[B]], align 4, !alias.scope [[META25:![0-9]+]], !noalias [[META26:![0-9]+]] ; UNROLL-NEXT: br label %[[SCALAR_PH]] ; UNROLL: [[SCALAR_PH]]: ; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ] @@ -1104,12 +1104,12 @@ define void @hoistable_predicated_store(ptr %A, ptr %B, ptr %C, ptr %D) { ; UNROLL-NOSIMPLIFY-NEXT: br label 
%[[VECTOR_BODY:.*]] ; UNROLL-NOSIMPLIFY: [[VECTOR_BODY]]: ; UNROLL-NOSIMPLIFY-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; UNROLL-NOSIMPLIFY-NEXT: store i32 0, ptr [[C]], align 4, !alias.scope [[META19:![0-9]+]], !noalias [[META21:![0-9]+]] -; UNROLL-NOSIMPLIFY-NEXT: store i32 [[TMP0]], ptr [[B]], align 4, !alias.scope [[META24:![0-9]+]], !noalias [[META25:![0-9]+]] ; UNROLL-NOSIMPLIFY-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 -; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; UNROLL-NOSIMPLIFY: [[MIDDLE_BLOCK]]: +; UNROLL-NOSIMPLIFY-NEXT: store i32 0, ptr [[C]], align 4, !alias.scope [[META20:![0-9]+]], !noalias [[META22:![0-9]+]] +; UNROLL-NOSIMPLIFY-NEXT: store i32 [[TMP0]], ptr [[B]], align 4, !alias.scope [[META25:![0-9]+]], !noalias [[META26:![0-9]+]] ; UNROLL-NOSIMPLIFY-NEXT: br label %[[SCALAR_PH]] ; UNROLL-NOSIMPLIFY: [[SCALAR_PH]]: ; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ] @@ -1170,12 +1170,12 @@ define void @hoistable_predicated_store(ptr %A, ptr %B, ptr %C, ptr %D) { ; VEC-NEXT: br label %[[VECTOR_BODY:.*]] ; VEC: [[VECTOR_BODY]]: ; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; VEC-NEXT: store i32 0, ptr [[C]], align 4, !alias.scope [[META19:![0-9]+]], !noalias [[META21:![0-9]+]] -; VEC-NEXT: store i32 [[TMP0]], ptr [[B]], align 4, !alias.scope [[META24:![0-9]+]], !noalias [[META25:![0-9]+]] ; VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VEC-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 -; VEC-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop 
[[LOOP26:![0-9]+]] +; VEC-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; VEC: [[MIDDLE_BLOCK]]: +; VEC-NEXT: store i32 0, ptr [[C]], align 4, !alias.scope [[META20:![0-9]+]], !noalias [[META22:![0-9]+]] +; VEC-NEXT: store i32 [[TMP0]], ptr [[B]], align 4, !alias.scope [[META25:![0-9]+]], !noalias [[META26:![0-9]+]] ; VEC-NEXT: br label %[[SCALAR_PH]] ; VEC: [[SCALAR_PH]]: ; VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll index 392328137f089..ec6ff61d055a8 100644 --- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll +++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll @@ -43,15 +43,16 @@ define void @inv_val_store_to_inv_address_conditional_diff_values_ic(ptr %a, i64 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [4 x i8], ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT4]] ; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 4, !alias.scope [[META0]], !noalias [[META3]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[DOTNOT]], <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT4]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[PREDPHI]], i64 3 -; CHECK-NEXT: store i32 [[TMP2]], ptr [[A]], align 4, !alias.scope [[META3]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP5:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[WIDE_LOAD_LCSSA:%.*]] = phi <4 x i32> [ [[WIDE_LOAD]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD_LCSSA]], [[BROADCAST_SPLAT4]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT4]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[PREDPHI]], i64 3 +; CHECK-NEXT: store i32 [[TMP4]], ptr [[A]], align 4, !alias.scope [[META3]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -141,11 +142,11 @@ define void @inv_val_store_to_inv_address_conditional_inv(ptr %a, i64 %n, ptr %b ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [4 x i8], ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 4, !alias.scope [[META9:![0-9]+]], !noalias [[META12:![0-9]+]] -; CHECK-NEXT: store i32 [[K]], ptr [[A]], align 4, !alias.scope [[META12]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: store i32 [[K]], ptr [[A]], align 4, !alias.scope [[META12]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -225,14 +226,15 @@ define i32 @variant_val_store_to_inv_address(ptr %a, i64 %n, ptr %b, i32 %k) { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [4 x i8], ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr 
[[TMP1]], align 8, !alias.scope [[META16:![0-9]+]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 3 -; CHECK-NEXT: store i32 [[TMP2]], ptr [[A]], align 4, !alias.scope [[META19:![0-9]+]], !noalias [[META16]] ; CHECK-NEXT: [[TMP3]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[WIDE_LOAD_LCSSA:%.*]] = phi <4 x i32> [ [[WIDE_LOAD]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP3]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[WIDE_LOAD_LCSSA]], i64 3 +; CHECK-NEXT: store i32 [[TMP6]], ptr [[A]], align 4, !alias.scope [[META20:![0-9]+]], !noalias [[META16]] ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[DOTLCSSA]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll index 601323ae35345..69fc31a3ee23a 100644 --- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll @@ -40,12 +40,12 @@ define i32 @inv_val_store_to_inv_address_with_reduction(ptr %a, i64 %n, ptr %b) ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [4 x i8], ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8, !alias.scope [[META0:![0-9]+]] ; CHECK-NEXT: [[TMP2]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] -; CHECK-NEXT: store i32 [[NTRUNC]], ptr 
[[A]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_BODY]] ] +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope [[META6:![0-9]+]], !noalias [[META0]] ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[DOTLCSSA]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -113,12 +113,12 @@ define void @inv_val_store_to_inv_address(ptr %a, i64 %n, ptr %b) { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [4 x i8], ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope [[META9:![0-9]+]], !noalias [[META12:![0-9]+]] -; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 4, !alias.scope [[META12]] +; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 4, !alias.scope [[META9:![0-9]+]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope [[META13:![0-9]+]], !noalias [[META9]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 
[[SMAX2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/metadata.ll b/llvm/test/Transforms/LoopVectorize/metadata.ll index be30c7629e774..2040dbe247608 100644 --- a/llvm/test/Transforms/LoopVectorize/metadata.ll +++ b/llvm/test/Transforms/LoopVectorize/metadata.ll @@ -530,12 +530,12 @@ define void @noalias_metadata(ptr align 8 %dst, ptr align 8 %src) { ; CHECK-NEXT: [[TMP26:%.*]] = shl i64 [[INDEX]], 3 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP26]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x ptr>, ptr [[NEXT_GEP]], align 8, !alias.scope [[META14:![0-9]+]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x ptr> [[WIDE_LOAD]], i64 1 -; CHECK-NEXT: store ptr [[TMP7]], ptr [[DST]], align 8, !alias.scope [[META17:![0-9]+]], !noalias [[META19:![0-9]+]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x ptr> [[WIDE_LOAD]], i64 1 +; CHECK-NEXT: store ptr [[TMP9]], ptr [[DST]], align 8, !alias.scope [[META18:![0-9]+]], !noalias [[META20:![0-9]+]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: @@ -584,12 +584,12 @@ define void @noalias_metadata(ptr align 8 %dst, ptr align 8 %src) { ; INTERLEAVE-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP26]] ; INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr ptr, ptr [[NEXT_GEP]], i64 2 ; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x ptr>, ptr [[TMP7]], align 8, !alias.scope [[META14:![0-9]+]] -; 
INTERLEAVE-NEXT: [[TMP8:%.*]] = extractelement <2 x ptr> [[WIDE_LOAD]], i64 1 -; INTERLEAVE-NEXT: store ptr [[TMP8]], ptr [[DST]], align 8, !alias.scope [[META17:![0-9]+]], !noalias [[META19:![0-9]+]] ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; INTERLEAVE-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; INTERLEAVE-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; INTERLEAVE-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; INTERLEAVE: [[MIDDLE_BLOCK]]: +; INTERLEAVE-NEXT: [[TMP10:%.*]] = extractelement <2 x ptr> [[WIDE_LOAD]], i64 1 +; INTERLEAVE-NEXT: store ptr [[TMP10]], ptr [[DST]], align 8, !alias.scope [[META18:![0-9]+]], !noalias [[META20:![0-9]+]] ; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]] ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] ; INTERLEAVE: [[SCALAR_PH]]: @@ -655,13 +655,13 @@ attributes #1 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_N2v_bar(bar_ ; CHECK: [[META14]] = !{[[META15:![0-9]+]]} ; CHECK: [[META15]] = distinct !{[[META15]], [[META16:![0-9]+]]} ; CHECK: [[META16]] = distinct !{[[META16]], !"LVerDomain"} -; CHECK: [[META17]] = !{[[META18:![0-9]+]]} -; CHECK: [[META18]] = distinct !{[[META18]], [[META16]]} -; CHECK: [[META19]] = !{[[META20:![0-9]+]], [[META15]]} -; CHECK: [[META20]] = distinct !{[[META20]], [[META21:![0-9]+]], !"g1"} -; CHECK: [[META21]] = distinct !{[[META21]], !"t2"} -; CHECK: [[LOOP22]] = distinct !{[[LOOP22]], [[META5]], [[META6]]} -; CHECK: [[META23]] = !{[[META20]]} +; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META5]], [[META6]]} +; CHECK: [[META18]] = !{[[META19:![0-9]+]]} +; CHECK: [[META19]] = distinct !{[[META19]], [[META16]]} +; CHECK: [[META20]] = !{[[META21:![0-9]+]], [[META15]]} +; CHECK: [[META21]] = distinct !{[[META21]], [[META22:![0-9]+]], !"g1"} +; CHECK: [[META22]] = distinct !{[[META22]], 
!"t2"} +; CHECK: [[META23]] = !{[[META21]]} ; CHECK: [[LOOP24]] = distinct !{[[LOOP24]], [[META5]]} ;. ; INTERLEAVE: [[CHAR_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0, i64 0} @@ -681,12 +681,12 @@ attributes #1 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_N2v_bar(bar_ ; INTERLEAVE: [[META14]] = !{[[META15:![0-9]+]]} ; INTERLEAVE: [[META15]] = distinct !{[[META15]], [[META16:![0-9]+]]} ; INTERLEAVE: [[META16]] = distinct !{[[META16]], !"LVerDomain"} -; INTERLEAVE: [[META17]] = !{[[META18:![0-9]+]]} -; INTERLEAVE: [[META18]] = distinct !{[[META18]], [[META16]]} -; INTERLEAVE: [[META19]] = !{[[META20:![0-9]+]], [[META15]]} -; INTERLEAVE: [[META20]] = distinct !{[[META20]], [[META21:![0-9]+]], !"g1"} -; INTERLEAVE: [[META21]] = distinct !{[[META21]], !"t2"} -; INTERLEAVE: [[LOOP22]] = distinct !{[[LOOP22]], [[META5]], [[META6]]} -; INTERLEAVE: [[META23]] = !{[[META20]]} +; INTERLEAVE: [[LOOP17]] = distinct !{[[LOOP17]], [[META5]], [[META6]]} +; INTERLEAVE: [[META18]] = !{[[META19:![0-9]+]]} +; INTERLEAVE: [[META19]] = distinct !{[[META19]], [[META16]]} +; INTERLEAVE: [[META20]] = !{[[META21:![0-9]+]], [[META15]]} +; INTERLEAVE: [[META21]] = distinct !{[[META21]], [[META22:![0-9]+]], !"g1"} +; INTERLEAVE: [[META22]] = distinct !{[[META22]], !"t2"} +; INTERLEAVE: [[META23]] = !{[[META21]]} ; INTERLEAVE: [[LOOP24]] = distinct !{[[LOOP24]], [[META5]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll b/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll index 38d84f28a2851..af07b18a742f0 100644 --- a/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll +++ b/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll @@ -43,12 +43,12 @@ define void @f() { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: store i32 0, ptr @f.e, align 1, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] -; CHECK-NEXT: store i8 10, ptr [[TMP0]], align 1, !alias.scope [[META3]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 500 -; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: store i32 0, ptr @f.e, align 1, !alias.scope [[META3:![0-9]+]], !noalias [[META6:![0-9]+]] +; CHECK-NEXT: store i8 10, ptr [[TMP0]], align 1, !alias.scope [[META6]] ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll b/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll index 8d9de2b8f51b0..9ffee47c8a31d 100644 --- a/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll +++ b/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll @@ -54,11 +54,11 @@ define void @reduced(ptr %0, ptr %1, i64 %iv, ptr %2, i64 %iv76, i64 %iv93) { ; CHECK-NEXT: br label [[VECTOR_BODY10:%.*]] ; CHECK: vector.body8: ; CHECK-NEXT: [[INDEX12:%.*]] = phi i64 [ 0, [[VECTOR_PH6]] 
], [ [[INDEX_NEXT13:%.*]], [[VECTOR_BODY10]] ] -; CHECK-NEXT: store i32 0, ptr [[TMP1]], align 4, !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]] ; CHECK-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX12]], 4 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK12:%.*]], label [[VECTOR_BODY10]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK12:%.*]], label [[VECTOR_BODY10]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block11: +; CHECK-NEXT: store i32 0, ptr [[TMP1]], align 4, !alias.scope [[META5:![0-9]+]], !noalias [[META8:![0-9]+]] ; CHECK-NEXT: [[CMP_N10:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N10]], label [[LOOP_3_LR_PH:%.*]], label [[SCALAR_PH5]] ; CHECK: scalar.ph3: @@ -84,11 +84,11 @@ define void @reduced(ptr %0, ptr %1, i64 %iv, ptr %2, i64 %iv76, i64 %iv93) { ; CHECK-NEXT: br label [[VECTOR_BODY27:%.*]] ; CHECK: vector.body25: ; CHECK-NEXT: [[INDEX29:%.*]] = phi i64 [ 0, [[VECTOR_PH24]] ], [ [[INDEX_NEXT29:%.*]], [[VECTOR_BODY27]] ] -; CHECK-NEXT: store i32 0, ptr [[TMP1]], align 4, !alias.scope [[META10:![0-9]+]], !noalias [[META13:![0-9]+]] ; CHECK-NEXT: [[INDEX_NEXT29]] = add nuw i64 [[INDEX29]], 4 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT29]], [[N_VEC25]] -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK29:%.*]], label [[VECTOR_BODY27]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK29:%.*]], label [[VECTOR_BODY27]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block28: +; CHECK-NEXT: store i32 0, ptr [[TMP1]], align 4, !alias.scope [[META11:![0-9]+]], !noalias [[META14:![0-9]+]] ; CHECK-NEXT: [[CMP_N27:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC25]] ; CHECK-NEXT: br i1 [[CMP_N27]], label [[LOOP_CLEANUP:%.*]], label [[SCALAR_PH21]] ; CHECK: scalar.ph20: diff --git a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll 
b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll index 1f3a7c974b497..4602d4305376f 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll @@ -885,11 +885,11 @@ define i32 @non_reduc_store_invariant_addr_not_hoisted(ptr %dst, ptr readonly %s ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4, !alias.scope [[META39:![0-9]+]] ; CHECK-NEXT: [[TMP2]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] -; CHECK-NEXT: store i32 0, ptr [[TMP0]], align 4, !alias.scope [[META42:![0-9]+]], !noalias [[META39]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: store i32 0, ptr [[TMP0]], align 4, !alias.scope [[META43:![0-9]+]], !noalias [[META39]] ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]]) ; CHECK-NEXT: br [[EXIT:label %.*]] ; CHECK: [[SCALAR_PH]]: diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check.ll b/llvm/test/Transforms/LoopVectorize/runtime-check.ll index e0f355375e46c..415ea29cd73d2 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-check.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-check.ll @@ -506,11 +506,11 @@ define void @test_scev_check_mul_add_expansion(ptr %out, ptr %in, i32 %len, i32 ; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[OFFSET_IDX]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr [2 x i8], ptr [[OUT]], i64 [[TMP6]] ; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr [[TMP7]], align 2, !alias.scope [[META36:![0-9]+]], !noalias [[META39:![0-9]+]] -; 
CHECK-NEXT: store i32 0, ptr [[IN]], align 4, !alias.scope [[META39]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: store i32 0, ptr [[IN]], align 4, !alias.scope [[META39]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll index 4288580c070d2..ea2d0bcac63b6 100644 --- a/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll +++ b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll @@ -87,11 +87,11 @@ define void @single_scalar_cast_stored(ptr %src, ptr %dst, i32 %n) { ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: store i16 [[TMP5]], ptr [[DST]], align 2, !alias.scope [[META7:![0-9]+]], !noalias [[META4]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: store i16 [[TMP5]], ptr [[DST]], align 2, !alias.scope [[META8:![0-9]+]], !noalias [[META4]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: diff --git a/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll 
b/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll index c9c9d4ecb2026..1b067efe09dc1 100644 --- a/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll @@ -45,12 +45,12 @@ define i16 @test(ptr %arg, i64 %N) { ; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[L_1]], i64 [[TMP4]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, ptr [[TMP5]], align 2, !alias.scope [[META0:![0-9]+]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i16> [[WIDE_LOAD]], i64 1 -; CHECK-NEXT: store i16 [[TMP6]], ptr [[L_2]], align 2, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i16> [[WIDE_LOAD]], i64 1 +; CHECK-NEXT: store i16 [[TMP6]], ptr [[L_2]], align 2, !alias.scope [[META6:![0-9]+]], !noalias [[META0]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: From f8bd135b6faef0675f1b8cb088373337d0439bb8 Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Fri, 19 Jun 2026 09:06:45 +0100 Subject: [PATCH 011/149] [lit] Make RecursionError less likely in internal shell (#204573) The lit internal shell chains together the contents of multiple RUN: lines by connecting them with implicit && nodes, forming a binary tree structure which is then executed recursively by `_executeShCommand`. 
However the tree structure is constructed in a very simple way which makes it effectively just a linked list, so `_executeShCommand` must recurse to a depth equal to the number of commands. If a test file contains more than 1000 RUN: lines (e.g. running the clang driver only, with lots of different options), then this causes a RecursionError exception, which did not happen using the external shell. Failures of this kind can be avoided by instead connecting the commands together in a _balanced_ binary tree, which has equivalent behaviour, since the && shell operator is associative. --- llvm/utils/lit/lit/TestRunner.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py index f59c88599f422..1b0f4ad4eebef 100644 --- a/llvm/utils/lit/lit/TestRunner.py +++ b/llvm/utils/lit/lit/TestRunner.py @@ -691,9 +691,20 @@ def executeScriptInternal( f"shell parser error on {dbg}: {command.lstrip()}\n" ) from None - cmd = cmds[0] - for c in cmds[1:]: - cmd = ShUtil.Seq(cmd, "&&", c) + # Link all of `cmds` into a single command, consisting of the original + # commands chained together with &&. To avoid RecursionError in large tests + # (e.g. with 1000 RUN: lines), we do this by subdividing the list in half + # each time, so that we make a balanced tree structure with depth + # proportional to only the log of the list length. 
+ def make_tree(cmds): + if len(cmds) == 1: + return cmds[0] + else: + assert len(cmds) > 1, "didn't expect an empty sequence" + split = len(cmds) // 2 + return ShUtil.Seq(make_tree(cmds[:split]), "&&", make_tree(cmds[split:])) + + cmd = make_tree(cmds) results = [] timeoutInfo = None From 9361b3d9570596231ffa2b9a1a2ae4ee5fe7eb9d Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Fri, 19 Jun 2026 09:08:11 +0100 Subject: [PATCH 012/149] [LV] Add test for WidenCall with mixed scalar-vector operands (#203092) --- .../widen-call-op-scalar-vector.ll | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 llvm/test/Transforms/LoopVectorize/widen-call-op-scalar-vector.ll diff --git a/llvm/test/Transforms/LoopVectorize/widen-call-op-scalar-vector.ll b/llvm/test/Transforms/LoopVectorize/widen-call-op-scalar-vector.ll new file mode 100644 index 0000000000000..1266a830417a8 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/widen-call-op-scalar-vector.ll @@ -0,0 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck %s + +; Test with call where same value is used as call argument in +; vector and scalar position. 
+define void @test(ptr %p) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i64, ptr [[P]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[VEC_IND]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @vec_foo(<2 x i64> [[VEC_IND]], i64 [[TMP1]]) +; CHECK-NEXT: store <2 x i64> [[TMP2]], ptr [[TMP0]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i64, ptr %p, i64 %iv + %r = call i64 @foo(i64 %iv, i64 %iv) #0 + store i64 %r, ptr %gep, align 8 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +declare i64 @foo(i64, i64) +declare <2 x i64> @vec_foo(<2 x i64>, i64) + +attributes #0 = { nounwind "vector-function-abi-variant"="_ZGVnN2vl1_foo(vec_foo)" } From b9587a78a090fa53bed1dc1b478502e3038a0028 Mon Sep 17 00:00:00 2001 From: Jiang Ning Date: Fri, 19 Jun 2026 16:10:34 +0800 Subject: [PATCH 013/149] [ELF][AArch64] Relax zero TLSLE add to nop (#204286) Optimize AArch64 local-exec TLS relocation handling by replacing a self-add R_AARCH64_TLSLE_ADD_TPREL_HI12
instruction with nop when the high 12 bits are zero. The optimization is disabled by --no-relax and avoids non-equivalent forms such as non-self-adds and 32-bit destination registers. --- lld/ELF/Arch/AArch64.cpp | 8 ++++++++ lld/test/ELF/aarch64-tls-le.s | 17 +++++++++++++++-- lld/test/ELF/aarch64-tlsld-ldst.s | 2 +- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/lld/ELF/Arch/AArch64.cpp b/lld/ELF/Arch/AArch64.cpp index 99b3085852df7..6ec3961158fe0 100644 --- a/lld/ELF/Arch/AArch64.cpp +++ b/lld/ELF/Arch/AArch64.cpp @@ -787,6 +787,14 @@ void AArch64::relocate(uint8_t *loc, const Relocation &rel, break; case R_AARCH64_TLSLE_ADD_TPREL_HI12: checkUInt(ctx, loc, val, 24, rel); + if (ctx.arg.relax && (val >> 12) == 0) { + uint32_t inst = read32le(loc); + // The W-form zero-extends Xd, so only the X-form is a nop. + if ((inst & (1u << 31)) && (inst & 0x1f) == ((inst >> 5) & 0x1f)) { + write32le(loc, 0xd503201f); // nop + break; + } + } write32Imm12(loc, val >> 12); break; case R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: diff --git a/lld/test/ELF/aarch64-tls-le.s b/lld/test/ELF/aarch64-tls-le.s index acbdb387dfc4a..feb269c30b2ab 100644 --- a/lld/test/ELF/aarch64-tls-le.s +++ b/lld/test/ELF/aarch64-tls-le.s @@ -1,7 +1,9 @@ # REQUIRES: aarch64 # RUN: llvm-mc -filetype=obj -triple=aarch64 %s -o %t.o # RUN: ld.lld %t.o -o %t -# RUN: llvm-objdump --no-print-imm-hex -d --no-show-raw-insn %t | FileCheck %s +# RUN: llvm-objdump --no-print-imm-hex -d --no-show-raw-insn %t | FileCheck %s --check-prefixes=CHECK,RELAX +# RUN: ld.lld --no-relax %t.o -o %t.norelax +# RUN: llvm-objdump --no-print-imm-hex -d --no-show-raw-insn %t.norelax | FileCheck %s --check-prefixes=CHECK,NORELAX # RUN: llvm-readobj -S -r %t | FileCheck -check-prefix=RELOC %s #Local-Dynamic to Local-Exec relax creates no @@ -15,6 +17,9 @@ # ERR: error: relocation R_AARCH64_TLSLE_ADD_TPREL_LO12_NC against v1 cannot be used with -shared # ERR: error: relocation R_AARCH64_TLSLE_ADD_TPREL_HI12 against v2 cannot be 
used with -shared # ERR: error: relocation R_AARCH64_TLSLE_ADD_TPREL_LO12_NC against v2 cannot be used with -shared +# ERR: error: relocation R_AARCH64_TLSLE_ADD_TPREL_HI12 against v1 cannot be used with -shared +# ERR: error: relocation R_AARCH64_TLSLE_ADD_TPREL_HI12 against v1 cannot be used with -shared +# ERR: error: relocation R_AARCH64_TLSLE_ADD_TPREL_HI12 against v1 cannot be used with -shared .globl _start _start: @@ -24,16 +29,24 @@ _start: mrs x0, TPIDR_EL0 add x0, x0, :tprel_hi12:v2 add x0, x0, :tprel_lo12_nc:v2 + add x2, x1, :tprel_hi12:v1 + add w3, w3, :tprel_hi12:v1 + add sp, sp, :tprel_hi12:v1 # TCB size = 0x16 and foo is first element from TLS register. #CHECK: Disassembly of section .text: #CHECK: <_start>: #CHECK-NEXT: mrs x0, TPIDR_EL0 -#CHECK-NEXT: add x0, x0, #0, lsl #12 +#RELAX-NEXT: nop +#NORELAX-NEXT: add x0, x0, #0, lsl #12 #CHECK-NEXT: add x0, x0, #16 #CHECK-NEXT: mrs x0, TPIDR_EL0 #CHECK-NEXT: add x0, x0, #4095, lsl #12 #CHECK-NEXT: add x0, x0, #4088 +#CHECK-NEXT: add x2, x1, #0, lsl #12 +#CHECK-NEXT: add w3, w3, #0, lsl #12 +#RELAX-NEXT: nop +#NORELAX-NEXT: add sp, sp, #0, lsl #12 .section .tbss,"awT",@nobits diff --git a/lld/test/ELF/aarch64-tlsld-ldst.s b/lld/test/ELF/aarch64-tlsld-ldst.s index bad35013105fc..72c8f2696545f 100644 --- a/lld/test/ELF/aarch64-tlsld-ldst.s +++ b/lld/test/ELF/aarch64-tlsld-ldst.s @@ -27,7 +27,7 @@ _start: mrs x8, TPIDR_EL0 // CHECK: <_start>: // CHECK-NEXT: 210158: mrs x8, TPIDR_EL0 // 0x0 + c10 = 0xc10 = tcb (16-bytes) + var0 -// CHECK-NEXT: 21015c: add x8, x8, #0, lsl #12 +// CHECK-NEXT: 21015c: nop // CHECK-NEXT: 210160: ldr q20, [x8, #3088] // 0x1000 + 0x820 = 0x1820 = tcb + var1 // CHECK-NEXT: 210164: add x8, x8, #1, lsl #12 From b496d0623824060af20726f89bbf8fe662dd49e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Horv=C3=A1th?= Date: Fri, 19 Jun 2026 09:12:42 +0100 Subject: [PATCH 014/149] [LifetimeSafety] Model bit_cast and atomic casts in the fact generator (#204591) VisitCastExpr dropped 
several borrow-carrying cast kinds into its default case. Propagate the borrow through `__builtin_bit_cast`/`std::bit_cast` of a pointer and through wrapping/unwrapping `_Atomic(T*)`, so a stack address laundered through either is caught (matching reinterpret_cast). hasOrigins and buildListForType now see through AtomicType, which is transparent for lifetimes. Assisted-by: Claude Opus 4.8 Co-authored-by: Gabor Horvath --- .../LifetimeSafety/FactsGenerator.cpp | 14 ++++++++++ clang/lib/Analysis/LifetimeSafety/Origins.cpp | 7 +++++ clang/test/Sema/LifetimeSafety/safety-c.c | 17 +++++++++-- clang/test/Sema/LifetimeSafety/safety.cpp | 28 +++++++++++++++++++ 4 files changed, 64 insertions(+), 2 deletions(-) diff --git a/clang/lib/Analysis/LifetimeSafety/FactsGenerator.cpp b/clang/lib/Analysis/LifetimeSafety/FactsGenerator.cpp index 4b5a776b2bae7..3861117005752 100644 --- a/clang/lib/Analysis/LifetimeSafety/FactsGenerator.cpp +++ b/clang/lib/Analysis/LifetimeSafety/FactsGenerator.cpp @@ -342,6 +342,20 @@ void FactsGenerator::VisitCastExpr(const CastExpr *CE) { if (Src && Dest && Dest->getLength() == Src->getLength()) flow(Dest, Src, /*Kill=*/true); return; + case CK_LValueToRValueBitCast: + case CK_NonAtomicToAtomic: + case CK_AtomicToNonAtomic: { + // `__builtin_bit_cast`/`std::bit_cast` of a pointer, and + // wrapping/unwrapping `_Atomic(T*)`, preserve the pointer value, so + // propagate the borrow. The operand may be a glvalue, so strip its outer + // lvalue level first. A bit-cast that materializes a pointer from a + // non-pointer representation has no matching source origin and is + // untracked. 
+ OriginList *RVSrc = getRValueOrigins(SubExpr, Src); + if (RVSrc && Dest->getLength() == RVSrc->getLength()) + flow(Dest, RVSrc, /*Kill=*/true); + return; + } default: return; } diff --git a/clang/lib/Analysis/LifetimeSafety/Origins.cpp b/clang/lib/Analysis/LifetimeSafety/Origins.cpp index 3ff4823ca88a6..c837f246fa17b 100644 --- a/clang/lib/Analysis/LifetimeSafety/Origins.cpp +++ b/clang/lib/Analysis/LifetimeSafety/Origins.cpp @@ -106,6 +106,10 @@ bool OriginManager::hasOrigins(QualType QT, bool IntrinsicOnly) const { if (!IntrinsicOnly && LifetimeAnnotatedOriginTypes.contains(QT.getCanonicalType().getTypePtr())) return true; + // An `_Atomic(T)` wraps T transparently for lifetime purposes (the atomic + // holds the same value); see through it. + if (const auto *AT = QT->getAs<AtomicType>()) + return hasOrigins(AT->getValueType(), IntrinsicOnly); const auto *RD = QT->getAsCXXRecordDecl(); if (!RD) return false; @@ -194,6 +198,9 @@ OriginList *OriginManager::createSingleOriginList(OriginID OID) { template <typename T> OriginList *OriginManager::buildListForType(QualType QT, const T *Node) { assert(hasOrigins(QT) && "buildListForType called for non-pointer type"); + // `_Atomic(T)` is transparent for lifetime purposes: build the node for T. + if (const auto *AT = QT->getAs<AtomicType>()) + return buildListForType(AT->getValueType(), Node); OriginList *Head = createNode(Node, QT); if (QT->isPointerOrReferenceType()) { diff --git a/clang/test/Sema/LifetimeSafety/safety-c.c b/clang/test/Sema/LifetimeSafety/safety-c.c index 13b92a8d81db4..9ab2a57cb08a9 100644 --- a/clang/test/Sema/LifetimeSafety/safety-c.c +++ b/clang/test/Sema/LifetimeSafety/safety-c.c @@ -173,11 +173,24 @@ void *void_pointer_dereference(void) { return &*bytes; } -// FIXME: Atomics are not modeled yet. +// `_Atomic(T)` is transparent for lifetime purposes; a stack address laundered +// through an atomic is caught.
int *atomic_pointer_declref(void) { int value; + _Atomic(int *) p = &value; // expected-warning {{stack memory associated with local variable 'value' is returned}} + return p; // expected-note {{returned here}} +} + +int *atomic_pointer_static(void) { + static int value; _Atomic(int *) p = &value; - return p; + return p; // no-warning +} + +int **atomic_pointer_multilevel(void) { + int *inner; + _Atomic(int **) p = &inner; // expected-warning {{stack memory associated with local variable 'inner' is returned}} + return p; // expected-note {{returned here}} } // In C, a pointer compound assignment is a prvalue; its result still carries diff --git a/clang/test/Sema/LifetimeSafety/safety.cpp b/clang/test/Sema/LifetimeSafety/safety.cpp index 7a2644e46a6e1..65bfe69e854ac 100644 --- a/clang/test/Sema/LifetimeSafety/safety.cpp +++ b/clang/test/Sema/LifetimeSafety/safety.cpp @@ -1435,6 +1435,34 @@ void use_trivial_temporary_after_destruction() { use(a); // expected-note {{later used here}} } +namespace cast_modeling { +// A pointer bit-cast (`__builtin_bit_cast`/`std::bit_cast`) preserves the +// value, so a borrow flowed through it is tracked (matching reinterpret_cast). 
+int *bit_cast_stack() { + int x = 0; + return __builtin_bit_cast(int *, &x); // expected-warning {{stack memory associated with local variable 'x' is returned}} expected-note {{returned here}} +} + +int *bit_cast_static() { + static int s = 0; + return __builtin_bit_cast(int *, &s); // no-warning +} + +void bit_cast_use_after_scope() { + int *p; + { + int local = 0; + p = __builtin_bit_cast(int *, &local); // expected-warning {{local variable 'local' does not live long enough}} + } // expected-note {{destroyed here}} + (void)*p; // expected-note {{later used here}} +} + +int **bit_cast_multilevel() { + int *p = nullptr; + return __builtin_bit_cast(int **, &p); // expected-warning {{stack memory associated with local variable 'p' is returned}} expected-note {{returned here}} +} +} // namespace cast_modeling + namespace FullExprCleanupLoc { void var_initializer() { View v = non_trivially_destructed_temporary() // expected-warning {{temporary object does not live long enough}} \ From c327ab359a959de2e4241a5fcda409958f2c0d11 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Fri, 19 Jun 2026 09:36:00 +0100 Subject: [PATCH 015/149] [AArch64] Fix Windows target detection in FrameLowering (#204347) In #156467, we switched to using `getMCAsmInfo()->usesWindowsCFI()` to recognize "Windows". This does not include Windows triples with ELF binary formats. So, for aarch64-pc-windows-msvc-elf we would use the Windows callee-save list in `AArch64RegisterInfo::getCalleeSavedRegs()`, but FrameLowering would handle this like Linux, and fail to invalidate the (x29, x28) pairing. This patch switches back to using AArch64Subtarget::isTargetWindows(), which aligns with getCalleeSavedRegs(). Note: We were using `usesWindowsCFI()` to include UEFI targets, however, there does not seem to be tests/support for UEFI triples on AArch64 (basic examples that compile for x86 fail: https://godbolt.org/z/dPWdTrEG7). So, this has been moved to a TODO. 
Fixes #204060 --- .../Target/AArch64/AArch64FrameLowering.cpp | 7 +++- .../windows-elf-frame-record-pairing.ll | 36 +++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AArch64/windows-elf-frame-record-pairing.ll diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 2fd8872dfccd1..026f807124d2f 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -384,7 +384,12 @@ static bool isLikelyToHaveSVEStack(const AArch64FrameLowering &AFL, } static bool isTargetWindows(const MachineFunction &MF) { - return MF.getTarget().getMCAsmInfo().usesWindowsCFI(); + // TODO: Should this include targets like UEFI (which use Windows CFI)? + // Note: Currently, there is not AArch64 support for UEFI. The value returned + // here must align with the predicate used for returning the list of callee + // saved regs in AArch64RegisterInfo::getCalleeSavedRegs(), so that we use + // invalidateWindowsRegisterPairing() where appropriate. + return MF.getSubtarget<AArch64Subtarget>().isTargetWindows(); } bool AArch64FrameLowering::hasSVECalleeSavesAboveFrameRecord( diff --git a/llvm/test/CodeGen/AArch64/windows-elf-frame-record-pairing.ll b/llvm/test/CodeGen/AArch64/windows-elf-frame-record-pairing.ll new file mode 100644 index 0000000000000..dd1c6116b72ea --- /dev/null +++ b/llvm/test/CodeGen/AArch64/windows-elf-frame-record-pairing.ll @@ -0,0 +1,36 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s | FileCheck %s + +target triple = "aarch64-pc-windows-msvc-elf" + +; This test uses a Windows triple with ELF binaries. This triple does not use +; Windows CFI, although it still uses CSR_Win_AArch64_AAPCS_SaveList for +; callee-saves. + +; This test checks we do not attempt to pair x28 with the frame pointer (x29).
+; Previously we did not recognize aarch64-pc-windows-msvc-elf as Windows +; (in FrameLowering), and failed to invalidate the pairing because the code +; assumed it was using the default CSR_AArch64_AAPCS_SaveList instead of +; CSR_Win_AArch64_AAPCS_SaveList. +define i32 @large_stack_requires_frame_record() "frame-pointer"="all" nounwind { +; CHECK-LABEL: large_stack_requires_frame_record: +; CHECK: // %bb.0: +; CHECK-NEXT: str x28, [sp, #-32]! // 8-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #8] // 16-byte Folded Spill +; CHECK-NEXT: add x29, sp, #8 +; CHECK-NEXT: sub sp, sp, #512 +; CHECK-NEXT: adrp x8, :got:baz +; CHECK-NEXT: mov x0, sp +; CHECK-NEXT: ldr x8, [x8, :got_lo12:baz] +; CHECK-NEXT: blr x8 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: add sp, sp, #512 +; CHECK-NEXT: ldp x29, x30, [sp, #8] // 16-byte Folded Reload +; CHECK-NEXT: ldr x28, [sp], #32 // 8-byte Folded Reload +; CHECK-NEXT: ret + %x = alloca [500 x i8], align 16 + call void @baz(ptr %x) + ret i32 0 +} + +declare void @baz(ptr) From 06137a53cc96643f17f34c6affa8838c5c86e2cb Mon Sep 17 00:00:00 2001 From: Fabrice de Gans Date: Fri, 19 Jun 2026 09:20:01 +0000 Subject: [PATCH 016/149] [llvm] Remove LLVM_ABI_FOR_TEST in public headers (#204627) These annotations were mistakenly set up as LLVM_ABI_FOR_TEST. Since these are public headers, they should be using LLVM_ABI. The effort to build LLVM as a dylib is tracked in #109483. 
--- llvm/include/llvm/BinaryFormat/DXContainer.h | 2 +- llvm/include/llvm/CAS/ActionCache.h | 2 +- llvm/include/llvm/CAS/MappedFileRegionArena.h | 6 +- llvm/include/llvm/CAS/OnDiskDataAllocator.h | 17 +- llvm/include/llvm/CAS/OnDiskGraphDB.h | 32 ++- llvm/include/llvm/CAS/OnDiskKeyValueDB.h | 11 +- llvm/include/llvm/CAS/OnDiskTrieRawHashMap.h | 20 +- llvm/include/llvm/CAS/UnifiedOnDiskCache.h | 20 +- llvm/include/llvm/CGData/StableFunctionMap.h | 2 +- llvm/include/llvm/CodeGen/MIR2Vec.h | 12 +- llvm/include/llvm/IR/BasicBlock.h | 2 +- llvm/include/llvm/Option/ArgList.h | 2 +- llvm/include/llvm/SandboxIR/Argument.h | 2 +- llvm/include/llvm/SandboxIR/BasicBlock.h | 2 +- llvm/include/llvm/SandboxIR/Function.h | 4 +- llvm/include/llvm/SandboxIR/Instruction.h | 4 +- llvm/include/llvm/SandboxIR/Pass.h | 2 +- llvm/include/llvm/SandboxIR/Region.h | 4 +- llvm/include/llvm/SandboxIR/Tracker.h | 4 +- llvm/include/llvm/SandboxIR/Use.h | 2 +- llvm/include/llvm/Support/Compiler.h | 10 +- llvm/include/llvm/Support/GlobPattern.h | 2 +- llvm/include/llvm/Support/LSP/Logging.h | 6 +- llvm/include/llvm/Support/LSP/Protocol.h | 211 ++++++++---------- llvm/include/llvm/Support/LSP/Transport.h | 12 +- .../llvm/Support/VirtualOutputConfig.h | 7 +- .../llvm/Transforms/Utils/DebugSSAUpdater.h | 2 +- .../llvm/Transforms/Utils/SSAUpdaterBulk.h | 4 +- 28 files changed, 192 insertions(+), 214 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index a4e45c3a2fc9c..4d832368ae888 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -203,7 +203,7 @@ enum class RootParameterType : uint32_t { LLVM_ABI ArrayRef> getRootParameterTypes(); -LLVM_ABI_FOR_TEST bool isValidParameterType(uint32_t V); +LLVM_ABI bool isValidParameterType(uint32_t V); LLVM_ABI bool isValidRangeType(uint32_t V); diff --git a/llvm/include/llvm/CAS/ActionCache.h b/llvm/include/llvm/CAS/ActionCache.h index 
97b0e6de740c9..76ca24bb198a1 100644 --- a/llvm/include/llvm/CAS/ActionCache.h +++ b/llvm/include/llvm/CAS/ActionCache.h @@ -34,7 +34,7 @@ class CacheKey { StringRef getKey() const { return Key; } LLVM_ABI CacheKey(const CASID &ID); - LLVM_ABI_FOR_TEST CacheKey(const ObjectProxy &Proxy); + LLVM_ABI CacheKey(const ObjectProxy &Proxy); LLVM_ABI CacheKey(const ObjectStore &CAS, const ObjectRef &Ref); private: diff --git a/llvm/include/llvm/CAS/MappedFileRegionArena.h b/llvm/include/llvm/CAS/MappedFileRegionArena.h index a00bfa7306ef6..f571fe7e097a0 100644 --- a/llvm/include/llvm/CAS/MappedFileRegionArena.h +++ b/llvm/include/llvm/CAS/MappedFileRegionArena.h @@ -65,7 +65,7 @@ class MappedFileRegionArena { /// that information can be stored before the header, like a file magic. /// \param NewFileConstructor is for constructing new files. It has exclusive /// access to the file. Must call \c initializeBumpPtr. - LLVM_ABI_FOR_TEST static Expected + LLVM_ABI static Expected create(const Twine &Path, uint64_t Capacity, uint64_t HeaderOffset, std::shared_ptr Logger, function_ref NewFileConstructor); @@ -86,7 +86,7 @@ class MappedFileRegionArena { return data() + *Offset; } /// Allocate, returning the offset from \a data() instead of a pointer. - LLVM_ABI_FOR_TEST Expected allocateOffset(uint64_t AllocSize); + LLVM_ABI Expected allocateOffset(uint64_t AllocSize); char *data() const { return Region.data(); } uint64_t size() const { return H->BumpPtr; } @@ -111,7 +111,7 @@ class MappedFileRegionArena { // initialize header from offset. 
Error initializeHeader(uint64_t HeaderOffset); - LLVM_ABI_FOR_TEST void destroyImpl(); + LLVM_ABI void destroyImpl(); void moveImpl(MappedFileRegionArena &RHS) { std::swap(Region, RHS.Region); std::swap(H, RHS.H); diff --git a/llvm/include/llvm/CAS/OnDiskDataAllocator.h b/llvm/include/llvm/CAS/OnDiskDataAllocator.h index 9af8a0aceaed6..18b8a39457280 100644 --- a/llvm/include/llvm/CAS/OnDiskDataAllocator.h +++ b/llvm/include/llvm/CAS/OnDiskDataAllocator.h @@ -58,34 +58,33 @@ class OnDiskDataAllocator { /// Get the data of \p Size stored at the given \p Offset. Note the allocator /// doesn't keep track of the allocation size, thus \p Size doesn't need to /// match the size of allocation but needs to be smaller. - LLVM_ABI_FOR_TEST Expected> get(FileOffset Offset, - size_t Size) const; + LLVM_ABI Expected> get(FileOffset Offset, size_t Size) const; /// Allocate at least \p Size with 8-byte alignment. - LLVM_ABI_FOR_TEST Expected allocate(size_t Size); + LLVM_ABI Expected allocate(size_t Size); /// \returns the buffer that was allocated at \p create time, with size /// \p UserHeaderSize. LLVM_ABI MutableArrayRef getUserHeader() const; - LLVM_ABI_FOR_TEST size_t size() const; - LLVM_ABI_FOR_TEST size_t capacity() const; + LLVM_ABI size_t size() const; + LLVM_ABI size_t capacity() const; - LLVM_ABI_FOR_TEST static Expected + LLVM_ABI static Expected create(const Twine &Path, const Twine &TableName, uint64_t MaxFileSize, std::optional NewFileInitialSize, uint32_t UserHeaderSize = 0, std::shared_ptr Logger = nullptr, function_ref UserHeaderInit = nullptr); - LLVM_ABI_FOR_TEST OnDiskDataAllocator(OnDiskDataAllocator &&RHS); - LLVM_ABI_FOR_TEST OnDiskDataAllocator &operator=(OnDiskDataAllocator &&RHS); + LLVM_ABI OnDiskDataAllocator(OnDiskDataAllocator &&RHS); + LLVM_ABI OnDiskDataAllocator &operator=(OnDiskDataAllocator &&RHS); // No copy. Just call \a create() again. 
OnDiskDataAllocator(const OnDiskDataAllocator &) = delete; OnDiskDataAllocator &operator=(const OnDiskDataAllocator &) = delete; - LLVM_ABI_FOR_TEST ~OnDiskDataAllocator(); + LLVM_ABI ~OnDiskDataAllocator(); private: struct ImplType; diff --git a/llvm/include/llvm/CAS/OnDiskGraphDB.h b/llvm/include/llvm/CAS/OnDiskGraphDB.h index 8ce7cebf773a5..f994a7134a364 100644 --- a/llvm/include/llvm/CAS/OnDiskGraphDB.h +++ b/llvm/include/llvm/CAS/OnDiskGraphDB.h @@ -261,8 +261,8 @@ class OnDiskGraphDB { /// already a record for this object the operation is a no-op. \param ID the /// object ID to associate the data & references with. \param Refs references /// \param Data data buffer. - LLVM_ABI_FOR_TEST Error store(ObjectID ID, ArrayRef Refs, - ArrayRef Data); + LLVM_ABI Error store(ObjectID ID, ArrayRef Refs, + ArrayRef Data); /// Associates the data of a file with a particular object ID. If there is /// already a record for this object the operation is a no-op. @@ -275,10 +275,10 @@ class OnDiskGraphDB { /// /// \param ID the object ID to associate the data with. /// \param FilePath the path of the file data. - LLVM_ABI_FOR_TEST Error storeFile(ObjectID ID, StringRef FilePath); + LLVM_ABI Error storeFile(ObjectID ID, StringRef FilePath); /// \returns \p nullopt if the object associated with \p Ref does not exist. - LLVM_ABI_FOR_TEST Expected> load(ObjectID Ref); + LLVM_ABI Expected> load(ObjectID Ref); /// \returns the hash bytes digest for the object reference. ArrayRef getDigest(ObjectID Ref) const { @@ -288,12 +288,12 @@ class OnDiskGraphDB { /// Form a reference for the provided hash. The reference can be used as part /// of a CAS object even if it's not associated with an object yet. - LLVM_ABI_FOR_TEST Expected getReference(ArrayRef Hash); + LLVM_ABI Expected getReference(ArrayRef Hash); /// Get an existing reference to the object \p Digest. /// /// Returns \p nullopt if the object is not stored in this CAS. 
- LLVM_ABI_FOR_TEST std::optional + LLVM_ABI std::optional getExistingReference(ArrayRef Digest, bool CheckUpstream = true); /// Check whether the object associated with \p Ref is stored in the CAS. @@ -320,7 +320,7 @@ class OnDiskGraphDB { } /// \returns the data part of the provided object handle. - LLVM_ABI_FOR_TEST ArrayRef getObjectData(ObjectHandle Node) const; + LLVM_ABI ArrayRef getObjectData(ObjectHandle Node) const; /// \returns the object referenced by the provided object handle. object_refs_range getObjectRefs(ObjectHandle Node) const { @@ -358,7 +358,7 @@ class OnDiskGraphDB { /// /// NOTE: There's a possibility that the returned size is not including a /// large object if the process crashed right at the point of inserting it. - LLVM_ABI_FOR_TEST size_t getStorageSize() const; + LLVM_ABI size_t getStorageSize() const; /// \returns The precentage of space utilization of hard space limits. /// @@ -381,7 +381,7 @@ class OnDiskGraphDB { /// Checks that \p ID exists in the index. It is allowed to not have data /// associated with it. - LLVM_ABI_FOR_TEST Error validateObjectID(ObjectID ID) const; + LLVM_ABI Error validateObjectID(ObjectID ID) const; /// How to fault-in nodes if an upstream database is used. enum class FaultInPolicy { @@ -407,13 +407,13 @@ class OnDiskGraphDB { /// \param Policy If \p UpstreamDB is provided, controls how nodes are copied /// to primary store. This is recorded at creation time and subsequent opens /// need to pass the same policy otherwise the \p open will fail. - LLVM_ABI_FOR_TEST static Expected> + LLVM_ABI static Expected> open(StringRef Path, StringRef HashName, unsigned HashByteSize, OnDiskGraphDB *UpstreamDB = nullptr, std::shared_ptr Logger = nullptr, FaultInPolicy Policy = FaultInPolicy::FullTree); - LLVM_ABI_FOR_TEST ~OnDiskGraphDB(); + LLVM_ABI ~OnDiskGraphDB(); private: /// Forward declaration for a proxy for an ondisk index record. 
@@ -426,8 +426,8 @@ class OnDiskGraphDB { }; /// Check if object exists and if it is on upstream only. - LLVM_ABI_FOR_TEST Expected - getObjectPresence(ObjectID Ref, bool CheckUpstream) const; + LLVM_ABI Expected getObjectPresence(ObjectID Ref, + bool CheckUpstream) const; /// When \p load is called for a node that doesn't exist, this function tries /// to load it from the upstream store and copy it to the primary one. @@ -468,8 +468,7 @@ class OnDiskGraphDB { static InternalRef makeInternalRef(FileOffset IndexOffset); - LLVM_ABI_FOR_TEST Expected> - getDigest(InternalRef Ref) const; + LLVM_ABI Expected> getDigest(InternalRef Ref) const; ArrayRef getDigest(const IndexProxy &I) const; @@ -478,8 +477,7 @@ class OnDiskGraphDB { IndexProxy getIndexProxyFromPointer(OnDiskTrieRawHashMap::ConstOnDiskPtr P) const; - LLVM_ABI_FOR_TEST InternalRefArrayRef - getInternalRefs(ObjectHandle Node) const; + LLVM_ABI InternalRefArrayRef getInternalRefs(ObjectHandle Node) const; /// \} /// Get the atomic variable that keeps track of the standalone data storage diff --git a/llvm/include/llvm/CAS/OnDiskKeyValueDB.h b/llvm/include/llvm/CAS/OnDiskKeyValueDB.h index 68cced665f28e..2de04289199fc 100644 --- a/llvm/include/llvm/CAS/OnDiskKeyValueDB.h +++ b/llvm/include/llvm/CAS/OnDiskKeyValueDB.h @@ -36,13 +36,12 @@ class OnDiskKeyValueDB { /// /// \returns the value associated with the \p Key. It may be different than /// \p Value if another value is already associated with this key. - LLVM_ABI_FOR_TEST Expected> put(ArrayRef Key, - ArrayRef Value); + LLVM_ABI Expected> put(ArrayRef Key, + ArrayRef Value); /// \returns the value associated with the \p Key, or \p std::nullopt if the /// key does not exist. - LLVM_ABI_FOR_TEST Expected>> - get(ArrayRef Key); + LLVM_ABI Expected>> get(ArrayRef Key); /// \returns Total size of stored data. 
size_t getStorageSize() const { return Cache.size(); } @@ -66,14 +65,14 @@ class OnDiskKeyValueDB { /// \param UnifiedCache An optional UnifiedOnDiskCache that manages the size /// and lifetime of the CAS instance and it must owns current initializing /// KeyValueDB after initialized. - LLVM_ABI_FOR_TEST static Expected> + LLVM_ABI static Expected> open(StringRef Path, StringRef HashName, unsigned KeySize, StringRef ValueName, size_t ValueSize, UnifiedOnDiskCache *UnifiedCache = nullptr, std::shared_ptr Logger = nullptr); /// Validate the storage. - LLVM_ABI_FOR_TEST Error validate() const; + LLVM_ABI Error validate() const; private: OnDiskKeyValueDB(size_t ValueSize, OnDiskTrieRawHashMap Cache, diff --git a/llvm/include/llvm/CAS/OnDiskTrieRawHashMap.h b/llvm/include/llvm/CAS/OnDiskTrieRawHashMap.h index e4963aa5b4f6d..1745b3a2791be 100644 --- a/llvm/include/llvm/CAS/OnDiskTrieRawHashMap.h +++ b/llvm/include/llvm/CAS/OnDiskTrieRawHashMap.h @@ -86,7 +86,7 @@ class OnDiskTrieRawHashMap { /// Validate the trie data structure. /// /// Callback receives the file offset to the data entry and the data stored. - LLVM_ABI_FOR_TEST Error validate( + LLVM_ABI Error validate( function_ref RecordVerifier) const; /// Check the valid range of file offset for OnDiskTrieRawHashMap. @@ -164,10 +164,10 @@ class OnDiskTrieRawHashMap { /// /// \returns pointer to the value if exists, otherwise returns a non-value /// pointer that evaluates to `false` when convert to boolean. - LLVM_ABI_FOR_TEST ConstOnDiskPtr find(ArrayRef Hash) const; + LLVM_ABI ConstOnDiskPtr find(ArrayRef Hash) const; /// Helper function to recover a pointer into the trie from file offset. 
- LLVM_ABI_FOR_TEST Expected + LLVM_ABI Expected recoverFromFileOffset(FileOffset Offset) const; using LazyInsertOnConstructCB = @@ -190,7 +190,7 @@ class OnDiskTrieRawHashMap { /// The in-memory \a TrieRawHashMap uses LazyAtomicPointer to synchronize /// simultaneous writes, but that seems dangerous to use in a memory-mapped /// file in case a process crashes in the busy state. - LLVM_ABI_FOR_TEST Expected + LLVM_ABI Expected insertLazy(ArrayRef Hash, LazyInsertOnConstructCB OnConstruct = nullptr, LazyInsertOnLeakCB OnLeak = nullptr); @@ -203,8 +203,8 @@ class OnDiskTrieRawHashMap { }); } - LLVM_ABI_FOR_TEST size_t size() const; - LLVM_ABI_FOR_TEST size_t capacity() const; + LLVM_ABI size_t size() const; + LLVM_ABI size_t capacity() const; /// Gets or creates a file at \p Path with a hash-mapped trie named \p /// TrieName. The hash size is \p NumHashBits (in bits) and the records store @@ -218,7 +218,7 @@ class OnDiskTrieRawHashMap { /// configure the trie, if it doesn't already exist. /// /// \pre NumHashBits is a multiple of 8 (byte-aligned). 
- LLVM_ABI_FOR_TEST static Expected + LLVM_ABI static Expected create(const Twine &Path, const Twine &TrieName, size_t NumHashBits, uint64_t DataSize, uint64_t MaxFileSize, std::optional NewFileInitialSize, @@ -226,9 +226,9 @@ class OnDiskTrieRawHashMap { std::optional NewTableNumRootBits = std::nullopt, std::optional NewTableNumSubtrieBits = std::nullopt); - LLVM_ABI_FOR_TEST OnDiskTrieRawHashMap(OnDiskTrieRawHashMap &&RHS); - LLVM_ABI_FOR_TEST OnDiskTrieRawHashMap &operator=(OnDiskTrieRawHashMap &&RHS); - LLVM_ABI_FOR_TEST ~OnDiskTrieRawHashMap(); + LLVM_ABI OnDiskTrieRawHashMap(OnDiskTrieRawHashMap &&RHS); + LLVM_ABI OnDiskTrieRawHashMap &operator=(OnDiskTrieRawHashMap &&RHS); + LLVM_ABI ~OnDiskTrieRawHashMap(); private: struct ImplType; diff --git a/llvm/include/llvm/CAS/UnifiedOnDiskCache.h b/llvm/include/llvm/CAS/UnifiedOnDiskCache.h index 45bee3902daef..24bc4a36883a2 100644 --- a/llvm/include/llvm/CAS/UnifiedOnDiskCache.h +++ b/llvm/include/llvm/CAS/UnifiedOnDiskCache.h @@ -64,7 +64,7 @@ class UnifiedOnDiskCache { /// \param FaultInPolicy Controls how nodes are copied to primary store. This /// is recorded at creation time and subsequent opens need to pass the same /// policy otherwise the \p open will fail. - LLVM_ABI_FOR_TEST static Expected> + LLVM_ABI static Expected> open(StringRef Path, std::optional SizeLimit, StringRef HashName, unsigned HashByteSize, OnDiskGraphDB::FaultInPolicy FaultInPolicy = @@ -100,7 +100,7 @@ class UnifiedOnDiskCache { std::optional LLVMCasBinary); /// Validate the action cache only. - LLVM_ABI_FOR_TEST Error validateActionCache() const; + LLVM_ABI Error validateActionCache() const; /// This is called implicitly at destruction time, so it is not required for a /// client to call this. After calling \p close the only method that is valid @@ -109,20 +109,20 @@ class UnifiedOnDiskCache { /// \param CheckSizeLimit if true it will check whether the primary store has /// exceeded its intended size limit. 
If false the check is skipped even if a /// \p SizeLimit was passed to the \p open call. - LLVM_ABI_FOR_TEST Error close(bool CheckSizeLimit = true); + LLVM_ABI Error close(bool CheckSizeLimit = true); /// Set the size for limiting growth. This has an effect for when the instance /// is closed. - LLVM_ABI_FOR_TEST void setSizeLimit(std::optional SizeLimit); + LLVM_ABI void setSizeLimit(std::optional SizeLimit); /// \returns the storage size of the cache data. - LLVM_ABI_FOR_TEST uint64_t getStorageSize() const; + LLVM_ABI uint64_t getStorageSize() const; /// \returns whether the primary store has exceeded the intended size limit. /// This can return false even if the overall size of the opened directory is /// over the \p SizeLimit passed to \p open. To know whether garbage /// collection needs to be triggered or not, call \p needsGarbaseCollection. - LLVM_ABI_FOR_TEST bool hasExceededSizeLimit() const; + LLVM_ABI bool hasExceededSizeLimit() const; /// \returns whether there are unused data that can be deleted using a /// \p collectGarbage call. @@ -137,19 +137,19 @@ class UnifiedOnDiskCache { /// /// It is recommended that garbage-collection is triggered concurrently in the /// background, so that it has minimal effect on the workload of the process. - LLVM_ABI_FOR_TEST static Error + LLVM_ABI static Error collectGarbage(StringRef Path, ondisk::OnDiskCASLogger *Logger = nullptr); /// Remove unused data from the current UnifiedOnDiskCache. LLVM_ABI Error collectGarbage(); /// Helper function to convert the value stored in KeyValueDB and ObjectID. 
- LLVM_ABI_FOR_TEST static ObjectID getObjectIDFromValue(ArrayRef Value); + LLVM_ABI static ObjectID getObjectIDFromValue(ArrayRef Value); using ValueBytes = std::array; - LLVM_ABI_FOR_TEST static ValueBytes getValueFromObjectID(ObjectID ID); + LLVM_ABI static ValueBytes getValueFromObjectID(ObjectID ID); - LLVM_ABI_FOR_TEST ~UnifiedOnDiskCache(); + LLVM_ABI ~UnifiedOnDiskCache(); private: friend class OnDiskGraphDB; diff --git a/llvm/include/llvm/CGData/StableFunctionMap.h b/llvm/include/llvm/CGData/StableFunctionMap.h index 94585e958485e..909d0e1debe7e 100644 --- a/llvm/include/llvm/CGData/StableFunctionMap.h +++ b/llvm/include/llvm/CGData/StableFunctionMap.h @@ -105,7 +105,7 @@ struct StableFunctionMap { using HashFuncsMapType = std::unordered_map; /// Get the HashToFuncs map for serialization. - LLVM_ABI_FOR_TEST const HashFuncsMapType &getFunctionMap() const; + LLVM_ABI const HashFuncsMapType &getFunctionMap() const; /// Get the NameToId vector for serialization. ArrayRef getNames() const { return IdToName; } diff --git a/llvm/include/llvm/CodeGen/MIR2Vec.h b/llvm/include/llvm/CodeGen/MIR2Vec.h index 8737ea786ed3d..6f7b6dfce4378 100644 --- a/llvm/include/llvm/CodeGen/MIR2Vec.h +++ b/llvm/include/llvm/CodeGen/MIR2Vec.h @@ -211,16 +211,14 @@ class MIRVocabulary { public: /// Static method for extracting base opcode names (public for testing) - LLVM_ABI_FOR_TEST static std::string - extractBaseOpcodeName(StringRef InstrName); + LLVM_ABI static std::string extractBaseOpcodeName(StringRef InstrName); /// Get indices from opcode or operand names. These are public for testing. /// String based lookups are inefficient and should be avoided in general. 
- LLVM_ABI_FOR_TEST unsigned - getCanonicalIndexForBaseName(StringRef BaseName) const; - LLVM_ABI_FOR_TEST unsigned + LLVM_ABI unsigned getCanonicalIndexForBaseName(StringRef BaseName) const; + LLVM_ABI unsigned getCanonicalIndexForOperandName(StringRef OperandName) const; - LLVM_ABI_FOR_TEST unsigned + LLVM_ABI unsigned getCanonicalIndexForRegisterClass(StringRef RegName, bool IsPhysical = true) const; @@ -266,7 +264,7 @@ class MIRVocabulary { MIRVocabulary() = delete; /// Factory method to create MIRVocabulary from vocabulary map - LLVM_ABI_FOR_TEST static Expected + LLVM_ABI static Expected create(VocabMap &&OpcMap, VocabMap &&CommonOperandsMap, VocabMap &&PhyRegMap, VocabMap &&VirtRegMap, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI); diff --git a/llvm/include/llvm/IR/BasicBlock.h b/llvm/include/llvm/IR/BasicBlock.h index 6b91902571b24..1c5fea7183ad1 100644 --- a/llvm/include/llvm/IR/BasicBlock.h +++ b/llvm/include/llvm/IR/BasicBlock.h @@ -747,7 +747,7 @@ class BasicBlock final : public Value, // Basic blocks are data objects also /// instructions, so the order should be validated no more than once after /// each ordering to ensure that transforms have the same algorithmic /// complexity when asserts are enabled as when they are disabled. - LLVM_ABI_FOR_TEST void validateInstrOrdering() const; + LLVM_ABI void validateInstrOrdering() const; }; // Create wrappers for C Binding types (see CBindingWrapping.h). diff --git a/llvm/include/llvm/Option/ArgList.h b/llvm/include/llvm/Option/ArgList.h index cedc6ffb35921..5142a97884444 100644 --- a/llvm/include/llvm/Option/ArgList.h +++ b/llvm/include/llvm/Option/ArgList.h @@ -292,7 +292,7 @@ class ArgList { /// \return The name of the subcommand found. If no subcommand is found, /// this returns an empty StringRef. If multiple subcommands are found, the /// first one is returned. 
- LLVM_ABI_FOR_TEST StringRef getSubCommand( + LLVM_ABI StringRef getSubCommand( ArrayRef AllSubCommands, std::function)> HandleMultipleSubcommands, std::function)> HandleOtherPositionals) const; diff --git a/llvm/include/llvm/SandboxIR/Argument.h b/llvm/include/llvm/SandboxIR/Argument.h index 9cc38e600fb7b..d18a7e38be0f8 100644 --- a/llvm/include/llvm/SandboxIR/Argument.h +++ b/llvm/include/llvm/SandboxIR/Argument.h @@ -29,7 +29,7 @@ class Argument : public sandboxir::Value { assert(isa(Val) && "Expected Argument!"); } void printAsOperand(raw_ostream &OS) const; - LLVM_ABI_FOR_TEST void dumpOS(raw_ostream &OS) const final; + LLVM_ABI void dumpOS(raw_ostream &OS) const final; #endif }; diff --git a/llvm/include/llvm/SandboxIR/BasicBlock.h b/llvm/include/llvm/SandboxIR/BasicBlock.h index 97f7f6184f8de..cb162da00315c 100644 --- a/llvm/include/llvm/SandboxIR/BasicBlock.h +++ b/llvm/include/llvm/SandboxIR/BasicBlock.h @@ -104,7 +104,7 @@ class BasicBlock : public Value { #ifndef NDEBUG void verify() const final; - LLVM_ABI_FOR_TEST void dumpOS(raw_ostream &OS) const final; + LLVM_ABI void dumpOS(raw_ostream &OS) const final; #endif }; diff --git a/llvm/include/llvm/SandboxIR/Function.h b/llvm/include/llvm/SandboxIR/Function.h index 16da59e2eb7e6..42fe119705bc3 100644 --- a/llvm/include/llvm/SandboxIR/Function.h +++ b/llvm/include/llvm/SandboxIR/Function.h @@ -73,8 +73,8 @@ class Function : public GlobalWithNodeAPI(Val) && "Expected Function!"); } - LLVM_ABI_FOR_TEST void dumpNameAndArgs(raw_ostream &OS) const; - LLVM_ABI_FOR_TEST void dumpOS(raw_ostream &OS) const final; + LLVM_ABI void dumpNameAndArgs(raw_ostream &OS) const; + LLVM_ABI void dumpOS(raw_ostream &OS) const final; #endif }; diff --git a/llvm/include/llvm/SandboxIR/Instruction.h b/llvm/include/llvm/SandboxIR/Instruction.h index ab4c43b4731ed..cf4fc4e867667 100644 --- a/llvm/include/llvm/SandboxIR/Instruction.h +++ b/llvm/include/llvm/SandboxIR/Instruction.h @@ -1967,8 +1967,8 @@ class SwitchInst : public 
SingleLLVMInstructionImpl { public: CaseHandleImpl(Context &Ctx, LLVMCaseItT LLVMCaseIt) : Ctx(Ctx), LLVMCaseIt(LLVMCaseIt) {} - LLVM_ABI_FOR_TEST ConstT *getCaseValue() const; - LLVM_ABI_FOR_TEST BlockT *getCaseSuccessor() const; + LLVM_ABI ConstT *getCaseValue() const; + LLVM_ABI BlockT *getCaseSuccessor() const; unsigned getCaseIndex() const { const auto &LLVMCaseHandle = *LLVMCaseIt; return LLVMCaseHandle.getCaseIndex(); diff --git a/llvm/include/llvm/SandboxIR/Pass.h b/llvm/include/llvm/SandboxIR/Pass.h index d125517c1e29f..9699f8965f30e 100644 --- a/llvm/include/llvm/SandboxIR/Pass.h +++ b/llvm/include/llvm/SandboxIR/Pass.h @@ -65,7 +65,7 @@ class Pass { return OS; } virtual void print(raw_ostream &OS) const { OS << Name; } - LLVM_ABI_FOR_TEST LLVM_DUMP_METHOD virtual void dump() const; + LLVM_ABI LLVM_DUMP_METHOD virtual void dump() const; #endif /// Similar to print() but adds a newline. Used for testing. virtual void printPipeline(raw_ostream &OS) const { OS << Name << "\n"; } diff --git a/llvm/include/llvm/SandboxIR/Region.h b/llvm/include/llvm/SandboxIR/Region.h index afe486a7246d0..a8c7d80a5294c 100644 --- a/llvm/include/llvm/SandboxIR/Region.h +++ b/llvm/include/llvm/SandboxIR/Region.h @@ -189,10 +189,10 @@ class LLVM_ABI Region { #ifndef NDEBUG /// This is an expensive check, meant for testing. 
- LLVM_ABI_FOR_TEST bool operator==(const Region &Other) const; + LLVM_ABI bool operator==(const Region &Other) const; bool operator!=(const Region &other) const { return !(*this == other); } - LLVM_ABI_FOR_TEST void dump(raw_ostream &OS) const; + LLVM_ABI void dump(raw_ostream &OS) const; void dump() const; friend raw_ostream &operator<<(raw_ostream &OS, const Region &Rgn) { Rgn.dump(OS); diff --git a/llvm/include/llvm/SandboxIR/Tracker.h b/llvm/include/llvm/SandboxIR/Tracker.h index f74ff5d29d620..881d4cb9d8d64 100644 --- a/llvm/include/llvm/SandboxIR/Tracker.h +++ b/llvm/include/llvm/SandboxIR/Tracker.h @@ -115,10 +115,10 @@ class IRSnapshotChecker { /// Saves a snapshot of the current state. If there was any previous snapshot, /// it will be replaced with the new one. - LLVM_ABI_FOR_TEST void save(); + LLVM_ABI void save(); /// Checks current state against saved state, crashes if different. - LLVM_ABI_FOR_TEST void expectNoDiff(); + LLVM_ABI void expectNoDiff(); }; #endif // NDEBUG diff --git a/llvm/include/llvm/SandboxIR/Use.h b/llvm/include/llvm/SandboxIR/Use.h index 5b024b81e327c..418389e16a681 100644 --- a/llvm/include/llvm/SandboxIR/Use.h +++ b/llvm/include/llvm/SandboxIR/Use.h @@ -76,7 +76,7 @@ class Use { } bool operator!=(const Use &Other) const { return !(*this == Other); } #ifndef NDEBUG - LLVM_ABI_FOR_TEST void dumpOS(raw_ostream &OS) const; + LLVM_ABI void dumpOS(raw_ostream &OS) const; void dump() const; #endif // NDEBUG }; diff --git a/llvm/include/llvm/Support/Compiler.h b/llvm/include/llvm/Support/Compiler.h index f4bd894021097..35f92b2b51430 100644 --- a/llvm/include/llvm/Support/Compiler.h +++ b/llvm/include/llvm/Support/Compiler.h @@ -171,10 +171,12 @@ /// for both functions and classes. On windows its turned in to dllimport for /// library consumers, for other platforms its a default visibility attribute. /// -/// LLVM_ABI_FOR_TEST is for annotating symbols that are only exported because -/// they are imported from a test. 
These symbols are not technically part of the -/// LLVM public interface and could be conditionally excluded when not building -/// tests in the future. +/// LLVM_ABI_FOR_TEST is for annotating symbols that are exported from a +/// library-internal header solely so that unit tests can link against them. +/// Symbols in LLVM's public headers are part of the LLVM public interface and +/// should use LLVM_ABI. LLVM_ABI_FOR_TEST is reserved for internal headers, +/// whose symbols could be conditionally excluded when not building tests in the +/// future. /// #ifndef LLVM_ABI_GENERATING_ANNOTATIONS // Marker to add to classes or functions in public headers that should not have diff --git a/llvm/include/llvm/Support/GlobPattern.h b/llvm/include/llvm/Support/GlobPattern.h index ff9aa7dcb9e74..8c84c93834c6b 100644 --- a/llvm/include/llvm/Support/GlobPattern.h +++ b/llvm/include/llvm/Support/GlobPattern.h @@ -82,7 +82,7 @@ class GlobPattern { StringRef suffix() const { return Pattern.take_back(SuffixSize); } // Returns the longest plain substring of the pattern between prefix and // suffix. - LLVM_ABI_FOR_TEST StringRef longest_substr() const; + LLVM_ABI StringRef longest_substr() const; private: StringRef Pattern; diff --git a/llvm/include/llvm/Support/LSP/Logging.h b/llvm/include/llvm/Support/LSP/Logging.h index 47f9b06486f7e..a1fc8fb63de4d 100644 --- a/llvm/include/llvm/Support/LSP/Logging.h +++ b/llvm/include/llvm/Support/LSP/Logging.h @@ -24,7 +24,7 @@ class Logger { enum class Level { Debug, Info, Error }; /// Set the severity level of the logger. - LLVM_ABI_FOR_TEST static void setLogLevel(Level LogLevel); + LLVM_ABI static void setLogLevel(Level LogLevel); /// Initiate a log message at various severity levels. These should be called /// after a call to `initialize`. @@ -45,8 +45,8 @@ class Logger { static Logger &get(); /// Start a log message with the given severity level. 
- LLVM_ABI_FOR_TEST static void log(Level LogLevel, const char *Fmt, - const llvm::formatv_object_base &Message); + LLVM_ABI static void log(Level LogLevel, const char *Fmt, + const llvm::formatv_object_base &Message); /// The minimum logging level. Messages with lower level are ignored. Level LogLevel = Level::Error; diff --git a/llvm/include/llvm/Support/LSP/Protocol.h b/llvm/include/llvm/Support/LSP/Protocol.h index a50f4ac089ac4..351ccbf925d2f 100644 --- a/llvm/include/llvm/Support/LSP/Protocol.h +++ b/llvm/include/llvm/Support/LSP/Protocol.h @@ -81,7 +81,7 @@ class LSPError : public llvm::ErrorInfo { public: std::string message; ErrorCode code; - LLVM_ABI_FOR_TEST static char ID; + LLVM_ABI static char ID; LSPError(std::string message, ErrorCode code) : message(std::move(message)), code(code) {} @@ -146,9 +146,9 @@ class URIForFile { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST llvm::json::Value toJSON(const URIForFile &value); -LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - URIForFile &result, llvm::json::Path path); +LLVM_ABI llvm::json::Value toJSON(const URIForFile &value); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, URIForFile &result, + llvm::json::Path path); LLVM_ABI raw_ostream &operator<<(raw_ostream &os, const URIForFile &value); //===----------------------------------------------------------------------===// @@ -172,9 +172,8 @@ struct ClientCapabilities { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - ClientCapabilities &result, - llvm::json::Path path); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, + ClientCapabilities &result, llvm::json::Path path); //===----------------------------------------------------------------------===// // ClientInfo @@ -189,8 +188,8 @@ struct ClientInfo { }; /// Add support for JSON serialization. 
-LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - ClientInfo &result, llvm::json::Path path); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, ClientInfo &result, + llvm::json::Path path); //===----------------------------------------------------------------------===// // InitializeParams @@ -203,8 +202,8 @@ enum class TraceLevel { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - TraceLevel &result, llvm::json::Path path); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, TraceLevel &result, + llvm::json::Path path); struct InitializeParams { /// The capabilities provided by the client (editor or tool). @@ -225,9 +224,8 @@ struct InitializeParams { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - InitializeParams &result, - llvm::json::Path path); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, InitializeParams &result, + llvm::json::Path path); //===----------------------------------------------------------------------===// // InitializedParams @@ -258,9 +256,8 @@ struct TextDocumentItem { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - TextDocumentItem &result, - llvm::json::Path path); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, TextDocumentItem &result, + llvm::json::Path path); //===----------------------------------------------------------------------===// // TextDocumentIdentifier @@ -272,10 +269,9 @@ struct TextDocumentIdentifier { }; /// Add support for JSON serialization. 
-LLVM_ABI_FOR_TEST llvm::json::Value toJSON(const TextDocumentIdentifier &value); -LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - TextDocumentIdentifier &result, - llvm::json::Path path); +LLVM_ABI llvm::json::Value toJSON(const TextDocumentIdentifier &value); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, + TextDocumentIdentifier &result, llvm::json::Path path); //===----------------------------------------------------------------------===// // VersionedTextDocumentIdentifier @@ -289,11 +285,10 @@ struct VersionedTextDocumentIdentifier { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST llvm::json::Value -toJSON(const VersionedTextDocumentIdentifier &value); -LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - VersionedTextDocumentIdentifier &result, - llvm::json::Path path); +LLVM_ABI llvm::json::Value toJSON(const VersionedTextDocumentIdentifier &value); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, + VersionedTextDocumentIdentifier &result, + llvm::json::Path path); //===----------------------------------------------------------------------===// // Position @@ -341,9 +336,9 @@ struct Position { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - Position &result, llvm::json::Path path); -LLVM_ABI_FOR_TEST llvm::json::Value toJSON(const Position &value); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, Position &result, + llvm::json::Path path); +LLVM_ABI llvm::json::Value toJSON(const Position &value); LLVM_ABI raw_ostream &operator<<(raw_ostream &os, const Position &value); //===----------------------------------------------------------------------===// @@ -394,9 +389,9 @@ struct Range { }; /// Add support for JSON serialization. 
-LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, Range &result, - llvm::json::Path path); -LLVM_ABI_FOR_TEST llvm::json::Value toJSON(const Range &value); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, Range &result, + llvm::json::Path path); +LLVM_ABI llvm::json::Value toJSON(const Range &value); LLVM_ABI raw_ostream &operator<<(raw_ostream &os, const Range &value); //===----------------------------------------------------------------------===// @@ -429,9 +424,9 @@ struct Location { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - Location &result, llvm::json::Path path); -LLVM_ABI_FOR_TEST llvm::json::Value toJSON(const Location &value); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, Location &result, + llvm::json::Path path); +LLVM_ABI llvm::json::Value toJSON(const Location &value); LLVM_ABI raw_ostream &operator<<(raw_ostream &os, const Location &value); //===----------------------------------------------------------------------===// @@ -447,9 +442,9 @@ struct TextDocumentPositionParams { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - TextDocumentPositionParams &result, - llvm::json::Path path); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, + TextDocumentPositionParams &result, + llvm::json::Path path); //===----------------------------------------------------------------------===// // ReferenceParams @@ -461,17 +456,16 @@ struct ReferenceContext { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - ReferenceContext &result, - llvm::json::Path path); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, ReferenceContext &result, + llvm::json::Path path); struct ReferenceParams : TextDocumentPositionParams { ReferenceContext context; }; /// Add support for JSON serialization. 
-LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - ReferenceParams &result, llvm::json::Path path); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, ReferenceParams &result, + llvm::json::Path path); //===----------------------------------------------------------------------===// // DidOpenTextDocumentParams @@ -483,9 +477,9 @@ struct DidOpenTextDocumentParams { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - DidOpenTextDocumentParams &result, - llvm::json::Path path); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, + DidOpenTextDocumentParams &result, + llvm::json::Path path); //===----------------------------------------------------------------------===// // DidCloseTextDocumentParams @@ -497,9 +491,9 @@ struct DidCloseTextDocumentParams { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - DidCloseTextDocumentParams &result, - llvm::json::Path path); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, + DidCloseTextDocumentParams &result, + llvm::json::Path path); //===----------------------------------------------------------------------===// // DidSaveTextDocumentParams @@ -510,8 +504,8 @@ struct DidSaveTextDocumentParams { TextDocumentIdentifier textDocument; }; -LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &, - DidSaveTextDocumentParams &, llvm::json::Path); +LLVM_ABI bool fromJSON(const llvm::json::Value &, DidSaveTextDocumentParams &, + llvm::json::Path); //===----------------------------------------------------------------------===// // DidChangeTextDocumentParams @@ -536,9 +530,9 @@ struct TextDocumentContentChangeEvent { }; /// Add support for JSON serialization. 
-LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - TextDocumentContentChangeEvent &result, - llvm::json::Path path); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, + TextDocumentContentChangeEvent &result, + llvm::json::Path path); struct DidChangeTextDocumentParams { /// The document that changed. @@ -549,9 +543,9 @@ struct DidChangeTextDocumentParams { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - DidChangeTextDocumentParams &result, - llvm::json::Path path); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, + DidChangeTextDocumentParams &result, + llvm::json::Path path); //===----------------------------------------------------------------------===// // MarkupContent @@ -571,7 +565,7 @@ struct MarkupContent { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST llvm::json::Value toJSON(const MarkupContent &mc); +LLVM_ABI llvm::json::Value toJSON(const MarkupContent &mc); //===----------------------------------------------------------------------===// // Hover @@ -590,7 +584,7 @@ struct Hover { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST llvm::json::Value toJSON(const Hover &hover); +LLVM_ABI llvm::json::Value toJSON(const Hover &hover); //===----------------------------------------------------------------------===// // SymbolKind @@ -665,7 +659,7 @@ struct DocumentSymbol { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST llvm::json::Value toJSON(const DocumentSymbol &symbol); +LLVM_ABI llvm::json::Value toJSON(const DocumentSymbol &symbol); //===----------------------------------------------------------------------===// // DocumentSymbolParams @@ -677,9 +671,8 @@ struct DocumentSymbolParams { }; /// Add support for JSON serialization. 
-LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - DocumentSymbolParams &result, - llvm::json::Path path); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, + DocumentSymbolParams &result, llvm::json::Path path); //===----------------------------------------------------------------------===// // DiagnosticRelatedInformation @@ -700,11 +693,10 @@ struct DiagnosticRelatedInformation { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - DiagnosticRelatedInformation &result, - llvm::json::Path path); -LLVM_ABI_FOR_TEST llvm::json::Value -toJSON(const DiagnosticRelatedInformation &info); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, + DiagnosticRelatedInformation &result, + llvm::json::Path path); +LLVM_ABI llvm::json::Value toJSON(const DiagnosticRelatedInformation &info); //===----------------------------------------------------------------------===// // Diagnostic @@ -726,9 +718,9 @@ enum class DiagnosticTag { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST llvm::json::Value toJSON(DiagnosticTag tag); -LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - DiagnosticTag &result, llvm::json::Path path); +LLVM_ABI llvm::json::Value toJSON(DiagnosticTag tag); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, DiagnosticTag &result, + llvm::json::Path path); struct Diagnostic { /// The source range where the message applies. @@ -760,9 +752,9 @@ struct Diagnostic { }; /// Add support for JSON serialization. 
-LLVM_ABI_FOR_TEST llvm::json::Value toJSON(const Diagnostic &diag); -LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - Diagnostic &result, llvm::json::Path path); +LLVM_ABI llvm::json::Value toJSON(const Diagnostic &diag); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, Diagnostic &result, + llvm::json::Path path); //===----------------------------------------------------------------------===// // PublishDiagnosticsParams @@ -781,8 +773,7 @@ struct PublishDiagnosticsParams { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST llvm::json::Value -toJSON(const PublishDiagnosticsParams &params); +LLVM_ABI llvm::json::Value toJSON(const PublishDiagnosticsParams &params); //===----------------------------------------------------------------------===// // TextEdit @@ -802,9 +793,9 @@ inline bool operator==(const TextEdit &lhs, const TextEdit &rhs) { return std::tie(lhs.newText, lhs.range) == std::tie(rhs.newText, rhs.range); } -LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - TextEdit &result, llvm::json::Path path); -LLVM_ABI_FOR_TEST llvm::json::Value toJSON(const TextEdit &value); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, TextEdit &result, + llvm::json::Path path); +LLVM_ABI llvm::json::Value toJSON(const TextEdit &value); LLVM_ABI raw_ostream &operator<<(raw_ostream &os, const TextEdit &value); //===----------------------------------------------------------------------===// @@ -840,18 +831,16 @@ enum class CompletionItemKind { Operator = 24, TypeParameter = 25, }; -LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - CompletionItemKind &result, - llvm::json::Path path); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, + CompletionItemKind &result, llvm::json::Path path); constexpr auto kCompletionItemKindMin = static_cast<size_t>(CompletionItemKind::Text); constexpr auto kCompletionItemKindMax = static_cast<size_t>(CompletionItemKind::TypeParameter); using CompletionItemKindBitset = std::bitset<kCompletionItemKindMax + 1>; 
-LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - CompletionItemKindBitset &result, - llvm::json::Path path); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, + CompletionItemKindBitset &result, llvm::json::Path path); LLVM_ABI CompletionItemKind adjustKindToCapability(CompletionItemKind kind, @@ -934,7 +923,7 @@ struct CompletionItem { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST llvm::json::Value toJSON(const CompletionItem &value); +LLVM_ABI llvm::json::Value toJSON(const CompletionItem &value); LLVM_ABI raw_ostream &operator<<(raw_ostream &os, const CompletionItem &value); LLVM_ABI bool operator<(const CompletionItem &lhs, const CompletionItem &rhs); @@ -953,7 +942,7 @@ struct CompletionList { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST llvm::json::Value toJSON(const CompletionList &value); +LLVM_ABI llvm::json::Value toJSON(const CompletionList &value); //===----------------------------------------------------------------------===// // CompletionContext @@ -982,9 +971,8 @@ struct CompletionContext { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - CompletionContext &result, - llvm::json::Path path); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, + CompletionContext &result, llvm::json::Path path); //===----------------------------------------------------------------------===// // CompletionParams @@ -995,9 +983,8 @@ struct CompletionParams : TextDocumentPositionParams { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - CompletionParams &result, - llvm::json::Path path); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, CompletionParams &result, + llvm::json::Path path); //===----------------------------------------------------------------------===// // ParameterInformation @@ -1017,7 +1004,7 @@ struct ParameterInformation { }; /// Add support for JSON serialization. 
-LLVM_ABI_FOR_TEST llvm::json::Value toJSON(const ParameterInformation &value); +LLVM_ABI llvm::json::Value toJSON(const ParameterInformation &value); //===----------------------------------------------------------------------===// // SignatureInformation @@ -1036,7 +1023,7 @@ struct SignatureInformation { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST llvm::json::Value toJSON(const SignatureInformation &value); +LLVM_ABI llvm::json::Value toJSON(const SignatureInformation &value); LLVM_ABI raw_ostream &operator<<(raw_ostream &os, const SignatureInformation &value); @@ -1057,7 +1044,7 @@ struct SignatureHelp { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST llvm::json::Value toJSON(const SignatureHelp &value); +LLVM_ABI llvm::json::Value toJSON(const SignatureHelp &value); //===----------------------------------------------------------------------===// // DocumentLinkParams @@ -1070,9 +1057,8 @@ struct DocumentLinkParams { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - DocumentLinkParams &result, - llvm::json::Path path); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, + DocumentLinkParams &result, llvm::json::Path path); //===----------------------------------------------------------------------===// // DocumentLink @@ -1108,7 +1094,7 @@ struct DocumentLink { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST llvm::json::Value toJSON(const DocumentLink &value); +LLVM_ABI llvm::json::Value toJSON(const DocumentLink &value); //===----------------------------------------------------------------------===// // InlayHintsParams @@ -1124,9 +1110,8 @@ struct InlayHintsParams { }; /// Add support for JSON serialization. 
-LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - InlayHintsParams &result, - llvm::json::Path path); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, InlayHintsParams &result, + llvm::json::Path path); //===----------------------------------------------------------------------===// // InlayHintKind @@ -1186,7 +1171,7 @@ struct InlayHint { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST llvm::json::Value toJSON(const InlayHint &); +LLVM_ABI llvm::json::Value toJSON(const InlayHint &); LLVM_ABI bool operator==(const InlayHint &lhs, const InlayHint &rhs); LLVM_ABI bool operator<(const InlayHint &lhs, const InlayHint &rhs); LLVM_ABI llvm::raw_ostream &operator<<(llvm::raw_ostream &os, @@ -1213,9 +1198,8 @@ struct CodeActionContext { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - CodeActionContext &result, - llvm::json::Path path); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, + CodeActionContext &result, llvm::json::Path path); //===----------------------------------------------------------------------===// // CodeActionParams @@ -1233,9 +1217,8 @@ struct CodeActionParams { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - CodeActionParams &result, - llvm::json::Path path); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, CodeActionParams &result, + llvm::json::Path path); //===----------------------------------------------------------------------===// // WorkspaceEdit @@ -1250,9 +1233,9 @@ struct WorkspaceEdit { }; /// Add support for JSON serialization. 
-LLVM_ABI_FOR_TEST bool fromJSON(const llvm::json::Value &value, - WorkspaceEdit &result, llvm::json::Path path); -LLVM_ABI_FOR_TEST llvm::json::Value toJSON(const WorkspaceEdit &value); +LLVM_ABI bool fromJSON(const llvm::json::Value &value, WorkspaceEdit &result, + llvm::json::Path path); +LLVM_ABI llvm::json::Value toJSON(const WorkspaceEdit &value); //===----------------------------------------------------------------------===// // CodeAction @@ -1289,7 +1272,7 @@ struct CodeAction { }; /// Add support for JSON serialization. -LLVM_ABI_FOR_TEST llvm::json::Value toJSON(const CodeAction &); +LLVM_ABI llvm::json::Value toJSON(const CodeAction &); //===----------------------------------------------------------------------===// // ShowMessageParams diff --git a/llvm/include/llvm/Support/LSP/Transport.h b/llvm/include/llvm/Support/LSP/Transport.h index 5f8ea237e9654..6a0dd51d946bd 100644 --- a/llvm/include/llvm/Support/LSP/Transport.h +++ b/llvm/include/llvm/Support/LSP/Transport.h @@ -104,14 +104,14 @@ class JSONTransport { PrettyOutput(PrettyOutput) {} /// The following methods are used to send a message to the LSP client. - LLVM_ABI_FOR_TEST void notify(StringRef Method, llvm::json::Value Params); - LLVM_ABI_FOR_TEST void call(StringRef Method, llvm::json::Value Params, - llvm::json::Value Id); - LLVM_ABI_FOR_TEST void reply(llvm::json::Value Id, - llvm::Expected<llvm::json::Value> Result); + LLVM_ABI void notify(StringRef Method, llvm::json::Value Params); + LLVM_ABI void call(StringRef Method, llvm::json::Value Params, + llvm::json::Value Id); + LLVM_ABI void reply(llvm::json::Value Id, + llvm::Expected<llvm::json::Value> Result); /// Start executing the JSON-RPC transport. - LLVM_ABI_FOR_TEST llvm::Error run(MessageHandler &Handler); + LLVM_ABI llvm::Error run(MessageHandler &Handler); private: /// Dispatches the given incoming json message to the message handler. 
diff --git a/llvm/include/llvm/Support/VirtualOutputConfig.h b/llvm/include/llvm/Support/VirtualOutputConfig.h index 5c5ddd6fb628f..cc5a610545a78 100644 --- a/llvm/include/llvm/Support/VirtualOutputConfig.h +++ b/llvm/include/llvm/Support/VirtualOutputConfig.h @@ -36,7 +36,7 @@ struct EmptyBaseClass {}; /// configuration flag is either \c true or \c false. struct OutputConfig : detail::EmptyBaseClass { public: - LLVM_ABI_FOR_TEST void print(raw_ostream &OS) const; + LLVM_ABI void print(raw_ostream &OS) const; LLVM_ABI void dump() const; #define HANDLE_OUTPUT_CONFIG_FLAG(NAME, DEFAULT) \ @@ -61,7 +61,7 @@ struct OutputConfig : detail::EmptyBaseClass { /// Updates Text and CRLF flags based on \a sys::fs::OF_Text and \a /// sys::fs::OF_CRLF in \p Flags. Rejects CRLF without Text (calling /// \a setBinary()). - LLVM_ABI_FOR_TEST OutputConfig &setOpenFlags(const sys::fs::OpenFlags &Flags); + LLVM_ABI OutputConfig &setOpenFlags(const sys::fs::OpenFlags &Flags); constexpr OutputConfig() : EmptyBaseClass() @@ -86,8 +86,7 @@ struct OutputConfig : detail::EmptyBaseClass { } // namespace vfs -LLVM_ABI_FOR_TEST raw_ostream &operator<<(raw_ostream &OS, - vfs::OutputConfig Config); +LLVM_ABI raw_ostream &operator<<(raw_ostream &OS, vfs::OutputConfig Config); } // namespace llvm diff --git a/llvm/include/llvm/Transforms/Utils/DebugSSAUpdater.h b/llvm/include/llvm/Transforms/Utils/DebugSSAUpdater.h index a17432da29e5a..3046ea73bdd95 100644 --- a/llvm/include/llvm/Transforms/Utils/DebugSSAUpdater.h +++ b/llvm/include/llvm/Transforms/Utils/DebugSSAUpdater.h @@ -343,7 +343,7 @@ class DbgValueRangeTable { DenseMap OrigSingleLocVariableValueTable; public: - LLVM_ABI_FOR_TEST void addVariable(Function *F, DebugVariableAggregate DVA); + LLVM_ABI void addVariable(Function *F, DebugVariableAggregate DVA); bool hasVariableEntry(DebugVariableAggregate DVA) const { return OrigVariableValueRangeTable.contains(DVA) || OrigSingleLocVariableValueTable.contains(DVA); diff --git 
a/llvm/include/llvm/Transforms/Utils/SSAUpdaterBulk.h b/llvm/include/llvm/Transforms/Utils/SSAUpdaterBulk.h index a489daa8035ee..382aa5709542d 100644 --- a/llvm/include/llvm/Transforms/Utils/SSAUpdaterBulk.h +++ b/llvm/include/llvm/Transforms/Utils/SSAUpdaterBulk.h @@ -81,10 +81,10 @@ class SSAUpdaterBulk { /// Rewrite all uses and simplify the inserted PHI nodes. /// Use this method to preserve behavior when replacing SSAUpdater. - LLVM_ABI_FOR_TEST void RewriteAndOptimizeAllUses(DominatorTree &DT); + LLVM_ABI void RewriteAndOptimizeAllUses(DominatorTree &DT); }; -LLVM_ABI_FOR_TEST bool +LLVM_ABI bool EliminateNewDuplicatePHINodes(BasicBlock *BB, BasicBlock::phi_iterator FirstExistingPN); From 47b29c2eadbd9d7ddfb26bd6104a0a1f9e4a13a1 Mon Sep 17 00:00:00 2001 From: Faijul Amin Date: Fri, 19 Jun 2026 02:20:37 -0700 Subject: [PATCH 017/149] [SPIR-V] Legalize G_PHI of oversized vectors via fewer-elements (#203993) `G_PHI` on vectors wider than the SPIR-V max vector size previously failed legalization. This PR adds a `fewerElementsIf` rule that splits them down to `MaxVectorSize`, matching how other vector ops are handled in `SPIRVLegalizerInfo.cpp`. Added the following test `llvm/test/CodeGen/SPIRV/instructions/phi-large-vector.ll` covering spirv32 and spirv64. 
--- llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp | 6 +- .../instructions/phi-large-vector-shader.ll | 73 +++++++++++++++++++ .../SPIRV/instructions/phi-large-vector.ll | 44 +++++++++++ 3 files changed, 122 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/SPIRV/instructions/phi-large-vector-shader.ll create mode 100644 llvm/test/CodeGen/SPIRV/instructions/phi-large-vector.ll diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp index 512d1494a7b36..21fa782c4cc9e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp @@ -355,7 +355,11 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { getActionDefinitionsBuilder(G_PHI) .legalFor(allPtrsScalarsAndVectors) - .legalIf(extendedPtrsScalarsAndVectors); + .legalIf(extendedPtrsScalarsAndVectors) + .moreElementsToNextPow2(0) + .fewerElementsIf(vectorElementCountIsGreaterThan(0, MaxVectorSize), + LegalizeMutations::changeElementCountTo( + 0, ElementCount::getFixed(MaxVectorSize))); getActionDefinitionsBuilder(G_BITCAST).legalIf( all(typeInSet(0, allPtrsScalarsAndVectors), diff --git a/llvm/test/CodeGen/SPIRV/instructions/phi-large-vector-shader.ll b/llvm/test/CodeGen/SPIRV/instructions/phi-large-vector-shader.ll new file mode 100644 index 0000000000000..434cab6f4a88c --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/instructions/phi-large-vector-shader.ll @@ -0,0 +1,73 @@ +; RUN: llc -O0 -mtriple=spirv-unknown-vulkan %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan %s -o - -filetype=obj | spirv-val %} + +; In Shader execution models the SPIR-V max vector size is 4, so a G_PHI on +; a wider vector must be split into multiple PHIs of width 4. 
+ +; CHECK-DAG: %[[#I32:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#V4:]] = OpTypeVector %[[#I32]] 4 +; CHECK-COUNT-8: %[[#PHI:]] = OpPhi %[[#V4]] +; CHECK: OpCompositeExtract %[[#I32]] %[[#PHI]] + +@A = internal addrspace(10) global [8 x <4 x i32>] zeroinitializer +@Out = internal addrspace(10) global [8 x <4 x i32>] zeroinitializer +@Cond = internal addrspace(10) global i32 zeroinitializer + +define void @main() local_unnamed_addr #0 { +entry: + %c = load i32, ptr addrspace(10) @Cond + %cond = icmp ne i32 %c, 0 + %p0 = getelementptr [8 x <4 x i32>], ptr addrspace(10) @A, i32 0, i32 0 + %a0 = load <4 x i32>, ptr addrspace(10) %p0 + %p1 = getelementptr [8 x <4 x i32>], ptr addrspace(10) @A, i32 0, i32 1 + %a1 = load <4 x i32>, ptr addrspace(10) %p1 + %ab = shufflevector <4 x i32> %a0, <4 x i32> %a1, + <8 x i32> + %wide_a = shufflevector <8 x i32> %ab, <8 x i32> %ab, + <32 x i32> + %wide_b = shufflevector <8 x i32> %ab, <8 x i32> %ab, + <32 x i32> + br i1 %cond, label %then, label %else + +then: + br label %merge + +else: + br label %merge + +merge: + %p = phi <32 x i32> [ %wide_a, %then ], [ %wide_b, %else ] + %s0 = shufflevector <32 x i32> %p, <32 x i32> poison, <4 x i32> + %s1 = shufflevector <32 x i32> %p, <32 x i32> poison, <4 x i32> + %s2 = shufflevector <32 x i32> %p, <32 x i32> poison, <4 x i32> + %s3 = shufflevector <32 x i32> %p, <32 x i32> poison, <4 x i32> + %s4 = shufflevector <32 x i32> %p, <32 x i32> poison, <4 x i32> + %s5 = shufflevector <32 x i32> %p, <32 x i32> poison, <4 x i32> + %s6 = shufflevector <32 x i32> %p, <32 x i32> poison, <4 x i32> + %s7 = shufflevector <32 x i32> %p, <32 x i32> poison, <4 x i32> + %o0 = getelementptr [8 x <4 x i32>], ptr addrspace(10) @Out, i32 0, i32 0 + store <4 x i32> %s0, ptr addrspace(10) %o0 + %o1 = getelementptr [8 x <4 x i32>], ptr addrspace(10) @Out, i32 0, i32 1 + store <4 x i32> %s1, ptr addrspace(10) %o1 + %o2 = getelementptr [8 x <4 x i32>], ptr addrspace(10) @Out, i32 0, i32 2 + store <4 x i32> %s2, ptr 
addrspace(10) %o2 + %o3 = getelementptr [8 x <4 x i32>], ptr addrspace(10) @Out, i32 0, i32 3 + store <4 x i32> %s3, ptr addrspace(10) %o3 + %o4 = getelementptr [8 x <4 x i32>], ptr addrspace(10) @Out, i32 0, i32 4 + store <4 x i32> %s4, ptr addrspace(10) %o4 + %o5 = getelementptr [8 x <4 x i32>], ptr addrspace(10) @Out, i32 0, i32 5 + store <4 x i32> %s5, ptr addrspace(10) %o5 + %o6 = getelementptr [8 x <4 x i32>], ptr addrspace(10) @Out, i32 0, i32 6 + store <4 x i32> %s6, ptr addrspace(10) %o6 + %o7 = getelementptr [8 x <4 x i32>], ptr addrspace(10) @Out, i32 0, i32 7 + store <4 x i32> %s7, ptr addrspace(10) %o7 + ret void +} + +attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } diff --git a/llvm/test/CodeGen/SPIRV/instructions/phi-large-vector.ll b/llvm/test/CodeGen/SPIRV/instructions/phi-large-vector.ll new file mode 100644 index 0000000000000..43ab5b127ad54 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/instructions/phi-large-vector.ll @@ -0,0 +1,44 @@ +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; A G_PHI on a vector wider than the SPIR-V max (16) must be split into +; multiple PHIs of the largest legal width. 
+ +; CHECK-DAG: %[[#I32:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#V16:]] = OpTypeVector %[[#I32]] 16 +; CHECK: %[[#PHI_LO:]] = OpPhi %[[#V16]] +; CHECK: %[[#PHI_HI:]] = OpPhi %[[#V16]] +; CHECK: OpCompositeExtract %[[#I32]] %[[#PHI_HI]] +; CHECK: OpIAdd %[[#V16]] %[[#PHI_LO]] + +define spir_kernel void @phi_v32(ptr addrspace(1) %out, i1 %cond, + <16 x i32> %a, <16 x i32> %b) { +entry: + %wide_a = shufflevector <16 x i32> %a, <16 x i32> %b, + <32 x i32> + %wide_b = shufflevector <16 x i32> %b, <16 x i32> %a, + <32 x i32> + br i1 %cond, label %then, label %else + +then: + br label %merge + +else: + br label %merge + +merge: + %p = phi <32 x i32> [ %wide_a, %then ], [ %wide_b, %else ] + %sum = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %p) + store i32 %sum, ptr addrspace(1) %out, align 4 + ret void +} + +declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>) From 40cbc98bb17718eef5e0b1f47e3b58900aac354a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= Date: Fri, 19 Jun 2026 11:50:30 +0200 Subject: [PATCH 018/149] [AArch64][SDAG] Legalise nxv1 gather/scatter nodes (#204620) This updates WidenVecRes_MGATHER and WidenVecOp_MSCATTER to support scalable vector types. 
--- .../SelectionDAG/LegalizeVectorTypes.cpp | 20 +++--- .../AArch64/sve-masked-gather-64b-scaled.ll | 14 ++++ .../AArch64/sve-masked-gather-64b-unscaled.ll | 65 +++++++++++++++++++ .../AArch64/sve-masked-gather-legalize.ll | 32 +++++---- .../test/CodeGen/AArch64/sve-masked-gather.ll | 61 +++++++++++++++++ .../AArch64/sve-masked-scatter-64b-scaled.ll | 14 ++++ .../sve-masked-scatter-64b-unscaled.ll | 62 ++++++++++++++++++ .../CodeGen/AArch64/sve-masked-scatter.ll | 58 +++++++++++++++++ 8 files changed, 301 insertions(+), 25 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 382b0e2395da3..181cf7bd48b02 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -7021,27 +7021,25 @@ SDValue DAGTypeLegalizer::WidenVecRes_MGATHER(MaskedGatherSDNode *N) { EVT MaskVT = Mask.getValueType(); SDValue PassThru = GetWidenedVector(N->getPassThru()); SDValue Scale = N->getScale(); - unsigned NumElts = WideVT.getVectorNumElements(); + ElementCount WideEC = WideVT.getVectorElementCount(); SDLoc dl(N); // The mask should be widened as well EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), - MaskVT.getVectorElementType(), - WideVT.getVectorNumElements()); + MaskVT.getVectorElementType(), WideEC); Mask = ModifyToType(Mask, WideMaskVT, true); // Widen the Index operand SDValue Index = N->getIndex(); - EVT WideIndexVT = EVT::getVectorVT(*DAG.getContext(), - Index.getValueType().getScalarType(), - NumElts); + EVT WideIndexVT = EVT::getVectorVT( + *DAG.getContext(), Index.getValueType().getScalarType(), WideEC); Index = ModifyToType(Index, WideIndexVT); SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index, Scale }; // Widen the MemoryType EVT WideMemVT = EVT::getVectorVT(*DAG.getContext(), - N->getMemoryVT().getScalarType(), NumElts); + N->getMemoryVT().getScalarType(), WideEC); SDValue Res = 
DAG.getMaskedGather(DAG.getVTList(WideVT, MVT::Other), WideMemVT, dl, Ops, N->getMemOperand(), N->getIndexType(), N->getExtensionType()); @@ -8373,23 +8371,23 @@ SDValue DAGTypeLegalizer::WidenVecOp_MSCATTER(SDNode *N, unsigned OpNo) { if (OpNo == 1) { DataOp = GetWidenedVector(DataOp); - unsigned NumElts = DataOp.getValueType().getVectorNumElements(); + ElementCount WideEC = DataOp.getValueType().getVectorElementCount(); // Widen index. EVT IndexVT = Index.getValueType(); EVT WideIndexVT = EVT::getVectorVT(*DAG.getContext(), - IndexVT.getVectorElementType(), NumElts); + IndexVT.getVectorElementType(), WideEC); Index = ModifyToType(Index, WideIndexVT); // The mask should be widened as well. EVT MaskVT = Mask.getValueType(); EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), - MaskVT.getVectorElementType(), NumElts); + MaskVT.getVectorElementType(), WideEC); Mask = ModifyToType(Mask, WideMaskVT, true); // Widen the MemoryType WideMemVT = EVT::getVectorVT(*DAG.getContext(), - MSC->getMemoryVT().getScalarType(), NumElts); + MSC->getMemoryVT().getScalarType(), WideEC); } else if (OpNo == 4) { // Just widen the index. It's allowed to have extra elements. 
Index = GetWidenedVector(Index); diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll index 624541766c957..42838d4ee6d65 100644 --- a/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll +++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll @@ -96,6 +96,20 @@ define @masked_sgather_nxv2i32(ptr %base, ret %vals.sext } +define @masked_gather_nxv1i64(ptr %base, %wide.offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uzp1 p0.d, p0.d, p1.d +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, lsl #3] +; CHECK-NEXT: ret + %offsets = call @llvm.vector.extract.nxv1i64.nxv2i64( %wide.offsets, i64 0) + %ptrs = getelementptr i64, ptr %base, %offsets + %r = call @llvm.masked.gather.nxv1i64( align 8 %ptrs, %mask, poison) + %r.legal = call @llvm.vector.insert.nxv2i64.nxv1i64( poison, %r, i64 0) + ret %r.legal +} + declare @llvm.masked.gather.nxv2i16(, i32, , ) declare @llvm.masked.gather.nxv2i32(, i32, , ) declare @llvm.masked.gather.nxv2i64(, i32, , ) diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll index 0ed4dd1e4136e..03c2194e91a2b 100644 --- a/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll +++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll @@ -127,6 +127,71 @@ define @masked_sgather_nxv2i32(ptr %base, ret %vals.sext } +define @masked_gather_nxv1i8(ptr %base, %wide.offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv1i8: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uzp1 p0.d, p0.d, p1.d +; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z1.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: ret + %offsets = call @llvm.vector.extract.nxv1i64.nxv2i64( 
%wide.offsets, i64 0) + %ptrs = getelementptr i8, ptr %base, %offsets + %r = call @llvm.masked.gather.nxv1i8( align 1 %ptrs, %mask, poison) + %r.legal = call @llvm.vector.insert.nxv16i8.nxv1i8( poison, %r, i64 0) + ret %r.legal +} + +define @masked_gather_nxv1i16(ptr %base, %wide.offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uzp1 p0.d, p0.d, p1.d +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z1.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %offsets = call @llvm.vector.extract.nxv1i64.nxv2i64( %wide.offsets, i64 0) + %ptrs = getelementptr i8, ptr %base, %offsets + %r = call @llvm.masked.gather.nxv1i16( align 2 %ptrs, %mask, poison) + %r.legal = call @llvm.vector.insert.nxv8i16.nxv1i16( poison, %r, i64 0) + ret %r.legal +} + +define @masked_gather_nxv1i32(ptr %base, %wide.offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uzp1 p0.d, p0.d, p1.d +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: ret + %offsets = call @llvm.vector.extract.nxv1i64.nxv2i64( %wide.offsets, i64 0) + %ptrs = getelementptr i8, ptr %base, %offsets + %r = call @llvm.masked.gather.nxv1i32( align 4 %ptrs, %mask, poison) + %r.legal = call @llvm.vector.insert.nxv4i32.nxv1i32( poison, %r, i64 0) + ret %r.legal +} + +define @masked_gather_nxv1i64(ptr %base, %wide.offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uzp1 p0.d, p0.d, p1.d +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d] +; CHECK-NEXT: ret + %offsets = call @llvm.vector.extract.nxv1i64.nxv2i64( %wide.offsets, i64 0) + %ptrs = getelementptr i8, ptr %base, %offsets + %r = call @llvm.masked.gather.nxv1i64( align 8 %ptrs, %mask, poison) + %r.legal = call @llvm.vector.insert.nxv2i64.nxv1i64( poison, %r, i64 0) 
+ ret %r.legal +} + declare @llvm.masked.gather.nxv2i8(, i32, , ) declare @llvm.masked.gather.nxv2i16(, i32, , ) declare @llvm.masked.gather.nxv2i32(, i32, , ) diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll index adc95e9515e2e..d40aa5bc7f31f 100644 --- a/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll +++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll @@ -201,6 +201,24 @@ define @masked_gather_nxv16i8(ptr %base, % ret %data } +; Similar as above but only a fourth of the mask is defined and the other lanes are "false". +; Expect a single ld1b. +define @masked_gather_nxv16i8_undef_hi_mask(ptr %base, %indices, %mask) #0 { +; CHECK-LABEL: masked_gather_nxv16i8_undef_hi_mask: +; CHECK: // %bb.0: +; CHECK-NEXT: sunpklo z0.h, z0.b +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, z0.s, sxtw] +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z1.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: ret + %ptrs = getelementptr i8, ptr %base, %indices + %mask.false.hi = call @llvm.vector.insert.nxv16i1.nxv4i1( splat (i1 false), %mask, i64 0) + %data = call @llvm.masked.gather.nxv16i8( align 1 %ptrs, %mask.false.hi, poison) + ret %data +} + ; Code generate the worst case scenario when all vector types are illegal. 
define @masked_gather_nxv32i32(ptr %base, %indices, %mask) #0 { ; CHECK-LABEL: masked_gather_nxv32i32: @@ -252,17 +270,3 @@ define @masked_sgather_nxv4i8( %ptrs, @llvm.masked.gather.nxv2i8(, i32, , ) -declare @llvm.masked.gather.nxv2i16(, i32, , ) -declare @llvm.masked.gather.nxv2i32(, i32, , ) -declare @llvm.masked.gather.nxv4i8(, i32, , ) -declare @llvm.masked.gather.nxv16i8(, i32, , ) -declare @llvm.masked.gather.nxv32i32(, i32, , ) - -declare @llvm.masked.gather.nxv4f16(, i32, , ) -declare @llvm.masked.gather.nxv8f16(, i32, , ) -declare @llvm.masked.gather.nxv8bf16(, i32, , ) -declare @llvm.masked.gather.nxv2f32(, i32, , ) -declare @llvm.masked.gather.nxv8f32(, i32, , ) -declare @llvm.masked.gather.nxv4f64(, i32, , ) diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather.ll index 5a7865e92415f..8ae74188ec939 100644 --- a/llvm/test/CodeGen/AArch64/sve-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-masked-gather.ll @@ -96,6 +96,67 @@ define @masked_sgather_nxv2i16( %ptrs, %vals.sext } +define @masked_gather_nxv1i8( %wide.ptrs, %mask) { +; CHECK-LABEL: masked_gather_nxv1i8: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uzp1 p0.d, p0.d, p1.d +; CHECK-NEXT: ld1b { z0.d }, p0/z, [z0.d] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z1.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: ret + %ptrs = call @llvm.vector.extract.nxv1p0.nxv2p0( %wide.ptrs, i64 0) + %r = call @llvm.masked.gather.nxv1i8( align 1 %ptrs, %mask, poison) + %r.legal = call @llvm.vector.insert.nxv16i8.nxv1i8( poison, %r, i64 0) + ret %r.legal +} + +define @masked_gather_nxv1i16( %wide.ptrs, %mask) { +; CHECK-LABEL: masked_gather_nxv1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uzp1 p0.d, p0.d, p1.d +; CHECK-NEXT: ld1h { z0.d }, p0/z, [z0.d] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z1.s, 
z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %ptrs = call @llvm.vector.extract.nxv1p0.nxv2p0( %wide.ptrs, i64 0) + %r = call @llvm.masked.gather.nxv1i16( align 2 %ptrs, %mask, poison) + %r.legal = call @llvm.vector.insert.nxv8i16.nxv1i16( poison, %r, i64 0) + ret %r.legal +} + +define @masked_gather_nxv1i32( %wide.ptrs, %mask) { +; CHECK-LABEL: masked_gather_nxv1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uzp1 p0.d, p0.d, p1.d +; CHECK-NEXT: ld1w { z0.d }, p0/z, [z0.d] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: ret + %ptrs = call @llvm.vector.extract.nxv1p0.nxv2p0( %wide.ptrs, i64 0) + %r = call @llvm.masked.gather.nxv1i32( align 4 %ptrs, %mask, poison) + %r.legal = call @llvm.vector.insert.nxv4i32.nxv1i32( poison, %r, i64 0) + ret %r.legal +} + +define @masked_gather_nxv1i64( %wide.ptrs, %mask) { +; CHECK-LABEL: masked_gather_nxv1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uzp1 p0.d, p0.d, p1.d +; CHECK-NEXT: ld1d { z0.d }, p0/z, [z0.d] +; CHECK-NEXT: ret + %ptrs = call @llvm.vector.extract.nxv1p0.nxv2p0( %wide.ptrs, i64 0) + %r = call @llvm.masked.gather.nxv1i64( align 8 %ptrs, %mask, poison) + %r.legal = call @llvm.vector.insert.nxv2i64.nxv1i64( poison, %r, i64 0) + ret %r.legal +} + define @masked_sgather_nxv2i32( %ptrs, %mask) { ; CHECK-LABEL: masked_sgather_nxv2i32: ; CHECK: // %bb.0: diff --git a/llvm/test/CodeGen/AArch64/sve-masked-scatter-64b-scaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-scatter-64b-scaled.ll index 9244f2c81e799..beae9876c2636 100644 --- a/llvm/test/CodeGen/AArch64/sve-masked-scatter-64b-scaled.ll +++ b/llvm/test/CodeGen/AArch64/sve-masked-scatter-64b-scaled.ll @@ -65,6 +65,20 @@ define void @masked_scatter_nxv2f64( %data, ptr %base, %data.wide, ptr %base, %wide.offsets, %mask) { +; CHECK-LABEL: masked_scatter_nxv1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uzp1 p0.d, p0.d, p1.d +; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, lsl #3] +; 
CHECK-NEXT: ret + %offsets = call @llvm.vector.extract.nxv1i64.nxv2i64( %wide.offsets, i64 0) + %ptrs = getelementptr i64, ptr %base, %offsets + %data = call @llvm.vector.extract.nxv1i64.nxv2i64( %data.wide, i64 0) + call void @llvm.masked.scatter.nxv1i64( %data, align 8 %ptrs, %mask) + ret void +} + declare void @llvm.masked.scatter.nxv2i16(, , i32, ) declare void @llvm.masked.scatter.nxv2i32(, , i32, ) declare void @llvm.masked.scatter.nxv2i64(, , i32, ) diff --git a/llvm/test/CodeGen/AArch64/sve-masked-scatter-64b-unscaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-scatter-64b-unscaled.ll index 67acf8618809b..536ddabd5d43b 100644 --- a/llvm/test/CodeGen/AArch64/sve-masked-scatter-64b-unscaled.ll +++ b/llvm/test/CodeGen/AArch64/sve-masked-scatter-64b-unscaled.ll @@ -93,6 +93,68 @@ define void @masked_scatter_nxv2f64_unscaled_64bit_offsets( ret void } +define void @masked_scatter_nxv1i8_unscaled_64bit_offsets( %data.wide, ptr %base, %wide.offsets, %mask) { +; CHECK-LABEL: masked_scatter_nxv1i8_unscaled_64bit_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uzp1 p0.d, p0.d, p1.d +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: st1b { z0.d }, p0, [x0, z1.d] +; CHECK-NEXT: ret + %offsets = call @llvm.vector.extract.nxv1i64.nxv2i64( %wide.offsets, i64 0) + %ptrs = getelementptr i8, ptr %base, %offsets + %data = call @llvm.vector.extract.nxv1i8.nxv16i8( %data.wide, i64 0) + call void @llvm.masked.scatter.nxv1i8( %data, align 1 %ptrs, %mask) + ret void +} + +define void @masked_scatter_nxv1i16_unscaled_64bit_offsets( %data.wide, ptr %base, %wide.offsets, %mask) { +; CHECK-LABEL: masked_scatter_nxv1i16_unscaled_64bit_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uzp1 p0.d, p0.d, p1.d +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d] +; CHECK-NEXT: ret + %offsets = call 
@llvm.vector.extract.nxv1i64.nxv2i64( %wide.offsets, i64 0) + %ptrs = getelementptr i8, ptr %base, %offsets + %data = call @llvm.vector.extract.nxv1i16.nxv8i16( %data.wide, i64 0) + call void @llvm.masked.scatter.nxv1i16( %data, align 2 %ptrs, %mask) + ret void +} + +define void @masked_scatter_nxv1i32_unscaled_64bit_offsets( %data.wide, ptr %base, %wide.offsets, %mask) { +; CHECK-LABEL: masked_scatter_nxv1i32_unscaled_64bit_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: uzp1 p0.d, p0.d, p1.d +; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d] +; CHECK-NEXT: ret + %offsets = call @llvm.vector.extract.nxv1i64.nxv2i64( %wide.offsets, i64 0) + %ptrs = getelementptr i8, ptr %base, %offsets + %data = call @llvm.vector.extract.nxv1i32.nxv4i32( %data.wide, i64 0) + call void @llvm.masked.scatter.nxv1i32( %data, align 4 %ptrs, %mask) + ret void +} + +define void @masked_scatter_nxv1i64_unscaled_64bit_offsets( %data.wide, ptr %base, %wide.offsets, %mask) { +; CHECK-LABEL: masked_scatter_nxv1i64_unscaled_64bit_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uzp1 p0.d, p0.d, p1.d +; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d] +; CHECK-NEXT: ret + %offsets = call @llvm.vector.extract.nxv1i64.nxv2i64( %wide.offsets, i64 0) + %ptrs = getelementptr i8, ptr %base, %offsets + %data = call @llvm.vector.extract.nxv1i64.nxv2i64( %data.wide, i64 0) + call void @llvm.masked.scatter.nxv1i64( %data, align 8 %ptrs, %mask) + ret void +} + declare void @llvm.masked.scatter.nxv2f16(, , i32, ) declare void @llvm.masked.scatter.nxv4f16(, , i32, ) declare void @llvm.masked.scatter.nxv2bf16(, , i32, ) diff --git a/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll index 8f2cbbdb55636..483953c2b23d4 100644 --- a/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll @@ -73,6 +73,64 @@ define void @masked_scatter_nxv2f64( %data, 
%data.wide, %wide.ptrs, %mask) { +; CHECK-LABEL: masked_scatter_nxv1i8: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uzp1 p0.d, p0.d, p1.d +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: st1b { z0.d }, p0, [z1.d] +; CHECK-NEXT: ret + %ptrs = call @llvm.vector.extract.nxv1p0.nxv2p0( %wide.ptrs, i64 0) + %data = call @llvm.vector.extract.nxv1i8.nxv16i8( %data.wide, i64 0) + call void @llvm.masked.scatter.nxv1i8( %data, align 1 %ptrs, %mask) + ret void +} + +define void @masked_scatter_nxv1i16( %data.wide, %wide.ptrs, %mask) { +; CHECK-LABEL: masked_scatter_nxv1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uzp1 p0.d, p0.d, p1.d +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] +; CHECK-NEXT: ret + %ptrs = call @llvm.vector.extract.nxv1p0.nxv2p0( %wide.ptrs, i64 0) + %data = call @llvm.vector.extract.nxv1i16.nxv8i16( %data.wide, i64 0) + call void @llvm.masked.scatter.nxv1i16( %data, align 2 %ptrs, %mask) + ret void +} + +define void @masked_scatter_nxv1i32( %data.wide, %wide.ptrs, %mask) { +; CHECK-LABEL: masked_scatter_nxv1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: uzp1 p0.d, p0.d, p1.d +; CHECK-NEXT: st1w { z0.d }, p0, [z1.d] +; CHECK-NEXT: ret + %ptrs = call @llvm.vector.extract.nxv1p0.nxv2p0( %wide.ptrs, i64 0) + %data = call @llvm.vector.extract.nxv1i32.nxv4i32( %data.wide, i64 0) + call void @llvm.masked.scatter.nxv1i32( %data, align 4 %ptrs, %mask) + ret void +} + +define void @masked_scatter_nxv1i64( %data.wide, %wide.ptrs, %mask) { +; CHECK-LABEL: masked_scatter_nxv1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uzp1 p0.d, p0.d, p1.d +; CHECK-NEXT: st1d { z0.d }, p0, [z1.d] +; CHECK-NEXT: ret + %ptrs = call @llvm.vector.extract.nxv1p0.nxv2p0( %wide.ptrs, i64 0) + %data = call @llvm.vector.extract.nxv1i64.nxv2i64( 
%data.wide, i64 0) + call void @llvm.masked.scatter.nxv1i64( %data, align 8 %ptrs, %mask) + ret void +} + define void @masked_scatter_splat_constant_pointer ( %pg) { ; CHECK-LABEL: masked_scatter_splat_constant_pointer: ; CHECK: // %bb.0: // %vector.body From a5e83b9a9b16d770792f7cd02ebe30bcfd95cbfe Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Fri, 19 Jun 2026 11:09:35 +0100 Subject: [PATCH 019/149] [Clang][NEON ACLE] Remove +bf16 requirement from opaque bfloat builtins. (#204201) Builtins that only care about the size of the element type but not its format (e.g loads, stores and shuffles) do not require any special instructions to code generate beyond those already available to +neon. Fixes https://github.com/llvm/llvm-project/issues/203159 --- clang/include/clang/Basic/arm_neon.td | 34 +++++------ clang/lib/CodeGen/TargetBuiltins/ARM.cpp | 56 ------------------- .../CodeGen/AArch64/bf16-getset-intrinsics.c | 2 +- .../CodeGen/AArch64/bf16-lane-intrinsics.c | 4 +- .../CodeGen/AArch64/bf16-ldst-intrinsics.c | 4 +- .../AArch64/bf16-reinterpret-intrinsics.c | 2 +- clang/test/CodeGen/AArch64/neon-luti.c | 4 +- .../CodeGen/arm-bf16-reinterpret-intrinsics.c | 2 +- .../Sema/aarch64-neon-immediate-ranges/luti.c | 2 +- clang/test/Sema/aarch64-neon-target.c | 3 - .../aarch64-neon-without-target-feature.cpp | 4 +- clang/test/Sema/arm-neon-target.c | 3 - 12 files changed, 30 insertions(+), 90 deletions(-) diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td index 3bf140ff953b9..24689a6d7a0cb 100644 --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td @@ -285,7 +285,7 @@ def SPLATQ : WInst<"splat_laneq", ".(!Q)I", "UcUsUicsilPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlhdQhQdPlQPlmQm", [ImmCheck<1, ImmCheckLaneIndex, 0>]>; -let TargetGuard = "bf16,neon" in { +let TargetGuard = "neon" in { def SPLAT_BF : WInst<"splat_lane", ".(!q)I", "bQb", [ImmCheck<1, ImmCheckLaneIndex, 0>]>; def SPLATQ_BF : 
WInst<"splat_laneq", ".(!Q)I", "bQb", @@ -2024,8 +2024,7 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v def VCMLAQ_ROT270_FP64 : SInst<"vcmlaq_rot270", "QQQQ", "d">; } -// V8.2-A BFloat intrinsics -let TargetGuard = "bf16,neon" in { +let TargetGuard = "neon" in { def VCREATE_BF : NoTestOpInst<"vcreate", ".(IU>)", "b", OP_CAST> { let BigEndianSafe = 1; } @@ -2088,7 +2087,11 @@ let TargetGuard = "bf16,neon" in { def VLD2_DUP_BF : WInst<"vld2_dup", "2(c*!)", "bQb">; def VLD3_DUP_BF : WInst<"vld3_dup", "3(c*!)", "bQb">; def VLD4_DUP_BF : WInst<"vld4_dup", "4(c*!)", "bQb">; +} + +// V8.2-A BFloat intrinsics +let TargetGuard = "bf16,neon" in { def VCVT_F32_BF16 : SOpInst<"vcvt_f32_bf16", "(F>)(Bq!)", "Qb", OP_VCVT_F32_BF16>; def VCVT_LOW_F32_BF16 : SOpInst<"vcvt_low_f32", "(F>)(BQ!)", "Qb", OP_VCVT_F32_BF16_LO>; def VCVT_HIGH_F32_BF16 : SOpInst<"vcvt_high_f32", "(F>)(BQ!)", "Qb", OP_VCVT_F32_BF16_HI>; @@ -2108,21 +2111,23 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "b def VCVT_LOW_BF16_F32_A64 : SInst<"vcvt_low_bf16", "BQ", "Qf">; def VCVT_HIGH_BF16_F32_A64 : SInst<"vcvt_high_bf16", "BBQ", "Qf">; def VCVT_BF16_F32 : SInst<"vcvt_bf16", "BQ", "f">; +} +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "neon" in { def COPY_LANE_BF16 : IOpInst<"vcopy_lane", "..I.I", "b", OP_COPY_LN>; def COPYQ_LANE_BF16 : IOpInst<"vcopy_lane", "..IqI", "Qb", OP_COPY_LN>; def COPY_LANEQ_BF16 : IOpInst<"vcopy_laneq", "..IQI", "b", OP_COPY_LN>; def COPYQ_LANEQ_BF16 : IOpInst<"vcopy_laneq", "..I.I", "Qb", OP_COPY_LN>; } -let ArchGuard = "!defined(__aarch64__) && !defined(__arm64ec__)", TargetGuard = "bf16,neon" in { +let ArchGuard = "!defined(__aarch64__) && !defined(__arm64ec__)", TargetGuard = "neon" in { let BigEndianSafe = 1 in { defm VREINTERPRET_BF : REINTERPRET_CROSS_TYPES< "csilUcUsUiUlhfPcPsPlQcQsQiQlQUcQUsQUiQUlQhQfQPcQPsQPl", "bQb">; } } -let ArchGuard = "defined(__aarch64__) || 
defined(__arm64ec__)", TargetGuard = "bf16,neon" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "neon" in { let BigEndianSafe = 1 in { defm VVREINTERPRET_BF : REINTERPRET_CROSS_TYPES< "csilUcUsUiUlhfdPcPsPlQcQsQiQlQUcQUsQUiQUlQhQfQdQPcQPsQPlQPk", "bQb">; @@ -2155,17 +2160,14 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "lut" in { [ImmCheck<3, ImmCheck0_1>]>; def VLUTI4_H_X2_Q : SInst<"vluti4_laneq_x2", ".2(]>; - - let TargetGuard = "lut,bf16" in { - def VLUTI2_BF : SInst<"vluti2_lane", "Q.(]>; - def VLUTI2_BF_Q : SInst<"vluti2_laneq", "Q.(]>; - def VLUTI4_BF_X2 : SInst<"vluti4_lane_x2", ".2(]>; - def VLUTI4_BF_X2_Q : SInst<"vluti4_laneq_x2", ".2(]>; - } + def VLUTI2_BF : SInst<"vluti2_lane", "Q.(]>; + def VLUTI2_BF_Q : SInst<"vluti2_laneq", "Q.(]>; + def VLUTI4_BF_X2 : SInst<"vluti4_lane_x2", ".2(]>; + def VLUTI4_BF_X2_Q : SInst<"vluti4_laneq_x2", ".2(]>; } let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in { diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp index dd355821fe5ff..aa32bc2a1d5a7 100644 --- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp @@ -858,10 +858,6 @@ static const ARMNeonVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = { // Some intrinsics are equivalent for codegen. 
static const std::pair NEONEquivalentIntrinsicMap[] = { - { NEON::BI__builtin_neon_splat_lane_bf16, NEON::BI__builtin_neon_splat_lane_v, }, - { NEON::BI__builtin_neon_splat_laneq_bf16, NEON::BI__builtin_neon_splat_laneq_v, }, - { NEON::BI__builtin_neon_splatq_lane_bf16, NEON::BI__builtin_neon_splatq_lane_v, }, - { NEON::BI__builtin_neon_splatq_laneq_bf16, NEON::BI__builtin_neon_splatq_laneq_v, }, { NEON::BI__builtin_neon_vabd_f16, NEON::BI__builtin_neon_vabd_v, }, { NEON::BI__builtin_neon_vabdq_f16, NEON::BI__builtin_neon_vabdq_v, }, { NEON::BI__builtin_neon_vabs_f16, NEON::BI__builtin_neon_vabs_v, }, @@ -890,36 +886,6 @@ static const std::pair NEONEquivalentIntrinsicMap[] = { { NEON::BI__builtin_neon_vfmaq_f16, NEON::BI__builtin_neon_vfmaq_v, }, { NEON::BI__builtin_neon_vfmaq_lane_f16, NEON::BI__builtin_neon_vfmaq_lane_v, }, { NEON::BI__builtin_neon_vfmaq_laneq_f16, NEON::BI__builtin_neon_vfmaq_laneq_v, }, - { NEON::BI__builtin_neon_vld1_bf16_x2, NEON::BI__builtin_neon_vld1_x2_v }, - { NEON::BI__builtin_neon_vld1_bf16_x3, NEON::BI__builtin_neon_vld1_x3_v }, - { NEON::BI__builtin_neon_vld1_bf16_x4, NEON::BI__builtin_neon_vld1_x4_v }, - { NEON::BI__builtin_neon_vld1_bf16, NEON::BI__builtin_neon_vld1_v }, - { NEON::BI__builtin_neon_vld1_dup_bf16, NEON::BI__builtin_neon_vld1_dup_v }, - { NEON::BI__builtin_neon_vld1_lane_bf16, NEON::BI__builtin_neon_vld1_lane_v }, - { NEON::BI__builtin_neon_vld1q_bf16_x2, NEON::BI__builtin_neon_vld1q_x2_v }, - { NEON::BI__builtin_neon_vld1q_bf16_x3, NEON::BI__builtin_neon_vld1q_x3_v }, - { NEON::BI__builtin_neon_vld1q_bf16_x4, NEON::BI__builtin_neon_vld1q_x4_v }, - { NEON::BI__builtin_neon_vld1q_bf16, NEON::BI__builtin_neon_vld1q_v }, - { NEON::BI__builtin_neon_vld1q_dup_bf16, NEON::BI__builtin_neon_vld1q_dup_v }, - { NEON::BI__builtin_neon_vld1q_lane_bf16, NEON::BI__builtin_neon_vld1q_lane_v }, - { NEON::BI__builtin_neon_vld2_bf16, NEON::BI__builtin_neon_vld2_v }, - { NEON::BI__builtin_neon_vld2_dup_bf16, 
NEON::BI__builtin_neon_vld2_dup_v }, - { NEON::BI__builtin_neon_vld2_lane_bf16, NEON::BI__builtin_neon_vld2_lane_v }, - { NEON::BI__builtin_neon_vld2q_bf16, NEON::BI__builtin_neon_vld2q_v }, - { NEON::BI__builtin_neon_vld2q_dup_bf16, NEON::BI__builtin_neon_vld2q_dup_v }, - { NEON::BI__builtin_neon_vld2q_lane_bf16, NEON::BI__builtin_neon_vld2q_lane_v }, - { NEON::BI__builtin_neon_vld3_bf16, NEON::BI__builtin_neon_vld3_v }, - { NEON::BI__builtin_neon_vld3_dup_bf16, NEON::BI__builtin_neon_vld3_dup_v }, - { NEON::BI__builtin_neon_vld3_lane_bf16, NEON::BI__builtin_neon_vld3_lane_v }, - { NEON::BI__builtin_neon_vld3q_bf16, NEON::BI__builtin_neon_vld3q_v }, - { NEON::BI__builtin_neon_vld3q_dup_bf16, NEON::BI__builtin_neon_vld3q_dup_v }, - { NEON::BI__builtin_neon_vld3q_lane_bf16, NEON::BI__builtin_neon_vld3q_lane_v }, - { NEON::BI__builtin_neon_vld4_bf16, NEON::BI__builtin_neon_vld4_v }, - { NEON::BI__builtin_neon_vld4_dup_bf16, NEON::BI__builtin_neon_vld4_dup_v }, - { NEON::BI__builtin_neon_vld4_lane_bf16, NEON::BI__builtin_neon_vld4_lane_v }, - { NEON::BI__builtin_neon_vld4q_bf16, NEON::BI__builtin_neon_vld4q_v }, - { NEON::BI__builtin_neon_vld4q_dup_bf16, NEON::BI__builtin_neon_vld4q_dup_v }, - { NEON::BI__builtin_neon_vld4q_lane_bf16, NEON::BI__builtin_neon_vld4q_lane_v }, { NEON::BI__builtin_neon_vmax_f16, NEON::BI__builtin_neon_vmax_v, }, { NEON::BI__builtin_neon_vmaxnm_f16, NEON::BI__builtin_neon_vmaxnm_v, }, { NEON::BI__builtin_neon_vmaxnmq_f16, NEON::BI__builtin_neon_vmaxnmq_v, }, @@ -964,28 +930,6 @@ static const std::pair NEONEquivalentIntrinsicMap[] = { { NEON::BI__builtin_neon_vrsqrtsq_f16, NEON::BI__builtin_neon_vrsqrtsq_v, }, { NEON::BI__builtin_neon_vsqrt_f16, NEON::BI__builtin_neon_vsqrt_v, }, { NEON::BI__builtin_neon_vsqrtq_f16, NEON::BI__builtin_neon_vsqrtq_v, }, - { NEON::BI__builtin_neon_vst1_bf16_x2, NEON::BI__builtin_neon_vst1_x2_v }, - { NEON::BI__builtin_neon_vst1_bf16_x3, NEON::BI__builtin_neon_vst1_x3_v }, - { 
NEON::BI__builtin_neon_vst1_bf16_x4, NEON::BI__builtin_neon_vst1_x4_v }, - { NEON::BI__builtin_neon_vst1_bf16, NEON::BI__builtin_neon_vst1_v }, - { NEON::BI__builtin_neon_vst1_lane_bf16, NEON::BI__builtin_neon_vst1_lane_v }, - { NEON::BI__builtin_neon_vst1q_bf16_x2, NEON::BI__builtin_neon_vst1q_x2_v }, - { NEON::BI__builtin_neon_vst1q_bf16_x3, NEON::BI__builtin_neon_vst1q_x3_v }, - { NEON::BI__builtin_neon_vst1q_bf16_x4, NEON::BI__builtin_neon_vst1q_x4_v }, - { NEON::BI__builtin_neon_vst1q_bf16, NEON::BI__builtin_neon_vst1q_v }, - { NEON::BI__builtin_neon_vst1q_lane_bf16, NEON::BI__builtin_neon_vst1q_lane_v }, - { NEON::BI__builtin_neon_vst2_bf16, NEON::BI__builtin_neon_vst2_v }, - { NEON::BI__builtin_neon_vst2_lane_bf16, NEON::BI__builtin_neon_vst2_lane_v }, - { NEON::BI__builtin_neon_vst2q_bf16, NEON::BI__builtin_neon_vst2q_v }, - { NEON::BI__builtin_neon_vst2q_lane_bf16, NEON::BI__builtin_neon_vst2q_lane_v }, - { NEON::BI__builtin_neon_vst3_bf16, NEON::BI__builtin_neon_vst3_v }, - { NEON::BI__builtin_neon_vst3_lane_bf16, NEON::BI__builtin_neon_vst3_lane_v }, - { NEON::BI__builtin_neon_vst3q_bf16, NEON::BI__builtin_neon_vst3q_v }, - { NEON::BI__builtin_neon_vst3q_lane_bf16, NEON::BI__builtin_neon_vst3q_lane_v }, - { NEON::BI__builtin_neon_vst4_bf16, NEON::BI__builtin_neon_vst4_v }, - { NEON::BI__builtin_neon_vst4_lane_bf16, NEON::BI__builtin_neon_vst4_lane_v }, - { NEON::BI__builtin_neon_vst4q_bf16, NEON::BI__builtin_neon_vst4q_v }, - { NEON::BI__builtin_neon_vst4q_lane_bf16, NEON::BI__builtin_neon_vst4q_lane_v }, // The mangling rules cause us to have one ID for each type for vldap1(q)_lane // and vstl1(q)_lane, but codegen is equivalent for all of them. Choose an // arbitrary one to be handled as tha canonical variation. 
diff --git a/clang/test/CodeGen/AArch64/bf16-getset-intrinsics.c b/clang/test/CodeGen/AArch64/bf16-getset-intrinsics.c index c93e3ca31896c..a0ea29cff0a08 100644 --- a/clang/test/CodeGen/AArch64/bf16-getset-intrinsics.c +++ b/clang/test/CodeGen/AArch64/bf16-getset-intrinsics.c @@ -1,5 +1,5 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1_cg_arm64_neon -target-feature +bf16 -emit-llvm %s -disable-O0-optnone | opt -S -passes=mem2reg,sroa | FileCheck %s +// RUN: %clang_cc1_cg_arm64_neon -emit-llvm %s -disable-O0-optnone | opt -S -passes=mem2reg,sroa | FileCheck %s // REQUIRES: aarch64-registered-target || arm-registered-target diff --git a/clang/test/CodeGen/AArch64/bf16-lane-intrinsics.c b/clang/test/CodeGen/AArch64/bf16-lane-intrinsics.c index ccd6d17412a8b..c8212908315f5 100644 --- a/clang/test/CodeGen/AArch64/bf16-lane-intrinsics.c +++ b/clang/test/CodeGen/AArch64/bf16-lane-intrinsics.c @@ -1,7 +1,7 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -triple aarch64 -target-feature +neon -target-feature +bf16 \ +// RUN: %clang_cc1 -triple aarch64 -target-feature +neon \ // RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-LE %s -// RUN: %clang_cc1 -triple aarch64_be -target-feature +neon -target-feature +bf16 \ +// RUN: %clang_cc1 -triple aarch64_be -target-feature +neon \ // RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-BE %s // REQUIRES: aarch64-registered-target || arm-registered-target diff --git a/clang/test/CodeGen/AArch64/bf16-ldst-intrinsics.c b/clang/test/CodeGen/AArch64/bf16-ldst-intrinsics.c index 5d778e3b51d0e..44ddd578d81f2 100644 --- a/clang/test/CodeGen/AArch64/bf16-ldst-intrinsics.c +++ b/clang/test/CodeGen/AArch64/bf16-ldst-intrinsics.c @@ -1,7 +1,7 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: 
%clang_cc1 -triple aarch64 -target-feature +neon -target-feature +bf16 \ +// RUN: %clang_cc1 -triple aarch64 -target-feature +neon \ // RUN: -O2 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK64 -// RUN: %clang_cc1 -triple armv8.6a-arm-none-eabi -target-feature +neon -target-feature +bf16 -mfloat-abi hard \ +// RUN: %clang_cc1 -triple armv8.6a-arm-none-eabi -target-feature +neon -mfloat-abi hard \ // RUN: -O2 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK32 // REQUIRES: arm-registered-target,aarch64-registered-target diff --git a/clang/test/CodeGen/AArch64/bf16-reinterpret-intrinsics.c b/clang/test/CodeGen/AArch64/bf16-reinterpret-intrinsics.c index 88f2305e2782c..007a0b1b32b9e 100644 --- a/clang/test/CodeGen/AArch64/bf16-reinterpret-intrinsics.c +++ b/clang/test/CodeGen/AArch64/bf16-reinterpret-intrinsics.c @@ -1,5 +1,5 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 -// RUN: %clang_cc1 -triple aarch64 -target-feature +neon -target-feature +bf16 \ +// RUN: %clang_cc1 -triple aarch64 -target-feature +neon \ // RUN: -disable-O0-optnone -emit-llvm -o - %s \ // RUN: | opt -S -passes=mem2reg,sroa \ // RUN: | FileCheck %s diff --git a/clang/test/CodeGen/AArch64/neon-luti.c b/clang/test/CodeGen/AArch64/neon-luti.c index 4b485636d45b1..4017bfa315a66 100644 --- a/clang/test/CodeGen/AArch64/neon-luti.c +++ b/clang/test/CodeGen/AArch64/neon-luti.c @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 // REQUIRES: aarch64-registered-target #include -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +lut -target-feature +bf16 -O3 -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +lut -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature 
+neon -target-feature +lut -O3 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +lut -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // CHECK-LABEL: define dso_local <16 x i8> @test_vluti2_lane_u8( // CHECK-SAME: <8 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { diff --git a/clang/test/CodeGen/arm-bf16-reinterpret-intrinsics.c b/clang/test/CodeGen/arm-bf16-reinterpret-intrinsics.c index f8c3a94133131..de04466b3bce0 100644 --- a/clang/test/CodeGen/arm-bf16-reinterpret-intrinsics.c +++ b/clang/test/CodeGen/arm-bf16-reinterpret-intrinsics.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple armv8.2a-arm-none-eabi -target-feature +neon -target-feature +bf16 -mfloat-abi hard \ +// RUN: %clang_cc1 -triple armv8.2a-arm-none-eabi -target-feature +neon -mfloat-abi hard \ // RUN: -disable-O0-optnone -emit-llvm -o - %s \ // RUN: | opt -S -passes=instcombine \ // RUN: | FileCheck %s diff --git a/clang/test/Sema/aarch64-neon-immediate-ranges/luti.c b/clang/test/Sema/aarch64-neon-immediate-ranges/luti.c index bed8cbc1481dd..9daf3018273de 100644 --- a/clang/test/Sema/aarch64-neon-immediate-ranges/luti.c +++ b/clang/test/Sema/aarch64-neon-immediate-ranges/luti.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -target-feature +lut -target-feature +bf16 -ffreestanding -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -target-feature +lut -ffreestanding -fsyntax-only -verify %s #include // REQUIRES: aarch64-registered-target diff --git a/clang/test/Sema/aarch64-neon-target.c b/clang/test/Sema/aarch64-neon-target.c index ff1928832862d..6174a7d0a0694 100644 --- a/clang/test/Sema/aarch64-neon-target.c +++ b/clang/test/Sema/aarch64-neon-target.c @@ -93,9 +93,6 @@ void undefined(uint32x2_t v2i32, uint32x4_t v4i32, uint16x8_t v8i16, uint8x16_t // bf16 vbfdot_f32(v2f32, v4bf16, v4bf16); // 
expected-error {{always_inline function 'vbfdot_f32' requires target feature 'bf16'}} vcreate_bf16(10); - vdup_lane_bf16(v4bf16, 2); // expected-error {{'__builtin_neon_splat_lane_bf16' needs target feature bf16}} - vdup_n_bf16(bf16); // expected-error {{always_inline function 'vdup_n_bf16' requires target feature 'bf16'}} - vld1_bf16(0); // expected-error {{'__builtin_neon_vld1_bf16' needs target feature bf16}} vcvt_f32_bf16(v4bf16); // expected-error {{always_inline function 'vcvt_f32_bf16' requires target feature 'bf16'}} vcvt_bf16_f32(v4f32); // expected-error {{always_inline function 'vcvt_bf16_f32' requires target feature 'bf16'}} // f16mm / f16f32mm diff --git a/clang/test/Sema/aarch64-neon-without-target-feature.cpp b/clang/test/Sema/aarch64-neon-without-target-feature.cpp index 86dbb343198c5..97c01e0f51f5e 100644 --- a/clang/test/Sema/aarch64-neon-without-target-feature.cpp +++ b/clang/test/Sema/aarch64-neon-without-target-feature.cpp @@ -23,9 +23,9 @@ void undefined(uint32x2_t v2i32, uint32x4_t v4i32, uint16x8_t v8i16, uint8x16_t // bf16 vbfdot_f32(v2f32, v4bf16, v4bf16); // expected-error {{always_inline function 'vbfdot_f32' requires target feature 'neon'}} vcreate_bf16(10); - vdup_lane_bf16(v4bf16, 2); // expected-error {{'__builtin_neon_splat_lane_bf16' needs target feature bf16,neon}} + vdup_lane_bf16(v4bf16, 2); // expected-error {{'__builtin_neon_splat_lane_v' needs target feature neon}} vdup_n_bf16(bf16); // expected-error {{always_inline function 'vdup_n_bf16' requires target feature 'neon'}} - vld1_bf16(0); // expected-error {{'__builtin_neon_vld1_bf16' needs target feature bf16,neon}} + vld1_bf16(0); // expected-error {{'__builtin_neon_vld1_v' needs target feature neon}} vcvt_f32_bf16(v4bf16); // expected-error {{always_inline function 'vcvt_f32_bf16' requires target feature 'neon'}} vcvt_bf16_f32(v4f32); // expected-error {{always_inline function 'vcvt_bf16_f32' requires target feature 'neon'}} vmmlaq_f16_f16(v8f16, v8f16, v8f16); // 
expected-error {{always_inline function 'vmmlaq_f16_f16' requires target feature 'neon'}} diff --git a/clang/test/Sema/arm-neon-target.c b/clang/test/Sema/arm-neon-target.c index 1dc2b00925d61..f8d2da4aecc45 100644 --- a/clang/test/Sema/arm-neon-target.c +++ b/clang/test/Sema/arm-neon-target.c @@ -56,9 +56,6 @@ void undefined(uint32x2_t v2i32, uint32x4_t v4i32, uint16x8_t v8i16, uint8x16_t // bf16 vbfdot_f32(v2f32, v4bf16, v4bf16); // expected-error {{always_inline function 'vbfdot_f32' requires target feature 'bf16'}} vcreate_bf16(10); - vdup_lane_bf16(v4bf16, 2); // expected-error {{'__builtin_neon_splat_lane_bf16' needs target feature bf16}} - vdup_n_bf16(bf16); // expected-error {{always_inline function 'vdup_n_bf16' requires target feature 'bf16'}} - vld1_bf16(0); // expected-error {{'__builtin_neon_vld1_bf16' needs target feature bf16}} vcvt_f32_bf16(v4bf16); // expected-error {{always_inline function 'vcvt_f32_bf16' requires target feature 'bf16'}} vcvt_bf16_f32(v4f32); // expected-error {{always_inline function 'vcvt_bf16_f32' requires target feature 'bf16'}} // v8.1 - qrdmla From fdf3d44c9004eaf2ed112ced60b3f0e384724281 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 19 Jun 2026 11:16:55 +0100 Subject: [PATCH 020/149] [InstCombine] Add tests showing failure to fold pdep(0,x) and pext(0,x) to 0 (#204783) As noted on #204144 --- llvm/test/Transforms/InstCombine/pdep.ll | 18 ++++++++++++++++++ llvm/test/Transforms/InstCombine/pext.ll | 18 ++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/pdep.ll b/llvm/test/Transforms/InstCombine/pdep.ll index b726e87a6168c..73e13f4053a17 100644 --- a/llvm/test/Transforms/InstCombine/pdep.ll +++ b/llvm/test/Transforms/InstCombine/pdep.ll @@ -1,6 +1,24 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=instcombine -S | FileCheck %s +define i32 @test_pdep_32_zero_src(i32 %x) nounwind readnone { +; CHECK-LABEL: 
@test_pdep_32_zero_src( +; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.pdep.i32(i32 0, i32 [[X:%.*]]) +; CHECK-NEXT: ret i32 [[TMP1]] +; + %1 = tail call i32 @llvm.pdep.i32(i32 0, i32 %x) + ret i32 %1 +} + +define i64 @test_pdep_64_zero_src(i64 %x) nounwind readnone { +; CHECK-LABEL: @test_pdep_64_zero_src( +; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.pdep.i64(i64 0, i64 [[X:%.*]]) +; CHECK-NEXT: ret i64 [[TMP1]] +; + %1 = tail call i64 @llvm.pdep.i64(i64 0, i64 %x) + ret i64 %1 +} + define i32 @test_pdep_32_zero_mask(i32 %x) nounwind readnone { ; CHECK-LABEL: @test_pdep_32_zero_mask( ; CHECK-NEXT: ret i32 0 diff --git a/llvm/test/Transforms/InstCombine/pext.ll b/llvm/test/Transforms/InstCombine/pext.ll index 0f13f3f542023..c938abffd42cf 100644 --- a/llvm/test/Transforms/InstCombine/pext.ll +++ b/llvm/test/Transforms/InstCombine/pext.ll @@ -1,6 +1,24 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=instcombine -S | FileCheck %s +define i32 @test_pext_32_zero_src(i32 %x) nounwind readnone { +; CHECK-LABEL: @test_pext_32_zero_src( +; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.pext.i32(i32 0, i32 [[X:%.*]]) +; CHECK-NEXT: ret i32 [[TMP1]] +; + %1 = tail call i32 @llvm.pext.i32(i32 0, i32 %x) + ret i32 %1 +} + +define i64 @test_pext_64_zero_src(i64 %x) nounwind readnone { +; CHECK-LABEL: @test_pext_64_zero_src( +; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.pext.i64(i64 0, i64 [[X:%.*]]) +; CHECK-NEXT: ret i64 [[TMP1]] +; + %1 = tail call i64 @llvm.pext.i64(i64 0, i64 %x) + ret i64 %1 +} + define i32 @test_pext_32_zero_mask(i32 %x) nounwind readnone { ; CHECK-LABEL: @test_pext_32_zero_mask( ; CHECK-NEXT: ret i32 0 From e6daa6810e5ed49104a0f409d51e953905c9752e Mon Sep 17 00:00:00 2001 From: Garvit Gupta Date: Fri, 19 Jun 2026 15:50:21 +0530 Subject: [PATCH 021/149] Revert "Revert "[Compiler-rt][test] Fix circular link dependency between builtins and libc"" (#204728) Reverts llvm/llvm-project#203152 --- 
compiler-rt/test/builtins/Unit/lit.cfg.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/compiler-rt/test/builtins/Unit/lit.cfg.py b/compiler-rt/test/builtins/Unit/lit.cfg.py index 8d602d1c417fa..2bb72630a41e9 100644 --- a/compiler-rt/test/builtins/Unit/lit.cfg.py +++ b/compiler-rt/test/builtins/Unit/lit.cfg.py @@ -107,7 +107,9 @@ def get_libgcc_file_name(): if config.target_os == "Haiku": config.substitutions.append(("%librt ", base_lib + " -lroot ")) else: - config.substitutions.append(("%librt ", base_lib + " -lc -lm ")) + config.substitutions.append( + ("%librt ", "-lm -Wl,--start-group " + base_lib + " -lc -Wl,--end-group ") + ) builtins_test_crt = get_required_attr(config, "builtins_test_crt") if builtins_test_crt: From 500d1f848c9a5488b7ff5ab2aad09041935ae7ff Mon Sep 17 00:00:00 2001 From: Dmitry Sidorov Date: Fri, 19 Jun 2026 12:22:38 +0200 Subject: [PATCH 022/149] [SPIR-V] Fix crash on void indirect call with aggregate argument (#204388) removeAggregateTypesFromCalls named the call to key the type-restoration metadata, which asserts for void-returning calls. Key the metadata via instruction metadata on the call instead, which works for void results. 
--- .../Target/SPIRV/SPIRVPrepareFunctions.cpp | 15 ++++--- llvm/lib/Target/SPIRV/SPIRVUtils.cpp | 24 +++++++++-- .../fun-ptr-void-call-aggregate-arg.ll | 42 +++++++++++++++++++ 3 files changed, 71 insertions(+), 10 deletions(-) create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-void-call-aggregate-arg.ll diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp index 2543000e2c61e..04cce9915e0da 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp @@ -778,6 +778,7 @@ bool SPIRVPrepareFunctionsImpl::removeAggregateTypesFromCalls(Function *F) { IRBuilder<> B(F->getContext()); + unsigned MutatedCallIdx = 0; for (auto &&[CB, NewFnTy] : Calls) { SmallVector> ChangedTypes; SmallVector NewArgTypes; @@ -799,11 +800,13 @@ bool SPIRVPrepareFunctionsImpl::removeAggregateTypesFromCalls(Function *F) { NewFnTy = FunctionType::get(RetTy, NewArgTypes, CB->getFunctionType()->isVarArg()); - if (!CB->hasName()) - CB->setName("spv.mutated_callsite." + F->getName()); - else - CB->setName("spv.named_mutated_callsite." + F->getName() + "." + - CB->getName()); + // Keyed via instruction metadata, not a name. + std::string Key = + ("spv.mutated_callsite." + F->getName() + "." 
+ Twine(MutatedCallIdx++)) + .str(); + CB->setMetadata( + "spv.mutated_callsite", + MDNode::get(F->getContext(), MDString::get(F->getContext(), Key))); std::string Constraints; if (auto *ASM = dyn_cast(CB->getCalledOperand())) { @@ -817,7 +820,7 @@ bool SPIRVPrepareFunctionsImpl::removeAggregateTypesFromCalls(Function *F) { addFunctionTypeMutation( F->getParent()->getOrInsertNamedMetadata("spv.mutated_callsites"), - std::move(ChangedTypes), CB->getName(), Constraints); + std::move(ChangedTypes), Key, Constraints); } for (auto &&[CB, NewFTy] : Calls) { diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp index f4cfc1e642b23..7ffd6d1e86c21 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp @@ -116,17 +116,33 @@ FunctionType *getOriginalFunctionType(const Function &F) { F.getName()); } +// Keyed via instruction metadata, not a name. +static std::optional getMutatedCallsiteKey(const CallBase &CB) { + if (MDNode *MD = CB.getMetadata("spv.mutated_callsite")) + if (MD->getNumOperands() > 0) + if (auto *MDS = dyn_cast(MD->getOperand(0))) + return MDS->getString(); + return std::nullopt; +} + FunctionType *getOriginalFunctionType(const CallBase &CB) { + std::optional Key = getMutatedCallsiteKey(CB); + if (!Key) + return CB.getFunctionType(); return extractFunctionTypeFromMetadata( CB.getModule()->getNamedMetadata("spv.mutated_callsites"), - CB.getFunctionType(), CB.getName()); + CB.getFunctionType(), *Key); } StringRef getOriginalAsmConstraints(const CallBase &CB) { + StringRef Constraints = + cast(CB.getCalledOperand())->getConstraintString(); + std::optional Key = getMutatedCallsiteKey(CB); + if (!Key) + return Constraints; return extractAsmConstraintsFromMetadata( - CB.getModule()->getNamedMetadata("spv.mutated_callsites"), - cast(CB.getCalledOperand())->getConstraintString(), - CB.getName()); + CB.getModule()->getNamedMetadata("spv.mutated_callsites"), Constraints, + *Key); } } // Namespace 
SPIRV diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-void-call-aggregate-arg.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-void-call-aggregate-arg.ll new file mode 100644 index 0000000000000..6ecc9fd3e8ca3 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-void-call-aggregate-arg.ll @@ -0,0 +1,42 @@ +; Check that a void-returning indirect call whose argument is an aggregate +; doesn't crash while the aggregate argument type is temporarily mutated. +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s +; %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - -filetype=obj | spirv-val %} + +; The -discard-value-names run additionally checks that the per-callsite type +; restoration is keyed independently of value names. +; RUN: llvm-as < %s | llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers -discard-value-names -o - | FileCheck %s +; %if spirv-tools %{ llvm-as < %s | llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers -discard-value-names -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: OpCapability FunctionPointersINTEL +; CHECK-DAG: OpExtension "SPV_INTEL_function_pointers" + +; CHECK: %[[#Int32Ty:]] = OpTypeInt 32 0 +; CHECK: %[[#Agg2Ty:]] = OpTypeStruct %[[#Int32Ty]] %[[#Int32Ty]] +; CHECK: %[[#VoidTy:]] = OpTypeVoid +; CHECK: %[[#VoidCalleeTy:]] = OpTypeFunction %[[#VoidTy]] %[[#Agg2Ty]] +; CHECK: %[[#VoidCalleePtrTy:]] = OpTypePointer Generic %[[#VoidCalleeTy]] +; CHECK: %[[#Callee2Ty:]] = OpTypeFunction %[[#Agg2Ty]] %[[#Agg2Ty]] +; CHECK: %[[#Agg3Ty:]] = OpTypeStruct %[[#Int32Ty]] %[[#Int32Ty]] %[[#Int32Ty]] +; CHECK: %[[#Callee3Ty:]] = OpTypeFunction %[[#Agg3Ty]] %[[#Agg3Ty]] + +; CHECK: %[[#Fp:]] = 
OpFunctionParameter %[[#VoidCalleePtrTy]] +; CHECK: %[[#Arg:]] = OpFunctionParameter %[[#Agg2Ty]] +; CHECK: OpFunctionPointerCallINTEL %[[#VoidTy]] %[[#Fp]] %[[#Arg]] + +; CHECK: OpFunctionPointerCallINTEL %[[#Agg2Ty]] +; CHECK: OpFunctionPointerCallINTEL %[[#Agg3Ty]] + +%agg2 = type { i32, i32 } +%agg3 = type { i32, i32, i32 } + +define spir_func void @caller(ptr addrspace(4) %fp, %agg2 %a) { + call addrspace(4) void %fp(%agg2 %a) + ret void +} + +define spir_func void @caller_two(ptr addrspace(4) %fp2, ptr addrspace(4) %fp3, %agg2 %a, %agg3 %b) { + %r2 = call addrspace(4) %agg2 %fp2(%agg2 %a) + %r3 = call addrspace(4) %agg3 %fp3(%agg3 %b) + ret void +} From b90ec9c2747f336991f392dc843ecff1ca2faed7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96mer=20Sinan=20A=C4=9Facan?= Date: Fri, 19 Jun 2026 11:23:03 +0100 Subject: [PATCH 023/149] [StackColoring] Remove unused BB numbering state (#204414) --- llvm/lib/CodeGen/StackColoring.cpp | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/llvm/lib/CodeGen/StackColoring.cpp b/llvm/lib/CodeGen/StackColoring.cpp index cdb0ca5147728..ea4c49dae8260 100644 --- a/llvm/lib/CodeGen/StackColoring.cpp +++ b/llvm/lib/CodeGen/StackColoring.cpp @@ -402,11 +402,8 @@ class StackColoring { using LivenessMap = DenseMap; LivenessMap BlockLiveness; - /// Maps serial numbers to basic blocks. - DenseMap BasicBlocks; - - /// Maps basic blocks to a serial number. - SmallVector BasicBlockNumbering; + /// Depth-first ordering of the basic blocks. + SmallVector BasicBlockOrdering; /// Maps slots to their use interval. Outside of this interval, slots /// values are either dead or `undef` and they will not be written to. @@ -640,6 +637,8 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) { // Step 1: collect markers and populate the "InterestingSlots" // and "ConservativeSlots" sets. 
for (MachineBasicBlock *MBB : depth_first(MF)) { + BasicBlockOrdering.push_back(MBB); + // Compute the set of slots for which we've seen a START marker but have // not yet seen an END marker at this point in the walk (e.g. on entry // to this bb). @@ -727,14 +726,7 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) { LLVM_DEBUG(dumpBV("Conservative slots", ConservativeSlots)); // Step 2: compute begin/end sets for each block - - // NOTE: We use a depth-first iteration to ensure that we obtain a - // deterministic numbering. - for (MachineBasicBlock *MBB : depth_first(MF)) { - // Assign a serial number to this basic block. - BasicBlocks[MBB] = BasicBlockNumbering.size(); - BasicBlockNumbering.push_back(MBB); - + for (const MachineBasicBlock *MBB : BasicBlockOrdering) { // Keep a reference to avoid repeated lookups. BlockLifetimeInfo &BlockInfo = BlockLiveness[MBB]; @@ -742,7 +734,7 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) { BlockInfo.End.resize(NumSlot); SmallVector slots; - for (MachineInstr &MI : *MBB) { + for (const MachineInstr &MI : *MBB) { bool isStart = false; slots.clear(); if (isLifetimeStartOrEnd(MI, slots, isStart)) { @@ -791,7 +783,7 @@ void StackColoring::calculateLocalLiveness() { changed = false; ++NumIters; - for (const MachineBasicBlock *BB : BasicBlockNumbering) { + for (const MachineBasicBlock *BB : BasicBlockOrdering) { // Use an iterator to avoid repeated lookups. 
LivenessMap::iterator BI = BlockLiveness.find(BB); assert(BI != BlockLiveness.end() && "Block not found"); @@ -1213,8 +1205,7 @@ bool StackColoring::run(MachineFunction &Func, bool OnlyRemoveMarkers) { MF = &Func; MFI = &MF->getFrameInfo(); BlockLiveness.clear(); - BasicBlocks.clear(); - BasicBlockNumbering.clear(); + BasicBlockOrdering.clear(); Markers.clear(); Intervals.clear(); LiveStarts.clear(); From f6fd6ea3c1b59728c3c8f3443ef9a02d367ec8e0 Mon Sep 17 00:00:00 2001 From: bogdan-petkovic Date: Fri, 19 Jun 2026 12:34:25 +0200 Subject: [PATCH 024/149] [mlir][ExecutionEngine] Fix dead -Wno-c++98-compat-extra-semi guard (#204524) `check_cxx_compiler_flag` stores its result in `CXX_SUPPORTS_NO_CXX98_COMPAT_EXTRA_SEMI_FLAG`, but the guarding `if()` checked `CXX_SUPPORTS_CXX98_COMPAT_EXTRA_SEMI_FLAG` (without `_NO_`), which is never set. The condition was therefore always false and the `-Wno-c++98-compat-extra-semi` suppression for `mlir_rocm_runtime` was never applied. The sibling flag checks in the same block (`-Wno-return-type-c-linkage`, `-Wno-nested-anon-types`, `-Wno-gnu-anonymous-struct`) already use matching variable names, so this aligns the typo'd guard with the established pattern. No test is included, this is a build-system-only (CMake) change to a warning-suppression guard and is not unit-testable. 
Signed-off-by: bogdan-petkovic --- mlir/lib/ExecutionEngine/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt index 2176ccaa6031f..87af4724f159a 100644 --- a/mlir/lib/ExecutionEngine/CMakeLists.txt +++ b/mlir/lib/ExecutionEngine/CMakeLists.txt @@ -414,7 +414,7 @@ if(LLVM_ENABLE_PIC) # Supress compiler warnings from HIP headers check_cxx_compiler_flag(-Wno-c++98-compat-extra-semi CXX_SUPPORTS_NO_CXX98_COMPAT_EXTRA_SEMI_FLAG) - if (CXX_SUPPORTS_CXX98_COMPAT_EXTRA_SEMI_FLAG) + if (CXX_SUPPORTS_NO_CXX98_COMPAT_EXTRA_SEMI_FLAG) target_compile_options(mlir_rocm_runtime PRIVATE "-Wno-c++98-compat-extra-semi") endif() From 3cc94639047973169a9874139b8d95e17cdba50b Mon Sep 17 00:00:00 2001 From: Ryotaro Kasuga Date: Fri, 19 Jun 2026 19:46:19 +0900 Subject: [PATCH 025/149] [Delinearization] Narrow the scope of the term collection (#204145) In parametric delinearization, it collects subexpressions whose SCEV type is `SCEVUnknown` and uses them as candidates for the array dimensions. When traversing these subexpressions, it may follow any kind of expression. For example, if it follows a `sext` expression, this can lead to type inconsistencies among the collected terms. This patch fixes this issue by preventing traversal into subexpressions other than `SCEVAddExpr` or `SCEVAddRecExpr`. Note: I tried to minimize the test case, but this seems to be as far as it can go. Fix #204066. 
--- llvm/lib/Analysis/Delinearization.cpp | 16 +++---- .../Delinearization/inconsistent-types.ll | 44 +++++++++++++++++++ 2 files changed, 49 insertions(+), 11 deletions(-) create mode 100644 llvm/test/Analysis/Delinearization/inconsistent-types.ll diff --git a/llvm/lib/Analysis/Delinearization.cpp b/llvm/lib/Analysis/Delinearization.cpp index 2821cec610d18..5b525b43e4931 100644 --- a/llvm/lib/Analysis/Delinearization.cpp +++ b/llvm/lib/Analysis/Delinearization.cpp @@ -72,16 +72,12 @@ struct SCEVCollectTerms { bool follow(const SCEV *S) { if (isa(S) || isa(S) || - isa(S)) { + isa(S)) if (!containsUndefs(S)) Terms.push_back(S); - // Stop recursion: once we collected a term, do not walk its operands. - return false; - } - - // Keep looking. - return true; + // Keep looking when S is a specific type expression. + return isa(S); } bool isDone() const { return false; } @@ -154,12 +150,10 @@ struct SCEVCollectAddRecMultiplies { return false; Terms.push_back(SE.getMulExpr(Operands)); - // Stop recursion: once we collected a term, do not walk its operands. - return false; } - // Keep looking. - return true; + // Keep looking when S is a specific type expression. + return isa(S); } bool isDone() const { return false; } diff --git a/llvm/test/Analysis/Delinearization/inconsistent-types.ll b/llvm/test/Analysis/Delinearization/inconsistent-types.ll new file mode 100644 index 0000000000000..3effd1a08f43c --- /dev/null +++ b/llvm/test/Analysis/Delinearization/inconsistent-types.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -passes='print' -disable-output 2>&1 | FileCheck %s + +; Ensure that the program doesn't crash. Previously, the array access was +; delinearized as follows: +; +; ArrayDecl[UnknownSize][%B3] with elements of 4 bytes. 
+; ArrayRef[0][{(zext i32 ({0,+,1}<%outer.preheader> * %B3) to i64),+,1}<%inner>] +; +; Because `%B3` is i32, an assertion failure was triggered during the +; delinearization validation. +; +define void @f(i32 %n, ptr %A) { +; CHECK-LABEL: 'f' +; CHECK-NEXT: Inst: store i32 0, ptr %arrayidx, align 4 +; CHECK-NEXT: AccessFunction: {(4 * (zext i32 ({0,+,1}<%outer.preheader> * %B3) to i64)),+,4}<%inner> +; CHECK-NEXT: failed to delinearize +; +entry: + br i1 false, label %outer.preheader, label %exit + +outer.preheader: + %i = phi i32 [ 0, %entry ], [ %i.inc, %outer.latch ] + %B3 = and i32 0, %n + br label %inner + +inner: + %j = phi i64 [ 0, %outer.preheader ], [ %j.inc, %inner ] + %mul = mul i32 %B3, %i + %trunc = trunc i64 %j to i32 + %add = add i32 %mul, %trunc + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr i32, ptr %A, i64 %idxprom + store i32 0, ptr %arrayidx, align 4 + %j.inc = add i64 %j, 1 + br i1 false, label %inner, label %outer.latch + +outer.latch: + %i.inc = add i32 %i, 1 + br i1 false, label %outer.preheader, label %exit + +exit: + ret void +} From 60a2d437bd040a494cfe87fa8f44ebad18db2196 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Fri, 19 Jun 2026 11:49:56 +0100 Subject: [PATCH 026/149] [AArch64] Add SVE shuffle optimization pass (#193951) Add a pass to perform VLA shuffle optimizations for SVE. First up is using tbl to replace deinterleave4+uunpk+zext/uitofp by generating shuffle masks with index, exploiting the fact that out-of-range indices in the mask produce zeroes in the result vector. That way, we can easily zero-extend smaller elements by using the destination type when generating the mask, and having one index in range with several out-of-range for each destination element. 
--- llvm/lib/Target/AArch64/AArch64.h | 14 + .../Target/AArch64/AArch64PassRegistry.def | 6 + .../Target/AArch64/AArch64TargetMachine.cpp | 14 + llvm/lib/Target/AArch64/CMakeLists.txt | 1 + llvm/lib/Target/AArch64/SVEShuffleOpts.cpp | 293 ++++++++ llvm/test/CodeGen/AArch64/O3-pipeline.ll | 3 + .../CodeGen/AArch64/sve-tbl-folding-new-pm.ll | 210 ++++++ .../CodeGen/AArch64/sve-tbl-folding-opts.ll | 642 ++++++++++++++++++ 8 files changed, 1183 insertions(+) create mode 100644 llvm/lib/Target/AArch64/SVEShuffleOpts.cpp create mode 100644 llvm/test/CodeGen/AArch64/sve-tbl-folding-new-pm.ll create mode 100644 llvm/test/CodeGen/AArch64/sve-tbl-folding-opts.ll diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h index e84c2bab20207..4fdba2c7dbfcc 100644 --- a/llvm/lib/Target/AArch64/AArch64.h +++ b/llvm/lib/Target/AArch64/AArch64.h @@ -17,6 +17,7 @@ #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/StringMap.h" +#include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionAnalysisManager.h" #include "llvm/CodeGen/SelectionDAGISel.h" @@ -24,6 +25,7 @@ #include "llvm/PassRegistry.h" #include "llvm/Support/DataTypes.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include #include @@ -77,6 +79,7 @@ FunctionPass *createSMEPeepholeOptPass(); FunctionPass *createMachineSMEABIPass(CodeGenOptLevel); FunctionPass *createAArch64SRLTDefineSuperRegsPass(); ModulePass *createSVEIntrinsicOptsPass(); +Pass *createSVEShuffleOptsPass(); InstructionSelector * createAArch64InstructionSelector(const AArch64TargetMachine &, const AArch64Subtarget &, @@ -200,8 +203,19 @@ void initializeSMEPeepholeOptPass(PassRegistry &); void initializeMachineSMEABIPass(PassRegistry &); void initializeAArch64SRLTDefineSuperRegsPass(PassRegistry &); void initializeSVEIntrinsicOptsPass(PassRegistry &); +void 
initializeSVEShuffleOptsPass(PassRegistry &); void initializeAArch64Arm64ECCallLoweringPass(PassRegistry &); +class SVEShuffleOptsPass : public PassInfoMixin { + const AArch64TargetMachine &TM; + +public: + explicit SVEShuffleOptsPass(const AArch64TargetMachine &TM) : TM(TM) {} + LLVM_ABI PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &U); +}; + class AArch64StackTaggingPreRAPass : public OptionalPassInfoMixin { public: diff --git a/llvm/lib/Target/AArch64/AArch64PassRegistry.def b/llvm/lib/Target/AArch64/AArch64PassRegistry.def index 6f9eb76930d18..1fc09fc00e9ee 100644 --- a/llvm/lib/Target/AArch64/AArch64PassRegistry.def +++ b/llvm/lib/Target/AArch64/AArch64PassRegistry.def @@ -24,6 +24,12 @@ MODULE_PASS("aarch64-lower-homogeneous-prolog-epilog", AArch64LowerHomogeneousPr #endif #undef FUNCTION_PASS +#ifndef LOOP_PASS +#define LOOP_PASS(NAME, CREATE_PASS) +#endif +LOOP_PASS("aarch64-sve-shuffle-opts", SVEShuffleOptsPass(*static_cast(this))) +#undef LOOP_PASS + #ifndef MACHINE_FUNCTION_PASS #define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS) #endif diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 24224a5b194ee..b73945c53235e 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -229,6 +229,12 @@ static cl::opt EnableSRLTSubregToRegMitigation( "super-regs when using Subreg Liveness Tracking"), cl::init(true), cl::Hidden); +static cl::opt EnableSVEShuffleOpt( + "aarch64-enable-sve-shuffle-opts", + cl::desc("Enable pattern matching of shuffles that could make use of SVE " + "instructions like tbl or the bottom/top variants"), + cl::init(true), cl::Hidden); + extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() { // Register the target. 
@@ -281,6 +287,7 @@ LLVMInitializeAArch64Target() { initializeAArch64DAGToDAGISelLegacyPass(PR); initializeAArch64CondBrTuningPass(PR); initializeAArch64Arm64ECCallLoweringPass(PR); + initializeSVEShuffleOptsPass(PR); } bool AArch64TargetMachine::isGlobalISelOptNone() const { @@ -678,6 +685,13 @@ void AArch64PassConfig::addIRPasses() { addPass(createAArch64StackTaggingPass( /*IsOptNone=*/TM->getOptLevel() == CodeGenOptLevel::None)); + // Try to use tbl in place of other shuffling operations if doing so would + // reduce the total number of instructions. Shuffle masks for big endian may + // be different, so require a little endian target. + if (TM->createDataLayout().isLittleEndian() && + getOptLevel() >= CodeGenOptLevel::Default && EnableSVEShuffleOpt) + addPass(createSVEShuffleOptsPass()); + // Match complex arithmetic patterns if (TM->getOptLevel() >= CodeGenOptLevel::Default) addPass(createComplexDeinterleavingPass(TM)); diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt index 36be664e6eeb4..e3f7f697b69d5 100644 --- a/llvm/lib/Target/AArch64/CMakeLists.txt +++ b/llvm/lib/Target/AArch64/CMakeLists.txt @@ -92,6 +92,7 @@ add_llvm_target(AArch64CodeGen AArch64TargetTransformInfo.cpp SMEPeepholeOpt.cpp SVEIntrinsicOpts.cpp + SVEShuffleOpts.cpp MachineSMEABIPass.cpp AArch64SRLTDefineSuperRegs.cpp AArch64SIMDInstrOpt.cpp diff --git a/llvm/lib/Target/AArch64/SVEShuffleOpts.cpp b/llvm/lib/Target/AArch64/SVEShuffleOpts.cpp new file mode 100644 index 0000000000000..9c0f39428c6f9 --- /dev/null +++ b/llvm/lib/Target/AArch64/SVEShuffleOpts.cpp @@ -0,0 +1,293 @@ +//===------- SVEShuffleOpts - SVE Shuffle Optimization --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Tries to pattern match and combine scalable vector shuffles that could +// be more efficiently performed by tbl instructions. +// +// An example would be a loop with 4 multiply-accumulate reductions, where the +// new data in each vector iteration comes from a 4-way deinterleaving of +// smaller datatypes loaded from memory which are then zero extended. +// +// Something like the following: +// %bgra = call ... @llvm.masked.load +// %deinterleave = call ... @llvm.vector.deinterleave4(%bgra) +// If the load was of a , we now have 4 deinterleaved +// values. +// %b.i16 = extractvalue %deinterleave, 0 +// %b.i64 = zext %b.i16 to +// %acc.b.next = add %acc.b, %b.i64 +// +// +// If the initial load is a legal vector rather than 4x the size (generating a +// structured ld4 instead), we would see multiple uunpkhi/lo instructions for +// the extensions, followed by uzp1/2 instructions for the deinterleave. +// Instead, we can replace all of those with 4 tbl instructions. The tradeoff, +// of course, is that we now have 4 mask values to maintain which may increase +// register pressure. +// +// This basic transform could be performed in CodeGenPrepare (as the equivalent +// for NEON is), or in a DAG Combine. However, we hope to extend it to detect +// other shuffles that we can fold into the tbl. Extending the above example, +// if instead of directly adding to the accumulator we multiplied it by a +// common term for all 4 components that had been reversed: +// %common.load = call @llvm.masked.load +// %common.reverse = call @llvm.vector.reverse +// These would be loaded at the extended size, in our +// example. 
+// %b.mul = mul %b.i64, %common.reverse +// %acc.b.next = add %acc.b, %b.mul +// + +using namespace llvm; +using namespace llvm::PatternMatch; + +#define DEBUG_TYPE "aarch64-sve-shuffle-opts" + +/// A mapping between a vector_deinterleaveN intrinsic and extending cast +/// instructions used on the resulting subvectors. +using DeinterleaveMap = SmallDenseMap>; + +/// Evaluate a deinterleave and see what the uses are. If we find other +/// operations that we can combine into a tbl shuffle, add the deinterleave and +/// the operations (currently only zext or uitofp) to the candidates map. +static void evaluateDeinterleave(IntrinsicInst *I, DeinterleaveMap &Candidates, + Loop &L, const AArch64TargetLowering &TL, + const DataLayout DL) { + unsigned IntId = I->getIntrinsicID(); + assert(IntId == Intrinsic::vector_deinterleave4 && + "Only deinterleave4 supported currently"); + + ConstantRange VScaleRange = getVScaleRange(I->getFunction(), 64); + // TBL zeroes elements with an out-of-bounds index, but for the largest + // possible SVE vector (2048b) the maximum value for i8 elements (255) is not + // large enough to encode an 'out of bounds' value. So we can only perform + // this optimization for i8 elements if we know vscale is < 16. + EVT InputVT = TL.getValueType(DL, I->getOperand(0)->getType()); + if (!InputVT.isScalableVector() || + (InputVT.getScalarSizeInBits() < 16 && + (!VScaleRange.getUpper().ult(16) || VScaleRange.isUpperWrapped())) || + TL.getTypeConversion(I->getContext(), InputVT).first != + TargetLoweringBase::TypeLegal) + return; + + std::array Extends = {}; + unsigned Opcode = 0; + Type *DestTy = nullptr; + for (User *U : I->users()) { + auto *Extract = dyn_cast(U); + if (!Extract || !Extract->hasOneUse()) + return; + + // We expect only a single cast instruction as a user for the extract. 
+ auto *Extend = dyn_cast_if_present(*Extract->users().begin()); + if (!Extend || (!isa(Extend) && !isa(Extend))) + return; + + // We're only interested if the uses are in the loop. This is almost + // certainly the case. + if (!L.contains(Extend)) + return; + + Opcode = Extend->getOpcode(); + DestTy = Extend->getDestTy(); + + // Make sure DestTy matches the input size. + if (DestTy->getPrimitiveSizeInBits() != InputVT.getSizeInBits()) + return; + + Extends[Extract->getIndices().front()] = Extend; + } + + // Check that all extracted values are being extended the same way, and that + // we have the expected number of extensions. + if (!all_of(Extends, [DestTy, Opcode](CastInst *CI) { + return !CI || (CI->getDestTy() == DestTy && CI->getOpcode() == Opcode); + })) + return; + + Candidates.try_emplace(I, Extends); +} + +/// Given a map of deinterleaves to zext or uitofp casts, remove the operations +/// and replace them with tbl shuffles. +static void optimizeSVEDeinterleavedExtends(DeinterleaveMap Deinterleaves) { + for (auto &[Deinterleave, Extends] : Deinterleaves) { + VectorType *DestTy = cast(Extends[0]->getDestTy()); + VectorType *SrcTy = cast(Extends[0]->getSrcTy()); + unsigned DstBits = DestTy->getScalarSizeInBits(); + unsigned SrcBits = SrcTy->getScalarSizeInBits(); + bool IsUIToFP = isa(Extends[0]); + VectorType *StepVecTy = VectorType::getInteger(DestTy); + Value *Input = Deinterleave->getOperand(0); + Type *InputTy = Input->getType(); + + APInt Invalid = APInt::getAllOnes(DstBits); + for (auto [Idx, Extend] : enumerate(Extends)) { + // If not all lanes were extracted, we can have gaps. Skip over them. + if (!Extend) + continue; + // Build the mask using stepvectors and casting. + // We want to select the Idx'th element, and every 4 elements after that. + // Each element needs to be zero extended; we can do that by providing + // tbl index values that are out of range. 
We can't do that nicely with + // a stepvector of the same element type as the input type, but we can + // do it with elements the size of the output type. + // E.g. for element 0 of a 16b -> 64b zext, we would start with a mask of + // 0xFFFF_FFFF_FFFF_0000 + Idx for the start of the stepvector, and use a + // step of 4. We then cast that back to an element size of 16b, yielding + // <0x0000 + Idx, 0xFFFF, 0xFFFF, 0xFFFF, 0x0004 + Idx, 0xFFFF...>. + APInt StartIdx = Invalid << SrcBits; + StartIdx += Idx; + IRBuilder<> Builder(Extend); + Value *StepVector = Builder.CreateStepVector(StepVecTy); + Value *ScaledSteps = + Builder.CreateNUWMul(StepVector, ConstantInt::get(StepVecTy, 4)); + Value *ZextTbl = Builder.CreateNUWAdd( + ScaledSteps, ConstantInt::get(StepVecTy, StartIdx)); + Value *FinalMask = Builder.CreateBitCast(ZextTbl, InputTy); + + // Replace the deinterleave, extractvalue, and extension chain with + // a tbl directly on the input value. + Value *Tbl = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_tbl, + {InputTy}, {Input, FinalMask}); + Value *Widen = Builder.CreateBitCast(Tbl, StepVecTy); + if (IsUIToFP) + Widen = Builder.CreateUIToFP(Widen, DestTy); + LLVM_DEBUG(dbgs() << "SVETBLOPT: Replaced " << *Extend << " with " + << *Widen << "\n"); + Extend->replaceAllUsesWith(Widen); + Extend->eraseFromParent(); + } + + // Delete the unused extracts and deinterleave. + for (User *U : make_early_inc_range(Deinterleave->users())) + cast(U)->eraseFromParent(); + Deinterleave->eraseFromParent(); + } +} + +static bool processLoop(Loop &L, const AArch64Subtarget &ST, DataLayout DL) { + // At present, we only want to do this for innermost loops when SVE + // is available. + if (!L.isInnermost() || !ST.isSVEorStreamingSVEAvailable()) + return false; + + // TODO: Pull other shuffles into the tbl where possible. + // TODO: Add more advanced cases, such as introducing shuffles so that + // the SVE odd/even BT narrowing instructions can be used. 
+ // TODO: Support other deinterleaves. + const AArch64TargetLowering &TL = *ST.getTargetLowering(); + assert(DL.isLittleEndian() && + "Shuffle optimizations unsupported for big endian targets."); + DeinterleaveMap Candidates; + for (auto *BB : L.blocks()) + for (auto &I : *BB) + if (match(&I, m_Intrinsic(m_Value()))) + evaluateDeinterleave(cast(&I), Candidates, L, TL, DL); + + if (Candidates.empty()) + return false; + + optimizeSVEDeinterleavedExtends(Candidates); + return true; +} + +namespace { +struct SVEShuffleOpts : public LoopPass { + static char ID; // Pass identification, replacement for typeid + SVEShuffleOpts() : LoopPass(ID) {} + + bool runOnLoop(Loop *L, LPPassManager &PM) override { + if (skipLoop(L)) + return false; + + TargetPassConfig &TPC = getAnalysis(); + const AArch64TargetMachine &TM = TPC.getTM(); + const AArch64Subtarget &ST = + *TM.getSubtargetImpl(*L->getHeader()->getParent()); + + return processLoop(*L, ST, TM.createDataLayout()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.setPreservesCFG(); + } + + StringRef getPassName() const override { return "SVE Shuffle Optimizations"; } +}; +} // end anonymous namespace + +char SVEShuffleOpts::ID = 0; +static const char *name = "SVE Shuffle Optimizations"; +INITIALIZE_PASS_BEGIN(SVEShuffleOpts, DEBUG_TYPE, name, false, false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(SVEShuffleOpts, DEBUG_TYPE, name, false, false) + +Pass *llvm::createSVEShuffleOptsPass() { return new SVEShuffleOpts(); } + +PreservedAnalyses SVEShuffleOptsPass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &U) { + const AArch64Subtarget &ST = + *TM.getSubtargetImpl(*L.getHeader()->getParent()); + + if (processLoop(L, ST, TM.createDataLayout())) { + PreservedAnalyses PA; + PA.preserveSet(); + PA.preserve(); + PA.preserve(); + PA.preserve(); + return PA; + } + + return PreservedAnalyses::all(); +} diff --git 
a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll index 1a0ffe234a236..08d3b94530d14 100644 --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -84,6 +84,9 @@ ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: AArch64 Stack Tagging +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Loop Pass Manager +; CHECK-NEXT: SVE Shuffle Optimizations ; CHECK-NEXT: Complex Deinterleaving Pass ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Memory SSA diff --git a/llvm/test/CodeGen/AArch64/sve-tbl-folding-new-pm.ll b/llvm/test/CodeGen/AArch64/sve-tbl-folding-new-pm.ll new file mode 100644 index 0000000000000..6a533a2419255 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-tbl-folding-new-pm.ll @@ -0,0 +1,210 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt -passes=aarch64-sve-shuffle-opts -mtriple=aarch64-linux-gnu -mattr=+sve -S < %s | FileCheck %s + +define void @zext_nxv8i16_to_nxv8i64_deinterleave_in_loop(ptr %src, ptr %dst, %mask) #0 { +; CHECK-LABEL: define void @zext_nxv8i16_to_nxv8i64_deinterleave_in_loop( +; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], [[MASK:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[VSCALE:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[STRIDE:%.*]] = shl nuw nsw i64 [[VSCALE]], 1 +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[ACC_B_I64:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[ADD_B_I64:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[ACC_G_I64:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[ADD_G_I64:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[ACC_R_I64:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[ADD_R_I64:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[ACC_A_I64:%.*]] = phi [ 
zeroinitializer, %[[ENTRY]] ], [ [[ADD_A_I64:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds nuw [4 x i16], ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[BGRA:%.*]] = call @llvm.masked.load.nxv8i16.p0(ptr [[SRC_GEP]], [[MASK]], zeroinitializer) +; CHECK-NEXT: [[TMP0:%.*]] = call @llvm.stepvector.nxv2i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw [[TMP0]], splat (i64 4) +; CHECK-NEXT: [[TMP2:%.*]] = add nuw [[TMP1]], splat (i64 -65536) +; CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.aarch64.sve.tbl.nxv8i16( [[BGRA]], [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i64() +; CHECK-NEXT: [[TMP7:%.*]] = mul nuw [[TMP6]], splat (i64 4) +; CHECK-NEXT: [[TMP8:%.*]] = add nuw [[TMP7]], splat (i64 -65535) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast [[TMP8]] to +; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.aarch64.sve.tbl.nxv8i16( [[BGRA]], [[TMP9]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +; CHECK-NEXT: [[TMP12:%.*]] = call @llvm.stepvector.nxv2i64() +; CHECK-NEXT: [[TMP13:%.*]] = mul nuw [[TMP12]], splat (i64 4) +; CHECK-NEXT: [[TMP14:%.*]] = add nuw [[TMP13]], splat (i64 -65534) +; CHECK-NEXT: [[TMP15:%.*]] = bitcast [[TMP14]] to +; CHECK-NEXT: [[TMP16:%.*]] = call @llvm.aarch64.sve.tbl.nxv8i16( [[BGRA]], [[TMP15]]) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast [[TMP16]] to +; CHECK-NEXT: [[TMP18:%.*]] = call @llvm.stepvector.nxv2i64() +; CHECK-NEXT: [[TMP19:%.*]] = mul nuw [[TMP18]], splat (i64 4) +; CHECK-NEXT: [[TMP20:%.*]] = add nuw [[TMP19]], splat (i64 -65533) +; CHECK-NEXT: [[TMP21:%.*]] = bitcast [[TMP20]] to +; CHECK-NEXT: [[TMP22:%.*]] = call @llvm.aarch64.sve.tbl.nxv8i16( [[BGRA]], [[TMP21]]) +; CHECK-NEXT: [[TMP23:%.*]] = bitcast [[TMP22]] to +; CHECK-NEXT: [[ADD_B_I64]] = add [[ACC_B_I64]], [[TMP5]] +; CHECK-NEXT: [[ADD_G_I64]] = add [[ACC_G_I64]], [[TMP11]] +; CHECK-NEXT: [[ADD_R_I64]] = add [[ACC_R_I64]], [[TMP17]] +; CHECK-NEXT: 
[[ADD_A_I64]] = add [[ACC_A_I64]], [[TMP23]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[STRIDE]] +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 2048 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[ADD_B_I64_LCSSA:%.*]] = phi [ [[ADD_B_I64]], %[[LOOP]] ] +; CHECK-NEXT: [[ADD_G_I64_LCSSA:%.*]] = phi [ [[ADD_G_I64]], %[[LOOP]] ] +; CHECK-NEXT: [[ADD_R_I64_LCSSA:%.*]] = phi [ [[ADD_R_I64]], %[[LOOP]] ] +; CHECK-NEXT: [[ADD_A_I64_LCSSA:%.*]] = phi [ [[ADD_A_I64]], %[[LOOP]] ] +; CHECK-NEXT: store [[ADD_B_I64_LCSSA]], ptr [[DST]], align 16 +; CHECK-NEXT: [[G_I64_GEP:%.*]] = getelementptr , ptr [[DST]], i64 1 +; CHECK-NEXT: store [[ADD_G_I64_LCSSA]], ptr [[G_I64_GEP]], align 16 +; CHECK-NEXT: [[R_I64_GEP:%.*]] = getelementptr , ptr [[DST]], i64 2 +; CHECK-NEXT: store [[ADD_R_I64_LCSSA]], ptr [[R_I64_GEP]], align 16 +; CHECK-NEXT: [[A_I64_GEP:%.*]] = getelementptr , ptr [[DST]], i64 3 +; CHECK-NEXT: store [[ADD_A_I64_LCSSA]], ptr [[A_I64_GEP]], align 16 +; CHECK-NEXT: ret void +; +entry: + %vscale = tail call i64 @llvm.vscale.i64() + %stride = shl nuw nsw i64 %vscale, 1 + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %acc.b.i64 = phi [ splat(i64 0), %entry ], [ %add.b.i64, %loop ] + %acc.g.i64 = phi [ splat(i64 0), %entry ], [ %add.g.i64, %loop ] + %acc.r.i64 = phi [ splat(i64 0), %entry ], [ %add.r.i64, %loop ] + %acc.a.i64 = phi [ splat(i64 0), %entry ], [ %add.a.i64, %loop ] + %src.gep = getelementptr inbounds nuw [4 x i16], ptr %src, i64 %iv + %bgra = call @llvm.masked.load(ptr %src.gep, %mask, zeroinitializer) + %deinterleave = tail call { , , , } @llvm.vector.deinterleave4( %bgra) + %b.i16 = extractvalue { , , , } %deinterleave, 0 + %g.i16 = extractvalue { , , , } %deinterleave, 1 + %r.i16 = extractvalue { , , , } %deinterleave, 2 + %a.i16 = extractvalue { , , , } %deinterleave, 3 + %b.i64 = zext %b.i16 to + %g.i64 = zext %g.i16 to + %r.i64 = zext %r.i16 to + %a.i64 = 
zext %a.i16 to + %add.b.i64 = add %acc.b.i64, %b.i64 + %add.g.i64 = add %acc.g.i64, %g.i64 + %add.r.i64 = add %acc.r.i64, %r.i64 + %add.a.i64 = add %acc.a.i64, %a.i64 + %iv.next = add nuw i64 %iv, %stride + %ec = icmp eq i64 %iv.next, 2048 + br i1 %ec, label %exit, label %loop + +exit: + store %add.b.i64, ptr %dst + %g.i64.gep = getelementptr , ptr %dst, i64 1 + store %add.g.i64, ptr %g.i64.gep + %r.i64.gep = getelementptr , ptr %dst, i64 2 + store %add.r.i64, ptr %r.i64.gep + %a.i64.gep = getelementptr , ptr %dst, i64 3 + store %add.a.i64, ptr %a.i64.gep + ret void +} + +define void @uitofp_nxv8i16_to_nxv8f64_deinterleave_in_loop(ptr %src, ptr %dst, %mask) #0 { +; CHECK-LABEL: define void @uitofp_nxv8i16_to_nxv8f64_deinterleave_in_loop( +; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[VSCALE:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[STRIDE:%.*]] = shl nuw nsw i64 [[VSCALE]], 2 +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[ACC_B_F64:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[FADD_B_F64:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[ACC_G_F64:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[FADD_G_F64:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[ACC_R_F64:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[FADD_R_F64:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[ACC_A_F64:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[FADD_A_F64:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds nuw [4 x i16], ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[BGRA:%.*]] = call @llvm.masked.load.nxv8i16.p0(ptr [[SRC_GEP]], [[MASK]], zeroinitializer) +; CHECK-NEXT: [[TMP0:%.*]] = call @llvm.stepvector.nxv2i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw [[TMP0]], splat (i64 4) +; CHECK-NEXT: [[TMP2:%.*]] = add nuw [[TMP1]], splat (i64 -65536) +; CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +; 
CHECK-NEXT: [[TMP4:%.*]] = call @llvm.aarch64.sve.tbl.nxv8i16( [[BGRA]], [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +; CHECK-NEXT: [[TMP6:%.*]] = uitofp [[TMP5]] to +; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.stepvector.nxv2i64() +; CHECK-NEXT: [[TMP8:%.*]] = mul nuw [[TMP7]], splat (i64 4) +; CHECK-NEXT: [[TMP9:%.*]] = add nuw [[TMP8]], splat (i64 -65535) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +; CHECK-NEXT: [[TMP11:%.*]] = call @llvm.aarch64.sve.tbl.nxv8i16( [[BGRA]], [[TMP10]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast [[TMP11]] to +; CHECK-NEXT: [[TMP13:%.*]] = uitofp [[TMP12]] to +; CHECK-NEXT: [[TMP14:%.*]] = call @llvm.stepvector.nxv2i64() +; CHECK-NEXT: [[TMP15:%.*]] = mul nuw [[TMP14]], splat (i64 4) +; CHECK-NEXT: [[TMP16:%.*]] = add nuw [[TMP15]], splat (i64 -65534) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast [[TMP16]] to +; CHECK-NEXT: [[TMP18:%.*]] = call @llvm.aarch64.sve.tbl.nxv8i16( [[BGRA]], [[TMP17]]) +; CHECK-NEXT: [[TMP19:%.*]] = bitcast [[TMP18]] to +; CHECK-NEXT: [[TMP20:%.*]] = uitofp [[TMP19]] to +; CHECK-NEXT: [[TMP21:%.*]] = call @llvm.stepvector.nxv2i64() +; CHECK-NEXT: [[TMP22:%.*]] = mul nuw [[TMP21]], splat (i64 4) +; CHECK-NEXT: [[TMP23:%.*]] = add nuw [[TMP22]], splat (i64 -65533) +; CHECK-NEXT: [[TMP24:%.*]] = bitcast [[TMP23]] to +; CHECK-NEXT: [[TMP25:%.*]] = call @llvm.aarch64.sve.tbl.nxv8i16( [[BGRA]], [[TMP24]]) +; CHECK-NEXT: [[TMP26:%.*]] = bitcast [[TMP25]] to +; CHECK-NEXT: [[TMP27:%.*]] = uitofp [[TMP26]] to +; CHECK-NEXT: [[FADD_B_F64]] = fadd [[ACC_B_F64]], [[TMP6]] +; CHECK-NEXT: [[FADD_G_F64]] = fadd [[ACC_G_F64]], [[TMP13]] +; CHECK-NEXT: [[FADD_R_F64]] = fadd [[ACC_R_F64]], [[TMP20]] +; CHECK-NEXT: [[FADD_A_F64]] = fadd [[ACC_A_F64]], [[TMP27]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[STRIDE]] +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 2048 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[FADD_B_F64_LCSSA:%.*]] = phi [ 
[[FADD_B_F64]], %[[LOOP]] ] +; CHECK-NEXT: [[FADD_G_F64_LCSSA:%.*]] = phi [ [[FADD_G_F64]], %[[LOOP]] ] +; CHECK-NEXT: [[FADD_R_F64_LCSSA:%.*]] = phi [ [[FADD_R_F64]], %[[LOOP]] ] +; CHECK-NEXT: [[FADD_A_F64_LCSSA:%.*]] = phi [ [[FADD_A_F64]], %[[LOOP]] ] +; CHECK-NEXT: store [[FADD_B_F64_LCSSA]], ptr [[DST]], align 16 +; CHECK-NEXT: [[G_F64_GEP:%.*]] = getelementptr , ptr [[DST]], i64 1 +; CHECK-NEXT: store [[FADD_G_F64_LCSSA]], ptr [[G_F64_GEP]], align 16 +; CHECK-NEXT: [[R_F64_GEP:%.*]] = getelementptr , ptr [[DST]], i64 2 +; CHECK-NEXT: store [[FADD_R_F64_LCSSA]], ptr [[R_F64_GEP]], align 16 +; CHECK-NEXT: [[A_F64_GEP:%.*]] = getelementptr , ptr [[DST]], i64 3 +; CHECK-NEXT: store [[FADD_A_F64_LCSSA]], ptr [[A_F64_GEP]], align 16 +; CHECK-NEXT: ret void +; +entry: + %vscale = tail call i64 @llvm.vscale.i64() + %stride = shl nuw nsw i64 %vscale, 2 + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %acc.b.f64 = phi [ splat(double 0.000000e+00), %entry ], [ %fadd.b.f64, %loop ] + %acc.g.f64 = phi [ splat(double 0.000000e+00), %entry ], [ %fadd.g.f64, %loop ] + %acc.r.f64 = phi [ splat(double 0.000000e+00), %entry ], [ %fadd.r.f64, %loop ] + %acc.a.f64 = phi [ splat(double 0.000000e+00), %entry ], [ %fadd.a.f64, %loop ] + %src.gep = getelementptr inbounds nuw [4 x i16], ptr %src, i64 %iv + %bgra = call @llvm.masked.load(ptr %src.gep, %mask, zeroinitializer) + %deinterleave = tail call { , , , } @llvm.vector.deinterleave4( %bgra) + %b.i16 = extractvalue { , , , } %deinterleave, 0 + %g.i16 = extractvalue { , , , } %deinterleave, 1 + %r.i16 = extractvalue { , , , } %deinterleave, 2 + %a.i16 = extractvalue { , , , } %deinterleave, 3 + %b.f64 = uitofp %b.i16 to + %g.f64 = uitofp %g.i16 to + %r.f64 = uitofp %r.i16 to + %a.f64 = uitofp %a.i16 to + %fadd.b.f64 = fadd %acc.b.f64, %b.f64 + %fadd.g.f64 = fadd %acc.g.f64, %g.f64 + %fadd.r.f64 = fadd %acc.r.f64, %r.f64 + %fadd.a.f64 = fadd %acc.a.f64, %a.f64 + %iv.next = add nuw i64 %iv, %stride + 
%ec = icmp eq i64 %iv.next, 2048 + br i1 %ec, label %exit, label %loop + +exit: + store %fadd.b.f64, ptr %dst + %g.f64.gep = getelementptr , ptr %dst, i64 1 + store %fadd.g.f64, ptr %g.f64.gep + %r.f64.gep = getelementptr , ptr %dst, i64 2 + store %fadd.r.f64, ptr %r.f64.gep + %a.f64.gep = getelementptr , ptr %dst, i64 3 + store %fadd.a.f64, ptr %a.f64.gep + ret void +} + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-tbl-folding-opts.ll b/llvm/test/CodeGen/AArch64/sve-tbl-folding-opts.ll new file mode 100644 index 0000000000000..e101489c564c8 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-tbl-folding-opts.ll @@ -0,0 +1,642 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -O3 < %s | FileCheck %s + +target triple = "aarch64" + +define void @zext_nxv8i16_to_nxv8i64_deinterleave_in_loop(ptr %src, ptr %dst, %mask) #0 { +; CHECK-LABEL: zext_nxv8i16_to_nxv8i64_deinterleave_in_loop: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: index z7.d, #0, #4 +; CHECK-NEXT: mov z4.d, #0xffffffffffff0000 +; CHECK-NEXT: mov z5.d, #0xffffffffffff0001 +; CHECK-NEXT: mov x8, #-65534 // =0xffffffffffff0002 +; CHECK-NEXT: mov z24.d, #0xffffffffffff0003 +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: mov z6.d, x8 +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: mov w8, #2048 // =0x800 +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: add z4.d, z7.d, z4.d +; CHECK-NEXT: add z5.d, z7.d, z5.d +; CHECK-NEXT: rdvl x10, #1 +; CHECK-NEXT: add z6.d, z7.d, z6.d +; CHECK-NEXT: add z7.d, z7.d, z24.d +; CHECK-NEXT: .LBB0_1: // %loop +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ld1h { z24.h }, p0/z, [x0] +; CHECK-NEXT: subs x8, x8, x9 +; CHECK-NEXT: add x0, x0, x10 +; CHECK-NEXT: tbl z25.h, { z24.h }, z4.h +; CHECK-NEXT: tbl z26.h, { z24.h }, z5.h +; CHECK-NEXT: tbl z27.h, { 
z24.h }, z6.h +; CHECK-NEXT: tbl z24.h, { z24.h }, z7.h +; CHECK-NEXT: add z0.d, z0.d, z25.d +; CHECK-NEXT: add z1.d, z1.d, z26.d +; CHECK-NEXT: add z2.d, z2.d, z27.d +; CHECK-NEXT: add z3.d, z3.d, z24.d +; CHECK-NEXT: b.ne .LBB0_1 +; CHECK-NEXT: // %bb.2: // %exit +; CHECK-NEXT: str z0, [x1] +; CHECK-NEXT: str z1, [x1, #1, mul vl] +; CHECK-NEXT: str z2, [x1, #2, mul vl] +; CHECK-NEXT: str z3, [x1, #3, mul vl] +; CHECK-NEXT: ret +entry: + %vscale = tail call i64 @llvm.vscale.i64() + %stride = shl nuw nsw i64 %vscale, 1 + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %acc.b.i64 = phi [ splat(i64 0), %entry ], [ %add.b.i64, %loop ] + %acc.g.i64 = phi [ splat(i64 0), %entry ], [ %add.g.i64, %loop ] + %acc.r.i64 = phi [ splat(i64 0), %entry ], [ %add.r.i64, %loop ] + %acc.a.i64 = phi [ splat(i64 0), %entry ], [ %add.a.i64, %loop ] + %src.gep = getelementptr inbounds nuw [4 x i16], ptr %src, i64 %iv + %bgra = call @llvm.masked.load(ptr %src.gep, %mask, zeroinitializer) + %deinterleave = tail call { , , , } @llvm.vector.deinterleave4( %bgra) + %b.i16 = extractvalue { , , , } %deinterleave, 0 + %g.i16 = extractvalue { , , , } %deinterleave, 1 + %r.i16 = extractvalue { , , , } %deinterleave, 2 + %a.i16 = extractvalue { , , , } %deinterleave, 3 + %b.i64 = zext %b.i16 to + %g.i64 = zext %g.i16 to + %r.i64 = zext %r.i16 to + %a.i64 = zext %a.i16 to + %add.b.i64 = add %acc.b.i64, %b.i64 + %add.g.i64 = add %acc.g.i64, %g.i64 + %add.r.i64 = add %acc.r.i64, %r.i64 + %add.a.i64 = add %acc.a.i64, %a.i64 + %iv.next = add nuw i64 %iv, %stride + %ec = icmp eq i64 %iv.next, 2048 + br i1 %ec, label %exit, label %loop + +exit: + store %add.b.i64, ptr %dst + %g.i64.gep = getelementptr , ptr %dst, i64 1 + store %add.g.i64, ptr %g.i64.gep + %r.i64.gep = getelementptr , ptr %dst, i64 2 + store %add.r.i64, ptr %r.i64.gep + %a.i64.gep = getelementptr , ptr %dst, i64 3 + store %add.a.i64, ptr %a.i64.gep + ret void +} + +;; TODO: Do we want to perform the sext 
equivalent? Requires a splat of the +;; sign bits into another register (using asr) and a more complex tbl +;; mask to choose; more instructions, but may still be worthwhile if +;; we find cases in real code. +define void @sext_nxv8i16_to_nxv8i64_deinterleave_in_loop(ptr %src, ptr %dst, %mask) #0 { +; CHECK-LABEL: sext_nxv8i16_to_nxv8i64_deinterleave_in_loop: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: mov w8, #2048 // =0x800 +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: rdvl x10, #1 +; CHECK-NEXT: .LBB1_1: // %loop +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ld1h { z4.h }, p0/z, [x0] +; CHECK-NEXT: subs x8, x8, x9 +; CHECK-NEXT: add x0, x0, x10 +; CHECK-NEXT: uunpkhi z5.s, z4.h +; CHECK-NEXT: uunpklo z4.s, z4.h +; CHECK-NEXT: uunpkhi z6.d, z5.s +; CHECK-NEXT: uunpklo z5.d, z5.s +; CHECK-NEXT: uunpkhi z7.d, z4.s +; CHECK-NEXT: uunpklo z4.d, z4.s +; CHECK-NEXT: uzp1 z24.d, z5.d, z6.d +; CHECK-NEXT: uzp2 z5.d, z5.d, z6.d +; CHECK-NEXT: uzp1 z25.d, z4.d, z7.d +; CHECK-NEXT: uzp2 z4.d, z4.d, z7.d +; CHECK-NEXT: uzp1 z6.d, z25.d, z24.d +; CHECK-NEXT: uzp2 z7.d, z4.d, z5.d +; CHECK-NEXT: uzp2 z24.d, z25.d, z24.d +; CHECK-NEXT: uzp1 z4.d, z4.d, z5.d +; CHECK-NEXT: sxth z6.d, p1/m, z6.d +; CHECK-NEXT: sxth z24.d, p1/m, z24.d +; CHECK-NEXT: sxth z7.d, p1/m, z7.d +; CHECK-NEXT: sxth z4.d, p1/m, z4.d +; CHECK-NEXT: add z0.d, z0.d, z6.d +; CHECK-NEXT: add z2.d, z2.d, z24.d +; CHECK-NEXT: add z3.d, z3.d, z7.d +; CHECK-NEXT: add z1.d, z1.d, z4.d +; CHECK-NEXT: b.ne .LBB1_1 +; CHECK-NEXT: // %bb.2: // %exit +; CHECK-NEXT: str z0, [x1] +; CHECK-NEXT: str z1, [x1, #1, mul vl] +; CHECK-NEXT: str z2, [x1, #2, mul vl] +; CHECK-NEXT: str z3, [x1, #3, mul vl] +; CHECK-NEXT: ret +entry: + %vscale = tail call i64 @llvm.vscale.i64() + %stride = shl nuw nsw i64 %vscale, 1 
+ br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %acc.b.i64 = phi [ splat(i64 0), %entry ], [ %add.b.i64, %loop ] + %acc.g.i64 = phi [ splat(i64 0), %entry ], [ %add.g.i64, %loop ] + %acc.r.i64 = phi [ splat(i64 0), %entry ], [ %add.r.i64, %loop ] + %acc.a.i64 = phi [ splat(i64 0), %entry ], [ %add.a.i64, %loop ] + %src.gep = getelementptr inbounds nuw [4 x i16], ptr %src, i64 %iv + %bgra = call @llvm.masked.load(ptr %src.gep, %mask, zeroinitializer) + %deinterleave = tail call { , , , } @llvm.vector.deinterleave4( %bgra) + %b.i16 = extractvalue { , , , } %deinterleave, 0 + %g.i16 = extractvalue { , , , } %deinterleave, 1 + %r.i16 = extractvalue { , , , } %deinterleave, 2 + %a.i16 = extractvalue { , , , } %deinterleave, 3 + %b.i64 = sext %b.i16 to + %g.i64 = sext %g.i16 to + %r.i64 = sext %r.i16 to + %a.i64 = sext %a.i16 to + %add.b.i64 = add %acc.b.i64, %b.i64 + %add.g.i64 = add %acc.g.i64, %g.i64 + %add.r.i64 = add %acc.r.i64, %r.i64 + %add.a.i64 = add %acc.a.i64, %a.i64 + %iv.next = add nuw i64 %iv, %stride + %ec = icmp eq i64 %iv.next, 2048 + br i1 %ec, label %exit, label %loop + +exit: + store %add.b.i64, ptr %dst + %g.i64.gep = getelementptr , ptr %dst, i64 1 + store %add.g.i64, ptr %g.i64.gep + %r.i64.gep = getelementptr , ptr %dst, i64 2 + store %add.r.i64, ptr %r.i64.gep + %a.i64.gep = getelementptr , ptr %dst, i64 3 + store %add.a.i64, ptr %a.i64.gep + ret void +} + +;; Check that we reuse tbl masks for the same shuffle type. +define void @zext_2x_nxv8i16_to_nxv8i64_deinterleave_in_loop(ptr %src, ptr %dst, %mask) #0 { +; CHECK-LABEL: zext_2x_nxv8i16_to_nxv8i64_deinterleave_in_loop: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: str z9, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x40, 0x1c // $d8 @ cfa - 8 * VG - 16 +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x40, 0x1c // $d9 @ cfa - 16 * VG - 16 +; CHECK-NEXT: index z6.d, #0, #4 +; CHECK-NEXT: mov z1.d, #0xffffffffffff0000 +; CHECK-NEXT: mov z2.d, #0xffffffffffff0001 +; CHECK-NEXT: mov x8, #-65534 // =0xffffffffffff0002 +; CHECK-NEXT: mov z24.d, #0xffffffffffff0003 +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: mov z4.d, x8 +; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: mov w8, #2048 // =0x800 +; CHECK-NEXT: movi v5.2d, #0000000000000000 +; CHECK-NEXT: movi v7.2d, #0000000000000000 +; CHECK-NEXT: cntw x9 +; CHECK-NEXT: add z1.d, z6.d, z1.d +; CHECK-NEXT: add z2.d, z6.d, z2.d +; CHECK-NEXT: rdvl x10, #2 +; CHECK-NEXT: add z4.d, z6.d, z4.d +; CHECK-NEXT: add z6.d, z6.d, z24.d +; CHECK-NEXT: .LBB2_1: // %loop +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ld1h { z24.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z25.h }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: subs x8, x8, x9 +; CHECK-NEXT: add x0, x0, x10 +; CHECK-NEXT: tbl z26.h, { z24.h }, z1.h +; CHECK-NEXT: tbl z27.h, { z24.h }, z2.h +; CHECK-NEXT: tbl z28.h, { z24.h }, z4.h +; CHECK-NEXT: tbl z29.h, { z24.h }, z6.h +; CHECK-NEXT: tbl z30.h, { z25.h }, z1.h +; CHECK-NEXT: tbl z31.h, { z25.h }, z2.h +; CHECK-NEXT: tbl z8.h, { z25.h }, z4.h +; CHECK-NEXT: tbl z9.h, { z25.h }, z6.h +; CHECK-NEXT: add z24.d, z0.d, z26.d +; CHECK-NEXT: add z25.d, z3.d, z27.d +; CHECK-NEXT: add z26.d, z5.d, z28.d +; CHECK-NEXT: add z27.d, z7.d, z29.d +; CHECK-NEXT: add z0.d, z24.d, z30.d 
+; CHECK-NEXT: add z3.d, z25.d, z31.d +; CHECK-NEXT: add z5.d, z26.d, z8.d +; CHECK-NEXT: add z7.d, z27.d, z9.d +; CHECK-NEXT: b.ne .LBB2_1 +; CHECK-NEXT: // %bb.2: // %exit +; CHECK-NEXT: ldr z9, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z24, [x1] +; CHECK-NEXT: str z25, [x1, #1, mul vl] +; CHECK-NEXT: str z26, [x1, #2, mul vl] +; CHECK-NEXT: str z27, [x1, #3, mul vl] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vscale = tail call i64 @llvm.vscale.i64() + %stride = shl nuw nsw i64 %vscale, 2 + %midpoint = shl nuw nsw i64 %vscale, 1 + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %acc.b.i64 = phi [ splat(i64 0), %entry ], [ %add.b.i64.2, %loop ] + %acc.g.i64 = phi [ splat(i64 0), %entry ], [ %add.g.i64.2, %loop ] + %acc.r.i64 = phi [ splat(i64 0), %entry ], [ %add.r.i64.2, %loop ] + %acc.a.i64 = phi [ splat(i64 0), %entry ], [ %add.a.i64.2, %loop ] + %src.gep = getelementptr inbounds nuw [4 x i16], ptr %src, i64 %iv + %bgra = call @llvm.masked.load(ptr %src.gep, %mask, zeroinitializer) + %deinterleave = tail call { , , , } @llvm.vector.deinterleave4( %bgra) + %b.i16 = extractvalue { , , , } %deinterleave, 0 + %g.i16 = extractvalue { , , , } %deinterleave, 1 + %r.i16 = extractvalue { , , , } %deinterleave, 2 + %a.i16 = extractvalue { , , , } %deinterleave, 3 + %b.i64 = zext %b.i16 to + %g.i64 = zext %g.i16 to + %r.i64 = zext %r.i16 to + %a.i64 = zext %a.i16 to + %add.b.i64 = add %acc.b.i64, %b.i64 + %add.g.i64 = add %acc.g.i64, %g.i64 + %add.r.i64 = add %acc.r.i64, %r.i64 + %add.a.i64 = add %acc.a.i64, %a.i64 + %src.gep.2 = getelementptr inbounds nuw [4 x i16], ptr %src.gep, i64 %midpoint + %bgra.2 = call @llvm.masked.load(ptr %src.gep.2, %mask, zeroinitializer) + %deinterleave.2 = tail call { , , , } @llvm.vector.deinterleave4( %bgra.2) + %b.i16.2 = extractvalue { , , , } %deinterleave.2, 0 
+ %g.i16.2 = extractvalue { , , , } %deinterleave.2, 1 + %r.i16.2 = extractvalue { , , , } %deinterleave.2, 2 + %a.i16.2 = extractvalue { , , , } %deinterleave.2, 3 + %b.i64.2 = zext %b.i16.2 to + %g.i64.2 = zext %g.i16.2 to + %r.i64.2 = zext %r.i16.2 to + %a.i64.2 = zext %a.i16.2 to + %add.b.i64.2 = add %add.b.i64, %b.i64.2 + %add.g.i64.2 = add %add.g.i64, %g.i64.2 + %add.r.i64.2 = add %add.r.i64, %r.i64.2 + %add.a.i64.2 = add %add.a.i64, %a.i64.2 + %iv.next = add nuw i64 %iv, %stride + %ec = icmp eq i64 %iv.next, 2048 + br i1 %ec, label %exit, label %loop + +exit: + store %add.b.i64, ptr %dst + %g.i64.gep = getelementptr , ptr %dst, i64 1 + store %add.g.i64, ptr %g.i64.gep + %r.i64.gep = getelementptr , ptr %dst, i64 2 + store %add.r.i64, ptr %r.i64.gep + %a.i64.gep = getelementptr , ptr %dst, i64 3 + store %add.a.i64, ptr %a.i64.gep + ret void +} + +define void @uitofp_nxv8i16_to_nxv8f64_deinterleave_in_loop(ptr %src, ptr %dst, %mask) #0 { +; CHECK-LABEL: uitofp_nxv8i16_to_nxv8f64_deinterleave_in_loop: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: index z7.d, #0, #4 +; CHECK-NEXT: mov z4.d, #0xffffffffffff0000 +; CHECK-NEXT: mov z5.d, #0xffffffffffff0001 +; CHECK-NEXT: mov x8, #-65534 // =0xffffffffffff0002 +; CHECK-NEXT: mov z24.d, #0xffffffffffff0003 +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: mov z6.d, x8 +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: mov w8, #2048 // =0x800 +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: cntw x9 +; CHECK-NEXT: add z4.d, z7.d, z4.d +; CHECK-NEXT: add z5.d, z7.d, z5.d +; CHECK-NEXT: rdvl x10, #2 +; CHECK-NEXT: add z6.d, z7.d, z6.d +; CHECK-NEXT: add z7.d, z7.d, z24.d +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: .LBB3_1: // %loop +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ld1h { z24.h }, p0/z, [x0] +; CHECK-NEXT: subs x8, x8, x9 +; CHECK-NEXT: add x0, x0, x10 +; CHECK-NEXT: tbl z25.h, { z24.h }, z4.h +; CHECK-NEXT: tbl 
z26.h, { z24.h }, z5.h +; CHECK-NEXT: tbl z27.h, { z24.h }, z6.h +; CHECK-NEXT: tbl z24.h, { z24.h }, z7.h +; CHECK-NEXT: ucvtf z25.d, p1/m, z25.d +; CHECK-NEXT: ucvtf z26.d, p1/m, z26.d +; CHECK-NEXT: ucvtf z27.d, p1/m, z27.d +; CHECK-NEXT: ucvtf z24.d, p1/m, z24.d +; CHECK-NEXT: fadd z0.d, z0.d, z25.d +; CHECK-NEXT: fadd z1.d, z1.d, z26.d +; CHECK-NEXT: fadd z2.d, z2.d, z27.d +; CHECK-NEXT: fadd z3.d, z3.d, z24.d +; CHECK-NEXT: b.ne .LBB3_1 +; CHECK-NEXT: // %bb.2: // %exit +; CHECK-NEXT: str z0, [x1] +; CHECK-NEXT: str z1, [x1, #1, mul vl] +; CHECK-NEXT: str z2, [x1, #2, mul vl] +; CHECK-NEXT: str z3, [x1, #3, mul vl] +; CHECK-NEXT: ret +entry: + %vscale = tail call i64 @llvm.vscale.i64() + %stride = shl nuw nsw i64 %vscale, 2 + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %acc.b.f64 = phi [ splat(double 0.000000e+00), %entry ], [ %fadd.b.f64, %loop ] + %acc.g.f64 = phi [ splat(double 0.000000e+00), %entry ], [ %fadd.g.f64, %loop ] + %acc.r.f64 = phi [ splat(double 0.000000e+00), %entry ], [ %fadd.r.f64, %loop ] + %acc.a.f64 = phi [ splat(double 0.000000e+00), %entry ], [ %fadd.a.f64, %loop ] + %src.gep = getelementptr inbounds nuw [4 x i16], ptr %src, i64 %iv + %bgra = call @llvm.masked.load(ptr %src.gep, %mask, zeroinitializer) + %deinterleave = tail call { , , , } @llvm.vector.deinterleave4( %bgra) + %b.i16 = extractvalue { , , , } %deinterleave, 0 + %g.i16 = extractvalue { , , , } %deinterleave, 1 + %r.i16 = extractvalue { , , , } %deinterleave, 2 + %a.i16 = extractvalue { , , , } %deinterleave, 3 + %b.f64 = uitofp %b.i16 to + %g.f64 = uitofp %g.i16 to + %r.f64 = uitofp %r.i16 to + %a.f64 = uitofp %a.i16 to + %fadd.b.f64 = fadd %acc.b.f64, %b.f64 + %fadd.g.f64 = fadd %acc.g.f64, %g.f64 + %fadd.r.f64 = fadd %acc.r.f64, %r.f64 + %fadd.a.f64 = fadd %acc.a.f64, %a.f64 + %iv.next = add nuw i64 %iv, %stride + %ec = icmp eq i64 %iv.next, 2048 + br i1 %ec, label %exit, label %loop + +exit: + store %fadd.b.f64, ptr %dst + %g.f64.gep = 
getelementptr , ptr %dst, i64 1 + store %fadd.g.f64, ptr %g.f64.gep + %r.f64.gep = getelementptr , ptr %dst, i64 2 + store %fadd.r.f64, ptr %r.f64.gep + %a.f64.gep = getelementptr , ptr %dst, i64 3 + store %fadd.a.f64, ptr %a.f64.gep + ret void +} + +define void @zext_nxv16i8_to_nxv16i32_deinterleave_in_loop(ptr %src, ptr %dst, %mask) #0 { +; CHECK-LABEL: zext_nxv16i8_to_nxv16i32_deinterleave_in_loop: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: mov w8, #2048 // =0x800 +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cnth x10 +; CHECK-NEXT: .LBB4_1: // %loop +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ld1b { z4.b }, p0/z, [x0] +; CHECK-NEXT: subs x8, x8, x9 +; CHECK-NEXT: add x0, x0, x10 +; CHECK-NEXT: uunpkhi z5.h, z4.b +; CHECK-NEXT: uunpklo z4.h, z4.b +; CHECK-NEXT: uunpkhi z6.s, z5.h +; CHECK-NEXT: uunpklo z5.s, z5.h +; CHECK-NEXT: uunpkhi z7.s, z4.h +; CHECK-NEXT: uunpklo z4.s, z4.h +; CHECK-NEXT: uzp1 z24.s, z5.s, z6.s +; CHECK-NEXT: uzp2 z5.s, z5.s, z6.s +; CHECK-NEXT: uzp1 z25.s, z4.s, z7.s +; CHECK-NEXT: uzp2 z4.s, z4.s, z7.s +; CHECK-NEXT: uzp1 z6.s, z25.s, z24.s +; CHECK-NEXT: uzp2 z7.s, z4.s, z5.s +; CHECK-NEXT: uzp2 z24.s, z25.s, z24.s +; CHECK-NEXT: uzp1 z4.s, z4.s, z5.s +; CHECK-NEXT: and z6.s, z6.s, #0xff +; CHECK-NEXT: and z24.s, z24.s, #0xff +; CHECK-NEXT: and z7.s, z7.s, #0xff +; CHECK-NEXT: and z4.s, z4.s, #0xff +; CHECK-NEXT: add z0.s, z0.s, z6.s +; CHECK-NEXT: add z2.s, z2.s, z24.s +; CHECK-NEXT: add z3.s, z3.s, z7.s +; CHECK-NEXT: add z1.s, z1.s, z4.s +; CHECK-NEXT: b.ne .LBB4_1 +; CHECK-NEXT: // %bb.2: // %exit +; CHECK-NEXT: str z0, [x1] +; CHECK-NEXT: str z1, [x1, #1, mul vl] +; CHECK-NEXT: str z2, [x1, #2, mul vl] +; CHECK-NEXT: str z3, [x1, #3, mul vl] +; CHECK-NEXT: ret +entry: + %vscale = tail call i64 @llvm.vscale.i64() + %stride = shl nuw nsw 
i64 %vscale, 1 + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %acc.b.i64 = phi [ splat(i32 0), %entry ], [ %add.b.i64, %loop ] + %acc.g.i64 = phi [ splat(i32 0), %entry ], [ %add.g.i64, %loop ] + %acc.r.i64 = phi [ splat(i32 0), %entry ], [ %add.r.i64, %loop ] + %acc.a.i64 = phi [ splat(i32 0), %entry ], [ %add.a.i64, %loop ] + %src.gep = getelementptr inbounds nuw [4 x i8], ptr %src, i64 %iv + %bgra = call @llvm.masked.load(ptr %src.gep, %mask, zeroinitializer) + %deinterleave = tail call { , , , } @llvm.vector.deinterleave4( %bgra) + %b.i16 = extractvalue { , , , } %deinterleave, 0 + %g.i16 = extractvalue { , , , } %deinterleave, 1 + %r.i16 = extractvalue { , , , } %deinterleave, 2 + %a.i16 = extractvalue { , , , } %deinterleave, 3 + %b.i64 = zext %b.i16 to + %g.i64 = zext %g.i16 to + %r.i64 = zext %r.i16 to + %a.i64 = zext %a.i16 to + %add.b.i64 = add %acc.b.i64, %b.i64 + %add.g.i64 = add %acc.g.i64, %g.i64 + %add.r.i64 = add %acc.r.i64, %r.i64 + %add.a.i64 = add %acc.a.i64, %a.i64 + %iv.next = add nuw i64 %iv, %stride + %ec = icmp eq i64 %iv.next, 2048 + br i1 %ec, label %exit, label %loop + +exit: + store %add.b.i64, ptr %dst + %g.i64.gep = getelementptr , ptr %dst, i64 1 + store %add.g.i64, ptr %g.i64.gep + %r.i64.gep = getelementptr , ptr %dst, i64 2 + store %add.r.i64, ptr %r.i64.gep + %a.i64.gep = getelementptr , ptr %dst, i64 3 + store %add.a.i64, ptr %a.i64.gep + ret void +} + +;; If we know vscale is small enough, then we can do extensions via tbl even for +;; 8b elements. 
+define void @zext_nxv16i8_to_nxv16i32_deinterleave_in_loop_max_vscale_8(ptr %src, ptr %dst, %mask) #1 { +; CHECK-LABEL: zext_nxv16i8_to_nxv16i32_deinterleave_in_loop_max_vscale_8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: index z7.s, #0, #4 +; CHECK-NEXT: mov w9, #-254 // =0xffffff02 +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: mov z6.s, w9 +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: cnth x9 +; CHECK-NEXT: movprfx z4, z7 +; CHECK-NEXT: orr z4.s, z4.s, #0xffffff00 +; CHECK-NEXT: movprfx z5, z7 +; CHECK-NEXT: orr z5.s, z5.s, #0xffffff01 +; CHECK-NEXT: orr z6.d, z7.d, z6.d +; CHECK-NEXT: orr z7.s, z7.s, #0xffffff03 +; CHECK-NEXT: .LBB5_1: // %loop +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ld1b { z24.b }, p0/z, [x0, x8] +; CHECK-NEXT: add x8, x8, x9 +; CHECK-NEXT: cmp x8, #2, lsl #12 // =8192 +; CHECK-NEXT: tbl z25.b, { z24.b }, z4.b +; CHECK-NEXT: tbl z26.b, { z24.b }, z5.b +; CHECK-NEXT: tbl z27.b, { z24.b }, z6.b +; CHECK-NEXT: tbl z24.b, { z24.b }, z7.b +; CHECK-NEXT: add z0.s, z0.s, z25.s +; CHECK-NEXT: add z1.s, z1.s, z26.s +; CHECK-NEXT: add z2.s, z2.s, z27.s +; CHECK-NEXT: add z3.s, z3.s, z24.s +; CHECK-NEXT: b.ne .LBB5_1 +; CHECK-NEXT: // %bb.2: // %exit +; CHECK-NEXT: str z0, [x1] +; CHECK-NEXT: str z1, [x1, #1, mul vl] +; CHECK-NEXT: str z2, [x1, #2, mul vl] +; CHECK-NEXT: str z3, [x1, #3, mul vl] +; CHECK-NEXT: ret +entry: + %vscale = tail call i64 @llvm.vscale.i64() + %stride = shl nuw nsw i64 %vscale, 1 + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %acc.b.i64 = phi [ splat(i32 0), %entry ], [ %add.b.i64, %loop ] + %acc.g.i64 = phi [ splat(i32 0), %entry ], [ %add.g.i64, %loop ] + %acc.r.i64 = phi [ splat(i32 0), %entry ], [ %add.r.i64, %loop ] + %acc.a.i64 = phi [ splat(i32 0), %entry ], [ %add.a.i64, %loop ] + %src.gep = getelementptr inbounds nuw 
[4 x i8], ptr %src, i64 %iv + %bgra = call @llvm.masked.load(ptr %src.gep, %mask, zeroinitializer) + %deinterleave = tail call { , , , } @llvm.vector.deinterleave4( %bgra) + %b.i16 = extractvalue { , , , } %deinterleave, 0 + %g.i16 = extractvalue { , , , } %deinterleave, 1 + %r.i16 = extractvalue { , , , } %deinterleave, 2 + %a.i16 = extractvalue { , , , } %deinterleave, 3 + %b.i64 = zext %b.i16 to + %g.i64 = zext %g.i16 to + %r.i64 = zext %r.i16 to + %a.i64 = zext %a.i16 to + %add.b.i64 = add %acc.b.i64, %b.i64 + %add.g.i64 = add %acc.g.i64, %g.i64 + %add.r.i64 = add %acc.r.i64, %r.i64 + %add.a.i64 = add %acc.a.i64, %a.i64 + %iv.next = add nuw i64 %iv, %stride + %ec = icmp eq i64 %iv.next, 2048 + br i1 %ec, label %exit, label %loop + +exit: + store %add.b.i64, ptr %dst + %g.i64.gep = getelementptr , ptr %dst, i64 1 + store %add.g.i64, ptr %g.i64.gep + %r.i64.gep = getelementptr , ptr %dst, i64 2 + store %add.r.i64, ptr %r.i64.gep + %a.i64.gep = getelementptr , ptr %dst, i64 3 + store %add.a.i64, ptr %a.i64.gep + ret void +} + +;; If not in a loop, don't perform the transform, since the setup costs more +;; than the unpacks. 
+define void @zext_nxv8i16_to_nxv8i64_out_of_loop(ptr %src, ptr %dst, %mask) #0 { +; CHECK-LABEL: zext_nxv8i16_to_nxv8i64_out_of_loop: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: uunpkhi z1.s, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uunpkhi z2.d, z1.s +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: uunpkhi z3.d, z0.s +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: uzp1 z4.d, z1.d, z2.d +; CHECK-NEXT: uzp2 z1.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z5.d, z0.d, z3.d +; CHECK-NEXT: uzp2 z0.d, z0.d, z3.d +; CHECK-NEXT: uzp1 z2.d, z5.d, z4.d +; CHECK-NEXT: uzp1 z3.d, z0.d, z1.d +; CHECK-NEXT: uzp2 z4.d, z5.d, z4.d +; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d +; CHECK-NEXT: and z2.d, z2.d, #0xffff +; CHECK-NEXT: and z3.d, z3.d, #0xffff +; CHECK-NEXT: and z4.d, z4.d, #0xffff +; CHECK-NEXT: and z0.d, z0.d, #0xffff +; CHECK-NEXT: str z2, [x1] +; CHECK-NEXT: str z3, [x1, #1, mul vl] +; CHECK-NEXT: str z4, [x1, #2, mul vl] +; CHECK-NEXT: str z0, [x1, #3, mul vl] +; CHECK-NEXT: ret +entry: + %bgra = call @llvm.masked.load(ptr %src, %mask, zeroinitializer) + %deinterleave = tail call { , , , } @llvm.vector.deinterleave4( %bgra) + %b.i16 = extractvalue { , , , } %deinterleave, 0 + %g.i16 = extractvalue { , , , } %deinterleave, 1 + %r.i16 = extractvalue { , , , } %deinterleave, 2 + %a.i16 = extractvalue { , , , } %deinterleave, 3 + %b.i64 = zext %b.i16 to + %g.i64 = zext %g.i16 to + %r.i64 = zext %r.i16 to + %a.i64 = zext %a.i16 to + store %b.i64, ptr %dst + %g.i64.gep = getelementptr , ptr %dst, i64 1 + store %g.i64, ptr %g.i64.gep + %r.i64.gep = getelementptr , ptr %dst, i64 2 + store %r.i64, ptr %r.i64.gep + %a.i64.gep = getelementptr , ptr %dst, i64 3 + store %a.i64, ptr %a.i64.gep + ret void +} + +define void @zext_nxv8i16_to_nxv8i64_deinterleave_single_lane_used(ptr %src, ptr %dst, %mask) #0 { +; CHECK-LABEL: zext_nxv8i16_to_nxv8i64_deinterleave_single_lane_used: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x8, #-65536 // 
=0xffffffffffff0000 +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: index z1.d, x8, #4 +; CHECK-NEXT: mov w8, #2048 // =0x800 +; CHECK-NEXT: rdvl x10, #1 +; CHECK-NEXT: .LBB7_1: // %loop +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0] +; CHECK-NEXT: subs x8, x8, x9 +; CHECK-NEXT: add x0, x0, x10 +; CHECK-NEXT: tbl z2.h, { z2.h }, z1.h +; CHECK-NEXT: add z0.d, z0.d, z2.d +; CHECK-NEXT: b.ne .LBB7_1 +; CHECK-NEXT: // %bb.2: // %exit +; CHECK-NEXT: str z0, [x1] +; CHECK-NEXT: ret +entry: + %vscale = tail call i64 @llvm.vscale.i64() + %stride = shl nuw nsw i64 %vscale, 1 + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %acc.b.i64 = phi [ splat(i64 0), %entry ], [ %add.b.i64, %loop ] + %src.gep = getelementptr inbounds nuw [4 x i16], ptr %src, i64 %iv + %bgra = call @llvm.masked.load(ptr %src.gep, %mask, zeroinitializer) + %deinterleave = tail call { , , , } @llvm.vector.deinterleave4( %bgra) + %b.i16 = extractvalue { , , , } %deinterleave, 0 + %b.i64 = zext %b.i16 to + %add.b.i64 = add %acc.b.i64, %b.i64 + %iv.next = add nuw i64 %iv, %stride + %ec = icmp eq i64 %iv.next, 2048 + br i1 %ec, label %exit, label %loop + +exit: + store %add.b.i64, ptr %dst + ret void +} + +attributes #0 = { "target-features"="+sve" } +attributes #1 = { "target-features"="+sve" vscale_range(1, 8) } From 80c80e6d04515acf7dac8c256a1a3b3dc14cfa4d Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Fri, 19 Jun 2026 14:20:36 +0200 Subject: [PATCH 027/149] [clang][bytecode] Check const writes more thoroughly (#204529) We used to only have a list of blocks under construction, but now we have a list of pointers, which gives us more information. Use this new list to diagnose a case we couldn't previously diagnose. The test case is from `constant-expression-cxx14.cpp` and shows that a write to a const member is invalid, even if the parent object is being constructed right now. 
--- clang/lib/AST/ByteCode/Interp.cpp | 45 +++++++++++++++++++++++++++---- clang/test/AST/ByteCode/cxx20.cpp | 34 +++++++++++++++++++++++ 2 files changed, 74 insertions(+), 5 deletions(-) diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp index e5bf9c0c590ac..60914a2da111a 100644 --- a/clang/lib/AST/ByteCode/Interp.cpp +++ b/clang/lib/AST/ByteCode/Interp.cpp @@ -587,11 +587,44 @@ bool CheckConst(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { // The This pointer is writable in constructors and destructors, // even if isConst() returns true. - if (S.initializingBlock(Ptr.block())) - return true; + for (PtrView V : llvm::reverse(S.InitializingPtrs)) { + if (V.block() != Ptr.block()) + continue; + if (!V.getFieldDesc()->IsConst) { + // If the pointer being initialized is not declared as const, + // Ptr is const because of a parent of V, but that is irrelevant + // since V is being initialized and NOT const. + // This is fine, so return true. + return true; + } + + // We know that Ptr is const because of a parent field and we also + // know that V is explicitly marked const. + // But since V is in InitializingPtrs, the fact that it is const doesn't + // matter and it is writable. + // What we now need to check is whether there is a pointer between Ptr and V + // that is marked const but NOT in InitializingPtrs. If that is the case, + // Ptr is currently not writable. + bool FoundProblem = false; + for (PtrView P = Ptr.view(); P != V; P = P.getBase()) { + if (P.getFieldDesc()->IsConst) { + FoundProblem = true; + break; + } + } + + // We couldn't find any pointer that's explicitly marked const, so + // Ptr is writable right now. + if (!FoundProblem) + return true; + // We only need to find the right block once. 
+ break; + } if (!S.checkingPotentialConstantExpression()) { - const QualType Ty = Ptr.getType(); + QualType Ty = Ptr.getType(); + if (!Ptr.getFieldDesc()->IsConst) + Ty.addConst(); const SourceInfo &Loc = S.Current->getSource(OpPC); S.FFDiag(Loc, diag::note_constexpr_modify_const_type) << Ty; } @@ -1803,6 +1836,7 @@ bool Call(InterpState &S, CodePtr OpPC, const Function *Func, return false; }; + bool InstancePtrTracked = false; if (Func->hasThisPointer()) { size_t ArgSize = Func->getArgSize() + VarArgSize; size_t ThisOffset = ArgSize - (Func->hasRVO() ? primSize(PT_Ptr) : 0); @@ -1845,7 +1879,8 @@ bool Call(InterpState &S, CodePtr OpPC, const Function *Func, if (Func->isDestructor() && !CheckDestructor(S, OpPC, ThisPtr)) return false; - if (Func->isConstructor() || Func->isDestructor()) + InstancePtrTracked = (Func->isConstructor() || Func->isDestructor()); + if (InstancePtrTracked) S.InitializingPtrs.push_back(ThisPtr.view()); } @@ -1872,7 +1907,7 @@ bool Call(InterpState &S, CodePtr OpPC, const Function *Func, InterpStateCCOverride CCOverride(S, Func->isImmediate()); bool Success = Interpret(S); // Remove initializing block again. 
- if (Func->isConstructor() || Func->isDestructor()) + if (InstancePtrTracked) S.InitializingPtrs.pop_back(); if (!Success) { diff --git a/clang/test/AST/ByteCode/cxx20.cpp b/clang/test/AST/ByteCode/cxx20.cpp index 625e65c769133..a6409d4a2c268 100644 --- a/clang/test/AST/ByteCode/cxx20.cpp +++ b/clang/test/AST/ByteCode/cxx20.cpp @@ -1423,3 +1423,37 @@ namespace FuncPtrRef { } static_assert(bullet_five_tests()); } + +namespace ConstWrites { + struct basic_string { + unsigned char a; + constexpr basic_string() { + a = false; + } + }; + struct array { + basic_string str; + }; + + constexpr bool tests() { + const array right{}; + return true; + } + static_assert(tests()); + + struct A { + int n; + constexpr A() : n(1) { n = 2; } + }; + struct B { + const A a; + constexpr B(bool mutate) { + if (mutate) + const_cast(a).n = 3; // both-note {{modification of object of const-qualified type 'const int'}} + } + }; + constexpr B b(false); + static_assert(b.a.n == 2, ""); + constexpr B bad(true); // both-error {{must be initialized by a constant expression}} \ + // both-note {{in call to 'B(true)'}} +} From a6fe3c7422db83346c336346e0b7a56ff084b17f Mon Sep 17 00:00:00 2001 From: Xavier Roche Date: Fri, 19 Jun 2026 14:30:32 +0200 Subject: [PATCH 028/149] [libc++][test] Migrate _BitInt probe to __BITINT_MAXWIDTH__ and fix latent test bugs (#203876) `libcxx` tests gate `_BitInt` blocks on `TEST_HAS_EXTENSION(bit_int)`, which is not a recognized Clang extension and returns 0 in every language mode. The blocks have been compiling as dead code, hiding latent bugs across 23 files. Migrate to a `TEST_HAS_BITINT` helper backed by the standard `__BITINT_MAXWIDTH__`. 
The latent bugs the activation surfaces are fixed in the same commit: - overflow-safe `min`; - post-P4052R0 saturating-arithmetic renames plus a `clang-21`/`apple-clang-21` skip for `saturating.bitint.pass.cpp` (Clang 21 asserts in constexpr eval on non-byte-aligned `_BitInt`); - an `intcmp` syntax fix; - `byteswap.verify` directive tightening; - a missing `` include in `byteswap.pass` (only visible under `-fmodules`); - C++03-compatible `static_assert` form in `digits10`; gating `digits`/`digits10` `_BitInt` blocks behind `!_LIBCPP_USE_FROZEN_CXX03_HEADERS` since the fix from #193002 was not backported to the frozen snapshot; and - `make_format_args` reduced to a placeholder pending a SFINAE-friendly rejection path. Discussion: https://discourse.llvm.org/t/implementing-p3666r4-bit-precise-integers-in-libc/91070 Assisted-by: Claude (Anthropic) --------- Co-authored-by: Claude Opus 4.6 --- .../__libcpp_signed_integer.compile.pass.cpp | 2 +- ...__libcpp_unsigned_integer.compile.pass.cpp | 2 +- .../views/mdspan/extents/bitint.pass.cpp | 6 +- .../numeric.limits.members/digits.pass.cpp | 6 +- .../numeric.limits.members/digits10.pass.cpp | 8 +- .../numeric.limits.members/max.pass.cpp | 11 +- .../numeric.limits.members/min.pass.cpp | 25 ++- .../bit/bit.pow.two/bit_ceil.pass.cpp | 4 +- .../bit/bit.pow.two/bit_floor.pass.cpp | 4 +- .../bit/bit.pow.two/bit_width.pass.cpp | 4 +- .../bit/bit.pow.two/has_single_bit.pass.cpp | 4 +- .../bit/bitops.count/countl_one.pass.cpp | 4 +- .../bit/bitops.count/countl_zero.pass.cpp | 4 +- .../bit/bitops.count/countr_one.pass.cpp | 4 +- .../bit/bitops.count/countr_zero.pass.cpp | 4 +- .../bit/bitops.count/popcount.pass.cpp | 4 +- .../std/numerics/bit/bitops.rot/rotl.pass.cpp | 4 +- .../std/numerics/bit/bitops.rot/rotr.pass.cpp | 4 +- .../test/std/numerics/bit/byteswap.pass.cpp | 13 +- .../test/std/numerics/bit/byteswap.verify.cpp | 71 ++++---- .../saturating.bitint.pass.cpp | 168 ++++++++---------- .../make_format_args.bitint.verify.cpp | 57 
------ .../utility.intcmp/intcmp.bitint.pass.cpp | 8 +- libcxx/test/support/test_macros.h | 9 + 24 files changed, 197 insertions(+), 233 deletions(-) delete mode 100644 libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_format_args.bitint.verify.cpp diff --git a/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_signed_integer.compile.pass.cpp b/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_signed_integer.compile.pass.cpp index 1f2d9685bbe5a..524b22cc4bef3 100644 --- a/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_signed_integer.compile.pass.cpp +++ b/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_signed_integer.compile.pass.cpp @@ -79,7 +79,7 @@ static_assert(!std::__signed_integer); static_assert(!std::__signed_integer); // Extended signed integer types per [basic.fundamental]/p3 Note 1. -#if TEST_HAS_EXTENSION(bit_int) +#if TEST_HAS_BITINT static_assert(std::__signed_integer); static_assert(std::__signed_integer); static_assert(std::__signed_integer); diff --git a/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_unsigned_integer.compile.pass.cpp b/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_unsigned_integer.compile.pass.cpp index 3f78f170b7038..234cc56f1697d 100644 --- a/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_unsigned_integer.compile.pass.cpp +++ b/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_unsigned_integer.compile.pass.cpp @@ -79,7 +79,7 @@ static_assert(!std::__unsigned_integer); static_assert(!std::__unsigned_integer); // Extended unsigned integer types per [basic.fundamental]/p3 Note 1. 
-#if TEST_HAS_EXTENSION(bit_int) +#if TEST_HAS_BITINT static_assert(std::__unsigned_integer); static_assert(std::__unsigned_integer); static_assert(std::__unsigned_integer); diff --git a/libcxx/test/std/containers/views/mdspan/extents/bitint.pass.cpp b/libcxx/test/std/containers/views/mdspan/extents/bitint.pass.cpp index 9a4dc02a15c6e..1f03730f7cb30 100644 --- a/libcxx/test/std/containers/views/mdspan/extents/bitint.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/extents/bitint.pass.cpp @@ -27,7 +27,7 @@ #include "test_macros.h" -#if TEST_HAS_EXTENSION(bit_int) +#if TEST_HAS_BITINT template constexpr bool test_extents_with_index_type() { @@ -72,10 +72,10 @@ constexpr bool test() { return true; } -#endif // TEST_HAS_EXTENSION(bit_int) +#endif // TEST_HAS_BITINT int main(int, char**) { -#if TEST_HAS_EXTENSION(bit_int) +#if TEST_HAS_BITINT test(); static_assert(test()); #endif diff --git a/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp b/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp index 807ea69f07680..0522c9b3af330 100644 --- a/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp @@ -10,6 +10,8 @@ // digits +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include #include @@ -55,7 +57,7 @@ int main(int, char**) // _BitInt(N): digits must equal N for unsigned and N-1 for signed, // regardless of padding bits for non-byte-aligned widths. -#if TEST_HAS_EXTENSION(bit_int) +#if TEST_HAS_BITINT // Byte-aligned widths. 
test(); test(); @@ -89,7 +91,7 @@ int main(int, char**) test(); test(); # endif -#endif // TEST_HAS_EXTENSION(bit_int) +#endif // TEST_HAS_BITINT return 0; } diff --git a/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp b/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp index 002f951b2b829..3df9dc26dc94d 100644 --- a/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp @@ -10,6 +10,8 @@ // digits10 +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include #include @@ -58,7 +60,7 @@ int main(int, char**) test(); // _BitInt(N): digits10 = floor((N - is_signed) * log10(2)). -#if TEST_HAS_EXTENSION(bit_int) +#if TEST_HAS_BITINT test(); // digits=8, log10=2.4 test(); // digits=7, log10=2.1 test(); // digits=13, log10=3.9 @@ -107,8 +109,8 @@ int main(int, char**) // The 1936274/6432163 convergent stays exact up to d=51132156. 8388608 is // the largest width tested above, so if Clang raises __BITINT_MAXWIDTH__, // extend the coverage before trusting the formula at the new range. 
- LIBCPP_STATIC_ASSERT(__BITINT_MAXWIDTH__ <= 8388608); -#endif // TEST_HAS_EXTENSION(bit_int) + LIBCPP_STATIC_ASSERT(__BITINT_MAXWIDTH__ <= 8388608, "extend digits10 _BitInt coverage for the new maximum width"); +#endif // TEST_HAS_BITINT return 0; } diff --git a/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp b/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp index fe8f039416d3a..06355c9de4771 100644 --- a/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp @@ -67,16 +67,25 @@ int main(int, char**) // _BitInt(N): max is 2^N - 1 for unsigned and 2^(N-1) - 1 for signed. // Exercises the digits fix through `__max = ~0 ^ __min`. -#if TEST_HAS_EXTENSION(bit_int) + // TODO: Remove guards for MSan once https://llvm.org/PR204217 is fixed. + // MSan does not track _BitInt padding bits, so non-byte-aligned widths + // surface as false-positive use-of-uninitialized-value through the + // numeric_limits::max() shift; restrict to byte-aligned widths under + // memory sanitizer. 
+#if TEST_HAS_BITINT test((unsigned _BitInt(8)) ~(unsigned _BitInt(8))0); test((signed _BitInt(8))0x7F); +# if !TEST_HAS_FEATURE(memory_sanitizer) test((unsigned _BitInt(13))0x1FFF); test((signed _BitInt(13))0x0FFF); +# endif test((unsigned _BitInt(64)) ~(unsigned _BitInt(64))0); test((signed _BitInt(64))0x7FFFFFFFFFFFFFFFLL); # if __BITINT_MAXWIDTH__ >= 128 +# if !TEST_HAS_FEATURE(memory_sanitizer) test((unsigned _BitInt(77)) ~(unsigned _BitInt(77))0); test((signed _BitInt(77)) ~((signed _BitInt(77))1 << 76)); +# endif test((unsigned _BitInt(128)) ~(unsigned _BitInt(128))0); test((signed _BitInt(128)) ~((signed _BitInt(128))1 << 127)); # endif diff --git a/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp b/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp index a9c72da2103b4..38a06cbed9e23 100644 --- a/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp @@ -68,22 +68,33 @@ int main(int, char**) // _BitInt(N): min is 0 for unsigned and -2^(N-1) for signed. The shift // `1 << digits` flowed through the buggy digits field, so this also // exercises the digits fix for non-byte-aligned widths. -#if TEST_HAS_EXTENSION(bit_int) + // TODO: Remove guards for MSan once https://llvm.org/PR204217 is fixed. + // MSan does not track _BitInt padding bits, so non-byte-aligned widths + // surface as false-positive use-of-uninitialized-value through the + // numeric_limits::min() shift; restrict to byte-aligned widths under + // memory sanitizer. +#if TEST_HAS_BITINT + // signed _BitInt(N) min is -2^(N-1). Build via unsigned shift then cast to + // avoid integer-overflow warnings (-Werror,-Winteger-overflow). 
test(0); - test(-(signed _BitInt(8))(1 << 7)); + test(static_cast(static_cast(1) << 7)); +# if !TEST_HAS_FEATURE(memory_sanitizer) test(0); - test(-(signed _BitInt(13))(1 << 12)); + test(static_cast(static_cast(1) << 12)); +# endif test(0); - test(-(signed _BitInt(64))(1ULL << 63)); + test(static_cast(static_cast(1) << 63)); # if __BITINT_MAXWIDTH__ >= 128 +# if !TEST_HAS_FEATURE(memory_sanitizer) test(0); - test(-((signed _BitInt(77))1 << 76)); + test(static_cast(static_cast(1) << 76)); +# endif test(0); - test(-((signed _BitInt(128))1 << 127)); + test(static_cast(static_cast(1) << 127)); # endif # if __BITINT_MAXWIDTH__ >= 256 test(0); - test(-((signed _BitInt(256))1 << 255)); + test(static_cast(static_cast(1) << 255)); # endif #endif diff --git a/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.pass.cpp b/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.pass.cpp index 1aaddafe40cc7..092f08dbb22e7 100644 --- a/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.pass.cpp +++ b/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.pass.cpp @@ -142,7 +142,7 @@ int main(int, char**) // _BitInt tests. Width tiers follow C23 7.18.2.5. // bit_ceil uses numeric_limits::digits, so only byte-aligned widths. -#if TEST_HAS_EXTENSION(bit_int) +#if TEST_HAS_BITINT { using T32 = unsigned _BitInt(32); using T64 = unsigned _BitInt(64); @@ -200,7 +200,7 @@ int main(int, char**) assert(std::bit_ceil((T256(1) << 200) + 1) == T256(1) << 201); } # endif -#endif // TEST_HAS_EXTENSION(bit_int) +#endif // TEST_HAS_BITINT return 0; } diff --git a/libcxx/test/std/numerics/bit/bit.pow.two/bit_floor.pass.cpp b/libcxx/test/std/numerics/bit/bit.pow.two/bit_floor.pass.cpp index 07dae010b99fa..a233565838e87 100644 --- a/libcxx/test/std/numerics/bit/bit.pow.two/bit_floor.pass.cpp +++ b/libcxx/test/std/numerics/bit/bit.pow.two/bit_floor.pass.cpp @@ -142,7 +142,7 @@ int main(int, char**) // _BitInt tests. Width tiers follow C23 7.18.2.5. 
// bit_floor uses numeric_limits::digits via __bit_log2, so only // byte-aligned widths are safe. -#if TEST_HAS_EXTENSION(bit_int) +#if TEST_HAS_BITINT { using T32 = unsigned _BitInt(32); using T64 = unsigned _BitInt(64); @@ -200,7 +200,7 @@ int main(int, char**) assert(std::bit_floor(T256(~T256(0))) == T256(T256(1) << 255)); } # endif -#endif // TEST_HAS_EXTENSION(bit_int) +#endif // TEST_HAS_BITINT return 0; } diff --git a/libcxx/test/std/numerics/bit/bit.pow.two/bit_width.pass.cpp b/libcxx/test/std/numerics/bit/bit.pow.two/bit_width.pass.cpp index efba0dcd2b77b..e160741de90a7 100644 --- a/libcxx/test/std/numerics/bit/bit.pow.two/bit_width.pass.cpp +++ b/libcxx/test/std/numerics/bit/bit.pow.two/bit_width.pass.cpp @@ -145,7 +145,7 @@ int main(int, char**) // _BitInt tests. Width tiers follow C23 7.18.2.5. // bit_width uses numeric_limits::digits via __bit_log2, so only // byte-aligned widths are safe. -#if TEST_HAS_EXTENSION(bit_int) +#if TEST_HAS_BITINT { using T32 = unsigned _BitInt(32); using T64 = unsigned _BitInt(64); @@ -196,7 +196,7 @@ int main(int, char**) assert(std::bit_width(T256(~T256(0))) == 256); } # endif -#endif // TEST_HAS_EXTENSION(bit_int) +#endif // TEST_HAS_BITINT return 0; } diff --git a/libcxx/test/std/numerics/bit/bit.pow.two/has_single_bit.pass.cpp b/libcxx/test/std/numerics/bit/bit.pow.two/has_single_bit.pass.cpp index 6bab2b9f9069a..d1c75e0e53b93 100644 --- a/libcxx/test/std/numerics/bit/bit.pow.two/has_single_bit.pass.cpp +++ b/libcxx/test/std/numerics/bit/bit.pow.two/has_single_bit.pass.cpp @@ -141,7 +141,7 @@ int main(int, char**) test(); // _BitInt tests. Width tiers follow C23 7.18.2.5. 
-#if TEST_HAS_EXTENSION(bit_int) +#if TEST_HAS_BITINT { using T13 = unsigned _BitInt(13); using T32 = unsigned _BitInt(32); @@ -225,7 +225,7 @@ int main(int, char**) assert(!std::has_single_bit((T4096(1) << 4095) | T4096(1))); } # endif -#endif // TEST_HAS_EXTENSION(bit_int) +#endif // TEST_HAS_BITINT return 0; } diff --git a/libcxx/test/std/numerics/bit/bitops.count/countl_one.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/countl_one.pass.cpp index 39d5db1ed22a8..f176eeb2c21af 100644 --- a/libcxx/test/std/numerics/bit/bitops.count/countl_one.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.count/countl_one.pass.cpp @@ -138,7 +138,7 @@ int main(int, char**) test(); // _BitInt tests. Width tiers follow C23 7.18.2.5. -#if TEST_HAS_EXTENSION(bit_int) +#if TEST_HAS_BITINT { using T13 = unsigned _BitInt(13); using T32 = unsigned _BitInt(32); @@ -198,7 +198,7 @@ int main(int, char**) assert(std::countl_one(T4096(~T4096(0) ^ (T4096(1) << 1000))) == 3095); } # endif -#endif // TEST_HAS_EXTENSION(bit_int) +#endif // TEST_HAS_BITINT return 0; } diff --git a/libcxx/test/std/numerics/bit/bitops.count/countl_zero.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/countl_zero.pass.cpp index a73175d51a201..af1c3517b45e7 100644 --- a/libcxx/test/std/numerics/bit/bitops.count/countl_zero.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.count/countl_zero.pass.cpp @@ -137,7 +137,7 @@ int main(int, char**) test(); // _BitInt tests. Width tiers follow C23 7.18.2.5. 
-#if TEST_HAS_EXTENSION(bit_int) +#if TEST_HAS_BITINT { using T8 = unsigned _BitInt(8); using T13 = unsigned _BitInt(13); @@ -219,7 +219,7 @@ int main(int, char**) assert(std::countl_zero(T4096(~T4096(0))) == 0); } # endif -#endif // TEST_HAS_EXTENSION(bit_int) +#endif // TEST_HAS_BITINT return 0; } diff --git a/libcxx/test/std/numerics/bit/bitops.count/countr_one.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/countr_one.pass.cpp index ba350a76d96af..64e1506f49e85 100644 --- a/libcxx/test/std/numerics/bit/bitops.count/countr_one.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.count/countr_one.pass.cpp @@ -142,7 +142,7 @@ int main(int, char**) test(); // _BitInt tests. Width tiers follow C23 7.18.2.5. -#if TEST_HAS_EXTENSION(bit_int) +#if TEST_HAS_BITINT { using T13 = unsigned _BitInt(13); using T32 = unsigned _BitInt(32); @@ -215,7 +215,7 @@ int main(int, char**) assert(std::countr_one(T4096((T4096(1) << 1000) - 1)) == 1000); } # endif -#endif // TEST_HAS_EXTENSION(bit_int) +#endif // TEST_HAS_BITINT return 0; } diff --git a/libcxx/test/std/numerics/bit/bitops.count/countr_zero.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/countr_zero.pass.cpp index e7e9d6542ab86..87b9e67e2a03b 100644 --- a/libcxx/test/std/numerics/bit/bitops.count/countr_zero.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.count/countr_zero.pass.cpp @@ -139,7 +139,7 @@ int main(int, char**) test(); // _BitInt tests. Width tiers follow C23 7.18.2.5. 
-#if TEST_HAS_EXTENSION(bit_int) +#if TEST_HAS_BITINT { using T8 = unsigned _BitInt(8); using T13 = unsigned _BitInt(13); @@ -210,7 +210,7 @@ int main(int, char**) assert(std::countr_zero(T4096(1) << 4095) == 4095); } # endif -#endif // TEST_HAS_EXTENSION(bit_int) +#endif // TEST_HAS_BITINT return 0; } diff --git a/libcxx/test/std/numerics/bit/bitops.count/popcount.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/popcount.pass.cpp index dc5cdf89f147b..a06a8ca958bf7 100644 --- a/libcxx/test/std/numerics/bit/bitops.count/popcount.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.count/popcount.pass.cpp @@ -151,7 +151,7 @@ int main(int, char**) // _BitInt tests. Width tiers follow C23 7.18.2.5: BITINT_MAXWIDTH is // guaranteed to be >= ULLONG_WIDTH (>= 64). Anything beyond that is // optional and must be guarded by __BITINT_MAXWIDTH__. -#if TEST_HAS_EXTENSION(bit_int) +#if TEST_HAS_BITINT { // Guaranteed widths (<= 64 bits). using T8 = unsigned _BitInt(8); @@ -255,7 +255,7 @@ int main(int, char**) assert(std::popcount(mask1000) == 1000); } # endif -#endif // TEST_HAS_EXTENSION(bit_int) +#endif // TEST_HAS_BITINT return 0; } diff --git a/libcxx/test/std/numerics/bit/bitops.rot/rotl.pass.cpp b/libcxx/test/std/numerics/bit/bitops.rot/rotl.pass.cpp index e9859ac6398b3..da0941dc0929d 100644 --- a/libcxx/test/std/numerics/bit/bitops.rot/rotl.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.rot/rotl.pass.cpp @@ -168,7 +168,7 @@ int main(int, char**) // _BitInt tests. Width tiers follow C23 7.18.2.5. // rotl uses numeric_limits::digits internally, so only byte-aligned // widths are safe (where digits matches the actual bit width). 
-#if TEST_HAS_EXTENSION(bit_int) +#if TEST_HAS_BITINT { using T32 = unsigned _BitInt(32); using T64 = unsigned _BitInt(64); @@ -219,7 +219,7 @@ int main(int, char**) assert(std::rotl(T256(1), 256 + 4) == T256(1) << 4); } # endif -#endif // TEST_HAS_EXTENSION(bit_int) +#endif // TEST_HAS_BITINT return 0; } diff --git a/libcxx/test/std/numerics/bit/bitops.rot/rotr.pass.cpp b/libcxx/test/std/numerics/bit/bitops.rot/rotr.pass.cpp index 428e11dba4969..bbcc1afe7864d 100644 --- a/libcxx/test/std/numerics/bit/bitops.rot/rotr.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.rot/rotr.pass.cpp @@ -168,7 +168,7 @@ int main(int, char**) // _BitInt tests. Width tiers follow C23 7.18.2.5. // rotr uses numeric_limits::digits internally, so only byte-aligned // widths are safe. -#if TEST_HAS_EXTENSION(bit_int) +#if TEST_HAS_BITINT { using T32 = unsigned _BitInt(32); using T64 = unsigned _BitInt(64); @@ -219,7 +219,7 @@ int main(int, char**) assert(std::rotr(T256(1), 256 + 4) == T256(1) << 252); } # endif -#endif // TEST_HAS_EXTENSION(bit_int) +#endif // TEST_HAS_BITINT return 0; } diff --git a/libcxx/test/std/numerics/bit/byteswap.pass.cpp b/libcxx/test/std/numerics/bit/byteswap.pass.cpp index f96af9410ead3..0afdc6e8143ba 100644 --- a/libcxx/test/std/numerics/bit/byteswap.pass.cpp +++ b/libcxx/test/std/numerics/bit/byteswap.pass.cpp @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -98,7 +99,7 @@ constexpr bool test() { test_implementation_defined_size(); test_implementation_defined_size(); -#if TEST_HAS_EXTENSION(bit_int) +#if TEST_HAS_BITINT // _BitInt(N) where digits + is_signed == sizeof * CHAR_BIT (no padding // bits) is accepted; other widths are rejected by the static_assert // inside the function body (see byteswap.verify.cpp). 
@@ -119,8 +120,12 @@ constexpr bool test() { test_num(0x0123456789ABCDEFULL, 0xEFCDAB8967452301ULL); test_num(0x0123456789ABCDEFLL, static_cast(0xEFCDAB8967452301ULL)); -# if __BITINT_MAXWIDTH__ >= 128 - // sizeof == 16: __builtin_bswap128 fallback or __builtin_bswapg. +# if __BITINT_MAXWIDTH__ >= 128 && (TEST_HAS_BUILTIN(__builtin_bswapg) || !defined(TEST_HAS_NO_INT128)) + // sizeof == 16: __builtin_bswap128 fallback or __builtin_bswapg. Targets + // without libc++ __int128 (32-bit ARM, MSVC ABI on Windows clang-cl) and an + // older compiler that lacks __builtin_bswapg cannot byteswap a 16-byte + // value; skip the block. TEST_HAS_NO_INT128 mirrors libc++'s + // _LIBCPP_HAS_INT128 (false on _MSC_VER even when __SIZEOF_INT128__ is set). unsigned _BitInt(128) v128 = (static_cast(0x0123456789ABCDEFULL) << 64) | static_cast(0x13579BDF02468ACEULL); @@ -131,7 +136,7 @@ constexpr bool test() { test_num(static_cast(v128), static_cast(v128_swapped)); # endif -# if __has_builtin(__builtin_bswapg) && __BITINT_MAXWIDTH__ >= 256 +# if TEST_HAS_BUILTIN(__builtin_bswapg) && __BITINT_MAXWIDTH__ >= 256 // sizeof > 16: only the __builtin_bswapg path supports widths beyond what // __builtin_bswap16/32/64/128 cover. unsigned _BitInt(256) v256 = diff --git a/libcxx/test/std/numerics/bit/byteswap.verify.cpp b/libcxx/test/std/numerics/bit/byteswap.verify.cpp index f7ff1c6aefb11..5a205d9ec5051 100644 --- a/libcxx/test/std/numerics/bit/byteswap.verify.cpp +++ b/libcxx/test/std/numerics/bit/byteswap.verify.cpp @@ -18,51 +18,55 @@ #include "test_macros.h" -#if TEST_HAS_EXTENSION(bit_int) - -// Sub-byte widths (sizeof == 1 but bit width below CHAR_BIT) -void test_unsigned_1() { - unsigned _BitInt(1) v = 0; - // expected-error@*:* {{static assertion failed{{.*}}std::byteswap requires T to have no padding bits}} - (void)std::byteswap(v); -} +#if TEST_HAS_BITINT +// Sub-byte widths (sizeof == 1 but bit width below CHAR_BIT). 
+// _BitInt(1) is excluded because make_unsigned on _BitInt(1) triggers a +// separate static_assert that's unrelated to byteswap's padding-bit Mandate. void test_unsigned_7() { unsigned _BitInt(7) v = 0; - // expected-error@*:* {{static assertion failed{{.*}}std::byteswap requires T to have no padding bits}} + // expected-error-re@*:* {{{{(std::byteswap requires T to have no padding bits|byteswap is unimplemented for integral types of this size)}}}} (void)std::byteswap(v); } void test_signed_7() { signed _BitInt(7) v = 0; - // expected-error@*:* {{static assertion failed{{.*}}std::byteswap requires T to have no padding bits}} + // expected-error-re@*:* {{{{(std::byteswap requires T to have no padding bits|byteswap is unimplemented for integral types of this size)}}}} (void)std::byteswap(v); } // Non-byte-aligned widths void test_unsigned_13() { unsigned _BitInt(13) v = 0; - // expected-error@*:* {{static assertion failed{{.*}}std::byteswap requires T to have no padding bits}} + // expected-error-re@*:* {{{{(std::byteswap requires T to have no padding bits|byteswap is unimplemented for integral types of this size)}}}} (void)std::byteswap(v); } void test_unsigned_17() { unsigned _BitInt(17) v = 0; - // expected-error@*:* {{static assertion failed{{.*}}std::byteswap requires T to have no padding bits}} + // expected-error-re@*:* {{{{(std::byteswap requires T to have no padding bits|byteswap is unimplemented for integral types of this size)}}}} (void)std::byteswap(v); } void test_signed_33() { signed _BitInt(33) v = 0; - // expected-error@*:* {{static assertion failed{{.*}}std::byteswap requires T to have no padding bits}} + // expected-error-re@*:* {{{{(std::byteswap requires T to have no padding bits|byteswap is unimplemented for integral types of this size)}}}} (void)std::byteswap(v); } +// Widths with sizeof == 16 land on the libc++ 128-bit dispatch path, which is +// gated on _LIBCPP_HAS_INT128 or __builtin_bswapg. 
On platforms without +// either, the size-dispatch static_assert fires alongside the padding-bit +// one, doubling the diagnostic count and breaking 1-to-1 directive matching. +// Restrict 65/80/96/112 to platforms that have one path. TEST_HAS_NO_INT128 +// mirrors libc++'s _LIBCPP_HAS_INT128 (also false on _MSC_VER). +# if TEST_HAS_BUILTIN(__builtin_bswapg) || !defined(TEST_HAS_NO_INT128) void test_unsigned_65() { unsigned _BitInt(65) v = 0; - // expected-error@*:* {{static assertion failed{{.*}}std::byteswap requires T to have no padding bits}} + // expected-error-re@*:* {{{{(std::byteswap requires T to have no padding bits|byteswap is unimplemented for integral types of this size)}}}} (void)std::byteswap(v); } +# endif // Byte-aligned widths whose value bits don't fill the object representation. // On platforms where sizeof(_BitInt(N)) rounds up to a power of two, these @@ -71,14 +75,14 @@ void test_unsigned_65() { void test_unsigned_24() { // sizeof(_BitInt(24)) == 4 on x86_64; 8 padding bits. unsigned _BitInt(24) v = 0; - // expected-error@*:* {{static assertion failed{{.*}}std::byteswap requires T to have no padding bits}} + // expected-error-re@*:* {{{{(std::byteswap requires T to have no padding bits|byteswap is unimplemented for integral types of this size)}}}} (void)std::byteswap(v); } void test_unsigned_40() { // sizeof(_BitInt(40)) == 8 on x86_64; 24 padding bits. unsigned _BitInt(40) v = 0; - // expected-error@*:* {{static assertion failed{{.*}}std::byteswap requires T to have no padding bits}} + // expected-error-re@*:* {{{{(std::byteswap requires T to have no padding bits|byteswap is unimplemented for integral types of this size)}}}} (void)std::byteswap(v); } @@ -87,54 +91,51 @@ void test_unsigned_48() { // bit width (48) is a multiple of 16, so __builtin_bswapg accepts it -- the // libc++ static_assert is what actually catches this case. 
unsigned _BitInt(48) v = 0; - // expected-error@*:* {{static assertion failed{{.*}}std::byteswap requires T to have no padding bits}} + // expected-error-re@*:* {{{{(std::byteswap requires T to have no padding bits|byteswap is unimplemented for integral types of this size)}}}} (void)std::byteswap(v); } void test_unsigned_56() { // sizeof(_BitInt(56)) == 8 on x86_64; 8 padding bits. unsigned _BitInt(56) v = 0; - // expected-error@*:* {{static assertion failed{{.*}}std::byteswap requires T to have no padding bits}} + // expected-error-re@*:* {{{{(std::byteswap requires T to have no padding bits|byteswap is unimplemented for integral types of this size)}}}} (void)std::byteswap(v); } -# if __BITINT_MAXWIDTH__ >= 80 +// Same dispatch-availability guard as test_unsigned_65 above. +# if TEST_HAS_BUILTIN(__builtin_bswapg) || !defined(TEST_HAS_NO_INT128) +# if __BITINT_MAXWIDTH__ >= 80 void test_unsigned_80() { // sizeof(_BitInt(80)) == 16 on x86_64; 48 padding bits. Width 80 is also // a multiple of 16, so bswapg would accept it without the static_assert. unsigned _BitInt(80) v = 0; - // expected-error@*:* {{static assertion failed{{.*}}std::byteswap requires T to have no padding bits}} + // expected-error-re@*:* {{{{(std::byteswap requires T to have no padding bits|byteswap is unimplemented for integral types of this size)}}}} (void)std::byteswap(v); } -# endif +# endif -# if __BITINT_MAXWIDTH__ >= 96 +# if __BITINT_MAXWIDTH__ >= 96 void test_unsigned_96() { // sizeof(_BitInt(96)) == 16 on x86_64; 32 padding bits. 
unsigned _BitInt(96) v = 0; - // expected-error@*:* {{static assertion failed{{.*}}std::byteswap requires T to have no padding bits}} + // expected-error-re@*:* {{{{(std::byteswap requires T to have no padding bits|byteswap is unimplemented for integral types of this size)}}}} (void)std::byteswap(v); } -# endif +# endif -# if __BITINT_MAXWIDTH__ >= 112 +# if __BITINT_MAXWIDTH__ >= 112 void test_unsigned_112() { // sizeof(_BitInt(112)) == 16 on x86_64; 16 padding bits. unsigned _BitInt(112) v = 0; - // expected-error@*:* {{static assertion failed{{.*}}std::byteswap requires T to have no padding bits}} + // expected-error-re@*:* {{{{(std::byteswap requires T to have no padding bits|byteswap is unimplemented for integral types of this size)}}}} (void)std::byteswap(v); } +# endif # endif -# if __BITINT_MAXWIDTH__ >= 256 -void test_unsigned_192() { - // sizeof(_BitInt(192)) == 32 on x86_64; 64 padding bits. Multiple of 16 - // but not of the storage size. - unsigned _BitInt(192) v = 0; - // expected-error@*:* {{static assertion failed{{.*}}std::byteswap requires T to have no padding bits}} - (void)std::byteswap(v); -} -# endif +// Widths above 128 bits drop out: Clang's sizeof for those widths matches the +// value width on x86_64 (e.g., sizeof(_BitInt(192)) == 24), so there are no +// padding bits to reject. #else // expected-no-diagnostics diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturating.bitint.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturating.bitint.pass.cpp index a4c68b0d582ad..9f233c785cbf3 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturating.bitint.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturating.bitint.pass.cpp @@ -8,22 +8,18 @@ // REQUIRES: std-at-least-c++26 +// TODO(LLVM25): Remove these restrictions. +// Clang <= 22 mis-evaluates std::saturating_mul on non-byte-aligned _BitInt at +// compile time. 
See https://llvm.org/PR204085 (fixed in Clang 23 via +// https://llvm.org/PR192568). The latest version of Android Clang still has +// this bug. +// UNSUPPORTED: clang-19, clang-20, clang-21, clang-22 +// UNSUPPORTED: apple-clang-17, apple-clang-18, apple-clang-19, apple-clang-20, apple-clang-21 +// UNSUPPORTED: target={{.+}}-android{{.*}} + // -// add_sat, sub_sat, mul_sat, div_sat, saturate_cast applied to _BitInt(N). -// -// After [libc++] recognized _BitInt as an integer type in -// __type_traits/integer_traits.h, these functions silently started -// accepting _BitInt arguments. Saturation at min/max depends on -// numeric_limits<_BitInt(N)>::min/max being correct, which requires the -// digits10 fix from #193002 for odd widths. -// -// Widths covered: -// - _BitInt(13): odd narrow width, signed range -4096..4095. -// Exercises fixed digits10 for saturation clamp. -// - _BitInt(64): equal to long long, integer_traits boundary. -// - _BitInt(128): matches __int128 on targets that support it. -// - _BitInt(200): beyond __int128 (optional via __BITINT_MAXWIDTH__). +// std::saturating_{add,sub,mul,div,cast} applied to _BitInt(N). #include #include @@ -31,7 +27,7 @@ #include "test_macros.h" -#if TEST_HAS_EXTENSION(bit_int) +#if TEST_HAS_BITINT template constexpr bool test_signed_add_sub() { @@ -39,28 +35,28 @@ constexpr bool test_signed_add_sub() { constexpr T max_v = std::numeric_limits::max(); // Basic: no overflow. - assert(std::add_sat(T(1), T(2)) == T(3)); - assert(std::add_sat(T(-1), T(1)) == T(0)); - assert(std::sub_sat(T(5), T(3)) == T(2)); - assert(std::sub_sat(T(-1), T(-1)) == T(0)); + assert(std::saturating_add(T(1), T(2)) == T(3)); + assert(std::saturating_add(T(-1), T(1)) == T(0)); + assert(std::saturating_sub(T(5), T(3)) == T(2)); + assert(std::saturating_sub(T(-1), T(-1)) == T(0)); // Positive overflow clamps to max. 
- assert(std::add_sat(max_v, T(1)) == max_v); - assert(std::add_sat(T(1), max_v) == max_v); - assert(std::add_sat(max_v, max_v) == max_v); + assert(std::saturating_add(max_v, T(1)) == max_v); + assert(std::saturating_add(T(1), max_v) == max_v); + assert(std::saturating_add(max_v, max_v) == max_v); // Negative overflow clamps to min. - assert(std::add_sat(min_v, T(-1)) == min_v); - assert(std::add_sat(T(-1), min_v) == min_v); - assert(std::add_sat(min_v, min_v) == min_v); + assert(std::saturating_add(min_v, T(-1)) == min_v); + assert(std::saturating_add(T(-1), min_v) == min_v); + assert(std::saturating_add(min_v, min_v) == min_v); - // sub_sat positive overflow (x >= 0, y < 0). - assert(std::sub_sat(max_v, T(-1)) == max_v); - assert(std::sub_sat(max_v, min_v) == max_v); + // saturating_sub positive overflow (x >= 0, y < 0). + assert(std::saturating_sub(max_v, T(-1)) == max_v); + assert(std::saturating_sub(max_v, min_v) == max_v); - // sub_sat negative overflow (x < 0, y > 0). - assert(std::sub_sat(min_v, T(1)) == min_v); - assert(std::sub_sat(min_v, max_v) == min_v); + // saturating_sub negative overflow (x < 0, y > 0). + assert(std::saturating_sub(min_v, T(1)) == min_v); + assert(std::saturating_sub(min_v, max_v) == min_v); return true; } @@ -70,18 +66,18 @@ constexpr bool test_unsigned_add_sub() { constexpr T max_v = std::numeric_limits::max(); // Basic. - assert(std::add_sat(T(1), T(2)) == T(3)); - assert(std::sub_sat(T(5), T(3)) == T(2)); + assert(std::saturating_add(T(1), T(2)) == T(3)); + assert(std::saturating_sub(T(5), T(3)) == T(2)); // Upper clamp. - assert(std::add_sat(max_v, T(1)) == max_v); - assert(std::add_sat(T(1), max_v) == max_v); - assert(std::add_sat(max_v, max_v) == max_v); + assert(std::saturating_add(max_v, T(1)) == max_v); + assert(std::saturating_add(T(1), max_v) == max_v); + assert(std::saturating_add(max_v, max_v) == max_v); // Lower clamp (wrap-to-zero on unsigned). 
- assert(std::sub_sat(T(0), T(1)) == T(0)); - assert(std::sub_sat(T(0), max_v) == T(0)); - assert(std::sub_sat(T(3), T(5)) == T(0)); + assert(std::saturating_sub(T(0), T(1)) == T(0)); + assert(std::saturating_sub(T(0), max_v) == T(0)); + assert(std::saturating_sub(T(3), T(5)) == T(0)); return true; } @@ -92,25 +88,25 @@ constexpr bool test_signed_mul_div() { constexpr T max_v = std::numeric_limits::max(); // Basic mul. - assert(std::mul_sat(T(2), T(3)) == T(6)); - assert(std::mul_sat(T(-2), T(3)) == T(-6)); + assert(std::saturating_mul(T(2), T(3)) == T(6)); + assert(std::saturating_mul(T(-2), T(3)) == T(-6)); // Overflow to max. - assert(std::mul_sat(max_v, T(2)) == max_v); - assert(std::mul_sat(T(-1), min_v) == max_v); // -(-min) overflows to +max - assert(std::mul_sat(min_v, T(-1)) == max_v); + assert(std::saturating_mul(max_v, T(2)) == max_v); + assert(std::saturating_mul(T(-1), min_v) == max_v); // -(-min) overflows to +max + assert(std::saturating_mul(min_v, T(-1)) == max_v); // Overflow to min. - assert(std::mul_sat(max_v, T(-2)) == min_v); - assert(std::mul_sat(T(-2), max_v) == min_v); + assert(std::saturating_mul(max_v, T(-2)) == min_v); + assert(std::saturating_mul(T(-2), max_v) == min_v); - // div_sat: regular values. - assert(std::div_sat(T(6), T(3)) == T(2)); - assert(std::div_sat(T(7), T(3)) == T(2)); - assert(std::div_sat(T(-6), T(3)) == T(-2)); + // saturating_div: regular values. + assert(std::saturating_div(T(6), T(3)) == T(2)); + assert(std::saturating_div(T(7), T(3)) == T(2)); + assert(std::saturating_div(T(-6), T(3)) == T(-2)); // The one signed division overflow case: INT_MIN / -1. 
- assert(std::div_sat(min_v, T(-1)) == max_v); + assert(std::saturating_div(min_v, T(-1)) == max_v); return true; } @@ -119,13 +115,13 @@ template constexpr bool test_unsigned_mul_div() { constexpr T max_v = std::numeric_limits::max(); - assert(std::mul_sat(T(2), T(3)) == T(6)); - assert(std::mul_sat(max_v, T(2)) == max_v); // clamp - assert(std::mul_sat(T(0), max_v) == T(0)); - assert(std::mul_sat(max_v, max_v) == max_v); + assert(std::saturating_mul(T(2), T(3)) == T(6)); + assert(std::saturating_mul(max_v, T(2)) == max_v); // clamp + assert(std::saturating_mul(T(0), max_v) == T(0)); + assert(std::saturating_mul(max_v, max_v) == max_v); - assert(std::div_sat(T(10), T(3)) == T(3)); - assert(std::div_sat(max_v, T(1)) == max_v); + assert(std::saturating_div(T(10), T(3)) == T(3)); + assert(std::saturating_div(max_v, T(1)) == max_v); return true; } @@ -136,19 +132,19 @@ constexpr bool test_saturate_cast() { constexpr U u_max = std::numeric_limits::max(); // Same-type: no clamp. - assert(std::saturate_cast(S(0)) == S(0)); - assert(std::saturate_cast(s_max) == s_max); - assert(std::saturate_cast(s_min) == s_min); - assert(std::saturate_cast(U(0)) == U(0)); - assert(std::saturate_cast(u_max) == u_max); + assert(std::saturating_cast(S(0)) == S(0)); + assert(std::saturating_cast(s_max) == s_max); + assert(std::saturating_cast(s_min) == s_min); + assert(std::saturating_cast(U(0)) == U(0)); + assert(std::saturating_cast(u_max) == u_max); // Signed -> unsigned: negative clamps to zero. - assert(std::saturate_cast(S(-1)) == U(0)); - assert(std::saturate_cast(s_min) == U(0)); - assert(std::saturate_cast(S(1)) == U(1)); + assert(std::saturating_cast(S(-1)) == U(0)); + assert(std::saturating_cast(s_min) == U(0)); + assert(std::saturating_cast(S(1)) == U(1)); // Unsigned -> signed: overflow clamps to s_max. 
- assert(std::saturate_cast(u_max) == s_max); + assert(std::saturating_cast(u_max) == s_max); return true; } @@ -167,7 +163,7 @@ constexpr bool test() { test_unsigned_mul_div(); test_saturate_cast<_BitInt(64), unsigned _BitInt(64)>(); - // Cross-width saturate_cast: wide source clamped into narrow target. + // Cross-width saturating_cast: wide source clamped into narrow target. { using S13 = _BitInt(13); using S64 = _BitInt(64); @@ -175,16 +171,16 @@ constexpr bool test() { using U64 = unsigned _BitInt(64); // wide signed -> narrow signed - assert(std::saturate_cast(std::numeric_limits::max()) == std::numeric_limits::max()); - assert(std::saturate_cast(std::numeric_limits::min()) == std::numeric_limits::min()); + assert(std::saturating_cast(std::numeric_limits::max()) == std::numeric_limits::max()); + assert(std::saturating_cast(std::numeric_limits::min()) == std::numeric_limits::min()); // wide unsigned -> narrow signed - assert(std::saturate_cast(std::numeric_limits::max()) == std::numeric_limits::max()); + assert(std::saturating_cast(std::numeric_limits::max()) == std::numeric_limits::max()); // wide signed -> narrow unsigned - assert(std::saturate_cast(std::numeric_limits::min()) == U13{0}); - assert(std::saturate_cast(std::numeric_limits::max()) == std::numeric_limits::max()); + assert(std::saturating_cast(std::numeric_limits::min()) == U13{0}); + assert(std::saturating_cast(std::numeric_limits::max()) == std::numeric_limits::max()); // exact-fit no clamp - assert(std::saturate_cast(S13{-1}) == S64{-1}); - assert(std::saturate_cast(U13{42}) == U64{42}); + assert(std::saturating_cast(S13{-1}) == S64{-1}); + assert(std::saturating_cast(U13{42}) == U64{42}); } # if __BITINT_MAXWIDTH__ >= 128 @@ -195,31 +191,17 @@ constexpr bool test() { test_saturate_cast<_BitInt(128), unsigned _BitInt(128)>(); # endif -# if __BITINT_MAXWIDTH__ >= 200 - // Beyond __int128: exercises the overflow-detection fallback on widths - // with no builtin add/sub/mul_sat mapping. 
- test_signed_add_sub<_BitInt(200)>(); - test_unsigned_add_sub(); - test_signed_mul_div<_BitInt(200)>(); - test_unsigned_mul_div(); - test_saturate_cast<_BitInt(200), unsigned _BitInt(200)>(); - - // Cross-width between 128- and 200-bit widths. - { - using S200 = _BitInt(200); - using S128 = _BitInt(128); - assert(std::saturate_cast(std::numeric_limits::max()) == std::numeric_limits::max()); - assert(std::saturate_cast(std::numeric_limits::min()) == std::numeric_limits::min()); - } -# endif + // TODO: __builtin_mul_overflow is currently broken for (unsigned) _BitInt(N) + // where N > 128 (https://llvm.org/PR46337). Cover them once this bug gets + // fixed. return true; } -#endif // TEST_HAS_EXTENSION(bit_int) +#endif // TEST_HAS_BITINT int main(int, char**) { -#if TEST_HAS_EXTENSION(bit_int) +#if TEST_HAS_BITINT test(); static_assert(test()); #endif diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_format_args.bitint.verify.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_format_args.bitint.verify.cpp deleted file mode 100644 index 52107b8b91527..0000000000000 --- a/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_format_args.bitint.verify.cpp +++ /dev/null @@ -1,57 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11, c++14, c++17 - -// - -// make_format_args with _BitInt(N) wider than __int128 is unsupported. 
-// -// After [libc++] recognized _BitInt as an integer type in -// __type_traits/integer_traits.h, format_arg_store's __determine_arg_t -// dispatches on sizeof(_Tp) and maps _BitInt up to sizeof(__int128) onto -// the i128 storage slot. For wider _BitInt (sizeof > sizeof(__int128)), -// no storage slot exists and a static_assert fires. -// -// This test pins down that diagnostic so that if the dispatch ever changes -// to silently accept a wider type (or drops the diagnostic), the test -// breaks and forces a reconsideration. - -#include - -#include "test_macros.h" - -#if TEST_HAS_EXTENSION(bit_int) && __BITINT_MAXWIDTH__ >= 129 - -void f_signed() { - // _BitInt(129) has sizeof == 32 on x86-64 (first size wider than __int128). - _BitInt(129) value = 0; - // expected-error-re@*:* {{{{(static assertion|static_assert)}} failed{{.*}}"an unsupported signed integer was used"}} - (void)std::make_format_args(value); -} - -void f_unsigned() { - unsigned _BitInt(129) value = 0; - // expected-error-re@*:* {{{{(static assertion|static_assert)}} failed{{.*}}"an unsupported unsigned integer was used"}} - (void)std::make_format_args(value); -} - -# if __BITINT_MAXWIDTH__ >= 256 -void f_signed_256() { - _BitInt(256) value = 0; - // expected-error-re@*:* {{{{(static assertion|static_assert)}} failed{{.*}}"an unsupported signed integer was used"}} - (void)std::make_format_args(value); -} -# endif - -#else -// When _BitInt is unavailable or the implementation limits preclude the -// test, keep the file well-formed with a trivial positive expectation so -// the driver does not fail. 
-// expected-no-diagnostics -#endif diff --git a/libcxx/test/std/utilities/utility/utility.intcmp/intcmp.bitint.pass.cpp b/libcxx/test/std/utilities/utility/utility.intcmp/intcmp.bitint.pass.cpp index f96ac1c9f7a32..4eb803734ead3 100644 --- a/libcxx/test/std/utilities/utility/utility.intcmp/intcmp.bitint.pass.cpp +++ b/libcxx/test/std/utilities/utility/utility.intcmp/intcmp.bitint.pass.cpp @@ -36,7 +36,7 @@ #include "test_macros.h" -#if TEST_HAS_EXTENSION(bit_int) +#if TEST_HAS_BITINT template constexpr bool test_same_sign() { @@ -157,15 +157,15 @@ constexpr bool test() { // Cross-type round-trip equality. static_assert(std::cmp_equal(_BitInt(13)(42), 42)); static_assert(std::cmp_equal(42, _BitInt(13)(42))); - static_assert(std::cmp_equal(unsigned _BitInt(13)(42), 42u)); + static_assert(std::cmp_equal(static_cast(42), 42u)); return true; } -#endif // TEST_HAS_EXTENSION(bit_int) +#endif // TEST_HAS_BITINT int main(int, char**) { -#if TEST_HAS_EXTENSION(bit_int) +#if TEST_HAS_BITINT test(); static_assert(test()); #endif diff --git a/libcxx/test/support/test_macros.h b/libcxx/test/support/test_macros.h index 8d88d6fad7d0b..78b1f6eda6576 100644 --- a/libcxx/test/support/test_macros.h +++ b/libcxx/test/support/test_macros.h @@ -42,6 +42,15 @@ #define TEST_HAS_EXTENSION(X) 0 #endif +// _BitInt(N) is a C23 standard feature and a Clang extension in earlier C and C++. +// __BITINT_MAXWIDTH__ is the portable probe: defined by every compiler that accepts _BitInt. +// Note __has_extension(bit_int) is unusable because it is not recognized by Clang and produces 0. 
+#ifdef __BITINT_MAXWIDTH__ +# define TEST_HAS_BITINT 1 +#else +# define TEST_HAS_BITINT 0 +#endif + #ifdef __has_warning #define TEST_HAS_WARNING(X) __has_warning(X) #else From eb7ce80944ee06b09c0bd6474254a155461dddd9 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Fri, 19 Jun 2026 15:14:29 +0200 Subject: [PATCH 029/149] CodeGenPassBuilder: Use cl::boolOrDefault directly in CGPassBuilderOption (#204196) Current implementation that uses std::optional captures cl::BOU_FALSE, for example -global-isel=0, as true. Explicitly setting option to 0 should be false, forced option not set. This could be fixed but I find it cleaner to use boolOrDefault directly and use same logic as in TargetPassConfig. Options EnableIPRA and EnableGlobalISelAbort are left as optional since for them it is explicitly checked if they are set using getNumOccurrences. boolOrDefault has encoded unset option. --- llvm/include/llvm/Passes/CodeGenPassBuilder.h | 20 ++++---- .../include/llvm/Target/CGPassBuilderOption.h | 12 ++--- llvm/lib/CodeGen/TargetPassConfig.cpp | 50 +++++++++---------- .../Target/AArch64/AArch64TargetMachine.cpp | 4 +- .../AArch64/GISel/AArch64CallLowering.cpp | 2 +- .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 3 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 +- 7 files changed, 48 insertions(+), 47 deletions(-) diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index 9641ac7313c69..898aa7c9cf003 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -194,8 +194,9 @@ template class CodeGenPassBuilder { if (Opt.EnableGlobalISelAbort) TM.Options.GlobalISelAbort = *Opt.EnableGlobalISelAbort; - if (!Opt.OptimizeRegAlloc) - Opt.OptimizeRegAlloc = getOptLevel() != CodeGenOptLevel::None; + if (Opt.OptimizeRegAlloc == cl::BOU_UNSET) + Opt.OptimizeRegAlloc = + getOptLevel() != CodeGenOptLevel::None ?
cl::BOU_TRUE : cl::BOU_FALSE; } Error buildPipeline(ModulePassManager &MPM, ModuleAnalysisManager &MAM, @@ -875,19 +876,17 @@ template Error CodeGenPassBuilder::addCoreISelPasses( PassManagerWrapper &PMW) const { // Enable FastISel with -fast-isel, but allow that to be overridden. - TM.setO0WantsFastISel(Opt.EnableFastISelOption.value_or(true)); + TM.setO0WantsFastISel(Opt.EnableFastISelOption != cl::BOU_FALSE); // Determine an instruction selector. enum class SelectorType { SelectionDAG, FastISel, GlobalISel }; SelectorType Selector; - if (Opt.EnableFastISelOption && *Opt.EnableFastISelOption == true) + if (Opt.EnableFastISelOption == cl::BOU_TRUE) Selector = SelectorType::FastISel; - else if ((Opt.EnableGlobalISelOption && - *Opt.EnableGlobalISelOption == true) || + else if (Opt.EnableGlobalISelOption == cl::BOU_TRUE || (TM.Options.EnableGlobalISel && - (!Opt.EnableGlobalISelOption || - *Opt.EnableGlobalISelOption == false))) + Opt.EnableGlobalISelOption != cl::BOU_FALSE)) Selector = SelectorType::GlobalISel; else if (TM.getOptLevel() == CodeGenOptLevel::None && TM.getO0WantsFastISel()) Selector = SelectorType::FastISel; @@ -989,8 +988,9 @@ Error CodeGenPassBuilder::addMachinePasses( // Run register allocation and passes that are tightly coupled with it, // including phi elimination and scheduling. - if (auto Err = *Opt.OptimizeRegAlloc ? derived().addOptimizedRegAlloc(PMW) - : derived().addFastRegAlloc(PMW)) + if (auto Err = Opt.OptimizeRegAlloc == cl::BOU_TRUE + ? derived().addOptimizedRegAlloc(PMW) + : derived().addFastRegAlloc(PMW)) return std::move(Err); // Run post-ra passes. 
diff --git a/llvm/include/llvm/Target/CGPassBuilderOption.h b/llvm/include/llvm/Target/CGPassBuilderOption.h index 22b62e841dbc3..086142b7de574 100644 --- a/llvm/include/llvm/Target/CGPassBuilderOption.h +++ b/llvm/include/llvm/Target/CGPassBuilderOption.h @@ -48,7 +48,7 @@ class RegAllocTypeParser : public cl::parser { // Not one-on-one but mostly corresponding to commandline options in // TargetPassConfig.cpp. struct CGPassBuilderOption { - std::optional OptimizeRegAlloc; + cl::boolOrDefault OptimizeRegAlloc = cl::BOU_UNSET; std::optional EnableIPRA; bool DebugPM = false; bool DisableVerify = false; @@ -84,11 +84,11 @@ struct CGPassBuilderOption { std::string FSProfileFile; std::string FSRemappingFile; - std::optional VerifyMachineCode; - std::optional EnableFastISelOption; - std::optional EnableGlobalISelOption; - std::optional DebugifyAndStripAll; - std::optional DebugifyCheckAndStripAll; + cl::boolOrDefault VerifyMachineCode = cl::BOU_UNSET; + cl::boolOrDefault EnableFastISelOption = cl::BOU_UNSET; + cl::boolOrDefault EnableGlobalISelOption = cl::BOU_UNSET; + cl::boolOrDefault DebugifyAndStripAll = cl::BOU_UNSET; + cl::boolOrDefault DebugifyCheckAndStripAll = cl::BOU_UNSET; }; LLVM_ABI CGPassBuilderOption getCGPassBuilderOption(); diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp index 4a76aba55b78b..697b0a6447950 100644 --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -499,42 +499,42 @@ void TargetPassConfig::setStartStopPasses() { CGPassBuilderOption llvm::getCGPassBuilderOption() { CGPassBuilderOption Opt; -#define SET_OPTION(Option) \ +#define SET_OPTION_IF_PRESENT(Option) \ if (Option.getNumOccurrences()) \ Opt.Option = Option; + SET_OPTION_IF_PRESENT(EnableGlobalISelAbort) + SET_OPTION_IF_PRESENT(EnableIPRA) + +#define SET_OPTION(Option) Opt.Option = Option; + + SET_OPTION(OptimizeRegAlloc) SET_OPTION(EnableFastISelOption) - SET_OPTION(EnableGlobalISelAbort) 
SET_OPTION(EnableGlobalISelOption) - SET_OPTION(EnableIPRA) - SET_OPTION(OptimizeRegAlloc) SET_OPTION(VerifyMachineCode) SET_OPTION(DisableAtExitBasedGlobalDtorLowering) SET_OPTION(DisableExpandReductions) SET_OPTION(PrintAfterISel) SET_OPTION(FSProfileFile) SET_OPTION(EnableGCEmptyBlocks) - -#define SET_BOOLEAN_OPTION(Option) Opt.Option = Option; - - SET_BOOLEAN_OPTION(EarlyLiveIntervals) - SET_BOOLEAN_OPTION(EnableBlockPlacementStats) - SET_BOOLEAN_OPTION(EnableGlobalMergeFunc) - SET_BOOLEAN_OPTION(EnableImplicitNullChecks) - SET_BOOLEAN_OPTION(EnableMachineOutliner) - SET_BOOLEAN_OPTION(MISchedPostRA) - SET_BOOLEAN_OPTION(DisableLSR) - SET_BOOLEAN_OPTION(DisableConstantHoisting) - SET_BOOLEAN_OPTION(DisableCGP) - SET_BOOLEAN_OPTION(DisablePartialLibcallInlining) - SET_BOOLEAN_OPTION(DisableSelectOptimize) - SET_BOOLEAN_OPTION(PrintISelInput) - SET_BOOLEAN_OPTION(PrintRegUsage) - SET_BOOLEAN_OPTION(DebugifyAndStripAll) - SET_BOOLEAN_OPTION(DebugifyCheckAndStripAll) - SET_BOOLEAN_OPTION(DisableRAFSProfileLoader) - SET_BOOLEAN_OPTION(DisableCFIFixup) - SET_BOOLEAN_OPTION(EnableMachineFunctionSplitter) + SET_OPTION(EarlyLiveIntervals) + SET_OPTION(EnableBlockPlacementStats) + SET_OPTION(EnableGlobalMergeFunc) + SET_OPTION(EnableImplicitNullChecks) + SET_OPTION(EnableMachineOutliner) + SET_OPTION(MISchedPostRA) + SET_OPTION(DisableLSR) + SET_OPTION(DisableConstantHoisting) + SET_OPTION(DisableCGP) + SET_OPTION(DisablePartialLibcallInlining) + SET_OPTION(DisableSelectOptimize) + SET_OPTION(PrintISelInput) + SET_OPTION(PrintRegUsage) + SET_OPTION(DebugifyAndStripAll) + SET_OPTION(DebugifyCheckAndStripAll) + SET_OPTION(DisableRAFSProfileLoader) + SET_OPTION(DisableCFIFixup) + SET_OPTION(EnableMachineFunctionSplitter) return Opt; } diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index b73945c53235e..b31c7d8ebaaba 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ 
b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -292,7 +292,7 @@ LLVMInitializeAArch64Target() { bool AArch64TargetMachine::isGlobalISelOptNone() const { const bool GlobalISelFlag = - getCGPassBuilderOption().EnableGlobalISelOption.value_or(false); + getCGPassBuilderOption().EnableGlobalISelOption == cl::BOU_TRUE; return getOptLevel() == CodeGenOptLevel::None || (static_cast(getOptLevel()) > @@ -405,7 +405,7 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT, !(getCodeModel() == CodeModel::Large && TT.isOSBinFormatMachO()); const bool GlobalISelFlag = - getCGPassBuilderOption().EnableGlobalISelOption.value_or(false); + getCGPassBuilderOption().EnableGlobalISelOption == cl::BOU_TRUE; // Enable GlobalISel at or below EnableGlobalISelAt0, unless this is // MachO/CodeModel::Large, which GlobalISel does not support. diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp index af88bc51e1ae7..e21af65414362 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -601,7 +601,7 @@ bool AArch64CallLowering::fallBackToDAGISel(const MachineFunction &MF) const { auto OptLevel = MF.getTarget().getOptLevel(); bool IsGlobalISelPreferred = - getCGPassBuilderOption().EnableGlobalISelOption.value_or(false) || + getCGPassBuilderOption().EnableGlobalISelOption == cl::BOU_TRUE || static_cast(OptLevel) <= TM.getEnableGlobalISelAtO() || F.hasOptNone(); return !IsGlobalISelPreferred; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 8a92f743894cb..1a872cd847247 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -1900,7 +1900,8 @@ bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) { // operations with most elements being "undef". 
This inhibits a lot of // optimization opportunities and can result in unreasonably high register // pressure and the inevitable stack spilling. - if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption) + if (!BreakLargePHIs || + getCGPassBuilderOption().EnableGlobalISelOption == cl::BOU_TRUE) return false; FixedVectorType *FVT = dyn_cast(I.getType()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index b078e0835a90e..47bc49c3a23f2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1626,7 +1626,7 @@ bool GCNPassConfig::addPreISel() { // SDAG requires LCSSA, GlobalISel does not. Disable LCSSA for -global-isel // with -new-reg-bank-select and without any of the fallback options. - if (!getCGPassBuilderOption().EnableGlobalISelOption || + if (getCGPassBuilderOption().EnableGlobalISelOption != cl::BOU_TRUE || !isGlobalISelAbortEnabled() || !NewRegBankSelect) addPass(createLCSSAPass()); @@ -2390,7 +2390,7 @@ void AMDGPUCodeGenPassBuilder::addPreISel(PassManagerWrapper &PMW) const { // control flow modifications. addFunctionPass(AMDGPURewriteUndefForPHIPass(), PMW); - if (!getCGPassBuilderOption().EnableGlobalISelOption || + if (getCGPassBuilderOption().EnableGlobalISelOption != cl::BOU_TRUE || !isGlobalISelAbortEnabled() || !NewRegBankSelect) addFunctionPass(LCSSAPass(), PMW); From 54a7896acfb97d7935e085eebbc18fd5e694f67d Mon Sep 17 00:00:00 2001 From: mkovacevic99 Date: Fri, 19 Jun 2026 15:33:42 +0200 Subject: [PATCH 030/149] [JITLink][COFF] Synthesize __imp_ IAT entries (#203906) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a default COFF/x86_64 JITLink pass that synthesizes `__imp_` Import Address Table (IAT) entries for dllimport references. This allows COFF objects using dllimport to be JIT-linked without a hand-built import library or a special generator. 
On COFF, `__declspec(dllimport)` codegen emits indirect accesses through a named `__imp_X` symbol (`callq *__imp_bar(%rip)`; `movq __imp_g(%rip)` for data), with `__imp_X` left undefined. JITLink had no handling for this. The new pass — the COFF counterpart of the ELF/Mach-O GOT builder — defines each undefined external `__imp_X` over an 8-byte slot holding the address of `X`, and leaves `X` as an ordinary external to be resolved normally (import library, dynamic-library search generator, etc.). Both the call and data-access forms then resolve indirectly through the slot. Rather than the `GOTTableManager` pattern (anonymous entry + edge redirection), the pass defines the *named* `__imp_X` symbol over the slot. ELF GOT references are nameless edge kinds, so that builder must create an anonymous entry and redirect edges; COFF references `__imp_X` by name, so defining it is simpler — no edge rewriting, no orphaned-external cleanup, sharing is automatic, and the call/data-access forms are handled identically. x86_64 only (runs in the COFF/x86_64 backend's default pass pipeline). New lit test `COFF_dllimport_iat.s`: assembles an object referencing `__imp_bar` (call) and `__imp_foo` (data load), supplies `foo`/`bar` via `-abs`, links with `-noexec`, and uses `jitlink-check` to verify each `__imp_` slot holds the target's address and that the references resolve through the slot. Partly implements github issue: https://github.com/llvm/llvm-project/issues/190122 In the comment section of the github issue there is this comment https://github.com/llvm/llvm-project/issues/190122#issuecomment-4617328036 This PR implements point 2 Synthesis IAT entries. 
--- .../ExecutionEngine/JITLink/COFF_x86_64.cpp | 78 +++++++++++++++++++ .../JITLink/x86-64/COFF_dllimport_iat.s | 55 +++++++++++++ 2 files changed, 133 insertions(+) create mode 100644 llvm/test/ExecutionEngine/JITLink/x86-64/COFF_dllimport_iat.s diff --git a/llvm/lib/ExecutionEngine/JITLink/COFF_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/COFF_x86_64.cpp index aa91ac053bb50..2144b2c255d47 100644 --- a/llvm/lib/ExecutionEngine/JITLink/COFF_x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/COFF_x86_64.cpp @@ -247,6 +247,79 @@ class COFFLinkGraphLowering_x86_64 { GetImageBaseSymbol GetImageBase; DenseMap
SectionStartCache; }; + +// Synthesize COFF __imp_ Import Address Table (IAT) entries. +// +// For a dllimport reference, codegen emits an indirect access through a named +// __imp_X symbol, e.g. +// +// callq *__imp_bar(%rip) ; or, for data: movq __imp_g(%rip), %rax +// +// where __imp_X is an undefined external. This pass supplies the missing IAT +// entry by defining __imp_X over an 8-byte pointer slot that holds X's address: +// +// __imp_bar: +// .quad bar ; X is resolved as an ordinary external +// +// X is left external, so its address is provided by whatever resolves the +// JITDylib's externals (an import library, a DynamicLibrarySearchGenerator, +// AutoImportGenerator, ...). If X is unresolvable the link fails, exactly as a +// static link against the corresponding import library would. +// +// This is the COFF analog of the ELF/Mach-O GOT builder, but deliberately NOT +// written as a TableManager/visitEdge pass like x86_64::GOTTableManager. ELF's +// GOT references are *nameless* edge kinds, so that builder has to create an +// anonymous entry and redirect every edge to it (and, for our case, would then +// have to delete the now-orphaned __imp_X external so it isn't looked up). +// COFF instead references a *named* __imp_X symbol, so the simpler and more +// natural thing is to define that symbol over the slot: edges to __imp_X then +// resolve to it with no edge rewriting and no orphan cleanup, call and +// data-access references are handled identically, and sharing is automatic +// because there is exactly one __imp_X symbol per import. +// +// Direct (non-dllimport) references such as `callq foo` are intentionally not +// handled here: those are either kept in range by the slab allocator or thunked +// by the opt-in AutoImportGenerator -- both outside this pass. 
+Error synthesizeIATEntries_COFF_x86_64(LinkGraph &G) { + static constexpr StringRef ImpPrefix = "__imp_"; + + // Collect the external __imp_ symbols up front: we mutate the symbol lists + // below (makeDefined / addExternalSymbol). + SmallVector Imps; + for (auto *Sym : G.external_symbols()) + if (Sym->hasName() && (*Sym->getName()).starts_with(ImpPrefix)) + Imps.push_back(Sym); + if (Imps.empty()) + return Error::success(); + + auto FindByName = [&](const orc::SymbolStringPtr &Name) -> Symbol * { + if (auto *Sym = G.findExternalSymbolByName(Name)) + return Sym; + if (auto *Sym = G.findDefinedSymbolByName(Name)) + return Sym; + return nullptr; + }; + + Section &IATSec = G.createSection("$__IAT", orc::MemProt::Read); + + for (auto *Imp : Imps) { + orc::SymbolStringPtr Base = + G.intern((*Imp->getName()).drop_front(ImpPrefix.size())); + + // Find the real target X, or add it as an external to be resolved normally. + Symbol *Target = FindByName(std::move(Base)); + if (!Target) + Target = &G.addExternalSymbol(std::move(Base), 0, + /*IsWeaklyReferenced=*/false); + + // 8-byte slot holding &X, with __imp_X defined over it. + Symbol &Slot = x86_64::createAnonymousPointer(G, IATSec, Target); + G.makeDefined(*Imp, Slot.getBlock(), 0, G.getPointerSize(), Linkage::Strong, + Scope::Local, /*IsLive=*/true); + } + + return Error::success(); +} } // namespace namespace llvm { @@ -303,6 +376,11 @@ void link_COFF_x86_64(std::unique_ptr G, } else Config.PrePrunePasses.push_back(markAllSymbolsLive); + // Synthesize __imp_X IAT entries for dllimport references, like the GOT/PLT + // builders for ELF/Mach-O. Runs in PostPrune (before external-symbol + // lookup) so the X targets it introduces are resolved normally. + Config.PostPrunePasses.push_back(synthesizeIATEntries_COFF_x86_64); + // Add COFF edge lowering passes. 
Config.PreFixupPasses.push_back(COFFLinkGraphLowering_x86_64()); } diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_dllimport_iat.s b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_dllimport_iat.s new file mode 100644 index 0000000000000..754fa956758a5 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_dllimport_iat.s @@ -0,0 +1,55 @@ +# Verify the COFF __imp_ IAT synthesis pass: for a dllimport reference to an +# undefined __imp_X symbol, JITLink should define __imp_X over an 8-byte pointer +# slot that holds the address of X (resolved as an ordinary external). Both the +# call form (callq *__imp_X) and the data-access form (movq __imp_X) resolve +# indirectly through that slot. +# +# X (foo/bar) is supplied as an absolute symbol, so no real library is needed -- +# this exercises the pass itself, not any resolution mechanism. +# +# RUN: llvm-mc -filetype=obj -triple=x86_64-windows-msvc %s -o %t.o +# RUN: llvm-jitlink -noexec \ +# RUN: -slab-allocate 100Kb -slab-address 0xfff00000 -slab-page-size 4096 \ +# RUN: -abs foo=0x7ff700000000 \ +# RUN: -abs bar=0x7ff700001000 \ +# RUN: -check %s %t.o + + .text + + .def main; + .scl 2; + .type 32; + .endef + .globl main + .p2align 4, 0x90 +main: + retq + +# The synthesized __imp_bar slot holds bar's address... +# jitlink-check: *{8}(__imp_bar) = bar +# ... and the dllimport call reads through that slot (RIP-relative displacement +# of the indirect call's memory operand, MCInst operand 3). +# jitlink-check: decode_operand(test_call, 3) = __imp_bar - next_pc(test_call) + .def test_call; + .scl 2; + .type 32; + .endef + .globl test_call + .p2align 4, 0x90 +test_call: + callq *__imp_bar(%rip) + retq + +# Same for a data access: the __imp_foo slot holds foo's address, and the load +# reads through it (displacement is MCInst operand 4 for `movq mem, reg`). 
+# jitlink-check: *{8}(__imp_foo) = foo +# jitlink-check: decode_operand(test_load, 4) = __imp_foo - next_pc(test_load) + .def test_load; + .scl 2; + .type 32; + .endef + .globl test_load + .p2align 4, 0x90 +test_load: + movq __imp_foo(%rip), %rax + retq From 12ee71c377db385c4af0cfe92488406b9a8fa13c Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 19 Jun 2026 08:59:04 -0500 Subject: [PATCH 031/149] [Clang] Respect `-fno-slp-vectorize` for the LTO pipeline (#201585) Summary: This is related to reported regressions in the GROMACS suite when offloading to AMDGCN devices through the RDC / LTO interface. The application intentionally passes `-fno-slp-vectorize` to disable that pass, but there's currently no way to do this through the LTO pipeline. This PR causes the driver to emit `plugin-opt=` for the `-mllvm` option. That means the pass is still enabled but it should be a no-op now. --- clang/lib/Driver/ToolChains/Clang.cpp | 4 +++- clang/lib/Driver/ToolChains/CommonArgs.cpp | 9 +++++++++ clang/test/Driver/lto.c | 8 ++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index e3288c81d4c95..0cbb1f18809f7 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -9646,7 +9646,9 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, OPT_fsanitize_minimal_runtime, OPT_fno_sanitize_minimal_runtime, OPT_fsanitize_trap_EQ, - OPT_fno_sanitize_trap_EQ}; + OPT_fno_sanitize_trap_EQ, + OPT_fslp_vectorize, + OPT_fno_slp_vectorize}; const llvm::DenseSet LinkerOptions{OPT_mllvm, OPT_Zlinker_input}; auto ToolChainHasRT = [&](const ToolChain &TC, StringRef Name) { return TC.getVFS().exists( diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 48724746d9330..547405eaf7663 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ 
-1157,6 +1157,15 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args, CmdArgs.push_back(Args.MakeArgString(Twine(PluginOptPrefix) + ParallelismOpt + Parallelism)); + // Forward the SLP vectorization preference to the LTO backend by toggling + // the existing -vectorize-slp cl::opt, which the pass honors directly. This + // avoids minting dedicated linker options for what is only pipeline tuning. + if (Arg *A = Args.getLastArg(options::OPT_fslp_vectorize, + options::OPT_fno_slp_vectorize)) + CmdArgs.push_back(Args.MakeArgString( + Twine(PluginOptPrefix) + "-vectorize-slp=" + + (A->getOption().matches(options::OPT_fslp_vectorize) ? "1" : "0"))); + // Pass down GlobalISel options. if (Arg *A = Args.getLastArg(options::OPT_fglobal_isel, options::OPT_fno_global_isel)) { diff --git a/clang/test/Driver/lto.c b/clang/test/Driver/lto.c index 81165d3b9e8a3..c9ee2f9c26223 100644 --- a/clang/test/Driver/lto.c +++ b/clang/test/Driver/lto.c @@ -117,6 +117,14 @@ // CHECK-GISEL: "-plugin-opt=-global-isel=1" // CHECK-DISABLE-GISEL: "-plugin-opt=-global-isel=0" +// RUN: %clang --target=x86_64-unknown-linux-gnu -### %s -flto -fno-slp-vectorize 2> %t +// RUN: FileCheck --check-prefix=CHECK-NO-SLP < %t %s +// RUN: %clang --target=x86_64-unknown-linux-gnu -### %s -flto -fslp-vectorize 2> %t +// RUN: FileCheck --check-prefix=CHECK-SLP < %t %s + +// CHECK-NO-SLP: "-plugin-opt=-vectorize-slp=0" +// CHECK-SLP: "-plugin-opt=-vectorize-slp=1" + // -flto passes -time-passes when -ftime-report is passed // RUN: %clang --target=x86_64-unknown-linux-gnu -### %s -flto -ftime-report 2> %t // RUN: FileCheck --check-prefix=CHECK-TIME-REPORT < %t %s From 0390898335f9f32ea71ff288a5b4085cecc10391 Mon Sep 17 00:00:00 2001 From: Jeremy Kun Date: Fri, 19 Jun 2026 07:11:48 -0700 Subject: [PATCH 032/149] [mlir][affine] Implement LoopLikeInterface::getStaticTripCount on AffineForOp (#204687) LoopLikeInterface is useful, but missing `getStaticTripCount` requires adding extra cases to 
check when processing otherwise dialect-agnostic code. There is an existing free function `getConstantTripCount`, which I deprecated and replaced (NFC) with the new implementation. I believe the new implementation is slightly more efficient than `getConstantTripCount` because it checks if the expression is constant and fast-fails before constructing the output `AffineMap` that was returned by `getTripCountMapAndOperands`. Assisted by Gemini --- .../Dialect/Affine/Analysis/LoopAnalysis.h | 1 + .../mlir/Dialect/Affine/IR/AffineOps.td | 2 +- .../Dialect/Affine/Analysis/LoopAnalysis.cpp | 24 +------- mlir/lib/Dialect/Affine/Analysis/Utils.cpp | 4 +- mlir/lib/Dialect/Affine/IR/AffineOps.cpp | 55 +++++++++++++++++++ .../AffineLoopInvariantCodeMotion.cpp | 5 +- .../Dialect/Affine/Transforms/LoopTiling.cpp | 5 +- .../Dialect/Affine/Transforms/LoopUnroll.cpp | 4 +- .../Transforms/PipelineDataTransfer.cpp | 3 +- .../Dialect/Affine/Utils/LoopFusionUtils.cpp | 6 +- mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp | 36 +++++++----- mlir/test/Dialect/Affine/trip-count.mlir | 38 +++++++++++++ 12 files changed, 133 insertions(+), 50 deletions(-) create mode 100644 mlir/test/Dialect/Affine/trip-count.mlir diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h index 43d61832cafdd..3fcb63a4da885 100644 --- a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h +++ b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h @@ -41,6 +41,7 @@ void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map, /// Returns the trip count of the loop if it's a constant, std::nullopt /// otherwise. This uses affine expression analysis and is able to determine /// constant trip count in non-trivial cases. +[[deprecated("use AffineForOp::getStaticTripCount instead")]] std::optional getConstantTripCount(AffineForOp forOp); /// Returns the greatest known integral divisor of the trip count. 
Affine diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td index 3d7cbcc375d2a..1e14f9f37288d 100644 --- a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td +++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td @@ -135,7 +135,7 @@ def AffineForOp : Affine_Op<"for", RecursiveMemoryEffects, DeclareOpInterfaceMethods, + "replaceWithAdditionalYields", "getStaticTripCount"]>, DeclareOpInterfaceMethods]> { let summary = "for operation"; diff --git a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp index 166d39e88d41e..40802cc6e85e5 100644 --- a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp +++ b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp @@ -214,27 +214,9 @@ void mlir::affine::getTripCountMapAndOperands( /// getTripCount) and is able to determine constant trip count in non-trivial /// cases. std::optional mlir::affine::getConstantTripCount(AffineForOp forOp) { - SmallVector operands; - AffineMap map; - getTripCountMapAndOperands(forOp, &map, &operands); - - if (!map) - return std::nullopt; - - // Take the min if all trip counts are constant. - std::optional tripCount; - for (auto resultExpr : map.getResults()) { - if (auto constExpr = dyn_cast(resultExpr)) { - if (tripCount.has_value()) - tripCount = - std::min(*tripCount, static_cast(constExpr.getValue())); - else - tripCount = constExpr.getValue(); - } else { - return std::nullopt; - } - } - return tripCount; + if (std::optional tripCount = forOp.getStaticTripCount()) + return tripCount->getZExtValue(); + return std::nullopt; } /// Returns the greatest known integral divisor of the trip count. 
Affine diff --git a/mlir/lib/Dialect/Affine/Analysis/Utils.cpp b/mlir/lib/Dialect/Affine/Analysis/Utils.cpp index ebe932a14694a..cac305df8ba75 100644 --- a/mlir/lib/Dialect/Affine/Analysis/Utils.cpp +++ b/mlir/lib/Dialect/Affine/Analysis/Utils.cpp @@ -1833,9 +1833,9 @@ bool mlir::affine::buildSliceTripCountMap( forOp.getConstantUpperBound() - forOp.getConstantLowerBound(); continue; } - std::optional maybeConstTripCount = getConstantTripCount(forOp); + std::optional maybeConstTripCount = forOp.getStaticTripCount(); if (maybeConstTripCount.has_value()) { - (*tripCountMap)[op] = *maybeConstTripCount; + (*tripCountMap)[op] = maybeConstTripCount->getZExtValue(); continue; } return false; diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp index 7d8974bd6c1b7..f095500495f18 100644 --- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp +++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp @@ -2826,6 +2826,61 @@ std::optional> AffineForOp::getLoopUpperBounds() { OpFoldResult(b.getI64IntegerAttr(getConstantUpperBound()))}; } +std::optional AffineForOp::getStaticTripCount() { + MLIRContext *context = getContext(); + int64_t step = getStepAsInt(); + if (step <= 0) + return std::nullopt; + + if (hasConstantBounds()) { + int64_t lb = getConstantLowerBound(); + int64_t ub = getConstantUpperBound(); + int64_t loopSpan = ub - lb; + if (loopSpan < 0) + loopSpan = 0; + return APInt(64, llvm::divideCeilSigned(loopSpan, step)); + } + + auto lbMap = getLowerBoundMap(); + auto ubMap = getUpperBoundMap(); + if (lbMap.getNumResults() != 1) + return std::nullopt; + + // Difference of each upper bound expression from the single lower bound + // expression (divided by the step) provides the expressions for the trip + // count map. 
+ AffineValueMap ubValueMap(ubMap, getUpperBoundOperands()); + + SmallVector lbSplatExpr(ubValueMap.getNumResults(), + lbMap.getResult(0)); + auto lbMapSplat = AffineMap::get(lbMap.getNumDims(), lbMap.getNumSymbols(), + lbSplatExpr, context); + AffineValueMap lbSplatValueMap(lbMapSplat, getLowerBoundOperands()); + + AffineValueMap tripCountValueMap; + AffineValueMap::difference(ubValueMap, lbSplatValueMap, &tripCountValueMap); + + // Take the min if all trip counts are constant. + std::optional tripCount; + for (unsigned i = 0, e = tripCountValueMap.getNumResults(); i < e; ++i) { + AffineExpr expr = tripCountValueMap.getResult(i).ceilDiv(step); + if (auto constExpr = llvm::dyn_cast(expr)) { + uint64_t value = constExpr.getValue(); + if (tripCount.has_value()) + tripCount = std::min(*tripCount, value); + else + tripCount = value; + } else { + return std::nullopt; + } + } + + if (tripCount.has_value()) + return APInt(64, *tripCount); + + return std::nullopt; +} + FailureOr AffineForOp::replaceWithAdditionalYields( RewriterBase &rewriter, ValueRange newInitOperands, bool replaceInitOperandUsesInLoop, diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp index 3c55830df61c3..1887c321e206a 100644 --- a/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp @@ -178,8 +178,9 @@ void LoopInvariantCodeMotion::runOnAffineForOp(AffineForOp forOp) { // at least once. For unknown (dynamic) or zero trip counts we cannot prove // the body executes, so hoisting a side-effectful op would change observable // program semantics. Pure (side-effect-free) ops may always be hoisted. 
- auto tripCount = getConstantTripCount(forOp); - bool guaranteedToExecute = tripCount.has_value() && *tripCount > 0; + auto tripCount = forOp.getStaticTripCount(); + bool guaranteedToExecute = + tripCount.has_value() && tripCount->getZExtValue() > 0; for (Operation &op : *forOp.getBody()) { // Register op in the set of ops that have users. This set is used diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp index 188db218a5220..d3208d5c8f7eb 100644 --- a/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp @@ -91,12 +91,13 @@ static void adjustToDivisorsOfTripCounts(ArrayRef band, assert(band.size() == tileSizes->size() && "invalid tile size count"); for (unsigned i = 0, e = band.size(); i < e; i++) { unsigned &tSizeAdjusted = (*tileSizes)[i]; - std::optional mayConst = getConstantTripCount(band[i]); + AffineForOp forOp = band[i]; + std::optional mayConst = forOp.getStaticTripCount(); if (!mayConst) continue; // Adjust the tile size to largest factor of the trip count less than // tSize. - uint64_t constTripCount = *mayConst; + uint64_t constTripCount = mayConst->getZExtValue(); if (constTripCount > 1 && tSizeAdjusted > constTripCount / 2) tSizeAdjusted = constTripCount / 2; while (constTripCount % tSizeAdjusted != 0) diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp index 837d4f714d25e..1006a7d2c3cca 100644 --- a/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp @@ -100,8 +100,8 @@ void LoopUnroll::runOnOperation() { // so that loops are gathered from innermost to outermost (or else // unrolling an outer one may delete gathered inner ones). 
getOperation().walk([&](AffineForOp forOp) { - std::optional tripCount = getConstantTripCount(forOp); - if (tripCount && *tripCount <= unrollFullThreshold) + std::optional tripCount = forOp.getStaticTripCount(); + if (tripCount && tripCount->getZExtValue() <= unrollFullThreshold) loops.push_back(forOp); }); for (auto forOp : loops) diff --git a/mlir/lib/Dialect/Affine/Transforms/PipelineDataTransfer.cpp b/mlir/lib/Dialect/Affine/Transforms/PipelineDataTransfer.cpp index d84cb4f0cde5f..575b529658127 100644 --- a/mlir/lib/Dialect/Affine/Transforms/PipelineDataTransfer.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/PipelineDataTransfer.cpp @@ -245,8 +245,7 @@ static void findMatchingStartFinishInsts( /// 'forOp' is deleted, and a prologue, a new pipelined loop, and epilogue are /// inserted right before where it was. void PipelineDataTransfer::runOnAffineForOp(AffineForOp forOp) { - auto mayBeConstTripCount = getConstantTripCount(forOp); - if (!mayBeConstTripCount) { + if (!forOp.getStaticTripCount()) { LLVM_DEBUG(forOp.emitRemark("won't pipeline due to unknown trip count")); return; } diff --git a/mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp index 82247dcfe71ef..68296ea3368a1 100644 --- a/mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp @@ -357,7 +357,7 @@ FusionResult mlir::affine::canFuseLoops(AffineForOp srcForOp, static LogicalResult promoteSingleIterReductionLoop(AffineForOp forOp, bool siblingFusionUser) { // Check if the reduction loop is a single iteration loop. - std::optional tripCount = getConstantTripCount(forOp); + std::optional tripCount = forOp.getStaticTripCount(); if (!tripCount || *tripCount != 1) return failure(); auto *parentOp = forOp->getParentOp(); @@ -496,14 +496,14 @@ bool mlir::affine::getLoopNestStats(AffineForOp forOpRoot, // Record trip count for 'forOp'. Set flag if trip count is not // constant. 
- std::optional maybeConstTripCount = getConstantTripCount(forOp); + std::optional maybeConstTripCount = forOp.getStaticTripCount(); if (!maybeConstTripCount) { // Currently only constant trip count loop nests are supported. LDBG() << "Non-constant trip count unsupported"; return WalkResult::interrupt(); } - stats->tripCountMap[childForOp] = *maybeConstTripCount; + stats->tripCountMap[childForOp] = maybeConstTripCount->getZExtValue(); return WalkResult::advance(); }); return !walkResult.wasInterrupted(); diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp index 8f1249e3afaf0..90bc57e950cf1 100644 --- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp @@ -117,7 +117,7 @@ static void replaceIterArgsAndYieldResults(AffineForOp forOp) { /// Promotes the loop body of a forOp to its containing block if the forOp /// was known to have a single iteration. LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) { - std::optional tripCount = getConstantTripCount(forOp); + std::optional tripCount = forOp.getStaticTripCount(); if (!tripCount || *tripCount != 1) return failure(); @@ -239,12 +239,12 @@ LogicalResult mlir::affine::affineForOpBodySkew(AffineForOp forOp, // conditional guards (or context information to prevent such versioning). The // better way to pipeline for such loops is to first tile them and extract // constant trip count "full tiles" before applying this. 
- auto mayBeConstTripCount = getConstantTripCount(forOp); + auto mayBeConstTripCount = forOp.getStaticTripCount(); if (!mayBeConstTripCount) { LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled")); return success(); } - uint64_t tripCount = *mayBeConstTripCount; + uint64_t tripCount = mayBeConstTripCount->getZExtValue(); assert(isOpwiseShiftValid(forOp, shifts) && "shifts will lead to an invalid transformation\n"); @@ -707,8 +707,10 @@ constructTiledIndexSetHyperRect(MutableArrayRef origLoops, // Bounds for intra-tile loops. for (unsigned i = 0; i < width; i++) { int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]); - std::optional mayBeConstantCount = - getConstantTripCount(origLoops[i]); + AffineForOp forOp = origLoops[i]; + std::optional mayBeConstantCount = std::nullopt; + if (auto staticTripCount = forOp.getStaticTripCount()) + mayBeConstantCount = staticTripCount->getZExtValue(); // The lower bound is just the tile-space loop. AffineMap lbMap = b.getDimIdentityMap(); newLoops[width + i].setLowerBound( @@ -869,9 +871,9 @@ void mlir::affine::getPerfectlyNestedLoops( /// Unrolls this loop completely. LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) { - std::optional mayBeConstantTripCount = getConstantTripCount(forOp); + std::optional mayBeConstantTripCount = forOp.getStaticTripCount(); if (mayBeConstantTripCount.has_value()) { - uint64_t tripCount = *mayBeConstantTripCount; + uint64_t tripCount = mayBeConstantTripCount->getZExtValue(); if (tripCount == 0) return success(); if (tripCount == 1) @@ -885,10 +887,10 @@ LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) { /// whichever is lower. 
LogicalResult mlir::affine::loopUnrollUpToFactor(AffineForOp forOp, uint64_t unrollFactor) { - std::optional mayBeConstantTripCount = getConstantTripCount(forOp); + std::optional mayBeConstantTripCount = forOp.getStaticTripCount(); if (mayBeConstantTripCount.has_value() && - *mayBeConstantTripCount < unrollFactor) - return loopUnrollByFactor(forOp, *mayBeConstantTripCount); + mayBeConstantTripCount->ult(unrollFactor)) + return loopUnrollByFactor(forOp, mayBeConstantTripCount->getZExtValue()); return loopUnrollByFactor(forOp, unrollFactor); } @@ -998,7 +1000,9 @@ LogicalResult mlir::affine::loopUnrollByFactor( bool cleanUpUnroll) { assert(unrollFactor > 0 && "unroll factor should be positive"); - std::optional mayBeConstantTripCount = getConstantTripCount(forOp); + std::optional mayBeConstantTripCount = std::nullopt; + if (auto staticTripCount = forOp.getStaticTripCount()) + mayBeConstantTripCount = staticTripCount->getZExtValue(); if (unrollFactor == 1) { if (mayBeConstantTripCount == 1 && failed(promoteIfSingleIteration(forOp))) return failure(); @@ -1060,10 +1064,10 @@ LogicalResult mlir::affine::loopUnrollByFactor( LogicalResult mlir::affine::loopUnrollJamUpToFactor(AffineForOp forOp, uint64_t unrollJamFactor) { - std::optional mayBeConstantTripCount = getConstantTripCount(forOp); + std::optional mayBeConstantTripCount = forOp.getStaticTripCount(); if (mayBeConstantTripCount.has_value() && - *mayBeConstantTripCount < unrollJamFactor) - return loopUnrollJamByFactor(forOp, *mayBeConstantTripCount); + mayBeConstantTripCount->getZExtValue() < unrollJamFactor) + return loopUnrollJamByFactor(forOp, mayBeConstantTripCount->getZExtValue()); return loopUnrollJamByFactor(forOp, unrollJamFactor); } @@ -1085,7 +1089,9 @@ LogicalResult mlir::affine::loopUnrollJamByFactor(AffineForOp forOp, uint64_t unrollJamFactor) { assert(unrollJamFactor > 0 && "unroll jam factor should be positive"); - std::optional mayBeConstantTripCount = getConstantTripCount(forOp); + std::optional 
mayBeConstantTripCount = std::nullopt; + if (auto staticTripCount = forOp.getStaticTripCount()) + mayBeConstantTripCount = staticTripCount->getZExtValue(); if (unrollJamFactor == 1) { if (mayBeConstantTripCount == 1 && failed(promoteIfSingleIteration(forOp))) return failure(); diff --git a/mlir/test/Dialect/Affine/trip-count.mlir b/mlir/test/Dialect/Affine/trip-count.mlir new file mode 100644 index 0000000000000..e28e410fd2112 --- /dev/null +++ b/mlir/test/Dialect/Affine/trip-count.mlir @@ -0,0 +1,38 @@ +// This test ensures that the LoopLikeInterfaceOp methods required +// for op-agnostic trip count analysis work for affine.for. + +// RUN: mlir-opt %s -test-scf-for-utils --split-input-file | FileCheck %s + +// CHECK-LABEL: func.func @affine_constant_loops +func.func @affine_constant_loops() { + // CHECK: "test.trip-count" = 10 + affine.for %i = 0 to 10 { + affine.yield + } + // CHECK: "test.trip-count" = 5 + affine.for %i = 0 to 10 step 2 { + affine.yield + } + // CHECK: "test.trip-count" = 0 + affine.for %i = 10 to 0 { + affine.yield + } + return +} + +// ----- + +// CHECK-LABEL: func.func @affine_symbolic_loops +func.func @affine_symbolic_loops(%N : index) { + // CHECK: "test.trip-count" = "none" + affine.for %i = 0 to %N { + affine.yield + } + + // CHECK: "test.trip-count" = 4 + affine.for %i = max affine_map<(d0) -> (d0)>(%N) to min affine_map<(d0) -> (d0 + 4)>(%N) { + affine.yield + } + + return +} From 72af16e1a1cc3e1db9a5cd458f4ed410ca90cb87 Mon Sep 17 00:00:00 2001 From: aokblast Date: Fri, 19 Jun 2026 22:13:57 +0800 Subject: [PATCH 033/149] [clang][FreeBSD] Re-enable the crash-recovery test on FreeBSD (#192608) All of the tests work now on FreeBSD, so we re-enable the feature again on FreeBSD. 
--- clang/test/lit.cfg.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/clang/test/lit.cfg.py b/clang/test/lit.cfg.py index dc0d87f0a29a1..f7b3a77266cb8 100644 --- a/clang/test/lit.cfg.py +++ b/clang/test/lit.cfg.py @@ -336,9 +336,7 @@ def have_host_clang_repl_cuda(): "default-cxx-stdlib={}".format(config.clang_default_cxx_stdlib) ) -# As of 2011.08, crash-recovery tests still do not pass on FreeBSD. -if platform.system() not in ["FreeBSD"]: - config.available_features.add("crash-recovery") +config.available_features.add("crash-recovery") # ANSI escape sequences in non-dumb terminal if platform.system() not in ["Windows"]: From 825950238fffb9549b7fd81700bb241e1473c866 Mon Sep 17 00:00:00 2001 From: quic-k Date: Fri, 19 Jun 2026 19:46:14 +0530 Subject: [PATCH 034/149] [Clang][Hexagon] Predefine _GNU_SOURCE for C++ compilations (#201599) Predefine _GNU_SOURCE in C++ mode for H2, QuRT, and baremetal Hexagon targets. Signed-off-by: Kushal Pal --- clang/lib/Basic/Targets/Hexagon.cpp | 3 +++ clang/lib/Basic/Targets/OSTargets.h | 4 ++++ clang/test/Preprocessor/hexagon-predefines.c | 16 ++++++++++++++++ 3 files changed, 23 insertions(+) diff --git a/clang/lib/Basic/Targets/Hexagon.cpp b/clang/lib/Basic/Targets/Hexagon.cpp index 9bf34e67a03fd..615114f0fd1ea 100644 --- a/clang/lib/Basic/Targets/Hexagon.cpp +++ b/clang/lib/Basic/Targets/Hexagon.cpp @@ -116,6 +116,9 @@ void HexagonTargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2"); Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4"); Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8"); + + if (Opts.CPlusPlus && getTriple().getOS() == llvm::Triple::UnknownOS) + Builder.defineMacro("_GNU_SOURCE"); } bool HexagonTargetInfo::initFeatureMap( diff --git a/clang/lib/Basic/Targets/OSTargets.h b/clang/lib/Basic/Targets/OSTargets.h index 943373c20af32..9461680df8bdb 100644 --- a/clang/lib/Basic/Targets/OSTargets.h +++ 
b/clang/lib/Basic/Targets/OSTargets.h @@ -1092,6 +1092,8 @@ class LLVM_LIBRARY_VISIBILITY QURTTargetInfo : public OSTargetInfo { void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple, MacroBuilder &Builder) const override { Builder.defineMacro("__qurt__"); + if (Opts.CPlusPlus) + Builder.defineMacro("_GNU_SOURCE"); } public: @@ -1105,6 +1107,8 @@ class LLVM_LIBRARY_VISIBILITY H2TargetInfo : public OSTargetInfo { void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple, MacroBuilder &Builder) const override { Builder.defineMacro("__h2__"); + if (Opts.CPlusPlus) + Builder.defineMacro("_GNU_SOURCE"); } public: diff --git a/clang/test/Preprocessor/hexagon-predefines.c b/clang/test/Preprocessor/hexagon-predefines.c index cb3e9492ea07e..f115e6e0a9926 100644 --- a/clang/test/Preprocessor/hexagon-predefines.c +++ b/clang/test/Preprocessor/hexagon-predefines.c @@ -261,3 +261,19 @@ // CHECK-H2: #define __h2__ 1 // CHECK-H2: #define __hexagon__ 1 // CHECK-H2-NOT: #define __linux__ + +// RUN: %clang_cc1 -E -dM -triple hexagon-unknown-elf -x c++ %s | FileCheck \ +// RUN: %s -check-prefix CHECK-CXX-GNU +// CHECK-CXX-GNU: #define _GNU_SOURCE 1 + +// RUN: %clang_cc1 -E -dM -triple hexagon-unknown-h2 -x c++ %s | FileCheck \ +// RUN: %s -check-prefix CHECK-H2-CXX-GNU +// CHECK-H2-CXX-GNU: #define _GNU_SOURCE 1 + +// RUN: %clang_cc1 -E -dM -triple hexagon-unknown-qurt -x c++ %s | FileCheck \ +// RUN: %s -check-prefix CHECK-QURT-CXX-GNU +// CHECK-QURT-CXX-GNU: #define _GNU_SOURCE 1 + +// RUN: %clang_cc1 -E -dM -triple hexagon-unknown-elf %s | FileCheck \ +// RUN: %s -check-prefix CHECK-C-GNU +// CHECK-C-GNU-NOT: #define _GNU_SOURCE From 8eae496effc992b28c12c23374e89474956b9860 Mon Sep 17 00:00:00 2001 From: Vinay Deshmukh Date: Fri, 19 Jun 2026 10:19:34 -0400 Subject: [PATCH 035/149] [libc++] Make std::multimap constexpr as part of P3372R3 (#161901) Fixes #128661 Co-authored-by: Nikolas Klauser Co-authored-by: Louis Dionne --- libcxx/include/map | 265 
+++++++++++------- .../alg.foreach/for_each.associative.pass.cpp | 4 +- .../ranges.for_each.associative.pass.cpp | 8 +- .../map/map.cons/move_alloc.pass.cpp | 1 + .../map/map.cons/move_assign.pass.cpp | 1 + .../map/map.modifiers/merge.pass.cpp | 2 +- .../associative/map/map.ops/contains.pass.cpp | 3 +- .../map/map.ops/contains_transparent.pass.cpp | 6 +- .../associative/multimap/empty.pass.cpp | 14 +- .../multimap/get_allocator.pass.cpp | 14 +- .../multimap/incomplete_type.pass.cpp | 13 +- .../associative/multimap/iterator.pass.cpp | 36 ++- .../associative/multimap/max_size.pass.cpp | 14 +- .../multimap/multimap.cons/alloc.pass.cpp | 14 +- .../assign_initializer_list.pass.cpp | 14 +- .../multimap/multimap.cons/compare.pass.cpp | 14 +- .../multimap.cons/compare_alloc.pass.cpp | 14 +- .../multimap/multimap.cons/copy.pass.cpp | 11 +- .../multimap.cons/copy_alloc.pass.cpp | 12 +- .../multimap.cons/copy_assign.pass.cpp | 28 +- .../multimap/multimap.cons/deduct.pass.cpp | 12 +- .../multimap.cons/deduct_const.pass.cpp | 12 +- .../multimap/multimap.cons/default.pass.cpp | 14 +- .../multimap.cons/default_noexcept.pass.cpp | 14 +- .../multimap.cons/dtor_noexcept.pass.cpp | 14 +- .../multimap.cons/from_range.pass.cpp | 17 +- .../multimap.cons/initializer_list.pass.cpp | 14 +- .../initializer_list_compare.pass.cpp | 14 +- .../initializer_list_compare_alloc.pass.cpp | 14 +- .../multimap/multimap.cons/iter_iter.pass.cpp | 14 +- .../multimap.cons/iter_iter_comp.pass.cpp | 14 +- .../iter_iter_comp_alloc.pass.cpp | 14 +- .../multimap/multimap.cons/move.pass.cpp | 14 +- .../multimap.cons/move_alloc.pass.cpp | 15 +- .../multimap.cons/move_assign.pass.cpp | 16 +- .../multimap.cons/move_noexcept.pass.cpp | 14 +- .../multimap.erasure/erase_if.pass.cpp | 27 +- .../multimap.modifiers/clear.pass.cpp | 14 +- .../multimap.modifiers/emplace.pass.cpp | 111 ++++---- .../multimap.modifiers/emplace_hint.pass.cpp | 111 ++++---- .../multimap.modifiers/erase_iter.pass.cpp | 17 +- 
.../erase_iter_iter.pass.cpp | 14 +- .../multimap.modifiers/erase_key.pass.cpp | 14 +- .../extract_iterator.pass.cpp | 29 +- .../multimap.modifiers/extract_key.pass.cpp | 30 +- .../insert_allocator_requirements.pass.cpp | 2 +- .../multimap.modifiers/insert_cv.pass.cpp | 16 +- .../insert_initializer_list.pass.cpp | 14 +- .../insert_iter_cv.pass.cpp | 16 +- .../insert_iter_iter.pass.cpp | 11 +- .../insert_iter_rv.pass.cpp | 16 +- .../insert_node_type.pass.cpp | 26 +- .../insert_node_type_hint.pass.cpp | 24 +- .../multimap.modifiers/insert_range.pass.cpp | 16 +- .../multimap.modifiers/insert_rv.pass.cpp | 16 +- .../multimap.modifiers/merge.pass.cpp | 33 ++- .../compare.three_way.pass.cpp | 15 +- .../multimap.nonmember/op_compare.pass.cpp | 25 +- .../multimap.observers/key_comp.pass.cpp | 16 +- .../multimap.observers/value_comp.pass.cpp | 16 +- .../multimap/multimap.ops/count.pass.cpp | 14 +- .../multimap/multimap.ops/count0.pass.cpp | 14 +- .../multimap.ops/count_transparent.pass.cpp | 19 +- .../multimap.ops/equal_range.pass.cpp | 16 +- .../multimap.ops/equal_range0.pass.cpp | 16 +- .../equal_range_transparent.pass.cpp | 22 +- .../multimap/multimap.ops/find.pass.cpp | 18 +- .../multimap/multimap.ops/find0.pass.cpp | 16 +- .../multimap.ops/lower_bound.pass.cpp | 16 +- .../multimap.ops/lower_bound0.pass.cpp | 16 +- .../multimap.ops/upper_bound.pass.cpp | 16 +- .../multimap.ops/upper_bound0.pass.cpp | 16 +- .../multimap.special/member_swap.pass.cpp | 14 +- .../multimap.special/non_member_swap.pass.cpp | 14 +- .../multimap.special/swap_noexcept.pass.cpp | 14 +- .../multimap.value_compare/invoke.pass.cpp | 18 +- .../multimap.value_compare/types.pass.cpp | 12 +- .../associative/multimap/size.pass.cpp | 14 +- .../associative/multimap/types.pass.cpp | 12 +- .../container.node/node_handle.pass.cpp | 2 +- 80 files changed, 1226 insertions(+), 416 deletions(-) diff --git a/libcxx/include/map b/libcxx/include/map index c983a3ed07cd4..de0a475ee572f 100644 --- a/libcxx/include/map 
+++ b/libcxx/include/map @@ -1370,13 +1370,15 @@ public: __tree_.__node_handle_merge_unique(__source.__tree_); } template - _LIBCPP_HIDE_FROM_ABI void merge(multimap& __source) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void + merge(multimap& __source) { _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR( __source.get_allocator() == get_allocator(), "merging container with incompatible allocator"); __tree_.__node_handle_merge_unique(__source.__tree_); } template - _LIBCPP_HIDE_FROM_ABI void merge(multimap&& __source) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void + merge(multimap&& __source) { _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR( __source.get_allocator() == get_allocator(), "merging container with incompatible allocator"); __tree_.__node_handle_merge_unique(__source.__tree_); @@ -1739,10 +1741,11 @@ public: protected: key_compare comp; - _LIBCPP_HIDE_FROM_ABI value_compare(key_compare __c) : comp(__c) {} + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 value_compare(key_compare __c) : comp(__c) {} public: - _LIBCPP_HIDE_FROM_ABI bool operator()(const value_type& __x, const value_type& __y) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool + operator()(const value_type& __x, const value_type& __y) const { return comp(__x.first, __y.first); } }; @@ -1775,26 +1778,28 @@ public: template friend class multimap; - _LIBCPP_HIDE_FROM_ABI multimap() _NOEXCEPT_( + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 multimap() _NOEXCEPT_( is_nothrow_default_constructible::value&& is_nothrow_default_constructible::value&& is_nothrow_copy_constructible::value) : __tree_(__vc(key_compare())) {} - _LIBCPP_HIDE_FROM_ABI explicit multimap(const key_compare& __comp) _NOEXCEPT_( + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 explicit multimap(const key_compare& __comp) _NOEXCEPT_( is_nothrow_default_constructible::value&& is_nothrow_copy_constructible::value) : __tree_(__vc(__comp)) {} - _LIBCPP_HIDE_FROM_ABI explicit multimap(const 
key_compare& __comp, const allocator_type& __a) + _LIBCPP_HIDE_FROM_ABI + _LIBCPP_CONSTEXPR_SINCE_CXX26 explicit multimap(const key_compare& __comp, const allocator_type& __a) : __tree_(__vc(__comp), typename __base::allocator_type(__a)) {} template - _LIBCPP_HIDE_FROM_ABI multimap(_InputIterator __f, _InputIterator __l, const key_compare& __comp = key_compare()) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + multimap(_InputIterator __f, _InputIterator __l, const key_compare& __comp = key_compare()) : __tree_(__vc(__comp)) { insert(__f, __l); } template - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 multimap(_InputIterator __f, _InputIterator __l, const key_compare& __comp, const allocator_type& __a) : __tree_(__vc(__comp), typename __base::allocator_type(__a)) { insert(__f, __l); @@ -1802,7 +1807,7 @@ public: # if _LIBCPP_STD_VER >= 23 template <_ContainerCompatibleRange _Range> - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 multimap(from_range_t, _Range&& __range, const key_compare& __comp = key_compare(), @@ -1814,45 +1819,50 @@ public: # if _LIBCPP_STD_VER >= 14 template - _LIBCPP_HIDE_FROM_ABI multimap(_InputIterator __f, _InputIterator __l, const allocator_type& __a) + _LIBCPP_HIDE_FROM_ABI + _LIBCPP_CONSTEXPR_SINCE_CXX26 multimap(_InputIterator __f, _InputIterator __l, const allocator_type& __a) : multimap(__f, __l, key_compare(), __a) {} # endif # if _LIBCPP_STD_VER >= 23 template <_ContainerCompatibleRange _Range> - _LIBCPP_HIDE_FROM_ABI multimap(from_range_t, _Range&& __range, const allocator_type& __a) + _LIBCPP_HIDE_FROM_ABI + _LIBCPP_CONSTEXPR_SINCE_CXX26 multimap(from_range_t, _Range&& __range, const allocator_type& __a) : multimap(from_range, std::forward<_Range>(__range), key_compare(), __a) {} # endif - _LIBCPP_HIDE_FROM_ABI multimap(const multimap& __m) = default; + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 multimap(const multimap& __m) = default; - 
_LIBCPP_HIDE_FROM_ABI multimap& operator=(const multimap& __m) = default; + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 multimap& operator=(const multimap& __m) = default; # ifndef _LIBCPP_CXX03_LANG - _LIBCPP_HIDE_FROM_ABI multimap(multimap&& __m) = default; + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 multimap(multimap&& __m) = default; - _LIBCPP_HIDE_FROM_ABI multimap(multimap&& __m, const allocator_type& __a) : __tree_(std::move(__m.__tree_), __a) {} + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 multimap(multimap&& __m, const allocator_type& __a) + : __tree_(std::move(__m.__tree_), __a) {} - _LIBCPP_HIDE_FROM_ABI multimap& operator=(multimap&& __m) = default; + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 multimap& operator=(multimap&& __m) = default; - _LIBCPP_HIDE_FROM_ABI multimap(initializer_list __il, const key_compare& __comp = key_compare()) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + multimap(initializer_list __il, const key_compare& __comp = key_compare()) : __tree_(__vc(__comp)) { insert(__il.begin(), __il.end()); } - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 multimap(initializer_list __il, const key_compare& __comp, const allocator_type& __a) : __tree_(__vc(__comp), typename __base::allocator_type(__a)) { insert(__il.begin(), __il.end()); } # if _LIBCPP_STD_VER >= 14 - _LIBCPP_HIDE_FROM_ABI multimap(initializer_list __il, const allocator_type& __a) + _LIBCPP_HIDE_FROM_ABI + _LIBCPP_CONSTEXPR_SINCE_CXX26 multimap(initializer_list __il, const allocator_type& __a) : multimap(__il, key_compare(), __a) {} # endif - _LIBCPP_HIDE_FROM_ABI multimap& operator=(initializer_list __il) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 multimap& operator=(initializer_list __il) { clear(); insert(__il.begin(), __il.end()); return *this; @@ -1860,193 +1870,247 @@ public: # endif // _LIBCPP_CXX03_LANG - _LIBCPP_HIDE_FROM_ABI explicit multimap(const allocator_type& __a) : 
__tree_(typename __base::allocator_type(__a)) {} + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 explicit multimap(const allocator_type& __a) + : __tree_(typename __base::allocator_type(__a)) {} - _LIBCPP_HIDE_FROM_ABI multimap(const multimap& __m, const allocator_type& __a) : __tree_(__m.__tree_, __a) {} + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 multimap(const multimap& __m, const allocator_type& __a) + : __tree_(__m.__tree_, __a) {} - _LIBCPP_HIDE_FROM_ABI ~multimap() { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 ~multimap() { static_assert(sizeof(std::__diagnose_non_const_comparator<_Key, _Compare>()), ""); } - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return __tree_.begin(); } - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { return __tree_.begin(); } - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return __tree_.end(); } - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { return __tree_.end(); } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator begin() _NOEXCEPT { + return __tree_.begin(); + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator begin() const _NOEXCEPT { + return __tree_.begin(); + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator end() _NOEXCEPT { + return __tree_.end(); + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator end() const _NOEXCEPT { + return __tree_.end(); + } - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI reverse_iterator rbegin() _NOEXCEPT { return reverse_iterator(end()); } - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rbegin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 reverse_iterator rbegin() _NOEXCEPT { + return reverse_iterator(end()); + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator + rbegin() const _NOEXCEPT { return const_reverse_iterator(end()); } - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI reverse_iterator rend() _NOEXCEPT { return reverse_iterator(begin()); } - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rend() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 reverse_iterator rend() _NOEXCEPT { + return reverse_iterator(begin()); + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator rend() const _NOEXCEPT { return const_reverse_iterator(begin()); } - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT { return begin(); } - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT { return end(); } - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crbegin() const _NOEXCEPT { return rbegin(); } - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crend() const _NOEXCEPT { return rend(); } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator cbegin() const _NOEXCEPT { + return begin(); + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator cend() const _NOEXCEPT { + return end(); + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator + crbegin() const _NOEXCEPT { + return rbegin(); + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator crend() const _NOEXCEPT { + return rend(); + } - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI bool empty() const _NOEXCEPT { return __tree_.size() == 0; } - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI size_type size() const _NOEXCEPT { return __tree_.size(); } - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT { return __tree_.max_size(); } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX26 bool empty() const _NOEXCEPT { + return __tree_.size() == 0; + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type size() const _NOEXCEPT { + return __tree_.size(); + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type max_size() const _NOEXCEPT { + return __tree_.max_size(); + } - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI allocator_type get_allocator() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 allocator_type get_allocator() const _NOEXCEPT { return allocator_type(__tree_.__alloc()); } - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI key_compare key_comp() const { return __tree_.value_comp().key_comp(); } - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI value_compare value_comp() const { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 key_compare key_comp() const { + return __tree_.value_comp().key_comp(); + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 value_compare value_comp() const { return value_compare(__tree_.value_comp().key_comp()); } # ifndef _LIBCPP_CXX03_LANG template - _LIBCPP_HIDE_FROM_ABI iterator emplace(_Args&&... __args) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator emplace(_Args&&... __args) { return __tree_.__emplace_multi(std::forward<_Args>(__args)...); } template - _LIBCPP_HIDE_FROM_ABI iterator emplace_hint(const_iterator __p, _Args&&... __args) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator emplace_hint(const_iterator __p, _Args&&... 
__args) { return __tree_.__emplace_hint_multi(__p.__i_, std::forward<_Args>(__args)...); } template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI iterator insert(_Pp&& __p) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(_Pp&& __p) { return __tree_.__emplace_multi(std::forward<_Pp>(__p)); } template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __pos, _Pp&& __p) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const_iterator __pos, _Pp&& __p) { return __tree_.__emplace_hint_multi(__pos.__i_, std::forward<_Pp>(__p)); } - _LIBCPP_HIDE_FROM_ABI iterator insert(value_type&& __v) { return __tree_.__emplace_multi(std::move(__v)); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(value_type&& __v) { + return __tree_.__emplace_multi(std::move(__v)); + } - _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, value_type&& __v) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const_iterator __p, value_type&& __v) { return __tree_.__emplace_hint_multi(__p.__i_, std::move(__v)); } - _LIBCPP_HIDE_FROM_ABI void insert(initializer_list __il) { insert(__il.begin(), __il.end()); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void insert(initializer_list __il) { + insert(__il.begin(), __il.end()); + } # endif // _LIBCPP_CXX03_LANG - _LIBCPP_HIDE_FROM_ABI iterator insert(const value_type& __v) { return __tree_.__emplace_multi(__v); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const value_type& __v) { + return __tree_.__emplace_multi(__v); + } - _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, const value_type& __v) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const_iterator __p, const value_type& __v) { return __tree_.__emplace_hint_multi(__p.__i_, __v); } template - _LIBCPP_HIDE_FROM_ABI void insert(_InputIterator __f, _InputIterator __l) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 
void insert(_InputIterator __f, _InputIterator __l) { __tree_.__insert_range_multi(__f, __l); } # if _LIBCPP_STD_VER >= 23 template <_ContainerCompatibleRange _Range> - _LIBCPP_HIDE_FROM_ABI void insert_range(_Range&& __range) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void insert_range(_Range&& __range) { __tree_.__insert_range_multi(ranges::begin(__range), ranges::end(__range)); } # endif - _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __p) { return __tree_.erase(__p.__i_); } - _LIBCPP_HIDE_FROM_ABI iterator erase(iterator __p) { return __tree_.erase(__p.__i_); } - _LIBCPP_HIDE_FROM_ABI size_type erase(const key_type& __k) { return __tree_.__erase_multi(__k); } - _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __f, const_iterator __l) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator erase(const_iterator __p) { + return __tree_.erase(__p.__i_); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator erase(iterator __p) { return __tree_.erase(__p.__i_); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type erase(const key_type& __k) { + return __tree_.__erase_multi(__k); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator erase(const_iterator __f, const_iterator __l) { return __tree_.erase(__f.__i_, __l.__i_); } # if _LIBCPP_STD_VER >= 17 - _LIBCPP_HIDE_FROM_ABI iterator insert(node_type&& __nh) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(node_type&& __nh) { _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR(__nh.empty() || __nh.get_allocator() == get_allocator(), "node_type with incompatible allocator passed to multimap::insert()"); return __tree_.template __node_handle_insert_multi(std::move(__nh)); } - _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, node_type&& __nh) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const_iterator __hint, node_type&& __nh) { _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR(__nh.empty() || __nh.get_allocator() == 
get_allocator(), "node_type with incompatible allocator passed to multimap::insert()"); return __tree_.template __node_handle_insert_multi(__hint.__i_, std::move(__nh)); } - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI node_type extract(key_type const& __key) { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 node_type extract(key_type const& __key) { return __tree_.template __node_handle_extract(__key); } - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI node_type extract(const_iterator __it) { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 node_type extract(const_iterator __it) { return __tree_.template __node_handle_extract(__it.__i_); } template - _LIBCPP_HIDE_FROM_ABI void merge(multimap& __source) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void + merge(multimap& __source) { _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR( __source.get_allocator() == get_allocator(), "merging container with incompatible allocator"); return __tree_.__node_handle_merge_multi(__source.__tree_); } template - _LIBCPP_HIDE_FROM_ABI void merge(multimap&& __source) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void + merge(multimap&& __source) { _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR( __source.get_allocator() == get_allocator(), "merging container with incompatible allocator"); return __tree_.__node_handle_merge_multi(__source.__tree_); } template - _LIBCPP_HIDE_FROM_ABI void merge(map& __source) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void + merge(map& __source) { _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR( __source.get_allocator() == get_allocator(), "merging container with incompatible allocator"); return __tree_.__node_handle_merge_multi(__source.__tree_); } template - _LIBCPP_HIDE_FROM_ABI void merge(map&& __source) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void + merge(map&& __source) { _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR( __source.get_allocator() == get_allocator(), "merging container with incompatible 
allocator"); return __tree_.__node_handle_merge_multi(__source.__tree_); } # endif - _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { __tree_.clear(); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void clear() _NOEXCEPT { __tree_.clear(); } - _LIBCPP_HIDE_FROM_ABI void swap(multimap& __m) _NOEXCEPT_(__is_nothrow_swappable_v<__base>) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void swap(multimap& __m) + _NOEXCEPT_(__is_nothrow_swappable_v<__base>) { __tree_.swap(__m.__tree_); } - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI iterator find(const key_type& __k) { return __tree_.find(__k); } - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI const_iterator find(const key_type& __k) const { return __tree_.find(__k); } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator find(const key_type& __k) { + return __tree_.find(__k); + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator find(const key_type& __k) const { + return __tree_.find(__k); + } # if _LIBCPP_STD_VER >= 14 template || __is_transparently_comparable_v<_Comp, key_type, _K2>, int> = 0> - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI iterator find(const _K2& __k) { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator find(const _K2& __k) { return __tree_.find(__k); } template || __is_transparently_comparable_v<_Comp, key_type, _K2>, int> = 0> - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI const_iterator find(const _K2& __k) const { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator find(const _K2& __k) const { return __tree_.find(__k); } # endif - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI size_type count(const key_type& __k) const { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type count(const key_type& __k) const { return __tree_.__count_multi(__k); } # if _LIBCPP_STD_VER >= 14 template , int> = 0> - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI size_type count(const 
_K2& __k) const { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type count(const _K2& __k) const { return __tree_.__count_multi(__k); } # endif # if _LIBCPP_STD_VER >= 20 - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI bool contains(const key_type& __k) const { return find(__k) != end(); } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool contains(const key_type& __k) const { + return find(__k) != end(); + } template || __is_transparently_comparable_v<_Comp, key_type, _K2>, int> = 0> - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI bool contains(const _K2& __k) const { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool contains(const _K2& __k) const { return find(__k) != end(); } # endif // _LIBCPP_STD_VER >= 20 - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const key_type& __k) { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator lower_bound(const key_type& __k) { return __tree_.__lower_bound_multi(__k); } - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const key_type& __k) const { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator + lower_bound(const key_type& __k) const { return __tree_.__lower_bound_multi(__k); } @@ -2054,23 +2118,25 @@ public: template || __is_transparently_comparable_v<_Comp, key_type, _K2>, int> = 0> - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const _K2& __k) { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator lower_bound(const _K2& __k) { return __tree_.__lower_bound_multi(__k); } template || __is_transparently_comparable_v<_Comp, key_type, _K2>, int> = 0> - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const _K2& __k) const { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator + lower_bound(const _K2& __k) const { return __tree_.__lower_bound_multi(__k); } # endif 
- [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const key_type& __k) { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator upper_bound(const key_type& __k) { return __tree_.__upper_bound_multi(__k); } - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const key_type& __k) const { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator + upper_bound(const key_type& __k) const { return __tree_.__upper_bound_multi(__k); } @@ -2078,30 +2144,35 @@ public: template || __is_transparently_comparable_v<_Comp, key_type, _K2>, int> = 0> - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const _K2& __k) { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator upper_bound(const _K2& __k) { return __tree_.__upper_bound_multi(__k); } template || __is_transparently_comparable_v<_Comp, key_type, _K2>, int> = 0> - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const _K2& __k) const { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator + upper_bound(const _K2& __k) const { return __tree_.__upper_bound_multi(__k); } # endif - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI pair equal_range(const key_type& __k) { + [[__nodiscard__]] + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair equal_range(const key_type& __k) { return __tree_.__equal_range_multi(__k); } - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI pair equal_range(const key_type& __k) const { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair + equal_range(const key_type& __k) const { return __tree_.__equal_range_multi(__k); } # if _LIBCPP_STD_VER >= 14 template , int> = 0> - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI pair equal_range(const _K2& __k) { + [[__nodiscard__]] + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair equal_range(const _K2& __k) { return __tree_.__equal_range_multi(__k); } template , int> = 0> - 
[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI pair equal_range(const _K2& __k) const { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair + equal_range(const _K2& __k) const { return __tree_.__equal_range_multi(__k); } # endif @@ -2175,7 +2246,7 @@ struct __specialized_algorithm<_Algorithm::__for_each, __single_range - _LIBCPP_HIDE_FROM_ABI static auto operator()(_Map&& __map, _Func __func, _Proj __proj) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 static auto operator()(_Map&& __map, _Func __func, _Proj __proj) { auto [_, __func2] = __specialized_algorithm<_Algorithm::__for_each, __single_range>()( __map.__tree_, std::move(__func), std::move(__proj)); return std::make_pair(__map.end(), std::move(__func2)); @@ -2184,7 +2255,7 @@ struct __specialized_algorithm<_Algorithm::__for_each, __single_range -inline _LIBCPP_HIDE_FROM_ABI bool +inline _LIBCPP_HIDE_FROM_ABI bool _LIBCPP_CONSTEXPR_SINCE_CXX26 operator==(const multimap<_Key, _Tp, _Compare, _Allocator>& __x, const multimap<_Key, _Tp, _Compare, _Allocator>& __y) { return __x.size() == __y.size() && std::equal(__x.begin(), __x.end(), __y.begin()); } @@ -2192,31 +2263,31 @@ operator==(const multimap<_Key, _Tp, _Compare, _Allocator>& __x, const multimap< # if _LIBCPP_STD_VER <= 17 template -inline _LIBCPP_HIDE_FROM_ABI bool +inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool operator<(const multimap<_Key, _Tp, _Compare, _Allocator>& __x, const multimap<_Key, _Tp, _Compare, _Allocator>& __y) { return std::lexicographical_compare(__x.begin(), __x.end(), __y.begin(), __y.end()); } template -inline _LIBCPP_HIDE_FROM_ABI bool +inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool operator!=(const multimap<_Key, _Tp, _Compare, _Allocator>& __x, const multimap<_Key, _Tp, _Compare, _Allocator>& __y) { return !(__x == __y); } template -inline _LIBCPP_HIDE_FROM_ABI bool +inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool operator>(const multimap<_Key, _Tp, 
_Compare, _Allocator>& __x, const multimap<_Key, _Tp, _Compare, _Allocator>& __y) { return __y < __x; } template -inline _LIBCPP_HIDE_FROM_ABI bool +inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool operator>=(const multimap<_Key, _Tp, _Compare, _Allocator>& __x, const multimap<_Key, _Tp, _Compare, _Allocator>& __y) { return !(__x < __y); } template -inline _LIBCPP_HIDE_FROM_ABI bool +inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool operator<=(const multimap<_Key, _Tp, _Compare, _Allocator>& __x, const multimap<_Key, _Tp, _Compare, _Allocator>& __y) { return !(__y < __x); } @@ -2224,7 +2295,7 @@ operator<=(const multimap<_Key, _Tp, _Compare, _Allocator>& __x, const multimap< # else // #if _LIBCPP_STD_VER <= 17 template -_LIBCPP_HIDE_FROM_ABI __synth_three_way_result> +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 __synth_three_way_result> operator<=>(const multimap<_Key, _Tp, _Compare, _Allocator>& __x, const multimap<_Key, _Tp, _Compare, _Allocator>& __y) { return std::lexicographical_compare_three_way(__x.begin(), __x.end(), __y.begin(), __y.end(), __synth_three_way); @@ -2233,7 +2304,7 @@ operator<=>(const multimap<_Key, _Tp, _Compare, _Allocator>& __x, # endif // #if _LIBCPP_STD_VER <= 17 template -inline _LIBCPP_HIDE_FROM_ABI void +inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void swap(multimap<_Key, _Tp, _Compare, _Allocator>& __x, multimap<_Key, _Tp, _Compare, _Allocator>& __y) _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) { __x.swap(__y); @@ -2241,7 +2312,7 @@ swap(multimap<_Key, _Tp, _Compare, _Allocator>& __x, multimap<_Key, _Tp, _Compar # if _LIBCPP_STD_VER >= 20 template -inline _LIBCPP_HIDE_FROM_ABI typename multimap<_Key, _Tp, _Compare, _Allocator>::size_type +inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 typename multimap<_Key, _Tp, _Compare, _Allocator>::size_type erase_if(multimap<_Key, _Tp, _Compare, _Allocator>& __c, _Predicate __pred) { return std::__libcpp_erase_if_container(__c, __pred); } 
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each.associative.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each.associative.pass.cpp index 0fcd3ab27635a..645e1e4af792e 100644 --- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each.associative.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each.associative.pass.cpp @@ -78,9 +78,7 @@ TEST_CONSTEXPR_CXX26 bool test() { if (!TEST_IS_CONSTANT_EVALUATED) test_node_container >([](int i) { return i; }); test_node_container >([](int i) { return std::make_pair(i, i); }); - // FIXME: remove when multimap is made constexpr - if (!TEST_IS_CONSTANT_EVALUATED) - test_node_container >([](int i) { return std::make_pair(i, i); }); + test_node_container >([](int i) { return std::make_pair(i, i); }); return true; } diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.associative.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.associative.pass.cpp index 0a1bbe024cffa..e0186654e4bbc 100644 --- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.associative.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.associative.pass.cpp @@ -258,11 +258,9 @@ TEST_CONSTEXPR_CXX26 bool test() { // FIXME: remove when multiset is made constexpr test_node_container >([](int i) { return i; }); - - // FIXME: remove when multimap is made constexpr - test_node_container >([](int i) { return std::make_pair(i, i); }); } test_node_container >([](int i) { return std::make_pair(i, i); }); + test_node_container >([](int i) { return std::make_pair(i, i); }); if (!TEST_IS_CONSTANT_EVALUATED) { // FIXME: remove when set is made constexpr @@ -270,12 +268,10 @@ TEST_CONSTEXPR_CXX26 bool test() { // FIXME: remove when multiset is made constexpr test_invoke_set_like(); - - // FIXME: remove when multimap is made constexpr - test_invoke_map_like(); } 
test_invoke_map_like(); + test_invoke_map_like(); return true; } diff --git a/libcxx/test/std/containers/associative/map/map.cons/move_alloc.pass.cpp b/libcxx/test/std/containers/associative/map/map.cons/move_alloc.pass.cpp index 169639f5afa78..a7bdbea189f8f 100644 --- a/libcxx/test/std/containers/associative/map/map.cons/move_alloc.pass.cpp +++ b/libcxx/test/std/containers/associative/map/map.cons/move_alloc.pass.cpp @@ -161,6 +161,7 @@ int main(int, char**) { #if TEST_STD_VER >= 26 // FIXME: It is not yet possible to replace a `const MoveOnly` key subobject during constant evaluation. + // See https://github.com/llvm/llvm-project/issues/204617. // static_assert(test_move_alloc()); static_assert(test_move_alloc()); #endif diff --git a/libcxx/test/std/containers/associative/map/map.cons/move_assign.pass.cpp b/libcxx/test/std/containers/associative/map/map.cons/move_assign.pass.cpp index 4fffb2fe04300..eeea47fc94d18 100644 --- a/libcxx/test/std/containers/associative/map/map.cons/move_assign.pass.cpp +++ b/libcxx/test/std/containers/associative/map/map.cons/move_assign.pass.cpp @@ -107,6 +107,7 @@ int main(int, char**) { #if TEST_STD_VER >= 26 // FIXME: It is not yet possible to replace a `const MoveOnly` key subobject during constant evaluation. + // See https://github.com/llvm/llvm-project/issues/204617. 
// static_assert(test_move_assign()); static_assert(test_move_assign()); #endif diff --git a/libcxx/test/std/containers/associative/map/map.modifiers/merge.pass.cpp b/libcxx/test/std/containers/associative/map/map.modifiers/merge.pass.cpp index c46234a8ff7fc..c3f8f266a929b 100644 --- a/libcxx/test/std/containers/associative/map/map.modifiers/merge.pass.cpp +++ b/libcxx/test/std/containers/associative/map/map.modifiers/merge.pass.cpp @@ -132,7 +132,7 @@ bool test() { first.merge(second); first.merge(std::move(second)); } - if (!TEST_IS_CONSTANT_EVALUATED) { + { std::multimap second; first.merge(second); first.merge(std::move(second)); diff --git a/libcxx/test/std/containers/associative/map/map.ops/contains.pass.cpp b/libcxx/test/std/containers/associative/map/map.ops/contains.pass.cpp index 64ccb401e134c..e93b12bd71370 100644 --- a/libcxx/test/std/containers/associative/map/map.ops/contains.pass.cpp +++ b/libcxx/test/std/containers/associative/map/map.ops/contains.pass.cpp @@ -49,8 +49,7 @@ TEST_CONSTEXPR_CXX26 bool test() { test, std::pair >( -1, std::make_pair(1, E{}), std::make_pair(2, E{}), std::make_pair(3, E{}), std::make_pair(4, E{})); } - // FIXME: remove when multimap is made constexpr - if (!TEST_IS_CONSTANT_EVALUATED) { + { test, std::pair >( 'e', std::make_pair('a', 10), std::make_pair('b', 11), std::make_pair('c', 12), std::make_pair('d', 13)); diff --git a/libcxx/test/std/containers/associative/map/map.ops/contains_transparent.pass.cpp b/libcxx/test/std/containers/associative/map/map.ops/contains_transparent.pass.cpp index 778bd312469a9..4b9038f7121d6 100644 --- a/libcxx/test/std/containers/associative/map/map.ops/contains_transparent.pass.cpp +++ b/libcxx/test/std/containers/associative/map/map.ops/contains_transparent.pass.cpp @@ -41,11 +41,7 @@ TEST_CONSTEXPR_CXX26 bool test() { TEST_CONSTEXPR_CXX26 bool test() { test, int, Comp> >(); - - // FIXME: remove when multimap is made constexpr - if (!TEST_IS_CONSTANT_EVALUATED) { - test, int, Comp> >(); - 
} + test, int, Comp> >(); return true; } diff --git a/libcxx/test/std/containers/associative/multimap/empty.pass.cpp b/libcxx/test/std/containers/associative/multimap/empty.pass.cpp index c183cc385a916..bb7b6624b2991 100644 --- a/libcxx/test/std/containers/associative/multimap/empty.pass.cpp +++ b/libcxx/test/std/containers/associative/multimap/empty.pass.cpp @@ -10,7 +10,7 @@ // class multimap -// bool empty() const; +// bool empty() const; // constexpr since C++26 #include #include @@ -18,7 +18,8 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 +bool test() { { typedef std::multimap M; M m; @@ -40,5 +41,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + test(); + +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/associative/multimap/get_allocator.pass.cpp b/libcxx/test/std/containers/associative/multimap/get_allocator.pass.cpp index 102dd0b5a36cc..92c128a9e5010 100644 --- a/libcxx/test/std/containers/associative/multimap/get_allocator.pass.cpp +++ b/libcxx/test/std/containers/associative/multimap/get_allocator.pass.cpp @@ -10,7 +10,7 @@ // class multimap -// allocator_type get_allocator() const +// allocator_type get_allocator() const // constexpr since C++26 #include #include @@ -19,7 +19,8 @@ #include "test_allocator.h" #include "test_macros.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 +bool test() { typedef std::pair ValueType; { std::allocator alloc; @@ -32,5 +33,14 @@ int main(int, char**) { assert(m.get_allocator() == alloc); } + return true; +} + +int main(int, char**) { + test(); + +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/associative/multimap/incomplete_type.pass.cpp b/libcxx/test/std/containers/associative/multimap/incomplete_type.pass.cpp index 470275aea064b..3bcd40cad9838 100644 --- 
a/libcxx/test/std/containers/associative/multimap/incomplete_type.pass.cpp +++ b/libcxx/test/std/containers/associative/multimap/incomplete_type.pass.cpp @@ -26,11 +26,22 @@ struct A { inline bool operator==(A const& L, A const& R) { return &L == &R; } inline bool operator<(A const& L, A const& R) { return L.data < R.data; } -int main(int, char**) { + +TEST_CONSTEXPR_CXX26 +bool test() { A a; // Make sure that the allocator isn't rebound to and incomplete type std::multimap, complete_type_allocator > > m; + return true; +} + +int main(int, char**) { + test(); + +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/associative/multimap/iterator.pass.cpp b/libcxx/test/std/containers/associative/multimap/iterator.pass.cpp index ffdc39ff35563..dfac6418fcde1 100644 --- a/libcxx/test/std/containers/associative/multimap/iterator.pass.cpp +++ b/libcxx/test/std/containers/associative/multimap/iterator.pass.cpp @@ -10,20 +10,20 @@ // class multimap -// iterator begin(); -// const_iterator begin() const; -// iterator end(); -// const_iterator end() const; +// iterator begin(); // constexpr since C++26 +// const_iterator begin() const; // constexpr since C++26 +// iterator end(); // constexpr since C++26 +// const_iterator end() const; // constexpr since C++26 // -// reverse_iterator rbegin(); -// const_reverse_iterator rbegin() const; -// reverse_iterator rend(); -// const_reverse_iterator rend() const; +// reverse_iterator rbegin(); // constexpr since C++26 +// const_reverse_iterator rbegin() const; // constexpr since C++26 +// reverse_iterator rend(); // constexpr since C++26 +// const_reverse_iterator rend() const; // constexpr since C++26 // -// const_iterator cbegin() const; -// const_iterator cend() const; -// const_reverse_iterator crbegin() const; -// const_reverse_iterator crend() const; +// const_iterator cbegin() const; // constexpr since C++26 +// const_iterator cend() const; // constexpr since C++26 +// 
const_reverse_iterator crbegin() const; // constexpr since C++26 +// const_reverse_iterator crend() const; // constexpr since C++26 #include #include @@ -32,7 +32,8 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 +bool test() { { typedef std::pair V; V ar[] = {V(1, 1), V(1, 1.5), V(1, 2), V(2, 1), V(2, 1.5), V(2, 2), V(3, 1), V(3, 1.5), @@ -165,5 +166,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + test(); + +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/associative/multimap/max_size.pass.cpp b/libcxx/test/std/containers/associative/multimap/max_size.pass.cpp index c6208d27336b5..4d78bdaf99625 100644 --- a/libcxx/test/std/containers/associative/multimap/max_size.pass.cpp +++ b/libcxx/test/std/containers/associative/multimap/max_size.pass.cpp @@ -10,7 +10,7 @@ // class multimap -// size_type max_size() const; +// size_type max_size() const; // constexpr since C++26 #include #include @@ -20,7 +20,8 @@ #include "test_allocator.h" #include "test_macros.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 +bool test() { typedef std::pair KV; { typedef limited_allocator A; @@ -45,5 +46,14 @@ int main(int, char**) { assert(c.max_size() <= alloc_max_size(c.get_allocator())); } + return true; +} + +int main(int, char**) { + test(); + +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/associative/multimap/multimap.cons/alloc.pass.cpp b/libcxx/test/std/containers/associative/multimap/multimap.cons/alloc.pass.cpp index 10184633a82de..1d7af0ab1ffff 100644 --- a/libcxx/test/std/containers/associative/multimap/multimap.cons/alloc.pass.cpp +++ b/libcxx/test/std/containers/associative/multimap/multimap.cons/alloc.pass.cpp @@ -10,7 +10,7 @@ // class multimap -// explicit multimap(const allocator_type& a); +// explicit multimap(const allocator_type& a); // constexpr since C++26 
#include #include @@ -19,7 +19,8 @@ #include "test_allocator.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 +bool test() { { typedef std::less C; typedef test_allocator > A; @@ -47,5 +48,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + test(); + +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/associative/multimap/multimap.cons/assign_initializer_list.pass.cpp b/libcxx/test/std/containers/associative/multimap/multimap.cons/assign_initializer_list.pass.cpp index d1de8fab172cf..1c806c31f335c 100644 --- a/libcxx/test/std/containers/associative/multimap/multimap.cons/assign_initializer_list.pass.cpp +++ b/libcxx/test/std/containers/associative/multimap/multimap.cons/assign_initializer_list.pass.cpp @@ -12,7 +12,7 @@ // class multimap -// multimap& operator=(initializer_list il); +// multimap& operator=(initializer_list il); // constexpr since C++26 #include #include @@ -20,7 +20,8 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 +bool test() { { typedef std::multimap C; typedef C::value_type V; @@ -58,5 +59,14 @@ int main(int, char**) { assert(*++i == V(3, 2)); } + return true; +} + +int main(int, char**) { + test(); + +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/associative/multimap/multimap.cons/compare.pass.cpp b/libcxx/test/std/containers/associative/multimap/multimap.cons/compare.pass.cpp index 84584a427ead0..d9a2257dbacfc 100644 --- a/libcxx/test/std/containers/associative/multimap/multimap.cons/compare.pass.cpp +++ b/libcxx/test/std/containers/associative/multimap/multimap.cons/compare.pass.cpp @@ -10,7 +10,7 @@ // class multimap -// explicit multimap(const key_compare& comp); +// explicit multimap(const key_compare& comp); // constexpr since C++26 #include #include @@ -19,7 +19,8 @@ #include "../../../test_compare.h" 
#include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 +bool test() { { typedef test_less C; const std::multimap m(C(3)); @@ -37,5 +38,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + test(); + +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/associative/multimap/multimap.cons/compare_alloc.pass.cpp b/libcxx/test/std/containers/associative/multimap/multimap.cons/compare_alloc.pass.cpp index 207e7e271234f..a72bbabd376d3 100644 --- a/libcxx/test/std/containers/associative/multimap/multimap.cons/compare_alloc.pass.cpp +++ b/libcxx/test/std/containers/associative/multimap/multimap.cons/compare_alloc.pass.cpp @@ -10,7 +10,7 @@ // class multimap -// multimap(const key_compare& comp, const allocator_type& a); +// multimap(const key_compare& comp, const allocator_type& a); // constexpr since C++26 #include #include @@ -20,7 +20,8 @@ #include "test_allocator.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 +bool test() { { typedef test_less C; typedef test_allocator > A; @@ -51,5 +52,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + test(); + +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/associative/multimap/multimap.cons/copy.pass.cpp b/libcxx/test/std/containers/associative/multimap/multimap.cons/copy.pass.cpp index 724755d1ef655..999d6be70b3c8 100644 --- a/libcxx/test/std/containers/associative/multimap/multimap.cons/copy.pass.cpp +++ b/libcxx/test/std/containers/associative/multimap/multimap.cons/copy.pass.cpp @@ -10,7 +10,7 @@ // class multimap -// multimap(const multimap& m); +// multimap(const multimap& m); // constexpr since C++26 #include #include @@ -21,7 +21,7 @@ #include "min_allocator.h" template