[SLP] SLP's copyable elements based upon Main/Alt operations. #124242
Conversation
@llvm/pr-subscribers-vectorizers
Author: Dinar Temirbulatov (dtemirbulatov)
Changes: Added testcase from #110740.
Patch is 95.67 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/124242.diff. 2 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index c98d872fb6467f..47b61496b5e155 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -201,6 +201,10 @@ static cl::opt<bool> VectorizeNonPowerOf2( "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements.")); +static cl::opt<bool> + VectorizeCopyable("slp-vectorize-copyable", cl::init(false), cl::Hidden, + cl::desc("Try to vectorize with copyable elements.")); + // Limit the number of alias checks. The limit is chosen so that // it has no negative effect on the llvm benchmarks. static const unsigned AliasedCheckLimit = 10; @@ -426,6 +430,8 @@ static bool isVectorLikeInstWithConstOps(Value *V) { if (isa<ExtractElementInst>(I)) return isConstant(I->getOperand(1)); assert(isa<InsertElementInst>(V) && "Expected only insertelement."); + if (I->getNumOperands() < 2) + return false; return isConstant(I->getOperand(2)); } @@ -594,6 +600,41 @@ static std::optional<unsigned> getElementIndex(const Value *Inst, return Index; } +/// Checks if the \p Opcode can be considered as an operand of a (possibly) +/// binary operation \p I. +/// \returns The code of the binary operation of instruction \p I if the +/// instruction with \p Opcode can be considered as an operand of \p I with the +/// default value. +static unsigned tryToRepresentAsInstArg(unsigned Opcode, Instruction *I) { + if (Opcode != Instruction::PHI && Opcode != Instruction::Invoke && + !isa<FPMathOperator>(I) && + ((I->getType()->isIntegerTy() && + (I->getOpcode() == Instruction::Add || + I->getOpcode() == Instruction::And || + I->getOpcode() == Instruction::AShr || + I->getOpcode() == Instruction::BitCast || + I->getOpcode() == Instruction::Call || + // Issue with scheduling with isVectorLikeInstWithConstOps + // operations. + // I->getOpcode() == Instruction::ExtractElement || + // I->getOpcode() == Instruction::ExtractValue || + I->getOpcode() == Instruction::ICmp || + I->getOpcode() == Instruction::Load || + I->getOpcode() == Instruction::LShr || + I->getOpcode() == Instruction::Mul || + I->getOpcode() == Instruction::Or || + I->getOpcode() == Instruction::PtrToInt || + I->getOpcode() == Instruction::Select || + I->getOpcode() == Instruction::SExt || + I->getOpcode() == Instruction::Shl || + I->getOpcode() == Instruction::Sub || + I->getOpcode() == Instruction::Trunc || + I->getOpcode() == Instruction::Xor || + I->getOpcode() == Instruction::ZExt)))) + return I->getOpcode(); + return 0; +} + namespace { /// Specifies the way the mask should be analyzed for undefs/poisonous elements /// in the shuffle mask. @@ -853,6 +894,16 @@ class InstructionsState { } // end anonymous namespace +/// Chooses the correct key for scheduling data. If \p Op has the same (or +/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p +/// OpValue. +static Value *isOneOf(const InstructionsState &S, Value *Op) { + auto *I = dyn_cast<Instruction>(Op); + if (I && S.isOpcodeOrAlt(I)) + return Op; + return S.getMainOp(); +} + /// \returns true if \p Opcode is allowed as part of the main/alternate /// instruction for SLP vectorization. /// @@ -865,6 +916,14 @@ static bool isValidForAlternation(unsigned Opcode) { return true; } +// Check for inner dependencies, we could not support such depenedies if it +// comes from a main operaion, only from an alternative. 
+static bool checkCopyableInnerDep(ArrayRef<Value *> VL, + const InstructionsState &S); + +// Determine that the vector could be vectorized with copyable elements. +static bool isCopyableOp(ArrayRef<Value *> VL, Value *Main, Value *Alt); + static InstructionsState getSameOpcode(ArrayRef<Value *> VL, const TargetLibraryInfo &TLI); @@ -917,19 +976,53 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL, return InstructionsState::invalid(); Instruction *MainOp = cast<Instruction>(*It); + Instruction *AltOp = MainOp; + unsigned Opcode = MainOp->getOpcode(); + unsigned AltOpcode = Opcode; + for (Value *V : iterator_range(It + 1, VL.end())) { + Instruction *Inst = dyn_cast<Instruction>(V); + if (!Inst) + continue; + unsigned VOpcode = Inst->getOpcode(); + if (Inst && AltOpcode == Opcode && !isa<PHINode>(Inst) && + VOpcode != Opcode && isValidForAlternation(VOpcode)) { + AltOpcode = VOpcode; + AltOp = Inst; + break; + } + } unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>); if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) || (VL.size() == 2 && InstCnt < 2)) return InstructionsState::invalid(); + bool IsBinOp = isa<BinaryOperator>(MainOp); + bool IsCopyable = false; + if (MainOp && AltOp && MainOp != AltOp) { + if (!IsBinOp && isa<BinaryOperator>(AltOp) && !isa<PHINode>(MainOp)) { + std::swap(MainOp, AltOp); + std::swap(AltOpcode, Opcode); + IsBinOp = true; + } + IsCopyable = VectorizeCopyable && isCopyableOp(VL, MainOp, AltOp); + if (IsCopyable && isa<CmpInst>(AltOp)) { + Type *Ty0 = MainOp->getOperand(0)->getType(); + Type *Ty1 = AltOp->getOperand(0)->getType(); + if (Ty0 != Ty1) + return InstructionsState::invalid(); + } + if (!IsCopyable) { + MainOp = cast<Instruction>(*It); + AltOp = MainOp; + Opcode = MainOp->getOpcode(); + AltOpcode = Opcode; + IsBinOp = isa<BinaryOperator>(MainOp); + } + } bool IsCastOp = isa<CastInst>(MainOp); - bool IsBinOp = isa<BinaryOperator>(MainOp); bool IsCmpOp = isa<CmpInst>(MainOp); CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate() : CmpInst::BAD_ICMP_PREDICATE; - Instruction *AltOp = MainOp; - unsigned Opcode = MainOp->getOpcode(); - unsigned AltOpcode = Opcode; bool SwappedPredsCompatible = IsCmpOp && [&]() { SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds; @@ -984,7 +1077,7 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL, AltOp = I; continue; } - } else if (IsCastOp && isa<CastInst>(I)) { + } else if ((IsCastOp || IsCopyable) && isa<CastInst>(I)) { Value *Op0 = MainOp->getOperand(0); Type *Ty0 = Op0->getType(); Value *Op1 = I->getOperand(0); @@ -1001,13 +1094,15 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL, continue; } } - } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) { + } else if (auto *Inst = dyn_cast<CmpInst>(I); + Inst && (IsCmpOp || IsCopyable)) { auto *BaseInst = cast<CmpInst>(MainOp); Type *Ty0 = BaseInst->getOperand(0)->getType(); Type *Ty1 = Inst->getOperand(0)->getType(); if (Ty0 == Ty1) { - assert(InstOpcode == Opcode && "Expected same CmpInst opcode."); - assert(InstOpcode == AltOpcode && + assert((IsCopyable || InstOpcode == Opcode) && + "Expected same CmpInst opcode."); + assert((IsCopyable || InstOpcode == AltOpcode) && "Alternate instructions are only supported by BinaryOperator " "and CastInst."); // Check for compatible operands. 
If the corresponding operands are not @@ -1038,23 +1133,32 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL, AltPred == CurrentPred || AltPred == SwappedCurrentPred) continue; } - } else if (InstOpcode == Opcode) { - assert(InstOpcode == AltOpcode && + } else if (InstOpcode == Opcode || + (IsCopyable && InstOpcode == AltOpcode)) { + assert((IsCopyable || InstOpcode == AltOpcode) && "Alternate instructions are only supported by BinaryOperator and " "CastInst."); + Instruction *Op = MainOp; + if (IsCopyable) { + if (InstOpcode != Opcode && InstOpcode != AltOpcode) { + Op = I; + } else if (Opcode != AltOpcode && InstOpcode == AltOpcode) { + Op = AltOp; + } + } if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) { if (Gep->getNumOperands() != 2 || - Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType()) + Gep->getOperand(0)->getType() != Op->getOperand(0)->getType()) return InstructionsState::invalid(); } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) { if (!isVectorLikeInstWithConstOps(EI)) return InstructionsState::invalid(); } else if (auto *LI = dyn_cast<LoadInst>(I)) { - auto *BaseLI = cast<LoadInst>(MainOp); + auto *BaseLI = cast<LoadInst>(Op); if (!LI->isSimple() || !BaseLI->isSimple()) return InstructionsState::invalid(); } else if (auto *Call = dyn_cast<CallInst>(I)) { - auto *CallBase = cast<CallInst>(MainOp); + auto *CallBase = cast<CallInst>(Op); if (Call->getCalledFunction() != CallBase->getCalledFunction()) return InstructionsState::invalid(); if (Call->hasOperandBundles() && @@ -1069,13 +1173,14 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL, return InstructionsState::invalid(); if (!ID) { SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call); - if (Mappings.size() != BaseMappings.size() || - Mappings.front().ISA != BaseMappings.front().ISA || - Mappings.front().ScalarName != BaseMappings.front().ScalarName || - Mappings.front().VectorName != BaseMappings.front().VectorName || - Mappings.front().Shape.VF != BaseMappings.front().Shape.VF || - Mappings.front().Shape.Parameters != - BaseMappings.front().Shape.Parameters) + if (Mappings.size() && + (Mappings.size() != BaseMappings.size() || + Mappings.front().ISA != BaseMappings.front().ISA || + Mappings.front().ScalarName != BaseMappings.front().ScalarName || + Mappings.front().VectorName != BaseMappings.front().VectorName || + Mappings.front().Shape.VF != BaseMappings.front().Shape.VF || + Mappings.front().Shape.Parameters != + BaseMappings.front().Shape.Parameters)) return InstructionsState::invalid(); } } @@ -1124,6 +1229,46 @@ static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, } } +static bool checkCopyableInnerDep(ArrayRef<Value *> VL, + const InstructionsState &S) { + SmallSet<Value *, 4> Ops; + unsigned Opcode = S.getOpcode(); + for (Value *V : VL) { + auto *I = dyn_cast<Instruction>(V); + if (!I) + continue; + if (I->getOpcode() == Opcode) + Ops.insert(V); + } + for (Value *V : VL) { + auto *I = dyn_cast<Instruction>(V); + if (!I) + continue; + for (Use &U : I->operands()) { + if (auto *Op = dyn_cast<Instruction>(U.get()); Op && Ops.contains(Op)) + return false; + } + } + return true; +} + +static bool isCopyableOp(ArrayRef<Value *> VL, Value *Main, Value *Alt) { + if (any_of(VL, IsaPred<PoisonValue>) || Main == Alt || + !isa<BinaryOperator>(Main) || !isa<Instruction>(Alt) || + find_if(VL, IsaPred<PHINode>) != VL.end()) + return false; + + Instruction *MainOp = cast<Instruction>(Main); + Instruction *AltOp = cast<Instruction>(Alt); + + if 
(isa<BinaryOperator>(MainOp) && !isa<BinaryOperator>(AltOp) && + isValidForAlternation(MainOp->getOpcode()) && + isValidForAlternation(AltOp->getOpcode()) && + tryToRepresentAsInstArg(MainOp->getOpcode(), AltOp) && + tryToRepresentAsInstArg(AltOp->getOpcode(), MainOp)) + return true; + return false; +} /// \returns the AA location that is being access by the instruction. static MemoryLocation getLocation(Instruction *I) { if (StoreInst *SI = dyn_cast<StoreInst>(I)) @@ -1463,6 +1608,7 @@ class BoUpSLP { MultiNodeScalars.clear(); MustGather.clear(); NonScheduledFirst.clear(); + CopyableAltOp.clear(); EntryToLastInstruction.clear(); LoadEntriesToVectorize.clear(); IsGraphTransformMode = false; @@ -2461,8 +2607,16 @@ class BoUpSLP { } bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane])); bool APO = (OpIdx == 0) ? false : IsInverseOperation; - OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx), - APO, false}; + Instruction *Inst = cast<Instruction>(VL[Lane]); + if (Inst->getOpcode() != MainOp->getOpcode() && + OpIdx > (Inst->getNumOperands() - 1)) { + OpsVec[OpIdx][Lane] = { + PoisonValue::get(MainOp->getOperand(OpIdx)->getType()), true, + false}; + } else { + OpsVec[OpIdx][Lane] = { + cast<Instruction>(VL[Lane])->getOperand(OpIdx), APO, false}; + } } } } @@ -3298,6 +3452,7 @@ class BoUpSLP { ///< complex node like select/cmp to minmax, mul/add to ///< fma, etc. Must be used for the following nodes in ///< the pattern, not the very first one. + CopyableVectorize, ///< The node for copyable elements. }; EntryState State; @@ -3357,7 +3512,8 @@ class BoUpSLP { if (Operands.size() < OpIdx + 1) Operands.resize(OpIdx + 1); assert(Operands[OpIdx].empty() && "Already resized?"); - assert(OpVL.size() <= Scalars.size() && + assert((State == TreeEntry::CopyableVectorize || + OpVL.size() <= Scalars.size()) && "Number of operands is greater than the number of scalars."); Operands[OpIdx].resize(OpVL.size()); copy(OpVL, Operands[OpIdx].begin()); @@ -3401,7 +3557,9 @@ class BoUpSLP { } /// Some of the instructions in the list have alternate opcodes. - bool isAltShuffle() const { return S.isAltShuffle(); } + bool isAltShuffle() const { + return S.isAltShuffle() && State != TreeEntry::CopyableVectorize; + } bool isOpcodeOrAlt(Instruction *I) const { return S.isOpcodeOrAlt(I); } @@ -3524,6 +3682,9 @@ class BoUpSLP { case CombinedVectorize: dbgs() << "CombinedVectorize\n"; break; + case CopyableVectorize: + dbgs() << "CopyableVectorize\n"; + break; } if (S) { dbgs() << "MainOp: " << *S.getMainOp() << "\n"; @@ -3619,6 +3780,7 @@ class BoUpSLP { // for non-power-of-two vectors. assert( (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) || + EntryState == TreeEntry::CopyableVectorize || ReuseShuffleIndices.empty()) && "Reshuffling scalars not yet supported for nodes with padding"); Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), @@ -3642,8 +3804,13 @@ class BoUpSLP { Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); } if (!Last->isGather()) { - for (Value *V : VL) { + unsigned Opcode = S.getOpcode(); + for (unsigned i = 0; i < VL.size(); ++i) { + Value *V = VL[i]; const TreeEntry *TE = getTreeEntry(V); + Instruction *I = dyn_cast<Instruction>(V); + bool IsAltInst = (I) ? 
I->getOpcode() != Opcode : false; + assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) && "Scalar already in tree!"); if (TE) { @@ -3651,6 +3818,10 @@ class BoUpSLP { MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last); continue; } + if (EntryState == TreeEntry::CopyableVectorize && IsAltInst) { + CopyableAltOp.insert(V); + continue; + } ScalarToTreeEntry[V] = Last; } // Update the scheduler bundle to point to this TreeEntry. @@ -3725,6 +3896,10 @@ class BoUpSLP { bool areAltOperandsProfitable(const InstructionsState &S, ArrayRef<Value *> VL) const; + /// Check that we can represent operations as copyable with looking to + /// operations operands. + bool canRepresentAsCopyable(const InstructionsState &S, ArrayRef<Value *> VL); + /// Checks if the specified list of the instructions/values can be vectorized /// and fills required data before actual scheduling of the instructions. TreeEntry::EntryState @@ -3746,6 +3921,9 @@ class BoUpSLP { /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; + /// A set op scalars that we are considoring as copyable operations. + ValueSet CopyableAltOp; + /// A set of first non-schedulable values. ValueSet NonScheduledFirst; @@ -3875,15 +4053,16 @@ class BoUpSLP { ScheduleData() = default; - void init(int BlockSchedulingRegionID, Instruction *I) { + void init(int BlockSchedulingRegionID, Value *OpVal) { FirstInBundle = this; NextInBundle = nullptr; NextLoadStore = nullptr; IsScheduled = false; SchedulingRegionID = BlockSchedulingRegionID; clearDependencies(); - Inst = I; + OpValue = OpVal; TE = nullptr; + IsCopy = false; } /// Verify basic self consistency properties @@ -3990,6 +4169,9 @@ class BoUpSLP { Instruction *Inst = nullptr; + /// Opcode of the current instruction in the schedule data. + Value *OpValue = nullptr; + /// The TreeEntry that this instruction corresponds to. TreeEntry *TE = nullptr; @@ -4037,6 +4219,9 @@ class BoUpSLP { /// True if this instruction is scheduled (or considered as scheduled in the /// dry-run). bool IsScheduled = false; + + /// True if this instruction is copy. + bool IsCopy = false; }; #ifndef NDEBUG @@ -4106,6 +4291,31 @@ class BoUpSLP { return nullptr; } + ScheduleData *getScheduleData(Value *V, Value *Key) { + auto I = ExtraScheduleDataMap.find(V); + if (I != ExtraScheduleDataMap.end()) { + ScheduleData *SD = I->second.lookup(Key); + if (SD && isInSchedulingRegion(SD)) + return SD; + } + if (V == Key) + return getScheduleData(V); + return nullptr; + } + + ScheduleData *getScheduleData(Value *V, const TreeEntry *E) { + ScheduleData *SD = getScheduleData(V); + if (SD && isInSchedulingRegion(SD) && SD->TE == E) + return SD; + auto I = ExtraScheduleDataMap.find(V); + if (I == ExtraScheduleDataMap.end()) + return nullptr; + for (auto &P : I->second) + if (isInSchedulingRegion(P.second) && P.second->TE == E) + return P.second; + return nullptr; + } + bool isInSchedulingRegion(ScheduleData *SD) const { return SD->SchedulingRegionID == SchedulingRegionID; } @@ -4119,30 +4329,33 @@ class BoUpSLP { for (ScheduleData *BundleMember = SD; BundleMember; BundleMember = BundleMember->NextInBundle) { - // Handle the def-use chain dependencies. // Decrement the unscheduled counter and insert to ready list if ready. 
- auto &&DecrUnsched = [this, &ReadyList](Instruction *I) { - ScheduleData *OpDef = getScheduleData(I); - if (OpDef && OpDef->hasValidDependencies() && - OpDef->incrementUnscheduledDeps(-1) == 0) { - // There are no more unscheduled dependencies after - // decrementing, so we can put the dependent instruction - // into the ready list. - ScheduleData *DepBundle = OpDef->FirstInBundle; - assert(!DepBundle->IsScheduled && - "already scheduled bundle gets ready"); - ReadyList.insert(DepBundle); - LLVM_DEBUG(dbgs() - << "SLP: gets ready (def): " << *DepBundle << "\n"); - } + auto &&DecrUnsched = [this, &ReadyList, &Bu... [truncated] |
@llvm/pr-subscribers-llvm-transforms
Author: Dinar Temirbulatov (dtemirbulatov)
Changes: Added testcase from #110740.
Patch is 95.67 KiB; full version: https://github.com/llvm/llvm-project/pull/124242.diff
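To make the term concrete: "copyable elements" lets a bundle whose lanes almost all share a binary opcode still vectorize, by modeling the odd lane as an identity form of the main opcode (for an add bundle, a bare value %v is treated as add %v, 0). A minimal hypothetical IR sketch of such an input (illustrative only, not taken from the patch's test file; all names are made up):

    define void @copyable_lanes(ptr %src, ptr %dst) {
    entry:
      ; lanes 0-2 compute adds; lane 3 stores the bare load %l3
      %l0 = load i32, ptr %src
      %g1 = getelementptr inbounds i32, ptr %src, i64 1
      %l1 = load i32, ptr %g1
      %g2 = getelementptr inbounds i32, ptr %src, i64 2
      %l2 = load i32, ptr %g2
      %g3 = getelementptr inbounds i32, ptr %src, i64 3
      %l3 = load i32, ptr %g3
      %a0 = add i32 %l0, 1
      %a1 = add i32 %l1, 2
      %a2 = add i32 %l2, 3
      store i32 %a0, ptr %dst
      %d1 = getelementptr inbounds i32, ptr %dst, i64 1
      store i32 %a1, ptr %d1
      %d2 = getelementptr inbounds i32, ptr %dst, i64 2
      store i32 %a2, ptr %d2
      %d3 = getelementptr inbounds i32, ptr %dst, i64 3
      store i32 %l3, ptr %d3
      ret void
    }

With -mllvm -slp-vectorize-copyable, lane 3 can be modeled as add i32 %l3, 0, so all four lanes share the Add main opcode and form one <4 x i32> bundle.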
I foresee that the current scheduling model won't allow fixing all the bugs here. It needs to be fixed first to allow multi-scheduling of instructions; otherwise, there will always be bugs.
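"Multi-scheduling" here refers to one instruction needing to carry two bundle memberships at once: as a real member of its own bundle and as the copy lane of another. A hypothetical sketch of the situation (names illustrative):

    define void @two_roles(i32 %a, i32 %b, i32 %c, i32 %d, ptr %p) {
    entry:
      ; %x is a real add, a candidate member of an add bundle {%x, %y};
      ; it is also the copy lane of the mul bundle {%m, %x}, where it
      ; would be modeled as mul i32 %x, 1.
      %x = add i32 %a, %b
      %y = add i32 %c, %d
      %m = mul i32 %y, 3
      store i32 %m, ptr %p
      %p1 = getelementptr inbounds i32, ptr %p, i64 1
      store i32 %x, ptr %p1
      ret void
    }

A scheduler that keeps one ScheduleData per instruction cannot represent both roles, which appears to be what the patch's extra (Value, Key) schedule-data map works around.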
Force-pushed cd57ad9 to fe8d894; force-pushed 2fa5cc7 to bde8c3a.
…), removed ReschedMap from BoUpSLP::scheduleBlock().
…. Restored original logic to handle just homogeneous operations in getSameOpcode(). Removed checkCopyableInnerDep(), replacing its functionality with the scheduler.
✅ With the latest revision this PR passed the C/C++ code formatter.
Co-authored-by: Alexey Bataev <a.bataev@gmx.com>
Co-authored-by: Alexey Bataev <a.bataev@gmx.com>
    if (isa<PHINode>(I)) {
      return false;
    }
- Drop extra parens.
- Why can't a PHI be treated as copyable?
I wasn't sure at first, but we cannot schedule a PHI with the scheduler. Dropped the extra parens. Thanks.
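The constraint on PHIs comes from LLVM's IR rules: PHI nodes must stay contiguous at the top of their basic block, so the scheduler cannot move them or place a materialized copy among them. A small illustrative example:

    define i32 @phi_lane(i32 %n) {
    entry:
      br label %loop
    loop:
      ; %p must remain at the top of %loop; no ordinary instruction
      ; (such as a materialized 'add i32 %p, 0' copy) may be scheduled
      ; between the block label and the PHI group.
      %p = phi i32 [ 0, %entry ], [ %next, %loop ]
      %next = add i32 %p, 1
      %cmp = icmp slt i32 %next, %n
      br i1 %cmp, label %loop, label %exit
    exit:
      ret i32 %p
    }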
      // reordered during buildTree(). We therefore need to get its operands
      // through the TreeEntry.
    - if (TreeEntry *TE = BundleMember->TE) {
    + if (TreeEntry *TE = BundleMember->TE; TE && !TE->isAltOpCopy()) {
Not sure this is correct. You still need to check the deps for the operands, even if the node is marked as copyable. Non-copyable instructions should go through this code.
Done. Thanks for finding this.
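The point of the comment above, in a hypothetical sketch (illustrative names): even when a lane is only a copy, its operand is a real in-region instruction whose dependencies still gate the bundle.

    define void @operand_deps(ptr %p, ptr %q) {
    entry:
      ; %l feeds both the real add lane and the copy lane (modeled as
      ; add i32 %l, 0); the bundle cannot become ready until %l and its
      ; own dependencies have been scheduled.
      %l = load i32, ptr %p
      %a = add i32 %l, 1
      store i32 %a, ptr %q
      %q1 = getelementptr inbounds i32, ptr %q, i64 1
      store i32 %l, ptr %q1
      ret void
    }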
                                     Opcode0, Opcode1, OpcodeMask))
        return true;
      SmallVector<ValueList> Operands;
      if (S.getMainOp()->getNumOperands() != S.getAltOp()->getNumOperands())
Why?
There was an illegal operand-index access here before, because the copy instruction has fewer operands than the main operation; I think it is resolved correctly now. Thanks. Done.
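The operand-count mismatch looks like this in a hypothetical input (illustrative names): the main operation is a two-operand add while the copy lane is a one-operand load, so operand index 1 does not exist on the copy lane; per the diff above, that slot is padded with poison of the main operand's type.

    define void @pad_operands(ptr %p, ptr %q) {
    entry:
      %l0 = load i32, ptr %p
      %g1 = getelementptr inbounds i32, ptr %p, i64 1
      %l1 = load i32, ptr %g1
      ; main op: add i32 %l0, 1 (two operands); copy lane: %l1 (one
      ; operand, as a load), so operand 1 becomes the lanes {i32 1, poison}.
      %a0 = add i32 %l0, 1
      store i32 %a0, ptr %q
      %q1 = getelementptr inbounds i32, ptr %q, i64 1
      store i32 %l1, ptr %q1
      ret void
    }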
        }
      }
    } else if (VectorizeCopyable && canRepresentAsCopyable(S, VL))
      IsAltOpCopy = true;
Should be in braces
Done.
Co-authored-by: Alexey Bataev <a.bataev@gmx.com>
Co-authored-by: Alexey Bataev <a.bataev@gmx.com>
Co-authored-by: Alexey Bataev <a.bataev@gmx.com>
ping.
It needs to be rebased. Also, I'm currently working on reworking the scheduling, which most probably won't require the scheduling-model update in this patch, so that part can be dropped.
WIP, rebasing to HEAD.
Added testcase from #110740. It looks like there are no SLP scheduler issues left; I was able to compile and run LNT, Spec2k6, and Spec2k17 without any runtime or compile-time issues with various flags ("-O3", "-Ofast", "-mllvm -slp-threshold=-99999").