1515#include " AMDGPU.h"
1616#include " AMDGPUTargetMachine.h"
1717#include " SIModeRegisterDefaults.h"
18+ #include " llvm/ADT/SetVector.h"
1819#include " llvm/Analysis/AssumptionCache.h"
1920#include " llvm/Analysis/ConstantFolding.h"
2021#include " llvm/Analysis/TargetLibraryInfo.h"
2728#include " llvm/IR/InstVisitor.h"
2829#include " llvm/IR/IntrinsicsAMDGPU.h"
2930#include " llvm/IR/PatternMatch.h"
31+ #include " llvm/IR/ValueHandle.h"
3032#include " llvm/InitializePasses.h"
3133#include " llvm/Pass.h"
3234#include " llvm/Support/KnownBits.h"
@@ -106,6 +108,7 @@ class AMDGPUCodeGenPrepareImpl
106108 bool FlowChanged = false ;
107109 mutable Function *SqrtF32 = nullptr ;
108110 mutable Function *LdexpF32 = nullptr ;
111+ mutable SmallVector<WeakVH> DeadVals;
109112
110113 DenseMap<const PHINode *, bool > BreakPhiNodesCache;
111114
@@ -242,6 +245,8 @@ class AMDGPUCodeGenPrepareImpl
242245 Value *emitSqrtIEEE2ULP (IRBuilder<> &Builder, Value *Src,
243246 FastMathFlags FMF) const ;
244247
248+ bool tryNarrowMathIfNoOverflow (Instruction *I);
249+
245250public:
246251 bool visitFDiv (BinaryOperator &I);
247252
@@ -281,28 +286,21 @@ bool AMDGPUCodeGenPrepareImpl::run() {
281286 BreakPhiNodesCache.clear ();
282287 bool MadeChange = false ;
283288
284- Function::iterator NextBB;
285- for (Function::iterator FI = F.begin (), FE = F.end (); FI != FE; FI = NextBB) {
286- BasicBlock *BB = &*FI;
287- NextBB = std::next (FI);
288-
289- BasicBlock::iterator Next;
290- for (BasicBlock::iterator I = BB->begin (), E = BB->end (); I != E;
291- I = Next) {
292- Next = std::next (I);
293-
294- MadeChange |= visit (*I);
295-
296- if (Next != E) { // Control flow changed
297- BasicBlock *NextInstBB = Next->getParent ();
298- if (NextInstBB != BB) {
299- BB = NextInstBB;
300- E = BB->end ();
301- FE = F.end ();
302- }
303- }
289+ // Need to use make_early_inc_range because integer division expansion is
290+ // handled by Transform/Utils, and it can delete instructions such as the
291+ // terminator of the BB.
292+ for (BasicBlock &BB : reverse (F)) {
293+ for (Instruction &I : make_early_inc_range (reverse (BB))) {
294+ if (!isInstructionTriviallyDead (&I, TLI))
295+ MadeChange |= visit (I);
304296 }
305297 }
298+
299+ while (!DeadVals.empty ()) {
300+ if (auto *I = dyn_cast_or_null<Instruction>(DeadVals.pop_back_val ()))
301+ RecursivelyDeleteTriviallyDeadInstructions (I, TLI);
302+ }
303+
306304 return MadeChange;
307305}
308306
@@ -422,7 +420,7 @@ bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
422420 Value *NewVal = insertValues (Builder, Ty, ResultVals);
423421 NewVal->takeName (&I);
424422 I.replaceAllUsesWith (NewVal);
425- I. eraseFromParent ( );
423+ DeadVals. push_back (&I );
426424
427425 return true ;
428426}
@@ -496,10 +494,10 @@ bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
496494 FoldedT, FoldedF);
497495 NewSelect->takeName (&BO);
498496 BO.replaceAllUsesWith (NewSelect);
499- BO. eraseFromParent ( );
497+ DeadVals. push_back (&BO );
500498 if (CastOp)
501- CastOp-> eraseFromParent ( );
502- Sel-> eraseFromParent ( );
499+ DeadVals. push_back (CastOp );
500+ DeadVals. push_back (Sel );
503501 return true ;
504502}
505503
@@ -895,7 +893,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
895893 if (NewVal) {
896894 FDiv.replaceAllUsesWith (NewVal);
897895 NewVal->takeName (&FDiv);
898- RecursivelyDeleteTriviallyDeadInstructions (&FDiv, TLI );
896+ DeadVals. push_back (&FDiv);
899897 }
900898
901899 return true ;
@@ -1302,10 +1300,7 @@ it will create `s_and_b32 s0, s0, 0xff`.
13021300We accept this change since the non-byte load assumes the upper bits
13031301within the byte are all 0.
13041302*/
1305- static bool tryNarrowMathIfNoOverflow (Instruction *I,
1306- const SITargetLowering *TLI,
1307- const TargetTransformInfo &TTI,
1308- const DataLayout &DL) {
1303+ bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow (Instruction *I) {
13091304 unsigned Opc = I->getOpcode ();
13101305 Type *OldType = I->getType ();
13111306
@@ -1330,6 +1325,7 @@ static bool tryNarrowMathIfNoOverflow(Instruction *I,
13301325 NewType = I->getType ()->getWithNewBitWidth (NewBit);
13311326
13321327 // Old cost
1328+ const TargetTransformInfo &TTI = TM.getTargetTransformInfo (F);
13331329 InstructionCost OldCost =
13341330 TTI.getArithmeticInstrCost (Opc, OldType, TTI::TCK_RecipThroughput);
13351331 // New cost of new op
@@ -1360,7 +1356,7 @@ static bool tryNarrowMathIfNoOverflow(Instruction *I,
13601356
13611357 Value *Zext = Builder.CreateZExt (Arith, OldType);
13621358 I->replaceAllUsesWith (Zext);
1363- I-> eraseFromParent ( );
1359+ DeadVals. push_back (I );
13641360 return true ;
13651361}
13661362
@@ -1370,8 +1366,7 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
13701366
13711367 if (UseMul24Intrin && replaceMulWithMul24 (I))
13721368 return true ;
1373- if (tryNarrowMathIfNoOverflow (&I, ST.getTargetLowering (),
1374- TM.getTargetTransformInfo (F), DL))
1369+ if (tryNarrowMathIfNoOverflow (&I))
13751370 return true ;
13761371
13771372 bool Changed = false ;
@@ -1436,7 +1431,7 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
14361431
14371432 if (NewDiv) {
14381433 I.replaceAllUsesWith (NewDiv);
1439- I. eraseFromParent ( );
1434+ DeadVals. push_back (&I );
14401435 Changed = true ;
14411436 }
14421437 }
@@ -1492,7 +1487,7 @@ bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
14921487 Value *ValTrunc = Builder.CreateTrunc (WidenLoad, IntNTy);
14931488 Value *ValOrig = Builder.CreateBitCast (ValTrunc, I.getType ());
14941489 I.replaceAllUsesWith (ValOrig);
1495- I. eraseFromParent ( );
1490+ DeadVals. push_back (&I );
14961491 return true ;
14971492 }
14981493
@@ -1534,7 +1529,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
15341529
15351530 Fract->takeName (&I);
15361531 I.replaceAllUsesWith (Fract);
1537- RecursivelyDeleteTriviallyDeadInstructions (&I, TLI );
1532+ DeadVals. push_back (&I);
15381533 return true ;
15391534}
15401535
@@ -1822,7 +1817,7 @@ bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
18221817 }
18231818
18241819 I.replaceAllUsesWith (Vec);
1825- I. eraseFromParent ( );
1820+ DeadVals. push_back (&I );
18261821 return true ;
18271822}
18281823
@@ -1903,7 +1898,7 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
19031898 auto *Intrin = B.CreateIntrinsic (
19041899 I.getType (), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand (0 )});
19051900 I.replaceAllUsesWith (Intrin);
1906- I. eraseFromParent ( );
1901+ DeadVals. push_back (&I );
19071902 return true ;
19081903}
19091904
@@ -2000,16 +1995,10 @@ bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
20001995 Value *Fract = applyFractPat (Builder, FractArg);
20011996 Fract->takeName (&I);
20021997 I.replaceAllUsesWith (Fract);
2003-
2004- RecursivelyDeleteTriviallyDeadInstructions (&I, TLI);
1998+ DeadVals.push_back (&I);
20051999 return true ;
20062000}
20072001
2008- static bool isOneOrNegOne (const Value *Val) {
2009- const APFloat *C;
2010- return match (Val, m_APFloat (C)) && C->getExactLog2Abs () == 0 ;
2011- }
2012-
20132002// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
20142003bool AMDGPUCodeGenPrepareImpl::visitSqrt (IntrinsicInst &Sqrt) {
20152004 Type *Ty = Sqrt.getType ()->getScalarType ();
@@ -2030,18 +2019,6 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
20302019 if (ReqdAccuracy < 1 .0f )
20312020 return false ;
20322021
2033- // FIXME: This is an ugly hack for this pass using forward iteration instead
2034- // of reverse. If it worked like a normal combiner, the rsq would form before
2035- // we saw a sqrt call.
2036- auto *FDiv =
2037- dyn_cast_or_null<FPMathOperator>(Sqrt.getUniqueUndroppableUser ());
2038- if (FDiv && FDiv->getOpcode () == Instruction::FDiv &&
2039- FDiv->getFPAccuracy () >= 1 .0f &&
2040- canOptimizeWithRsq (FPOp, FDiv->getFastMathFlags (), SqrtFMF) &&
2041- // TODO: We should also handle the arcp case for the fdiv with non-1 value
2042- isOneOrNegOne (FDiv->getOperand (0 )))
2043- return false ;
2044-
20452022 Value *SrcVal = Sqrt.getOperand (0 );
20462023 bool CanTreatAsDAZ = canIgnoreDenormalInput (SrcVal, &Sqrt);
20472024
@@ -2065,7 +2042,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
20652042 Value *NewSqrt = insertValues (Builder, Sqrt.getType (), ResultVals);
20662043 NewSqrt->takeName (&Sqrt);
20672044 Sqrt.replaceAllUsesWith (NewSqrt);
2068- Sqrt. eraseFromParent ( );
2045+ DeadVals. push_back (&Sqrt );
20692046 return true ;
20702047}
20712048
0 commit comments