[AMDGPU][NFC]: Minor Unpacking Fixes #163992
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Akash Dutta (akadutta)

Changes: optimize condition checks and remove compilation overhead for unsupported archs.

Full diff: https://github.com/llvm/llvm-project/pull/163992.diff (1 file affected)
```diff
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 01a40c1e38817..d9c3d6b399225 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -47,9 +47,6 @@ class SIPreEmitPeephole {
                          const MachineBasicBlock &From,
                          const MachineBasicBlock &To) const;
   bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
-  // Check if the machine instruction being processed is a supported packed
-  // instruction.
-  bool isUnpackingSupportedInstr(MachineInstr &MI) const;
   // Creates a list of packed instructions following an MFMA that are suitable
   // for unpacking.
   void collectUnpackingCandidates(MachineInstr &BeginMI,
@@ -454,23 +451,6 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
   return true;
 }
 
-// If support is extended to new operations, add tests in
-// llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir.
-bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
-  if (!TII->isNeverCoissue(MI))
-    return false;
-  unsigned Opcode = MI.getOpcode();
-  switch (Opcode) {
-  case AMDGPU::V_PK_ADD_F32:
-  case AMDGPU::V_PK_MUL_F32:
-  case AMDGPU::V_PK_FMA_F32:
-    return true;
-  default:
-    return false;
-  }
-  llvm_unreachable("Fully covered switch");
-}
-
 bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
   unsigned OpCode = MI.getOpcode();
   Register DstReg = MI.getOperand(0).getReg();
@@ -612,10 +592,12 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
   for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
     MachineInstr &Instr = *I;
+    uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr);
     if (Instr.isMetaInstruction())
       continue;
     if ((Instr.isTerminator()) ||
-        (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) ||
+        (TII->isNeverCoissue(Instr) &&
+         (UnpackedOpCode == std::numeric_limits<uint16_t>::max())) ||
         (SIInstrInfo::modifiesModeRegister(Instr) &&
          Instr.modifiesRegister(AMDGPU::EXEC, TRI)))
       return;
@@ -639,7 +621,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
       if (TRI->regsOverlap(MFMADef, InstrMO.getReg()))
         return;
     }
-    if (!isUnpackingSupportedInstr(Instr))
+    if (UnpackedOpCode == std::numeric_limits<uint16_t>::max())
       continue;
 
     if (canUnpackingClobberRegister(Instr))
@@ -687,8 +669,8 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
                                                         bool IsHiBits) {
   MachineBasicBlock &MBB = *I.getParent();
   const DebugLoc &DL = I.getDebugLoc();
-  const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
-  const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
+  const MachineOperand *SrcMO0 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
+  const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
   Register DstReg = I.getOperand(0).getReg();
   unsigned OpCode = I.getOpcode();
   Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1)
@@ -702,15 +684,15 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
 
   MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
   NewMI.addDef(UnpackedDstReg); // vdst
-  addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO1);
-  addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO2);
+  addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO0);
+  addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO1);
 
   if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
-    const MachineOperand *SrcMO3 =
+    const MachineOperand *SrcMO2 =
         TII->getNamedOperand(I, AMDGPU::OpName::src2);
     unsigned Src2Mods =
         TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm();
-    addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO3);
+    addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO2);
   }
   NewMI.addImm(ClampVal); // clamp
   // Packed instructions do not support output modifiers. safe to assign them 0
@@ -787,22 +769,26 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
 
   // TODO: Fold this into previous block, if possible. Evaluate and handle any
   // side effects.
-  for (MachineBasicBlock &MBB : MF) {
-    // Unpack packed instructions overlapped by MFMAs. This allows the compiler
-    // to co-issue unpacked instructions with MFMA
-    auto SchedModel = TII->getSchedModel();
-    SetVector<MachineInstr *> InstrsToUnpack;
-    for (auto &MI : make_early_inc_range(MBB.instrs())) {
-      if (!SIInstrInfo::isMFMA(MI))
-        continue;
-      const MCSchedClassDesc *SchedClassDesc =
-          SchedModel.resolveSchedClass(&MI);
-      uint16_t NumMFMACycles =
-          SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
-      collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
-    }
-    for (MachineInstr *MI : InstrsToUnpack) {
-      performF32Unpacking(*MI);
+
+  // Perform the extra MF scans only for supported archs
+  if (ST.hasGFX950Insts() || ST.hasGFX940Insts()) {
+    for (MachineBasicBlock &MBB : MF) {
+      // Unpack packed instructions overlapped by MFMAs. This allows the
+      // compiler to co-issue unpacked instructions with MFMA
+      auto SchedModel = TII->getSchedModel();
+      SetVector<MachineInstr *> InstrsToUnpack;
+      for (auto &MI : make_early_inc_range(MBB.instrs())) {
+        if (!SIInstrInfo::isMFMA(MI))
+          continue;
+        const MCSchedClassDesc *SchedClassDesc =
+            SchedModel.resolveSchedClass(&MI);
+        uint16_t NumMFMACycles =
+            SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
+        collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
+      }
+      for (MachineInstr *MI : InstrsToUnpack) {
+        performF32Unpacking(*MI);
+      }
     }
   }
```
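The core refactor is that the deleted `isUnpackingSupportedInstr()` predicate is folded into the existing `mapToUnpackedOpcode()` lookup: that lookup already classifies the opcode, and a `std::numeric_limits<uint16_t>::max()` return value doubles as a "no unpacked form" sentinel, so each instruction is classified once per loop iteration instead of twice. Below is a minimal standalone sketch of the pattern; the `Opcode` enum and the `V_*` constants are placeholders for illustration, not the actual AMDGPU definitions:

```cpp
#include <cstdint>
#include <limits>

// Placeholder opcodes standing in for the AMDGPU instruction enums.
enum Opcode : uint16_t {
  V_PK_ADD_F32, V_PK_MUL_F32, V_PK_FMA_F32,
  V_ADD_F32_e64, V_MUL_F32_e64, V_FMA_F32_e64,
  SOME_OTHER_OP
};

// Sentinel meaning "this opcode has no unpacked equivalent".
constexpr uint16_t NoUnpackedOp = std::numeric_limits<uint16_t>::max();

// One switch both classifies the instruction and yields the unpacked
// opcode, making a separate isUnpackingSupportedInstr() redundant.
uint16_t mapToUnpackedOpcode(Opcode Op) {
  switch (Op) {
  case V_PK_ADD_F32: return V_ADD_F32_e64;
  case V_PK_MUL_F32: return V_MUL_F32_e64;
  case V_PK_FMA_F32: return V_FMA_F32_e64;
  default:           return NoUnpackedOp;
  }
}

// The old predicate is now just a comparison against the sentinel.
bool isUnpackable(Opcode Op) {
  return mapToUnpackedOpcode(Op) != NoUnpackedOp;
}
```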
Add NFC to commit title.
```diff
-      performF32Unpacking(*MI);
+
+  // Perform the extra MF scans only for supported archs
+  if (ST.hasGFX950Insts() || ST.hasGFX940Insts()) {
```
```cpp
if (!(ST.hasGFX950Insts() || ST.hasGFX940Insts()))
  return;
```
Also, you can just use `hasGFX940Insts()`, as gfx950 implies gfx940.
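Assuming, per the reviewer, that `hasGFX940Insts()` also returns true on gfx950 subtargets, the guard collapses to a single check:

```cpp
// Before: two checks, one per subtarget generation.
if (ST.hasGFX950Insts() || ST.hasGFX940Insts()) { /* ... unpacking scan ... */ }

// After: gfx950 implies the gfx940 instruction set, so one check
// suffices (this is the form visible in the updated revision below).
if (ST.hasGFX940Insts()) { /* ... unpacking scan ... */ }
```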
```diff
     if ((Instr.isTerminator()) ||
-        (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) ||
+        (TII->isNeverCoissue(Instr) &&
+         (UnpackedOpCode == std::numeric_limits<uint16_t>::max())) ||
```
Can you use a local variable -- `IsUnpackable = UnpackedOpCode == std::numeric_limits<uint16_t>::max()`?
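A sketch of how the suggestion could land in `collectUnpackingCandidates` (variable name taken from the review comment; note that with this equality the flag is true when the instruction has *no* unpacked form, so the final code may invert the name or the comparison):

```cpp
uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr);
// Hoist the sentinel comparison into a named local so both uses
// below read the same way.
bool IsUnpackable =
    UnpackedOpCode == std::numeric_limits<uint16_t>::max();

if (Instr.isTerminator() ||
    (TII->isNeverCoissue(Instr) && IsUnpackable) ||
    (SIInstrInfo::modifiesModeRegister(Instr) &&
     Instr.modifiesRegister(AMDGPU::EXEC, TRI)))
  return;

// ... later in the same loop body:
if (IsUnpackable)
  continue;
```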
✅ With the latest revision this PR passed the C/C++ code formatter.
```diff
-      performF32Unpacking(*MI);
+
+  // Perform the extra MF scans only for supported archs
+  if (ST.hasGFX940Insts()) {
```
Use early exit to reduce nesting: https://llvm.org/docs/CodingStandards.html#use-early-exits-and-continue-to-simplify-code

```cpp
if (!ST.hasGFX940Insts())
  return Changed;
for (MachineBasicBlock &MBB : MF) { ...
```
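Applied to the end of `run()`, the early-exit form would read roughly as follows; this is a sketch built from the diff above, assuming the unpacking scan is the last work in the function and `Changed` holds the accumulated result:

```cpp
// Early exit keeps the unpacking scan un-nested, per the LLVM
// coding standard linked above.
if (!ST.hasGFX940Insts())
  return Changed;

for (MachineBasicBlock &MBB : MF) {
  // Unpack packed instructions overlapped by MFMAs so the unpacked
  // halves can co-issue with the MFMA.
  auto SchedModel = TII->getSchedModel();
  SetVector<MachineInstr *> InstrsToUnpack;
  for (auto &MI : make_early_inc_range(MBB.instrs())) {
    if (!SIInstrInfo::isMFMA(MI))
      continue;
    const MCSchedClassDesc *SchedClassDesc = SchedModel.resolveSchedClass(&MI);
    uint16_t NumMFMACycles =
        SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
    collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
  }
  for (MachineInstr *MI : InstrsToUnpack)
    performF32Unpacking(*MI);
}
return Changed;
```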
LGTM -- thanks