Skip to content

Conversation

akadutta
Copy link
Contributor

Optimize condition checks, Remove compilation overhead for unsupported archs

@llvmbot
Copy link
Member

llvmbot commented Oct 17, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Akash Dutta (akadutta)

Changes

Optimize condition checks, Remove compilation overhead for unsupported archs


Full diff: https://github.com/llvm/llvm-project/pull/163992.diff

1 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp (+30-44)
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 01a40c1e38817..d9c3d6b399225 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -47,9 +47,6 @@ class SIPreEmitPeephole { const MachineBasicBlock &From, const MachineBasicBlock &To) const; bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB); - // Check if the machine instruction being processed is a supported packed - // instruction. - bool isUnpackingSupportedInstr(MachineInstr &MI) const; // Creates a list of packed instructions following an MFMA that are suitable // for unpacking. void collectUnpackingCandidates(MachineInstr &BeginMI, @@ -454,23 +451,6 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI, return true; } -// If support is extended to new operations, add tests in -// llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir. -bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const { - if (!TII->isNeverCoissue(MI)) - return false; - unsigned Opcode = MI.getOpcode(); - switch (Opcode) { - case AMDGPU::V_PK_ADD_F32: - case AMDGPU::V_PK_MUL_F32: - case AMDGPU::V_PK_FMA_F32: - return true; - default: - return false; - } - llvm_unreachable("Fully covered switch"); -} - bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) { unsigned OpCode = MI.getOpcode(); Register DstReg = MI.getOperand(0).getReg(); @@ -612,10 +592,12 @@ void SIPreEmitPeephole::collectUnpackingCandidates( for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) { MachineInstr &Instr = *I; + uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr); if (Instr.isMetaInstruction()) continue; if ((Instr.isTerminator()) || - (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) || + (TII->isNeverCoissue(Instr) && + (UnpackedOpCode == std::numeric_limits<uint16_t>::max())) || (SIInstrInfo::modifiesModeRegister(Instr) && Instr.modifiesRegister(AMDGPU::EXEC, TRI))) return; @@ -639,7 +621,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates( if (TRI->regsOverlap(MFMADef, InstrMO.getReg())) return; } - if (!isUnpackingSupportedInstr(Instr)) + if (UnpackedOpCode == std::numeric_limits<uint16_t>::max()) continue; if (canUnpackingClobberRegister(Instr)) @@ -687,8 +669,8 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I, bool IsHiBits) { MachineBasicBlock &MBB = *I.getParent(); const DebugLoc &DL = I.getDebugLoc(); - const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src0); - const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src1); + const MachineOperand *SrcMO0 = TII->getNamedOperand(I, AMDGPU::OpName::src0); + const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src1); Register DstReg = I.getOperand(0).getReg(); unsigned OpCode = I.getOpcode(); Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1) @@ -702,15 +684,15 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I, MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode)); NewMI.addDef(UnpackedDstReg); // vdst - addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO1); - addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO2); + addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO0); + addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO1); if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) { - const MachineOperand *SrcMO3 = + const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src2); unsigned Src2Mods = TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm(); - addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO3); + addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO2); } NewMI.addImm(ClampVal); // clamp // Packed instructions do not support output modifiers. safe to assign them 0 @@ -787,22 +769,26 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) { // TODO: Fold this into previous block, if possible. Evaluate and handle any // side effects. - for (MachineBasicBlock &MBB : MF) { - // Unpack packed instructions overlapped by MFMAs. This allows the compiler - // to co-issue unpacked instructions with MFMA - auto SchedModel = TII->getSchedModel(); - SetVector<MachineInstr *> InstrsToUnpack; - for (auto &MI : make_early_inc_range(MBB.instrs())) { - if (!SIInstrInfo::isMFMA(MI)) - continue; - const MCSchedClassDesc *SchedClassDesc = - SchedModel.resolveSchedClass(&MI); - uint16_t NumMFMACycles = - SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle; - collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles); - } - for (MachineInstr *MI : InstrsToUnpack) { - performF32Unpacking(*MI); + + // Perform the extra MF scans only for supported archs + if (ST.hasGFX950Insts() || ST.hasGFX940Insts()) { + for (MachineBasicBlock &MBB : MF) { + // Unpack packed instructions overlapped by MFMAs. This allows the compiler + // to co-issue unpacked instructions with MFMA + auto SchedModel = TII->getSchedModel(); + SetVector<MachineInstr *> InstrsToUnpack; + for (auto &MI : make_early_inc_range(MBB.instrs())) { + if (!SIInstrInfo::isMFMA(MI)) + continue; + const MCSchedClassDesc *SchedClassDesc = + SchedModel.resolveSchedClass(&MI); + uint16_t NumMFMACycles = + SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle; + collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles); + } + for (MachineInstr *MI : InstrsToUnpack) { + performF32Unpacking(*MI); + } } } 
Copy link
Contributor

@jrbyrnes jrbyrnes left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add NFC to commit title.

performF32Unpacking(*MI);

// Perform the extra MF scans only for supported archs
if (ST.hasGFX950Insts() || ST.hasGFX940Insts()) {
Copy link
Contributor

@jrbyrnes jrbyrnes Oct 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if (!(ST.hasGFX950Insts() || ST.hasGFX940Insts()))	return; 
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, can just use hasGFX940Insts as this implies gfx950

if ((Instr.isTerminator()) ||
(TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) ||
(TII->isNeverCoissue(Instr) &&
(UnpackedOpCode == std::numeric_limits<uint16_t>::max())) ||
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you use a local variable -- IsUnpackable = UnpackedOpCode == std::numeric_limits<uint16_t>::max()

@akadutta akadutta changed the title [AMDGPU]:: Minor Unpacking Fixes. [AMDGPU][NFC]:: Minor Unpacking Fixes. Oct 17, 2025
Copy link

github-actions bot commented Oct 17, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

performF32Unpacking(*MI);

// Perform the extra MF scans only for supported archs
if (ST.hasGFX940Insts()) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use early exit to reduce nesting https://llvm.org/docs/CodingStandards.html#use-early-exits-and-continue-to-simplify-code

if (!ST.hasGFX9540Insts()) return Changed; for (MachineBasicBlock &MBB : MF) { ... 
Copy link
Contributor

@jrbyrnes jrbyrnes left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM -- thanks

@jrbyrnes jrbyrnes merged commit 8b2fc00 into llvm:main Oct 17, 2025
10 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

3 participants