Skip to content

Conversation

@HolyMolyCowMan
Copy link
Contributor

This PR adds a new combine to the post-legalizer-combiner pass. The new combine checks for vectors being unmerged and subsequently padded with G_IMPLICIT_DEF values by building a new vector. If such a case is found, the vector being unmerged is instead just concatenated with a G_IMPLICIT_DEF that is as wide as the vector being unmerged.

This removes unnecessary mov instructions in a few places.

@llvmbot
Copy link
Member

llvmbot commented Oct 29, 2025

@llvm/pr-subscribers-backend-aarch64

Author: Ryan Cowan (HolyMolyCowMan)

Changes

This PR adds a new combine to the post-legalizer-combiner pass. The new combine checks for vectors being unmerged and subsequently padded with G_IMPLICIT_DEF values by building a new vector. If such a case is found, the vector being unmerged is instead just concatenated with a G_IMPLICIT_DEF that is as wide as the vector being unmerged.

This removes unnecessary mov instructions in a few places.


Full diff: https://github.com/llvm/llvm-project/pull/165539.diff

4 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64Combine.td (+8-1)
  • (modified) llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp (+95-1)
  • (modified) llvm/test/CodeGen/AArch64/fptrunc.ll (+5-13)
  • (modified) llvm/test/CodeGen/AArch64/itofp.ll (+12-36)
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index 278314792bfb9..056e6145487d2 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -332,6 +332,13 @@ def combine_mul_cmlt : GICombineRule< (apply [{ applyCombineMulCMLT(*${root}, MRI, B, ${matchinfo}); }]) >; +def combine_build_unmerge : GICombineRule< + (defs root:$root, register_matchinfo:$unmergeSrc), + (match (wip_match_opcode G_BUILD_VECTOR):$root, + [{ return matchCombineBuildUnmerge(*${root}, MRI, ${unmergeSrc}); }]), + (apply [{ applyCombineBuildUnmerge(*${root}, MRI, B, ${unmergeSrc}); }]) +>; + // Post-legalization combines which should happen at all optimization levels. // (E.g. ones that facilitate matching for the selector) For example, matching // pseudos. @@ -366,7 +373,7 @@ def AArch64PostLegalizerCombiner select_to_minmax, or_to_bsp, combine_concat_vector, commute_constant_to_rhs, extract_vec_elt_combines, push_freeze_to_prevent_poison_from_propagating, - combine_mul_cmlt, combine_use_vector_truncate, + combine_mul_cmlt, combine_use_vector_truncate, combine_build_unmerge, extmultomull, truncsat_combines, lshr_of_trunc_of_lshr, funnel_shift_from_or_shift_constants_are_legal]> { } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp index fa7bb6ecc35ee..2f17fd33559ee 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp @@ -39,6 +39,7 @@ #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/Support/Debug.h" +#include <set> #define GET_GICOMBINER_DEPS #include "AArch64GenPostLegalizeGICombiner.inc" @@ -133,6 +134,99 @@ bool isZeroExtended(Register R, MachineRegisterInfo &MRI) { return MRI.getVRegDef(R)->getOpcode() == TargetOpcode::G_ZEXT; } +// This pattern aims to match the 
following shape to avoid extra mov +// instructions +// G_BUILD_VECTOR( +// G_UNMERGE_VALUES(src, 0) +// G_UNMERGE_VALUES(src, 1) +// G_IMPLICIT_DEF +// G_IMPLICIT_DEF +// ) +// -> +// G_CONCAT_VECTORS( +// src +// undef +// ) +bool matchCombineBuildUnmerge(MachineInstr &MI, MachineRegisterInfo &MRI, + Register &UnmergeSrc) { + assert(MI.getOpcode() == TargetOpcode::G_BUILD_VECTOR); + + unsigned UnmergeInstrCount = 0; + unsigned UndefInstrCount = 0; + + unsigned UnmergeEltCount = 0; + unsigned UnmergeEltSize = 0; + + Register UnmergeSrcTemp; + + std::set<int> KnownRegs; + + for (auto Use : MI.all_uses()) { + auto *Def = getDefIgnoringCopies(Use.getReg(), MRI); + + if (!Def) { + return false; + } + + unsigned Opcode = Def->getOpcode(); + + switch (Opcode) { + default: + return false; + case TargetOpcode::G_IMPLICIT_DEF: + ++UndefInstrCount; + break; + case TargetOpcode::G_UNMERGE_VALUES: + ++UnmergeInstrCount; + + UnmergeEltSize = MRI.getType(Use.getReg()).getScalarSizeInBits(); + UnmergeEltCount = Def->getNumDefs(); + if (UnmergeEltCount < 2 || (UnmergeEltSize * UnmergeEltCount != 64 && + UnmergeEltSize * UnmergeEltCount != 128)) { + return false; + } + + // Unmerge should only use one register so we can use the last one + for (auto UnmergeUse : Def->all_uses()) + UnmergeSrcTemp = UnmergeUse.getReg(); + + // Track unique sources for the G_UNMERGE_VALUES + unsigned RegId = UnmergeSrcTemp.id(); + if (KnownRegs.find(RegId) != KnownRegs.end()) + continue; + + KnownRegs.insert(RegId); + + // We know the unmerge is a valid target now so store the register. + UnmergeSrc = UnmergeSrcTemp; + + break; + } + } + + // Only want to match patterns that pad half of a vector with undefined. We + // also want to ensure that these values come from a single unmerge and all + // unmerged values are consumed. 
+ if (UndefInstrCount != UnmergeInstrCount || + UnmergeEltCount != UnmergeInstrCount || KnownRegs.size() != 1) { + return false; + } + + return true; +} + +void applyCombineBuildUnmerge(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, Register &UnmergeSrc) { + assert(UnmergeSrc && "Expected there to be one matching G_UNMERGE_VALUES"); + B.setInstrAndDebugLoc(MI); + + Register UndefVec = MRI.createGenericVirtualRegister(MRI.getType(UnmergeSrc)); + B.buildUndef(UndefVec); + B.buildConcatVectors(MI.getOperand(0), {UnmergeSrc, UndefVec}); + + MI.eraseFromParent(); +} + bool matchAArch64MulConstCombine( MachineInstr &MI, MachineRegisterInfo &MRI, std::function<void(MachineIRBuilder &B, Register DstReg)> &ApplyFn) { @@ -890,4 +984,4 @@ namespace llvm { FunctionPass *createAArch64PostLegalizerCombiner(bool IsOptNone) { return new AArch64PostLegalizerCombiner(IsOptNone); } -} // end namespace llvm +} // end namespace llvm \ No newline at end of file diff --git a/llvm/test/CodeGen/AArch64/fptrunc.ll b/llvm/test/CodeGen/AArch64/fptrunc.ll index 1f84c944d7c16..ce1a2fc48c2e7 100644 --- a/llvm/test/CodeGen/AArch64/fptrunc.ll +++ b/llvm/test/CodeGen/AArch64/fptrunc.ll @@ -345,19 +345,11 @@ entry: } define <2 x half> @fptrunc_v2f32_v2f16(<2 x float> %a) { -; CHECK-SD-LABEL: fptrunc_v2f32_v2f16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: fcvtn v0.4h, v0.4s -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: fptrunc_v2f32_v2f16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-GI-NEXT: fcvtn v0.4h, v1.4s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: fptrunc_v2f32_v2f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fcvtn v0.4h, v0.4s +; CHECK-NEXT: ret entry: %c = fptrunc <2 x float> %a to <2 x half> ret <2 x half> %c diff --git 
a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll index caf87a13f283b..6d168edf180a4 100644 --- a/llvm/test/CodeGen/AArch64/itofp.ll +++ b/llvm/test/CodeGen/AArch64/itofp.ll @@ -5763,18 +5763,14 @@ define <2 x half> @stofp_v2i64_v2f16(<2 x i64> %a) { ; CHECK-NOFP16-GI: // %bb.0: // %entry ; CHECK-NOFP16-GI-NEXT: scvtf v0.2d, v0.2d ; CHECK-NOFP16-GI-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-NOFP16-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: stofp_v2i64_v2f16: ; CHECK-FP16-GI: // %bb.0: // %entry ; CHECK-FP16-GI-NEXT: scvtf v0.2d, v0.2d ; CHECK-FP16-GI-NEXT: fcvtn v0.2s, v0.2d -; CHECK-FP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-FP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-FP16-GI-NEXT: ret entry: %c = sitofp <2 x i64> %a to <2 x half> @@ -5808,18 +5804,14 @@ define <2 x half> @utofp_v2i64_v2f16(<2 x i64> %a) { ; CHECK-NOFP16-GI: // %bb.0: // %entry ; CHECK-NOFP16-GI-NEXT: ucvtf v0.2d, v0.2d ; CHECK-NOFP16-GI-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-NOFP16-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: utofp_v2i64_v2f16: ; CHECK-FP16-GI: // %bb.0: // %entry ; CHECK-FP16-GI-NEXT: ucvtf v0.2d, v0.2d ; CHECK-FP16-GI-NEXT: fcvtn v0.2s, v0.2d -; CHECK-FP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-FP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-FP16-GI-NEXT: ret entry: %c = uitofp <2 x i64> %a to <2 x half> @@ -6232,17 +6224,13 @@ define <2 x half> @stofp_v2i32_v2f16(<2 x i32> %a) { ; CHECK-NOFP16-GI-LABEL: stofp_v2i32_v2f16: ; CHECK-NOFP16-GI: // %bb.0: // %entry ; 
CHECK-NOFP16-GI-NEXT: scvtf v0.2s, v0.2s -; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-NOFP16-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: stofp_v2i32_v2f16: ; CHECK-FP16-GI: // %bb.0: // %entry ; CHECK-FP16-GI-NEXT: scvtf v0.2s, v0.2s -; CHECK-FP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-FP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-FP16-GI-NEXT: ret entry: %c = sitofp <2 x i32> %a to <2 x half> @@ -6267,17 +6255,13 @@ define <2 x half> @utofp_v2i32_v2f16(<2 x i32> %a) { ; CHECK-NOFP16-GI-LABEL: utofp_v2i32_v2f16: ; CHECK-NOFP16-GI: // %bb.0: // %entry ; CHECK-NOFP16-GI-NEXT: ucvtf v0.2s, v0.2s -; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-NOFP16-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: utofp_v2i32_v2f16: ; CHECK-FP16-GI: // %bb.0: // %entry ; CHECK-FP16-GI-NEXT: ucvtf v0.2s, v0.2s -; CHECK-FP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-FP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-FP16-GI-NEXT: ret entry: %c = uitofp <2 x i32> %a to <2 x half> @@ -6480,9 +6464,7 @@ define <2 x half> @stofp_v2i16_v2f16(<2 x i16> %a) { ; CHECK-NOFP16-GI-NEXT: shl v0.2s, v0.2s, #16 ; CHECK-NOFP16-GI-NEXT: sshr v0.2s, v0.2s, #16 ; CHECK-NOFP16-GI-NEXT: scvtf v0.2s, v0.2s -; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-NOFP16-GI-NEXT: ret entry: %c = sitofp <2 x i16> %a to <2 x half> @@ -6509,9 +6491,7 @@ define <2 x half> @utofp_v2i16_v2f16(<2 x i16> %a) { ; CHECK-NOFP16-GI-NEXT: movi d1, #0x00ffff0000ffff ; 
CHECK-NOFP16-GI-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NOFP16-GI-NEXT: ucvtf v0.2s, v0.2s -; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-NOFP16-GI-NEXT: ret entry: %c = uitofp <2 x i16> %a to <2 x half> @@ -6766,9 +6746,7 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) { ; CHECK-NOFP16-GI-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-NOFP16-GI-NEXT: sshr v0.2s, v0.2s, #24 ; CHECK-NOFP16-GI-NEXT: scvtf v0.2s, v0.2s -; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-NOFP16-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: stofp_v2i8_v2f16: @@ -6817,9 +6795,7 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) { ; CHECK-NOFP16-GI-NEXT: movi d1, #0x0000ff000000ff ; CHECK-NOFP16-GI-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NOFP16-GI-NEXT: ucvtf v0.2s, v0.2s -; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-NOFP16-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: utofp_v2i8_v2f16: 
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

2 participants