[AArch64][GlobalISel] Add combine for build_vector(unmerge, unmerge, undef, undef) #165539
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: Ryan Cowan (HolyMolyCowMan)

Changes

This PR adds a new combine to the `post-legalizer-combiner` pass. The new combine checks for vectors being unmerged and subsequently padded with `G_IMPLICIT_DEF` values by building a new vector. If such a case is found, the vector being unmerged is instead just concatenated with a `G_IMPLICIT_DEF` that is as wide as the vector being unmerged.

This removes unnecessary `mov` instructions in a few places.

Full diff: https://github.com/llvm/llvm-project/pull/165539.diff

4 Files Affected:

- llvm/lib/Target/AArch64/AArch64Combine.td
- llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
- llvm/test/CodeGen/AArch64/fptrunc.ll
- llvm/test/CodeGen/AArch64/itofp.ll
```diff
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 278314792bfb9..056e6145487d2 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -332,6 +332,13 @@ def combine_mul_cmlt : GICombineRule<
   (apply [{ applyCombineMulCMLT(*${root}, MRI, B, ${matchinfo}); }])
 >;
 
+def combine_build_unmerge : GICombineRule<
+  (defs root:$root, register_matchinfo:$unmergeSrc),
+  (match (wip_match_opcode G_BUILD_VECTOR):$root,
+         [{ return matchCombineBuildUnmerge(*${root}, MRI, ${unmergeSrc}); }]),
+  (apply [{ applyCombineBuildUnmerge(*${root}, MRI, B, ${unmergeSrc}); }])
+>;
+
 // Post-legalization combines which should happen at all optimization levels.
 // (E.g. ones that facilitate matching for the selector) For example, matching
 // pseudos.
@@ -366,7 +373,7 @@ def AArch64PostLegalizerCombiner
                         select_to_minmax, or_to_bsp, combine_concat_vector,
                         commute_constant_to_rhs, extract_vec_elt_combines,
                         push_freeze_to_prevent_poison_from_propagating,
-                        combine_mul_cmlt, combine_use_vector_truncate,
+                        combine_mul_cmlt, combine_use_vector_truncate, combine_build_unmerge,
                         extmultomull, truncsat_combines, lshr_of_trunc_of_lshr,
                         funnel_shift_from_or_shift_constants_are_legal]> {
 }
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
index fa7bb6ecc35ee..2f17fd33559ee 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
@@ -39,6 +39,7 @@
 #include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/Support/Debug.h"
+#include <set>
 
 #define GET_GICOMBINER_DEPS
 #include "AArch64GenPostLegalizeGICombiner.inc"
@@ -133,6 +134,99 @@ bool isZeroExtended(Register R, MachineRegisterInfo &MRI) {
   return MRI.getVRegDef(R)->getOpcode() == TargetOpcode::G_ZEXT;
 }
 
+// This pattern aims to match the following shape to avoid extra mov
+// instructions:
+// G_BUILD_VECTOR(
+//   G_UNMERGE_VALUES(src, 0)
+//   G_UNMERGE_VALUES(src, 1)
+//   G_IMPLICIT_DEF
+//   G_IMPLICIT_DEF
+// )
+// ->
+// G_CONCAT_VECTORS(
+//   src
+//   undef
+// )
+bool matchCombineBuildUnmerge(MachineInstr &MI, MachineRegisterInfo &MRI,
+                              Register &UnmergeSrc) {
+  assert(MI.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
+
+  unsigned UnmergeInstrCount = 0;
+  unsigned UndefInstrCount = 0;
+
+  unsigned UnmergeEltCount = 0;
+  unsigned UnmergeEltSize = 0;
+
+  Register UnmergeSrcTemp;
+
+  std::set<int> KnownRegs;
+
+  for (auto Use : MI.all_uses()) {
+    auto *Def = getDefIgnoringCopies(Use.getReg(), MRI);
+
+    if (!Def) {
+      return false;
+    }
+
+    unsigned Opcode = Def->getOpcode();
+
+    switch (Opcode) {
+    default:
+      return false;
+    case TargetOpcode::G_IMPLICIT_DEF:
+      ++UndefInstrCount;
+      break;
+    case TargetOpcode::G_UNMERGE_VALUES:
+      ++UnmergeInstrCount;
+
+      UnmergeEltSize = MRI.getType(Use.getReg()).getScalarSizeInBits();
+      UnmergeEltCount = Def->getNumDefs();
+      if (UnmergeEltCount < 2 || (UnmergeEltSize * UnmergeEltCount != 64 &&
+                                  UnmergeEltSize * UnmergeEltCount != 128)) {
+        return false;
+      }
+
+      // Unmerge should only use one register so we can use the last one.
+      for (auto UnmergeUse : Def->all_uses())
+        UnmergeSrcTemp = UnmergeUse.getReg();
+
+      // Track unique sources for the G_UNMERGE_VALUES.
+      unsigned RegId = UnmergeSrcTemp.id();
+      if (KnownRegs.find(RegId) != KnownRegs.end())
+        continue;
+
+      KnownRegs.insert(RegId);
+
+      // We know the unmerge is a valid target now so store the register.
+      UnmergeSrc = UnmergeSrcTemp;
+
+      break;
+    }
+  }
+
+  // Only want to match patterns that pad half of a vector with undefined. We
+  // also want to ensure that these values come from a single unmerge and all
+  // unmerged values are consumed.
+  if (UndefInstrCount != UnmergeInstrCount ||
+      UnmergeEltCount != UnmergeInstrCount || KnownRegs.size() != 1) {
+    return false;
+  }
+
+  return true;
+}
+
+void applyCombineBuildUnmerge(MachineInstr &MI, MachineRegisterInfo &MRI,
+                              MachineIRBuilder &B, Register &UnmergeSrc) {
+  assert(UnmergeSrc && "Expected there to be one matching G_UNMERGE_VALUES");
+  B.setInstrAndDebugLoc(MI);
+
+  Register UndefVec = MRI.createGenericVirtualRegister(MRI.getType(UnmergeSrc));
+  B.buildUndef(UndefVec);
+  B.buildConcatVectors(MI.getOperand(0), {UnmergeSrc, UndefVec});
+
+  MI.eraseFromParent();
+}
+
 bool matchAArch64MulConstCombine(
     MachineInstr &MI, MachineRegisterInfo &MRI,
     std::function<void(MachineIRBuilder &B, Register DstReg)> &ApplyFn) {
@@ -890,4 +984,4 @@ namespace llvm {
 FunctionPass *createAArch64PostLegalizerCombiner(bool IsOptNone) {
   return new AArch64PostLegalizerCombiner(IsOptNone);
 }
-} // end namespace llvm
+} // end namespace llvm
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AArch64/fptrunc.ll b/llvm/test/CodeGen/AArch64/fptrunc.ll
index 1f84c944d7c16..ce1a2fc48c2e7 100644
--- a/llvm/test/CodeGen/AArch64/fptrunc.ll
+++ b/llvm/test/CodeGen/AArch64/fptrunc.ll
@@ -345,19 +345,11 @@ entry:
 }
 
 define <2 x half> @fptrunc_v2f32_v2f16(<2 x float> %a) {
-; CHECK-SD-LABEL: fptrunc_v2f32_v2f16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT:    fcvtn v0.4h, v0.4s
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: fptrunc_v2f32_v2f16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
-; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
-; CHECK-GI-NEXT:    fcvtn v0.4h, v1.4s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: fptrunc_v2f32_v2f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
 entry:
   %c = fptrunc <2 x float> %a to <2 x half>
   ret <2 x half> %c
diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index caf87a13f283b..6d168edf180a4 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -5763,18 +5763,14 @@ define <2 x half> @stofp_v2i64_v2f16(<2 x i64> %a) {
 ; CHECK-NOFP16-GI:       // %bb.0: // %entry
 ; CHECK-NOFP16-GI-NEXT:    scvtf v0.2d, v0.2d
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NOFP16-GI-NEXT:    mov v1.s[0], v0.s[0]
-; CHECK-NOFP16-GI-NEXT:    mov v1.s[1], v0.s[1]
-; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-NOFP16-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: stofp_v2i64_v2f16:
 ; CHECK-FP16-GI:       // %bb.0: // %entry
 ; CHECK-FP16-GI-NEXT:    scvtf v0.2d, v0.2d
 ; CHECK-FP16-GI-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-FP16-GI-NEXT:    mov v1.s[0], v0.s[0]
-; CHECK-FP16-GI-NEXT:    mov v1.s[1], v0.s[1]
-; CHECK-FP16-GI-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-FP16-GI-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-FP16-GI-NEXT:    ret
 entry:
   %c = sitofp <2 x i64> %a to <2 x half>
@@ -5808,18 +5804,14 @@ define <2 x half> @utofp_v2i64_v2f16(<2 x i64> %a) {
 ; CHECK-NOFP16-GI:       // %bb.0: // %entry
 ; CHECK-NOFP16-GI-NEXT:    ucvtf v0.2d, v0.2d
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NOFP16-GI-NEXT:    mov v1.s[0], v0.s[0]
-; CHECK-NOFP16-GI-NEXT:    mov v1.s[1], v0.s[1]
-; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-NOFP16-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: utofp_v2i64_v2f16:
 ; CHECK-FP16-GI:       // %bb.0: // %entry
 ; CHECK-FP16-GI-NEXT:    ucvtf v0.2d, v0.2d
 ; CHECK-FP16-GI-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-FP16-GI-NEXT:    mov v1.s[0], v0.s[0]
-; CHECK-FP16-GI-NEXT:    mov v1.s[1], v0.s[1]
-; CHECK-FP16-GI-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-FP16-GI-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-FP16-GI-NEXT:    ret
 entry:
   %c = uitofp <2 x i64> %a to <2 x half>
@@ -6232,17 +6224,13 @@ define <2 x half> @stofp_v2i32_v2f16(<2 x i32> %a) {
 ; CHECK-NOFP16-GI-LABEL: stofp_v2i32_v2f16:
 ; CHECK-NOFP16-GI:       // %bb.0: // %entry
 ; CHECK-NOFP16-GI-NEXT:    scvtf v0.2s, v0.2s
-; CHECK-NOFP16-GI-NEXT:    mov v1.s[0], v0.s[0]
-; CHECK-NOFP16-GI-NEXT:    mov v1.s[1], v0.s[1]
-; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-NOFP16-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: stofp_v2i32_v2f16:
 ; CHECK-FP16-GI:       // %bb.0: // %entry
 ; CHECK-FP16-GI-NEXT:    scvtf v0.2s, v0.2s
-; CHECK-FP16-GI-NEXT:    mov v1.s[0], v0.s[0]
-; CHECK-FP16-GI-NEXT:    mov v1.s[1], v0.s[1]
-; CHECK-FP16-GI-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-FP16-GI-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-FP16-GI-NEXT:    ret
 entry:
   %c = sitofp <2 x i32> %a to <2 x half>
@@ -6267,17 +6255,13 @@ define <2 x half> @utofp_v2i32_v2f16(<2 x i32> %a) {
 ; CHECK-NOFP16-GI-LABEL: utofp_v2i32_v2f16:
 ; CHECK-NOFP16-GI:       // %bb.0: // %entry
 ; CHECK-NOFP16-GI-NEXT:    ucvtf v0.2s, v0.2s
-; CHECK-NOFP16-GI-NEXT:    mov v1.s[0], v0.s[0]
-; CHECK-NOFP16-GI-NEXT:    mov v1.s[1], v0.s[1]
-; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-NOFP16-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: utofp_v2i32_v2f16:
 ; CHECK-FP16-GI:       // %bb.0: // %entry
 ; CHECK-FP16-GI-NEXT:    ucvtf v0.2s, v0.2s
-; CHECK-FP16-GI-NEXT:    mov v1.s[0], v0.s[0]
-; CHECK-FP16-GI-NEXT:    mov v1.s[1], v0.s[1]
-; CHECK-FP16-GI-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-FP16-GI-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-FP16-GI-NEXT:    ret
 entry:
   %c = uitofp <2 x i32> %a to <2 x half>
@@ -6480,9 +6464,7 @@ define <2 x half> @stofp_v2i16_v2f16(<2 x i16> %a) {
 ; CHECK-NOFP16-GI-NEXT:    shl v0.2s, v0.2s, #16
 ; CHECK-NOFP16-GI-NEXT:    sshr v0.2s, v0.2s, #16
 ; CHECK-NOFP16-GI-NEXT:    scvtf v0.2s, v0.2s
-; CHECK-NOFP16-GI-NEXT:    mov v1.s[0], v0.s[0]
-; CHECK-NOFP16-GI-NEXT:    mov v1.s[1], v0.s[1]
-; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-NOFP16-GI-NEXT:    ret
 entry:
   %c = sitofp <2 x i16> %a to <2 x half>
@@ -6509,9 +6491,7 @@ define <2 x half> @utofp_v2i16_v2f16(<2 x i16> %a) {
 ; CHECK-NOFP16-GI-NEXT:    movi d1, #0x00ffff0000ffff
 ; CHECK-NOFP16-GI-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-NOFP16-GI-NEXT:    ucvtf v0.2s, v0.2s
-; CHECK-NOFP16-GI-NEXT:    mov v1.s[0], v0.s[0]
-; CHECK-NOFP16-GI-NEXT:    mov v1.s[1], v0.s[1]
-; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-NOFP16-GI-NEXT:    ret
 entry:
   %c = uitofp <2 x i16> %a to <2 x half>
@@ -6766,9 +6746,7 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) {
 ; CHECK-NOFP16-GI-NEXT:    shl v0.2s, v0.2s, #24
 ; CHECK-NOFP16-GI-NEXT:    sshr v0.2s, v0.2s, #24
 ; CHECK-NOFP16-GI-NEXT:    scvtf v0.2s, v0.2s
-; CHECK-NOFP16-GI-NEXT:    mov v1.s[0], v0.s[0]
-; CHECK-NOFP16-GI-NEXT:    mov v1.s[1], v0.s[1]
-; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-NOFP16-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: stofp_v2i8_v2f16:
@@ -6817,9 +6795,7 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) {
 ; CHECK-NOFP16-GI-NEXT:    movi d1, #0x0000ff000000ff
 ; CHECK-NOFP16-GI-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-NOFP16-GI-NEXT:    ucvtf v0.2s, v0.2s
-; CHECK-NOFP16-GI-NEXT:    mov v1.s[0], v0.s[0]
-; CHECK-NOFP16-GI-NEXT:    mov v1.s[1], v0.s[1]
-; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-NOFP16-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: utofp_v2i8_v2f16:
```
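To make the transformation concrete, here is a minimal MIR sketch of the shape `matchCombineBuildUnmerge` looks for and what `applyCombineBuildUnmerge` rewrites it to. The register names and the `<2 x s32>`/`<4 x s32>` types are invented for this example and not taken from the PR; any unmerge whose source totals 64 or 128 bits and has at least two elements would match:

```mir
; Before: both lanes of %src are extracted, then a wider vector is rebuilt
; with G_IMPLICIT_DEF padding, which lowers to per-lane mov instructions.
%src:_(<2 x s32>) = COPY $d0
%lo:_(s32), %hi:_(s32) = G_UNMERGE_VALUES %src(<2 x s32>)
%undef:_(s32) = G_IMPLICIT_DEF
%dst:_(<4 x s32>) = G_BUILD_VECTOR %lo(s32), %hi(s32), %undef(s32), %undef(s32)

; After: %src is concatenated whole with an implicit-def vector of its own
; width, so the per-lane rebuild disappears.
%src:_(<2 x s32>) = COPY $d0
%pad:_(<2 x s32>) = G_IMPLICIT_DEF
%dst:_(<4 x s32>) = G_CONCAT_VECTORS %src(<2 x s32>), %pad(<2 x s32>)
```

Writing the result as a concat lets the source vector stay in its register with the upper lanes simply left undefined, which is why the `mov v1.s[n], v0.s[n]` pairs in the affected tests fold away.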
This PR adds a new combine to the `post-legalizer-combiner` pass. The new combine checks for vectors being unmerged and subsequently padded with `G_IMPLICIT_DEF` values by building a new vector. If such a case is found, the vector being unmerged is instead just concatenated with a `G_IMPLICIT_DEF` that is as wide as the vector being unmerged.

This removes unnecessary `mov` instructions in a few places.