Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion llvm/lib/Target/AArch64/AArch64Combine.td
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,13 @@ def combine_mul_cmlt : GICombineRule<
(apply [{ applyCombineMulCMLT(*${root}, MRI, B, ${matchinfo}); }])
>;

// Match a G_BUILD_VECTOR whose leading operands are the defs of a single
// G_UNMERGE_VALUES (in definition order) and whose trailing operands are
// G_IMPLICIT_DEF, and rewrite it as G_CONCAT_VECTORS(src, undef).
// The match/apply logic lives in matchCombineBuildUnmerge /
// applyCombineBuildUnmerge in AArch64PostLegalizerCombiner.cpp.
def combine_build_unmerge : GICombineRule<
(defs root:$root, register_matchinfo:$unmergeSrc),
(match (G_BUILD_VECTOR $dst, GIVariadic<>:$unused):$root,
[{ return matchCombineBuildUnmerge(*${root}, MRI, ${unmergeSrc}); }]),
(apply [{ applyCombineBuildUnmerge(*${root}, MRI, B, ${unmergeSrc}); }])
>;

// Post-legalization combines which should happen at all optimization levels.
// (E.g. ones that facilitate matching for the selector) For example, matching
// pseudos.
Expand Down Expand Up @@ -366,7 +373,7 @@ def AArch64PostLegalizerCombiner
select_to_minmax, or_to_bsp, combine_concat_vector,
commute_constant_to_rhs, extract_vec_elt_combines,
push_freeze_to_prevent_poison_from_propagating,
combine_mul_cmlt, combine_use_vector_truncate,
combine_mul_cmlt, combine_use_vector_truncate, combine_build_unmerge,
extmultomull, truncsat_combines, lshr_of_trunc_of_lshr,
funnel_shift_from_or_shift_constants_are_legal]> {
}
120 changes: 119 additions & 1 deletion llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"
#include <set>

#define GET_GICOMBINER_DEPS
#include "AArch64GenPostLegalizeGICombiner.inc"
Expand Down Expand Up @@ -133,6 +134,123 @@ bool isZeroExtended(Register R, MachineRegisterInfo &MRI) {
return MRI.getVRegDef(R)->getOpcode() == TargetOpcode::G_ZEXT;
}

// This pattern aims to match the following shape to avoid extra mov
// instructions
// G_BUILD_VECTOR(
// G_UNMERGE_VALUES(src, 0)
// G_UNMERGE_VALUES(src, 1)
// G_IMPLICIT_DEF
// G_IMPLICIT_DEF
// )
// ->
// G_CONCAT_VECTORS(
// src,
// undef
// )
// This pattern aims to match the following shape to avoid extra mov
// instructions
// G_BUILD_VECTOR(
//     G_UNMERGE_VALUES(src, 0)
//     G_UNMERGE_VALUES(src, 1)
//     G_IMPLICIT_DEF
//     G_IMPLICIT_DEF
// )
// ->
// G_CONCAT_VECTORS(
//     src,
//     undef
// )
//
// On success, \p UnmergeSrc is set to the (single) source vector of the
// G_UNMERGE_VALUES feeding the build vector.
bool matchCombineBuildUnmerge(MachineInstr &MI, MachineRegisterInfo &MRI,
                              Register &UnmergeSrc) {
  assert(MI.getOpcode() == TargetOpcode::G_BUILD_VECTOR);

  unsigned UnmergeUseCount = 0;
  unsigned UndefInstrCount = 0;

  unsigned UnmergeEltCount = 0;
  unsigned UnmergeEltSize = 0;

  unsigned BuildOperandCount = MI.getNumOperands();
  bool EncounteredUndef = false;

  // The G_UNMERGE_VALUES feeding the first half of the build vector.
  // Initialized to nullptr so the pointer can never be read uninitialized;
  // the KnownRegs.size() == 1 guard below ensures it is set before the
  // dereference in the ordering check.
  MachineInstr *UnmergeInstr = nullptr;

  // Unique source registers of the G_UNMERGE_VALUES instructions seen so
  // far; the combine only fires when there is exactly one.
  std::set<unsigned> KnownRegs;

  for (auto &Use : MI.all_uses()) {
    MachineInstr *Def = getDefIgnoringCopies(Use.getReg(), MRI);
    if (!Def)
      return false;

    // Ensure that the unmerged values are consecutive and precede the undef
    // values: once an undef has been seen, no further operand may be an
    // unmerge use before we reach the half-way point of the build vector.
    if (EncounteredUndef && UnmergeUseCount < BuildOperandCount / 2)
      return false;

    switch (Def->getOpcode()) {
    default:
      return false;
    case TargetOpcode::G_IMPLICIT_DEF:
      ++UndefInstrCount;
      EncounteredUndef = true;
      break;
    case TargetOpcode::G_UNMERGE_VALUES: {
      ++UnmergeUseCount;

      UnmergeEltSize = MRI.getType(Use.getReg()).getScalarSizeInBits();
      UnmergeEltCount = Def->getNumDefs();
      // Only handle unmerges of a 64- or 128-bit source vector with at least
      // two elements.
      if (UnmergeEltCount < 2 || (UnmergeEltSize * UnmergeEltCount != 64 &&
                                  UnmergeEltSize * UnmergeEltCount != 128))
        return false;

      // G_UNMERGE_VALUES has exactly one source operand, which is its last
      // operand (all preceding operands are defs).
      Register Src = Def->getOperand(Def->getNumOperands() - 1).getReg();

      // Track unique sources for the G_UNMERGE_VALUES. Record the register
      // and instruction only for a newly-seen source; repeats are validated
      // by the ordering check below.
      if (KnownRegs.insert(Src.id()).second) {
        UnmergeSrc = Src;
        UnmergeInstr = Def;
      }
      break;
    }
    }
  }

  // Only want to match patterns that pad half of a vector with undefined. We
  // also want to ensure that these values come from a single unmerge and all
  // unmerged values are consumed.
  if (UndefInstrCount != UnmergeUseCount ||
      UnmergeEltCount != UnmergeUseCount || KnownRegs.size() != 1)
    return false;

  // Check the operands of the unmerge are used in the same order they are
  // defined. G_BUILD_VECTOR always defines one output, so its uses start at
  // operand index 1, while the unmerge's defs start at operand index 0.
  for (unsigned OperandIndex = 0; OperandIndex < UnmergeUseCount;
       ++OperandIndex) {
    if (MI.getOperand(OperandIndex + 1).getReg() !=
        UnmergeInstr->getOperand(OperandIndex).getReg())
      return false;
  }

  return true;
}

// Rewrite the matched G_BUILD_VECTOR as
// G_CONCAT_VECTORS(UnmergeSrc, undef), where the undef half has the same
// vector type as the unmerge source.
void applyCombineBuildUnmerge(MachineInstr &MI, MachineRegisterInfo &MRI,
                              MachineIRBuilder &B, Register &UnmergeSrc) {
  assert(UnmergeSrc && "Expected there to be one matching G_UNMERGE_VALUES");
  B.setInstrAndDebugLoc(MI);

  // Build the undef half directly from the source's type; the builder
  // allocates the result register for us.
  auto Undef = B.buildUndef(MRI.getType(UnmergeSrc));
  B.buildConcatVectors(MI.getOperand(0), {UnmergeSrc, Undef.getReg(0)});

  MI.eraseFromParent();
}

bool matchAArch64MulConstCombine(
MachineInstr &MI, MachineRegisterInfo &MRI,
std::function<void(MachineIRBuilder &B, Register DstReg)> &ApplyFn) {
Expand Down Expand Up @@ -890,4 +1008,4 @@ namespace llvm {
FunctionPass *createAArch64PostLegalizerCombiner(bool IsOptNone) {
return new AArch64PostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm
} // end namespace llvm
18 changes: 5 additions & 13 deletions llvm/test/CodeGen/AArch64/fptrunc.ll
Original file line number Diff line number Diff line change
Expand Up @@ -345,19 +345,11 @@ entry:
}

define <2 x half> @fptrunc_v2f32_v2f16(<2 x float> %a) {
; CHECK-SD-LABEL: fptrunc_v2f32_v2f16:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: fcvtn v0.4h, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptrunc_v2f32_v2f16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NEXT: fcvtn v0.4h, v1.4s
; CHECK-GI-NEXT: ret
; CHECK-LABEL: fptrunc_v2f32_v2f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: fcvtn v0.4h, v0.4s
; CHECK-NEXT: ret
entry:
%c = fptrunc <2 x float> %a to <2 x half>
ret <2 x half> %c
Expand Down
48 changes: 12 additions & 36 deletions llvm/test/CodeGen/AArch64/itofp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5763,18 +5763,14 @@ define <2 x half> @stofp_v2i64_v2f16(<2 x i64> %a) {
; CHECK-NOFP16-GI: // %bb.0: // %entry
; CHECK-NOFP16-GI-NEXT: scvtf v0.2d, v0.2d
; CHECK-NOFP16-GI-NEXT: fcvtn v0.2s, v0.2d
; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0]
; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1]
; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s
; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s
; CHECK-NOFP16-GI-NEXT: ret
;
; CHECK-FP16-GI-LABEL: stofp_v2i64_v2f16:
; CHECK-FP16-GI: // %bb.0: // %entry
; CHECK-FP16-GI-NEXT: scvtf v0.2d, v0.2d
; CHECK-FP16-GI-NEXT: fcvtn v0.2s, v0.2d
; CHECK-FP16-GI-NEXT: mov v1.s[0], v0.s[0]
; CHECK-FP16-GI-NEXT: mov v1.s[1], v0.s[1]
; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v1.4s
; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v0.4s
; CHECK-FP16-GI-NEXT: ret
entry:
%c = sitofp <2 x i64> %a to <2 x half>
Expand Down Expand Up @@ -5808,18 +5804,14 @@ define <2 x half> @utofp_v2i64_v2f16(<2 x i64> %a) {
; CHECK-NOFP16-GI: // %bb.0: // %entry
; CHECK-NOFP16-GI-NEXT: ucvtf v0.2d, v0.2d
; CHECK-NOFP16-GI-NEXT: fcvtn v0.2s, v0.2d
; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0]
; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1]
; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s
; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s
; CHECK-NOFP16-GI-NEXT: ret
;
; CHECK-FP16-GI-LABEL: utofp_v2i64_v2f16:
; CHECK-FP16-GI: // %bb.0: // %entry
; CHECK-FP16-GI-NEXT: ucvtf v0.2d, v0.2d
; CHECK-FP16-GI-NEXT: fcvtn v0.2s, v0.2d
; CHECK-FP16-GI-NEXT: mov v1.s[0], v0.s[0]
; CHECK-FP16-GI-NEXT: mov v1.s[1], v0.s[1]
; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v1.4s
; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v0.4s
; CHECK-FP16-GI-NEXT: ret
entry:
%c = uitofp <2 x i64> %a to <2 x half>
Expand Down Expand Up @@ -6232,17 +6224,13 @@ define <2 x half> @stofp_v2i32_v2f16(<2 x i32> %a) {
; CHECK-NOFP16-GI-LABEL: stofp_v2i32_v2f16:
; CHECK-NOFP16-GI: // %bb.0: // %entry
; CHECK-NOFP16-GI-NEXT: scvtf v0.2s, v0.2s
; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0]
; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1]
; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s
; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s
; CHECK-NOFP16-GI-NEXT: ret
;
; CHECK-FP16-GI-LABEL: stofp_v2i32_v2f16:
; CHECK-FP16-GI: // %bb.0: // %entry
; CHECK-FP16-GI-NEXT: scvtf v0.2s, v0.2s
; CHECK-FP16-GI-NEXT: mov v1.s[0], v0.s[0]
; CHECK-FP16-GI-NEXT: mov v1.s[1], v0.s[1]
; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v1.4s
; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v0.4s
; CHECK-FP16-GI-NEXT: ret
entry:
%c = sitofp <2 x i32> %a to <2 x half>
Expand All @@ -6267,17 +6255,13 @@ define <2 x half> @utofp_v2i32_v2f16(<2 x i32> %a) {
; CHECK-NOFP16-GI-LABEL: utofp_v2i32_v2f16:
; CHECK-NOFP16-GI: // %bb.0: // %entry
; CHECK-NOFP16-GI-NEXT: ucvtf v0.2s, v0.2s
; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0]
; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1]
; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s
; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s
; CHECK-NOFP16-GI-NEXT: ret
;
; CHECK-FP16-GI-LABEL: utofp_v2i32_v2f16:
; CHECK-FP16-GI: // %bb.0: // %entry
; CHECK-FP16-GI-NEXT: ucvtf v0.2s, v0.2s
; CHECK-FP16-GI-NEXT: mov v1.s[0], v0.s[0]
; CHECK-FP16-GI-NEXT: mov v1.s[1], v0.s[1]
; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v1.4s
; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v0.4s
; CHECK-FP16-GI-NEXT: ret
entry:
%c = uitofp <2 x i32> %a to <2 x half>
Expand Down Expand Up @@ -6480,9 +6464,7 @@ define <2 x half> @stofp_v2i16_v2f16(<2 x i16> %a) {
; CHECK-NOFP16-GI-NEXT: shl v0.2s, v0.2s, #16
; CHECK-NOFP16-GI-NEXT: sshr v0.2s, v0.2s, #16
; CHECK-NOFP16-GI-NEXT: scvtf v0.2s, v0.2s
; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0]
; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1]
; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s
; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s
; CHECK-NOFP16-GI-NEXT: ret
entry:
%c = sitofp <2 x i16> %a to <2 x half>
Expand All @@ -6509,9 +6491,7 @@ define <2 x half> @utofp_v2i16_v2f16(<2 x i16> %a) {
; CHECK-NOFP16-GI-NEXT: movi d1, #0x00ffff0000ffff
; CHECK-NOFP16-GI-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-NOFP16-GI-NEXT: ucvtf v0.2s, v0.2s
; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0]
; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1]
; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s
; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s
; CHECK-NOFP16-GI-NEXT: ret
entry:
%c = uitofp <2 x i16> %a to <2 x half>
Expand Down Expand Up @@ -6766,9 +6746,7 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) {
; CHECK-NOFP16-GI-NEXT: shl v0.2s, v0.2s, #24
; CHECK-NOFP16-GI-NEXT: sshr v0.2s, v0.2s, #24
; CHECK-NOFP16-GI-NEXT: scvtf v0.2s, v0.2s
; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0]
; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1]
; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s
; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s
; CHECK-NOFP16-GI-NEXT: ret
;
; CHECK-FP16-GI-LABEL: stofp_v2i8_v2f16:
Expand Down Expand Up @@ -6817,9 +6795,7 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) {
; CHECK-NOFP16-GI-NEXT: movi d1, #0x0000ff000000ff
; CHECK-NOFP16-GI-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-NOFP16-GI-NEXT: ucvtf v0.2s, v0.2s
; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0]
; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1]
; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s
; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s
; CHECK-NOFP16-GI-NEXT: ret
;
; CHECK-FP16-GI-LABEL: utofp_v2i8_v2f16:
Expand Down