Conversation

@dheaton-arm (Contributor)

This allows `FNEG` + `FMA` sequences to be combined into a single operation, with `FNML[A|S]`, `FNMAD`, or `FNMSB` selected depending on the operand order.
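For illustration, a minimal sketch of the pattern this combine targets, mirroring the new `sve-fmsub.ll` test (the function name here is illustrative):

```llvm
; The fneg on the addend folds into the FMA, selecting a single
; predicated instruction (FNMSB in this operand order).
define <vscale x 2 x double> @example(<vscale x 2 x double> %a, <vscale x 2 x double> %b, <vscale x 2 x double> %c) {
  %neg = fneg <vscale x 2 x double> %c
  %r = tail call <vscale x 2 x double> @llvm.fmuladd(<vscale x 2 x double> %a, <vscale x 2 x double> %b, <vscale x 2 x double> %neg)
  ; With this patch: fnmsb z0.d, p0/m, z1.d, z2.d
  ret <vscale x 2 x double> %r
}
```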
@llvmbot (Member) commented Nov 13, 2025

@llvm/pr-subscribers-backend-aarch64

Author: Damian Heaton (dheaton-arm)

Changes

This allows `FNEG` + `FMA` sequences to be combined into a single operation, with `FNML[A|S]`, `FNMAD`, or `FNMSB` selected depending on the operand order.


Full diff: https://github.com/llvm/llvm-project/pull/167900.diff

3 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+50)
  • (modified) llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td (+6-2)
  • (added) llvm/test/CodeGen/AArch64/sve-fmsub.ll (+276)
```diff
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c8a038fa99b30..d104e2e956a40 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1176,6 +1176,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::VECTOR_DEINTERLEAVE);
   setTargetDAGCombine(ISD::CTPOP);
 
+  setTargetDAGCombine(ISD::FMA);
+
   // In case of strict alignment, avoid an excessive number of byte wide stores.
   MaxStoresPerMemsetOptSize = 8;
   MaxStoresPerMemset =
@@ -20435,6 +20437,52 @@ static SDValue performFADDCombine(SDNode *N,
   return SDValue();
 }
 
+static SDValue performFMACombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const AArch64Subtarget *Subtarget) {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Op1 = N->getOperand(0);
+  SDValue Op2 = N->getOperand(1);
+  SDValue Op3 = N->getOperand(2);
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  // fma(a, b, neg(c)) -> fnmls(a, b, c)
+  // fma(neg(a), b, neg(c)) -> fnmla(a, b, c)
+  // fma(a, neg(b), neg(c)) -> fnmla(a, b, c)
+  if (VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+      (Subtarget->hasSVE() || Subtarget->hasSME())) {
+    if (Op3.getOpcode() == ISD::FNEG) {
+      unsigned int Opcode;
+      if (Op1.getOpcode() == ISD::FNEG) {
+        Op1 = Op1.getOperand(0);
+        Opcode = AArch64ISD::FNMLA_PRED;
+      } else if (Op2.getOpcode() == ISD::FNEG) {
+        Op2 = Op2.getOperand(0);
+        Opcode = AArch64ISD::FNMLA_PRED;
+      } else {
+        Opcode = AArch64ISD::FNMLS_PRED;
+      }
+      Op3 = Op3.getOperand(0);
+      auto Pg = getPredicateForVector(DAG, DL, VT);
+      if (VT.isFixedLengthVector()) {
+        assert(DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+               "Expected only legal fixed-width types");
+        EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+        Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
+        Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
+        Op3 = convertToScalableVector(DAG, ContainerVT, Op3);
+        auto ScalableRes =
+            DAG.getNode(Opcode, DL, ContainerVT, Pg, Op1, Op2, Op3);
+        return convertFromScalableVector(DAG, VT, ScalableRes);
+      }
+      return DAG.getNode(Opcode, DL, VT, Pg, Op1, Op2, Op3);
+    }
+  }
+
+  return SDValue();
+}
+
 static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
   switch (Opcode) {
   case ISD::STRICT_FADD:
@@ -27958,6 +28006,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performANDCombine(N, DCI);
   case ISD::FADD:
     return performFADDCombine(N, DCI);
+  case ISD::FMA:
+    return performFMACombine(N, DCI, Subtarget);
   case ISD::INTRINSIC_WO_CHAIN:
     return performIntrinsicCombine(N, DCI, Subtarget);
   case ISD::ANY_EXTEND:
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index e1f43867bbe5b..2f1e860cb8916 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -240,6 +240,8 @@ def AArch64udiv_p : SDNode<"AArch64ISD::UDIV_PRED", SDT_AArch64Arith>;
 def AArch64umax_p : SDNode<"AArch64ISD::UMAX_PRED", SDT_AArch64Arith>;
 def AArch64umin_p : SDNode<"AArch64ISD::UMIN_PRED", SDT_AArch64Arith>;
 def AArch64umulh_p : SDNode<"AArch64ISD::MULHU_PRED", SDT_AArch64Arith>;
+def AArch64fnmla_p_node : SDNode<"AArch64ISD::FNMLA_PRED", SDT_AArch64FMA>;
+def AArch64fnmls_p_node : SDNode<"AArch64ISD::FNMLS_PRED", SDT_AArch64FMA>;
 
 def AArch64fadd_p_contract : PatFrag<(ops node:$op1, node:$op2, node:$op3),
                                      (AArch64fadd_p node:$op1, node:$op2, node:$op3), [{
@@ -460,12 +462,14 @@ def AArch64fmlsidx : PatFrags<(ops node:$acc, node:$op1, node:$op2, node:$idx),
 
 def AArch64fnmla_p : PatFrags<(ops node:$pg, node:$za, node:$zn, node:$zm),
-                              [(int_aarch64_sve_fnmla_u node:$pg, node:$za, node:$zn, node:$zm),
+                              [(AArch64fnmla_p_node node:$pg, node:$zn, node:$zm, node:$za),
+                               (int_aarch64_sve_fnmla_u node:$pg, node:$za, node:$zn, node:$zm),
                                (AArch64fma_p node:$pg, (AArch64fneg_mt node:$pg, node:$zn, (undef)), node:$zm, (AArch64fneg_mt node:$pg, node:$za, (undef))),
                                (AArch64fneg_mt_nsz node:$pg, (AArch64fma_p node:$pg, node:$zn, node:$zm, node:$za), (undef))]>;
 
 def AArch64fnmls_p : PatFrags<(ops node:$pg, node:$za, node:$zn, node:$zm),
-                              [(int_aarch64_sve_fnmls_u node:$pg, node:$za, node:$zn, node:$zm),
+                              [(AArch64fnmls_p_node node:$pg, node:$zn, node:$zm, node:$za),
+                               (int_aarch64_sve_fnmls_u node:$pg, node:$za, node:$zn, node:$zm),
                                (AArch64fma_p node:$pg, node:$zn, node:$zm, (AArch64fneg_mt node:$pg, node:$za, (undef)))]>;
 
 def AArch64fsubr_p : PatFrag<(ops node:$pg, node:$op1, node:$op2),
diff --git a/llvm/test/CodeGen/AArch64/sve-fmsub.ll b/llvm/test/CodeGen/AArch64/sve-fmsub.ll
new file mode 100644
index 0000000000000..721066038769c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fmsub.ll
@@ -0,0 +1,276 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=aarch64 -mattr=+v9a,+sve2,+crypto,+bf16,+sm4,+i8mm,+sve2-bitperm,+sve2-sha3,+sve2-aes,+sve2-sm4 %s -o - | FileCheck %s --check-prefixes=CHECK
+
+define <vscale x 2 x double> @fmsub_nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, <vscale x 2 x double> %c) {
+; CHECK-LABEL: fmsub_nxv2f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fnmsb z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    ret
+entry:
+  %neg = fneg <vscale x 2 x double> %c
+  %0 = tail call <vscale x 2 x double> @llvm.fmuladd(<vscale x 2 x double> %a, <vscale x 2 x double> %b, <vscale x 2 x double> %neg)
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 4 x float> @fmsub_nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c) {
+; CHECK-LABEL: fmsub_nxv4f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fnmsb z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT:    ret
+entry:
+  %neg = fneg <vscale x 4 x float> %c
+  %0 = tail call <vscale x 4 x float> @llvm.fmuladd(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %neg)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 8 x half> @fmsub_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) {
+; CHECK-LABEL: fmsub_nxv8f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    fnmsb z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT:    ret
+entry:
+  %neg = fneg <vscale x 8 x half> %c
+  %0 = tail call <vscale x 8 x half> @llvm.fmuladd(<vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %neg)
+  ret <vscale x 8 x half> %0
+}
+
+define <2 x double> @fmsub_v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; CHECK-LABEL: fmsub_v2f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    // kill: def $q2 killed $q2 def $z2
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    fnmsb z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+entry:
+  %neg = fneg <2 x double> %c
+  %0 = tail call <2 x double> @llvm.fmuladd(<2 x double> %a, <2 x double> %b, <2 x double> %neg)
+  ret <2 x double> %0
+}
+
+define <4 x float> @fmsub_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: fmsub_v4f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    // kill: def $q2 killed $q2 def $z2
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    fnmsb z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+entry:
+  %neg = fneg <4 x float> %c
+  %0 = tail call <4 x float> @llvm.fmuladd(<4 x float> %a, <4 x float> %b, <4 x float> %neg)
+  ret <4 x float> %0
+}
+
+define <8 x half> @fmsub_v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
+; CHECK-LABEL: fmsub_v8f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    // kill: def $q2 killed $q2 def $z2
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    fnmsb z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+entry:
+  %neg = fneg <8 x half> %c
+  %0 = tail call <8 x half> @llvm.fmuladd(<8 x half> %a, <8 x half> %b, <8 x half> %neg)
+  ret <8 x half> %0
+}
+
+
+define <2 x double> @fmsub_flipped_v2f64(<2 x double> %c, <2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: fmsub_flipped_v2f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    // kill: def $q2 killed $q2 def $z2
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    fnmls z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+entry:
+  %neg = fneg <2 x double> %c
+  %0 = tail call <2 x double> @llvm.fmuladd(<2 x double> %a, <2 x double> %b, <2 x double> %neg)
+  ret <2 x double> %0
+}
+
+define <4 x float> @fmsub_flipped_v4f32(<4 x float> %c, <4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: fmsub_flipped_v4f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    // kill: def $q2 killed $q2 def $z2
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    fnmls z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+entry:
+  %neg = fneg <4 x float> %c
+  %0 = tail call <4 x float> @llvm.fmuladd(<4 x float> %a, <4 x float> %b, <4 x float> %neg)
+  ret <4 x float> %0
+}
+
+define <8 x half> @fmsub_flipped_v8f16(<8 x half> %c, <8 x half> %a, <8 x half> %b) {
+; CHECK-LABEL: fmsub_flipped_v8f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    // kill: def $q2 killed $q2 def $z2
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    fnmls z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+entry:
+  %neg = fneg <8 x half> %c
+  %0 = tail call <8 x half> @llvm.fmuladd(<8 x half> %a, <8 x half> %b, <8 x half> %neg)
+  ret <8 x half> %0
+}
+
+define <vscale x 2 x double> @fnmsub_nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, <vscale x 2 x double> %c) {
+; CHECK-LABEL: fnmsub_nxv2f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fnmad z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    ret
+entry:
+  %neg = fneg <vscale x 2 x double> %a
+  %neg1 = fneg <vscale x 2 x double> %c
+  %0 = tail call <vscale x 2 x double> @llvm.fmuladd(<vscale x 2 x double> %neg, <vscale x 2 x double> %b, <vscale x 2 x double> %neg1)
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 4 x float> @fnmsub_nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c) {
+; CHECK-LABEL: fnmsub_nxv4f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fnmad z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT:    ret
+entry:
+  %neg = fneg <vscale x 4 x float> %a
+  %neg1 = fneg <vscale x 4 x float> %c
+  %0 = tail call <vscale x 4 x float> @llvm.fmuladd(<vscale x 4 x float> %neg, <vscale x 4 x float> %b, <vscale x 4 x float> %neg1)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 8 x half> @fnmsub_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) {
+; CHECK-LABEL: fnmsub_nxv8f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    fnmad z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT:    ret
+entry:
+  %neg = fneg <vscale x 8 x half> %a
+  %neg1 = fneg <vscale x 8 x half> %c
+  %0 = tail call <vscale x 8 x half> @llvm.fmuladd(<vscale x 8 x half> %neg, <vscale x 8 x half> %b, <vscale x 8 x half> %neg1)
+  ret <vscale x 8 x half> %0
+}
+
+define <2 x double> @fnmsub_v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; CHECK-LABEL: fnmsub_v2f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    // kill: def $q2 killed $q2 def $z2
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    fnmad z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+entry:
+  %neg = fneg <2 x double> %a
+  %neg1 = fneg <2 x double> %c
+  %0 = tail call <2 x double> @llvm.fmuladd(<2 x double> %neg, <2 x double> %b, <2 x double> %neg1)
+  ret <2 x double> %0
+}
+
+define <4 x float> @fnmsub_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: fnmsub_v4f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    // kill: def $q2 killed $q2 def $z2
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    fnmad z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+entry:
+  %neg = fneg <4 x float> %a
+  %neg1 = fneg <4 x float> %c
+  %0 = tail call <4 x float> @llvm.fmuladd(<4 x float> %neg, <4 x float> %b, <4 x float> %neg1)
+  ret <4 x float> %0
+}
+
+define <8 x half> @fnmsub_v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
+; CHECK-LABEL: fnmsub_v8f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    // kill: def $q2 killed $q2 def $z2
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    fnmad z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+entry:
+  %neg = fneg <8 x half> %a
+  %neg1 = fneg <8 x half> %c
+  %0 = tail call <8 x half> @llvm.fmuladd(<8 x half> %neg, <8 x half> %b, <8 x half> %neg1)
+  ret <8 x half> %0
+}
+
+define <2 x double> @fnmsub_flipped_v2f64(<2 x double> %c, <2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: fnmsub_flipped_v2f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    // kill: def $q2 killed $q2 def $z2
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    fnmla z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+entry:
+  %neg = fneg <2 x double> %a
+  %neg1 = fneg <2 x double> %c
+  %0 = tail call <2 x double> @llvm.fmuladd(<2 x double> %neg, <2 x double> %b, <2 x double> %neg1)
+  ret <2 x double> %0
+}
+
+define <4 x float> @fnmsub_flipped_v4f32(<4 x float> %c, <4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: fnmsub_flipped_v4f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    // kill: def $q2 killed $q2 def $z2
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    fnmla z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+entry:
+  %neg = fneg <4 x float> %a
+  %neg1 = fneg <4 x float> %c
+  %0 = tail call <4 x float> @llvm.fmuladd(<4 x float> %neg, <4 x float> %b, <4 x float> %neg1)
+  ret <4 x float> %0
+}
+
+define <8 x half> @fnmsub_flipped_v8f16(<8 x half> %c, <8 x half> %a, <8 x half> %b) {
+; CHECK-LABEL: fnmsub_flipped_v8f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    // kill: def $q2 killed $q2 def $z2
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    fnmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+entry:
+  %neg = fneg <8 x half> %a
+  %neg1 = fneg <8 x half> %c
+  %0 = tail call <8 x half> @llvm.fmuladd(<8 x half> %neg, <8 x half> %b, <8 x half> %neg1)
+  ret <8 x half> %0
+}
```
@MacDue (Member) commented Nov 13, 2025

nit: I think the PR title should be prefixed with [AArch64]

@dheaton-arm changed the title from "Combine vector FNEG+FMA into FNML[A|S]" to "[AArch64] Combine vector FNEG+FMA into FNML[A|S]" on Nov 13, 2025
Review comment on llvm/test/CodeGen/AArch64/sve-fmsub.ll:

```llvm
@@ -0,0 +1,276 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=aarch64 -mattr=+v9a,+sve2,+crypto,+bf16,+sm4,+i8mm,+sve2-bitperm,+sve2-sha3,+sve2-aes,+sve2-sm4 %s -o - | FileCheck %s --check-prefixes=CHECK
```
Member: We normally only add the features needed for the test (I assume these are from a Clang-generated .ll file).
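For instance, the RUN line could likely be trimmed to something like this (a sketch, assuming plain `+sve` covers the `FNML[A|S]`/`FNMAD`/`FNMSB` patterns these tests exercise):

```llvm
; RUN: llc -mtriple=aarch64 -mattr=+sve %s -o - | FileCheck %s
```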

Review comment on llvm/lib/Target/AArch64/AArch64ISelLowering.cpp:

```cpp
  // fma(neg(a), b, neg(c)) -> fnmla(a, b, c)
  // fma(a, neg(b), neg(c)) -> fnmla(a, b, c)
  if (VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
      (Subtarget->hasSVE() || Subtarget->hasSME())) {
```
Member: I think this needs to be:

```diff
-      (Subtarget->hasSVE() || Subtarget->hasSME())) {
+      Subtarget->isSVEorStreamingSVEAvailable()) {
```

Otherwise, the backend will crash trying to lower this with just `-mattr=+sme` (it'd need to be a streaming function to use `FNMLS/A_PRED`).

```llvm
define <2 x double> @fmsub_v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
entry:
  %neg = fneg <2 x double> %c
  %0 = tail call <2 x double> @llvm.fmuladd(<2 x double> %a, <2 x double> %b, <2 x double> %neg)
  ret <2 x double> %0
}
```
```cpp
  // fma(a, b, neg(c)) -> fnmls(a, b, c)
  // fma(neg(a), b, neg(c)) -> fnmla(a, b, c)
  // fma(a, neg(b), neg(c)) -> fnmla(a, b, c)
  if (VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
```
@MacDue (Member) commented Nov 13, 2025:
nit: Maybe exit early if this does not hold, to avoid a level of indentation?
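A sketch of the early-exit shape being suggested, reusing the names from the patch (and folding in the `isSVEorStreamingSVEAvailable()` suggestion from above):

```cpp
// Bail out before doing any work; the combine body then sits one
// indentation level shallower.
if (!VT.isVector() || !DAG.getTargetLoweringInfo().isTypeLegal(VT) ||
    !Subtarget->isSVEorStreamingSVEAvailable())
  return SDValue();
if (Op3.getOpcode() != ISD::FNEG)
  return SDValue();
// ...rest of the combine...
```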

Comment on lines +20469 to +20470
```cpp
        assert(DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
               "Expected only legal fixed-width types");
```
Member: nit: This is already checked in the main `if` condition, so we probably can remove the assert.
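With the assert dropped, the fixed-length path from the patch would reduce to something like:

```cpp
if (VT.isFixedLengthVector()) {
  // Legality is already guaranteed by the enclosing condition.
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
  Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
  Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
  Op3 = convertToScalableVector(DAG, ContainerVT, Op3);
  auto ScalableRes = DAG.getNode(Opcode, DL, ContainerVT, Pg, Op1, Op2, Op3);
  return convertFromScalableVector(DAG, VT, ScalableRes);
}
```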

Comment on lines +20444 to +20446
```cpp
  SDValue Op1 = N->getOperand(0);
  SDValue Op2 = N->getOperand(1);
  SDValue Op3 = N->getOperand(2);
```
Member: Maybe rename these to `OpA`/`OpB`/`OpC` to be consistent with the examples below?
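One way that rename might look (a sketch; the letters track the `fma(a, b, neg(c))` comments in the patch):

```cpp
SDValue OpA = N->getOperand(0); // 'a' in fma(a, b, neg(c))
SDValue OpB = N->getOperand(1); // 'b', the other multiplicand
SDValue OpC = N->getOperand(2); // 'c', the (possibly negated) addend
```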
