@@ -901,6 +901,200 @@ unsigned getCmpOperandFoldingProfit(Register CmpOp, MachineRegisterInfo &MRI) {
901901 return 0 ;
902902}
903903
904+ // Helper function for matchFpTruncFpTrunc.
905+ // Checks that the given definition belongs to an FPTRUNC and that the source is
906+ // not an integer, as no rounding is necessary due to the range of values
907+ bool checkTruncSrc (MachineRegisterInfo &MRI, MachineInstr *MaybeFpTrunc) {
908+ if (!MaybeFpTrunc || MaybeFpTrunc->getOpcode () != TargetOpcode::G_FPTRUNC)
909+ return false ;
910+
911+ // Check the source is 64 bits as we only want to match a very specific
912+ // pattern
913+ Register FpTruncSrc = MaybeFpTrunc->getOperand (1 ).getReg ();
914+ LLT SrcTy = MRI.getType (FpTruncSrc);
915+ if (SrcTy.getScalarSizeInBits () != 64 )
916+ return false ;
917+
918+ // Need to check the float didn't come from an int as no rounding is
919+ // neccessary
920+ MachineInstr *FpTruncSrcDef = getDefIgnoringCopies (FpTruncSrc, MRI);
921+ if (FpTruncSrcDef->getOpcode () == TargetOpcode::G_SITOFP ||
922+ FpTruncSrcDef->getOpcode () == TargetOpcode::G_UITOFP)
923+ return false ;
924+
925+ return true ;
926+ }
927+
928+ // To avoid double rounding issues we need to lower FPTRUNC(FPTRUNC) to an odd
929+ // rounding truncate and a normal truncate. When
930+ // truncating an FP that came from an integer this is not a problem as the range
931+ // of values is lower in the int
932+ bool matchFpTruncFpTrunc (MachineInstr &MI, MachineRegisterInfo &MRI) {
933+ if (MI.getOpcode () != TargetOpcode::G_FPTRUNC)
934+ return false ;
935+
936+ // Check the destination is 16 bits as we only want to match a very specific
937+ // pattern
938+ Register Dst = MI.getOperand (0 ).getReg ();
939+ LLT DstTy = MRI.getType (Dst);
940+ if (DstTy.getScalarSizeInBits () != 16 )
941+ return false ;
942+
943+ Register Src = MI.getOperand (1 ).getReg ();
944+
945+ MachineInstr *ParentDef = getDefIgnoringCopies (Src, MRI);
946+ if (!ParentDef)
947+ return false ;
948+
949+ MachineInstr *FpTruncDef;
950+ switch (ParentDef->getOpcode ()) {
951+ default :
952+ return false ;
953+ case TargetOpcode::G_CONCAT_VECTORS: {
954+ // Expecting exactly two FPTRUNCs
955+ if (ParentDef->getNumOperands () != 3 )
956+ return false ;
957+
958+ // All operands need to be FPTRUNC
959+ for (unsigned OpIdx = 1 , NumOperands = ParentDef->getNumOperands ();
960+ OpIdx != NumOperands; ++OpIdx) {
961+ Register FpTruncDst = ParentDef->getOperand (OpIdx).getReg ();
962+
963+ FpTruncDef = getDefIgnoringCopies (FpTruncDst, MRI);
964+
965+ if (!checkTruncSrc (MRI, FpTruncDef))
966+ return false ;
967+ }
968+
969+ return true ;
970+ }
971+ // This is to match cases in which vectors are widened to a larger size
972+ case TargetOpcode::G_INSERT_VECTOR_ELT: {
973+ Register VecExtractDst = ParentDef->getOperand (2 ).getReg ();
974+ MachineInstr *VecExtractDef = getDefIgnoringCopies (VecExtractDst, MRI);
975+
976+ Register FpTruncDst = VecExtractDef->getOperand (1 ).getReg ();
977+ FpTruncDef = getDefIgnoringCopies (FpTruncDst, MRI);
978+
979+ if (!checkTruncSrc (MRI, FpTruncDef))
980+ return false ;
981+ break ;
982+ }
983+ case TargetOpcode::G_FPTRUNC: {
984+ Register FpTruncDst = ParentDef->getOperand (1 ).getReg ();
985+ FpTruncDef = getDefIgnoringCopies (FpTruncDst, MRI);
986+
987+ if (!checkTruncSrc (MRI, FpTruncDef))
988+ return false ;
989+ break ;
990+ }
991+ }
992+
993+ return true ;
994+ }
995+
996+ void applyFpTruncFpTrunc (MachineInstr &MI, MachineRegisterInfo &MRI,
997+ MachineIRBuilder &B) {
998+ Register Dst = MI.getOperand (0 ).getReg ();
999+ Register Src = MI.getOperand (1 ).getReg ();
1000+
1001+ LLT V2F32 = LLT::fixed_vector (2 , LLT::scalar (32 ));
1002+ LLT V4F32 = LLT::fixed_vector (4 , LLT::scalar (32 ));
1003+ LLT V4F16 = LLT::fixed_vector (4 , LLT::scalar (16 ));
1004+
1005+ B.setInstrAndDebugLoc (MI);
1006+
1007+ MachineInstr *ParentDef = getDefIgnoringCopies (Src, MRI);
1008+ if (!ParentDef)
1009+ return ;
1010+
1011+ switch (ParentDef->getOpcode ()) {
1012+ default :
1013+ return ;
1014+ case TargetOpcode::G_INSERT_VECTOR_ELT: {
1015+ Register VecExtractDst = ParentDef->getOperand (2 ).getReg ();
1016+ MachineInstr *VecExtractDef = getDefIgnoringCopies (VecExtractDst, MRI);
1017+
1018+ Register FpTruncDst = VecExtractDef->getOperand (1 ).getReg ();
1019+ MachineInstr *FpTruncDef = getDefIgnoringCopies (FpTruncDst, MRI);
1020+
1021+ Register FpTruncSrc = FpTruncDef->getOperand (1 ).getReg ();
1022+ MRI.setRegClass (FpTruncSrc, &AArch64::FPR128RegClass);
1023+
1024+ Register Fp32 = MRI.createGenericVirtualRegister (V2F32);
1025+ MRI.setRegClass (Fp32, &AArch64::FPR64RegClass);
1026+
1027+ B.buildInstr (AArch64::FCVTXNv2f32, {Fp32}, {FpTruncSrc});
1028+
1029+ // Only 4f32 -> 4f16 is legal so we need to mimic that situation
1030+ Register Fp32Padding = B.buildUndef (V2F32).getReg (0 );
1031+ MRI.setRegClass (Fp32Padding, &AArch64::FPR64RegClass);
1032+
1033+ Register Fp32Full = MRI.createGenericVirtualRegister (V4F32);
1034+ MRI.setRegClass (Fp32Full, &AArch64::FPR128RegClass);
1035+ B.buildConcatVectors (Fp32Full, {Fp32, Fp32Padding});
1036+
1037+ Register Fp16 = MRI.createGenericVirtualRegister (V4F16);
1038+ MRI.setRegClass (Fp16, &AArch64::FPR64RegClass);
1039+ B.buildFPTrunc (Fp16, Fp32Full);
1040+
1041+ MRI.replaceRegWith (Dst, Fp16);
1042+ MI.eraseFromParent ();
1043+ break ;
1044+ }
1045+ case TargetOpcode::G_CONCAT_VECTORS: {
1046+ // Get the two FP Truncs that are being concatenated
1047+ Register FpTrunc1Dst = ParentDef->getOperand (1 ).getReg ();
1048+ Register FpTrunc2Dst = ParentDef->getOperand (2 ).getReg ();
1049+
1050+ MachineInstr *FpTrunc1Def = getDefIgnoringCopies (FpTrunc1Dst, MRI);
1051+ MachineInstr *FpTrunc2Def = getDefIgnoringCopies (FpTrunc2Dst, MRI);
1052+
1053+ // Make the registers 128bit to store the 2 doubles
1054+ Register LoFp64 = FpTrunc1Def->getOperand (1 ).getReg ();
1055+ MRI.setRegClass (LoFp64, &AArch64::FPR128RegClass);
1056+ Register HiFp64 = FpTrunc2Def->getOperand (1 ).getReg ();
1057+ MRI.setRegClass (HiFp64, &AArch64::FPR128RegClass);
1058+
1059+ B.setInstrAndDebugLoc (MI);
1060+
1061+ // Convert the lower half
1062+ Register LoFp32 = MRI.createGenericVirtualRegister (V2F32);
1063+ MRI.setRegClass (LoFp32, &AArch64::FPR64RegClass);
1064+ B.buildInstr (AArch64::FCVTXNv2f32, {LoFp32}, {LoFp64});
1065+
1066+ // Create a register for the high half to use
1067+ Register AccUndef = MRI.createGenericVirtualRegister (V4F32);
1068+ MRI.setRegClass (AccUndef, &AArch64::FPR128RegClass);
1069+ B.buildUndef (AccUndef);
1070+
1071+ Register Acc = MRI.createGenericVirtualRegister (V4F32);
1072+ MRI.setRegClass (Acc, &AArch64::FPR128RegClass);
1073+ B.buildInstr (TargetOpcode::INSERT_SUBREG)
1074+ .addDef (Acc)
1075+ .addUse (AccUndef)
1076+ .addUse (LoFp32)
1077+ .addImm (AArch64::dsub);
1078+
1079+ // Convert the high half
1080+ Register AccOut = MRI.createGenericVirtualRegister (V4F32);
1081+ MRI.setRegClass (AccOut, &AArch64::FPR128RegClass);
1082+ B.buildInstr (AArch64::FCVTXNv4f32)
1083+ .addDef (AccOut)
1084+ .addUse (Acc)
1085+ .addUse (HiFp64);
1086+
1087+ Register Fp16 = MRI.createGenericVirtualRegister (V4F16);
1088+ MRI.setRegClass (Fp16, &AArch64::FPR64RegClass);
1089+ B.buildFPTrunc (Fp16, AccOut);
1090+
1091+ MRI.replaceRegWith (Dst, Fp16);
1092+ MI.eraseFromParent ();
1093+ break ;
1094+ }
1095+ }
1096+ }
1097+
9041098// / \returns true if it would be profitable to swap the LHS and RHS of a G_ICMP
9051099// / instruction \p MI.
9061100bool trySwapICmpOperands (MachineInstr &MI, MachineRegisterInfo &MRI) {
0 commit comments