@@ -790,15 +790,250 @@ entry:
   ret i16 %result
 }
 
-declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
-
-
-declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
-declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>)
-declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>)
-declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
-declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32 immarg, <8 x i1>, <8 x i16>)
-declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32 immarg, <8 x i1>)
-declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
-declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32 immarg, <16 x i1>, <16 x i8>)
-declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32 immarg, <16 x i1>)
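+; The tests below check multiply-high patterns where the second operand is
+; already double-width and its high half is extracted by a shift. Signed
+; v4i32 case: each lane is lowered to a scalar smmul (no vector vmulh.s32
+; is formed).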
+define arm_aapcs_vfpcc <4 x i32> @vmulhs_kb_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
+; CHECK-LABEL: vmulhs_kb_v4i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.f32 s4, s2
+; CHECK-NEXT:    vmov r1, s9
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vmov.f32 s6, s3
+; CHECK-NEXT:    vmov.f32 s10, s1
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    smmul r0, r0, r1
+; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    smmul r1, r1, r2
+; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov r1, s11
+; CHECK-NEXT:    smmul r0, r0, r1
+; CHECK-NEXT:    vmov r1, s10
+; CHECK-NEXT:    smmul r1, r1, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %s0s = sext <4 x i32> %s0 to <4 x i64>
+  %s1s = ashr <4 x i64> %s1, <i64 32, i64 32, i64 32, i64 32>
+  %m = mul <4 x i64> %s0s, %s1s
+  %s = ashr <4 x i64> %m, <i64 32, i64 32, i64 32, i64 32>
+  %s2 = trunc <4 x i64> %s to <4 x i32>
+  ret <4 x i32> %s2
+}
+
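+; Unsigned variant of the above: each lane lowers to a umull, and only the
+; high result register of each umull is inserted back into q0.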
+define arm_aapcs_vfpcc <4 x i32> @vmulhu_kb_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
+; CHECK-LABEL: vmulhu_kb_v4i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.f32 s4, s2
+; CHECK-NEXT:    vmov r1, s9
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vmov.f32 s6, s3
+; CHECK-NEXT:    vmov.f32 s10, s1
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    umull r0, r1, r0, r1
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    umull r0, r2, r0, r2
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
+; CHECK-NEXT:    vmov r1, s11
+; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    umull r0, r1, r0, r1
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    umull r0, r2, r0, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r2, r1
+; CHECK-NEXT:    bx lr
+entry:
+  %s0s = zext <4 x i32> %s0 to <4 x i64>
+  %s1s = lshr <4 x i64> %s1, <i64 32, i64 32, i64 32, i64 32>
+  %m = mul <4 x i64> %s0s, %s1s
+  %s = lshr <4 x i64> %m, <i64 32, i64 32, i64 32, i64 32>
+  %s2 = trunc <4 x i64> %s to <4 x i32>
+  ret <4 x i32> %s2
+}
+
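+; As vmulhs_kb_v4i32 but with the mul operands commuted (%s1s * %s0s); the
+; smmul operands are swapped accordingly.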
+define arm_aapcs_vfpcc <4 x i32> @vmulhs_kbc_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
+; CHECK-LABEL: vmulhs_kbc_v4i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.f32 s4, s2
+; CHECK-NEXT:    vmov r1, s9
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vmov.f32 s6, s3
+; CHECK-NEXT:    vmov.f32 s10, s1
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    smmul r0, r1, r0
+; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    smmul r1, r2, r1
+; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov r1, s11
+; CHECK-NEXT:    smmul r0, r1, r0
+; CHECK-NEXT:    vmov r1, s10
+; CHECK-NEXT:    smmul r1, r2, r1
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %s0s = sext <4 x i32> %s0 to <4 x i64>
+  %s1s = ashr <4 x i64> %s1, <i64 32, i64 32, i64 32, i64 32>
+  %m = mul <4 x i64> %s1s, %s0s
+  %s = ashr <4 x i64> %m, <i64 32, i64 32, i64 32, i64 32>
+  %s2 = trunc <4 x i64> %s to <4 x i32>
+  ret <4 x i32> %s2
+}
+
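+; As vmulhu_kb_v4i32 but with the mul operands commuted (%s1s * %s0s).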
+define arm_aapcs_vfpcc <4 x i32> @vmulhu_kbc_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
+; CHECK-LABEL: vmulhu_kbc_v4i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.f32 s4, s2
+; CHECK-NEXT:    vmov r1, s9
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vmov.f32 s6, s3
+; CHECK-NEXT:    vmov.f32 s10, s1
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    umull r0, r1, r1, r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    umull r0, r2, r2, r0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
+; CHECK-NEXT:    vmov r1, s11
+; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    umull r0, r1, r1, r0
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    umull r0, r2, r2, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r2, r1
+; CHECK-NEXT:    bx lr
+entry:
+  %s0s = zext <4 x i32> %s0 to <4 x i64>
+  %s1s = lshr <4 x i64> %s1, <i64 32, i64 32, i64 32, i64 32>
+  %m = mul <4 x i64> %s1s, %s0s
+  %s = lshr <4 x i64> %m, <i64 32, i64 32, i64 32, i64 32>
+  %s2 = trunc <4 x i64> %s to <4 x i32>
+  ret <4 x i32> %s2
+}
+
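+; Signed v8i16 case: the lanes of %s0 are widened with vmovlb.s16/vmovlt.s16,
+; multiplied as i32 against the shifted-down high halves, then shifted and
+; narrowed back with vmovnt.i32 (no vmulh.s16 is formed).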
+define arm_aapcs_vfpcc <8 x i16> @vmulhs_kb_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
+; CHECK-LABEL: vmulhs_kb_v8i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vmov.f32 s12, s5
+; CHECK-NEXT:    vmovlt.s16 q4, q0
+; CHECK-NEXT:    vmov.f32 s13, s7
+; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    vmov.f32 s5, s6
+; CHECK-NEXT:    vmov.f32 s14, s9
+; CHECK-NEXT:    vmov.f32 s15, s11
+; CHECK-NEXT:    vmov.f32 s6, s8
+; CHECK-NEXT:    vshr.s32 q3, q3, #16
+; CHECK-NEXT:    vmov.f32 s7, s10
+; CHECK-NEXT:    vmul.i32 q3, q4, q3
+; CHECK-NEXT:    vshr.s32 q1, q1, #16
+; CHECK-NEXT:    vshr.u32 q3, q3, #16
+; CHECK-NEXT:    vmul.i32 q0, q0, q1
+; CHECK-NEXT:    vshr.u32 q0, q0, #16
+; CHECK-NEXT:    vmovnt.i32 q0, q3
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %s0s = sext <8 x i16> %s0 to <8 x i32>
+  %s1s = ashr <8 x i32> %s1, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %m = mul <8 x i32> %s0s, %s1s
+  %s = ashr <8 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %s2 = trunc <8 x i32> %s to <8 x i16>
+  ret <8 x i16> %s2
+}
+
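+; Unsigned v8i16 variant: same structure as above, using vmovlb.u16/vmovlt.u16
+; and logical shifts.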
+define arm_aapcs_vfpcc <8 x i16> @vmulhu_kb_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
+; CHECK-LABEL: vmulhu_kb_v8i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vmov.f32 s12, s5
+; CHECK-NEXT:    vmovlt.u16 q4, q0
+; CHECK-NEXT:    vmov.f32 s13, s7
+; CHECK-NEXT:    vmovlb.u16 q0, q0
+; CHECK-NEXT:    vmov.f32 s5, s6
+; CHECK-NEXT:    vmov.f32 s14, s9
+; CHECK-NEXT:    vmov.f32 s15, s11
+; CHECK-NEXT:    vmov.f32 s6, s8
+; CHECK-NEXT:    vshr.u32 q3, q3, #16
+; CHECK-NEXT:    vmov.f32 s7, s10
+; CHECK-NEXT:    vmul.i32 q3, q4, q3
+; CHECK-NEXT:    vshr.u32 q1, q1, #16
+; CHECK-NEXT:    vshr.u32 q3, q3, #16
+; CHECK-NEXT:    vmul.i32 q0, q0, q1
+; CHECK-NEXT:    vshr.u32 q0, q0, #16
+; CHECK-NEXT:    vmovnt.i32 q0, q3
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %s0s = zext <8 x i16> %s0 to <8 x i32>
+  %s1s = lshr <8 x i32> %s1, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %m = mul <8 x i32> %s0s, %s1s
+  %s = lshr <8 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %s2 = trunc <8 x i32> %s to <8 x i16>
+  ret <8 x i16> %s2
+}
+
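+; As vmulhs_kb_v8i16 but with the mul operands commuted (%s1s * %s0s).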
+define arm_aapcs_vfpcc <8 x i16> @vmulhs_kbc_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
+; CHECK-LABEL: vmulhs_kbc_v8i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vmov.f32 s12, s5
+; CHECK-NEXT:    vmovlt.s16 q4, q0
+; CHECK-NEXT:    vmov.f32 s13, s7
+; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    vmov.f32 s5, s6
+; CHECK-NEXT:    vmov.f32 s14, s9
+; CHECK-NEXT:    vmov.f32 s15, s11
+; CHECK-NEXT:    vmov.f32 s6, s8
+; CHECK-NEXT:    vshr.s32 q3, q3, #16
+; CHECK-NEXT:    vmov.f32 s7, s10
+; CHECK-NEXT:    vmul.i32 q3, q3, q4
+; CHECK-NEXT:    vshr.s32 q1, q1, #16
+; CHECK-NEXT:    vshr.u32 q3, q3, #16
+; CHECK-NEXT:    vmul.i32 q0, q1, q0
+; CHECK-NEXT:    vshr.u32 q0, q0, #16
+; CHECK-NEXT:    vmovnt.i32 q0, q3
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %s0s = sext <8 x i16> %s0 to <8 x i32>
+  %s1s = ashr <8 x i32> %s1, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %m = mul <8 x i32> %s1s, %s0s
+  %s = ashr <8 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %s2 = trunc <8 x i32> %s to <8 x i16>
+  ret <8 x i16> %s2
+}
+
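+; As vmulhu_kb_v8i16 but with the mul operands commuted (%s1s * %s0s).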
+define arm_aapcs_vfpcc <8 x i16> @vmulhu_kbc_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
+; CHECK-LABEL: vmulhu_kbc_v8i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vmov.f32 s12, s5
+; CHECK-NEXT:    vmovlt.u16 q4, q0
+; CHECK-NEXT:    vmov.f32 s13, s7
+; CHECK-NEXT:    vmovlb.u16 q0, q0
+; CHECK-NEXT:    vmov.f32 s5, s6
+; CHECK-NEXT:    vmov.f32 s14, s9
+; CHECK-NEXT:    vmov.f32 s15, s11
+; CHECK-NEXT:    vmov.f32 s6, s8
+; CHECK-NEXT:    vshr.u32 q3, q3, #16
+; CHECK-NEXT:    vmov.f32 s7, s10
+; CHECK-NEXT:    vmul.i32 q3, q3, q4
+; CHECK-NEXT:    vshr.u32 q1, q1, #16
+; CHECK-NEXT:    vshr.u32 q3, q3, #16
+; CHECK-NEXT:    vmul.i32 q0, q1, q0
+; CHECK-NEXT:    vshr.u32 q0, q0, #16
+; CHECK-NEXT:    vmovnt.i32 q0, q3
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %s0s = zext <8 x i16> %s0 to <8 x i32>
+  %s1s = lshr <8 x i32> %s1, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %m = mul <8 x i32> %s1s, %s0s
+  %s = lshr <8 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %s2 = trunc <8 x i32> %s to <8 x i16>
+  ret <8 x i16> %s2
+}