-
Couldn't load subscription status.
- Fork 15k
Open
Labels
Description
With LLVM 15, the `vector.body` block produced by the loop vectorizer contained only one store.
With LLVM 16, however, it contains three stores.
(LLVM 17 behaves the same way as LLVM 16.)
We believe that this change affects performance.
This started happening with the following commit:
078899c [SimplifyCFG] Allow SimplifyCFG hoisting to skip over non-matching instructions
This is related to the fact that SimplifyCFG, which runs before loop-vectorize, no longer merges the stores in the individual if/else branches of the loop into a single shared store.
- Input code (modified from TSVC s441 function)
[ sample.c ]
/* Global arrays read and written by the reproducer (modified from the TSVC s441 function). */
double a[20], b[20], c[20], d[20];

/*
 * For each i in [0, 20), adds one of three products to a[i], selected by
 * comparing d[i] with zero:
 *   d[i] <  0 : a[i] += b[i] * c[i]
 *   d[i] == 0 : a[i] += b[i] * b[i]
 *   otherwise : a[i] += c[i] * c[i]
 * Every branch ends in its own store to a[i].
 */
void func() {
    for (int i = 0; i < 20; i++) {
        if (d[i] < (double)0.) {
            a[i] += b[i] * c[i];
        } else if (d[i] == (double)0.) {
            a[i] += b[i] * b[i];
        } else {
            a[i] += c[i] * c[i];
        }
    }
}
- compile options
clang --target=aarch64-linux-gnu -mcpu=a64fx -Ofast -S -c -o /dev/null sample.c -fno-unroll-loops -msve-vector-bits=512
- vector.body (LLVM16)
vector.body: ; preds = %vector.body, %vector.ph %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] %4 = add i64 %index, 0 %5 = getelementptr inbounds [20 x double], ptr @d, i64 0, i64 %4 %6 = getelementptr inbounds double, ptr %5, i32 0 %wide.load = load <vscale x 2 x double>, ptr %6, align 8, !tbaa !6 %7 = fcmp fast olt <vscale x 2 x double> %wide.load, zeroinitializer %8 = fcmp fast oeq <vscale x 2 x double> %wide.load, zeroinitializer %9 = getelementptr [20 x double], ptr @a, i64 0, i64 %4 %10 = getelementptr double, ptr %9, i32 0 %wide.load44 = load <vscale x 2 x double>, ptr %10, align 8, !tbaa !6 %11 = getelementptr [20 x double], ptr @c, i64 0, i64 %4 %12 = getelementptr double, ptr %11, i32 0 %wide.load45 = load <vscale x 2 x double>, ptr %12, align 8, !tbaa !6 %13 = fmul fast <vscale x 2 x double> %wide.load45, %wide.load45 %14 = fadd fast <vscale x 2 x double> %wide.load44, %13 %15 = xor <vscale x 2 x i1> %7, shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer) %16 = xor <vscale x 2 x i1> %8, shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer) %17 = select <vscale x 2 x i1> %15, <vscale x 2 x i1> %16, <vscale x 2 x i1> zeroinitializer call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> %14, ptr %10, i32 8, <vscale x 2 x i1> %17), !tbaa !6 %18 = getelementptr [20 x double], ptr @b, i64 0, i64 %4 %19 = getelementptr double, ptr %18, i32 0 %wide.load46 = load <vscale x 2 x double>, ptr %19, align 8, !tbaa !6 %20 = fmul fast <vscale x 2 x double> %wide.load46, %wide.load46 %21 = fadd fast <vscale x 2 x double> %wide.load44, %20 %22 = select <vscale x 2 x i1> %15, <vscale x 2 x i1> %8, <vscale x 2 x i1> zeroinitializer call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> %21, ptr %10, i32 8, <vscale x 2 x i1> %22), !tbaa !6 
%wide.load47 = load <vscale x 2 x double>, ptr %19, align 8, !tbaa !6 %wide.load48 = load <vscale x 2 x double>, ptr %12, align 8, !tbaa !6 %23 = fmul fast <vscale x 2 x double> %wide.load48, %wide.load47 %wide.load49 = load <vscale x 2 x double>, ptr %10, align 8, !tbaa !6 %24 = fadd fast <vscale x 2 x double> %wide.load49, %23 call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> %24, ptr %10, i32 8, <vscale x 2 x i1> %7), !tbaa !6 %25 = call i64 @llvm.vscale.i64() %26 = mul i64 %25, 2 %index.next = add nuw i64 %index, %26 %27 = icmp eq i64 %index.next, %n.vec br i1 %27, label %middle.block, label %vector.body, !llvm.loop !10
- vector.body (LLVM15)
vector.body: ; preds = %vector.body, %vector.ph %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] %4 = add i64 %index, 0 %5 = getelementptr inbounds [20 x double], ptr @d, i64 0, i64 %4 %6 = getelementptr inbounds double, ptr %5, i32 0 %wide.load = load <vscale x 2 x double>, ptr %6, align 8, !tbaa !6 %7 = fcmp fast olt <vscale x 2 x double> %wide.load, zeroinitializer %8 = fcmp fast oeq <vscale x 2 x double> %wide.load, zeroinitializer %9 = getelementptr [20 x double], ptr @c, i64 0, i64 %4 %10 = getelementptr double, ptr %9, i32 0 %wide.load45 = load <vscale x 2 x double>, ptr %10, align 8, !tbaa !6 %11 = fmul fast <vscale x 2 x double> %wide.load45, %wide.load45 %12 = getelementptr [20 x double], ptr @b, i64 0, i64 %4 %13 = getelementptr double, ptr %12, i32 0 %wide.load46 = load <vscale x 2 x double>, ptr %13, align 8, !tbaa !6 %14 = fmul fast <vscale x 2 x double> %wide.load46, %wide.load46 %wide.load47 = load <vscale x 2 x double>, ptr %13, align 8, !tbaa !6 %wide.load48 = load <vscale x 2 x double>, ptr %10, align 8, !tbaa !6 %15 = fmul fast <vscale x 2 x double> %wide.load48, %wide.load47 %16 = xor <vscale x 2 x i1> %7, shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer) %17 = xor <vscale x 2 x i1> %8, shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer) %18 = select <vscale x 2 x i1> %16, <vscale x 2 x i1> %17, <vscale x 2 x i1> zeroinitializer %19 = select <vscale x 2 x i1> %16, <vscale x 2 x i1> %8, <vscale x 2 x i1> zeroinitializer %predphi = select <vscale x 2 x i1> %18, <vscale x 2 x double> %11, <vscale x 2 x double> %15 %predphi49 = select <vscale x 2 x i1> %19, <vscale x 2 x double> %14, <vscale x 2 x double> %predphi %20 = getelementptr inbounds [20 x double], ptr @a, i64 0, i64 %4 %21 = getelementptr inbounds double, ptr %20, i32 0 
%wide.load50 = load <vscale x 2 x double>, ptr %21, align 8, !tbaa !6 %22 = fadd fast <vscale x 2 x double> %wide.load50, %predphi49 store <vscale x 2 x double> %22, ptr %21, align 8, !tbaa !6 %23 = call i64 @llvm.vscale.i64() %24 = mul i64 %23, 2 %index.next = add nuw i64 %index, %24 %25 = icmp eq i64 %index.next, %n.vec br i1 %25, label %middle.block, label %vector.body, !llvm.loop !10