- Notifications
You must be signed in to change notification settings - Fork 13.9k
Description
I have experienced execution times more than 300% longer in specific functions that use loops together with indexing into slices. After several hours of work with a profiler, I was able to isolate the problem from a 60K-line codebase into the following short program:
use std::cmp;

/// Element-wise `destination[i] &= in1[i] < in2[i]` over the common prefix of
/// the three slices.
///
/// Only the first `min(in1.len(), in2.len(), destination.len())` elements are
/// read or written; any further elements of `destination` are left untouched.
/// Note that, despite the `gt` in the name, the comparison performed is `<`.
// NOTE(review): `#[inline(never)]` presumably keeps this function as a
// standalone symbol so its generated assembly can be inspected in isolation.
#[inline(never)]
pub fn cmp_gt_and(in1: &[i16], in2: &[i16], destination: &mut [bool]) {
    // Length of the shortest of the three slices.
    let max = cmp::min(cmp::min(in1.len(), in2.len()), destination.len());
    // Re-slice every operand to exactly `max` elements before the loop, so all
    // indexing below uses slices whose length equals the loop bound.
    let src1 = &in1[0..max];
    let src2 = &in2[0..max];
    let dst = &mut destination[0..max];
    for index in 0..max {
        // `&=` keeps an element `true` only if it was already `true` and the
        // comparison also holds.
        dst[index] &= src1[index] < src2[index];
    }
}

/// Benchmark driver: calls `cmp_gt_and` 100 million times on fixed inputs.
fn main() {
    let len = 100;
    // `1..len` excludes `len`, so `a` and `b` each hold 99 elements.
    let a: Vec<i16> = (1..len).collect();
    // `b` holds the values 99 down to 1.
    let b: Vec<i16> = (1..len).map(|x| len - x).collect();
    // `result` has `len` elements, one more than `a` and `b`, so its last
    // element is never touched. It starts all `false`, and `&=` never sets a
    // `false` element to `true`.
    let mut result = vec![false; len as usize];
    for _ in 0..100*1000*1000 {
        cmp_gt_and(&a, &b, &mut result);
    }
    // The exit status is derived from `b`, not from `result`.
    let sum: i32 = b.into_iter().map(|x| x as i32).sum();
    std::process::exit(sum);
}
Code is also available in the following repository
With Rust 1.44.0, I observe an execution time of around 1.7 seconds:
$ rustc --version --verbose; cargo build --release;time ./target/release/rust-perf-demo rustc 1.44.0 (49cae5576 2020-06-01) binary: rustc commit-hash: 49cae55760da0a43428eba73abcb659bb70cf2e4 commit-date: 2020-06-01 host: x86_64-unknown-linux-gnu release: 1.44.0 LLVM version: 9.0 Finished release [optimized] target(s) in 0.04s real 0m1.681s user 0m1.676s sys 0m0.004s
Rust versions 1.45.2 and the current stable 1.46.0 produce binaries that run for more than 6.0 seconds with the same source code:
$ rustc --version --verbose; cargo build --release;time ./target/release/rust-perf-demo rustc 1.45.2 (d3fb005a3 2020-07-31) binary: rustc commit-hash: d3fb005a39e62501b8b0b356166e515ae24e2e54 commit-date: 2020-07-31 host: x86_64-unknown-linux-gnu release: 1.45.2 LLVM version: 10.0 Finished release [optimized] target(s) in 0.05s real 0m6.643s user 0m6.643s sys 0m0.000s
$ rustc --version --verbose; cargo build --release;time ./target/release/rust-perf-demo rustc 1.46.0 (04488afe3 2020-08-24) binary: rustc commit-hash: 04488afe34512aa4c33566eb16d8c912a3ae04f9 commit-date: 2020-08-24 host: x86_64-unknown-linux-gnu release: 1.46.0 LLVM version: 10.0 Finished release [optimized] target(s) in 0.00s real 0m6.642s user 0m6.606s sys 0m0.012s
I use several more functions like cmp_gt_and in the core of an image-processing application, and they show a similar performance drop.
Has anything significantly changed between rustc 1.44 and 1.45 that may have impacted the code so significantly? Maybe LLVM 10 has a different behavior? Any thoughts on how to modify the code to regain the performance with the current compiler, or on other things to try in order to clarify the problem? For the time being, I can stick with 1.44 to keep the performance.
The function cmp_gt_and also appears to have much shorter assembly code with rustc 1.44 than with its successors; I am not sure whether that is the reason for the performance drop, though:
Rustc 1.44.0
_ZN14rust_perf_demo10cmp_gt_and17h7a0e5899b697a5eaE: .cfi_startproc pushrax .cfi_def_cfa_offset 16 movr10, rdi cmprsi, rcx movrdi, rsi cmovardi, rcx cmprdi, r9 cmovardi, r9 cmprdi, rsi ja.LBB8_10 cmprdi, rcx ja.LBB8_11 testrdi, rdi je.LBB8_9 cmprdi, 15 ja.LBB8_5 xorecx, ecx jmp.LBB8_8 .LBB8_5: movrcx, rdi andrcx, -16 xoresi, esi pxorxmm0, xmm0 .p2align4, 0x90 .LBB8_6: movdquxmm1, xmmword ptr [r10 + 2*rsi] movdquxmm2, xmmword ptr [r10 + 2*rsi + 16] movdquxmm3, xmmword ptr [rdx + 2*rsi] pcmpgtwxmm3, xmm1 movdquxmm1, xmmword ptr [rdx + 2*rsi + 16] pcmpgtwxmm1, xmm2 movqxmm2, qword ptr [r8 + rsi] punpcklbwxmm2, xmm0 movqxmm4, qword ptr [r8 + rsi + 8] punpcklbwxmm4, xmm0 pcmpeqwxmm2, xmm0 pandnxmm2, xmm3 pcmpeqwxmm4, xmm0 pandnxmm4, xmm1 psrlwxmm2, 15 packuswbxmm2, xmm0 psrlwxmm4, 15 packuswbxmm4, xmm0 movqqword ptr [r8 + rsi], xmm2 movqqword ptr [r8 + rsi + 8], xmm4 addrsi, 16 cmprcx, rsi jne.LBB8_6 cmprdi, rcx je.LBB8_9 .p2align4, 0x90 .LBB8_8: movzxesi, word ptr [r10 + 2*rcx] cmpsi, word ptr [rdx + 2*rcx] setl sil cmpbyte ptr [r8 + rcx], 0 setneal andal, sil movbyte ptr [r8 + rcx], al addrcx, 1 cmprcx, rdi jb.LBB8_8 .LBB8_9: poprax .cfi_def_cfa_offset 8 ret .LBB8_10: .cfi_def_cfa_offset 16 leardx, [rip + .L__unnamed_2] callqword ptr [rip + _ZN4core5slice20slice_index_len_fail17he661f5dd1689ef3bE@GOTPCREL] ud2 .LBB8_11: leardx, [rip + .L__unnamed_3] movrsi, rcx callqword ptr [rip + _ZN4core5slice20slice_index_len_fail17he661f5dd1689ef3bE@GOTPCREL] ud2 .Lfunc_end8: .size _ZN14rust_perf_demo10cmp_gt_and17h7a0e5899b697a5eaE, .Lfunc_end8-_ZN14rust_perf_demo10cmp_gt_and17h7a0e5899b697a5eaE .cfi_endproc .section.rodata.cst16,"aM",@progbits,16 .p2align4Rustc 1.45.2
_ZN14rust_perf_demo10cmp_gt_and17h39391ca255a87f0fE: .cfi_startproc pushrbx .cfi_def_cfa_offset 16 subrsp, 32 .cfi_def_cfa_offset 48 .cfi_offset rbx, -16 movr10, rdi cmprsi, rcx movrdi, rsi cmovardi, rcx cmprdi, r9 cmovardi, r9 cmprdi, rsi ja.LBB8_10 cmprdi, rcx ja.LBB8_11 testrdi, rdi je.LBB8_9 cmprdi, 15 ja.LBB8_5 xoresi, esi jmp.LBB8_8 .LBB8_5: movrsi, rdi andrsi, -16 xorr11d, r11d pxorxmm0, xmm0 pcmpeqdxmm1, xmm1 .p2align4, 0x90 .LBB8_6: movdquxmm2, xmmword ptr [r10 + 2*r11] movdquxmm3, xmmword ptr [r10 + 2*r11 + 16] movdquxmm4, xmmword ptr [rdx + 2*r11] pcmpgtwxmm4, xmm2 movdquxmm2, xmmword ptr [rdx + 2*r11 + 16] pcmpgtwxmm2, xmm3 movqxmm5, qword ptr [r8 + r11] movqxmm3, qword ptr [r8 + r11 + 8] pcmpeqbxmm5, xmm0 pxorxmm5, xmm1 punpcklbwxmm5, xmm0 pandxmm5, xmm4 pcmpeqbxmm3, xmm0 pxorxmm3, xmm1 punpcklbwxmm3, xmm0 pandxmm3, xmm2 movdqaxmmword ptr [rsp], xmm5 movzxeax, byte ptr [rsp + 4] andal, 1 movzxr9d, al movzxeax, byte ptr [rsp + 6] andal, 1 movzxeax, al shleax, 8 oreax, r9d movzxecx, byte ptr [rsp] movzxr9d, byte ptr [rsp + 2] andcl, 1 movzxebx, cl andr9b, 1 movzxecx, r9b shlecx, 8 orecx, ebx movdxmm2, ecx pinsrwxmm2, eax, 1 movzxeax, byte ptr [rsp + 8] andal, 1 movzxeax, al movzxecx, byte ptr [rsp + 10] andcl, 1 movzxecx, cl shlecx, 8 orecx, eax pinsrwxmm2, ecx, 2 movzxeax, byte ptr [rsp + 12] andal, 1 movzxeax, al movzxecx, byte ptr [rsp + 14] andcl, 1 movzxecx, cl shlecx, 8 orecx, eax pinsrwxmm2, ecx, 3 movdqaxmmword ptr [rsp + 16], xmm3 movzxeax, byte ptr [rsp + 20] andal, 1 movzxeax, al movzxecx, byte ptr [rsp + 22] andcl, 1 movzxecx, cl shlecx, 8 orecx, eax movzxeax, byte ptr [rsp + 16] movzxebx, byte ptr [rsp + 18] andal, 1 movzxeax, al andbl, 1 movzxebx, bl shlebx, 8 orebx, eax movdxmm3, ebx pinsrwxmm3, ecx, 1 movzxeax, byte ptr [rsp + 24] andal, 1 movzxeax, al movzxecx, byte ptr [rsp + 26] andcl, 1 movzxecx, cl shlecx, 8 orecx, eax pinsrwxmm3, ecx, 2 movzxeax, byte ptr [rsp + 28] andal, 1 movzxeax, al movzxecx, byte ptr [rsp + 30] andcl, 1 
movzxecx, cl shlecx, 8 orecx, eax pinsrwxmm3, ecx, 3 movqqword ptr [r8 + r11], xmm2 movqqword ptr [r8 + r11 + 8], xmm3 addr11, 16 cmprsi, r11 jne.LBB8_6 cmprdi, rsi je.LBB8_9 .p2align4, 0x90 .LBB8_8: movzxeax, word ptr [r10 + 2*rsi] cmpax, word ptr [rdx + 2*rsi] setlal cmpbyte ptr [r8 + rsi], 0 setnecl andcl, al movbyte ptr [r8 + rsi], cl addrsi, 1 cmprsi, rdi jb.LBB8_8 .LBB8_9: addrsp, 32 .cfi_def_cfa_offset 16 poprbx .cfi_def_cfa_offset 8 ret .LBB8_10: .cfi_def_cfa_offset 48 leardx, [rip + .L__unnamed_2] callqword ptr [rip + _ZN4core5slice20slice_index_len_fail17h9254c9506d16ff21E@GOTPCREL] ud2 .LBB8_11: leardx, [rip + .L__unnamed_3] movrsi, rcx callqword ptr [rip + _ZN4core5slice20slice_index_len_fail17h9254c9506d16ff21E@GOTPCREL] ud2 .Lfunc_end8: .size _ZN14rust_perf_demo10cmp_gt_and17h39391ca255a87f0fE, .Lfunc_end8-_ZN14rust_perf_demo10cmp_gt_and17h39391ca255a87f0fE .cfi_endproc .section.rodata.cst16,"aM",@progbits,16 .p2align4