Skip to content

Conversation

@scottmcm
Copy link
Member

@scottmcm scottmcm commented Feb 5, 2018

Because the last item needs special handling, it seems that LLVM has trouble canonicalizing the loops in external iteration. With the override, it becomes obvious that the start==end case exits the loop (as opposed to the one after that exiting the loop in external iteration).

Demo adapted from #45222

#[no_mangle] pub fn foo3r(n: u64) -> u64 { let mut count = 0; (0..n).for_each(|_| { (0 ..= n).rev().for_each(|j| { count += j; }) }); count }
Current nightly ASM, 100 lines (https://play.rust-lang.org/?gist=f5674c702c6e2045c3aab5d03763e5f6&version=nightly&mode=release)
foo3r: pushq	%rbx .Lcfi0: .Lcfi1: testq	%rdi, %rdi je.LBB0_1 testb$1, %dil jne.LBB0_4 xorl	%eax, %eax xorl	%r8d, %r8d cmpq$1, %rdi jne.LBB0_11 jmp.LBB0_23 .LBB0_1: xorl	%eax, %eax popq	%rbx retq .LBB0_4: xorl	%r8d, %r8d movq$-1, %r9 xorl	%eax, %eax movq%rdi, %r11 xorl	%r10d, %r10d jmp.LBB0_5 .LBB0_8: addq	%r11, %rax movq%rsi, %r11 movq%rdx, %r10 .LBB0_5: cmpq	%r11, %r10 movl$1, %ecx cmovbq	%r9, %rcx cmoveq	%r8, %rcx testq	%rcx, %rcx movl$0, %esi movl$1, %edx je.LBB0_8 cmpq	$-1, %rcx jne.LBB0_9 leaq-1(%r11), %rsi movq%r10, %rdx jmp.LBB0_8 .LBB0_9: movl$1, %r8d cmpq$1, %rdi je.LBB0_23 .LBB0_11: xorl	%r9d, %r9d movq$-1, %r10 .LBB0_12: movq%rdi, %rsi xorl	%r11d, %r11d jmp.LBB0_13 .LBB0_16: addq	%rsi, %rax movq%rcx, %rsi movq%rbx, %r11 .LBB0_13: cmpq	%rsi, %r11 movl$1, %edx cmovbq	%r10, %rdx cmoveq	%r9, %rdx testq	%rdx, %rdx movl$0, %ecx movl$1, %ebx je.LBB0_16 cmpq	$-1, %rdx jne.LBB0_17 leaq-1(%rsi), %rcx movq%r11, %rbx jmp.LBB0_16 .LBB0_17: movq%rdi, %rcx xorl	%r11d, %r11d jmp.LBB0_18 .LBB0_21: addq	%rcx, %rax movq%rsi, %rcx movq%rbx, %r11 .LBB0_18: cmpq	%rcx, %r11 movl$1, %edx cmovbq	%r10, %rdx cmoveq	%r9, %rdx testq	%rdx, %rdx movl$0, %esi movl$1, %ebx je.LBB0_21 cmpq	$-1, %rdx jne.LBB0_22 leaq-1(%rcx), %rsi movq%r11, %rbx jmp.LBB0_21 .LBB0_22: addq$2, %r8 cmpq	%rdi, %r8 jne.LBB0_12 .LBB0_23: popq	%rbx retq .Lfunc_end0:

With this PR:

foo3r: testrcx, rcx je.LBB3_1 lear8, [rcx - 1] leardx, [rcx - 2] movrax, r8 mulrdx shldrdx, rax, 63 imulr8, r8 addr8, rcx subr8, rdx imulr8, rcx movrax, r8 ret .LBB3_1: xorr8d, r8d movrax, r8 ret
Because the last item needs special handling, it seems that LLVM has trouble canonicalizing the loops in external iteration. With the override, it becomes obvious that the start==end case exits the loop (as opposed to the one *after* that exiting the loop in external iteration).
@rust-highfive
Copy link
Contributor

r? @alexcrichton

(rust_highfive has picked a reviewer for you, use r? to override)

@pietroalbini pietroalbini added the S-waiting-on-review Status: Awaiting review from the assignee but also interested parties. label Feb 5, 2018
@alexcrichton
Copy link
Member

@bors: r+

Thanks!

@bors
Copy link
Collaborator

bors commented Feb 5, 2018

📌 Commit 1b1e887 has been approved by alexcrichton

kennytm added a commit to kennytm/rust that referenced this pull request Feb 6, 2018
…, r=alexcrichton Override try_[r]fold for RangeInclusive Because the last item needs special handling, it seems that LLVM has trouble canonicalizing the loops in external iteration. With the override, it becomes obvious that the start==end case exits the loop (as opposed to the one *after* that exiting the loop in external iteration). Demo adapted from rust-lang#45222 ```rust #[no_mangle] pub fn foo3r(n: u64) -> u64 { let mut count = 0; (0..n).for_each(|_| { (0 ..= n).rev().for_each(|j| { count += j; }) }); count } ``` <details> <summary>Current nightly ASM, 100 lines (https://play.rust-lang.org/?gist=f5674c702c6e2045c3aab5d03763e5f6&version=nightly&mode=release)</summary> ```asm foo3r:	pushq	%rbx .Lcfi0: .Lcfi1:	testq	%rdi, %rdi	je	.LBB0_1	testb	$1, %dil	jne	.LBB0_4	xorl	%eax, %eax	xorl	%r8d, %r8d	cmpq	$1, %rdi	jne	.LBB0_11	jmp	.LBB0_23 .LBB0_1:	xorl	%eax, %eax	popq	%rbx	retq .LBB0_4:	xorl	%r8d, %r8d	movq	$-1, %r9	xorl	%eax, %eax	movq	%rdi, %r11	xorl	%r10d, %r10d	jmp	.LBB0_5 .LBB0_8:	addq	%r11, %rax	movq	%rsi, %r11	movq	%rdx, %r10 .LBB0_5:	cmpq	%r11, %r10	movl	$1, %ecx	cmovbq	%r9, %rcx	cmoveq	%r8, %rcx	testq	%rcx, %rcx	movl	$0, %esi	movl	$1, %edx	je	.LBB0_8	cmpq	$-1, %rcx	jne	.LBB0_9	leaq	-1(%r11), %rsi	movq	%r10, %rdx	jmp	.LBB0_8 .LBB0_9:	movl	$1, %r8d	cmpq	$1, %rdi	je	.LBB0_23 .LBB0_11:	xorl	%r9d, %r9d	movq	$-1, %r10 .LBB0_12:	movq	%rdi, %rsi	xorl	%r11d, %r11d	jmp	.LBB0_13 .LBB0_16:	addq	%rsi, %rax	movq	%rcx, %rsi	movq	%rbx, %r11 .LBB0_13:	cmpq	%rsi, %r11	movl	$1, %edx	cmovbq	%r10, %rdx	cmoveq	%r9, %rdx	testq	%rdx, %rdx	movl	$0, %ecx	movl	$1, %ebx	je	.LBB0_16	cmpq	$-1, %rdx	jne	.LBB0_17	leaq	-1(%rsi), %rcx	movq	%r11, %rbx	jmp	.LBB0_16 .LBB0_17:	movq	%rdi, %rcx	xorl	%r11d, %r11d	jmp	.LBB0_18 .LBB0_21:	addq	%rcx, %rax	movq	%rsi, %rcx	movq	%rbx, %r11 .LBB0_18:	cmpq	%rcx, %r11	movl	$1, %edx	cmovbq	%r10, %rdx	cmoveq	%r9, %rdx	testq	%rdx, %rdx	movl	$0, %esi	movl	$1, %ebx	je	.LBB0_21	cmpq	$-1, %rdx	jne	.LBB0_22	leaq	-1(%rcx), %rsi	movq	%r11, %rbx	jmp	.LBB0_21 .LBB0_22:	addq	$2, %r8	cmpq	%rdi, %r8	jne	.LBB0_12 .LBB0_23:	popq	%rbx	retq .Lfunc_end0: ``` </details><br> With this PR: ```asm foo3r:	test	rcx, rcx	je	.LBB3_1	lea	r8, [rcx - 1]	lea	rdx, [rcx - 2]	mov	rax, r8	mul	rdx	shld	rdx, rax, 63	imul	r8, r8	add	r8, rcx	sub	r8, rdx	imul	r8, rcx	mov	rax, r8	ret .LBB3_1:	xor	r8d, r8d	mov	rax, r8	ret ```
bors added a commit that referenced this pull request Feb 6, 2018
Rollup of 7 pull requests - Successful merges: #46962, #47986, #48012, #48013, #48026, #48031, #48036 - Failed merges:
@bors bors merged commit 1b1e887 into rust-lang:master Feb 6, 2018
@scottmcm scottmcm deleted the faster-rangeinclusive-fold branch February 6, 2018 22:52
kennytm added a commit to kennytm/rust that referenced this pull request Jul 12, 2018
…, r=SimonSapin Change RangeInclusive to a three-field struct. Fix rust-lang#45222. This PR also reverts rust-lang#48012 (i.e. removed the `try_fold`/`try_rfold` specialization for `RangeInclusive`) because LLVM no longer has trouble recognizing a RangeInclusive loop.
bors added a commit that referenced this pull request Jul 13, 2018
Change RangeInclusive to a three-field struct. Fix #45222. This PR also reverts #48012 (i.e. removed the `try_fold`/`try_rfold` specialization for `RangeInclusive`) because LLVM no longer has trouble recognizing a RangeInclusive loop.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

S-waiting-on-review Status: Awaiting review from the assignee but also interested parties.

5 participants