Description
| | |
|---|---|
| Bugzilla Link | 33830 |
| Resolution | FIXED |
| Resolved on | Jul 31, 2017 12:38 |
| Version | 4.0 |
| OS | All |
| Blocks | #33196 |
| Reporter | LLVM Bugzilla Contributor |
| CC | @topperc, @Delena, @filcab, @zmodem, @RKSimon, @ZviRackover |
Extended Description
The AVX streaming-store intrinsic _mm256_stream_pd is not translated by clang 4.0.1 to VMOVNTPD but to VMOVUPD. This leads to severe performance degradation.
This bug is not present in the official release of clang 3.8.0.
It appears to be related to the changes introduced with __builtin_nontemporal_store.
Streaming stores with the SSE instruction set appear to be unaffected.
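For reference, clang's avxintrin.h has implemented _mm256_stream_pd on top of __builtin_nontemporal_store since around clang 3.9, which is why the regression points in that direction. A minimal sketch of the mapping (simplified, not the verbatim header; my_stream_pd is a hypothetical name used here for illustration):

#include <immintrin.h>

// Sketch of how the intrinsic maps onto the clang-specific builtin:
// the builtin emits an LLVM store tagged with !nontemporal metadata,
// which the x86 backend is expected to select as vmovntpd when the
// destination is 32-byte aligned.
static inline void my_stream_pd(double *p, __m256d v)
{
    __builtin_nontemporal_store(v, (__m256d *)p);
}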
Sample code:
#include <immintrin.h>

int main() {
    int n = 1024;

    // AVX: stream four doubles per store
    double* x = (double*)_mm_malloc( sizeof(double)*n, 32 );
    __m256d a = _mm256_set1_pd(2017.0717);
    for (int i = 0; i < 1024; i+=4) {
        _mm256_stream_pd(x+i, a);
    }

    // SSE: stream two doubles per store
    double* y = (double*)_mm_malloc( sizeof(double)*n, 32 );
    __m128d b = _mm_set1_pd(2017.0717);
    for (int i = 0; i < 1024; i+=4) {
        _mm_stream_pd(y+i, b);
    }

    return 0;
}
clang -S -O3 -march=native -o bug.s bug.c
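To narrow down where the hint is lost, it may help to inspect the LLVM IR first; if the store still carries !nontemporal metadata at that level, the frontend is emitting the right thing and the hint is being dropped in the backend:

clang -S -emit-llvm -O3 -march=native -o bug.ll bug.c

The AVX store in the IR should then look roughly like the following (the value/pointer names and the metadata index !3 are illustrative):

store <4 x double> %broadcast, <4 x double>* %ptr, align 32, !nontemporal !3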
Assembler code for clang 4.0.1 (note that the unrolled AVX loop at LBB0_1 uses plain vmovups instead of vmovntps):
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 12
.section __TEXT,__literal8,8byte_literals
.p2align 3
LCPI0_0:
.quad 4656585990599183486 ## double 2017.0717
.section __TEXT,__literal16,16byte_literals
.p2align 4
LCPI0_1:
.quad 4656585990599183486 ## double 2017.0717
.quad 4656585990599183486 ## double 2017.0717
.section __TEXT,__text,regular,pure_instructions
.globl _main
.p2align 4, 0x90
_main: ## @main
.cfi_startproc
## BB#0:
pushq %rbp
Lcfi0:
.cfi_def_cfa_offset 16
Lcfi1:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Lcfi2:
.cfi_def_cfa_register %rbp
subq $16, %rsp
leaq -8(%rbp), %rdi
movl $32, %esi
movl $8192, %edx ## imm = 0x2000
callq _posix_memalign
xorl %ecx, %ecx
testl %eax, %eax
movq -8(%rbp), %rax
cmovneq %rcx, %rax
vbroadcastsd LCPI0_0(%rip), %ymm0
.p2align 4, 0x90
LBB0_1: ## =>This Inner Loop Header: Depth=1
vmovups %ymm0, (%rax,%rcx,8)
vmovups %ymm0, 32(%rax,%rcx,8)
vmovups %ymm0, 64(%rax,%rcx,8)
vmovups %ymm0, 96(%rax,%rcx,8)
vmovups %ymm0, 128(%rax,%rcx,8)
vmovups %ymm0, 160(%rax,%rcx,8)
vmovups %ymm0, 192(%rax,%rcx,8)
vmovups %ymm0, 224(%rax,%rcx,8)
vmovups %ymm0, 256(%rax,%rcx,8)
vmovups %ymm0, 288(%rax,%rcx,8)
vmovups %ymm0, 320(%rax,%rcx,8)
vmovups %ymm0, 352(%rax,%rcx,8)
vmovups %ymm0, 384(%rax,%rcx,8)
vmovups %ymm0, 416(%rax,%rcx,8)
vmovups %ymm0, 448(%rax,%rcx,8)
vmovups %ymm0, 480(%rax,%rcx,8)
addq $64, %rcx
cmpq $1024, %rcx ## imm = 0x400
jl LBB0_1
## BB#2:
leaq -8(%rbp), %rdi
movl $32, %esi
movl $8192, %edx ## imm = 0x2000
vzeroupper
callq _posix_memalign
xorl %ecx, %ecx
testl %eax, %eax
movq -8(%rbp), %rax
cmovneq %rcx, %rax
vmovaps LCPI0_1(%rip), %xmm0 ## xmm0 = [2.017072e+03,2.017072e+03]
.p2align 4, 0x90
LBB0_3: ## =>This Inner Loop Header: Depth=1
vmovntps %xmm0, (%rax,%rcx,8)
vmovntps %xmm0, 32(%rax,%rcx,8)
vmovntps %xmm0, 64(%rax,%rcx,8)
vmovntps %xmm0, 96(%rax,%rcx,8)
vmovntps %xmm0, 128(%rax,%rcx,8)
vmovntps %xmm0, 160(%rax,%rcx,8)
vmovntps %xmm0, 192(%rax,%rcx,8)
vmovntps %xmm0, 224(%rax,%rcx,8)
vmovntps %xmm0, 256(%rax,%rcx,8)
vmovntps %xmm0, 288(%rax,%rcx,8)
vmovntps %xmm0, 320(%rax,%rcx,8)
vmovntps %xmm0, 352(%rax,%rcx,8)
vmovntps %xmm0, 384(%rax,%rcx,8)
vmovntps %xmm0, 416(%rax,%rcx,8)
vmovntps %xmm0, 448(%rax,%rcx,8)
vmovntps %xmm0, 480(%rax,%rcx,8)
addq $64, %rcx
cmpq $1024, %rcx ## imm = 0x400
jl LBB0_3
## BB#4:
xorl %eax, %eax
addq $16, %rsp
popq %rbp
retq
.cfi_endproc
.subsections_via_symbols
Assembler code for clang 3.8.0 (the same AVX loop correctly uses vmovntps):
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 12
.section __TEXT,__literal8,8byte_literals
.align 3
LCPI0_0:
.quad 4656585990599183486 ## double 2017.0717
.section __TEXT,__literal16,16byte_literals
.align 4
LCPI0_1:
.quad 4656585990599183486 ## double 2017.0717
.quad 4656585990599183486 ## double 2017.0717
.section __TEXT,__text,regular,pure_instructions
.globl _main
.align 4, 0x90
_main: ## @main
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp0:
.cfi_def_cfa_offset 16
Ltmp1:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp2:
.cfi_def_cfa_register %rbp
subq $16, %rsp
leaq -8(%rbp), %rdi
movl $32, %esi
movl $8192, %edx ## imm = 0x2000
callq _posix_memalign
xorl %ecx, %ecx
testl %eax, %eax
movq -8(%rbp), %rax
cmovneq %rcx, %rax
vbroadcastsd LCPI0_0(%rip), %ymm0
.align 4, 0x90
LBB0_1: ## =>This Inner Loop Header: Depth=1
vmovntps %ymm0, (%rax,%rcx,8)
vmovntps %ymm0, 32(%rax,%rcx,8)
vmovntps %ymm0, 64(%rax,%rcx,8)
vmovntps %ymm0, 96(%rax,%rcx,8)
vmovntps %ymm0, 128(%rax,%rcx,8)
vmovntps %ymm0, 160(%rax,%rcx,8)
vmovntps %ymm0, 192(%rax,%rcx,8)
vmovntps %ymm0, 224(%rax,%rcx,8)
vmovntps %ymm0, 256(%rax,%rcx,8)
vmovntps %ymm0, 288(%rax,%rcx,8)
vmovntps %ymm0, 320(%rax,%rcx,8)
vmovntps %ymm0, 352(%rax,%rcx,8)
vmovntps %ymm0, 384(%rax,%rcx,8)
vmovntps %ymm0, 416(%rax,%rcx,8)
vmovntps %ymm0, 448(%rax,%rcx,8)
vmovntps %ymm0, 480(%rax,%rcx,8)
addq $64, %rcx
cmpq $1024, %rcx ## imm = 0x400
jl LBB0_1
## BB#2:
leaq -8(%rbp), %rdi
movl $32, %esi
movl $8192, %edx ## imm = 0x2000
vzeroupper
callq _posix_memalign
xorl %ecx, %ecx
testl %eax, %eax
movq -8(%rbp), %rax
cmovneq %rcx, %rax
vmovaps LCPI0_1(%rip), %xmm0 ## xmm0 = [2.017072e+03,2.017072e+03]
.align 4, 0x90
LBB0_3: ## =>This Inner Loop Header: Depth=1
vmovntps %xmm0, (%rax,%rcx,8)
vmovntps %xmm0, 32(%rax,%rcx,8)
vmovntps %xmm0, 64(%rax,%rcx,8)
vmovntps %xmm0, 96(%rax,%rcx,8)
vmovntps %xmm0, 128(%rax,%rcx,8)
vmovntps %xmm0, 160(%rax,%rcx,8)
vmovntps %xmm0, 192(%rax,%rcx,8)
vmovntps %xmm0, 224(%rax,%rcx,8)
vmovntps %xmm0, 256(%rax,%rcx,8)
vmovntps %xmm0, 288(%rax,%rcx,8)
vmovntps %xmm0, 320(%rax,%rcx,8)
vmovntps %xmm0, 352(%rax,%rcx,8)
vmovntps %xmm0, 384(%rax,%rcx,8)
vmovntps %xmm0, 416(%rax,%rcx,8)
vmovntps %xmm0, 448(%rax,%rcx,8)
vmovntps %xmm0, 480(%rax,%rcx,8)
addq $64, %rcx
cmpq $1024, %rcx ## imm = 0x400
jl LBB0_3
## BB#4:
xorl %eax, %eax
addq $16, %rsp
popq %rbp
retq
.cfi_endproc
.subsections_via_symbols