Skip to content

[x86] Missed optimization for AVX2 truncating cast from v8i32 to v8i8 #167138

@okaneco

Description

@okaneco

(The original code is from a Rust JPEG encoder where we wanted to push every third byte from an __m256i into a buffer.)

When an __m256i is interpreted as int32x8 and truncated to uint8x8, the generated code does not shuffle the bytes of the high 128-bit lane before combining them with the low lane; it extracts and re-inserts those bytes one at a time instead. I don't know what the best instruction sequence would be for this shuffle.

https://godbolt.org/z/oj9acsErE
alive2 proof - https://alive2.llvm.org/ce/z/WzN-QY

#include <immintrin.h> #include <stdint.h> void src(__m256i data, uint8_t* out) { for (int i = 0; i < 8; i++, out++) { *out = (uint8_t)((int32_t*)&data)[i]; } return; } void tgt(__m256i data, uint8_t* out) { // Shuffle mask = [0, 4, 8, 12] __m256i A = _mm256_shuffle_epi8(data, _mm256_set1_epi32(0x0c080400)); // Shuffle lowest int32 from hi 128-bits into second int32 lane __m256i B = _mm256_permutevar8x32_epi32(A, _mm256_setr_epi32(0, 4, 0, 0, 0, 0, 0, 0)); for (int i = 0; i < 8; i++, out++) { *out = ((uint8_t*)&B)[i]; } return; }
# Constant pool used by src: 16-byte vpshufb control. The first four bytes
# are 0, 4, 8, 12; the remaining twelve bytes are zero.
.LCPI0_0:
  .byte 0
  .byte 4
  .byte 8
  .byte 12
  .zero 1
  .zero 1
  .zero 1
  .zero 1
  .zero 1
  .zero 1
  .zero 1
  .zero 1
  .zero 1
  .zero 1
  .zero 1
  .zero 1
# Compiler output for src. In this listing ymm0 is the input vector and rdi
# is the store address. Only the low half is shuffled; three bytes of the
# high half are moved one at a time through general-purpose registers.
src:
  vextracti128 xmm1, ymm0, 1                          # xmm1 = upper 128 bits of ymm0
  vpextrb eax, xmm1, 4                                # eax = byte 4 of the upper half
  vpextrb ecx, xmm1, 8                                # ecx = byte 8 of the upper half
  vpextrb edx, xmm1, 12                               # edx = byte 12 of the upper half
  vpshufb xmm0, xmm0, xmmword ptr [rip + .LCPI0_0]    # bytes 0, 4, 8, 12 of the lower half -> bytes 0-3
  vpunpckldq xmm0, xmm0, xmm1                         # interleave low dwords; byte 4 is now upper-half byte 0
  vpinsrb xmm0, xmm0, eax, 5                          # overwrite byte 5
  vpinsrb xmm0, xmm0, ecx, 6                          # overwrite byte 6
  vpinsrb xmm0, xmm0, edx, 7                          # overwrite byte 7
  vmovq qword ptr [rdi], xmm0                         # store the low 8 bytes
  vzeroupper
  ret
# Constant pool used by tgt: 32-byte vpshufb control; the 16-byte pattern
# 0, 4, 8, 12 followed by twelve zero bytes appears twice.
.LCPI1_0:
  .byte 0
  .byte 4
  .byte 8
  .byte 12
  .zero 1
  .zero 1
  .zero 1
  .zero 1
  .zero 1
  .zero 1
  .zero 1
  .zero 1
  .zero 1
  .zero 1
  .zero 1
  .zero 1
  .byte 0
  .byte 4
  .byte 8
  .byte 12
  .zero 1
  .zero 1
  .zero 1
  .zero 1
  .zero 1
  .zero 1
  .zero 1
  .zero 1
  .zero 1
  .zero 1
  .zero 1
  .zero 1
# Compiler output for tgt: one 256-bit byte shuffle handles both halves,
# then the two low dwords are combined and stored.
tgt:
  vpshufb ymm0, ymm0, ymmword ptr [rip + .LCPI1_0]    # per half: bytes 0, 4, 8, 12 -> bytes 0-3
  vextracti128 xmm1, ymm0, 1                          # xmm1 = shuffled upper half
  vpunpckldq xmm0, xmm0, xmm1                         # dword 0 = lower result, dword 1 = upper result
  vmovq qword ptr [rdi], xmm0                         # store the low 8 bytes
  vzeroupper
  ret
LLVM IR
define void @src(<4 x i64> noundef %data, ptr nocapture noread noundef initializes((0, 8)) %out) {
entry:
  %#0 = bitcast <4 x i64> noundef %data to <32 x i8>
  %#1 = bitcast <4 x i64> noundef %data to <32 x i8>
  %#2 = bitcast <4 x i64> noundef %data to <32 x i8>
  %conv.2 = extractelement <32 x i8> %#2, i64 8
  %#3 = bitcast <4 x i64> noundef %data to <32 x i8>
  %conv.3 = extractelement <32 x i8> %#3, i64 12
  %#4 = bitcast <4 x i64> noundef %data to <32 x i8>
  %conv.4 = extractelement <32 x i8> %#4, i64 16
  %#5 = bitcast <4 x i64> noundef %data to <32 x i8>
  %conv.5 = extractelement <32 x i8> %#5, i64 20
  %#6 = bitcast <4 x i64> noundef %data to <32 x i8>
  %conv.6 = extractelement <32 x i8> %#6, i64 24
  %#7 = bitcast <4 x i64> noundef %data to <32 x i8>
  %conv.7 = extractelement <32 x i8> %#7, i64 28
  %#8 = shufflevector <32 x i8> %#0, <32 x i8> %#1, 0, 36, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295
  %#9 = insertelement <8 x i8> %#8, i8 %conv.2, i64 2
  %#10 = insertelement <8 x i8> %#9, i8 %conv.3, i64 3
  %#11 = insertelement <8 x i8> %#10, i8 %conv.4, i64 4
  %#12 = insertelement <8 x i8> %#11, i8 %conv.5, i64 5
  %#13 = insertelement <8 x i8> %#12, i8 %conv.6, i64 6
  %#14 = insertelement <8 x i8> %#13, i8 %conv.7, i64 7
  store <8 x i8> %#14, ptr nocapture noread noundef initializes((0, 8)) %out, align 1
  ret void
}
=>
define void @tgt(<4 x i64> noundef %data, ptr nocapture noread noundef initializes((0, 8)) %out) {
entry:
  %#0 = bitcast <4 x i64> noundef %data to <32 x i8>
  %#1 = shufflevector <32 x i8> %#0, <32 x i8> poison, 0, 4, 8, 12, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 16, 20, 24, 28, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295
  %#2 = bitcast <32 x i8> %#1 to <8 x i32>
  %B.sroa.0.0.vec.extract = shufflevector <8 x i32> %#2, <8 x i32> poison, 0, 4
  store <2 x i32> %B.sroa.0.0.vec.extract, ptr nocapture noread noundef initializes((0, 8)) %out, align 1
  ret void
}
Transformation seems to be correct!

When the loop is written as follows, the shuffle is recognized in C (although this does not help in Rust).

/*
 * Variant of the reproducer that indexes `data` as bytes instead of as
 * int32_t elements.
 *
 * Copies bytes 0, 4, 8, ..., 28 of `data` (every fourth byte, eight in
 * total) to consecutive locations starting at `out`.
 *
 * data: 256-bit vector passed by value.
 * out:  destination; exactly 8 bytes are written.
 */
void src8(__m256i data, uint8_t* out) {
    /* i is the byte offset into data; it is advanced by 4 inside the body. */
    for (int i = 0; i < 32; out++) {
        *out = ((uint8_t*)&data)[i];
        i += 4;
    }
    return;
}
# Constant pool used by src8: the first four bytes are 0, 4, 8, 12 and the
# rest are zero. Only the first dword is loaded (see vmovd below).
.LCPI0_1:
  .byte 0
  .byte 4
  .byte 8
  .byte 12
  .byte 0
  .byte 0
  .byte 0
  .byte 0
  .byte 0
  .byte 0
  .byte 0
  .byte 0
  .byte 0
  .byte 0
  .byte 0
  .byte 0
# Compiler output for src8. In this listing ymm0 is the input vector and rdi
# is the store address. Both 128-bit halves are shuffled with the same
# control, with no per-byte extract/insert sequence.
src8:
  vextracti128 xmm1, ymm0, 1                  # xmm1 = upper 128 bits of ymm0
  vmovd xmm2, dword ptr [rip + .LCPI0_1]      # xmm2 = shuffle control (low dword 0, 4, 8, 12)
  vpshufb xmm1, xmm1, xmm2                    # upper half: bytes 0, 4, 8, 12 -> bytes 0-3
  vpshufb xmm0, xmm0, xmm2                    # lower half: bytes 0, 4, 8, 12 -> bytes 0-3
  vpunpckldq xmm0, xmm0, xmm1                 # dword 0 = lower result, dword 1 = upper result
  vmovq qword ptr [rdi], xmm0                 # store the low 8 bytes
  vzeroupper
  ret
; Optimized IR for src8: the byte loop has become a single shufflevector
; followed by one 8-byte vector store.
define dso_local void @src8(<4 x i64> noundef %data, ptr noundef writeonly captures(none) initializes((0, 8)) %out) local_unnamed_addr {
entry:
  ; Reinterpret the 256-bit argument as 32 bytes.
  %0 = bitcast <4 x i64> %data to <32 x i8>
  ; Select bytes 0, 4, 8, ..., 28 into an 8-byte vector.
  %1 = shufflevector <32 x i8> %0, <32 x i8> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  ; Store the 8 selected bytes to %out with byte alignment.
  store <8 x i8> %1, ptr %out, align 1
  ret void
}

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions