Skip to content

Missed optimization opportunity: flatten branches with function call into one function call that uses CMOV #167256

@inicula

Description

@inicula

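Ideally, Clang should recognize that both branches of foo_v1() call the same inlined function and differ only in the last argument, and should merge them into a single call whose argument is selected with a conditional move, as foo_v2() does by hand. Code: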

// Reproduction case: foo_v1 and foo_v2 compute the same thing; they differ only
// in whether the choice of slot index is written as two calls (if/else) or as a
// single call with a conditional argument.

// NOTE(review): nothing from <cstdint> is used by name below; presumably the
// include is what brings the non-standard __always_inline macro into scope via
// the C library's headers — confirm before removing it.
#include <cstdint>

// Two parallel 32-element tables, both indexed by the same slot number.
struct Entry {
    int _v[32];
    unsigned char _s[32];
};

// Writes slot `s` of both tables in `e`:
//   _v[s] = (b*3 + s) ^ 0x9e3779b9
//   _s[s] = low 8 bits of (b ^ s)
// `s` is used directly as an index into the 32-element arrays, with no bounds
// check here.
__always_inline void common_part(Entry* e, int b, int s) {
    int t = (b*3 + s) ^ 0x9e3779b9;
    e->_v[s] = t;
    e->_s[s] = (unsigned char)((b ^ s) & 255);
}

// Branchy form: picks the slot with an if/else, with one common_part() call in
// each arm. When `found` is true the slot is (b & 31); otherwise it is the
// count of trailing zero bits in `m`.
// NOTE(review): __builtin_ctz is documented as undefined for an argument of 0;
// this code presumably assumes m != 0 whenever found is false — confirm.
void foo_v1(Entry* e, bool found, unsigned m, int b) {
    int k = found ? (b & 31) : -1;  // -1 is the "not found" sentinel; (b & 31) is never -1
    if (k == -1)
        common_part(e, b, __builtin_ctz(m));
    else
        common_part(e, b, k);
}

// Flattened form: same slot selection as foo_v1, but expressed as a single
// common_part() call whose last argument is chosen by a conditional expression.
void foo_v2(Entry* e, bool found, unsigned m, int b) {
    int k = found ? (b & 31) : -1;  // same sentinel scheme as foo_v1
    common_part(e, b, k == -1 ? __builtin_ctz(m) : k);
}

Assembly (-std=c++23 -O3 -march=znver1):

# Compiler output for the two functions in the C++ snippet, Intel operand order
# (op dst, src). Register roles on entry, as used by the code below:
#   rdi = e, sil = found, edx = m, ecx = b
# Entry::_v starts at offset 0 (4-byte elements); Entry::_s starts at offset 128.

# foo_v1: the if/else is kept as a real branch, and the body of common_part is
# duplicated on each side of it.
foo_v1(Entry*, bool, unsigned int, int):
        lea eax, [rcx+rcx*2]            # eax = b*3 (shared by both paths)
        test sil, sil
        je .L2                          # found == false -> tzcnt path
        # found == true: slot k = b & 31, computed twice (esi and edx)
        mov esi, ecx
        mov edx, ecx
        and esi, 31                     # esi = k, used in the arithmetic
        and edx, 31                     # edx = k, used as the store index (rdx)
        add eax, esi                    # eax = b*3 + k
        xor ecx, esi                    # ecx = b ^ k
        xor eax, -1640531527            # eax ^= 0x9e3779b9 (same bits, printed as signed)
        mov DWORD PTR [rdi+rdx*4], eax  # e->_v[k] = t
        mov BYTE PTR [rdi+128+rdx], cl  # e->_s[k] = low byte of (b ^ k)
        ret
.L2:
        # found == false: slot s = trailing-zero count of m
        tzcnt esi, edx                  # esi = s; 32-bit write zero-extends into rsi
        add eax, esi                    # eax = b*3 + s
        xor ecx, esi                    # ecx = b ^ s
        xor eax, -1640531527            # eax ^= 0x9e3779b9
        mov DWORD PTR [rdi+rsi*4], eax  # e->_v[s] = t
        mov BYTE PTR [rdi+128+rsi], cl  # e->_s[s] = low byte of (b ^ s)
        ret

# foo_v2: both candidate slots are computed unconditionally and cmovne picks
# one, so there is no branch and only one copy of the common_part body.
foo_v2(Entry*, bool, unsigned int, int):
        mov eax, ecx
        tzcnt edx, edx                  # edx = trailing-zero count of m (the !found candidate)
        and eax, 31                     # eax = b & 31 (the found candidate)
        test sil, sil
        cmovne edx, eax                 # found != 0 -> edx = b & 31; else keep tzcnt result
        lea eax, [rcx+rcx*2]            # eax = b*3
        add eax, edx                    # eax = b*3 + slot
        movsx rsi, edx                  # rsi = slot sign-extended to 64 bits for addressing
        xor ecx, edx                    # ecx = b ^ slot
        xor eax, -1640531527            # eax ^= 0x9e3779b9
        mov DWORD PTR [rdi+rsi*4], eax  # e->_v[slot] = t
        mov BYTE PTR [rdi+128+rsi], cl  # e->_s[slot] = low byte of (b ^ slot)
        ret

Godbolt: https://godbolt.org/z/bK13denas

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions