
Commit f4a0c08

Merge pull request #178 from p12tic/fix-gcc-arm-neon-regression

insn: Work around GCC regression when generating comparisons on ARM

2 parents: e4ef7fa + 6812070

4 files changed: 44 additions, 0 deletions

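All four hunks apply the same workaround: when building 32-bit ARM code with GCC 11 or newer (but not clang), the NEON comparison is emitted directly through inline assembly instead of the intrinsic, which those GCC versions lower to scalar code. A minimal standalone sketch of the pattern, not the library code itself (the plain __arm__ check stands in for the library's SIMDPP_32_BITS condition, and the function name is illustrative):

#include <arm_neon.h>

// Sketch of the workaround pattern used in this commit. The __arm__ check is an
// illustrative stand-in for the library's SIMDPP_32_BITS condition.
static inline float32x4_t cmp_ge_f32_workaround(float32x4_t a, float32x4_t b)
{
#if defined(__arm__) && defined(__GNUC__) && (__GNUC__ >= 11) && !defined(__clang__)
    // Force a single vcge.f32 on the full Q registers via inline asm.
    float32x4_t r;
    asm("vcge.f32 %q0, %q1, %q2" : "=w"(r) : "w"(a), "w"(b));
    return r;
#else
    // Elsewhere the intrinsic already compiles to one vector comparison.
    return vreinterpretq_f32_u32(vcgeq_f32(a, b));
#endif
}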

simdpp/detail/insn/cmp_ge.h

Lines changed: 11 additions & 0 deletions
@@ -331,7 +331,18 @@ mask_float32<4> i_cmp_ge(const float32<4>& a, const float32<4>& b)
 #elif SIMDPP_USE_SSE2
     return _mm_cmpge_ps(a.native(), b.native());
 #elif SIMDPP_USE_NEON
+#if SIMDPP_32_BITS && defined(__GNUC__) && (__GNUC__ >= 11) && !defined(__clang__)
+    // BUG: Starting with GCC 11 and up until at least GCC 14, floating-point comparison
+    // intrinsics get broken down to scalar code on armv7. https://godbolt.org/z/MbnMhzrTT
+    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115451
+    float32x4_t va = a.native();
+    float32x4_t vb = b.native();
+    float32x4_t vr;
+    asm("vcge.f32 %q0, %q1, %q2" : "=w"(vr) : "w"(va), "w"(vb));
+    return vr;
+#else
     return vreinterpretq_f32_u32(vcgeq_f32(a.native(), b.native()));
+#endif
 #elif SIMDPP_USE_ALTIVEC
     return vec_cmpge(a.native(), b.native());
 #elif SIMDPP_USE_MSA

simdpp/detail/insn/cmp_gt.h

Lines changed: 11 additions & 0 deletions
@@ -394,7 +394,18 @@ mask_float32x4 i_cmp_gt(const float32x4& a, const float32x4& b)
 #elif SIMDPP_USE_SSE2
     return _mm_cmpgt_ps(a.native(), b.native());
 #elif SIMDPP_USE_NEON
+#if SIMDPP_32_BITS && defined(__GNUC__) && (__GNUC__ >= 11) && !defined(__clang__)
+    // BUG: Starting with GCC 11 and up until at least GCC 14, floating-point comparison
+    // intrinsics get broken down to scalar code on armv7. https://godbolt.org/z/MbnMhzrTT
+    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115451
+    float32x4_t va = a.native();
+    float32x4_t vb = b.native();
+    float32x4_t vr;
+    asm("vcgt.f32 %q0, %q1, %q2" : "=w"(vr) : "w"(va), "w"(vb));
+    return vr;
+#else
     return vreinterpretq_f32_u32(vcgtq_f32(a.native(), b.native()));
+#endif
 #elif SIMDPP_USE_ALTIVEC
     return vec_cmpgt(a.native(), b.native());
 #elif SIMDPP_USE_MSA

simdpp/detail/insn/cmp_le.h

Lines changed: 11 additions & 0 deletions
@@ -330,7 +330,18 @@ mask_float32<4> i_cmp_le(const float32<4>& a, const float32<4>& b)
 #elif SIMDPP_USE_SSE2
     return _mm_cmple_ps(a.native(), b.native());
 #elif SIMDPP_USE_NEON
+#if SIMDPP_32_BITS && defined(__GNUC__) && (__GNUC__ >= 11) && !defined(__clang__)
+    // BUG: Starting with GCC 11 and up until at least GCC 14, floating-point comparison
+    // intrinsics get broken down to scalar code on armv7. https://godbolt.org/z/MbnMhzrTT
+    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115451
+    float32x4_t va = a.native();
+    float32x4_t vb = b.native();
+    float32x4_t vr;
+    asm("vcge.f32 %q0, %q1, %q2" : "=w"(vr) : "w"(vb), "w"(va));
+    return vr;
+#else
     return vreinterpretq_f32_u32(vcleq_f32(a.native(), b.native()));
+#endif
 #elif SIMDPP_USE_ALTIVEC
     return vec_cmple(a.native(), b.native());
 #elif SIMDPP_USE_MSA
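Note the operand order in the cmp_le.h hunk above: the patch reuses the same greater-or-equal instruction with the operands swapped, computing a <= b as b >= a (cmp_lt.h below does the same with vcgt.f32). A tiny illustration, continuing the earlier sketch (same <arm_neon.h> include; the function name is hypothetical and would sit under the same compiler-version guard):

// Illustration only: less-or-equal via the same vcge.f32, operands reversed.
static inline float32x4_t cmp_le_f32_workaround(float32x4_t a, float32x4_t b)
{
    float32x4_t r;
    // a <= b  <=>  b >= a, so b feeds the first source operand.
    asm("vcge.f32 %q0, %q1, %q2" : "=w"(r) : "w"(b), "w"(a));
    return r;
}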

simdpp/detail/insn/cmp_lt.h

Lines changed: 11 additions & 0 deletions
@@ -390,7 +390,18 @@ mask_float32x4 i_cmp_lt(const float32x4& a, const float32x4& b)
 #elif SIMDPP_USE_SSE2
     return _mm_cmplt_ps(a.native(), b.native());
 #elif SIMDPP_USE_NEON
+#if SIMDPP_32_BITS && defined(__GNUC__) && (__GNUC__ >= 11) && !defined(__clang__)
+    // BUG: Starting with GCC 11 and up until at least GCC 14, floating-point comparison
+    // intrinsics get broken down to scalar code on armv7. https://godbolt.org/z/MbnMhzrTT
+    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115451
+    float32x4_t va = a.native();
+    float32x4_t vb = b.native();
+    float32x4_t vr;
+    asm("vcgt.f32 %q0, %q1, %q2" : "=w"(vr) : "w"(vb), "w"(va));
+    return vr;
+#else
     return vreinterpretq_f32_u32(vcltq_f32(a.native(), b.native()));
+#endif
 #elif SIMDPP_USE_ALTIVEC
     return vec_cmplt(a.native(), b.native());
 #elif SIMDPP_USE_MSA
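These i_cmp_* functions live in the library's detail::insn layer; user code reaches them through the public comparison wrappers. A hedged usage sketch, assuming the usual top-level <simdpp/simd.h> header and a build configured for ARM NEON (for example via the library's SIMDPP_ARCH_* options):

#include <simdpp/simd.h>

// Sketch only: the public cmp_ge wrapper dispatches to the i_cmp_ge
// implementation patched in this commit when SIMDPP_USE_NEON is active.
simdpp::mask_float32<4> ge_mask(const simdpp::float32<4>& a,
                                const simdpp::float32<4>& b)
{
    // On 32-bit ARM with GCC 11+, this should now compile to a single vcge.f32.
    return simdpp::cmp_ge(a, b);
}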
