
Commit f4a0c08

Merge pull request #178 from p12tic/fix-gcc-arm-neon-regression

insn: Work around GCC regression when generating comparisons on ARM

2 parents: e4ef7fa + 6812070

4 files changed: 44 additions, 0 deletions

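All four hunks apply the same workaround: when building 32-bit ARM code with GCC 11 or newer (but not clang), the NEON comparison is emitted directly through inline assembly instead of the intrinsic, which those GCC versions lower to scalar code. A minimal standalone sketch of the pattern, not the library code itself (the plain __arm__ check stands in for the library's SIMDPP_32_BITS condition, and the function name is illustrative):

#include <arm_neon.h>

// Sketch of the workaround pattern used in this commit. The __arm__ check is an
// illustrative stand-in for the library's SIMDPP_32_BITS condition.
static inline float32x4_t cmp_ge_f32_workaround(float32x4_t a, float32x4_t b)
{
#if defined(__arm__) && defined(__GNUC__) && (__GNUC__ >= 11) && !defined(__clang__)
    // Force a single vcge.f32 on the full Q registers via inline asm.
    float32x4_t r;
    asm("vcge.f32 %q0, %q1, %q2" : "=w"(r) : "w"(a), "w"(b));
    return r;
#else
    // Elsewhere the intrinsic already compiles to one vector comparison.
    return vreinterpretq_f32_u32(vcgeq_f32(a, b));
#endif
}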

simdpp/detail/insn/cmp_ge.h

Lines changed: 11 additions & 0 deletions
@@ -331,7 +331,18 @@ mask_float32<4> i_cmp_ge(const float32<4>& a, const float32<4>& b)
 #elif SIMDPP_USE_SSE2
     return _mm_cmpge_ps(a.native(), b.native());
 #elif SIMDPP_USE_NEON
+#if SIMDPP_32_BITS && defined(__GNUC__) && (__GNUC__ >= 11) && !defined(__clang__)
+    // BUG: Starting with GCC 11 and up until at least GCC 14, floating-point comparison
+    // intrinsics get broken down to scalar code on armv7. https://godbolt.org/z/MbnMhzrTT
+    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115451
+    float32x4_t va = a.native();
+    float32x4_t vb = b.native();
+    float32x4_t vr;
+    asm("vcge.f32 %q0, %q1, %q2" : "=w"(vr) : "w"(va), "w"(vb));
+    return vr;
+#else
     return vreinterpretq_f32_u32(vcgeq_f32(a.native(), b.native()));
+#endif
 #elif SIMDPP_USE_ALTIVEC
     return vec_cmpge(a.native(), b.native());
 #elif SIMDPP_USE_MSA

simdpp/detail/insn/cmp_gt.h

Lines changed: 11 additions & 0 deletions
@@ -394,7 +394,18 @@ mask_float32x4 i_cmp_gt(const float32x4& a, const float32x4& b)
 #elif SIMDPP_USE_SSE2
     return _mm_cmpgt_ps(a.native(), b.native());
 #elif SIMDPP_USE_NEON
+#if SIMDPP_32_BITS && defined(__GNUC__) && (__GNUC__ >= 11) && !defined(__clang__)
+    // BUG: Starting with GCC 11 and up until at least GCC 14, floating-point comparison
+    // intrinsics get broken down to scalar code on armv7. https://godbolt.org/z/MbnMhzrTT
+    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115451
+    float32x4_t va = a.native();
+    float32x4_t vb = b.native();
+    float32x4_t vr;
+    asm("vcgt.f32 %q0, %q1, %q2" : "=w"(vr) : "w"(va), "w"(vb));
+    return vr;
+#else
     return vreinterpretq_f32_u32(vcgtq_f32(a.native(), b.native()));
+#endif
 #elif SIMDPP_USE_ALTIVEC
     return vec_cmpgt(a.native(), b.native());
 #elif SIMDPP_USE_MSA

simdpp/detail/insn/cmp_le.h

Lines changed: 11 additions & 0 deletions
@@ -330,7 +330,18 @@ mask_float32<4> i_cmp_le(const float32<4>& a, const float32<4>& b)
 #elif SIMDPP_USE_SSE2
     return _mm_cmple_ps(a.native(), b.native());
 #elif SIMDPP_USE_NEON
+#if SIMDPP_32_BITS && defined(__GNUC__) && (__GNUC__ >= 11) && !defined(__clang__)
+    // BUG: Starting with GCC 11 and up until at least GCC 14, floating-point comparison
+    // intrinsics get broken down to scalar code on armv7. https://godbolt.org/z/MbnMhzrTT
+    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115451
+    float32x4_t va = a.native();
+    float32x4_t vb = b.native();
+    float32x4_t vr;
+    asm("vcge.f32 %q0, %q1, %q2" : "=w"(vr) : "w"(vb), "w"(va));
+    return vr;
+#else
     return vreinterpretq_f32_u32(vcleq_f32(a.native(), b.native()));
+#endif
 #elif SIMDPP_USE_ALTIVEC
     return vec_cmple(a.native(), b.native());
 #elif SIMDPP_USE_MSA
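Note the operand order in the cmp_le.h hunk above: the patch reuses the same greater-or-equal instruction with the operands swapped, computing a <= b as b >= a (cmp_lt.h below does the same with vcgt.f32). A tiny illustration, continuing the earlier sketch (same <arm_neon.h> include; the function name is hypothetical and would sit under the same compiler-version guard):

// Illustration only: less-or-equal via the same vcge.f32, operands reversed.
static inline float32x4_t cmp_le_f32_workaround(float32x4_t a, float32x4_t b)
{
    float32x4_t r;
    // a <= b  <=>  b >= a, so b feeds the first source operand.
    asm("vcge.f32 %q0, %q1, %q2" : "=w"(r) : "w"(b), "w"(a));
    return r;
}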

simdpp/detail/insn/cmp_lt.h

Lines changed: 11 additions & 0 deletions
@@ -390,7 +390,18 @@ mask_float32x4 i_cmp_lt(const float32x4& a, const float32x4& b)
 #elif SIMDPP_USE_SSE2
     return _mm_cmplt_ps(a.native(), b.native());
 #elif SIMDPP_USE_NEON
+#if SIMDPP_32_BITS && defined(__GNUC__) && (__GNUC__ >= 11) && !defined(__clang__)
+    // BUG: Starting with GCC 11 and up until at least GCC 14, floating-point comparison
+    // intrinsics get broken down to scalar code on armv7. https://godbolt.org/z/MbnMhzrTT
+    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115451
+    float32x4_t va = a.native();
+    float32x4_t vb = b.native();
+    float32x4_t vr;
+    asm("vcgt.f32 %q0, %q1, %q2" : "=w"(vr) : "w"(vb), "w"(va));
+    return vr;
+#else
     return vreinterpretq_f32_u32(vcltq_f32(a.native(), b.native()));
+#endif
 #elif SIMDPP_USE_ALTIVEC
     return vec_cmplt(a.native(), b.native());
 #elif SIMDPP_USE_MSA
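These i_cmp_* functions live in the library's detail::insn layer; user code reaches them through the public comparison wrappers. A hedged usage sketch, assuming the usual top-level <simdpp/simd.h> header and a build configured for ARM NEON (for example via the library's SIMDPP_ARCH_* options):

#include <simdpp/simd.h>

// Sketch only: the public cmp_ge wrapper dispatches to the i_cmp_ge
// implementation patched in this commit when SIMDPP_USE_NEON is active.
simdpp::mask_float32<4> ge_mask(const simdpp::float32<4>& a,
                                const simdpp::float32<4>& b)
{
    // On 32-bit ARM with GCC 11+, this should now compile to a single vcge.f32.
    return simdpp::cmp_ge(a, b);
}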
