@@ -12,54 +12,45 @@ namespace cp_algo {
1212 using u32x4 = simd<uint32_t , 4 >;
1313 using dx4 = simd<double , 4 >;
1414
// abs: absolute value of a dx4 (simd<double, 4>) argument.
// Removed (-) lines: under __AVX2__ the old body ANDed `a` with a vector built
// as `dx4{} + 1/0.` via _mm256_and_pd, and otherwise fell back to the select.
// Added (+) line: the function becomes [[gnu::always_inline]] inline and only
// the `a < 0 ? -a : a` form remains on every target.
// NOTE(review): presumably dx4 is a compiler vector-extension type, so the
// ternary selects lane by lane -- confirm against the simd<> alias.
15- dx4 abs (dx4 a) {
16- #ifdef __AVX2__
17- return _mm256_and_pd (a, dx4{} + 1 /0 .);
18- #else
15+ [[gnu::always_inline]] inline dx4 abs (dx4 a) {
1916 return a < 0 ? -a : a;
20- #endif
2117 }
2218
// lround: maps a dx4 to an i64x4 using the magic-constant technique from the
// linked StackOverflow answer.
// `magic` is a dx4 with every lane equal to double(3ULL << 51). The result is
// i64x4(x + magic) - i64x4(magic).
// NOTE(review): presumably the i64x4(...) casts reinterpret the lane bits
// rather than convert values, which is what the trick relies on -- confirm
// against the simd<> alias definition.
// The diff only changes the signature: [[gnu::always_inline]] inline is added.
23- i64x4 lround (dx4 x) {
19+ [[gnu::always_inline]] inline i64x4 lround (dx4 x) {
2420 // https://stackoverflow.com/a/77376595
2521 static constexpr dx4 magic = dx4 () + double (3ULL << 51 );
2622 return i64x4 (x + magic) - i64x4 (magic);
2723 }
2824
// round: rounds a dx4 and returns the result as a dx4.
// With __AVX2__ defined it calls _mm256_round_pd with
// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; otherwise it converts the
// i64x4 result of lround(a) back to dx4 with __builtin_convertvector.
// The diff only changes the signature: [[gnu::always_inline]] inline is added.
29- dx4 round (dx4 a) {
25+ [[gnu::always_inline]] inline dx4 round (dx4 a) {
3026#ifdef __AVX2__
3127 return _mm256_round_pd (a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
3228#else
3329 return __builtin_convertvector (lround (a), dx4);
3430#endif
3531 }
3632
// montgomery_reduce: computes (x + x_ninv * mod) >> 32 on u64x4 lanes, where
// x_ninv = u64x4(u32x8(x) * u32x8(imod)), i.e. the x-by-imod product is formed
// on the u32x8 view of both operands.
// Removed (-) lines: the intrinsic path (_mm256_mul_epu32 / _mm256_add_epi64 /
// _mm256_bsrli_epi128 by 4) was guarded by `#ifndef __AVX2__`, so it was
// selected when __AVX2__ was NOT defined; the generic path sat in the #else.
// Added (+) lines: x_ninv is computed once for both paths, the guard becomes
// `#ifdef __AVX2__`, only the x_ninv * mod product uses _mm256_mul_epu32, and
// both paths share the final `u64x4(x_res) >> 32`.
// NOTE(review): _mm256_mul_epu32 multiplies the low 32 bits of each 64-bit
// lane (per the Intel intrinsics reference) -- confirm the two paths are
// intended to agree for the value ranges callers pass.
37- u64x4 montgomery_reduce (u64x4 x, u64x4 mod, u64x4 imod) {
38- #ifndef __AVX2__
39- auto x_ninv = _mm256_mul_epu32 (__m256i (x), __m256i (imod));
40- auto x_res = _mm256_add_epi64 (__m256i (x), _mm256_mul_epu32 (x_ninv, __m256i (mod)));
41- return u64x4 (_mm256_bsrli_epi128 (x_res, 4 ));
42- #else
43-
33+ [[gnu::always_inline]] inline u64x4 montgomery_reduce (u64x4 x, u64x4 mod, u64x4 imod) {
4434 auto x_ninv = u64x4 (u32x8 (x) * u32x8 (imod));
45- return (x + x_ninv * mod) >> 32 ;
35+ #ifdef __AVX2__
36+ auto x_res = __m256i (x) + _mm256_mul_epu32 (__m256i (x_ninv), __m256i (mod));
37+ #else
38+ auto x_res = x + x_ninv * mod;
4639#endif
40+ return u64x4 (x_res) >> 32 ;
4741 }
4842
// montgomery_mul: multiplies x by y and passes the product, together with mod
// and imod, to montgomery_reduce.
// With __AVX2__ defined the product comes from _mm256_mul_epu32 on the
// __m256i views of x and y; otherwise it is the plain vector product x * y.
// The diff only changes the signature: [[gnu::always_inline]] inline is added.
49- u64x4 montgomery_mul (u64x4 x, u64x4 y, u64x4 mod, u64x4 imod) {
43+ [[gnu::always_inline]] inline u64x4 montgomery_mul (u64x4 x, u64x4 y, u64x4 mod, u64x4 imod) {
5044#ifdef __AVX2__
5145 return montgomery_reduce (u64x4 (_mm256_mul_epu32 (__m256i (x), __m256i (y))), mod, imod);
5246#else
5347 return montgomery_reduce (x * y, mod, imod);
5448#endif
5549 }
5650
// rotate_right: permutes the four lanes of x with the index list {3, 0, 1, 2},
// so output lane 0 takes input lane 3 and output lanes 1..3 take input lanes
// 0..2.
// Removed (-) lines: _mm256_permute4x64_pd with _MM_SHUFFLE(2, 1, 0, 3) under
// __AVX2__, and __builtin_shufflevector(x, x, 3, 0, 1, 2) otherwise.
// Added (+) lines: a single path that calls __builtin_shuffle with a constant
// u64x4 index vector.
// NOTE(review): __builtin_shuffle is a GCC builtin -- confirm that every
// compiler this header must support provides it.
57- dx4 rotate_right (dx4 x) {
58- #ifdef __AVX2__
59- return _mm256_permute4x64_pd (x, _MM_SHUFFLE (2 , 1 , 0 , 3 ));
60- #else
61- return __builtin_shufflevector (x, x, 3 , 0 , 1 , 2 );
62- #endif
51+ [[gnu::always_inline]] inline dx4 rotate_right (dx4 x) {
52+ static constexpr u64x4 shuffler = {3 , 0 , 1 , 2 };
53+ return __builtin_shuffle (x, shuffler);
6354 }
6455}
6556#endif // CP_ALGO_UTIL_SIMD_HPP
0 commit comments