@@ -1344,6 +1344,52 @@ namespace xsimd
            return first(acc3, A {});
        }

+        // reduce_mul
+        template <class A>
+        XSIMD_INLINE float reduce_mul(batch<float, A> const& self, requires_arch<sse2>) noexcept
+        {
+            __m128 tmp0 = _mm_mul_ps(self, _mm_movehl_ps(self, self));
+            __m128 tmp1 = _mm_mul_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1));
+            return _mm_cvtss_f32(tmp1);
+        }
+
+        template <class A>
+        XSIMD_INLINE double reduce_mul(batch<double, A> const& self, requires_arch<sse2>) noexcept
+        {
+            return _mm_cvtsd_f64(_mm_mul_sd(self, _mm_unpackhi_pd(self, self)));
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                batch<T, A> tmp1 = _mm_shuffle_epi32(self, _MM_SHUFFLE(0, 1, 2, 3));
+                tmp1 = tmp1 * self;
+                batch<T, A> tmp2 = _mm_unpackhi_epi32(tmp1, tmp1);
+                tmp2 = tmp2 * tmp1;
+                return _mm_cvtsi128_si32(tmp2);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                batch<T, A> tmp1 = _mm_unpackhi_epi64(self, self);
+                auto tmp2 = tmp1 * self;
+#if defined(__x86_64__)
+                return _mm_cvtsi128_si64(tmp2);
+#else
+                __m128i m;
+                _mm_storel_epi64(&m, tmp2);
+                int64_t i;
+                std::memcpy(&i, &m, sizeof(i));
+                return i;
+#endif
+            }
+            else
+            {
+                return reduce_mul(self, common {});
+            }
+        }
+
        // rsqrt
        template <class A>
        XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<sse2>) noexcept
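Usage sketch (not part of the diff): a minimal example of the horizontal product these kernels compute, assuming the public entry point xsimd::reduce_mul is wired up elsewhere in this change and dispatches to the SSE2 overloads added above on a 128-bit SSE2 target (4 float lanes). The lane values and the assert are illustrative only.

    #include <xsimd/xsimd.hpp>
    #include <cassert>

    int main()
    {
        // 4-lane float batch on the SSE2 architecture tag.
        xsimd::batch<float, xsimd::sse2> v { 1.0f, 2.0f, 3.0f, 4.0f };
        // Horizontal product of all lanes: 1 * 2 * 3 * 4 = 24.
        float p = xsimd::reduce_mul(v);
        assert(p == 24.0f);
        return 0;
    }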