@@ -1344,6 +1344,52 @@ namespace xsimd
            return first(acc3, A {});
        }

+        // reduce_mul
+        template <class A>
+        XSIMD_INLINE float reduce_mul(batch<float, A> const& self, requires_arch<sse2>) noexcept
+        {
+            __m128 tmp0 = _mm_mul_ps(self, _mm_movehl_ps(self, self));
+            __m128 tmp1 = _mm_mul_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1));
+            return _mm_cvtss_f32(tmp1);
+        }
+
+        template <class A>
+        XSIMD_INLINE double reduce_mul(batch<double, A> const& self, requires_arch<sse2>) noexcept
+        {
+            return _mm_cvtsd_f64(_mm_mul_sd(self, _mm_unpackhi_pd(self, self)));
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                batch<T, A> tmp1 = _mm_shuffle_epi32(self, _MM_SHUFFLE(0, 1, 2, 3));
+                tmp1 = tmp1 * self;
+                batch<T, A> tmp2 = _mm_unpackhi_epi32(tmp1, tmp1);
+                tmp2 = tmp2 * tmp1;
+                return _mm_cvtsi128_si32(tmp2);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                batch<T, A> tmp1 = _mm_unpackhi_epi64(self, self);
+                auto tmp2 = tmp1 * self;
+#if defined(__x86_64__)
+                return _mm_cvtsi128_si64(tmp2);
+#else
+                __m128i m;
+                _mm_storel_epi64(&m, tmp2);
+                int64_t i;
+                std::memcpy(&i, &m, sizeof(i));
+                return i;
+#endif
+            }
+            else
+            {
+                return reduce_mul(self, common {});
+            }
+        }
+
        // rsqrt
        template <class A>
        XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<sse2>) noexcept
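Usage sketch (not part of the diff): a minimal example of the horizontal product these kernels compute, assuming the public entry point xsimd::reduce_mul is wired up elsewhere in this change and dispatches to the SSE2 overloads added above on a 128-bit SSE2 target (4 float lanes). The lane values and the assert are illustrative only.

    #include <xsimd/xsimd.hpp>
    #include <cassert>

    int main()
    {
        // 4-lane float batch on the SSE2 architecture tag.
        xsimd::batch<float, xsimd::sse2> v { 1.0f, 2.0f, 3.0f, 4.0f };
        // Horizontal product of all lanes: 1 * 2 * 3 * 4 = 24.
        float p = xsimd::reduce_mul(v);
        assert(p == 24.0f);
        return 0;
    }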