@@ -1688,8 +1688,61 @@ namespace xsimd
1688
1688
}
1689
1689
1690
1690
template <class A , uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
1691
- XSIMD_INLINE batch<uint16_t , A> swizzle (batch<uint16_t , A> const & self, batch_constant<uint16_t , A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<sse2>) noexcept
1691
+ XSIMD_INLINE batch<uint16_t , A> swizzle (batch<uint16_t , A> const & self, batch_constant<uint16_t , A, V0, V1, V2, V3, V4, V5, V6, V7> mask , requires_arch<sse2>) noexcept
1692
1692
{
1693
+ constexpr bool is_identity = detail::is_identity (mask);
1694
+ constexpr bool is_dup_lo = detail::is_dup_lo (mask);
1695
+ constexpr bool is_dup_hi = detail::is_dup_hi (mask);
1696
+
1697
+ XSIMD_IF_CONSTEXPR (is_identity)
1698
+ {
1699
+ return self;
1700
+ }
1701
+ XSIMD_IF_CONSTEXPR (is_dup_lo)
1702
+ {
1703
+ // permute the low half
1704
+ constexpr int imm = detail::mod_shuffle (V0, V1, V2, V3);
1705
+ const auto lo = _mm_shufflelo_epi16 (self, imm);
1706
+ // broadcast that 64-bit low half into both halves
1707
+ const auto lo_all = _mm_unpacklo_epi64 (lo, lo);
1708
+ return lo_all;
1709
+ }
1710
+ XSIMD_IF_CONSTEXPR (is_dup_hi)
1711
+ {
1712
+ // permute the high half
1713
+ constexpr int imm = detail::mod_shuffle (V4, V5, V6, V7);
1714
+ const auto hi = _mm_shufflehi_epi16 (self, imm);
1715
+ // broadcast that 64-bit high half into both halves
1716
+ const auto hi_all = _mm_unpackhi_epi64 (hi, hi);
1717
+ return hi_all;
1718
+ }
1719
+ // Only pick elements from the low lane
1720
+ XSIMD_IF_CONSTEXPR ((V0 < 4 ) && (V1 < 4 ) && (V2 < 4 ) && (V3 < 4 ) && (V4 < 4 ) && (V5 < 4 ) && (V6 < 4 ) && (V7 < 4 ))
1721
+ {
1722
+ // permute within each sub lane
1723
+ constexpr auto mask_lo = detail::mod_shuffle (V0, V1, V2, V3);
1724
+ constexpr auto mask_hi = detail::mod_shuffle (V4, V5, V6, V7);
1725
+ __m128i lol = _mm_shufflelo_epi16 (self, mask_lo);
1726
+ __m128i loh = _mm_shufflelo_epi16 (self, mask_hi);
1727
+
1728
+ // assemble the result: low 64 bits of lol become the low half, low 64 bits of loh the high half
1729
+ return _mm_unpacklo_epi64 (lol, loh);
1730
+ }
1731
+ // Only pick elements from the high lane
1732
+ XSIMD_IF_CONSTEXPR ((V0 >= 4 ) && (V1 >= 4 ) && (V2 >= 4 ) && (V3 >= 4 ) && (V4 >= 4 ) && (V5 >= 4 ) && (V6 >= 4 ) && (V7 >= 4 ))
1733
+ {
1734
+ // permute within each sub lane
1735
+ constexpr auto mask_lo = detail::mod_shuffle (V0, V1, V2, V3);
1736
+ constexpr auto mask_hi = detail::mod_shuffle (V4, V5, V6, V7);
1737
+ __m128i hil = _mm_shufflehi_epi16 (self, mask_lo);
1738
+ __m128i hih = _mm_shufflehi_epi16 (self, mask_hi);
1739
+
1740
+ // assemble the result: high 64 bits of hil become the low half, high 64 bits of hih the high half
1741
+ return _mm_unpackhi_epi64 (hil, hih);
1742
+ }
1743
+
1744
+ // Generic case
1745
+
1693
1746
// permute within each sub lane
1694
1747
constexpr auto mask_lo = detail::mod_shuffle (V0, V1, V2, V3);
1695
1748
constexpr auto mask_hi = detail::mod_shuffle (V4, V5, V6, V7);
0 commit comments