Skip to content

Commit b4ee3f2

Browse files
Specialize sse2 swizzle on short based on the work from @DiamonDinoia
1 parent c5cca10 commit b4ee3f2

File tree

1 file changed

+54
-1
lines changed

1 file changed

+54
-1
lines changed

include/xsimd/arch/xsimd_sse2.hpp

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1688,8 +1688,61 @@ namespace xsimd
16881688
}
16891689

16901690
template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
1691-
XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<sse2>) noexcept
1691+
XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<sse2>) noexcept
16921692
{
1693+
constexpr bool is_identity = detail::is_identity(mask);
1694+
constexpr bool is_dup_lo = detail::is_dup_lo(mask);
1695+
constexpr bool is_dup_hi = detail::is_dup_hi(mask);
1696+
1697+
XSIMD_IF_CONSTEXPR(is_identity)
1698+
{
1699+
return self;
1700+
}
1701+
XSIMD_IF_CONSTEXPR(is_dup_lo)
1702+
{
1703+
// permute the low half
1704+
constexpr int imm = detail::mod_shuffle(V0, V1, V2, V3);
1705+
const auto lo = _mm_shufflelo_epi16(self, imm);
1706+
// broadcast that 64-bit low half into both halves
1707+
const auto lo_all = _mm_unpacklo_epi64(lo, lo);
1708+
return lo_all;
1709+
}
1710+
XSIMD_IF_CONSTEXPR(is_dup_hi)
1711+
{
1712+
// permute the high half
1713+
constexpr int imm = detail::mod_shuffle(V4, V5, V6, V7);
1714+
const auto hi = _mm_shufflehi_epi16(self, imm);
1715+
// broadcast that 64-bit high half into both halves
1716+
const auto hi_all = _mm_unpackhi_epi64(hi, hi);
1717+
return hi_all;
1718+
}
1719+
// Only pick elements from the low lane
1720+
XSIMD_IF_CONSTEXPR((V0 < 4) && (V1 < 4) && (V2 < 4) && (V3 < 4) && (V4 < 4) && (V5 < 4) && (V6 < 4) && (V7 < 4))
1721+
{
1722+
// shuffle the low 64-bit half twice: once with the indices for the output's low half (V0..V3), once for its high half (V4..V7)
1723+
constexpr auto mask_lo = detail::mod_shuffle(V0, V1, V2, V3);
1724+
constexpr auto mask_hi = detail::mod_shuffle(V4, V5, V6, V7);
1725+
__m128i lol = _mm_shufflelo_epi16(self, mask_lo);
1726+
__m128i loh = _mm_shufflelo_epi16(self, mask_hi);
1727+
1728+
// concatenate the low 64-bit halves of both shuffles to form the final result
1729+
return _mm_unpacklo_epi64(lol, loh);
1730+
}
1731+
// Only pick elements from the high lane
1732+
XSIMD_IF_CONSTEXPR((V0 >= 4) && (V1 >= 4) && (V2 >= 4) && (V3 >= 4) && (V4 >= 4) && (V5 >= 4) && (V6 >= 4) && (V7 >= 4))
1733+
{
1734+
// shuffle the high 64-bit half twice: once with the indices for the output's low half (V0..V3), once for its high half (V4..V7)
1735+
constexpr auto mask_lo = detail::mod_shuffle(V0, V1, V2, V3);
1736+
constexpr auto mask_hi = detail::mod_shuffle(V4, V5, V6, V7);
1737+
__m128i hil = _mm_shufflehi_epi16(self, mask_lo);
1738+
__m128i hih = _mm_shufflehi_epi16(self, mask_hi);
1739+
1740+
// concatenate the high 64-bit halves of both shuffles to form the final result
1741+
return _mm_unpackhi_epi64(hil, hih);
1742+
}
1743+
1744+
// Generic case
1745+
16931746
// permute within each sub lane
16941747
constexpr auto mask_lo = detail::mod_shuffle(V0, V1, V2, V3);
16951748
constexpr auto mask_hi = detail::mod_shuffle(V4, V5, V6, V7);

0 commit comments

Comments
 (0)