
Commit 896cf9d

Jiong Gong authored and pytorchmergebot committed
[inductor][cpp] vectorization support for int32/int64 (pytorch#119001)
This pull request completes most of the support for vectorizing the int32 and int64 data types, except for indirect indexing and masks. Basic data type support for uint32 and uint64 is also added, but without vectorization. More vectorized conversion functions between integer and float are added. To support int64 vectors, a new VectorizedN class is introduced to handle vectors of arbitrary length. Details:

1. Complete most of the int32 and int64 vectorization support, including load, store, reduction, constant and conversion. Indirect indexing and masks will be addressed in follow-up PRs, after which the legality-checking logic in `CppVecKernelChecker` can be further simplified.
2. Add util functions for conversion between integer and float vectors (in cpp_prefix.h and ATen vec). Ideally, they should be moved from cpp_prefix.h into ATen vec to simplify cpp_prefix.h; this will be addressed in follow-up PRs.
3. Introduce a new template class VectorizedN, designed to handle vectors of arbitrary length by encapsulating multiple `Vectorized<T>` instances. It supports most of the operations of `Vectorized<T>` and makes int64 vectorization simpler. It will also be applied to bf16/fp16/int8 in follow-up PRs for better efficiency: for example, bf16 currently uses only half of the vector lanes; with `VectorizedN`, all lanes can be used by mapping a bf16 vector to `VectorizedN<float, 2>` on conversion.
4. Add basic data type support for uint32 and uint64 (in graph.py). Vectorization support will be added later; it is not a high priority due to fewer usages.

Next steps:
- [ ] Refactor the vector mask handling to support data types other than float. Currently vector masks are implemented with float vectors.
- [ ] Fully utilize vector lanes for bfloat16/float16/int8.
- [ ] Support indirect indexing with vectorized index via scalarization.
- [ ] Clean up `CppVecKernelChecker`.
- [ ] Simplify `cpp_prefix.h`, including refactoring the vector conversion logic.

Pull Request resolved: pytorch#119001
Approved by: https://github.com/peterbell10, https://github.com/jansel
1 parent 8182fce commit 896cf9d

8 files changed: 813 additions, 134 deletions

aten/src/ATen/cpu/vec/vec256/vec256.h

Lines changed: 18 additions & 0 deletions

```diff
@@ -143,6 +143,24 @@ inline convert_to_int_of_same_size<float>(const Vectorized<float> &src) {
   return _mm256_cvttps_epi32(src);
 }
 
+// Only works for inputs in the range: [-2^51, 2^51]
+// From: https://stackoverflow.com/a/41148578
+template<>
+Vectorized<double>
+inline convert_to_fp_of_same_size<double>(const Vectorized<int64_t> &src) {
+  auto x = _mm256_add_epi64(src, _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000)));
+  return _mm256_sub_pd(
+    _mm256_castsi256_pd(x),
+    _mm256_set1_pd(0x0018000000000000)
+  );
+}
+
+template<>
+Vectorized<float>
+inline convert_to_fp_of_same_size<float>(const Vectorized<int32_t> &src) {
+  return _mm256_cvtepi32_ps(src);
+}
+
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 template <>
```

aten/src/ATen/cpu/vec/vec512/vec512.h

Lines changed: 12 additions & 0 deletions

```diff
@@ -127,6 +127,18 @@ inline convert_to_int_of_same_size<float>(const Vectorized<float> &src) {
   return _mm512_cvttps_epi32(src);
 }
 
+template<>
+Vectorized<double>
+inline convert_to_fp_of_same_size<double>(const Vectorized<int64_t> &src) {
+  return _mm512_cvtepi64_pd(src);
+}
+
+template<>
+Vectorized<float>
+inline convert_to_fp_of_same_size<float>(const Vectorized<int32_t> &src) {
+  return _mm512_cvtepi32_ps(src);
+}
+
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 template <>
```

aten/src/ATen/cpu/vec/vec_base.h

Lines changed: 19 additions & 0 deletions

```diff
@@ -622,6 +622,12 @@ template <class T> Vectorized<T> inline operator/(const Vectorized<T> &a, const
   return c;
 }
 
+template <class T,
+          typename std::enable_if<!is_floating_point_v<T>, int>::type = 0>
+Vectorized<T> inline operator%(const Vectorized<T> &a, const Vectorized<T> &b) __ubsan_ignore_float_divide_by_zero__ {
+  return a - a / b * b;
+}
+
 template <class T> Vectorized<T> inline operator||(
     const Vectorized<T> &a, const Vectorized<T> &b) {
   Vectorized<T> c;
```
```diff
@@ -989,6 +995,19 @@ inline Vectorized<IntType> convert_to_int_of_same_size(const Vectorized<T>& src)
   return Vectorized<IntType>::loadu(static_cast<const void*>(buffer.data()));
 }
 
+template <typename T, typename IntType = int_same_size_t<T>>
+inline Vectorized<T> convert_to_fp_of_same_size(const Vectorized<IntType>& src) {
+  static_assert(sizeof(T) == sizeof(IntType));
+  static constexpr int size = Vectorized<T>::size();
+
+  std::array<IntType, size> src_arr;
+  src.store(static_cast<void*>(src_arr.data()));
+  std::array<T, size> buffer;
+  std::transform(src_arr.cbegin(), src_arr.cend(), buffer.begin(),
+                 [](const IntType& x) { return static_cast<T>(x); });
+  return Vectorized<T>::loadu(static_cast<const void*>(buffer.data()));
+}
+
 // Example inputs for AVX512:
 // a Vectorized<float> = {a0, b0, a1, b1, a2, b2, a3, b3, a4, b4, a5, b5, a6, b6, a7, b7}
 // b Vectorized<float> = {a8, b8, a9, b9, a10, b10, a11, b11, a12, b12, a13, b13, a14, b14, a15, b15}
```
