|
4 | 4 | #include <intrin.h> |
5 | 5 | #endif |
6 | 6 |
|
/*
 * Store the value c into x[0] .. x[n-1].
 *
 * The bulk of the range is written 16 doubles at a time as four unaligned
 * 256-bit stores (4 doubles each); the remaining n % 16 elements are written
 * one by one.  Nothing is written when n <= 0.
 */
static void THDoubleVector_fill_AVX(double *x, const double c, const ptrdiff_t n) {
  /* Every lane of the vector register holds c. */
  const __m256d splat = _mm256_set1_pd(c);
  /* Elements left over once the range is cut into 16-wide chunks. */
  const ptrdiff_t tail = n % 16;
  /* First index that the vector loop does not cover. */
  const ptrdiff_t bulk = n - tail;
  ptrdiff_t pos;

  for (pos = 0; pos < bulk; pos += 16) {
    double *dst = x + pos;
    _mm256_storeu_pd(dst,      splat);
    _mm256_storeu_pd(dst + 4,  splat);
    _mm256_storeu_pd(dst + 8,  splat);
    _mm256_storeu_pd(dst + 12, splat);
  }

  /* Scalar clean-up for the last (n % 16) elements. */
  for (pos = bulk; pos < n; ++pos) {
    x[pos] = c;
  }
}
| 22 | + |
7 | 23 | static void THDoubleVector_cdiv_AVX(double *z, const double *x, const double *y, const ptrdiff_t n) { |
8 | 24 | ptrdiff_t i; |
9 | 25 | __m256d YMM0, YMM1, YMM2, YMM3; |
@@ -107,6 +123,21 @@ static void THDoubleVector_add_AVX(double *y, const double *x, const double c, c |
107 | 123 | } |
108 | 124 | } |
109 | 125 |
|
/*
 * Store the value c into x[0] .. x[n-1].
 *
 * The bulk of the range is written 32 floats at a time as four unaligned
 * 256-bit stores (8 floats each); the remaining n % 32 elements are written
 * one by one.  Nothing is written when n <= 0.
 */
static void THFloatVector_fill_AVX(float *x, const float c, const ptrdiff_t n) {
  /* Every lane of the vector register holds c. */
  const __m256 splat = _mm256_set1_ps(c);
  /* Elements left over once the range is cut into 32-wide chunks. */
  const ptrdiff_t tail = n % 32;
  /* First index that the vector loop does not cover. */
  const ptrdiff_t bulk = n - tail;
  ptrdiff_t pos;

  for (pos = 0; pos < bulk; pos += 32) {
    float *dst = x + pos;
    _mm256_storeu_ps(dst,      splat);
    _mm256_storeu_ps(dst + 8,  splat);
    _mm256_storeu_ps(dst + 16, splat);
    _mm256_storeu_ps(dst + 24, splat);
  }

  /* Scalar clean-up for the last (n % 32) elements. */
  for (pos = bulk; pos < n; ++pos) {
    x[pos] = c;
  }
}
110 | 141 |
|
111 | 142 | static void THFloatVector_cdiv_AVX(float *z, const float *x, const float *y, const ptrdiff_t n) { |
112 | 143 | ptrdiff_t i; |
|
0 commit comments