@@ -476,11 +476,73 @@ void THTensor_(add_Default)(THTensor *r_, THTensor *t, real value)
  }
}

+#if defined(TH_REAL_IS_DOUBLE)
+
+#include <immintrin.h>
+
void THTensor_(add_AVX)(THTensor *r_, THTensor *t, real value)
{
-  // TODO
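+  // Fast path for double tensors: the data is indexed linearly below, which
+  // assumes r_ and t are contiguous (add_Default handles the general case).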
+  THTensor_(resizeAs)(r_, t);
+
+  real *tp = THTensor_(data)(t);
+  real *rp = THTensor_(data)(r_);
+  long sz = THTensor_(nElement)(t);
+
+  __m256d c = _mm256_broadcast_sd(&value);
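+  // Software-pipelined main loop: 20 doubles (five 4-wide AVX vectors) per
+  // iteration, with each load issued ahead of the add/store that consume it.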
+  long batch_size = 20;
+  sz -= batch_size;
+  if (sz > 0) {
+    // pipelined loop prologue
+    __m256d x0 = _mm256_loadu_pd(tp); /* unaligned: data may not be 32-byte aligned */
+
+    __m256d x1 = _mm256_loadu_pd(tp + 4);
+    __m256d y0 = _mm256_add_pd(x0, c);
+    for (; sz > batch_size; sz -= batch_size) { // tp incremented in loop
+      __m256d x2 = _mm256_loadu_pd(tp + 8);
+      __m256d y1 = _mm256_add_pd(x1, c);
+      _mm256_storeu_pd(rp, y0);
+
+      __m256d x3 = _mm256_loadu_pd(tp + 12);
+      __m256d y2 = _mm256_add_pd(x2, c);
+      _mm256_storeu_pd(rp + 4, y1);
+
+      __m256d x4 = _mm256_loadu_pd(tp + 16);
+      __m256d y3 = _mm256_add_pd(x3, c);
+      _mm256_storeu_pd(rp + 8, y2);
+      tp += batch_size;
+
+      x0 = _mm256_loadu_pd(tp);
+      __m256d y4 = _mm256_add_pd(x4, c);
+      _mm256_storeu_pd(rp + 12, y3);
+
+      x1 = _mm256_loadu_pd(tp + 4);
+      y0 = _mm256_add_pd(x0, c);
+      _mm256_storeu_pd(rp + 16, y4);
+      rp += batch_size;
+    }
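+    // pipelined loop epilogue: store the 8 doubles preloaded for the iteration that never ran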
+    __m256d y1 = _mm256_add_pd(x1, c);
+    _mm256_storeu_pd(rp, y0);
+
+    _mm256_storeu_pd(rp + 4, y1);
+
+    rp += 8;
+    tp += 8;
+    sz -= 8; /* without this, the scalar tail would overrun by the 8 elements stored above */
+  }
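+  // scalar tail: restore the batch offset and finish the remaining elements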
+  sz += batch_size;
+  long i;
+  for (i = 0; i < sz; ++i) {
+    rp[i] = tp[i] + value;
+  }
  return;
}
+#endif

void THTensor_(sub)(THTensor *r_, THTensor *t, real value)
{