Skip to content

Commit 6ada3c0

Browse files
rguthrie3 authored and soumith committed
Fast floating point add kernel in intrinsics (11x speedup over default for 10k elements)
1 parent 60061fb commit 6ada3c0

File tree

2 files changed

+63
-1
lines changed

2 files changed

+63
-1
lines changed

generic/THTensorMath.c

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -476,11 +476,66 @@ void THTensor_(add_Default)(THTensor *r_, THTensor *t, real value)
476476
}
477477
}
478478

479+
#if defined(TH_REAL_IS_DOUBLE)
480+
481+
#include <immintrin.h>
482+
479483
// AVX implementation of r_ = t + value for double tensors.
//
// r_ is resized to the shape of t, then both tensors are walked as flat
// arrays of nElement(t) values. Each 256-bit vector holds 4 doubles. The
// main loop is software-pipelined: every iteration consumes 20 elements
// (5 vectors) and pre-loads the first 2 vectors of the next batch, which
// are finished either by the following iteration or by the epilogue.
// Whatever is left over is handled by a scalar loop.
//
// NOTE(review): the data pointers are treated as contiguous storage; this
// presumably relies on the caller/dispatcher only passing contiguous
// tensors -- confirm against add_Default.
void THTensor_(add_AVX)(THTensor *r_, THTensor *t, real value)
{
  THTensor_(resizeAs)(r_, t);

  // Data pointers are fetched only after the resize above.
  real *tp = THTensor_(data)(t);
  real *rp = THTensor_(data)(r_);
  long sz = THTensor_(nElement)(t);

  const long batch_size = 20; // elements per pipelined iteration (5 vectors)
  const long tail_size = 8;   // elements flushed by the epilogue (2 vectors)

  __m256d c = _mm256_broadcast_sd(&value);

  // Bias the counter by one batch so that "sz > 0" means "more than one
  // batch available"; this guarantees the look-ahead loads stay in bounds.
  sz -= batch_size;
  if (sz > 0) {
    // Pipelined loop prologue. Unaligned loads/stores are used throughout
    // because nothing here guarantees 32-byte alignment of tp / rp.
    __m256d x0 = _mm256_loadu_pd(tp);
    __m256d x1 = _mm256_loadu_pd(tp + 4);
    __m256d y0 = _mm256_add_pd(x0, c);

    for (; sz > batch_size; sz -= batch_size) { // tp/rp advanced in the body
      __m256d x2 = _mm256_loadu_pd(tp + 8);
      __m256d y1 = _mm256_add_pd(x1, c);
      _mm256_storeu_pd(rp, y0);

      __m256d x3 = _mm256_loadu_pd(tp + 12);
      __m256d y2 = _mm256_add_pd(x2, c);
      _mm256_storeu_pd(rp + 4, y1);

      __m256d x4 = _mm256_loadu_pd(tp + 16);
      __m256d y3 = _mm256_add_pd(x3, c);
      _mm256_storeu_pd(rp + 8, y2);
      tp += batch_size;

      // Look-ahead: start on the next batch while this one drains.
      x0 = _mm256_loadu_pd(tp);
      __m256d y4 = _mm256_add_pd(x4, c);
      _mm256_storeu_pd(rp + 12, y3);

      x1 = _mm256_loadu_pd(tp + 4);
      y0 = _mm256_add_pd(x0, c);
      _mm256_storeu_pd(rp + 16, y4);
      rp += batch_size;
    }

    // Epilogue: flush the two vectors that are still in flight.
    __m256d y1 = _mm256_add_pd(x1, c);
    _mm256_storeu_pd(rp, y0);
    _mm256_storeu_pd(rp + 4, y1);

    rp += tail_size;
    tp += tail_size;
    // Account for the elements just written; without this the scalar loop
    // below would run tail_size elements past the end of both buffers.
    sz -= tail_size;
  }
  // Undo the bias: sz is now the exact number of unprocessed elements.
  sz += batch_size;

  long i;
  for (i = 0; i < sz; ++i) {
    rp[i] = tp[i] + value;
  }
  return;
}
538+
#endif
484539

485540
void THTensor_(sub)(THTensor *r_, THTensor *t, real value)
486541
{

generic/THTensorMathDispatch.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,17 @@ TH_API void THTensor_(add)(THTensor *r_, THTensor *t, real value)
2323
// Dispatch tables: each optimized implementation of a function
2424
// is described in a table, and the tables are used to initialize
2525
// the function pointers for dynamic dispatch
26+
27+
#if defined(TH_REAL_IS_DOUBLE)
2628
FunctionDescription THTensor_(dispatchTblAdd)[] = {
2729
FUNCTION_IMPL(THTensor_(add_AVX), SIMDExtension_AVX),
2830
FUNCTION_IMPL(THTensor_(add_Default), SIMDExtension_DEFAULT)
2931
};
32+
#else
33+
FunctionDescription THTensor_(dispatchTblAdd)[] = {
34+
FUNCTION_IMPL(THTensor_(add_Default), SIMDExtension_DEFAULT)
35+
};
36+
#endif
3037

3138
int THTensor_(cpuDispatchInit)()
3239
{

0 commit comments

Comments
 (0)