taion
diff --git a/‎torch/lib/TH/THMath.h‎
Lines changed: 16 additions & 1 deletion b/‎torch/lib/TH/THMath.h‎
Lines changed: 16 additions & 1 deletion
diff --git a/‎torch/lib/TH/cmake/FindSSE.cmake‎
Lines changed: 1 addition & 1 deletion b/‎torch/lib/TH/cmake/FindSSE.cmake‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎torch/lib/TH/generic/THTensorCopy.c‎
Lines changed: 69 additions & 1 deletion b/‎torch/lib/TH/generic/THTensorCopy.c‎
Lines changed: 69 additions & 1 deletion
diff --git a/‎torch/lib/TH/generic/THTensorMath.c‎
Lines changed: 50 additions & 39 deletions b/‎torch/lib/TH/generic/THTensorMath.c‎
Lines changed: 50 additions & 39 deletions
@@ -17,5 +17,20 @@ static inline double TH_lerp(double a, double b, double weight) {
  return a + weight * (b-a);
 }
 
-#endif // _THMATH_H
+static inline float TH_sigmoidf(float value) { // single-precision logistic sigmoid: 1 / (1 + e^(-value))
+ return 1.0f / (1.0f + expf(-value));
+}
+
+static inline float TH_fracf(float x) { // single-precision fractional part: x minus truncf(x)
+ return x - truncf(x);
+}
+
+static inline float TH_rsqrtf(float x) { // single-precision reciprocal square root; x is passed to sqrtf unchecked
+ return 1.0f / sqrtf(x);
+}
 
+static inline float TH_lerpf(float a, float b, float weight) { // float counterpart of TH_lerp: interpolate from a towards b by weight
+ return a + weight * (b-a);
+}
+
+#endif // _THMATH_H
@@ -73,7 +73,7 @@ SET(AVX2_CODE "
 
  int main()
  {
- __m256i a;
+ __m256i a = {0};
  a = _mm256_abs_epi16(a);
  return 0;
  }
 
@@ -2,16 +2,84 @@
 #define TH_GENERIC_FILE "generic/THTensorCopy.c"
 #else
 
+int THTensor_(copyTransposeValid)(THTensor *tensor, THTensor *src) { // nonzero when the blocked copyTranspose path applies to copying src into tensor
+ const int MIN_SZ = 60 * 60; // minimum destination element count for taking the blocked path
+ return THTensor_(isContiguous)(tensor) &&
+ THTensor_(nDimension)(src) == 2 &&
+ THTensor_(stride)(src, 0) == 1 && // consecutive rows of src are adjacent in memory ...
+ THTensor_(stride)(src, 1) == THTensor_(size)(src, 0) && // ... and each column starts right after the previous one
+ THTensor_(nElement)(tensor) >= MIN_SZ; // NOTE(review): shapes/element counts of tensor and src are not compared here - confirm the caller guarantees they match
+}
+
+// special case copy where tensor is contiguous and src is a transposed matrix
+// This can be generalized to most copies, but it's trickier
+void THTensor_(copyTranspose)(THTensor *tensor, THTensor *src) { // copies src into tensor block by block through a BLOCK_SZ x BLOCK_SZ scratch tile
+ #define MIN(x, y) (((x) < (y)) ? (x) : (y))
+ #define MAX(x, y) (((x) > (y)) ? (x) : (y))
+
+#ifdef TH_REAL_IS_BYTE
+ const int BLOCK_SZ = 120; // byte tensors use a larger tile edge
+#else
+ const int BLOCK_SZ = 60;
+#endif
+
+ THTensor *buf = THTensor_(newWithSize2d)(BLOCK_SZ, BLOCK_SZ); // scratch tile, released before returning
+ real *sp = THTensor_(data)(src);
+ real *rp = THTensor_(data)(tensor);
+ real *bp = THTensor_(data)(buf);
+
+ long NR = THTensor_(size)(src, 0); // number of rows in src
+ long NC = THTensor_(size)(src, 1); // number of columns in src
+ for (long R = 0; R < NR; R += BLOCK_SZ) {
+ for (long C = 0; C < NC; C += BLOCK_SZ) {
+ real *spo = sp + R + C * NR; // block origin in src; relies on src strides (1, NR) as checked by copyTransposeValid
+ real *rpo = rp + C + R * NC; // block origin in tensor, addressed as a contiguous buffer with row stride NC
+
+ int nr = MIN(NR - R, BLOCK_SZ); // blocks on the last row/column edge may be smaller than BLOCK_SZ
+ int nc = MIN(NC - C, BLOCK_SZ);
+
+ // 1. copy columns from src to buf
+ for (int c = 0; c < nc; c++) {
+ memcpy(bp + c * BLOCK_SZ, spo + c * NR, nr * sizeof(real));
+ }
+
+ // 2. transpose buf in place
+ int rc_max = MAX(nr, nc);
+ int rc_min = MIN(nr, nc);
+ for (int r = 0; r < rc_max; r++) {
+ int end = MIN(r, rc_min); // c stays below r, so each off-diagonal pair is swapped exactly once
+ for (int c = 0; c < end; c++) {
+ real tmp = bp[r + BLOCK_SZ * c];
+ bp[r + BLOCK_SZ * c] = bp[r * BLOCK_SZ + c];
+ bp[r * BLOCK_SZ + c] = tmp;
+ }
+ }
+
+ // 3. copy rows from buf to dst
+ for (int r = 0; r < nr; r++) {
+ memcpy(rpo + r * NC, bp + r * BLOCK_SZ, nc * sizeof(real));
+ }
+ }
+ }
+ THTensor_(free)(buf);
+ #undef MIN
+ #undef MAX
+}
+
 void THTensor_(copy)(THTensor *tensor, THTensor *src)
 {
  if (THTensor_(isContiguous)(tensor) && THTensor_(isContiguous)(src) && THTensor_(nElement)(tensor) == THTensor_(nElement)(src)) {
  real *sp = THTensor_(data)(src);
  real *rp = THTensor_(data)(tensor);
  ptrdiff_t sz = THTensor_(nElement)(tensor);
 #ifndef TH_REAL_IS_HALF
- THVector_(copy)(rp, sp, sz); 
+ THVector_(copy)(rp, sp, sz);
 #else
  memcpy(rp, sp, sz * sizeof(real));
+#endif
+#ifndef TH_REAL_IS_HALF
+ } else if (THTensor_(copyTransposeValid)(tensor, src)) {
+ THTensor_(copyTranspose)(tensor, src);
 #endif
  } else {
  TH_TENSOR_APPLY2(real, tensor, real, src, *tensor_data = *src_data;)
 
@@ -2746,43 +2746,50 @@ TENSOR_IMPLEMENT_LOGICAL_SUM(logicalany, ||, 0)
 /* floating point only now */
 #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
 
-LAB_IMPLEMENT_BASIC_FUNCTION(log,log)
-LAB_IMPLEMENT_BASIC_FUNCTION(lgamma,lgamma)
-LAB_IMPLEMENT_BASIC_FUNCTION(log1p,log1p)
-LAB_IMPLEMENT_BASIC_FUNCTION(sigmoid,TH_sigmoid)
-LAB_IMPLEMENT_BASIC_FUNCTION(exp,exp)
-LAB_IMPLEMENT_BASIC_FUNCTION(cos,cos)
-LAB_IMPLEMENT_BASIC_FUNCTION(acos,acos)
-LAB_IMPLEMENT_BASIC_FUNCTION(cosh,cosh)
-LAB_IMPLEMENT_BASIC_FUNCTION(sin,sin)
-LAB_IMPLEMENT_BASIC_FUNCTION(asin,asin)
-LAB_IMPLEMENT_BASIC_FUNCTION(sinh,sinh)
-LAB_IMPLEMENT_BASIC_FUNCTION(tan,tan)
-LAB_IMPLEMENT_BASIC_FUNCTION(atan,atan)
-LAB_IMPLEMENT_BASIC_FUNCTION(tanh,tanh)
-LAB_IMPLEMENT_BASIC_FUNCTION_VALUE(pow,pow)
-LAB_IMPLEMENT_BASIC_FUNCTION(sqrt,sqrt)
-LAB_IMPLEMENT_BASIC_FUNCTION(rsqrt,TH_rsqrt)
-LAB_IMPLEMENT_BASIC_FUNCTION(ceil,ceil)
-LAB_IMPLEMENT_BASIC_FUNCTION(floor,floor)
-LAB_IMPLEMENT_BASIC_FUNCTION(round,round)
-LAB_IMPLEMENT_BASIC_FUNCTION(abs,fabs)
-LAB_IMPLEMENT_BASIC_FUNCTION(trunc,trunc)
-LAB_IMPLEMENT_BASIC_FUNCTION(frac,TH_frac)
+#if defined (TH_REAL_IS_FLOAT) // float instantiation: paste an 'f' suffix onto the argument (log -> logf, TH_lerp -> TH_lerpf, 1.0 -> 1.0f)
+#define TH_MATH_NAME(fn) fn##f
+#else // double instantiation: use the argument unchanged
+#define TH_MATH_NAME(fn) fn
+#endif
+
+LAB_IMPLEMENT_BASIC_FUNCTION(log,TH_MATH_NAME(log))
+LAB_IMPLEMENT_BASIC_FUNCTION(lgamma,TH_MATH_NAME(lgamma))
+LAB_IMPLEMENT_BASIC_FUNCTION(log1p,TH_MATH_NAME(log1p))
+LAB_IMPLEMENT_BASIC_FUNCTION(sigmoid,TH_MATH_NAME(TH_sigmoid))
+LAB_IMPLEMENT_BASIC_FUNCTION(exp,TH_MATH_NAME(exp))
+LAB_IMPLEMENT_BASIC_FUNCTION(cos,TH_MATH_NAME(cos))
+LAB_IMPLEMENT_BASIC_FUNCTION(acos,TH_MATH_NAME(acos))
+LAB_IMPLEMENT_BASIC_FUNCTION(cosh,TH_MATH_NAME(cosh))
+LAB_IMPLEMENT_BASIC_FUNCTION(sin,TH_MATH_NAME(sin))
+LAB_IMPLEMENT_BASIC_FUNCTION(asin,TH_MATH_NAME(asin))
+LAB_IMPLEMENT_BASIC_FUNCTION(sinh,TH_MATH_NAME(sinh))
+LAB_IMPLEMENT_BASIC_FUNCTION(tan,TH_MATH_NAME(tan))
+LAB_IMPLEMENT_BASIC_FUNCTION(atan,TH_MATH_NAME(atan))
+LAB_IMPLEMENT_BASIC_FUNCTION(tanh,TH_MATH_NAME(tanh))
+LAB_IMPLEMENT_BASIC_FUNCTION_VALUE(pow,TH_MATH_NAME(pow))
+LAB_IMPLEMENT_BASIC_FUNCTION(sqrt,TH_MATH_NAME(sqrt))
+LAB_IMPLEMENT_BASIC_FUNCTION(rsqrt,TH_MATH_NAME(TH_rsqrt))
+LAB_IMPLEMENT_BASIC_FUNCTION(ceil,TH_MATH_NAME(ceil))
+LAB_IMPLEMENT_BASIC_FUNCTION(floor,TH_MATH_NAME(floor))
+LAB_IMPLEMENT_BASIC_FUNCTION(round,TH_MATH_NAME(round))
+LAB_IMPLEMENT_BASIC_FUNCTION(abs,TH_MATH_NAME(fabs))
+LAB_IMPLEMENT_BASIC_FUNCTION(trunc,TH_MATH_NAME(trunc))
+LAB_IMPLEMENT_BASIC_FUNCTION(frac,TH_MATH_NAME(TH_frac))
 LAB_IMPLEMENT_BASIC_FUNCTION(neg,-)
-LAB_IMPLEMENT_BASIC_FUNCTION(cinv, 1.0 / )
+LAB_IMPLEMENT_BASIC_FUNCTION(cinv, TH_MATH_NAME(1.0) / )
+
 
 void THTensor_(atan2)(THTensor *r_, THTensor *tx, THTensor *ty)
 {
  THTensor_(resizeAs)(r_, tx);
- TH_TENSOR_APPLY3(real, r_, real, tx, real, ty, *r__data = atan2(*tx_data,*ty_data););
+ TH_TENSOR_APPLY3(real, r_, real, tx, real, ty, *r__data = TH_MATH_NAME(atan2)(*tx_data,*ty_data);); // element-wise atan2(tx, ty) into r_; TH_MATH_NAME selects atan2f for float tensors
 }
 
 void THTensor_(lerp)(THTensor *r_, THTensor *a, THTensor *b, real weight)
 {
  THArgCheck(THTensor_(nElement)(a) == THTensor_(nElement)(b), 2, "sizes do not match");
  THTensor_(resizeAs)(r_, a);
- TH_TENSOR_APPLY3(real, r_, real, a, real, b, *r__data = TH_lerp(*a_data, *b_data, weight););
+ TH_TENSOR_APPLY3(real, r_, real, a, real, b, *r__data = TH_MATH_NAME(TH_lerp)(*a_data, *b_data, weight);); // element-wise interpolation of a and b into r_; TH_MATH_NAME selects TH_lerpf for float tensors
 }
 
 void THTensor_(mean)(THTensor *r_, THTensor *t, int dimension, int keepdim)
@@ -2823,15 +2830,15 @@ void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int flag, int keep
  sum2 /= t_size;
  sum2 -= sum*sum;
  sum2 = (sum2 < 0 ? 0 : sum2);
- *r__data = (real)sqrt(sum2);
+ *r__data = (real)TH_MATH_NAME(sqrt)(sum2);
  }
  else
  {
  sum /= t_size;
  sum2 /= t_size-1;
  sum2 -= ((real)t_size)/((real)(t_size-1))*sum*sum;
  sum2 = (sum2 < 0 ? 0 : sum2);
- *r__data = (real)sqrt(sum2);
+ *r__data = (real)TH_MATH_NAME(sqrt)(sum2);
  });
 
  if (!keepdim) {
@@ -2907,9 +2914,11 @@ void THTensor_(norm)(THTensor *r_, THTensor *t, real value, int dimension, int k
  TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension,
  accreal sum = 0;
  long i;
- for(i = 0; i < t_size; i++)
- sum += pow(fabs(t_data[i*t_stride]), value);
- *r__data = pow(sum, 1.0/value);)
+ for(i = 0; i < t_size; i++) {
+ sum += TH_MATH_NAME(pow)(
+ TH_MATH_NAME(fabs)(t_data[i*t_stride]), value);
+ }
+ *r__data = TH_MATH_NAME(pow)(sum, 1.0/value);)
  }
 
  if (!keepdim) {
@@ -2924,14 +2933,14 @@ accreal THTensor_(normall)(THTensor *tensor, real value)
  TH_TENSOR_APPLY(real, tensor, sum += *tensor_data != 0.0;);
  return sum;
  } else if(value == 1) {
- TH_TENSOR_APPLY(real, tensor, sum += fabs(*tensor_data););
+ TH_TENSOR_APPLY(real, tensor, sum += TH_MATH_NAME(fabs)(*tensor_data););
  return sum;
  } else if(value == 2) {
  TH_TENSOR_APPLY(real, tensor, accreal z = *tensor_data; sum += z*z;);
  return sqrt(sum);
  } else {
- TH_TENSOR_APPLY(real, tensor, sum += pow(fabs(*tensor_data), value););
- return pow(sum, 1.0/value);
+ TH_TENSOR_APPLY(real, tensor, sum += TH_MATH_NAME(pow)(TH_MATH_NAME(fabs)(*tensor_data), value););
+ return TH_MATH_NAME(pow)(sum, 1.0/value);
  }
 }
 
@@ -2963,7 +2972,7 @@ void THTensor_(renorm)(THTensor *res, THTensor *src, real value, int dimension,
  } else if (value == 2) {
  TH_TENSOR_APPLY(real, rowS, accreal z = *rowS_data; norm += z*z;);
  } else {
- TH_TENSOR_APPLY(real, rowS, norm += pow(fabs(*rowS_data), value););
+ TH_TENSOR_APPLY(real, rowS, norm += TH_MATH_NAME(pow)(TH_MATH_NAME(fabs)(*rowS_data), value););
  }
 
  norm = pow(norm, 1/value);
@@ -2989,8 +2998,9 @@ accreal THTensor_(dist)(THTensor *tensor, THTensor *src, real value)
 {
  real sum = 0;
  TH_TENSOR_APPLY2(real, tensor, real, src,
- sum += pow(fabs(*tensor_data - *src_data), value);)
- return pow(sum, 1.0/value);
+ sum += TH_MATH_NAME(pow)(
+ TH_MATH_NAME(fabs)(*tensor_data - *src_data), value););
+ return TH_MATH_NAME(pow)(sum, 1.0/value);
 }
 
 accreal THTensor_(meanall)(THTensor *tensor)
@@ -3048,12 +3058,12 @@ void THTensor_(logspace)(THTensor *r_, real a, real b, long n)
 
  if(n == 1) {
  TH_TENSOR_APPLY(real, r_,
- *r__data = pow(10.0, a);
+ *r__data = TH_MATH_NAME(pow)(10.0, a);
  i++;
  );
  } else {
  TH_TENSOR_APPLY(real, r_,
- *r__data = pow(10.0, a + i*(b-a)/((real)(n-1)));
+ *r__data = TH_MATH_NAME(pow)(10.0, a + i*(b-a)/((real)(n-1)));
  i++;
  );
  }
@@ -3141,6 +3151,7 @@ void THTensor_(bhistc)(THTensor *hist, THTensor *tensor, long nbins, real minval
  );
 }
 
+#undef TH_MATH_NAME
 #endif /* floating point only part */
 #undef IS_NONZERO
 #endif
Original file line number	Diff line number	Diff line change
`@@ -73,7 +73,7 @@ SET(AVX2_CODE "`
`73`	`73`
`74`	`74`	`int main()`
`75`	`75`	`{`
`76`		`- __m256i a;`
	`76`	`+ __m256i a = {0};`
`77`	`77`	`a = _mm256_abs_epi16(a);`
`78`	`78`	`return 0;`
`79`	`79`	`}`