kaldi-asr
diff --git a/‎src/cudamatrix/cu-kernels-ansi.h‎
Lines changed: 11 additions & 0 deletions b/‎src/cudamatrix/cu-kernels-ansi.h‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎src/cudamatrix/cu-kernels.cu‎
Lines changed: 99 additions & 1 deletion b/‎src/cudamatrix/cu-kernels.cu‎
Lines changed: 99 additions & 1 deletion
diff --git a/‎src/cudamatrix/cu-kernels.h‎
Lines changed: 17 additions & 0 deletions b/‎src/cudamatrix/cu-kernels.h‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎src/cudamatrix/cu-math-test.cc‎
Lines changed: 103 additions & 1 deletion b/‎src/cudamatrix/cu-math-test.cc‎
Lines changed: 103 additions & 1 deletion
diff --git a/‎src/cudamatrix/cu-math.cc‎
Lines changed: 95 additions & 1 deletion b/‎src/cudamatrix/cu-math.cc‎
Lines changed: 95 additions & 1 deletion
@@ -705,6 +705,17 @@ void cudaD_copy_cols_from_vec(dim3 Gr, dim3 Bl, double *mat_out,
 void cudaF_copy_cols_from_vec(dim3 Gr, dim3 Bl, float *mat_out, MatrixDim d_out,
  const float *v_in);
 
+void cudaF_diff_normalize_per_row(size_t Gr, size_t Bl, float *id,
+ int id_stride, const float *iv,
+ MatrixDim iv_dim, const float* od,
+ int od_stride, float target_rms,
+ bool add_log_stddev);
+void cudaD_diff_normalize_per_row(size_t Gr, size_t Bl, double *id,
+ int id_stride, const double *iv,
+ MatrixDim iv_dim, const double* od,
+ int od_stride, double target_rms,
+ bool add_log_stddev);
+
 } // extern "C"
 
 #endif // HAVE_CUDA
 
@@ -2292,7 +2292,7 @@ static void _normalize_per_row(Real *y, int y_stride, const Real *x,
  }
  }
 
- const Real kSquaredNormFloor = 1.35525271560688e-20; // 2^-66
+ const Real kSquaredNormFloor = 1.3552527156068805425e-20; // 2^-66
  if (tid == 0) {
  ssum[0] = sqrt(
  fmax(ssum[0] / (target_rms * target_rms * x_d.cols), kSquaredNormFloor));
@@ -2315,6 +2315,87 @@ static void _normalize_per_row(Real *y, int y_stride, const Real *x,
 }
 
 
+// Backward pass of per-row normalization.  For each row, with
+//   D = iv_dim.cols,  p = (iv_row . iv_row) / (D * target_rms^2),
+//   f = 1 / sqrt(max(p, kSquaredNormFloor)),
+// this updates the row of 'id' with
+//   f * od_row  -  f^3 / (D * target_rms^2) * (od_row . iv_row) * iv_row,
+// where the second term is dropped when p was floored.  If add_log_stddev is
+// true, column iv_dim.cols of 'od' is read as the derivative w.r.t. the
+// log-stddev output and its contribution is added as well.
+//
+// Launch layout: one thread block per row (blockIdx.x is the row index) and
+// exactly CU1DBLOCK threads per block; the shared buffers and the reduction
+// below are sized for that block width.
+//
+// 'id' may point to the same memory as 'od'; in that case the row is scaled
+// in place instead of being added to.
+template<typename Real>
+__global__
+static void _diff_normalize_per_row(Real *id, int id_stride, const Real *iv,
+                                    MatrixDim iv_dim, const Real* od,
+                                    int od_stride, Real target_rms,
+                                    bool add_log_stddev) {
+
+  const Real kSquaredNormFloor = 1.3552527156068805425e-20; // 2^-66
+  const Real kInvNormFloor = 8589934592.0; // 2^33 == 1 / sqrt(2^-66)
+
+  const int tid = threadIdx.x;
+  const int i = blockIdx.x;
+  const Real* iv_row = iv + i * iv_dim.stride;
+  const Real* od_row = od + i * od_stride;
+
+  // reduce to CU1DBLOCK elements per row
+  Real dot_products = Real(0);
+  Real in_norm = Real(0);
+  for (int j = tid; j < iv_dim.cols; j += CU1DBLOCK) {
+    const Real iv_ij = iv_row[j];
+    dot_products += iv_ij * od_row[j];
+    in_norm += iv_ij * iv_ij;
+  }
+  __shared__ Real sprod[CU1DBLOCK];
+  __shared__ Real snorm[CU1DBLOCK];
+  sprod[tid] = dot_products;
+  snorm[tid] = in_norm;
+  __syncthreads();
+
+  // reduce to 1 element per row.  Each step only lets threads below 'shift'
+  // write, so the slots being read ([shift, 2*shift)) and written
+  // ([0, shift)) never overlap, and the block-wide barrier orders one step
+  // against the next.  This does not rely on the lanes of a warp executing
+  // in lockstep.
+#pragma unroll
+  for (int shift = CU1DBLOCK / 2; shift > 0; shift >>= 1) {
+    if (tid < shift) {
+      sprod[tid] += sprod[tid + shift];
+      snorm[tid] += snorm[tid + shift];
+    }
+    __syncthreads();
+  }
+
+  // broadcast the sum results; the barrier that ends the last reduction step
+  // makes element 0 visible to every thread of the block.
+  dot_products = sprod[0];
+  in_norm = snorm[0];
+
+  // Only meaningful (and only read) when add_log_stddev is true.
+  Real log_stddev_deriv = Real(0);
+  if (add_log_stddev) {
+    log_stddev_deriv = Real(1) / max(in_norm, iv_dim.cols * kSquaredNormFloor)
+        * od_row[iv_dim.cols];
+  }
+
+  const Real inv_d_scaled = Real(1) / (iv_dim.cols * target_rms * target_rms);
+  in_norm = Real(1) / sqrt(max(in_norm * inv_d_scaled, kSquaredNormFloor));
+
+  // When the squared norm hit the floor, f is constant w.r.t. the input, so
+  // the term that back-propagates through f vanishes.
+  const Real f = in_norm == kInvNormFloor ? Real(0) : in_norm;
+  dot_products *= f * f * f * inv_d_scaled;
+
+  for (int j = tid; j < iv_dim.cols; j += CU1DBLOCK) {
+    const Real iv_ij = iv_row[j];
+    Real id_ij = id[i * id_stride + j];
+    if (add_log_stddev) {
+      id_ij += log_stddev_deriv * iv_ij;
+    }
+    if (id != od) {
+      id_ij += in_norm * od_row[j];
+    } else {
+      // in-place: 'id' already holds the output derivative.
+      id_ij *= in_norm;
+    }
+    id_ij -= dot_products * iv_ij;
+    id[i * id_stride + j] = id_ij;
+  }
+}
+
+
 // Per-row log-softmax operation on 'x', with writing to 'y'.
 // note, x and y may point to the same memory. This is equivalent to setting
 // matrix y to matrix x and then, for each row of y, subtracting the offset that
@@ -4690,3 +4771,20 @@ void cudaF_copy_cols_from_vec(dim3 Gr, dim3 Bl, float *mat_out, MatrixDim d_out,
  const float *v_in) {
  _copy_cols_from_vec<<<Gr, Bl>>>(mat_out, d_out, v_in);
 }
+
+// Single-precision launcher for _diff_normalize_per_row: uses Gr as the grid
+// size and Bl as the block size, and forwards every other argument unchanged.
+void cudaF_diff_normalize_per_row(size_t Gr, size_t Bl, float *id,
+                                  int id_stride, const float *iv,
+                                  MatrixDim iv_dim, const float* od,
+                                  int od_stride, float target_rms,
+                                  bool add_log_stddev) {
+  _diff_normalize_per_row<float><<<Gr, Bl>>>(
+      id, id_stride, iv, iv_dim, od, od_stride, target_rms, add_log_stddev);
+}
+// Double-precision launcher for _diff_normalize_per_row: uses Gr as the grid
+// size and Bl as the block size, and forwards every other argument unchanged.
+void cudaD_diff_normalize_per_row(size_t Gr, size_t Bl, double *id,
+                                  int id_stride, const double *iv,
+                                  MatrixDim iv_dim, const double* od,
+                                  int od_stride, double target_rms,
+                                  bool add_log_stddev) {
+  _diff_normalize_per_row<double><<<Gr, Bl>>>(
+      id, id_stride, iv, iv_dim, od, od_stride, target_rms, add_log_stddev);
+}
@@ -1348,6 +1348,23 @@ inline void cuda_copy_cols_from_vec(dim3 Gr, dim3 Bl, float *mat_out,
  cudaF_copy_cols_from_vec(Gr, Bl, mat_out, d_out, v_in);
 }
 
+// Overload for double: dispatches to the cudaD_ entry point with the
+// arguments passed through as given.
+inline void cuda_diff_normalize_per_row(size_t Gr, size_t Bl, double *id,
+                                        int id_stride, const double *iv,
+                                        MatrixDim iv_dim, const double* od,
+                                        int od_stride, double target_rms,
+                                        bool add_log_stddev) {
+  return cudaD_diff_normalize_per_row(Gr, Bl,
+                                      id, id_stride,
+                                      iv, iv_dim,
+                                      od, od_stride,
+                                      target_rms, add_log_stddev);
+}
+// Overload for float: dispatches to the cudaF_ entry point with the
+// arguments passed through as given.
+inline void cuda_diff_normalize_per_row(size_t Gr, size_t Bl, float *id,
+                                        int id_stride, const float *iv,
+                                        MatrixDim iv_dim, const float* od,
+                                        int od_stride, float target_rms,
+                                        bool add_log_stddev) {
+  return cudaF_diff_normalize_per_row(Gr, Bl,
+                                      id, id_stride,
+                                      iv, iv_dim,
+                                      od, od_stride,
+                                      target_rms, add_log_stddev);
+}
+
 } // namespace kaldi
 
 #endif // HAVE_CUDA
 
@@ -510,14 +510,115 @@ static void UnitTestCuMathNormalizePerRow() {
 
  BaseFloat gflops = ((BaseFloat) dim * dim * iter)
  / (tim.Elapsed() * 1.0e+09);
- KALDI_LOG << "For CuMatrix::NormalizePerRow"
+ KALDI_LOG << "For CuMath::NormalizePerRow"
  << (sizeof(Real)==8?"<double>":"<float>") << ", for dim = "
  << dim << ", speed was " << gflops << " gigaflops.";
  if (tim.Elapsed() > 0.05)
  break;
  }
 }
 
+// Checks cu::DiffNormalizePerRow against a reference computed with the plain
+// (non-Cu) matrix classes, once with add_log_stddev enabled and once with it
+// disabled, and then logs a rough throughput figure for a range of sizes.
+template<typename Real>
+static void UnitTestCuDiffNormalizePerRow() {
+  for (int32 i = 0; i < 2; i++) {
+    int row = 10 + Rand() % 40;
+    int col = 10 + Rand() % 50;
+
+    // The output derivative always carries one extra column; it is only read
+    // when add_log_stddev is true.
+    Matrix<Real> Hi(row, col);
+    Matrix<Real> Hid(row, col);
+    Matrix<Real> Hod(row, col + 1);
+    Hi.SetRandn();
+    Hod.SetRandn();
+    Hi.Scale(5.0);
+
+    CuMatrix<Real> Di(row, col);
+    CuMatrix<Real> Did(row, col);
+    CuMatrix<Real> Dod(row, col + 1);
+    Di.CopyFromMat(Hi);
+    Dod.CopyFromMat(Hod);
+
+    Real target_rms = 0.3456;
+    // cover both settings of the flag across the two iterations.
+    bool add_log_stddev = (i == 0);
+    const Real kSquaredNormFloor = 1.3552527156068805425e-20; // 2^-66
+
+    //gpu
+    cu::DiffNormalizePerRow(Di, Dod, target_rms, add_log_stddev, &Did);
+
+    //cpu
+    {
+      MatrixBase<Real>* in_deriv = &Hid;
+      MatrixBase<Real>& out_deriv(Hod);
+      MatrixBase<Real>& in_value(Hi);
+
+      const SubMatrix<Real> out_deriv_no_log(out_deriv, 0, out_deriv.NumRows(),
+                                             0, in_value.NumCols());
+      Vector<Real> dot_products(out_deriv.NumRows());
+      dot_products.AddDiagMatMat(1.0, out_deriv_no_log, kNoTrans, in_value,
+                                 kTrans, 0.0);
+      Vector<Real> in_norm(in_value.NumRows());
+      Real d_scaled = (in_value.NumCols() * target_rms * target_rms);
+      in_norm.AddDiagMat2(1.0, in_value, kNoTrans, 0.0);
+      if (add_log_stddev) {
+        Vector<Real> log_stddev_deriv(in_norm), // log_stddev deriv as dF/dy .* (x^T x)^-1
+            out_deriv_for_stddev(out_deriv.NumRows(), kUndefined);
+        // f = log(sqrt(max(epsi, x^T x / D)))
+        // df/dx = epsi^2 * D < x^T x ? (1/(x^T x)) * x : 0.
+        // we don't compute this exactly below for the case when x^T x is very
+        // small, but we do make sure that the deriv isn't infinity when the input
+        // is zero.
+        log_stddev_deriv.ApplyFloor(in_value.NumCols() * kSquaredNormFloor);
+        log_stddev_deriv.ApplyPow(-1.0);
+        out_deriv_for_stddev.CopyColFromMat(out_deriv,
+                                            (out_deriv.NumCols() - 1));
+        log_stddev_deriv.MulElements(out_deriv_for_stddev);
+        if (in_deriv)
+          in_deriv->AddDiagVecMat(1.0, log_stddev_deriv, in_value, kNoTrans,
+                                  1.0);
+      }
+      in_norm.Scale(1.0 / d_scaled);
+      in_norm.ApplyFloor(kSquaredNormFloor);
+      in_norm.ApplyPow(-0.5);
+      if (in_deriv) {
+        if (in_deriv->Data() != out_deriv_no_log.Data())
+          in_deriv->AddDiagVecMat(1.0, in_norm, out_deriv_no_log, kNoTrans,
+                                  1.0);
+        else
+          in_deriv->MulRowsVec(in_norm);
+        // rows whose norm was floored get no contribution from the f^3 term.
+        in_norm.ReplaceValue(1.0 / sqrt(kSquaredNormFloor), 0.0);
+        in_norm.ApplyPow(3.0);
+        dot_products.MulElements(in_norm);
+
+        in_deriv->AddDiagVecMat(-1.0 / d_scaled, dot_products, in_value,
+                                kNoTrans, 1.0);
+      }
+
+      Matrix<Real> Hid2(Did);
+      AssertEqual(Hid, Hid2, 0.00001);
+    }
+  }
+
+  // speed measurement
+  for (int dim = 16; dim <= 1024; dim *= 2) {
+    BaseFloat time_in_secs = 0.025;
+    CuMatrix<Real> id(dim, dim), iv(dim, dim), od(dim, dim + 1);
+    iv.SetRandn();
+    od.SetRandn();
+    Timer tim;
+    int32 iter = 0;
+    for (; tim.Elapsed() < time_in_secs; iter++) {
+      cu::DiffNormalizePerRow(iv, od, Real(0.456), true, &id);
+    }
+    BaseFloat fdim = dim;
+    BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
+    KALDI_LOG << "For CuMath::DiffNormalizePerRow"
+              << (sizeof(Real)==8?"<double>":"<float>")
+              << ", for dim = " << dim << ", speed was " << gflops
+              << " gigaflops.";
+  }
+}
+
+
 
 template<typename Real> void CudaMathUnitTest() {
 #if HAVE_CUDA == 1
@@ -531,6 +632,7 @@ template<typename Real> void CudaMathUnitTest() {
  UnitTestLstmNonlinearity();
  UnitTestBackpropLstmNonlinearity<Real>();
  UnitTestCuMathNormalizePerRow<Real>();
+ UnitTestCuDiffNormalizePerRow<Real>();
 }
 
 } // namespace kaldi
 
@@ -245,7 +245,7 @@ void Randomize(const CuMatrixBase<double> &src,
 template<typename Real>
 void NormalizePerRow(const CuMatrixBase<Real>& in, const Real target_rms,
  const bool add_log_stddev, CuMatrixBase<Real>* out) {
- const Real kSquaredNormFloor = 1.35525271560688e-20; // 2^-66
+ const Real kSquaredNormFloor = 1.3552527156068805425e-20; // 2^-66
  if (add_log_stddev) {
  KALDI_ASSERT(in.NumRows() == out->NumRows());
  KALDI_ASSERT(in.NumCols() + 1 == out->NumCols());
@@ -291,6 +291,100 @@ void NormalizePerRow(const CuMatrixBase<double>& in, const double target_rms,
  const bool add_log_stddev, CuMatrixBase<double>* out);
 
 
+// A note on the derivative of NormalizeComponent...
+// let both row_in and row_out be vectors of dimension D.
+// Let p = row_in^T row_in / (D * target_rms^2), and let
+// f = 1.0 / sqrt(max(kSquaredNormFloor, p)), and we compute row_out as:
+// row_out = f row_in.
+// Suppose we have a quantity deriv_out which is the derivative
+// of the objective function w.r.t. row_out. We want to compute
+// deriv_in which is the derivative of the objective function w.r.t.
+// row_in. Let the objective function be F. One term is obvious: we have
+// deriv_in = f deriv_out + ....
+// next we have to take into account the derivative that gets back-propagated
+// through f. Obviously, dF/df = deriv_out^T row_in.
+// And df/dp = (p <= kSquaredNormFloor ? 0.0 : -0.5 p^{-1.5}) = (f == 1.0 / sqrt(kSquaredNormFloor) ? 0.0 : -0.5 f^3),
+// and dp/d(row_in) = 2/(D * target_rms^2) row_in. [it's vector_valued].
+// So this term in dF/d(row_in) equals:
+// dF/df df/dp dp/d(row_in) = 2/(D * target_rms^2) (f == 1.0 / sqrt(kSquaredNormFloor) ? 0.0 : -0.5 f^3) (deriv_out^T row_in) row_in
+// So
+// deriv_in = f deriv_out + (f == 1.0 / sqrt(kSquaredNormFloor) ? 0.0 : -f^3 / (D * target_rms^2) ) (deriv_out^T row_in) row_in
+// If add_log_stddev is true, deriv_in has an additional term:
+// dF/dx_i = dF/df . df/dx_i => df/dx_i = x_i/(x^T x)
+// Computes the derivative of the per-row normalization w.r.t. its input and
+// adds it to *in_deriv.
+//   in_value:       the input that was given to the forward pass.
+//   out_deriv:      derivative w.r.t. the forward output; when add_log_stddev
+//                   is true it carries one extra column with the derivative
+//                   w.r.t. the log-stddev output.
+//   target_rms:     the target RMS used in the forward pass.
+//   add_log_stddev: whether the forward pass appended the log-stddev column.
+//   in_deriv:       destination; may share its data with out_deriv, in which
+//                   case it is rescaled in place.  If NULL, or if it has no
+//                   rows, the function does nothing.
+template<typename Real>
+void DiffNormalizePerRow(const CuMatrixBase<Real> &in_value,
+                         const CuMatrixBase<Real> &out_deriv,
+                         const Real target_rms, const bool add_log_stddev,
+                         CuMatrixBase<Real>* in_deriv) {
+  const Real kSquaredNormFloor = 1.3552527156068805425e-20; // 2^-66
+
+  // Without a destination there is nothing to write; bail out before either
+  // code path dereferences the pointer.
+  if (in_deriv == NULL)
+    return;
+
+  // Both code paths index the three matrices with the same row/column counts.
+  KALDI_ASSERT(in_value.NumRows() == out_deriv.NumRows());
+  KALDI_ASSERT(in_value.NumRows() == in_deriv->NumRows());
+  KALDI_ASSERT(in_value.NumCols() == in_deriv->NumCols());
+  KALDI_ASSERT(out_deriv.NumCols() >=
+               in_value.NumCols() + (add_log_stddev ? 1 : 0));
+
+  // An empty matrix would mean a zero-sized grid, which is not a valid
+  // kernel launch configuration.
+  if (in_deriv->NumRows() == 0)
+    return;
+
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    Timer tim;
+    // one block of CU1DBLOCK threads per row.
+    size_t dimBlock = CU1DBLOCK;
+    size_t dimGrid = in_deriv->NumRows();
+    cuda_diff_normalize_per_row(dimGrid, dimBlock, in_deriv->Data(),
+                                in_deriv->Stride(), in_value.Data(),
+                                in_value.Dim(), out_deriv.Data(),
+                                out_deriv.Stride(), target_rms, add_log_stddev);
+    CU_SAFE_CALL(cudaGetLastError());
+    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
+  } else
+#endif
+  {
+    const CuSubMatrix<Real> out_deriv_no_log(out_deriv, 0, out_deriv.NumRows(),
+                                             0, in_value.NumCols());
+    CuVector<Real> dot_products(out_deriv.NumRows());
+    dot_products.AddDiagMatMat(1.0, out_deriv_no_log, kNoTrans, in_value,
+                               kTrans, 0.0);
+    CuVector<Real> in_norm(in_value.NumRows());
+    Real d_scaled = (in_value.NumCols() * target_rms * target_rms);
+    in_norm.AddDiagMat2(1.0, in_value, kNoTrans, 0.0);
+
+    if (add_log_stddev) {
+      CuVector<Real> log_stddev_deriv(in_norm), // log_stddev deriv as dF/dy .* (x^T x)^-1
+          out_deriv_for_stddev(out_deriv.NumRows(), kUndefined);
+      // f = log(sqrt(max(epsi, x^T x / D)))
+      // df/dx = epsi^2 * D < x^T x ? (1/(x^T x)) * x : 0.
+      // we don't compute this exactly below for the case when x^T x is very
+      // small, but we do make sure that the deriv isn't infinity when the input
+      // is zero.
+      log_stddev_deriv.ApplyFloor(in_value.NumCols() * kSquaredNormFloor);
+      log_stddev_deriv.ApplyPow(-1.0);
+      out_deriv_for_stddev.CopyColFromMat(out_deriv, (out_deriv.NumCols() - 1));
+      log_stddev_deriv.MulElements(out_deriv_for_stddev);
+      in_deriv->AddDiagVecMat(1.0, log_stddev_deriv, in_value, kNoTrans, 1.0);
+    }
+    in_norm.Scale(1.0 / d_scaled);
+    in_norm.ApplyFloor(kSquaredNormFloor);
+    in_norm.ApplyPow(-0.5);
+
+    if (in_deriv->Data() != out_deriv_no_log.Data())
+      in_deriv->AddDiagVecMat(1.0, in_norm, out_deriv_no_log, kNoTrans, 1.0);
+    else
+      in_deriv->MulRowsVec(in_norm);  // in-place: rescale the rows instead.
+    // rows whose norm was floored get no contribution from the f^3 term.
+    in_norm.ReplaceValue(1.0 / sqrt(kSquaredNormFloor), 0.0);
+    in_norm.ApplyPow(3.0);
+    dot_products.MulElements(in_norm);
+
+    in_deriv->AddDiagVecMat(-1.0 / d_scaled, dot_products, in_value, kNoTrans,
+                            1.0);
+  }
+}
+
+template
+void DiffNormalizePerRow(const CuMatrixBase<float> &in_value,
+ const CuMatrixBase<float> &out_deriv,
+ const float target_rms, const bool add_log_stddev,
+ CuMatrixBase<float>* in_deriv);
+template
+void DiffNormalizePerRow(const CuMatrixBase<double> &in_value,
+ const CuMatrixBase<double> &out_deriv,
+ const double target_rms, const bool add_log_stddev,
+ CuMatrixBase<double>* in_deriv);
+
+
 // not calling this Sigmoid to reduce the chance of future collisions.
 template<typename Real>
 static inline Real ScalarSigmoid(Real a) {