|
| 1 | +// cudafeat/feature-online-batched-cmvn-cuda-kernels.cu |
| 2 | +// |
| 3 | +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. |
| 4 | +// Justin Luitjens |
| 5 | +// |
| 6 | +// Licensed under the Apache License, Version 2.0 (the "License"); |
| 7 | +// you may not use this file except in compliance with the License. |
| 8 | +// You may obtain a copy of the License at |
| 9 | +// |
| 10 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | +// |
| 12 | +// Unless required by applicable law or agreed to in writing, software |
| 13 | +// distributed under the License is distributed on an "AS IS" BASIS, |
| 14 | +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 15 | +// See the License for the specific language governing permissions and |
| 16 | +// limitations under the License. |
| 17 | +// |
| 18 | +#include <cub/cub.cuh> |
| 19 | +#include "cudafeat/feature-online-batched-cmvn-cuda-kernels.h" |
| 20 | + |
| 21 | +__device__ inline float2 operator-(const float2 &a, const float2 &b) { |
| 22 | + float2 retval; |
| 23 | + retval.x = a.x - b.x; |
| 24 | + retval.y = a.y - b.y; |
| 25 | + return retval; |
| 26 | +} |
| 27 | +__device__ inline float2 operator+(const float2 &a, const float2 &b) { |
| 28 | + float2 retval; |
| 29 | + retval.x = a.x + b.x; |
| 30 | + retval.y = a.y + b.y; |
| 31 | + return retval; |
| 32 | +} |
| 33 | + |
| 34 | +__device__ inline void atomicAdd(float2 *addr, float2 val) { |
| 35 | + atomicAdd(reinterpret_cast<float *>(addr), val.x); |
| 36 | + atomicAdd(reinterpret_cast<float *>(addr) + 1, val.y); |
| 37 | +} |
| 38 | + |
| 39 | +__device__ inline void operator+=(float2 &a, float2 &b) { |
| 40 | + // overloading += |
| 41 | + a.x += b.x; |
| 42 | + a.y += b.y; |
| 43 | +} |
| 44 | + |
| 45 | +namespace kaldi { |
| 46 | +// threadIdx.x = frame (up to 1024?) |
| 47 | +// blockIdx.x = feature |
| 48 | +// blockIdx.y = batch id |
| 49 | +__global__ void compute_cmvn_stats_kernel( |
| 50 | + int32_t feat_dim, int32_t chunk_size, int32_t stats_coarsening_factor, |
| 51 | + int32_t cmn_window, const float *in_data, int32_t ldi, int32_t stridei, |
| 52 | + float *stats_data, int32_t lds, const LaneDesc *lanes, int32_t num_lanes) { |
| 53 | + typedef cub::BlockScan<float2, 1024> BlockScan; |
| 54 | + __shared__ typename BlockScan::TempStorage temp_storage; |
| 55 | + |
| 56 | + int32_t lane = blockIdx.y; |
| 57 | + int32_t feat = blockIdx.x; // feature for this block |
| 58 | + int32_t tidx = threadIdx.x; |
| 59 | + |
| 60 | + // width of a window of stats data |
| 61 | + int32_t num_fragments = (chunk_size + cmn_window) / stats_coarsening_factor; |
| 62 | + |
| 63 | + // function to compute window location based on frame |
| 64 | + auto SIDX = [&](int frame, int feat) { |
| 65 | + int row = feat; |
| 66 | + int col = (frame / stats_coarsening_factor) % num_fragments; |
| 67 | + return row * num_fragments + col; |
| 68 | + }; |
| 69 | + |
| 70 | + LaneDesc desc = lanes[lane]; |
| 71 | + ChannelId channel = desc.channel; |
| 72 | + int32_t num_chunk_frames = desc.num_chunk_frames; |
| 73 | + |
| 74 | + // compute memory offsets for batch |
| 75 | + float2 *sdata = reinterpret_cast<float2 *>(stats_data + channel * lds); |
| 76 | + |
| 77 | + // batch is rows, cols is chunk_size x feat_dim, where feat_dim is |
| 78 | + // padded to ldi |
| 79 | + const float *idata = in_data + lane * stridei; |
| 80 | + |
| 81 | + // starting frame of audio |
| 82 | + int32_t start_frame = desc.current_frame; |
| 83 | + |
| 84 | + float2 running_sum = {0.0f, 0.0f}; |
| 85 | + |
| 86 | + // load previous running sum if this is not the first frame |
| 87 | + if (start_frame > 0) running_sum = sdata[SIDX(start_frame - 1, feat)]; |
| 88 | + |
| 89 | + // for each frame compute prefix sum |
| 90 | + for (int32_t f = 0; f < num_chunk_frames; f += blockDim.x) { |
| 91 | + int frame = f + tidx; |
| 92 | + |
| 93 | + float val = 0.0f; |
| 94 | + if (frame < num_chunk_frames) { |
| 95 | + // uncoalesced |
| 96 | + val = idata[frame * ldi + feat]; |
| 97 | + } |
| 98 | + |
| 99 | + float2 sum = {val, val * val}; |
| 100 | + float2 psum; // row prefix sum |
| 101 | + float2 total; // total count |
| 102 | + |
| 103 | + BlockScan(temp_storage).InclusiveSum(sum, psum, total); |
| 104 | + |
| 105 | + // offset by running sum |
| 106 | + psum = psum + running_sum; |
| 107 | + |
| 108 | + // increase running sum by new total |
| 109 | + running_sum = running_sum + total; |
| 110 | + |
| 111 | + // The last thread of each fragement will write their value to stats |
| 112 | + bool write = (frame < num_chunk_frames && frame % stats_coarsening_factor == |
| 113 | + stats_coarsening_factor - 1); |
| 114 | + |
| 115 | + // last frame will always write |
| 116 | + // this fagment may not have full stats |
| 117 | + // use our frame to fill in those stats |
| 118 | + if (f == num_chunk_frames - 1) { |
| 119 | + // This thread will write |
| 120 | + write = true; |
| 121 | + |
| 122 | + // number of frames in my fragement with stats |
| 123 | + int32_t in_frame = f % stats_coarsening_factor + 1; |
| 124 | + // number of frames int my fragement without stats |
| 125 | + int32_t not_in_frame = stats_coarsening_factor - in_frame; |
| 126 | + |
| 127 | + // multiply this frame stats by the number of frames not counted |
| 128 | + float2 add = make_float2(sum.x * not_in_frame, sum.y * not_in_frame); |
| 129 | + |
| 130 | + // if the fragment is full add will be (0,0) |
| 131 | + // Add in stats |
| 132 | + psum += add; |
| 133 | + } |
| 134 | + |
| 135 | + if (write) { |
| 136 | + // un-coalesced |
| 137 | + sdata[SIDX(start_frame + frame, feat)] = psum; |
| 138 | + } |
| 139 | + } |
| 140 | +} |
| 141 | + |
| 142 | +// For each channel in batch size, compute coarsened stats in rolling |
| 143 | +// window |
| 144 | +void compute_cmvn_stats(int32_t feat_dim, int32_t chunk_size, |
| 145 | + int32_t stats_coarsening_factor, int32_t cmn_window, |
| 146 | + const float *in_data, int32_t ldi, int32_t stridei, |
| 147 | + float *stats_data, int32_t lds, const LaneDesc *lanes, |
| 148 | + int32_t num_lanes) { |
| 149 | + int threads = 1024; |
| 150 | + dim3 blocks(feat_dim, num_lanes); |
| 151 | + |
| 152 | + compute_cmvn_stats_kernel<<<blocks, threads>>>( |
| 153 | + feat_dim, chunk_size, stats_coarsening_factor, cmn_window, in_data, ldi, |
| 154 | + stridei, stats_data, lds, lanes, num_lanes); |
| 155 | +}; |
| 156 | + |
| 157 | +// threadIdx.x = feature (32?) |
| 158 | +// threadIdx.y, blockIdx.x = frame |
| 159 | +// blockIdx.y = batch id |
| 160 | +__global__ void apply_cmvn_kernel( |
| 161 | + int32_t cmvn_window, bool var_norm, bool mean_norm, int32_t feat_dim, |
| 162 | + int32_t chunk_size, int32_t stats_coarsening_factor, |
| 163 | + const float *__restrict__ in_data, int32_t ldi, int32_t stridei, |
| 164 | + const float *__restrict__ stats_data, int32_t lds, |
| 165 | + const float *__restrict__ global_stats, int32_t ldg, int32_t global_frames, |
| 166 | + const float *__restrict__ speaker_stats, int32_t ldss, |
| 167 | + int32_t speaker_frames, float *out_data, int32_t ldo, int32_t strideo, |
| 168 | + const LaneDesc *lanes, int32_t num_lanes) { |
| 169 | + int32_t lane = blockIdx.y; |
| 170 | + LaneDesc desc = lanes[lane]; |
| 171 | + ChannelId channel = desc.channel; |
| 172 | + |
| 173 | + // compute memory offsets for batch |
| 174 | + const float2 *sdata = |
| 175 | + reinterpret_cast<const float2 *>(stats_data + channel * lds); |
| 176 | + // batch is rows, cols is chunk_size x feat_dim, where feat_dim is |
| 177 | + // padded to ldi |
| 178 | + const float *idata = in_data + lane * stridei; |
| 179 | + float *odata = out_data + lane * strideo; |
| 180 | + |
| 181 | + // width of a window of stats data |
| 182 | + int32_t num_fragments = (chunk_size + cmvn_window) / stats_coarsening_factor; |
| 183 | + |
| 184 | + // function to compute window location based on frame |
| 185 | + auto SIDX = [&](int frame, int feat) { |
| 186 | + int row = feat; |
| 187 | + int col = (frame / stats_coarsening_factor) % num_fragments; |
| 188 | + return row * num_fragments + col; |
| 189 | + }; |
| 190 | + |
| 191 | + int32_t current_frame = desc.current_frame; |
| 192 | + int32_t num_chunk_frames = desc.num_chunk_frames; |
| 193 | + for (int f = blockIdx.x * blockDim.y + threadIdx.y; f < num_chunk_frames; |
| 194 | + f += blockDim.y * gridDim.x) { |
| 195 | + int frame = current_frame + f; |
| 196 | + |
| 197 | + for (int feat = threadIdx.x; feat < feat_dim; feat += blockDim.x) { |
| 198 | + // Compute stats for frame |
| 199 | + float2 frame_stats = sdata[SIDX(frame, feat)]; |
| 200 | + // load value |
| 201 | + float val = idata[f * ldi + feat]; |
| 202 | + |
| 203 | + // compute window length |
| 204 | + float window_length = min(frame + 1, cmvn_window); |
| 205 | + |
| 206 | + // possibly remove stats -cmvn window away |
| 207 | + if (frame >= cmvn_window) { |
| 208 | + float2 old_frame_stats = sdata[SIDX(frame - cmvn_window, feat)]; |
| 209 | + frame_stats = frame_stats - old_frame_stats; |
| 210 | + } |
| 211 | + |
| 212 | + // Smooth stats by speaker frames if necessary |
| 213 | + float smooth_frames = cmvn_window - window_length; |
| 214 | + if (smooth_frames > 0 && speaker_frames > 0) { |
| 215 | + float count_from_speaker = min(smooth_frames, (float)speaker_frames); |
| 216 | + float speaker_count = speaker_stats[feat_dim]; |
| 217 | + |
| 218 | + if (count_from_speaker > 0.0) { |
| 219 | + float alpha = count_from_speaker / speaker_count; |
| 220 | + |
| 221 | + frame_stats.x += alpha * speaker_stats[feat]; // update mean |
| 222 | + frame_stats.y += |
| 223 | + alpha * speaker_stats[ldss + feat]; // update variance |
| 224 | + window_length += alpha * speaker_count; // update window length |
| 225 | + |
| 226 | + // recompute smooth frames now that we have speaker stats |
| 227 | + smooth_frames = cmvn_window - window_length; |
| 228 | + } |
| 229 | + } // end speaker smooth |
| 230 | + |
| 231 | + // Smooth stats by global frames if necessary |
| 232 | + if (smooth_frames > 0 && global_frames > 0) { |
| 233 | + float count_from_global = min(smooth_frames, (float)global_frames); |
| 234 | + float global_count = global_stats[feat_dim]; |
| 235 | + |
| 236 | + if (count_from_global > 0.0) { |
| 237 | + float alpha = count_from_global / global_count; |
| 238 | + |
| 239 | + frame_stats.x += alpha * global_stats[feat]; // update mean |
| 240 | + frame_stats.y += alpha * global_stats[ldg + feat]; // update variance |
| 241 | + window_length += alpha * global_count; // update window length |
| 242 | + } |
| 243 | + } // end global smooth |
| 244 | + |
| 245 | + float mean = frame_stats.x / window_length; |
| 246 | + float var = frame_stats.y / window_length - mean * mean; |
| 247 | + |
| 248 | + float floor = 1e-20; |
| 249 | + if (var < floor) { |
| 250 | + // avoid dividing by zero |
| 251 | + var = floor; |
| 252 | + } |
| 253 | + if (!var_norm) { |
| 254 | + // skip variance normalization |
| 255 | + var = 1.0f; |
| 256 | + } |
| 257 | + if (!mean_norm) { |
| 258 | + // skip mean normalization |
| 259 | + mean = 0.0f; |
| 260 | + } |
| 261 | + |
| 262 | + // shift by mean and scale by variance |
| 263 | + float oval = (val - mean) / sqrtf(var); |
| 264 | + |
| 265 | + odata[f * ldo + feat] = oval; |
| 266 | + } // end feat loop |
| 267 | + } // end frame loop |
| 268 | +} |
| 269 | + |
| 270 | +void apply_cmvn(int32_t cmvn_window, bool var_norm, bool mean_norm, |
| 271 | + int32_t feat_dim, int32_t chunk_size, |
| 272 | + int32_t stats_coarsening_factor, const float *in_data, |
| 273 | + int32_t ldi, int32_t stridei, const float *stats_data, |
| 274 | + int32_t lds, const float *global_stats, int32_t ldg, |
| 275 | + int32_t global_frames, const float *speaker_stats, int32_t ldss, |
| 276 | + int32_t speaker_frames, float *out_data, int32_t ldo, |
| 277 | + int32_t strideo, const LaneDesc *lanes, int32_t num_lanes) { |
| 278 | + // round threads to neared warp |
| 279 | + int threadsx = 64; |
| 280 | + int threadsy = 512 / threadsx; |
| 281 | + dim3 threads(threadsx, threadsy); |
| 282 | + |
| 283 | + int blocksx = (chunk_size + threadsy - 1) / threadsy; |
| 284 | + int blocksy = num_lanes; |
| 285 | + dim3 blocks(blocksx, blocksy); |
| 286 | + |
| 287 | + apply_cmvn_kernel<<<blocks, threads>>>( |
| 288 | + cmvn_window, var_norm, mean_norm, feat_dim, chunk_size, |
| 289 | + stats_coarsening_factor, in_data, ldi, stridei, stats_data, lds, |
| 290 | + global_stats, ldg, global_frames, speaker_stats, ldss, speaker_frames, |
| 291 | + out_data, ldo, strideo, lanes, num_lanes); |
| 292 | +} |
| 293 | + |
| 294 | +} // namespace kaldi |
0 commit comments