
Commit 216c518

luitjens authored and danpovey committed
[src] Add online-batched-cuda-cmvn. (#3875)
This is the first of a couple of patches to enable online batched CUDA feature extraction. Processing occurs across a batch of multiple audio files in a chunked fashion. This allows us to begin processing before all audio is present, lowering latency, and to get better device utilization by batching those chunks together. The interface for this class is fairly low-level, as it is meant to be driven by the online-batched-cuda-feature pipeline, which will be added later. A binary demonstrating the usage of this class is included in cudafeatbin/apply-batched-cmvn-online-cuda.cc.

Correctness has been tested as follows:

%> ./compute-mfcc-feats-cuda --config=mfcc.conf scp:wav.scp ark,scp:mfcc.ark,mfcc.scp
%> ./apply-batched-cmvn-online-cuda --batch-size=20 global_cmvn.stats "scp:mfcc.scp" "ark,scp:cmvn-cuda-batched.ark,cmvn-cuda-batched.scp"
%> ./apply-cmvn-online-cuda global_cmvn.stats "scp:mfcc.scp" "ark,scp:cmvn-cuda.ark,cmvn-cuda.scp"
%> ../featbin/compare-feats scp:cmvn-cuda.scp scp:cmvn-cuda-batched.scp

Output from the last command on a dataset is:

LOG (compare-feats[5.5.1622~4-126aa]:main():compare-feats.cc:111) Similarity metric for each dimension [ 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ] (1.0 means identical, the smaller the more different)
LOG (compare-feats[5.5.1622~4-126aa]:main():compare-feats.cc:116) Overall similarity for the two feats is:1 (1.0 means identical, the smaller the more different)
LOG (compare-feats[5.5.1622~4-126aa]:main():compare-feats.cc:119) Processed 80 feature files, 0 had errors.
LOG (compare-feats[5.5.1622~4-126aa]:main():compare-feats.cc:126) Features are considered similar since 1 >= 0.99
1 parent fdb2d57 commit 216c518
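
For context, online CMVN normalizes each frame using mean (and optionally variance) statistics from a sliding window of up to cmn-window preceding frames, smoothing short windows with speaker/global stats. A minimal scalar sketch of that reference computation for one feature dimension (illustrative only; it is not the commit's code, which is batched and runs on the GPU, and omits the smoothing step):

// Illustrative scalar reference for one feature dimension; the commit's
// kernels implement the same sliding-window normalization, batched on GPU.
#include <algorithm>
#include <cmath>
#include <vector>

void OnlineCmvnReference(const std::vector<float> &in, int cmn_window,
                         bool var_norm, std::vector<float> *out) {
  out->resize(in.size());
  double sum = 0.0, sumsq = 0.0;
  for (size_t t = 0; t < in.size(); t++) {
    sum += in[t];
    sumsq += in[t] * in[t];
    if (t >= (size_t)cmn_window) {
      // drop the frame that just slid out of the window
      sum -= in[t - cmn_window];
      sumsq -= in[t - cmn_window] * in[t - cmn_window];
    }
    double n = std::min<double>(t + 1, cmn_window);
    double mean = sum / n;
    double var = std::max(sumsq / n - mean * mean, 1e-20);
    (*out)[t] = (in[t] - mean) / (var_norm ? std::sqrt(var) : 1.0);
  }
}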

8 files changed: +842 -2 lines

src/cudafeat/Makefile

Lines changed: 2 additions & 1 deletion
@@ -10,7 +10,8 @@ TESTFILES =
 ifeq ($(CUDA), true)
 OBJFILES += feature-window-cuda.o feature-spectral-cuda.o feature-online-cmvn-cuda.o \
             online-ivector-feature-cuda-kernels.o online-ivector-feature-cuda.o \
-            online-cuda-feature-pipeline.o
+            online-cuda-feature-pipeline.o feature-online-batched-cmvn-cuda.o \
+            feature-online-batched-cmvn-cuda-kernels.o
 endif

 LIBNAME = kaldi-cudafeat
src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu

Lines changed: 294 additions & 0 deletions
@@ -0,0 +1,294 @@
// cudafeat/feature-online-batched-cmvn-cuda-kernels.cu
//
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
// Justin Luitjens
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include <cub/cub.cuh>
#include "cudafeat/feature-online-batched-cmvn-cuda-kernels.h"

__device__ inline float2 operator-(const float2 &a, const float2 &b) {
  float2 retval;
  retval.x = a.x - b.x;
  retval.y = a.y - b.y;
  return retval;
}
__device__ inline float2 operator+(const float2 &a, const float2 &b) {
  float2 retval;
  retval.x = a.x + b.x;
  retval.y = a.y + b.y;
  return retval;
}

__device__ inline void atomicAdd(float2 *addr, float2 val) {
  atomicAdd(reinterpret_cast<float *>(addr), val.x);
  atomicAdd(reinterpret_cast<float *>(addr) + 1, val.y);
}

__device__ inline void operator+=(float2 &a, float2 &b) {
  // overloading +=
  a.x += b.x;
  a.y += b.y;
}

namespace kaldi {
// threadIdx.x = frame (up to 1024?)
// blockIdx.x = feature
// blockIdx.y = batch id
__global__ void compute_cmvn_stats_kernel(
    int32_t feat_dim, int32_t chunk_size, int32_t stats_coarsening_factor,
    int32_t cmn_window, const float *in_data, int32_t ldi, int32_t stridei,
    float *stats_data, int32_t lds, const LaneDesc *lanes, int32_t num_lanes) {
  typedef cub::BlockScan<float2, 1024> BlockScan;
  __shared__ typename BlockScan::TempStorage temp_storage;

  int32_t lane = blockIdx.y;
  int32_t feat = blockIdx.x;  // feature for this block
  int32_t tidx = threadIdx.x;

  // width of a window of stats data
  int32_t num_fragments = (chunk_size + cmn_window) / stats_coarsening_factor;

  // function to compute window location based on frame
  auto SIDX = [&](int frame, int feat) {
    int row = feat;
    int col = (frame / stats_coarsening_factor) % num_fragments;
    return row * num_fragments + col;
  };

  LaneDesc desc = lanes[lane];
  ChannelId channel = desc.channel;
  int32_t num_chunk_frames = desc.num_chunk_frames;

  // compute memory offsets for batch
  float2 *sdata = reinterpret_cast<float2 *>(stats_data + channel * lds);

  // batch is rows, cols is chunk_size x feat_dim, where feat_dim is
  // padded to ldi
  const float *idata = in_data + lane * stridei;

  // starting frame of audio
  int32_t start_frame = desc.current_frame;

  float2 running_sum = {0.0f, 0.0f};

  // load previous running sum if this is not the first frame
  if (start_frame > 0) running_sum = sdata[SIDX(start_frame - 1, feat)];

  // for each frame compute prefix sum
  for (int32_t f = 0; f < num_chunk_frames; f += blockDim.x) {
    int frame = f + tidx;

    float val = 0.0f;
    if (frame < num_chunk_frames) {
      // uncoalesced
      val = idata[frame * ldi + feat];
    }

    float2 sum = {val, val * val};
    float2 psum;   // row prefix sum
    float2 total;  // total count

    BlockScan(temp_storage).InclusiveSum(sum, psum, total);

    // offset by running sum
    psum = psum + running_sum;

    // increase running sum by new total
    running_sum = running_sum + total;

    // The last thread of each fragment will write its value to stats
    bool write = (frame < num_chunk_frames &&
                  frame % stats_coarsening_factor ==
                      stats_coarsening_factor - 1);

    // the last frame always writes; its fragment may not have full stats,
    // so use this frame's stats to fill in the missing entries
    if (frame == num_chunk_frames - 1) {
      // This thread will write
      write = true;

      // number of frames in my fragment with stats
      int32_t in_frame = frame % stats_coarsening_factor + 1;
      // number of frames in my fragment without stats
      int32_t not_in_frame = stats_coarsening_factor - in_frame;

      // multiply this frame's stats by the number of frames not counted
      float2 add = make_float2(sum.x * not_in_frame, sum.y * not_in_frame);

      // if the fragment is full, add will be (0,0)
      // Add in stats
      psum += add;
    }

    if (write) {
      // un-coalesced
      sdata[SIDX(start_frame + frame, feat)] = psum;
    }
  }
}

// For each channel in the batch, compute coarsened stats in a rolling
// window
void compute_cmvn_stats(int32_t feat_dim, int32_t chunk_size,
                        int32_t stats_coarsening_factor, int32_t cmn_window,
                        const float *in_data, int32_t ldi, int32_t stridei,
                        float *stats_data, int32_t lds, const LaneDesc *lanes,
                        int32_t num_lanes) {
  int threads = 1024;
  dim3 blocks(feat_dim, num_lanes);

  compute_cmvn_stats_kernel<<<blocks, threads>>>(
      feat_dim, chunk_size, stats_coarsening_factor, cmn_window, in_data, ldi,
      stridei, stats_data, lds, lanes, num_lanes);
}

// threadIdx.x = feature (32?)
// threadIdx.y, blockIdx.x = frame
// blockIdx.y = batch id
__global__ void apply_cmvn_kernel(
    int32_t cmvn_window, bool var_norm, bool mean_norm, int32_t feat_dim,
    int32_t chunk_size, int32_t stats_coarsening_factor,
    const float *__restrict__ in_data, int32_t ldi, int32_t stridei,
    const float *__restrict__ stats_data, int32_t lds,
    const float *__restrict__ global_stats, int32_t ldg, int32_t global_frames,
    const float *__restrict__ speaker_stats, int32_t ldss,
    int32_t speaker_frames, float *out_data, int32_t ldo, int32_t strideo,
    const LaneDesc *lanes, int32_t num_lanes) {
  int32_t lane = blockIdx.y;
  LaneDesc desc = lanes[lane];
  ChannelId channel = desc.channel;

  // compute memory offsets for batch
  const float2 *sdata =
      reinterpret_cast<const float2 *>(stats_data + channel * lds);
  // batch is rows, cols is chunk_size x feat_dim, where feat_dim is
  // padded to ldi
  const float *idata = in_data + lane * stridei;
  float *odata = out_data + lane * strideo;

  // width of a window of stats data
  int32_t num_fragments = (chunk_size + cmvn_window) / stats_coarsening_factor;

  // function to compute window location based on frame
  auto SIDX = [&](int frame, int feat) {
    int row = feat;
    int col = (frame / stats_coarsening_factor) % num_fragments;
    return row * num_fragments + col;
  };

  int32_t current_frame = desc.current_frame;
  int32_t num_chunk_frames = desc.num_chunk_frames;
  for (int f = blockIdx.x * blockDim.y + threadIdx.y; f < num_chunk_frames;
       f += blockDim.y * gridDim.x) {
    int frame = current_frame + f;

    for (int feat = threadIdx.x; feat < feat_dim; feat += blockDim.x) {
      // Compute stats for frame
      float2 frame_stats = sdata[SIDX(frame, feat)];
      // load value
      float val = idata[f * ldi + feat];

      // compute window length
      float window_length = min(frame + 1, cmvn_window);

      // remove stats from frames that are more than cmvn_window away
      if (frame >= cmvn_window) {
        float2 old_frame_stats = sdata[SIDX(frame - cmvn_window, feat)];
        frame_stats = frame_stats - old_frame_stats;
      }

      // Smooth stats by speaker frames if necessary
      float smooth_frames = cmvn_window - window_length;
      if (smooth_frames > 0 && speaker_frames > 0) {
        float count_from_speaker = min(smooth_frames, (float)speaker_frames);
        float speaker_count = speaker_stats[feat_dim];

        if (count_from_speaker > 0.0) {
          float alpha = count_from_speaker / speaker_count;

          frame_stats.x += alpha * speaker_stats[feat];  // update mean
          frame_stats.y +=
              alpha * speaker_stats[ldss + feat];  // update variance
          window_length += alpha * speaker_count;  // update window length

          // recompute smooth frames now that we have speaker stats
          smooth_frames = cmvn_window - window_length;
        }
      }  // end speaker smooth

      // Smooth stats by global frames if necessary
      if (smooth_frames > 0 && global_frames > 0) {
        float count_from_global = min(smooth_frames, (float)global_frames);
        float global_count = global_stats[feat_dim];

        if (count_from_global > 0.0) {
          float alpha = count_from_global / global_count;

          frame_stats.x += alpha * global_stats[feat];        // update mean
          frame_stats.y += alpha * global_stats[ldg + feat];  // update variance
          window_length += alpha * global_count;  // update window length
        }
      }  // end global smooth

      float mean = frame_stats.x / window_length;
      float var = frame_stats.y / window_length - mean * mean;

      float floor = 1e-20;
      if (var < floor) {
        // avoid dividing by zero
        var = floor;
      }
      if (!var_norm) {
        // skip variance normalization
        var = 1.0f;
      }
      if (!mean_norm) {
        // skip mean normalization
        mean = 0.0f;
      }

      // shift by mean and scale by standard deviation
      float oval = (val - mean) / sqrtf(var);

      odata[f * ldo + feat] = oval;
    }  // end feat loop
  }    // end frame loop
}

void apply_cmvn(int32_t cmvn_window, bool var_norm, bool mean_norm,
                int32_t feat_dim, int32_t chunk_size,
                int32_t stats_coarsening_factor, const float *in_data,
                int32_t ldi, int32_t stridei, const float *stats_data,
                int32_t lds, const float *global_stats, int32_t ldg,
                int32_t global_frames, const float *speaker_stats, int32_t ldss,
                int32_t speaker_frames, float *out_data, int32_t ldo,
                int32_t strideo, const LaneDesc *lanes, int32_t num_lanes) {
  // round threads to the nearest warp
  int threadsx = 64;
  int threadsy = 512 / threadsx;
  dim3 threads(threadsx, threadsy);

  int blocksx = (chunk_size + threadsy - 1) / threadsy;
  int blocksy = num_lanes;
  dim3 blocks(blocksx, blocksy);

  apply_cmvn_kernel<<<blocks, threads>>>(
      cmvn_window, var_norm, mean_norm, feat_dim, chunk_size,
      stats_coarsening_factor, in_data, ldi, stridei, stats_data, lds,
      global_stats, ldg, global_frames, speaker_stats, ldss, speaker_frames,
      out_data, ldo, strideo, lanes, num_lanes);
}

}  // namespace kaldi
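
Both kernels above index the same structure: per channel, an inclusive prefix sum of (x, x*x) per feature dimension, written once every stats_coarsening_factor frames into a ring buffer wide enough to cover chunk_size + cmn_window frames, so a windowed sum is just the difference of two entries. A host-side sketch of that lookup (hypothetical helper, for illustration only; coarsening rounds the effective window to a multiple of the coarsening factor):

// Hypothetical host-side mirror of the kernels' SIDX lookup (illustration
// only). sdata holds inclusive prefix sums of (sum, sum-of-squares).
struct Float2 { float x, y; };

Float2 WindowStats(const Float2 *sdata, int frame, int feat, int chunk_size,
                   int cmn_window, int coarsening) {
  int num_fragments = (chunk_size + cmn_window) / coarsening;
  auto sidx = [&](int f) {
    return feat * num_fragments + (f / coarsening) % num_fragments;
  };
  Float2 s = sdata[sidx(frame)];
  if (frame >= cmn_window) {
    // subtract the prefix sum from cmn_window frames ago so only the
    // window (frame - cmn_window, frame] remains
    Float2 old = sdata[sidx(frame - cmn_window)];
    s.x -= old.x;
    s.y -= old.y;
  }
  return s;  // s.x: windowed sum, s.y: windowed sum of squares
}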
src/cudafeat/feature-online-batched-cmvn-cuda-kernels.h

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
// cudafeat/feature-online-batched-cmvn-cuda-kernels.h
//
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
// Justin Luitjens
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef KALDI_CUDAFEAT_FEATURE_ONLINE_BATCHED_CMVN_CUDA_KERNELS_H_
#define KALDI_CUDAFEAT_FEATURE_ONLINE_BATCHED_CMVN_CUDA_KERNELS_H_

#include <cstdint>

#include "cudafeat/lane-desc.h"

namespace kaldi {

// ld{i,s,g,ss,o} = size of inner dimension of matrix allocation
// stride{i,o} = stride between consecutive batch matrices

void compute_cmvn_stats(int32_t feat_dim, int32_t chunk_size,
                        int32_t stats_coarsening_factor, int32_t cmn_window,
                        const float *in_data, int32_t ldi, int32_t stridei,
                        float *stats_data, int32_t lds, const LaneDesc *lanes,
                        int32_t num_lanes);

void apply_cmvn(int32_t cmvn_window, bool var_norm, bool mean_norm,
                int32_t feat_dim, int32_t chunk_size,
                int32_t stats_coarsening_factor, const float *in_data,
                int32_t ldi, int32_t stridei, const float *stats_data,
                int32_t lds, const float *global_stats, int32_t ldg,
                int32_t global_frames, const float *speaker_stats, int32_t ldss,
                int32_t speaker_frames, float *out_data, int32_t ldo,
                int32_t strideo, const LaneDesc *lanes, int32_t num_lanes);
}  // namespace kaldi

#endif  // KALDI_CUDAFEAT_FEATURE_ONLINE_BATCHED_CMVN_CUDA_KERNELS_H_
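
A hedged sketch of how the two entry points compose per chunk (the ProcessChunk wrapper and buffer names are hypothetical; the real driver is the online-batched-cuda-feature pipeline the commit message promises, and cudafeatbin/apply-batched-cmvn-online-cuda.cc shows actual usage):

// Hypothetical per-chunk driver (illustration only). All pointers are
// device memory; leading dimensions/strides follow the header's comments.
#include "cudafeat/feature-online-batched-cmvn-cuda-kernels.h"

using namespace kaldi;

void ProcessChunk(int32_t feat_dim, int32_t chunk_size, int32_t coarsening,
                  int32_t cmn_window, const float *d_in, int32_t ldi,
                  int32_t stridei, float *d_stats, int32_t lds,
                  const float *d_global, int32_t ldg, int32_t global_frames,
                  float *d_out, const LaneDesc *d_lanes, int32_t num_lanes) {
  // 1) fold this chunk's frames into each channel's prefix-sum ring buffer
  compute_cmvn_stats(feat_dim, chunk_size, coarsening, cmn_window, d_in, ldi,
                     stridei, d_stats, lds, d_lanes, num_lanes);
  // 2) normalize the chunk; short windows are smoothed with global stats
  //    (no speaker stats in this sketch, so pass nullptr / 0)
  apply_cmvn(cmn_window, /*var_norm=*/false, /*mean_norm=*/true, feat_dim,
             chunk_size, coarsening, d_in, ldi, stridei, d_stats, lds,
             d_global, ldg, global_frames, /*speaker_stats=*/nullptr,
             /*ldss=*/0, /*speaker_frames=*/0, d_out, /*ldo=*/ldi,
             /*strideo=*/stridei, d_lanes, num_lanes);
}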
