
Commit 78faf01

Unroll all loops for even faster FFT
Parent: 690c070

8 files changed: +80 −41 lines

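The change is mechanical: every loop whose trip count is a compile-time constant gets an `[unroll]` attribute so the compiler emits straight-line code instead of a runtime loop. A minimal standalone sketch of the attribute (plain HLSL types and hypothetical names, not code from this commit):

```hlsl
// Illustrative sketch: [unroll] asks the compiler to replicate the loop body
// instead of emitting a runtime branch; it requires a compile-time trip count,
// which all the bounds touched by this commit (ElementsPerInvocation, Channels) are.
static const uint ElementsPerThread = 4;

void scaleAll(inout float values[ElementsPerThread], float factor)
{
    [unroll]
    for (uint i = 0; i < ElementsPerThread; i++)
        values[i] *= factor; // becomes four independent multiplies, no loop overhead
}
```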

28_FFTBloom/app_resources/common.hlsl (5 additions, 4 deletions)

@@ -5,9 +5,7 @@ NBL_CONSTEXPR_STATIC_INLINE uint32_t Channels = 3;
 
 using namespace nbl::hlsl;
 
-// Necessary bits for each int type (except BDAs) are being generous and considering a resolution of up to 4k
-// Also, an extra bit is given since it fits: they're uints being passed as ints so we don't litter the code with int32 casts
-// The extra bit is to make sure MSB is a 0 and it doesn't sign extend and give negative numbers when using the maximum amount of considered bits
+// All packed bitfields of int32_t are uints being passed as ints so we don't litter the code with int32 casts
 struct PushConstantData
 {
 	// After running FFT along a column, we want to store the result in column major order for coalesced writes, and similarly after running an FFT in row major order
@@ -21,7 +19,10 @@ struct PushConstantData
 	// The following three fields being push constants allow dynamic resizing of the image without recompiling shaders (limited by the FFT length)
 	int32_t imageRowLength : 16;
 	int32_t imageHalfRowLength : 16;
-	// Actually only needs at worst 10 bits, but we don't pack it into a bitfield so we can use offsetof and update only this field from CPP side
+	// Only middle pass uses these
+	uint32_t currentChannel;
+	uint64_t channelStartOffsetBytes;
+	// We don't pack it into a bitfield so we can use offsetof and update only this field from CPP side
 	// Alternatively, we could do the packing/unpacking manually to save 32 bits
 	int32_t padding;
 	// Used by IFFT to tell if an index belongs to an image or is in the padding
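The surviving comment is terse, so here is a hedged illustration of the hazard the deleted comment spelled out: a signed bitfield sign-extends on read, so the packed fields must keep their top bit clear. The struct below is illustrative only (it assumes the same `int32_t` alias the files above use), not the real `PushConstantData`:

```hlsl
// Illustrative sketch: why the packed int32_t bitfields must hold values with MSB = 0.
struct PackedSketch
{
    int32_t rowLength : 16; // reads back sign-extended: storing 0x8000 would yield -32768
};

int32_t safeRead(PackedSketch p)
{
    // Image dimensions up to 4k fit comfortably in 15 bits, so the field always
    // reads back non-negative and no int32_t casts are needed at use sites.
    return p.rowLength;
}
```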

28_FFTBloom/app_resources/fft_convolve_ifft.hlsl (27 additions, 35 deletions)

@@ -6,6 +6,8 @@
 
 // ------------------------------------------ SECOND AXIS FFT + CONVOLUTION + IFFT -------------------------------------------------------------
 
+// This is done for the channel specified in pushConstants.
+
 // This time each Workgroup will compute the FFT along a horizontal line (fixed y for the whole Workgroup). We get the y coordinate for the
 // row a workgroup is working on via `gl_WorkGroupID().x`. We have to keep this in mind: What's stored as the first row is actually `Z + iN`,
 // where `Z` is the actual 0th row and `N` is the Nyquist row (the one with index TotalSize / 2). Those are packed together
@@ -34,14 +36,6 @@ struct PreloadedSecondAxisAccessor : PreloadedAccessorMirrorTradeBase
 	{
 		return y * pushConstants.imageRowLength + x; // can no longer sum with | since there's no guarantees on row length
 	}
-
-	// Same as what was used to store in col-major after first axis FFT. This time we launch one workgroup per row so the height of the channel's (half) image is NumWorkgroups,
-	// and the width (number of columns) is passed as a push constant
-	uint64_t getChannelStartOffsetBytes(uint16_t channel)
-	{
-		return uint64_t(channel) * NumWorkgroups * pushConstants.imageRowLength * sizeof(complex_t<scalar_t>);
-	}
-
 	// ---------------------------------------------------- End Utils ---------------------------------------------------------
 
 	// Unpacking on load: Has no workgroup shuffles (which become execution barriers) which would be necessary for unpacking on store
@@ -56,11 +50,10 @@ struct PreloadedSecondAxisAccessor : PreloadedAccessorMirrorTradeBase
 	// corresponding to the column they correspond to.
 	// The `gl_WorkGroupID().x = 0` case is special because instead of getting the mirror we need to get both zero and nyquist frequencies for the columns, which doesn't happen just by mirror
 	// indexing.
-	void preload(uint16_t channel)
+	void preload()
 	{
 		// Set up accessor to read in data
-		const uint64_t channelStartOffsetBytes = getChannelStartOffsetBytes(channel);
-		const LegacyBdaAccessor<complex_t<scalar_t> > colMajorAccessor = LegacyBdaAccessor<complex_t<scalar_t> >::create(pushConstants.colMajorBufferAddress + channelStartOffsetBytes);
+		const LegacyBdaAccessor<complex_t<scalar_t> > colMajorAccessor = LegacyBdaAccessor<complex_t<scalar_t> >::create(pushConstants.colMajorBufferAddress + pushConstants.channelStartOffsetBytes);
 
 		// This one shows up a lot so we give it a name
 		const bool oddThread = glsl::gl_SubgroupInvocationID() & 1u;
@@ -77,6 +70,7 @@ struct PreloadedSecondAxisAccessor : PreloadedAccessorMirrorTradeBase
 		// the previous pass' threads), then `PreviousWorkgroupSize` odd elements (`preloaded[1]`) and so on
 		const uint32_t evenRow = glsl::gl_WorkGroupID().x + ((glsl::gl_WorkGroupID().x / PreviousWorkgroupSize) * PreviousWorkgroupSize);
 		const uint32_t y = oddThread ? PreviousPassFFTIndexingUtils::getNablaMirrorIndex(evenRow) : evenRow;
+		[unroll]
 		for (uint32_t localElementIndex = 0; localElementIndex < ElementsPerInvocation; localElementIndex++)
 		{
 			// If mirrored, we need to invert which thread is loading lo and which is loading hi
@@ -115,6 +109,7 @@ struct PreloadedSecondAxisAccessor : PreloadedAccessorMirrorTradeBase
 		// Even thread retrieves Zero, odd thread retrieves Nyquist. Zero is always `preloaded[0]` of the previous FFT's 0th thread, while Nyquist is always `preloaded[1]` of that same thread.
 		// Therefore we know Nyquist ends up exactly at y = PreviousWorkgroupSize
 		const uint32_t y = oddThread ? PreviousWorkgroupSize : 0;
+		[unroll]
 		for (uint32_t localElementIndex = 0; localElementIndex < ElementsPerInvocation; localElementIndex++)
 		{
 			int32_t wrappedIndex = paddedIndex < 0 ? ~paddedIndex : paddedIndex; // ~x = - x - 1 in two's complement (except maybe at the borders of representable range)
@@ -143,18 +138,19 @@ struct PreloadedSecondAxisAccessor : PreloadedAccessorMirrorTradeBase
 	// Each element on this row is Nabla-ordered. So the element at `x' = index, y' = gl_WorkGroupID().x` that we're operating on is actually the element at
 	// `x = F(index), y = bitreverse(gl_WorkGroupID().x)` (with the bitreversal done as an N-1 bit number, for `N = log2(TotalSize)` *of the first axis FFT*)
 	template<typename sharedmem_adaptor_t>
-	void convolve(uint32_t channel, NBL_REF_ARG(sharedmem_adaptor_t) adaptorForSharedMemory)
+	void convolve(NBL_REF_ARG(sharedmem_adaptor_t) adaptorForSharedMemory)
 	{
 		if (glsl::gl_WorkGroupID().x)
 		{
 			const uint32_t y = bitReverseAs<uint32_t, NumWorkgroupsLog2>(glsl::gl_WorkGroupID().x);
 			uint32_t globalElementIndex = workgroup::SubgroupContiguousIndex();
+			[unroll]
 			for (uint32_t localElementIndex = 0; localElementIndex < ElementsPerInvocation; localElementIndex++)
 			{
 				const uint32_t indexDFT = FFTIndexingUtils::getDFTIndex(globalElementIndex);
 				const uint32_t2 texCoords = uint32_t2(indexDFT, y);
 				const float32_t2 uv = texCoords * float32_t2(TotalSizeReciprocal, 1.f / NumWorkgroups) + KernelHalfPixelSize;
-				const vector<scalar_t, 2> sampledKernelVector = vector<scalar_t, 2>(kernelChannels.SampleLevel(samplerState, float32_t3(uv, float32_t(channel)), 0));
+				const vector<scalar_t, 2> sampledKernelVector = vector<scalar_t, 2>(kernelChannels.SampleLevel(samplerState, float32_t3(uv, float32_t(pushConstants.currentChannel)), 0));
 				const vector<scalar_t, 2> sampledKernelInterpolatedVector = lerp(sampledKernelVector, One, promote<vector<scalar_t, 2>, float32_t>(pushConstants.interpolatingFactor));
 				const complex_t<scalar_t> sampledKernelInterpolated = { sampledKernelInterpolatedVector.x, sampledKernelInterpolatedVector.y };
 				preloaded[localElementIndex] = preloaded[localElementIndex] * sampledKernelInterpolated;
@@ -166,6 +162,7 @@ struct PreloadedSecondAxisAccessor : PreloadedAccessorMirrorTradeBase
 		else
 		{
 			uint32_t globalElementIndex = workgroup::SubgroupContiguousIndex();
+			[unroll]
 			for (uint32_t localElementIndex = 0; localElementIndex < ElementsPerInvocation; localElementIndex += 2)
 			{
 				complex_t<scalar_t> zero = preloaded[localElementIndex];
@@ -179,14 +176,14 @@ struct PreloadedSecondAxisAccessor : PreloadedAccessorMirrorTradeBase
 				const float32_t indexDFT = float32_t(FFTIndexingUtils::getDFTIndex(globalElementIndex));
 
 				float32_t2 uv = float32_t2(indexDFT * TotalSizeReciprocal, float32_t(0)) + KernelHalfPixelSize;
-				const vector<scalar_t, 2> zeroKernelVector = vector<scalar_t, 2>(kernelChannels.SampleLevel(samplerState, float32_t3(uv, float32_t(channel)), 0));
+				const vector<scalar_t, 2> zeroKernelVector = vector<scalar_t, 2>(kernelChannels.SampleLevel(samplerState, float32_t3(uv, float32_t(pushConstants.currentChannel)), 0));
 				const vector<scalar_t, 2> zeroKernelInterpolatedVector = lerp(zeroKernelVector, One, promote<vector<scalar_t, 2>, float32_t>(pushConstants.interpolatingFactor));
 				const complex_t<scalar_t> zeroKernelInterpolated = { zeroKernelInterpolatedVector.x, zeroKernelInterpolatedVector.y };
 				zero = zero * zeroKernelInterpolated;
 
 				// Do the same for the nyquist coord
 				uv.y = 1.f - KernelHalfPixelSize.y;
-				const vector<scalar_t, 2> nyquistKernelVector = vector<scalar_t, 2>(kernelChannels.SampleLevel(samplerState, float32_t3(uv, float32_t(channel)), 0));
+				const vector<scalar_t, 2> nyquistKernelVector = vector<scalar_t, 2>(kernelChannels.SampleLevel(samplerState, float32_t3(uv, float32_t(pushConstants.currentChannel)), 0));
 				const vector<scalar_t, 2> nyquistKernelInterpolatedVector = lerp(nyquistKernelVector, One, promote<vector<scalar_t, 2>, float32_t>(pushConstants.interpolatingFactor));
 				const complex_t<scalar_t> nyquistKernelInterpolated = { nyquistKernelInterpolatedVector.x, nyquistKernelInterpolatedVector.y };
 				nyquist = nyquist * nyquistKernelInterpolated;
@@ -216,14 +213,14 @@ struct PreloadedSecondAxisAccessor : PreloadedAccessorMirrorTradeBase
 	}
 
 	// Save a row back in row major order. Remember that the first row (one with `gl_WorkGroupID().x == 0`) will actually hold the packed IFFT of Zero and Nyquist rows.
-	void unload(uint16_t channel)
+	void unload()
 	{
 		// Set up accessor to write out data
-		const uint64_t channelStartOffsetBytes = getChannelStartOffsetBytes(channel);
-		const LegacyBdaAccessor<complex_t<scalar_t> > rowMajorAccessor = LegacyBdaAccessor<complex_t<scalar_t> >::create(pushConstants.rowMajorBufferAddress + channelStartOffsetBytes);
+		const LegacyBdaAccessor<complex_t<scalar_t> > rowMajorAccessor = LegacyBdaAccessor<complex_t<scalar_t> >::create(pushConstants.rowMajorBufferAddress + pushConstants.channelStartOffsetBytes);
 
 		const uint32_t firstIndex = workgroup::SubgroupContiguousIndex();
 		int32_t paddedIndex = int32_t(firstIndex) - pushConstants.padding;
+		[unroll]
 		for (uint32_t localElementIndex = 0; localElementIndex < ElementsPerInvocation; localElementIndex++)
 		{
 			if (paddedIndex >= 0 && paddedIndex < pushConstants.imageRowLength)
@@ -245,21 +242,16 @@ void main(uint32_t3 ID : SV_DispatchThreadID)
 	sharedmem_adaptor_t adaptorForSharedMemory;
 
 	PreloadedSecondAxisAccessor preloadedAccessor;
-	for (uint16_t channel = 0; channel < Channels; channel++)
-	{
-		preloadedAccessor.preload(channel);
-		// Wait on previous pass FFT
-		if(channel)
-			sharedmemAccessor.workgroupExecutionAndMemoryBarrier();
-		workgroup::FFT<false, FFTParameters>::template __call(preloadedAccessor, sharedmemAccessor);
-		// Update state after FFT run
-		adaptorForSharedMemory.accessor = sharedmemAccessor;
-		preloadedAccessor.convolve(channel, adaptorForSharedMemory);
-		// Remember to update the accessor's state
-		sharedmemAccessor = adaptorForSharedMemory.accessor;
-		// Either wait on FFT (most workgroups but 0) or convolution (only 0th workgroup actually uses sharedmem for convolution)
-		sharedmemAccessor.workgroupExecutionAndMemoryBarrier();
-		workgroup::FFT<true, FFTParameters>::template __call(preloadedAccessor, sharedmemAccessor);
-		preloadedAccessor.unload(channel);
-	}
+
+	preloadedAccessor.preload();
+	workgroup::FFT<false, FFTParameters>::template __call(preloadedAccessor, sharedmemAccessor);
+	// Update state after FFT run
+	adaptorForSharedMemory.accessor = sharedmemAccessor;
+	preloadedAccessor.convolve(adaptorForSharedMemory);
+	// Remember to update the accessor's state
+	sharedmemAccessor = adaptorForSharedMemory.accessor;
+	// Either wait on first FFT (all workgroups but 0) or convolution (only 0th workgroup actually uses sharedmem for convolution)
+	sharedmemAccessor.workgroupExecutionAndMemoryBarrier();
+	workgroup::FFT<true, FFTParameters>::template __call(preloadedAccessor, sharedmemAccessor);
+	preloadedAccessor.unload();
 }
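For reference, the `Z + iN` packing the comments above keep invoking is the standard trick of running two real rows through one complex FFT. Since each real row has a Hermitian spectrum, nothing is lost; the mirror-plus-conjugate indexing (`getDFTMirror`, `conj`) relies on these standard DFT identities (a math sketch, not code from this commit):

$$F = \mathrm{DFT}(z + i\,n) \quad\Longrightarrow\quad Z_k = \tfrac{1}{2}\left(F_k + \overline{F_{N-k}}\right), \qquad N_k = \tfrac{1}{2i}\left(F_k - \overline{F_{N-k}}\right),$$

with indices taken mod $N$. The special-cased workgroup 0 exists precisely because $Z$ (the zero row) and $N$ (the Nyquist row) ride together in the first stored row and must be unpacked before convolving each against its own kernel sample.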

28_FFTBloom/app_resources/image_fft_first_axis.hlsl (6 additions, 0 deletions)

@@ -35,14 +35,17 @@ struct PreloadedFirstAxisAccessor : MultiChannelPreloadedAccessorBase
 		normalizedCoordsSecondLine.x = normalizedCoordsFirstLine.x + pushConstants.imagePixelSize.x;
 		normalizedCoordsFirstLine.y = (int32_t(workgroup::SubgroupContiguousIndex()) - pushConstants.padding) * pushConstants.imagePixelSize.y + pushConstants.imageHalfPixelSize.y;
 
+		[unroll]
 		for (uint32_t localElementIndex = 0; localElementIndex < ElementsPerInvocation; localElementIndex++)
 		{
 			const float32_t4 firstLineTexValue = texture.SampleLevel(samplerState, normalizedCoordsFirstLine, 0);
+			[unroll]
 			for (uint16_t channel = 0; channel < Channels; channel++)
 				preloaded[channel][localElementIndex].real(scalar_t(firstLineTexValue[channel]));
 
 			normalizedCoordsSecondLine.y = normalizedCoordsFirstLine.y;
 			const float32_t4 secondLineTexValue = texture.SampleLevel(samplerState, normalizedCoordsSecondLine, 0);
+			[unroll]
 			for (uint16_t channel = 0; channel < Channels; channel++)
 				preloaded[channel][localElementIndex].imag(scalar_t(secondLineTexValue[channel]));
 
@@ -54,12 +57,14 @@ struct PreloadedFirstAxisAccessor : MultiChannelPreloadedAccessorBase
 	// Channels will be contiguous in buffer memory.
 	void unload()
 	{
+		[unroll]
 		for (uint16_t channel = 0; channel < Channels; channel++)
 		{
 			const uint64_t channelStartOffsetBytes = getChannelStartOffsetBytes(channel);
 			const LegacyBdaAccessor<complex_t<scalar_t> > colMajorAccessor = LegacyBdaAccessor<complex_t<scalar_t> >::create(pushConstants.colMajorBufferAddress + channelStartOffsetBytes);
 
 			uint32_t globalElementIndex = workgroup::SubgroupContiguousIndex();
+			[unroll]
 			for (uint32_t localElementIndex = 0; localElementIndex < ElementsPerInvocation; localElementIndex++)
 			{
 				colMajorAccessor.set(colMajorOffset(glsl::gl_WorkGroupID().x, globalElementIndex), preloaded[channel][localElementIndex]);
@@ -76,6 +81,7 @@ void main(uint32_t3 ID : SV_DispatchThreadID)
 	PreloadedFirstAxisAccessor preloadedAccessor;
 
 	preloadedAccessor.preload();
+	[unroll]
 	for (uint16_t channel = 0; channel < Channels; channel++)
 	{
 		preloadedAccessor.currentChannel = channel;
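The unload above stores transposed, per the push-constant comment about "column major order for coalesced writes". The real `colMajorOffset` helper lives in a base struct not shown in this diff, so the sketch below is a hypothetical reconstruction of the addressing idea only (the extra `columnLength` parameter is illustrative):

```hlsl
// Hedged sketch: with a column-major layout, a workgroup owning one column (fixed col)
// has consecutive threads (consecutive row indices) writing consecutive addresses,
// which is exactly what the GPU needs to coalesce the stores.
uint64_t colMajorOffsetSketch(uint32_t col, uint32_t row, uint32_t columnLength)
{
    return uint64_t(col) * columnLength + row;
}
```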

28_FFTBloom/app_resources/image_ifft_first_axis.hlsl (9 additions, 0 deletions)

@@ -36,6 +36,7 @@ struct PreloadedFirstAxisAccessor : MultiChannelPreloadedAccessorMirrorTradeBase
 	template<typename sharedmem_adaptor_t>
 	void preload(NBL_REF_ARG(sharedmem_adaptor_t) adaptorForSharedMemory)
 	{
+		[unroll]
 		for (uint16_t channel = 0; channel < Channels; channel++)
 		{
 			const uint64_t channelStartOffsetBytes = getChannelStartOffsetBytes(channel);
@@ -44,6 +45,7 @@ struct PreloadedFirstAxisAccessor : MultiChannelPreloadedAccessorMirrorTradeBase
 
 			uint32_t globalElementIndex = workgroup::SubgroupContiguousIndex();
 			// Load all even elements of first column
+			[unroll]
 			for (uint32_t localElementIndex = 0; localElementIndex < (ElementsPerInvocation / 2); localElementIndex++)
 			{
 				preloaded[channel][localElementIndex << 1] = rowMajorAccessor.get(rowMajorOffset(2 * glsl::gl_WorkGroupID().x, globalElementIndex));
@@ -52,6 +54,7 @@ struct PreloadedFirstAxisAccessor : MultiChannelPreloadedAccessorMirrorTradeBase
 			// Get all odd elements by trading
 			// Reset globalElementIndex - Add WorkgroupSize to account for `localElementIndex` starting at 1
 			globalElementIndex = WorkgroupSize | workgroup::SubgroupContiguousIndex();
+			[unroll]
 			for (uint32_t localElementIndex = 1; localElementIndex < ElementsPerInvocation; localElementIndex += 2)
 			{
 				preloaded[channel][localElementIndex] = conj(getDFTMirror<sharedmem_adaptor_t>(globalElementIndex, channel, adaptorForSharedMemory));
@@ -63,6 +66,7 @@ struct PreloadedFirstAxisAccessor : MultiChannelPreloadedAccessorMirrorTradeBase
 			// This makes even positions hold C1 + iC2
 			// Reset globalElementIndex
 			globalElementIndex = workgroup::SubgroupContiguousIndex();
+			[unroll]
 			for (uint32_t localElementIndex = 0; localElementIndex < (ElementsPerInvocation / 2); localElementIndex++)
 			{
 				preloaded[channel][localElementIndex << 1] = preloaded[channel][localElementIndex << 1] + rotateLeft<scalar_t>(rowMajorAccessor.get(rowMajorOffset(2 * glsl::gl_WorkGroupID().x + 1, globalElementIndex)));
@@ -73,6 +77,7 @@ struct PreloadedFirstAxisAccessor : MultiChannelPreloadedAccessorMirrorTradeBase
 			// add conj(C1) back to have conj(C1) + i * conj(C2).
 			// Reset globalElementIndex - Add WorkgroupSize to account for `localElementIndex` starting at 1
 			globalElementIndex = WorkgroupSize | workgroup::SubgroupContiguousIndex();
+			[unroll]
 			for (uint32_t localElementIndex = 1; localElementIndex < ElementsPerInvocation; localElementIndex += 2)
 			{
 				complex_t<scalar_t> otherThreadEven = conj(getDFTMirror<sharedmem_adaptor_t>(globalElementIndex, channel, adaptorForSharedMemory));
@@ -106,6 +111,8 @@ struct PreloadedFirstAxisAccessor : MultiChannelPreloadedAccessorMirrorTradeBase
 	{
 		const uint32_t firstIndex = workgroup::SubgroupContiguousIndex();
 		int32_t paddedIndex = int32_t(firstIndex) - pushConstants.padding;
+
+		[unroll]
 		for (uint32_t localElementIndex = 0; localElementIndex < ElementsPerInvocation; localElementIndex++)
 		{
 			if (paddedIndex >= 0 && paddedIndex < pushConstants.imageColumnLength)
@@ -115,6 +122,7 @@ struct PreloadedFirstAxisAccessor : MultiChannelPreloadedAccessorMirrorTradeBase
 				firstLineTexValue.a = 1.f;
 				secondLineTexValue.a = 1.f;
 
+				[unroll]
 				for (uint16_t channel = 0; channel < Channels; channel++)
 				{
 					firstLineTexValue[channel] = scalar_t(preloaded[channel][localElementIndex].real());
@@ -142,6 +150,7 @@ void main(uint32_t3 ID : SV_DispatchThreadID)
 	// Update state after preload
 	sharedmemAccessor = adaptorForSharedMemory.accessor;
 
+	[unroll]
 	for (uint16_t channel = 0; channel < Channels; channel++)
 	{
 		preloadedAccessor.currentChannel = channel;
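One non-obvious helper in the hunks above: `rotateLeft` is what contributes the `i * C2` term. Assuming it performs complex multiplication by i (which is what the "even positions hold C1 + iC2" comment implies), a minimal sketch of what such a helper does:

```hlsl
// Hedged sketch, assuming rotateLeft<scalar_t> multiplies its argument by i:
// for z = a + bi, i*z = -b + ai, a 90-degree rotation in the complex plane,
// so it costs two moves and a negation rather than a full complex multiply.
complex_t<scalar_t> rotateLeftSketch(complex_t<scalar_t> z)
{
    complex_t<scalar_t> result;
    result.real(-z.imag());
    result.imag(z.real());
    return result;
}
```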

28_FFTBloom/app_resources/kernel_fft_first_axis.hlsl (6 additions, 0 deletions)

@@ -29,13 +29,16 @@ struct PreloadedFirstAxisAccessor : MultiChannelPreloadedAccessorBase
 	void preload()
 	{
 		uint32_t globalElementIndex = workgroup::SubgroupContiguousIndex();
+		[unroll]
 		for (uint32_t localElementIndex = 0; localElementIndex < ElementsPerInvocation; localElementIndex++)
 		{
 			const float32_t4 firstLineTexValue = texture[uint32_t2(2 * glsl::gl_WorkGroupID().x, globalElementIndex)];
+			[unroll]
 			for (uint16_t channel = 0; channel < Channels; channel++)
 				preloaded[channel][localElementIndex].real(scalar_t(firstLineTexValue[channel]));
 
 			const float32_t4 secondLineTexValue = texture[uint32_t2(2 * glsl::gl_WorkGroupID().x + 1, globalElementIndex)];
+			[unroll]
 			for (uint16_t channel = 0; channel < Channels; channel++)
 				preloaded[channel][localElementIndex].imag(scalar_t(secondLineTexValue[channel]));
 
@@ -47,12 +50,14 @@ struct PreloadedFirstAxisAccessor : MultiChannelPreloadedAccessorBase
 	// Channels will be contiguous in buffer memory.
 	void unload()
 	{
+		[unroll]
 		for (uint16_t channel = 0; channel < Channels; channel++)
 		{
 			const uint64_t channelStartOffsetBytes = getChannelStartOffsetBytes(channel);
 			const LegacyBdaAccessor<complex_t<scalar_t> > colMajorAccessor = LegacyBdaAccessor<complex_t<scalar_t> >::create(pushConstants.colMajorBufferAddress + channelStartOffsetBytes);
 
 			uint32_t globalElementIndex = workgroup::SubgroupContiguousIndex();
+			[unroll]
 			for (uint32_t localElementIndex = 0; localElementIndex < ElementsPerInvocation; localElementIndex++)
 			{
 				colMajorAccessor.set(colMajorOffset(glsl::gl_WorkGroupID().x, globalElementIndex), preloaded[channel][localElementIndex]);
@@ -69,6 +74,7 @@ void main(uint32_t3 ID : SV_DispatchThreadID)
 	PreloadedFirstAxisAccessor preloadedAccessor;
 
 	preloadedAccessor.preload();
+	[unroll]
 	for (uint16_t channel = 0; channel < Channels; channel++)
 	{
 		preloadedAccessor.currentChannel = channel;
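Note that the first-axis passes still compute their channel offsets in-shader via `getChannelStartOffsetBytes`; only the middle pass moved that value into a push constant. Going by the helper deleted from fft_convolve_ifft.hlsl above, the layout is one contiguous slab of complex values per channel; a hedged sketch of that computation (parameter names are illustrative, the real helper reads them from constants and push constants):

```hlsl
// Sketch mirroring the deleted getChannelStartOffsetBytes: channel c's half-spectrum
// occupies a contiguous slab, so its byte offset is c whole slabs into the buffer.
uint64_t channelStartOffsetBytesSketch(uint16_t channel, uint32_t rowsPerChannel, uint32_t rowLength)
{
    return uint64_t(channel) * rowsPerChannel * rowLength * sizeof(complex_t<scalar_t>);
}
```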
