
Commit 78faf01

Unroll all loops for even faster FFT
Parent: 690c070

8 files changed: +80 −41 lines

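The change is mechanical: every loop whose trip count is a compile-time constant gets an `[unroll]` attribute so the compiler emits straight-line code instead of a runtime loop. A minimal standalone sketch of the attribute (plain HLSL types and hypothetical names, not code from this commit):

```hlsl
// Illustrative sketch: [unroll] asks the compiler to replicate the loop body
// instead of emitting a runtime branch; it requires a compile-time trip count,
// which all the bounds touched by this commit (ElementsPerInvocation, Channels) are.
static const uint ElementsPerThread = 4;

void scaleAll(inout float values[ElementsPerThread], float factor)
{
    [unroll]
    for (uint i = 0; i < ElementsPerThread; i++)
        values[i] *= factor; // becomes four independent multiplies, no loop overhead
}
```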

28_FFTBloom/app_resources/common.hlsl (5 additions, 4 deletions)

@@ -5,9 +5,7 @@ NBL_CONSTEXPR_STATIC_INLINE uint32_t Channels = 3;
 
 using namespace nbl::hlsl;
 
-// Necessary bits for each int type (except BDAs) are being generous and considering a resolution of up to 4k
-// Also, an extra bit is given since it fits: they're uints being passed as ints so we don't litter the code with int32 casts
-// The extra bit is to make sure MSB is a 0 and it doesn't sign extend and give negative numbers when using the maximum amount of considered bits
+// All packed bitfields of int32_t are uints being passed as ints so we don't litter the code with int32 casts
 struct PushConstantData
 {
 	// After running FFT along a column, we want to store the result in column major order for coalesced writes, and similarly after running an FFT in row major order
@@ -21,7 +19,10 @@ struct PushConstantData
 	// The following three fields being push constants allow dynamic resizing of the image without recompiling shaders (limited by the FFT length)
 	int32_t imageRowLength : 16;
 	int32_t imageHalfRowLength : 16;
-	// Actually only needs at worst 10 bits, but we don't pack it into a bitfield so we can use offsetof and update only this field from CPP side
+	// Only middle pass uses these
+	uint32_t currentChannel;
+	uint64_t channelStartOffsetBytes;
+	// We don't pack it into a bitfield so we can use offsetof and update only this field from CPP side
 	// Alternatively, we could do the packing/unpacking manually to save 32 bits
 	int32_t padding;
 	// Used by IFFT to tell if an index belongs to an image or is in the padding
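The surviving comment is terse, so here is a hedged illustration of the hazard the deleted comment spelled out: a signed bitfield sign-extends on read, so the packed fields must keep their top bit clear. The struct below is illustrative only (it assumes the same `int32_t` alias the files above use), not the real `PushConstantData`:

```hlsl
// Illustrative sketch: why the packed int32_t bitfields must hold values with MSB = 0.
struct PackedSketch
{
    int32_t rowLength : 16; // reads back sign-extended: storing 0x8000 would yield -32768
};

int32_t safeRead(PackedSketch p)
{
    // Image dimensions up to 4k fit comfortably in 15 bits, so the field always
    // reads back non-negative and no int32_t casts are needed at use sites.
    return p.rowLength;
}
```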

28_FFTBloom/app_resources/fft_convolve_ifft.hlsl (27 additions, 35 deletions)

@@ -6,6 +6,8 @@
 
 // ------------------------------------------ SECOND AXIS FFT + CONVOLUTION + IFFT -------------------------------------------------------------
 
+// This is done for the channel specified in pushConstants.
+
 // This time each Workgroup will compute the FFT along a horizontal line (fixed y for the whole Workgroup). We get the y coordinate for the
 // row a workgroup is working on via `gl_WorkGroupID().x`. We have to keep this in mind: What's stored as the first row is actually `Z + iN`,
 // where `Z` is the actual 0th row and `N` is the Nyquist row (the one with index TotalSize / 2). Those are packed together
@@ -34,14 +36,6 @@ struct PreloadedSecondAxisAccessor : PreloadedAccessorMirrorTradeBase
 	{
 		return y * pushConstants.imageRowLength + x; // can no longer sum with | since there's no guarantees on row length
 	}
-
-	// Same as what was used to store in col-major after first axis FFT. This time we launch one workgroup per row so the height of the channel's (half) image is NumWorkgroups,
-	// and the width (number of columns) is passed as a push constant
-	uint64_t getChannelStartOffsetBytes(uint16_t channel)
-	{
-		return uint64_t(channel) * NumWorkgroups * pushConstants.imageRowLength * sizeof(complex_t<scalar_t>);
-	}
-
 	// ---------------------------------------------------- End Utils ---------------------------------------------------------
 
 	// Unpacking on load: Has no workgroup shuffles (which become execution barriers) which would be necessary for unpacking on store
@@ -56,11 +50,10 @@ struct PreloadedSecondAxisAccessor : PreloadedAccessorMirrorTradeBase
 	// corresponding to the column they correspond to.
 	// The `gl_WorkGroupID().x = 0` case is special because instead of getting the mirror we need to get both zero and nyquist frequencies for the columns, which doesn't happen just by mirror
 	// indexing.
-	void preload(uint16_t channel)
+	void preload()
 	{
 		// Set up accessor to read in data
-		const uint64_t channelStartOffsetBytes = getChannelStartOffsetBytes(channel);
-		const LegacyBdaAccessor<complex_t<scalar_t> > colMajorAccessor = LegacyBdaAccessor<complex_t<scalar_t> >::create(pushConstants.colMajorBufferAddress + channelStartOffsetBytes);
+		const LegacyBdaAccessor<complex_t<scalar_t> > colMajorAccessor = LegacyBdaAccessor<complex_t<scalar_t> >::create(pushConstants.colMajorBufferAddress + pushConstants.channelStartOffsetBytes);
 
 		// This one shows up a lot so we give it a name
 		const bool oddThread = glsl::gl_SubgroupInvocationID() & 1u;
@@ -77,6 +70,7 @@ struct PreloadedSecondAxisAccessor : PreloadedAccessorMirrorTradeBase
 		// the previous pass' threads), then `PreviousWorkgroupSize` odd elements (`preloaded[1]`) and so on
 		const uint32_t evenRow = glsl::gl_WorkGroupID().x + ((glsl::gl_WorkGroupID().x / PreviousWorkgroupSize) * PreviousWorkgroupSize);
 		const uint32_t y = oddThread ? PreviousPassFFTIndexingUtils::getNablaMirrorIndex(evenRow) : evenRow;
+		[unroll]
 		for (uint32_t localElementIndex = 0; localElementIndex < ElementsPerInvocation; localElementIndex++)
 		{
 			// If mirrored, we need to invert which thread is loading lo and which is loading hi
@@ -115,6 +109,7 @@ struct PreloadedSecondAxisAccessor : PreloadedAccessorMirrorTradeBase
 		// Even thread retrieves Zero, odd thread retrieves Nyquist. Zero is always `preloaded[0]` of the previous FFT's 0th thread, while Nyquist is always `preloaded[1]` of that same thread.
 		// Therefore we know Nyquist ends up exactly at y = PreviousWorkgroupSize
 		const uint32_t y = oddThread ? PreviousWorkgroupSize : 0;
+		[unroll]
 		for (uint32_t localElementIndex = 0; localElementIndex < ElementsPerInvocation; localElementIndex++)
 		{
 			int32_t wrappedIndex = paddedIndex < 0 ? ~paddedIndex : paddedIndex; // ~x = - x - 1 in two's complement (except maybe at the borders of representable range)
@@ -143,18 +138,19 @@ struct PreloadedSecondAxisAccessor : PreloadedAccessorMirrorTradeBase
 	// Each element on this row is Nabla-ordered. So the element at `x' = index, y' = gl_WorkGroupID().x` that we're operating on is actually the element at
 	// `x = F(index), y = bitreverse(gl_WorkGroupID().x)` (with the bitreversal done as an N-1 bit number, for `N = log2(TotalSize)` *of the first axis FFT*)
 	template<typename sharedmem_adaptor_t>
-	void convolve(uint32_t channel, NBL_REF_ARG(sharedmem_adaptor_t) adaptorForSharedMemory)
+	void convolve(NBL_REF_ARG(sharedmem_adaptor_t) adaptorForSharedMemory)
 	{
 		if (glsl::gl_WorkGroupID().x)
 		{
 			const uint32_t y = bitReverseAs<uint32_t, NumWorkgroupsLog2>(glsl::gl_WorkGroupID().x);
 			uint32_t globalElementIndex = workgroup::SubgroupContiguousIndex();
+			[unroll]
 			for (uint32_t localElementIndex = 0; localElementIndex < ElementsPerInvocation; localElementIndex++)
 			{
 				const uint32_t indexDFT = FFTIndexingUtils::getDFTIndex(globalElementIndex);
 				const uint32_t2 texCoords = uint32_t2(indexDFT, y);
 				const float32_t2 uv = texCoords * float32_t2(TotalSizeReciprocal, 1.f / NumWorkgroups) + KernelHalfPixelSize;
-				const vector<scalar_t, 2> sampledKernelVector = vector<scalar_t, 2>(kernelChannels.SampleLevel(samplerState, float32_t3(uv, float32_t(channel)), 0));
+				const vector<scalar_t, 2> sampledKernelVector = vector<scalar_t, 2>(kernelChannels.SampleLevel(samplerState, float32_t3(uv, float32_t(pushConstants.currentChannel)), 0));
 				const vector<scalar_t, 2> sampledKernelInterpolatedVector = lerp(sampledKernelVector, One, promote<vector<scalar_t, 2>, float32_t>(pushConstants.interpolatingFactor));
 				const complex_t<scalar_t> sampledKernelInterpolated = { sampledKernelInterpolatedVector.x, sampledKernelInterpolatedVector.y };
 				preloaded[localElementIndex] = preloaded[localElementIndex] * sampledKernelInterpolated;
@@ -166,6 +162,7 @@ struct PreloadedSecondAxisAccessor : PreloadedAccessorMirrorTradeBase
 		else
 		{
 			uint32_t globalElementIndex = workgroup::SubgroupContiguousIndex();
+			[unroll]
 			for (uint32_t localElementIndex = 0; localElementIndex < ElementsPerInvocation; localElementIndex += 2)
 			{
 				complex_t<scalar_t> zero = preloaded[localElementIndex];
@@ -179,14 +176,14 @@ struct PreloadedSecondAxisAccessor : PreloadedAccessorMirrorTradeBase
 				const float32_t indexDFT = float32_t(FFTIndexingUtils::getDFTIndex(globalElementIndex));
 
 				float32_t2 uv = float32_t2(indexDFT * TotalSizeReciprocal, float32_t(0)) + KernelHalfPixelSize;
-				const vector<scalar_t, 2> zeroKernelVector = vector<scalar_t, 2>(kernelChannels.SampleLevel(samplerState, float32_t3(uv, float32_t(channel)), 0));
+				const vector<scalar_t, 2> zeroKernelVector = vector<scalar_t, 2>(kernelChannels.SampleLevel(samplerState, float32_t3(uv, float32_t(pushConstants.currentChannel)), 0));
 				const vector<scalar_t, 2> zeroKernelInterpolatedVector = lerp(zeroKernelVector, One, promote<vector<scalar_t, 2>, float32_t>(pushConstants.interpolatingFactor));
 				const complex_t<scalar_t> zeroKernelInterpolated = { zeroKernelInterpolatedVector.x, zeroKernelInterpolatedVector.y };
 				zero = zero * zeroKernelInterpolated;
 
 				// Do the same for the nyquist coord
 				uv.y = 1.f - KernelHalfPixelSize.y;
-				const vector<scalar_t, 2> nyquistKernelVector = vector<scalar_t, 2>(kernelChannels.SampleLevel(samplerState, float32_t3(uv, float32_t(channel)), 0));
+				const vector<scalar_t, 2> nyquistKernelVector = vector<scalar_t, 2>(kernelChannels.SampleLevel(samplerState, float32_t3(uv, float32_t(pushConstants.currentChannel)), 0));
 				const vector<scalar_t, 2> nyquistKernelInterpolatedVector = lerp(nyquistKernelVector, One, promote<vector<scalar_t, 2>, float32_t>(pushConstants.interpolatingFactor));
 				const complex_t<scalar_t> nyquistKernelInterpolated = { nyquistKernelInterpolatedVector.x, nyquistKernelInterpolatedVector.y };
 				nyquist = nyquist * nyquistKernelInterpolated;
@@ -216,14 +213,14 @@ struct PreloadedSecondAxisAccessor : PreloadedAccessorMirrorTradeBase
 	}
 
 	// Save a row back in row major order. Remember that the first row (one with `gl_WorkGroupID().x == 0`) will actually hold the packed IFFT of Zero and Nyquist rows.
-	void unload(uint16_t channel)
+	void unload()
 	{
 		// Set up accessor to write out data
-		const uint64_t channelStartOffsetBytes = getChannelStartOffsetBytes(channel);
-		const LegacyBdaAccessor<complex_t<scalar_t> > rowMajorAccessor = LegacyBdaAccessor<complex_t<scalar_t> >::create(pushConstants.rowMajorBufferAddress + channelStartOffsetBytes);
+		const LegacyBdaAccessor<complex_t<scalar_t> > rowMajorAccessor = LegacyBdaAccessor<complex_t<scalar_t> >::create(pushConstants.rowMajorBufferAddress + pushConstants.channelStartOffsetBytes);
 
 		const uint32_t firstIndex = workgroup::SubgroupContiguousIndex();
 		int32_t paddedIndex = int32_t(firstIndex) - pushConstants.padding;
+		[unroll]
 		for (uint32_t localElementIndex = 0; localElementIndex < ElementsPerInvocation; localElementIndex++)
 		{
 			if (paddedIndex >= 0 && paddedIndex < pushConstants.imageRowLength)
@@ -245,21 +242,16 @@ void main(uint32_t3 ID : SV_DispatchThreadID)
 	sharedmem_adaptor_t adaptorForSharedMemory;
 
 	PreloadedSecondAxisAccessor preloadedAccessor;
-	for (uint16_t channel = 0; channel < Channels; channel++)
-	{
-		preloadedAccessor.preload(channel);
-		// Wait on previous pass FFT
-		if(channel)
-			sharedmemAccessor.workgroupExecutionAndMemoryBarrier();
-		workgroup::FFT<false, FFTParameters>::template __call(preloadedAccessor, sharedmemAccessor);
-		// Update state after FFT run
-		adaptorForSharedMemory.accessor = sharedmemAccessor;
-		preloadedAccessor.convolve(channel, adaptorForSharedMemory);
-		// Remember to update the accessor's state
-		sharedmemAccessor = adaptorForSharedMemory.accessor;
-		// Either wait on FFT (most workgroups but 0) or convolution (only 0th workgroup actually uses sharedmem for convolution)
-		sharedmemAccessor.workgroupExecutionAndMemoryBarrier();
-		workgroup::FFT<true, FFTParameters>::template __call(preloadedAccessor, sharedmemAccessor);
-		preloadedAccessor.unload(channel);
-	}
+
+	preloadedAccessor.preload();
+	workgroup::FFT<false, FFTParameters>::template __call(preloadedAccessor, sharedmemAccessor);
+	// Update state after FFT run
+	adaptorForSharedMemory.accessor = sharedmemAccessor;
+	preloadedAccessor.convolve(adaptorForSharedMemory);
+	// Remember to update the accessor's state
+	sharedmemAccessor = adaptorForSharedMemory.accessor;
+	// Either wait on first FFT (all workgroups but 0) or convolution (only 0th workgroup actually uses sharedmem for convolution)
+	sharedmemAccessor.workgroupExecutionAndMemoryBarrier();
+	workgroup::FFT<true, FFTParameters>::template __call(preloadedAccessor, sharedmemAccessor);
+	preloadedAccessor.unload();
 }
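For reference, the `Z + iN` packing the comments above keep invoking is the standard trick of running two real rows through one complex FFT. Since each real row has a Hermitian spectrum, nothing is lost; the mirror-plus-conjugate indexing (`getDFTMirror`, `conj`) relies on these standard DFT identities (a math sketch, not code from this commit):

$$F = \mathrm{DFT}(z + i\,n) \quad\Longrightarrow\quad Z_k = \tfrac{1}{2}\left(F_k + \overline{F_{N-k}}\right), \qquad N_k = \tfrac{1}{2i}\left(F_k - \overline{F_{N-k}}\right),$$

with indices taken mod $N$. The special-cased workgroup 0 exists precisely because $Z$ (the zero row) and $N$ (the Nyquist row) ride together in the first stored row and must be unpacked before convolving each against its own kernel sample.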

28_FFTBloom/app_resources/image_fft_first_axis.hlsl (6 additions, 0 deletions)

@@ -35,14 +35,17 @@ struct PreloadedFirstAxisAccessor : MultiChannelPreloadedAccessorBase
 		normalizedCoordsSecondLine.x = normalizedCoordsFirstLine.x + pushConstants.imagePixelSize.x;
 		normalizedCoordsFirstLine.y = (int32_t(workgroup::SubgroupContiguousIndex()) - pushConstants.padding) * pushConstants.imagePixelSize.y + pushConstants.imageHalfPixelSize.y;
 
+		[unroll]
 		for (uint32_t localElementIndex = 0; localElementIndex < ElementsPerInvocation; localElementIndex++)
 		{
 			const float32_t4 firstLineTexValue = texture.SampleLevel(samplerState, normalizedCoordsFirstLine, 0);
+			[unroll]
 			for (uint16_t channel = 0; channel < Channels; channel++)
 				preloaded[channel][localElementIndex].real(scalar_t(firstLineTexValue[channel]));
 
 			normalizedCoordsSecondLine.y = normalizedCoordsFirstLine.y;
 			const float32_t4 secondLineTexValue = texture.SampleLevel(samplerState, normalizedCoordsSecondLine, 0);
+			[unroll]
 			for (uint16_t channel = 0; channel < Channels; channel++)
 				preloaded[channel][localElementIndex].imag(scalar_t(secondLineTexValue[channel]));
 
@@ -54,12 +57,14 @@ struct PreloadedFirstAxisAccessor : MultiChannelPreloadedAccessorBase
 	// Channels will be contiguous in buffer memory.
 	void unload()
 	{
+		[unroll]
 		for (uint16_t channel = 0; channel < Channels; channel++)
 		{
 			const uint64_t channelStartOffsetBytes = getChannelStartOffsetBytes(channel);
 			const LegacyBdaAccessor<complex_t<scalar_t> > colMajorAccessor = LegacyBdaAccessor<complex_t<scalar_t> >::create(pushConstants.colMajorBufferAddress + channelStartOffsetBytes);
 
 			uint32_t globalElementIndex = workgroup::SubgroupContiguousIndex();
+			[unroll]
 			for (uint32_t localElementIndex = 0; localElementIndex < ElementsPerInvocation; localElementIndex++)
 			{
 				colMajorAccessor.set(colMajorOffset(glsl::gl_WorkGroupID().x, globalElementIndex), preloaded[channel][localElementIndex]);
@@ -76,6 +81,7 @@ void main(uint32_t3 ID : SV_DispatchThreadID)
 	PreloadedFirstAxisAccessor preloadedAccessor;
 
 	preloadedAccessor.preload();
+	[unroll]
 	for (uint16_t channel = 0; channel < Channels; channel++)
 	{
 		preloadedAccessor.currentChannel = channel;
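The unload above stores transposed, per the push-constant comment about "column major order for coalesced writes". The real `colMajorOffset` helper lives in a base struct not shown in this diff, so the sketch below is a hypothetical reconstruction of the addressing idea only (the extra `columnLength` parameter is illustrative):

```hlsl
// Hedged sketch: with a column-major layout, a workgroup owning one column (fixed col)
// has consecutive threads (consecutive row indices) writing consecutive addresses,
// which is exactly what the GPU needs to coalesce the stores.
uint64_t colMajorOffsetSketch(uint32_t col, uint32_t row, uint32_t columnLength)
{
    return uint64_t(col) * columnLength + row;
}
```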

28_FFTBloom/app_resources/image_ifft_first_axis.hlsl (9 additions, 0 deletions)

@@ -36,6 +36,7 @@ struct PreloadedFirstAxisAccessor : MultiChannelPreloadedAccessorMirrorTradeBase
 	template<typename sharedmem_adaptor_t>
 	void preload(NBL_REF_ARG(sharedmem_adaptor_t) adaptorForSharedMemory)
 	{
+		[unroll]
 		for (uint16_t channel = 0; channel < Channels; channel++)
 		{
 			const uint64_t channelStartOffsetBytes = getChannelStartOffsetBytes(channel);
@@ -44,6 +45,7 @@ struct PreloadedFirstAxisAccessor : MultiChannelPreloadedAccessorMirrorTradeBase
 
 			uint32_t globalElementIndex = workgroup::SubgroupContiguousIndex();
 			// Load all even elements of first column
+			[unroll]
 			for (uint32_t localElementIndex = 0; localElementIndex < (ElementsPerInvocation / 2); localElementIndex++)
 			{
 				preloaded[channel][localElementIndex << 1] = rowMajorAccessor.get(rowMajorOffset(2 * glsl::gl_WorkGroupID().x, globalElementIndex));
@@ -52,6 +54,7 @@ struct PreloadedFirstAxisAccessor : MultiChannelPreloadedAccessorMirrorTradeBase
 			// Get all odd elements by trading
 			// Reset globalElementIndex - Add WorkgroupSize to account for `localElementIndex` starting at 1
 			globalElementIndex = WorkgroupSize | workgroup::SubgroupContiguousIndex();
+			[unroll]
 			for (uint32_t localElementIndex = 1; localElementIndex < ElementsPerInvocation; localElementIndex += 2)
 			{
 				preloaded[channel][localElementIndex] = conj(getDFTMirror<sharedmem_adaptor_t>(globalElementIndex, channel, adaptorForSharedMemory));
@@ -63,6 +66,7 @@ struct PreloadedFirstAxisAccessor : MultiChannelPreloadedAccessorMirrorTradeBase
 			// This makes even positions hold C1 + iC2
 			// Reset globalElementIndex
 			globalElementIndex = workgroup::SubgroupContiguousIndex();
+			[unroll]
 			for (uint32_t localElementIndex = 0; localElementIndex < (ElementsPerInvocation / 2); localElementIndex++)
 			{
 				preloaded[channel][localElementIndex << 1] = preloaded[channel][localElementIndex << 1] + rotateLeft<scalar_t>(rowMajorAccessor.get(rowMajorOffset(2 * glsl::gl_WorkGroupID().x + 1, globalElementIndex)));
@@ -73,6 +77,7 @@ struct PreloadedFirstAxisAccessor : MultiChannelPreloadedAccessorMirrorTradeBase
 			// add conj(C1) back to have conj(C1) + i * conj(C2).
 			// Reset globalElementIndex - Add WorkgroupSize to account for `localElementIndex` starting at 1
 			globalElementIndex = WorkgroupSize | workgroup::SubgroupContiguousIndex();
+			[unroll]
 			for (uint32_t localElementIndex = 1; localElementIndex < ElementsPerInvocation; localElementIndex += 2)
 			{
 				complex_t<scalar_t> otherThreadEven = conj(getDFTMirror<sharedmem_adaptor_t>(globalElementIndex, channel, adaptorForSharedMemory));
@@ -106,6 +111,8 @@ struct PreloadedFirstAxisAccessor : MultiChannelPreloadedAccessorMirrorTradeBase
 	{
 		const uint32_t firstIndex = workgroup::SubgroupContiguousIndex();
 		int32_t paddedIndex = int32_t(firstIndex) - pushConstants.padding;
+
+		[unroll]
 		for (uint32_t localElementIndex = 0; localElementIndex < ElementsPerInvocation; localElementIndex++)
 		{
 			if (paddedIndex >= 0 && paddedIndex < pushConstants.imageColumnLength)
@@ -115,6 +122,7 @@ struct PreloadedFirstAxisAccessor : MultiChannelPreloadedAccessorMirrorTradeBase
 				firstLineTexValue.a = 1.f;
 				secondLineTexValue.a = 1.f;
 
+				[unroll]
 				for (uint16_t channel = 0; channel < Channels; channel++)
 				{
 					firstLineTexValue[channel] = scalar_t(preloaded[channel][localElementIndex].real());
@@ -142,6 +150,7 @@ void main(uint32_t3 ID : SV_DispatchThreadID)
 	// Update state after preload
 	sharedmemAccessor = adaptorForSharedMemory.accessor;
 
+	[unroll]
 	for (uint16_t channel = 0; channel < Channels; channel++)
 	{
 		preloadedAccessor.currentChannel = channel;
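One non-obvious helper in the hunks above: `rotateLeft` is what contributes the `i * C2` term. Assuming it performs complex multiplication by i (which is what the "even positions hold C1 + iC2" comment implies), a minimal sketch of what such a helper does:

```hlsl
// Hedged sketch, assuming rotateLeft<scalar_t> multiplies its argument by i:
// for z = a + bi, i*z = -b + ai, a 90-degree rotation in the complex plane,
// so it costs two moves and a negation rather than a full complex multiply.
complex_t<scalar_t> rotateLeftSketch(complex_t<scalar_t> z)
{
    complex_t<scalar_t> result;
    result.real(-z.imag());
    result.imag(z.real());
    return result;
}
```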

28_FFTBloom/app_resources/kernel_fft_first_axis.hlsl (6 additions, 0 deletions)

@@ -29,13 +29,16 @@ struct PreloadedFirstAxisAccessor : MultiChannelPreloadedAccessorBase
 	void preload()
 	{
 		uint32_t globalElementIndex = workgroup::SubgroupContiguousIndex();
+		[unroll]
 		for (uint32_t localElementIndex = 0; localElementIndex < ElementsPerInvocation; localElementIndex++)
 		{
 			const float32_t4 firstLineTexValue = texture[uint32_t2(2 * glsl::gl_WorkGroupID().x, globalElementIndex)];
+			[unroll]
 			for (uint16_t channel = 0; channel < Channels; channel++)
 				preloaded[channel][localElementIndex].real(scalar_t(firstLineTexValue[channel]));
 
 			const float32_t4 secondLineTexValue = texture[uint32_t2(2 * glsl::gl_WorkGroupID().x + 1, globalElementIndex)];
+			[unroll]
 			for (uint16_t channel = 0; channel < Channels; channel++)
 				preloaded[channel][localElementIndex].imag(scalar_t(secondLineTexValue[channel]));
 
@@ -47,12 +50,14 @@ struct PreloadedFirstAxisAccessor : MultiChannelPreloadedAccessorBase
 	// Channels will be contiguous in buffer memory.
 	void unload()
 	{
+		[unroll]
 		for (uint16_t channel = 0; channel < Channels; channel++)
 		{
 			const uint64_t channelStartOffsetBytes = getChannelStartOffsetBytes(channel);
 			const LegacyBdaAccessor<complex_t<scalar_t> > colMajorAccessor = LegacyBdaAccessor<complex_t<scalar_t> >::create(pushConstants.colMajorBufferAddress + channelStartOffsetBytes);
 
 			uint32_t globalElementIndex = workgroup::SubgroupContiguousIndex();
+			[unroll]
 			for (uint32_t localElementIndex = 0; localElementIndex < ElementsPerInvocation; localElementIndex++)
 			{
 				colMajorAccessor.set(colMajorOffset(glsl::gl_WorkGroupID().x, globalElementIndex), preloaded[channel][localElementIndex]);
@@ -69,6 +74,7 @@ void main(uint32_t3 ID : SV_DispatchThreadID)
 	PreloadedFirstAxisAccessor preloadedAccessor;
 
 	preloadedAccessor.preload();
+	[unroll]
 	for (uint16_t channel = 0; channel < Channels; channel++)
 	{
 		preloadedAccessor.currentChannel = channel;
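Note that the first-axis passes still compute their channel offsets in-shader via `getChannelStartOffsetBytes`; only the middle pass moved that value into a push constant. Going by the helper deleted from fft_convolve_ifft.hlsl above, the layout is one contiguous slab of complex values per channel; a hedged sketch of that computation (parameter names are illustrative, the real helper reads them from constants and push constants):

```hlsl
// Sketch mirroring the deleted getChannelStartOffsetBytes: channel c's half-spectrum
// occupies a contiguous slab, so its byte offset is c whole slabs into the buffer.
uint64_t channelStartOffsetBytesSketch(uint16_t channel, uint32_t rowsPerChannel, uint32_t rowLength)
{
    return uint64_t(channel) * rowsPerChannel * rowLength * sizeof(complex_t<scalar_t>);
}
```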
