Skip to content

Commit 8cada24

Browse files
Merge pull request #169 from Devsh-Graphics-Programming/ali_blur3
initialize blur example
2 parents 45978e2 + 9c59f0b commit 8cada24

File tree

5 files changed

+975
-0
lines changed

5 files changed

+975
-0
lines changed

26_Blur/CMakeLists.txt

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Build script for the blur example: creates the executable and, optionally,
# embeds everything under app_resources/ as builtin resources.

# Load the shared `common` module; RES receives include()'s result and is checked right below.
include(common RESULT_VARIABLE RES)
2+
if(NOT RES)
3+
message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
4+
endif()
5+
6+
# Project helper defined outside this file that creates the example's executable.
# NOTE(review): EXECUTABLE_NAME used below is presumably set by this call - confirm in common.cmake.
nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
7+
8+
# Everything below only runs when builtin-resource embedding is enabled.
if(NBL_EMBED_BUILTIN_RESOURCES)
9+
# Name of the extra target that carries the embedded resource data.
set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
10+
# Directory (relative to this CMakeLists.txt) whose files get embedded.
set(RESOURCE_DIR "app_resources")
11+
12+
# Absolute paths: where resources are searched for, and where generated sources/headers are written.
get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
13+
get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
14+
get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
15+
16+
# Collect every file under the resource directory as a path relative to it;
# CONFIGURE_DEPENDS makes the build re-check the glob so added/removed files trigger a re-configure.
file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
17+
# Hand each found file to the project macro together with the RESOURCES_TO_EMBED list name.
foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
18+
LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
19+
endforeach()
20+
21+
# Project macro (defined elsewhere) that creates the resource target.
# NOTE(review): argument roles are presumably target, resource list, search dir, resource dir,
# C++ namespace, header output dir, source output dir - confirm against the macro definition.
ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
22+
23+
# Attach the resource target to the example executable.
LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
24+
endif()

26_Blur/app_resources/common.hlsl

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
#include "nbl/builtin/hlsl/cpp_compat.hlsl"
2+
#include "nbl/builtin/hlsl/type_traits.hlsl"
3+
4+
// Number of blur passes the shader runs per channel (it loops `pass < PASSES`).
static const uint16_t PASSES = 2;
5+
6+
// Per-dispatch parameters delivered to the blur shader as push constants.
// The header includes cpp_compat.hlsl; NOTE(review): presumably so the same declaration
// can also be compiled on the C++ side - confirm.
struct PushConstants
7+
{
8+
// blur radius; the shader forwards it to the blur and normalizes by 1 / (2 * radius + 1)
nbl::hlsl::float32_t radius;
9+
// 2-bit field: index of the axis the blur runs along (the shader uses it to index image dimensions/coordinates)
uint32_t activeAxis : 2;
10+
// 6-bit field: edge wrap mode, forwarded by the shader to the box sampler's `wrapMode`
uint32_t edgeWrapMode : 6;
11+
};
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
#include "nbl/builtin/hlsl/prefix_sum_blur/blur.hlsl"
2+
#include "nbl/builtin/hlsl/prefix_sum_blur/box_sampler.hlsl"
3+
#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl"
4+
#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
5+
#include "nbl/builtin/hlsl/colorspace/OETF.hlsl"
6+
#include "common.hlsl"
7+
8+
using namespace nbl::hlsl;
9+
10+
// Defines `glsl::gl_WorkGroupSize()`: WORKGROUP_SIZE invocations along X, a single one along Y and Z.
uint32_t3 glsl::gl_WorkGroupSize()
{
    const uint32_t3 workgroupExtent = uint32_t3(WORKGROUP_SIZE, 1, 1);
    return workgroupExtent;
}
11+
12+
// Source image (binding 0); read by TextureProxy::load() and queried for its dimensions.
[[vk::binding(0)]]
13+
Texture2D<float32_t4> input;
14+
// Destination image (binding 1); written by TextureProxy::store().
[[vk::binding(1)]]
15+
RWTexture2D<float32_t4> output;
16+
17+
// Per-dispatch parameters; the struct is declared in common.hlsl.
[[vk::push_constant]] PushConstants pc;
18+
19+
// Accessor handed to the blur: every invocation keeps its own strided slice of one
// scanline of `input` in the private `spill` array, and writes it back to `output`.
// Invocation `k` owns scanline elements k, k + WORKGROUP_SIZE, k + 2 * WORKGROUP_SIZE, ...
template<uint16_t Chnls>
struct TextureProxy
{
    NBL_CONSTEXPR uint16_t Channels = Chnls;
    using texel_t = vector<float32_t, Channels>;

    // index of the axis we pass along: 0 walks X, 1 walks Y
    uint16_t activeAxis;
    // number of scanline elements a single invocation has to hold (ceil of MAX_SCANLINE_SIZE / WORKGROUP_SIZE)
    NBL_CONSTEXPR uint16_t SpillSize = (MAX_SCANLINE_SIZE - 1) / WORKGROUP_SIZE + 1;
    texel_t spill[SpillSize];

    // Returns one channel of the spilled texel that scanline index `uv` maps to.
    // The division is by a power-of-two constant, so it optimizes out nicely.
    template<typename T>
    T get(const uint16_t channel, const uint16_t uv)
    {
        const uint16_t slot = uv / WORKGROUP_SIZE;
        return spill[slot][channel];
    }

    // Overwrites one channel of the spilled texel that scanline index `uv` maps to.
    template<typename T>
    void set(const uint16_t channel, const uint16_t uv, T value)
    {
        const uint16_t slot = uv / WORKGROUP_SIZE;
        spill[slot][channel] = value;
    }

    // Fills every spill slot from `input`; slots past the end of the scanline are zeroed,
    // because the workgroup scans cooperatively and all spill values need sane defaults.
    void load()
    {
        const uint16_t scanlineLength = linearSize();
        uint16_t texelIx = workgroup::SubgroupContiguousIndex();
        for (uint16_t slot = 0; slot < SpillSize; slot++)
        {
            if (texelIx < scanlineLength)
                spill[slot] = (texel_t)input[position(texelIx)];
            else
                spill[slot] = promote<texel_t>(0.f);
            texelIx += WORKGROUP_SIZE;
        }
    }

    // Writes the spilled texels to `output`, stopping at the scanline end so nothing is stored out of range.
    // Channels this proxy does not carry keep the (0, 0, 0, 1) default.
    void store()
    {
        const uint16_t scanlineLength = linearSize();
        uint16_t slot = 0;
        uint16_t texelIx = workgroup::SubgroupContiguousIndex();
        while (texelIx < scanlineLength)
        {
            float32_t4 texel = float32_t4(0, 0, 0, 1);
            for (uint16_t c = 0; c < Channels; c++)
                texel[c] = spill[slot][c];
            output[position(texelIx)] = texel;

            slot++;
            texelIx += WORKGROUP_SIZE;
        }
    }

    // Length of `input` (mip level 0) along the active axis.
    uint16_t linearSize()
    {
        // the third GetDimensions output is the mip level count, it only lands in `.z` as a placeholder
        uint32_t3 extent;
        input.GetDimensions(0, extent.x, extent.y, extent.z);
        return _static_cast<uint16_t>(extent[activeAxis]);
    }

    // Texel coordinate of scanline element `ix`: `ix` goes on the active axis,
    // the workgroup's X index selects the scanline on the other axis.
    uint16_t2 position(uint16_t ix)
    {
        const uint16_t scanlineIx = _static_cast<uint16_t>(glsl::gl_WorkGroupID().x);
        uint16_t2 coord;
        coord[activeAxis] = ix;
        coord[activeAxis ^ 0x1] = scanlineIx;
        return coord;
    }
};
82+
83+
// Element count of `prefix_smem`: the workgroup arithmetic scratch size for this workgroup/subgroup size, plus 2.
// NOTE(review): the reason for the extra 2 elements is not visible here - confirm against the blur implementation.
static const uint16_t MAX_SCAN_SCRATCH_SIZE = workgroup::scratch_size_arithmetic<WORKGROUP_SIZE, MAX_SUBGROUP_SIZE>::value + 2;
84+
85+
// we always use `uint32_t`
// (both shared arrays store raw 32-bit words; the proxies below `bit_cast` other 32-bit types in and out)
groupshared uint32_t smem[MAX_SCANLINE_SIZE];
87+
groupshared uint32_t prefix_smem[MAX_SCAN_SCRATCH_SIZE];
88+
89+
// Accessor over the `smem` groupshared array (one 32-bit word per scanline element).
// Only types exactly as large as `uint32_t` can be read or written; they are bit-cast, not converted.
struct SharedMemoryProxy
{
    NBL_CONSTEXPR uint16_t Size = MAX_SCANLINE_SIZE;

    // Reads word `idx` and reinterprets its bits as `T`.
    template<typename T, typename I = uint16_t>
    enable_if_t<sizeof(T) == sizeof(uint32_t), T> get(const I idx)
    {
        const uint32_t word = smem[idx];
        return bit_cast<T>(word);
    }

    // Stores the bits of `value` into word `idx`.
    template<typename T, typename I = uint16_t>
    enable_if_t<sizeof(T) == sizeof(uint32_t), void> set(const I idx, T value)
    {
        const uint32_t word = bit_cast<uint32_t>(value);
        smem[idx] = word;
    }

    // Workgroup barrier, implemented with `glsl::barrier()`.
    void workgroupExecutionAndMemoryBarrier()
    {
        glsl::barrier();
    }
};
110+
111+
// Accessor over the `prefix_smem` groupshared scratch array that `main` passes to the blur as its scan accessor.
// Only types exactly as large as `uint32_t` can be read or written; they are bit-cast, not converted.
struct ScanSharedMemoryProxy
{
    NBL_CONSTEXPR uint16_t Size = MAX_SCAN_SCRATCH_SIZE;

    // the getter and setter below get used by Box1D

    // Reads scratch word `idx` and reinterprets its bits as `T`.
    template<typename T, typename I = uint16_t>
    enable_if_t<sizeof(T) == sizeof(uint32_t), T> get(const I idx)
    {
        const uint32_t word = prefix_smem[idx];
        return bit_cast<T>(word);
    }

    // Stores the bits of `value` into scratch word `idx`.
    template<typename T, typename I = uint16_t>
    enable_if_t<sizeof(T) == sizeof(uint32_t), void> set(const I idx, T value)
    {
        const uint32_t word = bit_cast<uint32_t>(value);
        prefix_smem[idx] = word;
    }

    // Workgroup barrier, implemented with `glsl::barrier()`.
    void workgroupExecutionAndMemoryBarrier()
    {
        glsl::barrier();
    }
};
133+
134+
// Compute entry point. Each workgroup blurs one scanline of `input` along `pc.activeAxis`
// (the scanline is selected by the workgroup's X index) and writes the result to `output`.
// Every channel is blurred `PASSES` times before the scanline is stored.
[numthreads(WORKGROUP_SIZE, 1, 1)]
void main()
{
    ScanSharedMemoryProxy scanSmemAccessor;

    // every invocation pulls its share of the scanline into its private spill array
    TextureProxy<CHANNELS> texAccessor;
    texAccessor.activeAxis = (uint16_t)pc.activeAxis;
    texAccessor.load();

    prefix_sum_blur::BoxSampler<SharedMemoryProxy, float32_t> boxSampler;
    boxSampler.wrapMode = uint16_t(pc.edgeWrapMode);
    boxSampler.linearSize = texAccessor.linearSize();
    // reciprocal of the box width, 2 * radius + 1
    boxSampler.normalizationFactor = 1 / (2 * pc.radius + 1);

    prefix_sum_blur::Blur1D<decltype(texAccessor), decltype(scanSmemAccessor), decltype(boxSampler), WORKGROUP_SIZE, jit::device_capabilities> blur;
    blur.radius = pc.radius;
    blur.borderColor = float32_t4(0, 1, 0, 1);

    for (uint16_t ch=0; ch < CHANNELS; ch++)
    for (uint16_t pass=0; pass < PASSES; pass++)
    {
        // it's the `SharedMemoryProxy` that gets aliased and reused by every `blur` call, so we need to
        // barrier on its memory before each call except the very first one. The guard must be `||`:
        // with `&&` the barrier was skipped for the whole first channel and for the first pass of every
        // later channel, letting invocations overwrite shared memory others could still be reading.
        // `ch` and `pass` are workgroup-uniform, so the barrier stays in uniform control flow.
        if (ch != 0 || pass != 0)
            boxSampler.prefixSumAccessor.workgroupExecutionAndMemoryBarrier();
        blur(texAccessor, scanSmemAccessor, boxSampler, ch);
    }

    texAccessor.store();
}

0 commit comments

Comments
 (0)