1+ #include "nbl/builtin/hlsl/cpp_compat.hlsl"
2+ #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
3+ #include "nbl/builtin/hlsl/workgroup/basic.hlsl"
4+ #include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"
5+ #include "nbl/builtin/hlsl/device_capabilities_traits.hlsl"
6+ #include "nbl/builtin/hlsl/enums.hlsl"
7+
8+ #ifndef _NBL_BUILTIN_PREFIX_SUM_BLUR_INCLUDED_
9+ #define _NBL_BUILTIN_PREFIX_SUM_BLUR_INCLUDED_
10+
11+ namespace nbl
12+ {
13+ namespace hlsl
14+ {
15+ namespace prefix_sum_blur
16+ {
17+
18+ // Prefix-Sum Blur using SAT (Summed Area Table) technique.
19+ // `scanScract` and `_sampler.prefixSumAccessor` must not to alias.
20+ template<
21+ typename DataAccessor,
22+ typename ScanSharedAccessor,
23+ typename Sampler,
24+ uint16_t WorkgroupSize,
25+ class device_capabilities=void > // TODO: define concepts for the Box1D and apply constraints
26+ struct Blur1D
27+ {
28+ void operator ()(
29+ NBL_REF_ARG (DataAccessor) data,
30+ NBL_REF_ARG (ScanSharedAccessor) scanScratch,
31+ NBL_REF_ARG (Sampler) _sampler,
32+ const uint16_t channel)
33+ {
34+ const uint16_t end = data.linearSize ();
35+ const uint16_t localInvocationIndex = workgroup::SubgroupContiguousIndex ();
36+
37+ // prefix sum
38+ // note the dynamically uniform loop condition
39+ for (uint16_t baseIx = 0 ; baseIx < end;)
40+ {
41+ const uint16_t ix = localInvocationIndex + baseIx;
42+ float32_t input = data.template get<float32_t>(channel, ix);
43+ // dynamically uniform condition
44+ if (baseIx != 0 )
45+ {
46+ // take result of previous prefix sum and add it to first element here
47+ if (localInvocationIndex == 0 )
48+ input += _sampler.prefixSumAccessor.template get<float32_t>(baseIx - 1 );
49+ }
50+ const float32_t sum = workgroup::inclusive_scan<plus<float32_t>, WorkgroupSize, device_capabilities>::template __call (input, scanScratch);
51+ // loop increment
52+ baseIx += WorkgroupSize;
53+ // save prefix sum results
54+ if (ix < end)
55+ _sampler.prefixSumAccessor.template set<float32_t>(ix, sum);
56+ // previous prefix sum must have finished before we ask for results
57+ _sampler.prefixSumAccessor.workgroupExecutionAndMemoryBarrier ();
58+ }
59+
60+ // TODO: split this Blur1D into two separate functors:
61+ // - multi-wg-wide prefix sum
62+ // - the SAT sampling
63+ const float32_t last = end - 1 ;
64+ for (float32_t ix = localInvocationIndex; ix < end; ix += WorkgroupSize)
65+ {
66+ const float32_t result = _sampler (ix, radius, borderColor[channel]);
67+ data.template set<float32_t>(channel, uint16_t (ix), result);
68+ }
69+ }
70+
71+ vector <float32_t, DataAccessor::Channels> borderColor;
72+ float32_t radius;
73+ };
74+
75+ }
76+ }
77+ }
78+
79+ #endif
0 commit comments