11#include "nbl/builtin/hlsl/cpp_compat.hlsl"
2- #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
3- #include "nbl/builtin/hlsl/workgroup/basic.hlsl"
4- #include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"
5- #include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl"
6- #include "nbl/builtin/hlsl/device_capabilities_traits.hlsl"
72#include "nbl/builtin/hlsl/enums.hlsl"
3+ #include "nbl/builtin/hlsl/macros.h"
4+
5+ #ifndef _NBL_BUILTIN_BOX_SAMPLER_INCLUDED_
6+ #define _NBL_BUILTIN_BOX_SAMPLER_INCLUDED_
87
98namespace nbl
109{
1110namespace hlsl
1211{
13- namespace box_blur
14- {
15-
16- template<
17- typename DataAccessor,
18- typename SharedAccessor,
19- typename ScanSharedAccessor,
20- typename Sampler,
21- uint16_t WorkgroupSize,
22- class device_capabilities=void > // TODO: define concepts for the Box1D and apply constraints
23- struct Box1D
12+ namespace prefix_sum_blur
2413{
25- // TODO: Generalize later on when Francesco enforces accessor-concepts in `workgroup` and adds a `SharedMemoryAccessor` concept
26- struct ScanSharedAccessorWrapper
27- {
28- void get (const uint16_t ix, NBL_REF_ARG (float32_t) val)
29- {
30- val = base.template get<float32_t, uint16_t>(ix);
31- }
32-
33- void set (const uint16_t ix, const float32_t val)
34- {
35- base.template set<float32_t, uint16_t>(ix, val);
36- }
37-
38- void workgroupExecutionAndMemoryBarrier ()
39- {
40- base.workgroupExecutionAndMemoryBarrier ();
41- }
42-
43- ScanSharedAccessor base;
44- };
45-
46- void operator ()(
47- NBL_REF_ARG (DataAccessor) data,
48- NBL_REF_ARG (SharedAccessor) scratch,
49- NBL_REF_ARG (ScanSharedAccessor) scanScratch,
50- NBL_REF_ARG (Sampler) boxSampler,
51- const uint16_t channel)
52- {
53- const uint16_t end = data.linearSize ();
54- const uint16_t localInvocationIndex = workgroup::SubgroupContiguousIndex ();
55-
56- // prefix sum
57- // note the dynamically uniform loop condition
58- for (uint16_t baseIx = 0 ; baseIx < end;)
59- {
60- const uint16_t ix = localInvocationIndex + baseIx;
61- float32_t input = data.template get<float32_t>(channel, ix);
62- // dynamically uniform condition
63- if (baseIx != 0 )
64- {
65- // take result of previous prefix sum and add it to first element here
66- if (localInvocationIndex == 0 )
67- input += scratch.template get<float32_t>(baseIx - 1 );
68- }
69- // need to copy-in / copy-out the accessor cause no references in HLSL - yay!
70- ScanSharedAccessorWrapper scanScratchWrapper;
71- scanScratchWrapper.base = scanScratch;
72- const float32_t sum = workgroup::inclusive_scan<plus<float32_t>, WorkgroupSize, device_capabilities>::template __call (input, scanScratchWrapper);
73- scanScratch = scanScratchWrapper.base;
74- // loop increment
75- baseIx += WorkgroupSize;
76- // if doing the last prefix sum, we need to barrier to stop aliasing of temporary scratch for `inclusive_scan` and our scanline
77- // TODO: might be worth adding a non-aliased mode as NSight says nr 1 hotspot is barrier waiting in this code
78- if (end + ScanSharedAccessor::Size > SharedAccessor::Size)
79- scratch.workgroupExecutionAndMemoryBarrier ();
80- // save prefix sum results
81- if (ix < end)
82- scratch.template set<float32_t>(ix, sum);
83- // previous prefix sum must have finished before we ask for results
84- scratch.workgroupExecutionAndMemoryBarrier ();
85- }
86-
87- const float32_t last = end - 1 ;
88- const float32_t normalizationFactor = 1.f / (2.f * radius + 1.f );
89-
90- for (float32_t ix = localInvocationIndex; ix < end; ix += WorkgroupSize)
91- {
92- const float32_t result = boxSampler (scratch, ix, radius, borderColor[channel]);
93- data.template set<float32_t>(channel, uint16_t (ix), result * normalizationFactor);
94- }
95- }
96-
97- vector <float32_t, DataAccessor::Channels> borderColor;
98- float32_t radius;
99- };
10014
10115template<typename PrefixSumAccessor, typename T>
10216struct BoxSampler
10317{
18+ using prefix_sum_accessor_t = PrefixSumAccessor;
19+
20+ PrefixSumAccessor prefixSumAccessor;
10421 uint16_t wrapMode;
10522 uint16_t linearSize;
23+ T normalizationFactor;
10624
107- T operator ()(NBL_REF_ARG (PrefixSumAccessor) prefixSumAccessor, float32_t ix, float32_t radius, float32_t borderColor)
25+ T operator ()(float32_t ix, float32_t radius, float32_t borderColor)
10826 {
109- const float32_t alpha = radius - floor (radius);
110- const float32_t lastIdx = linearSize - 1 ;
27+ const float32_t alpha = frac (radius);
11128 const float32_t rightIdx = float32_t (ix) + radius;
112- const float32_t leftIdx = float32_t (ix) - radius;
29+ const float32_t leftIdx = float32_t (ix) - radius - 1 ;
30+ const int32_t lastIdx = linearSize - 1 ;
11331 const int32_t rightFlIdx = (int32_t)floor (rightIdx);
11432 const int32_t rightClIdx = (int32_t)ceil (rightIdx);
11533 const int32_t leftFlIdx = (int32_t)floor (leftIdx);
11634 const int32_t leftClIdx = (int32_t)ceil (leftIdx);
11735
36+ assert (linearSize > 1 );
37+
11838 T result = 0 ;
119- if (rightFlIdx < linearSize)
39+ if (rightClIdx < linearSize)
12040 {
12141 result += lerp (prefixSumAccessor.template get<T, uint32_t>(rightFlIdx), prefixSumAccessor.template get<T, uint32_t>(rightClIdx), alpha);
12242 }
@@ -126,8 +46,8 @@ struct BoxSampler
12646 case ETC_REPEAT:
12747 {
12848 const T last = prefixSumAccessor.template get<T, uint32_t>(lastIdx);
129- const T floored = prefixSumAccessor.template get<T, uint32_t>(rightFlIdx % linearSize) + ceil ( float32_t (rightFlIdx % lastIdx) / linearSize) * last;
130- const T ceiled = prefixSumAccessor.template get<T, uint32_t>(rightClIdx % linearSize) + ceil ( float32_t (rightClIdx % lastIdx) / linearSize) * last;
49+ const T floored = prefixSumAccessor.template get<T, uint32_t>(rightFlIdx % linearSize) + last;
50+ const T ceiled = prefixSumAccessor.template get<T, uint32_t>(rightClIdx % linearSize) + last;
13151 result += lerp (floored, ceiled, alpha);
13252 break ;
13353 }
@@ -179,8 +99,7 @@ struct BoxSampler
17999 {
180100 const T last = prefixSumAccessor.template get<T, uint32_t>(lastIdx);
181101 const T first = prefixSumAccessor.template get<T, uint32_t>(0 );
182- const T firstPlusOne = prefixSumAccessor.template get<T, uint32_t>(1 );
183- result += (rightIdx - lastIdx) * (firstPlusOne - first) + last;
102+ result += (rightIdx - lastIdx) * first + last;
184103 break ;
185104 }
186105 }
@@ -196,19 +115,19 @@ struct BoxSampler
196115 case ETC_REPEAT:
197116 {
198117 const T last = prefixSumAccessor.template get<T, uint32_t>(lastIdx);
199- const T floored = prefixSumAccessor.template get<T, uint32_t>(abs ( leftFlIdx) % linearSize) + ceil (T (leftFlIdx) / linearSize) * last;
200- const T ceiled = prefixSumAccessor.template get<T, uint32_t>(abs ( leftClIdx) % linearSize) + ceil ( float32_t (leftClIdx) / linearSize) * last;
118+ const T floored = prefixSumAccessor.template get<T, uint32_t>((lastIdx + leftFlIdx) % linearSize) + floor (T (leftFlIdx) / linearSize) * last;
119+ const T ceiled = prefixSumAccessor.template get<T, uint32_t>((lastIdx + leftClIdx) % linearSize) + floor ( T (leftClIdx) / linearSize) * last;
201120 result -= lerp (floored, ceiled, alpha);
202121 break ;
203122 }
204123 case ETC_CLAMP_TO_BORDER:
205124 {
206- result -= prefixSumAccessor.template get<T, uint32_t>( 0 ) + leftIdx * borderColor;
125+ result -= (leftIdx + 1 ) * borderColor;
207126 break ;
208127 }
209128 case ETC_CLAMP_TO_EDGE:
210129 {
211- result -= leftIdx * prefixSumAccessor.template get<T, uint32_t>(0 );
130+ result -= ( 1 - abs ( leftIdx)) * prefixSumAccessor.template get<T, uint32_t>(0 );
212131 break ;
213132 }
214133 case ETC_MIRROR:
@@ -247,16 +166,18 @@ struct BoxSampler
247166 {
248167 const T last = prefixSumAccessor.template get<T, uint32_t>(lastIdx);
249168 const T lastMinusOne = prefixSumAccessor.template get<T, uint32_t>(lastIdx - 1 );
250- result -= leftIdx * (last - lastMinusOne);
169+ result -= ( 1 - abs ( leftIdx)) * (last - lastMinusOne);
251170 break ;
252171 }
253172 }
254173 }
255174
256- return result;
175+ return result * normalizationFactor ;
257176 }
258177};
259178
260179}
261180}
262- }
181+ }
182+
183+ #endif
0 commit comments