@@ -141,6 +141,151 @@ struct geom_meter {
141141 float_t sampleCount;
142142 float_t2 lumaMinMax;
143143};
144+
/**
 * Luma meter that accumulates a binned, fixed-point luma histogram per workgroup,
 * prefix-sums it and merges it into a global histogram (`sampleLuma`), then reads
 * two entries of that global histogram back and averages them (`gatherLuma`).
 *
 * Template parameters:
 *  - GroupSize:         number of invocations in the workgroup.
 *  - BinCount:          number of histogram bins.
 *  - HistogramAccessor: accessor for the global histogram (`get(ix)`, `atomicAdd(ix, v)`).
 *  - SharedAccessor:    accessor for workgroup-shared storage (`get`, `set`, `atomicAdd`,
 *                       `workgroupExecutionAndMemoryBarrier`); its `type` is the integer
 *                       type used for the fixed-point values.
 *  - TexAccessor:       accessor providing `get(uv)` and a static `toXYZ(color)`.
 */
template<uint32_t GroupSize, uint16_t BinCount, typename HistogramAccessor, typename SharedAccessor, typename TexAccessor>
struct median_meter {
    using int_t = typename SharedAccessor::type;
    using float_t = float32_t;
    using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type;
    using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
    using this_t = median_meter<GroupSize, BinCount, HistogramAccessor, SharedAccessor, TexAccessor>;

    /**
     * Builds a meter.
     * @param lumaMinMax  x = lowest, y = highest luma the meter distinguishes; sampled luma is clamped to it.
     * @param sampleCount stored as-is; no method of this struct currently reads it.
     */
    static this_t create(float_t2 lumaMinMax, float_t sampleCount) {
        this_t retval;
        retval.lumaMinMax = lumaMinMax;
        retval.sampleCount = sampleCount;
        return retval;
    }

    /**
     * Workgroup-wide inclusive prefix sum (operator `plus<int_t>`) of `value`.
     * Must be called by every invocation of the workgroup. `sdata` is handed to the
     * workgroup scan (presumably as scratch storage - confirm against workgroup::inclusive_scan).
     *
     * The value is taken as `int_t` so that fixed-point histogram entries are not
     * round-tripped through a 32-bit float (which cannot represent every integer above 2^24).
     */
    int_t inclusive_scan(int_t value, NBL_REF_ARG(SharedAccessor) sdata) {
        return workgroup::inclusive_scan<plus<int_t>, GroupSize>::template __call<SharedAccessor>(value, sdata);
    }

    /**
     * Samples the texture at `shiftedCoord` remapped by the metering window
     * (scale, then offset) and returns its luma clamped to [lumaMinMax.x, lumaMinMax.y].
     */
    float_t computeLuma(
        NBL_CONST_REF_ARG(MeteringWindow) window,
        NBL_REF_ARG(TexAccessor) tex,
        float_t2 shiftedCoord
    ) {
        const float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
        const float_t3 color = tex.get(uvPos);
        // presumably toXYZ yields the luminance (Y) of the colour - confirm against TexAccessor
        const float_t luma = (float_t)TexAccessor::toXYZ(color);

        return clamp(luma, lumaMinMax.x, lumaMinMax.y);
    }

    /**
     * Quantises `val` to the integer representation stored in the histograms:
     * `(val - minLog2) * rangeLog2`, clamped to [0, 2^fixedPointBitsLeft - 1].
     * `int2Float` is the inverse mapping (ignoring truncation and clamping).
     *
     * NOTE(review): callers in this struct pass linear `lumaMinMax.x` and
     * `lumaMinMax.y - lumaMinMax.x`, not log2 quantities as the parameter names suggest.
     * NOTE(review): `fixedPointBitsLeft` is evaluated as `32 - ceil(log2(workgroups)) + subgroupSizeLog2`,
     * which can be >= 32 and makes the shift below exceed the bit width - confirm the intended
     * grouping/sign of the subgroup term before relying on the upper clamp.
     */
    int_t float2Int(
        float_t val,
        float_t minLog2,
        float_t rangeLog2
    ) {
        uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
        uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();

        return int_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
    }

    /**
     * Inverse of `float2Int`: maps a stored integer back to `val / rangeLog2 + minLog2`.
     */
    float_t int2Float(
        int_t val,
        float_t minLog2,
        float_t rangeLog2
    ) {
        return val / rangeLog2 + minLog2;
    }

    /**
     * First pass; must be executed by the whole workgroup (it contains workgroup barriers).
     *
     * Each invocation samples one texel (Morton-decoded from its index, offset by `tileOffset`,
     * normalised by `viewportSize`), adds its quantised luma to the matching bin of a shared
     * histogram, then the workgroup prefix-sums that histogram in chunks of `GroupSize` bins
     * and atomically adds the result to `histo`.
     *
     * @param window       metering window used to remap the sample coordinate.
     * @param histo        global histogram that receives the prefix-summed bins.
     * @param tex          texture accessor to sample from.
     * @param sdata        shared storage; the first `BinCount` entries hold the workgroup histogram.
     * @param tileOffset   offset (in the same units as the decoded coordinate) of this workgroup's tile.
     * @param viewportSize divisor turning the offset coordinate into the normalised coordinate.
     */
    void sampleLuma(
        NBL_CONST_REF_ARG(MeteringWindow) window,
        NBL_REF_ARG(HistogramAccessor) histo,
        NBL_REF_ARG(TexAccessor) tex,
        NBL_REF_ARG(SharedAccessor) sdata,
        float_t2 tileOffset,
        float_t2 viewportSize
    ) {
        const uint32_t tid = workgroup::SubgroupContiguousIndex();
        const float_t lumaRange = lumaMinMax.y - lumaMinMax.x;

        // zero the shared histogram
        for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) {
            sdata.set(vid, 0);
        }

        sdata.workgroupExecutionAndMemoryBarrier();

        uint32_t2 coord = {
            morton2d_decode_x(tid),
            morton2d_decode_y(tid)
        };

        const float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
        const float_t luma = computeLuma(window, tex, shiftedCoord);

        // luma is clamped to an inclusive range, so luma == lumaMinMax.y would map to
        // bin `BinCount`; keep it in the last valid bin instead of writing out of bounds
        const float_t binSize = lumaRange / BinCount;
        const uint32_t lastBin = uint32_t(BinCount) - 1u;
        const uint32_t binIndex = min(uint32_t((luma - lumaMinMax.x) / binSize), lastBin);

        // NOTE(review): the bin accumulates the quantised luma value, not a sample count - confirm intent
        sdata.atomicAdd(binIndex, float2Int(luma, lumaMinMax.x, lumaRange));

        sdata.workgroupExecutionAndMemoryBarrier();

        // invocations past the last bin contribute zero to the scan
        int_t binValue = int_t(0);
        if (tid < BinCount) {
            sdata.get(tid, binValue);
        }

        // every invocation must have read its bin before the scan is given `sdata`
        sdata.workgroupExecutionAndMemoryBarrier();

        int_t sum = inclusive_scan(binValue, sdata);
        // NOTE(review): `sum` is already in the fixed-point domain yet is quantised again
        // with float2Int before being stored (data flow kept from the original) - confirm intent
        if (tid < BinCount) {
            histo.atomicAdd(tid, float2Int(float_t(sum), lumaMinMax.x, lumaRange));
        }

        const bool is_last_wg_invocation = tid == (GroupSize - 1);
        // number of GroupSize-wide chunks needed to cover all bins
        const static uint32_t RoundedBinCount = 1 + (BinCount - 1) / GroupSize;

        for (uint32_t i = 1; i < RoundedBinCount; i++) {
            const uint32_t keyBucketStart = GroupSize * i;
            const uint32_t vid = tid + keyBucketStart;

            // the last invocation holds the running total of the previous chunk;
            // fold it into the head of this chunk so the scan continues from it
            if (is_last_wg_invocation) {
                int_t beforeSum;
                sdata.get(keyBucketStart, beforeSum);
                sdata.set(keyBucketStart, beforeSum + sum);
            }

            // make the carried total visible and keep consecutive scans from overlapping
            sdata.workgroupExecutionAndMemoryBarrier();

            // the final chunk may be partial: never touch entries at or beyond BinCount,
            // but still take part in the scan, which needs the whole workgroup
            int_t atVid = int_t(0);
            if (vid < BinCount) {
                sdata.get(vid, atVid);
            }
            sum = inclusive_scan(atVid, sdata);
            if (vid < BinCount) {
                histo.atomicAdd(vid, float2Int(float_t(sum), lumaMinMax.x, lumaRange));
            }
        }
    }

    /**
     * Second pass; must be executed by the whole workgroup (it contains a workgroup barrier).
     *
     * Copies the global histogram into `sdata`, then returns the average of the values
     * stored at the bins located 40% and 60% of the way through the histogram, each
     * converted back with `int2Float`.
     *
     * NOTE(review): the two entries are selected by bin index, not by searching for the
     * bins where the cumulative total crosses 40%/60% of `sampleCount` - confirm intent.
     */
    float_t gatherLuma(
        NBL_REF_ARG(HistogramAccessor) histo,
        NBL_REF_ARG(SharedAccessor) sdata
    ) {
        const uint32_t tid = workgroup::SubgroupContiguousIndex();
        const float_t lumaRange = lumaMinMax.y - lumaMinMax.x;

        // vid is already < BinCount, so index directly; masking with (BinCount - 1)
        // would alias bins whenever BinCount is not a power of two
        for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) {
            sdata.set(vid, histo.get(vid));
        }

        sdata.workgroupExecutionAndMemoryBarrier();

        // floor(BinCount * 0.4) and floor(BinCount * 0.6) in exact integer arithmetic
        const uint32_t bin40 = (uint32_t(BinCount) * 2u) / 5u;
        const uint32_t bin60 = (uint32_t(BinCount) * 3u) / 5u;

        int_t percentile40, percentile60;
        sdata.get(bin40, percentile40);
        sdata.get(bin60, percentile60);

        return (int2Float(percentile40, lumaMinMax.x, lumaRange) + int2Float(percentile60, lumaMinMax.x, lumaRange)) / 2;
    }

    float_t sampleCount; // set by create(); not read by any method above
    float_t2 lumaMinMax; // x = minimum luma, y = maximum luma
};
288+
144289}
145290}
146291}
0 commit comments