@@ -95,12 +95,232 @@ namespace
9595 }
9696}
9797
98- double cv::cuda::threshold (InputArray _src, OutputArray _dst, double thresh, double maxVal, int type, Stream& stream)
98+
99+ __global__ void otsu_sums (uint *histogram, uint *threshold_sums, unsigned long long *sums)
100+ {
101+ const uint32_t n_bins = 256 ;
102+
103+ __shared__ uint shared_memory_ts[n_bins];
104+ __shared__ unsigned long long shared_memory_s[n_bins];
105+
106+ int bin_idx = threadIdx .x ;
107+ int threshold = blockIdx .x ;
108+
109+ uint threshold_sum_above = 0 ;
110+ unsigned long long sum_above = 0 ;
111+
112+ if (bin_idx >= threshold)
113+ {
114+ uint value = histogram[bin_idx];
115+ threshold_sum_above = value;
116+ sum_above = value * bin_idx;
117+ }
118+
119+ blockReduce<n_bins>(shared_memory_ts, threshold_sum_above, bin_idx, plus<uint>());
120+ blockReduce<n_bins>(shared_memory_s, sum_above, bin_idx, plus<unsigned long long >());
121+
122+ if (bin_idx == 0 )
123+ {
124+ threshold_sums[threshold] = threshold_sum_above;
125+ sums[threshold] = sum_above;
126+ }
127+ }
128+
129+ __global__ void
130+ otsu_variance (float2 *variance, uint *histogram, uint *threshold_sums, unsigned long long *sums)
131+ {
132+ const uint32_t n_bins = 256 ;
133+
134+ __shared__ signed long long shared_memory_a[n_bins];
135+ __shared__ signed long long shared_memory_b[n_bins];
136+
137+ int bin_idx = threadIdx .x ;
138+ int threshold = blockIdx .x ;
139+
140+ uint n_samples = threshold_sums[0 ];
141+ uint n_samples_above = threshold_sums[threshold];
142+ uint n_samples_below = n_samples - n_samples_above;
143+
144+ unsigned long long total_sum = sums[0 ];
145+ unsigned long long sum_above = sums[threshold];
146+ unsigned long long sum_below = total_sum - sum_above;
147+
148+ float threshold_variance_above_f32 = 0 ;
149+ float threshold_variance_below_f32 = 0 ;
150+ if (bin_idx >= threshold)
151+ {
152+ float mean = (float ) sum_above / n_samples_above;
153+ float sigma = bin_idx - mean;
154+ threshold_variance_above_f32 = sigma * sigma;
155+ }
156+ else
157+ {
158+ float mean = (float ) sum_below / n_samples_below;
159+ float sigma = bin_idx - mean;
160+ threshold_variance_below_f32 = sigma * sigma;
161+ }
162+
163+ uint bin_count = histogram[bin_idx];
164+ signed long long threshold_variance_above_i64 = (signed long long )(threshold_variance_above_f32 * bin_count);
165+ signed long long threshold_variance_below_i64 = (signed long long )(threshold_variance_below_f32 * bin_count);
166+ blockReduce<n_bins>(shared_memory_a, threshold_variance_above_i64, bin_idx, plus<signed long long >());
167+ blockReduce<n_bins>(shared_memory_b, threshold_variance_below_i64, bin_idx, plus<signed long long >());
168+
169+ if (bin_idx == 0 )
170+ {
171+ variance[threshold] = make_float2 (threshold_variance_above_i64, threshold_variance_below_i64);
172+ }
173+ }
174+
175+
176+ __global__ void
177+ otsu_score (uint *otsu_threshold, uint *threshold_sums, float2 *variance)
178+ {
179+ const uint32_t n_thresholds = 256 ;
180+
181+ __shared__ float shared_memory[n_thresholds / WARP_SIZE];
182+
183+ int threshold = threadIdx .x ;
184+
185+ uint n_samples = threshold_sums[0 ];
186+ uint n_samples_above = threshold_sums[threshold];
187+ uint n_samples_below = n_samples - n_samples_above;
188+
189+ float threshold_mean_above = (float )n_samples_above / n_samples;
190+ float threshold_mean_below = (float )n_samples_below / n_samples;
191+
192+ float2 variances = variance[threshold];
193+ float variance_above = variances.x / n_samples_above;
194+ float variance_below = variances.y / n_samples_below;
195+
196+ float above = threshold_mean_above * variance_above;
197+ float below = threshold_mean_below * variance_below;
198+ float score = above + below;
199+
200+ float original_score = score;
201+
202+ blockReduce<n_thresholds>(shared_memory, score, threshold, minimum<float >());
203+
204+ if (threshold == 0 )
205+ {
206+ shared_memory[0 ] = score;
207+ }
208+ __syncthreads ();
209+
210+ score = shared_memory[0 ];
211+
212+ // We found the minimum score, but we need to find the threshold. If we find the thread with the minimum score, we
213+ // know which threshold it is
214+ if (original_score == score)
215+ {
216+ *otsu_threshold = threshold - 1 ;
217+ }
218+ }
219+
220+ void compute_otsu (uint *histogram, uint *otsu_threshold, Stream &stream)
221+ {
222+ const uint n_bins = 256 ;
223+ const uint n_thresholds = 256 ;
224+
225+ cudaStream_t cuda_stream = StreamAccessor::getStream (stream);
226+
227+ dim3 block_all (n_bins);
228+ dim3 grid_all (n_thresholds);
229+ dim3 block_score (n_thresholds);
230+ dim3 grid_score (1 );
231+
232+ BufferPool pool (stream);
233+ GpuMat gpu_threshold_sums (1 , n_bins, CV_32SC1, pool.getAllocator ());
234+ GpuMat gpu_sums (1 , n_bins, CV_64FC1, pool.getAllocator ());
235+ GpuMat gpu_variances (1 , n_bins, CV_32FC2, pool.getAllocator ());
236+
237+ otsu_sums<<<grid_all, block_all, 0 , cuda_stream>>> (
238+ histogram, gpu_threshold_sums.ptr <uint>(), gpu_sums.ptr <unsigned long long >());
239+ otsu_variance<<<grid_all, block_all, 0 , cuda_stream>>> (
240+ gpu_variances.ptr <float2 >(), histogram, gpu_threshold_sums.ptr <uint>(), gpu_sums.ptr <unsigned long long >());
241+ otsu_score<<<grid_score, block_score, 0 , cuda_stream>>> (
242+ otsu_threshold, gpu_threshold_sums.ptr <uint>(), gpu_variances.ptr <float2 >());
243+ }
244+
245+ // TODO: Replace this is cv::cuda::calcHist
246+ template <uint n_bins>
247+ __global__ void histogram_kernel (
248+ uint *histogram, const uint8_t *image, uint width,
249+ uint height, uint pitch)
250+ {
251+ __shared__ uint local_histogram[n_bins];
252+
253+ uint x = blockIdx .x * blockDim .x + threadIdx .x ;
254+ uint y = blockIdx .y * blockDim .y + threadIdx .y ;
255+ uint tid = threadIdx .y * blockDim .x + threadIdx .x ;
256+
257+ if (tid < n_bins)
258+ {
259+ local_histogram[tid] = 0 ;
260+ }
261+
262+ __syncthreads ();
263+
264+ if (x < width && y < height)
265+ {
266+ uint8_t value = image[y * pitch + x];
267+ atomicInc (&local_histogram[value], 0xFFFFFFFF );
268+ }
269+
270+ __syncthreads ();
271+
272+ if (tid < n_bins)
273+ {
274+ cv::cudev::atomicAdd (&histogram[tid], local_histogram[tid]);
275+ }
276+ }
277+
278+ // TODO: Replace this with cv::cuda::calcHist
279+ void calcHist (
280+ const GpuMat src, GpuMat histogram, Stream stream)
281+ {
282+ const uint n_bins = 256 ;
283+
284+ cudaStream_t cuda_stream = StreamAccessor::getStream (stream);
285+
286+ dim3 block (128 , 4 , 1 );
287+ dim3 grid = dim3 (divUp (src.cols , block.x ), divUp (src.rows , block.y ), 1 );
288+ CV_CUDEV_SAFE_CALL (cudaMemsetAsync (histogram.ptr <uint>(), 0 , n_bins * sizeof (uint), cuda_stream));
289+ histogram_kernel<n_bins>
290+ <<<grid, block, 0 , cuda_stream>>> (
291+ histogram.ptr <uint>(), src.ptr <uint8_t >(), (uint) src.cols , (uint) src.rows , (uint) src.step );
292+ }
293+
294+ double cv::cuda::threshold (InputArray _src, OutputArray _dst, double thresh, double maxVal, int type, Stream &stream)
99295{
100296 GpuMat src = getInputMat (_src, stream);
101297
102298 const int depth = src.depth ();
103299
300+ const int THRESH_OTSU = 8 ;
301+ if ((type & THRESH_OTSU) == THRESH_OTSU)
302+ {
303+ CV_Assert (depth == CV_8U);
304+ CV_Assert (src.channels () == 1 );
305+
306+ BufferPool pool (stream);
307+
308+ // Find the threshold using Otsu and then run the normal thresholding algorithm
309+ GpuMat gpu_histogram (256 , 1 , CV_32SC1, pool.getAllocator ());
310+ calcHist (src, gpu_histogram, stream);
311+
312+ GpuMat gpu_otsu_threshold (1 , 1 , CV_32SC1, pool.getAllocator ());
313+ compute_otsu (gpu_histogram.ptr <uint>(), gpu_otsu_threshold.ptr <uint>(), stream);
314+
315+ cv::Mat mat_otsu_threshold;
316+ gpu_otsu_threshold.download (mat_otsu_threshold, stream);
317+ stream.waitForCompletion ();
318+
319+ // Overwrite the threshold value with the Otsu value and remove the Otsu flag from the type
320+ type = type & ~THRESH_OTSU;
321+ thresh = (double ) mat_otsu_threshold.at <int >(0 );
322+ }
323+
104324 CV_Assert ( depth <= CV_64F );
105325 CV_Assert ( type <= 4 /* THRESH_TOZERO_INV*/ );
106326
0 commit comments