5151/* This is a thread implementation for Win32 lazy implementation */
5252
5353/* Thread server common information */
54- typedef struct {
55- CRITICAL_SECTION lock ;
56- HANDLE filled ;
57- HANDLE killed ;
5854
59- blas_queue_t * queue ; /* Parameter Pointer */
60- int shutdown ; /* server shutdown flag */
61-
62- } blas_pool_t ;
55+ static blas_queue_t * work_queue = NULL ; /* head of the pending work list; workers pop from it while holding queue_lock */
56+ static HANDLE kickoff_event = NULL ; /* event the worker threads wait on; set after work is enqueued and when the thread count is reduced */
57+ static CRITICAL_SECTION queue_lock ; /* serializes access to work_queue */
6358
6459/* We need this global for checking if initialization is finished. */
6560int blas_server_avail = 0 ;
6661
6762/* Local Variables */
6863static BLASULONG server_lock = 0 ;
6964
70- static blas_pool_t pool ;
7165static HANDLE blas_threads [MAX_CPU_NUMBER ];
7266static DWORD blas_threads_id [MAX_CPU_NUMBER ];
67+ static volatile int thread_target ; // target num of live threads, volatile for cross-thread reads
7368
74-
69+ #if defined (__GNUC__ ) && (__GNUC__ < 6 ) /* GCC major version below 6: use the __sync builtin instead of the Interlocked API */
70+ #define WIN_CAS (dest , exch , comp ) __sync_val_compare_and_swap(dest, comp, exch) /* builtin order is (ptr, expected, new), hence comp before exch */
71+ #else
72+ #if defined(_WIN64 )
73+ #define WIN_CAS (dest , exch , comp ) InterlockedCompareExchange64(dest, exch, comp) /* 64-bit variant for Win64 builds */
74+ #else
75+ #define WIN_CAS (dest , exch , comp ) InterlockedCompareExchange(dest, exch, comp) /* 32-bit variant */
76+ #endif
77+ #endif
7578
7679static void legacy_exec (void * func , int mode , blas_arg_t * args , void * sb ){
7780
@@ -202,14 +205,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
202205static DWORD WINAPI blas_thread_server (void * arg ){
203206
204207 /* Thread identifier */
205- #ifdef SMP_DEBUG
206208 BLASLONG cpu = (BLASLONG )arg ;
207- #endif
208209
209210 void * buffer , * sa , * sb ;
210211 blas_queue_t * queue ;
211- DWORD action ;
212- HANDLE handles [] = {pool .filled , pool .killed };
213212
214213 /* Each server needs each buffer */
215214 buffer = blas_memory_alloc (2 );
@@ -225,29 +224,44 @@ static DWORD WINAPI blas_thread_server(void *arg){
225224#ifdef SMP_DEBUG
226225 fprintf (STDERR , "Server[%2ld] Waiting for Queue.\n" , cpu );
227226#endif
227+ // event raised when work is added to the queue
228+ WaitForSingleObject (kickoff_event , INFINITE );
228229
229- do {
230- action = WaitForMultipleObjects ( 2 , handles , FALSE, INFINITE );
231- } while (( action != WAIT_OBJECT_0 ) && ( action != WAIT_OBJECT_0 + 1 ) );
232-
233- if ( action == WAIT_OBJECT_0 + 1 ) break ;
230+ if ( cpu > thread_target - 2 )
231+ {
232+ //printf("thread [%d] exiting.\n", cpu );
233+ break ; // excess thread, so worker thread exits
234+ }
234235
235236#ifdef SMP_DEBUG
236237 fprintf (STDERR , "Server[%2ld] Got it.\n" , cpu );
237238#endif
238239
239- EnterCriticalSection (& pool .lock );
240+ #if 1
241+ EnterCriticalSection (& queue_lock );
242+
243+ queue = work_queue ;
244+ if (queue )
245+ work_queue = work_queue -> next ;
246+
247+ LeaveCriticalSection (& queue_lock );
248+ #else
249+ volatile blas_queue_t * queue_next ;
240250
241- queue = pool .queue ;
242- if (queue ) pool .queue = queue -> next ;
251+ INT_PTR prev_value ;
252+ do {
253+ queue = (volatile blas_queue_t * )work_queue ;
254+ if (!queue )
255+ break ;
243256
244- LeaveCriticalSection (& pool .lock );
257+ queue_next = (volatile blas_queue_t * )queue -> next ;
258+ prev_value = WIN_CAS ((INT_PTR * )& work_queue , (INT_PTR )queue_next , (INT_PTR )queue );
259+ } while (prev_value != queue );
260+ #endif
245261
246262 if (queue ) {
247263 int (* routine )(blas_arg_t * , void * , void * , void * , void * , BLASLONG ) = queue -> routine ;
248264
249- if (pool .queue ) SetEvent (pool .filled );
250-
251265 sa = queue -> sa ;
252266 sb = queue -> sb ;
253267
@@ -331,14 +345,9 @@ static DWORD WINAPI blas_thread_server(void *arg){
331345#ifdef SMP_DEBUG
332346 fprintf (STDERR , "Server[%2ld] Finished!\n" , cpu );
333347#endif
348+
349+ queue -> finished = 1 ;
334350
335- EnterCriticalSection (& queue -> lock );
336-
337- queue -> status = BLAS_STATUS_FINISHED ;
338-
339- LeaveCriticalSection (& queue -> lock );
340-
341- SetEvent (queue -> finish );
342351 }
343352
344353 /* Shutdown procedure */
@@ -366,15 +375,16 @@ int blas_thread_init(void){
366375#endif
367376
368377 if (!blas_server_avail ){
378+ // create the kickoff Event
379+ kickoff_event = CreateEvent (NULL , TRUE, FALSE, NULL );
369380
370- InitializeCriticalSection (& pool .lock );
371- pool .filled = CreateEvent (NULL , FALSE, FALSE, NULL );
372- pool .killed = CreateEvent (NULL , TRUE, FALSE, NULL );
381+ thread_target = blas_cpu_number ;
373382
374- pool .shutdown = 0 ;
375- pool .queue = NULL ;
383+ InitializeCriticalSection (& queue_lock );
376384
377385 for (i = 0 ; i < blas_cpu_number - 1 ; i ++ ){
386+ //printf("thread_init: creating thread [%d]\n", i);
387+
378388 blas_threads [i ] = CreateThread (NULL , 0 ,
379389 blas_thread_server , (void * )i ,
380390 0 , & blas_threads_id [i ]);
@@ -409,32 +419,39 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
409419 current = queue ;
410420
411421 while (current ) {
412- InitializeCriticalSection (& current -> lock );
413- current -> finish = CreateEvent (NULL , FALSE, FALSE, NULL );
414422 current -> position = pos ;
415423
416424#ifdef CONSISTENT_FPCSR
417425 __asm__ __volatile__ ("fnstcw %0" : "=m" (current -> x87_mode ));
418426 __asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode ));
419427#endif
420428
429+ current -> finished = 0 ;
421430 current = current -> next ;
422431 pos ++ ;
423432 }
424433
425- EnterCriticalSection (& pool .lock );
434+ EnterCriticalSection (& queue_lock );
435+
436+ if (!work_queue )
437+ {
438+ work_queue = queue ; // queue is empty: the new batch becomes the head
439+ }
440+ else
441+ {
442+ blas_queue_t * next_item = work_queue ;
443+
444+ // find the last node of the work queue (stop ON it, not one past it)
445+ while (next_item -> next )
446+ next_item = next_item -> next ;
426447
427- if (pool .queue ) {
428- current = pool .queue ;
429- while (current -> next ) current = current -> next ;
430- current -> next = queue ;
431- } else {
432- pool .queue = queue ;
448+ // link new work after the last node; assigning only the local pointer would drop it
449+ next_item -> next = queue ;
433450 }
434451
435- LeaveCriticalSection (& pool . lock );
452+ LeaveCriticalSection (& queue_lock );
436453
437- SetEvent (pool . filled );
454+ SetEvent (kickoff_event );
438455
439456 return 0 ;
440457}
@@ -449,21 +466,26 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
449466#ifdef SMP_DEBUG
450467 fprintf (STDERR , "Waiting Queue ..\n" );
451468#endif
469+ while (!queue -> finished )
470+ YIELDING ;
452471
453- WaitForSingleObject (queue -> finish , INFINITE );
454-
455- CloseHandle (queue -> finish );
456- DeleteCriticalSection (& queue -> lock );
457-
458- queue = queue -> next ;
459- num -- ;
472+ queue = queue -> next ;
473+ num -- ;
460474 }
461475
462476#ifdef SMP_DEBUG
463477 fprintf (STDERR , "Completely Done.\n\n" );
464478#endif
479+ // if work was added to the queue after this batch we can't sleep the worker threads
480+ // by resetting the event
481+ EnterCriticalSection (& queue_lock );
465482
466- return 0 ;
483+ if (work_queue == NULL )
484+ ResetEvent (kickoff_event );
485+
486+ LeaveCriticalSection (& queue_lock );
487+
488+ return 0 ;
467489}
468490
469491/* Execute Threads */
@@ -512,8 +534,6 @@ int BLASFUNC(blas_thread_shutdown)(void){
512534
513535 if (blas_server_avail ){
514536
515- SetEvent (pool .killed );
516-
517537 for (i = 0 ; i < blas_num_threads - 1 ; i ++ ){
518538 // Could also just use WaitForMultipleObjects
519539 DWORD wait_thread_value = WaitForSingleObject (blas_threads [i ], 50 );
@@ -528,9 +548,6 @@ int BLASFUNC(blas_thread_shutdown)(void){
528548 CloseHandle (blas_threads [i ]);
529549 }
530550
531- CloseHandle (pool .filled );
532- CloseHandle (pool .killed );
533-
534551 blas_server_avail = 0 ;
535552 }
536553
@@ -552,23 +569,48 @@ void goto_set_num_threads(int num_threads)
552569
553570 if (num_threads > MAX_CPU_NUMBER ) num_threads = MAX_CPU_NUMBER ;
554571
572+ if (blas_server_avail && num_threads < blas_num_threads ) {
573+ LOCK_COMMAND (& server_lock );
574+
575+ thread_target = num_threads ;
576+
577+ SetEvent (kickoff_event );
578+
579+ for (i = num_threads - 1 ; i < blas_num_threads - 1 ; i ++ ) {
580+ //printf("set_num_threads: waiting on thread [%d] to quit.\n", i);
581+
582+ WaitForSingleObject (blas_threads [i ], INFINITE );
583+
584+ //printf("set_num_threads: thread [%d] has quit.\n", i);
585+
586+ CloseHandle (blas_threads [i ]);
587+ }
588+
589+ blas_num_threads = num_threads ;
590+
591+ ResetEvent (kickoff_event );
592+
593+ UNLOCK_COMMAND (& server_lock );
594+ }
595+
555596 if (num_threads > blas_num_threads ) {
556597
557598 LOCK_COMMAND (& server_lock );
558599
600+ thread_target = num_threads ;
601+
559602 //increased_threads = 1;
560603 if (!blas_server_avail ){
604+ // create the kickoff Event
605+ kickoff_event = CreateEvent (NULL , TRUE, FALSE, NULL );
561606
562- InitializeCriticalSection (& pool .lock );
563- pool .filled = CreateEvent (NULL , FALSE, FALSE, NULL );
564- pool .killed = CreateEvent (NULL , TRUE, FALSE, NULL );
607+ InitializeCriticalSection (& queue_lock );
565608
566- pool .shutdown = 0 ;
567- pool .queue = NULL ;
568609 blas_server_avail = 1 ;
569610 }
570611
571612 for (i = (blas_num_threads > 0 ) ? blas_num_threads - 1 : 0 ; i < num_threads - 1 ; i ++ ){
613+ //printf("set_num_threads: creating thread [%d]\n", i);
572614
573615 blas_threads [i ] = CreateThread (NULL , 0 ,
574616 blas_thread_server , (void * )i ,
0 commit comments