@@ -28,12 +28,10 @@ import (
 	"errors"
 	"fmt"
 	"os"
-	"runtime"
 	"sync"
 	"syscall"
 	"time"
 
-	"github.com/shirou/gopsutil/cpu"
 	"github.com/uber-go/tally"
 	"go.uber.org/zap"
 	"go.uber.org/zap/zapcore"
@@ -140,10 +138,11 @@ type (
 		logger       *zap.Logger
 		metricsScope tally.Scope
 
-		pollerRequestCh    chan struct{}
-		pollerAutoScaler   *pollerAutoScaler
-		taskQueueCh        chan interface{}
-		sessionTokenBucket *sessionTokenBucket
+		pollerRequestCh      chan struct{}
+		pollerAutoScaler     *pollerAutoScaler
+		workerUsageCollector *workerUsageCollector
+		taskQueueCh          chan interface{}
+		sessionTokenBucket   *sessionTokenBucket
 	}
 
 	polledTask struct {
@@ -173,17 +172,29 @@ func newBaseWorker(options baseWorkerOptions, logger *zap.Logger, metricsScope t
 			logger,
 		)
 	}
+	// For now the worker usage collector defaults to enabled.
+	var workerUC *workerUsageCollector
+	workerUC = newWorkerUsageCollector(
+		workerUsageCollectorOptions{
+			Enabled:      true,
+			Cooldown:     30 * time.Second,
+			Host:         options.host,
+			MetricsScope: metricsScope,
+		},
+		logger,
+	)
 
 	bw := &baseWorker{
-		options:          options,
-		shutdownCh:       make(chan struct{}),
-		taskLimiter:      rate.NewLimiter(rate.Limit(options.maxTaskPerSecond), 1),
-		retrier:          backoff.NewConcurrentRetrier(pollOperationRetryPolicy),
-		logger:           logger.With(zapcore.Field{Key: tagWorkerType, Type: zapcore.StringType, String: options.workerType}),
-		metricsScope:     tagScope(metricsScope, tagWorkerType, options.workerType),
-		pollerRequestCh:  make(chan struct{}, options.maxConcurrentTask),
-		pollerAutoScaler: pollerAS,
-		taskQueueCh:      make(chan interface{}), // no buffer, so poller only able to poll new task after previous is dispatched.
+		options:              options,
+		shutdownCh:           make(chan struct{}),
+		taskLimiter:          rate.NewLimiter(rate.Limit(options.maxTaskPerSecond), 1),
+		retrier:              backoff.NewConcurrentRetrier(pollOperationRetryPolicy),
+		logger:               logger.With(zapcore.Field{Key: tagWorkerType, Type: zapcore.StringType, String: options.workerType}),
+		metricsScope:         tagScope(metricsScope, tagWorkerType, options.workerType),
+		pollerRequestCh:      make(chan struct{}, options.maxConcurrentTask),
+		pollerAutoScaler:     pollerAS,
+		workerUsageCollector: workerUC,
+		taskQueueCh:          make(chan interface{}), // no buffer, so poller only able to poll new task after previous is dispatched.
 
 		limiterContext:       ctx,
 		limiterContextCancel: cancel,
@@ -207,6 +218,10 @@ func (bw *baseWorker) Start() {
 		bw.pollerAutoScaler.Start()
 	}
 
+	if bw.workerUsageCollector != nil {
+		bw.workerUsageCollector.Start()
+	}
+
 	for i := 0; i < bw.options.pollerCount; i++ {
 		bw.shutdownWG.Add(1)
 		go bw.runPoller()
@@ -215,15 +230,6 @@ func (bw *baseWorker) Start() {
 	bw.shutdownWG.Add(1)
 	go bw.runTaskDispatcher()
 
-	// We want the emit function run once per host instead of run once per worker
-	//collectHardwareUsageOnce.Do(func() {
-	//	bw.shutdownWG.Add(1)
-	//	go bw.emitHardwareUsage()
-	//})
-
-	bw.shutdownWG.Add(1)
-	go bw.emitHardwareUsage()
-
 	bw.isWorkerStarted = true
 	traceLog(func() {
 		bw.logger.Info("Started Worker",
@@ -407,6 +413,9 @@ func (bw *baseWorker) Stop() {
 	if bw.pollerAutoScaler != nil {
 		bw.pollerAutoScaler.Stop()
 	}
+	if bw.workerUsageCollector != nil {
+		bw.workerUsageCollector.Stop()
+	}
 
 	if success := util.AwaitWaitGroup(&bw.shutdownWG, bw.options.shutdownTimeout); !success {
 		traceLog(func() {
@@ -420,53 +429,3 @@ func (bw *baseWorker) Stop() {
 	}
 	return
 }
-
-func (bw *baseWorker) emitHardwareUsage() {
-	defer func() {
-		if p := recover(); p != nil {
-			bw.metricsScope.Counter(metrics.WorkerPanicCounter).Inc(1)
-			topLine := fmt.Sprintf("base worker for %s [panic]:", bw.options.workerType)
-			st := getStackTraceRaw(topLine, 7, 0)
-			bw.logger.Error("Unhandled panic in hardware emitting.",
-				zap.String(tagPanicError, fmt.Sprintf("%v", p)),
-				zap.String(tagPanicStack, st))
-		}
-	}()
-	defer bw.shutdownWG.Done()
-	collectHardwareUsageOnce.Do(
-		func() {
-			ticker := time.NewTicker(hardwareMetricsCollectInterval)
-			for {
-				select {
-				case <-bw.shutdownCh:
-					ticker.Stop()
-					return
-				case <-ticker.C:
-					host := bw.options.host
-					scope := bw.metricsScope.Tagged(map[string]string{clientHostTag: host})
-
-					cpuPercent, err := cpu.Percent(0, false)
-					if err != nil {
-						bw.logger.Warn("Failed to get cpu percent", zap.Error(err))
-						return
-					}
-					cpuCores, err := cpu.Counts(false)
-					if err != nil {
-						bw.logger.Warn("Failed to get number of cpu cores", zap.Error(err))
-						return
-					}
-					scope.Gauge(metrics.NumCPUCores).Update(float64(cpuCores))
-					scope.Gauge(metrics.CPUPercentage).Update(cpuPercent[0])
-
-					var memStats runtime.MemStats
-					runtime.ReadMemStats(&memStats)
-
-					scope.Gauge(metrics.NumGoRoutines).Update(float64(runtime.NumGoroutine()))
-					scope.Gauge(metrics.TotalMemory).Update(float64(memStats.Sys))
-					scope.Gauge(metrics.MemoryUsedHeap).Update(float64(memStats.HeapInuse))
-					scope.Gauge(metrics.MemoryUsedStack).Update(float64(memStats.StackInuse))
-				}
-			}
-		})
-
-}
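
The workerUsageCollector type itself is not part of this diff; only its options, constructor, Start, and Stop are visible above. Below is a minimal sketch of what such a collector could look like, assuming it carries forward the hardware sampling that the removed emitHardwareUsage performed. The struct fields, gauge names, package name, and the nil return for a disabled collector are illustrative assumptions, not the PR's actual implementation.

// Sketch only: mirrors the constructor, option fields, Start, and Stop that
// the diff above calls. Everything else (internals, gauge names) is assumed.
package worker // hypothetical package name for this sketch

import (
	"runtime"
	"sync"
	"time"

	"github.com/shirou/gopsutil/cpu"
	"github.com/uber-go/tally"
	"go.uber.org/zap"
)

type workerUsageCollectorOptions struct {
	Enabled      bool
	Cooldown     time.Duration // assumed to be the interval between two samples
	Host         string
	MetricsScope tally.Scope
}

type workerUsageCollector struct {
	options  workerUsageCollectorOptions
	logger   *zap.Logger
	shutdown chan struct{}
	wg       sync.WaitGroup
}

func newWorkerUsageCollector(options workerUsageCollectorOptions, logger *zap.Logger) *workerUsageCollector {
	// Assumption: a disabled collector is represented as nil, which matches
	// the nil checks guarding Start and Stop in the diff above.
	if !options.Enabled {
		return nil
	}
	return &workerUsageCollector{
		options:  options,
		logger:   logger,
		shutdown: make(chan struct{}),
	}
}

// Start launches one background goroutine that samples hardware usage
// every Cooldown until Stop is called.
func (w *workerUsageCollector) Start() {
	w.wg.Add(1)
	go func() {
		defer w.wg.Done()
		ticker := time.NewTicker(w.options.Cooldown)
		defer ticker.Stop()
		scope := w.options.MetricsScope.Tagged(map[string]string{"host": w.options.Host})
		for {
			select {
			case <-w.shutdown:
				return
			case <-ticker.C:
				w.emit(scope)
			}
		}
	}()
}

// emit reports the same kind of data the removed emitHardwareUsage gathered:
// CPU usage and core count plus Go runtime memory and goroutine statistics.
// Gauge names here are placeholders, not the repository's metrics constants.
func (w *workerUsageCollector) emit(scope tally.Scope) {
	if cpuPercent, err := cpu.Percent(0, false); err == nil && len(cpuPercent) > 0 {
		scope.Gauge("cpu-percentage").Update(cpuPercent[0])
	} else if err != nil {
		w.logger.Warn("Failed to get cpu percent", zap.Error(err))
	}
	if cores, err := cpu.Counts(false); err == nil {
		scope.Gauge("num-cpu-cores").Update(float64(cores))
	}
	var memStats runtime.MemStats
	runtime.ReadMemStats(&memStats)
	scope.Gauge("num-go-routines").Update(float64(runtime.NumGoroutine()))
	scope.Gauge("total-memory").Update(float64(memStats.Sys))
	scope.Gauge("memory-used-heap").Update(float64(memStats.HeapInuse))
	scope.Gauge("memory-used-stack").Update(float64(memStats.StackInuse))
}

// Stop signals the collector goroutine and waits for it to exit.
func (w *workerUsageCollector) Stop() {
	close(w.shutdown)
	w.wg.Wait()
}

Returning nil from the constructor when the collector is disabled is one way to make the nil checks in Start and Stop do the right thing at no runtime cost; the actual PR may gate this differently, for example with an Enabled field checked inside Start.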