@@ -383,6 +383,7 @@ private async Task RunAsync(Pipelines.AgentJobRequestMessage message, WorkerDisp
383383
384384 var jobRequestCancellationToken = newJobDispatch . WorkerCancellationTokenSource . Token ;
385385 var workerCancelTimeoutKillToken = newJobDispatch . WorkerCancelTimeoutKillTokenSource . Token ;
386+ var workerFlushLogsTimeoutToken = newJobDispatch . WorkerFlushLogsTimeoutTokenSource . Token ;
386387 var term = HostContext . GetService < ITerminal > ( ) ;
387388 term . WriteLine ( StringUtil . Loc ( "RunningJob" , DateTime . UtcNow , message . JobDisplayName ) ) ;
388389
@@ -450,6 +451,7 @@ private async Task RunAsync(Pipelines.AgentJobRequestMessage message, WorkerDisp
450451 var featureFlagProvider = HostContext . GetService < IFeatureFlagProvider > ( ) ;
451452 var newMaskerAndRegexesFeatureFlagStatus = await featureFlagProvider . GetFeatureFlagAsync ( HostContext , "DistributedTask.Agent.EnableNewMaskerAndRegexes" , Trace ) ;
452453 var enhancedLoggingFlag = await featureFlagProvider . GetFeatureFlagAsync ( HostContext , "DistributedTask.Agent.UseEnhancedLogging" , Trace ) ;
454+
453455 var environment = new Dictionary < string , string > ( ) ;
454456 if ( newMaskerAndRegexesFeatureFlagStatus ? . EffectiveState == "On" )
455457 {
@@ -734,24 +736,19 @@ await processChannel.SendAsync(
734736 }
735737
736738 Trace . Info ( $ "Waiting for worker to exit gracefully for job: { message . JobId } ") ;
737- // wait worker to exit
738- // if worker doesn't exit within timeout, then kill worker.
739- var exitTask = await Task . WhenAny ( workerProcessTask , Task . Delay ( - 1 , workerCancelTimeoutKillToken ) ) ;
740739
741- // worker haven't exit within cancellation timeout.
742- if ( exitTask != workerProcessTask )
740+ // Wait for worker to complete within the original timeout
741+ var gracefulExitTask = await Task . WhenAny ( workerProcessTask , Task . Delay ( - 1 , workerFlushLogsTimeoutToken ) ) ;
742+
743+ if ( gracefulExitTask != workerProcessTask )
743744 {
744- Trace . Info ( $ "worker process for job { message . JobId } haven't exit within cancellation timout, kill running worker.") ;
745- workerProcessCancelTokenSource . Cancel ( ) ;
746- try
747- {
748- await workerProcessTask ;
749- Trace . Info ( "Worker process forceful termination completed" ) ;
750- }
751- catch ( OperationCanceledException )
752- {
753- Trace . Info ( "worker process has been killed." ) ;
754- }
745+ // Original timeout expired, handle with timeout log flushing if enabled
746+ await HandleWorkerTimeoutAsync (
747+ message . JobId ,
748+ processChannel ,
749+ workerProcessTask ,
750+ workerProcessCancelTokenSource ,
751+ workerCancelTimeoutKillToken ) ;
755752 }
756753 else
757754 {
@@ -1070,6 +1067,7 @@ private class WorkerDispatcher : IDisposable
10701067 public TaskCompletionSource < JobMetadataMessage > MetadataSource { get ; set ; }
10711068 public CancellationTokenSource WorkerCancellationTokenSource { get ; private set ; }
10721069 public CancellationTokenSource WorkerCancelTimeoutKillTokenSource { get ; private set ; }
1070+ public CancellationTokenSource WorkerFlushLogsTimeoutTokenSource { get ; private set ; }
10731071 private readonly object _lock = new object ( ) ;
10741072
10751073 const int maxValueInMinutes = 35790 ; // 35790 * 60 * 1000 = 2147400000
@@ -1080,18 +1078,19 @@ public WorkerDispatcher(Guid jobId, long requestId)
10801078 {
10811079 JobId = jobId ;
10821080 RequestId = requestId ;
1083- WorkerCancelTimeoutKillTokenSource = new CancellationTokenSource ( ) ;
10841081 WorkerCancellationTokenSource = new CancellationTokenSource ( ) ;
1082+ WorkerCancelTimeoutKillTokenSource = new CancellationTokenSource ( ) ;
1083+ WorkerFlushLogsTimeoutTokenSource = new CancellationTokenSource ( ) ;
10851084 MetadataSource = new TaskCompletionSource < JobMetadataMessage > ( ) ;
10861085 }
10871086
10881087 public bool Cancel ( TimeSpan timeout )
10891088 {
1090- if ( WorkerCancellationTokenSource != null && WorkerCancelTimeoutKillTokenSource != null )
1089+ if ( WorkerCancellationTokenSource != null && WorkerCancelTimeoutKillTokenSource != null && WorkerFlushLogsTimeoutTokenSource != null )
10911090 {
10921091 lock ( _lock )
10931092 {
1094- if ( WorkerCancellationTokenSource != null && WorkerCancelTimeoutKillTokenSource != null )
1093+ if ( WorkerCancellationTokenSource != null && WorkerCancelTimeoutKillTokenSource != null && WorkerFlushLogsTimeoutTokenSource != null )
10951094 {
10961095 WorkerCancellationTokenSource . Cancel ( ) ;
10971096
@@ -1107,7 +1106,12 @@ public bool Cancel(TimeSpan timeout)
11071106 timeout = TimeSpan . FromMinutes ( maxValueInMinutes ) ;
11081107 }
11091108
1110- WorkerCancelTimeoutKillTokenSource . CancelAfter ( timeout . Subtract ( TimeSpan . FromSeconds ( 15 ) ) ) ;
1109+ // Fire the flush-logs token 15 seconds before the cancel timeout elapses (the deadline the kill token used previously); no flush signal is sent before that point
1110+ WorkerFlushLogsTimeoutTokenSource . CancelAfter ( timeout . Subtract ( TimeSpan . FromSeconds ( 15 ) ) ) ;
1111+
1112+ // Set kill timeout to original timeout + 1 minute for log flushing
1113+ TimeSpan killTimeout = timeout . Add ( TimeSpan . FromMinutes ( 1 ) ) ;
1114+ WorkerCancelTimeoutKillTokenSource . CancelAfter ( killTimeout ) ;
11111115 return true ;
11121116 }
11131117 }
@@ -1139,7 +1143,7 @@ private void Dispose(bool disposing)
11391143 {
11401144 if ( disposing )
11411145 {
1142- if ( WorkerCancellationTokenSource != null || WorkerCancelTimeoutKillTokenSource != null )
1146+ if ( WorkerCancellationTokenSource != null || WorkerCancelTimeoutKillTokenSource != null || WorkerFlushLogsTimeoutTokenSource != null )
11431147 {
11441148 lock ( _lock )
11451149 {
@@ -1154,10 +1158,65 @@ private void Dispose(bool disposing)
11541158 WorkerCancelTimeoutKillTokenSource . Dispose ( ) ;
11551159 WorkerCancelTimeoutKillTokenSource = null ;
11561160 }
1161+
1162+ if ( WorkerFlushLogsTimeoutTokenSource != null )
1163+ {
1164+ WorkerFlushLogsTimeoutTokenSource . Dispose ( ) ;
1165+ WorkerFlushLogsTimeoutTokenSource = null ;
1166+ }
11571167 }
11581168 }
11591169 }
11601170 }
11611171 }
1172+
/// <summary>
/// Handles a worker process that is still running after the flush-logs timeout fired:
/// sends a flush-logs request to the worker, waits until either the worker exits or the
/// kill token is cancelled, and cancels the worker process if it is still running then.
/// </summary>
/// <param name="jobId">Id of the job being run by the worker; used only in trace output.</param>
/// <param name="processChannel">Channel used to send the flush-logs request to the worker.</param>
/// <param name="workerProcessTask">Task representing the running worker process.</param>
/// <param name="workerProcessCancelTokenSource">Source that is cancelled to forcefully terminate the worker process.</param>
/// <param name="workerCancelTimeoutKillToken">Token whose cancellation ends the log-flushing grace period.</param>
/// <returns>A task that completes once the worker has exited or has been cancelled.</returns>
private async Task HandleWorkerTimeoutAsync(
    Guid jobId,
    IProcessChannel processChannel,
    Task<int> workerProcessTask,
    CancellationTokenSource workerProcessCancelTokenSource,
    CancellationToken workerCancelTimeoutKillToken)
{
    Trace.Info($"Worker process for job {jobId} hasn't completed within original timeout, sending flush logs request and waiting 1 minute before forceful kill.");
    try
    {
        // Send special flush logs request to worker; the send itself is bounded by _channelTimeout.
        using (var csSendFlush = new CancellationTokenSource(_channelTimeout))
        {
            await processChannel.SendAsync(
                messageType: MessageType.FlushLogsRequest,
                body: string.Empty,
                cancellationToken: csSendFlush.Token);
        }
        Trace.Info("Flush logs request sent to worker, waiting 1 minute for log flushing before forceful kill.");
    }
    catch (Exception ex)
    {
        // Best effort: a failed flush request must not prevent the wait/kill sequence below.
        Trace.Warning($"Failed to send flush logs request to worker: {ex.Message}");
    }

    // Now wait for the log flushing period, which ends when the kill token is cancelled.
    try
    {
        // Wait on a linked source so the infinite delay can be released as soon as the
        // wait is over, instead of staying registered on the caller's kill token.
        using (var killWaitTokenSource = CancellationTokenSource.CreateLinkedTokenSource(workerCancelTimeoutKillToken))
        {
            try
            {
                await Task.WhenAny(workerProcessTask, Task.Delay(-1, killWaitTokenSource.Token));
            }
            finally
            {
                killWaitTokenSource.Cancel();
            }
        }

        if (!workerProcessTask.IsCompleted)
        {
            // Worker still hasn't exited after the log flushing period, force kill.
            Trace.Info($"Worker process for job {jobId} hasn't exited after 1 minute log flushing period, proceeding to forceful kill.");
            workerProcessCancelTokenSource.Cancel();
            await workerProcessTask;
            Trace.Info("Worker process forceful termination completed");
        }
        else if (workerProcessTask.IsFaulted || workerProcessTask.IsCanceled)
        {
            // The task finished, but not by running to completion; do not report it as graceful.
            // The task is intentionally not awaited here so exception flow to the caller is unchanged.
            Trace.Warning($"Worker process task for job {jobId} ended with status {workerProcessTask.Status} after flush logs signal.");
        }
        else
        {
            Trace.Info("Worker process exited gracefully after flush logs signal");
        }
    }
    catch (OperationCanceledException)
    {
        // Awaiting the worker task after cancelling it surfaces the cancellation here.
        Trace.Info("worker process has been killed.");
    }
}
11621221 }
11631222}
0 commit comments