Skip to content

Commit 67794b4

Browse files
Adding trace logs in agent worker - job runner in agent lifecycle execution (#5285)
* Adding trace logs in agent worker - job runner in gent lifecycle execution * minor fix | removing unnecessary log * Code cleaning in worker program.cs * Cleaning worker code and removing some traces * JobRunner trace logs update, * Minor syntax fixes in logs
1 parent cdc67e3 commit 67794b4

File tree

3 files changed

+89
-29
lines changed

3 files changed

+89
-29
lines changed

src/Agent.Worker/JobRunner.cs

Lines changed: 42 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ public async Task<TaskResult> RunAsync(Pipelines.AgentJobRequestMessage message,
5151
ArgUtil.NotNull(message.Resources, nameof(message.Resources));
5252
ArgUtil.NotNull(message.Variables, nameof(message.Variables));
5353
ArgUtil.NotNull(message.Steps, nameof(message.Steps));
54-
Trace.Info("Job ID {0}", message.JobId);
54+
Trace.Entering();
5555

5656
DateTime jobStartTimeUtc = DateTime.UtcNow;
5757

@@ -73,15 +73,17 @@ public async Task<TaskResult> RunAsync(Pipelines.AgentJobRequestMessage message,
7373
if (!message.Variables.ContainsKey(Constants.Variables.System.ServerType) ||
7474
string.Equals(message.Variables[Constants.Variables.System.ServerType]?.Value, "OnPremises", StringComparison.OrdinalIgnoreCase))
7575
{
76+
Trace.Info("OnPremises server detected - applying config URI base replacement");
7677
ReplaceConfigUriBaseInJobRequestMessage(message);
78+
Trace.Info("Config URI base replacement completed for OnPremises server");
7779
}
7880

7981
// Setup the job server and job server queue.
8082
var jobServer = HostContext.GetService<IJobServer>();
8183
VssCredentials jobServerCredential = VssUtil.GetVssCredential(systemConnection);
8284
Uri jobServerUrl = systemConnection.Url;
8385

84-
Trace.Info($"Creating job server with URL: {jobServerUrl}");
86+
Trace.Info("Creating job server connection [URL:{0}]", jobServerUrl);
8587
// jobServerQueue is the throttling reporter.
8688
_jobServerQueue = HostContext.GetService<IJobServerQueue>();
8789
VssConnection jobConnection = VssUtil.CreateConnection(
@@ -95,6 +97,7 @@ public async Task<TaskResult> RunAsync(Pipelines.AgentJobRequestMessage message,
9597

9698
_jobServerQueue.Start(message);
9799
HostContext.WritePerfCounter($"WorkerJobServerQueueStarted_{message.RequestId.ToString()}");
100+
Trace.Info("JobServer connection established successfully [URL:{0}, ThrottlingEnabled:True]", jobServerUrl);
98101

99102
IExecutionContext jobContext = null;
100103
CancellationTokenRegistration? agentShutdownRegistration = null;
@@ -107,9 +110,9 @@ public async Task<TaskResult> RunAsync(Pipelines.AgentJobRequestMessage message,
107110
jobContext = HostContext.CreateService<IExecutionContext>();
108111
jobContext.InitializeJob(message, jobRequestCancellationToken);
109112

110-
Trace.Info("Starting the job execution context.");
111113
jobContext.Start();
112114
jobContext.Section(StringUtil.Loc("StepStarting", message.JobDisplayName));
115+
Trace.Info($"ExecutionContext initialized successfully. [JobName: {message.JobDisplayName}]");
113116

114117
//Start Resource Diagnostics if enabled in the job message
115118
jobContext.Variables.TryGetValue("system.debug", out var systemDebug);
@@ -138,11 +141,14 @@ public async Task<TaskResult> RunAsync(Pipelines.AgentJobRequestMessage message,
138141
{
139142
case ShutdownReason.UserCancelled:
140143
errorMessage = StringUtil.Loc("UserShutdownAgent");
144+
Trace.Warning("Agent shutdown initiated [Reason:UserCancelled, JobId:{0}]", message.JobId);
141145
break;
142146
case ShutdownReason.OperatingSystemShutdown:
143147
errorMessage = StringUtil.Loc("OperatingSystemShutdown", Environment.MachineName);
148+
Trace.Warning("Agent shutdown initiated [Reason:OperatingSystemShutdown, JobId:{0}, Machine:{1}]", message.JobId, Environment.MachineName);
144149
break;
145150
default:
151+
Trace.Error("Unknown shutdown reason detected [Reason:{0}, JobId:{1}]", HostContext.AgentShutdownReason, message.JobId);
146152
throw new ArgumentException(HostContext.AgentShutdownReason.ToString(), nameof(HostContext.AgentShutdownReason));
147153
}
148154
jobContext.AddIssue(new Issue() { Type = IssueType.Error, Message = errorMessage });
@@ -155,6 +161,7 @@ public async Task<TaskResult> RunAsync(Pipelines.AgentJobRequestMessage message,
155161
{
156162
Directory.CreateDirectory(workDirectory);
157163
IOUtil.ValidateExecutePermission(workDirectory);
164+
Trace.Info("Work directory validation successful [Path:{0}]", workDirectory);
158165
}
159166
catch (Exception ex)
160167
{
@@ -175,6 +182,8 @@ public async Task<TaskResult> RunAsync(Pipelines.AgentJobRequestMessage message,
175182
jobContext.SetVariable(Constants.Variables.Agent.OS, VarUtil.OS);
176183
jobContext.SetVariable(Constants.Variables.Agent.OSArchitecture, VarUtil.OSArchitecture);
177184
jobContext.SetVariable(Constants.Variables.Agent.RootDirectory, HostContext.GetDirectory(WellKnownDirectory.Work), isFilePath: true);
185+
Trace.Info("Agent metadata populated [AgentId:{0}, AgentName:{1}, OS:{2}, Architecture:{3}, SelfHosted:{4}, CloudId:{5}, MachineName:{6}]",
186+
settings.AgentId, settings.AgentName, VarUtil.OS, VarUtil.OSArchitecture, !settings.IsMSHosted, settings.AgentCloudId, Environment.MachineName);
178187
if (PlatformUtil.RunningOnWindows)
179188
{
180189
string serverOMDirectoryVariable = AgentKnobs.InstallLegacyTfExe.GetValue(jobContext).AsBoolean()
@@ -190,6 +199,7 @@ public async Task<TaskResult> RunAsync(Pipelines.AgentJobRequestMessage message,
190199
jobContext.SetVariable(Constants.Variables.Agent.WorkFolder, HostContext.GetDirectory(WellKnownDirectory.Work), isFilePath: true);
191200
jobContext.SetVariable(Constants.Variables.System.WorkFolder, HostContext.GetDirectory(WellKnownDirectory.Work), isFilePath: true);
192201

202+
Trace.Info("Environment detection initiated - setting up Azure VM and Docker container detection");
193203
var azureVmCheckCommand = jobContext.GetHostContext().GetService<IAsyncCommandContext>();
194204
azureVmCheckCommand.InitializeCommandContext(jobContext, Constants.AsyncExecution.Commands.Names.GetAzureVMMetada);
195205
azureVmCheckCommand.Task = Task.Run(() => jobContext.SetVariable(Constants.Variables.System.IsAzureVM, PlatformUtil.DetectAzureVM() ? "1" : "0"));
@@ -203,6 +213,7 @@ public async Task<TaskResult> RunAsync(Pipelines.AgentJobRequestMessage message,
203213
string toolsDirectory = HostContext.GetDirectory(WellKnownDirectory.Tools);
204214
Directory.CreateDirectory(toolsDirectory);
205215
jobContext.SetVariable(Constants.Variables.Agent.ToolsDirectory, toolsDirectory, isFilePath: true);
216+
Trace.Info("Tools directory initialized [Path:{0}]", toolsDirectory);
206217

207218
if (AgentKnobs.DisableGitPrompt.GetValue(jobContext).AsBoolean())
208219
{
@@ -212,9 +223,11 @@ public async Task<TaskResult> RunAsync(Pipelines.AgentJobRequestMessage message,
212223
// Setup TEMP directories
213224
_tempDirectoryManager = HostContext.GetService<ITempDirectoryManager>();
214225
_tempDirectoryManager.InitializeTempDirectory(jobContext);
226+
Trace.Info("Temporary directory manager initialized - TEMP directories configured for job execution");
215227

216228
// todo: task server can throw. try/catch and fail job gracefully.
217229
// prefer task definitions url, then TFS collection url, then TFS account url
230+
Trace.Info("TaskServer connection setup initiated - establishing connection for task definitions");
218231
var taskServer = HostContext.GetService<ITaskServer>();
219232
Uri taskServerUri = null;
220233
if (!string.IsNullOrEmpty(jobContext.Variables.System_TaskDefinitionsUri))
@@ -229,10 +242,11 @@ public async Task<TaskResult> RunAsync(Pipelines.AgentJobRequestMessage message,
229242
var taskServerCredential = VssUtil.GetVssCredential(systemConnection);
230243
if (taskServerUri != null)
231244
{
232-
Trace.Info($"Creating task server with {taskServerUri}");
245+
Trace.Info("Creating task server [URI:{0}]", taskServerUri);
233246

234247
taskConnection = VssUtil.CreateConnection(taskServerUri, taskServerCredential, Trace, skipServerCertificateValidation);
235248
await taskServer.ConnectAsync(taskConnection);
249+
Trace.Info($"TaskServer connection established successfully [URI: {taskServerUri}]");
236250
}
237251

238252
// for back compat TFS 2015 RTM/QU1, we may need to switch the task server url to agent config url
@@ -247,6 +261,7 @@ public async Task<TaskResult> RunAsync(Pipelines.AgentJobRequestMessage message,
247261
Trace.Info($"Recreate task server with configuration server url: {taskServerUri}");
248262
legacyTaskConnection = VssUtil.CreateConnection(taskServerUri, taskServerCredential, trace: Trace, skipServerCertificateValidation);
249263
await taskServer.ConnectAsync(legacyTaskConnection);
264+
Trace.Info($"Legacy TaskServer connection established successfully - [URI: {taskServerUri}]");
250265
}
251266
}
252267

@@ -256,6 +271,7 @@ public async Task<TaskResult> RunAsync(Pipelines.AgentJobRequestMessage message,
256271
jobContext.Variables.ExpandValues(target: endpoint.Data);
257272
VarUtil.ExpandEnvironmentVariables(HostContext, target: endpoint.Data);
258273
}
274+
Trace.Info($"Endpoint data expansion completed for {jobContext.Endpoints?.Count ?? 0} endpoints");
259275

260276
// Expand the repository property values.
261277
foreach (var repository in jobContext.Repositories)
@@ -278,6 +294,7 @@ public async Task<TaskResult> RunAsync(Pipelines.AgentJobRequestMessage message,
278294
repository.Properties.Set<JToken>(Pipelines.RepositoryPropertyNames.Mappings, mappings);
279295
}
280296
}
297+
Trace.Info($"Repository property expansion completed for {jobContext.Repositories?.Count ?? 0} repositories");
281298

282299
// Expand container properties
283300
foreach (var container in jobContext.Containers)
@@ -288,11 +305,13 @@ public async Task<TaskResult> RunAsync(Pipelines.AgentJobRequestMessage message,
288305
{
289306
this.ExpandProperties(sidecar, jobContext.Variables);
290307
}
308+
Trace.Info($"Container property expansion completed - Containers: {jobContext.Containers?.Count ?? 0}, Sidecars: {jobContext.SidecarContainers?.Count ?? 0}");
291309

292310
// Send telemetry in case if git is preinstalled on windows platform
293311
var isSelfHosted = StringUtil.ConvertToBoolean(jobContext.Variables.Get(Constants.Variables.Agent.IsSelfHosted));
294312
if (PlatformUtil.RunningOnWindows && isSelfHosted)
295313
{
314+
Trace.Info("Initiating Windows preinstalled Git telemetry collection for self-hosted agent");
296315
var windowsPreinstalledGitCommand = jobContext.GetHostContext().GetService<IAsyncCommandContext>();
297316
windowsPreinstalledGitCommand.InitializeCommandContext(jobContext, Constants.AsyncExecution.Commands.Names.WindowsPreinstalledGitTelemetry);
298317
windowsPreinstalledGitCommand.Task = Task.Run(() =>
@@ -315,7 +334,7 @@ public async Task<TaskResult> RunAsync(Pipelines.AgentJobRequestMessage message,
315334
}
316335

317336
// Get the job extension.
318-
Trace.Info("Getting job extension.");
337+
Trace.Info("Job extension initialization initiated - determining host type and loading appropriate extension");
319338
var hostType = jobContext.Variables.System_HostType;
320339
var extensionManager = HostContext.GetService<IExtensionManager>();
321340
// We should always have one job extension
@@ -324,12 +343,13 @@ public async Task<TaskResult> RunAsync(Pipelines.AgentJobRequestMessage message,
324343
.Where(x => x.HostType.HasFlag(hostType))
325344
.FirstOrDefault();
326345
ArgUtil.NotNull(jobExtension, nameof(jobExtension));
327-
346+
Trace.Info($"Job extension loaded successfully - HostType: {hostType}, ExtensionType: {jobExtension?.GetType()?.Name}");
328347
List<IStep> jobSteps = null;
329348
try
330349
{
331-
Trace.Info("Initialize job. Getting all job steps.");
350+
Trace.Info("Job steps initialization initiated - parsing step definitions and resolving task references");
332351
jobSteps = await jobExtension.InitializeJob(jobContext, message);
352+
Trace.Info($"Job extension initialization completed successfully");
333353
}
334354
catch (OperationCanceledException ex) when (jobContext.CancellationToken.IsCancellationRequested)
335355
{
@@ -375,7 +395,9 @@ public async Task<TaskResult> RunAsync(Pipelines.AgentJobRequestMessage message,
375395
var stepsRunner = HostContext.GetService<IStepsRunner>();
376396
try
377397
{
398+
Trace.Info("Step execution pipeline initiated - beginning job steps execution with StepsRunner");
378399
await stepsRunner.RunAsync(jobContext, jobSteps);
400+
Trace.Info("Step execution pipeline completed successfully - all job steps finished execution");
379401
}
380402
catch (Exception ex)
381403
{
@@ -396,15 +418,15 @@ public async Task<TaskResult> RunAsync(Pipelines.AgentJobRequestMessage message,
396418

397419
if (jobContext.Variables.GetBoolean(Constants.Variables.Agent.Diagnostic) ?? false)
398420
{
399-
Trace.Info("Support log upload starting.");
421+
Trace.Info("Support log upload initiated - Diagnostic mode enabled, uploading support logs");
400422

401423
IDiagnosticLogManager diagnosticLogManager = HostContext.GetService<IDiagnosticLogManager>();
402424

403425
try
404426
{
405427
await diagnosticLogManager.UploadDiagnosticLogsAsync(executionContext: jobContext, message: message, jobStartTimeUtc: jobStartTimeUtc);
406428

407-
Trace.Info("Support log upload complete.");
429+
Trace.Info("Support log upload completed - Diagnostic logs uploaded successfully");
408430
}
409431
catch (Exception ex)
410432
{
@@ -434,8 +456,8 @@ public async Task<TaskResult> RunAsync(Pipelines.AgentJobRequestMessage message,
434456
legacyTaskConnection?.Dispose();
435457
taskConnection?.Dispose();
436458
jobConnection?.Dispose();
437-
438459
await ShutdownQueue(throwOnFailure: false);
460+
Trace.Info("Job server queue shutdown completed - all resources cleaned up successfully");
439461
}
440462
}
441463

@@ -474,13 +496,16 @@ public void ExpandProperties(ContainerInfo container, Variables variables)
474496

475497
private async Task<TaskResult> CompleteJobAsync(IJobServer jobServer, IExecutionContext jobContext, Pipelines.AgentJobRequestMessage message, TaskResult? taskResult = null)
476498
{
499+
Trace.Info($"Job finalization initiated - Job: '{message.JobDisplayName}', JobId: {message.JobId}");
477500
ArgUtil.NotNull(message, nameof(message));
478501
jobContext.Section(StringUtil.Loc("StepFinishing", message.JobDisplayName));
479502
TaskResult result = jobContext.Complete(taskResult);
503+
Trace.Info($"Job result calculation completed - Final result: {result}");
480504

481505
try
482506
{
483507
await ShutdownQueue(throwOnFailure: true);
508+
Trace.Info("Job extension finalization completed - Job server queue shutdown successful");
484509
}
485510
catch (AggregateException ex)
486511
{
@@ -499,24 +524,28 @@ private async Task<TaskResult> CompleteJobAsync(IJobServer jobServer, IExecution
499524

500525
// Clean TEMP after finish process jobserverqueue, since there might be a pending fileupload still use the TEMP dir.
501526
_tempDirectoryManager?.CleanupTempDirectory();
527+
Trace.Info("Resource disposal completed - Temporary directory cleanup finished");
502528

503529
if (!jobContext.Features.HasFlag(PlanFeatures.JobCompletedPlanEvent))
504530
{
505-
Trace.Info($"Skip raise job completed event call from worker because Plan version is {message.Plan.Version}");
531+
Trace.Info($"Job completion event skipped - [Plan version {message.Plan.Version}] does not support JobCompletedPlanEvent. [Final result: {result}]");
506532
return result;
507533
}
508534

535+
Trace.Info($"Timeline record updates initiated - Raising job completed event for JobId: {message.JobId}");
509536
Trace.Info("Raising job completed event.");
510537
var jobCompletedEvent = new JobCompletedEvent(message.RequestId, message.JobId, result,
511538
jobContext.Variables.Get(Constants.Variables.Agent.RunMode) == Constants.Agent.CommandLine.Flags.Once);
512539

513540
var completeJobRetryLimit = 5;
514541
var exceptions = new List<Exception>();
542+
Trace.Info($"Timeline record updates - Attempting completion event with retry limit: {completeJobRetryLimit}");
515543
while (completeJobRetryLimit-- > 0)
516544
{
517545
try
518546
{
519547
await jobServer.RaisePlanEventAsync(message.Plan.ScopeIdentifier, message.Plan.PlanType, message.Plan.PlanId, jobCompletedEvent, default(CancellationToken));
548+
Trace.Info($"Timeline record updates completed - Job completed event raised successfully - Final result: {result}");
520549
return result;
521550
}
522551
catch (TaskOrchestrationPlanNotFoundException ex)
@@ -539,10 +568,12 @@ private async Task<TaskResult> CompleteJobAsync(IJobServer jobServer, IExecution
539568
}
540569

541570
// delay 5 seconds before next retry.
571+
Trace.Info($"Timeline record updates retry delay - Waiting 5 seconds before retry {5 - completeJobRetryLimit}/5");
542572
await Task.Delay(TimeSpan.FromSeconds(5));
543573
}
544574

545575
// rethrow exceptions from all attempts.
576+
Trace.Error($"Timeline record updates failed - All {5} retry attempts exhausted, throwing AggregateException");
546577
throw new AggregateException(exceptions);
547578
}
548579

0 commit comments

Comments
 (0)