2323import json
2424import six
2525import yaml
26+ import botocore .config
2627from botocore .exceptions import ClientError
2728
2829from sagemaker .user_agent import prepend_user_agent
@@ -549,7 +550,7 @@ def get_caller_identity_arn(self):
549550 role = re .sub (r'^(.+)sts::(\d+):assumed-role/(.+?)/.*$' , r'\1iam::\2:role/\3' , assumed_role )
550551 return role
551552
552- def logs_for_job (self , job_name , wait = False , poll = 5 ): # noqa: C901 - suppress complexity warning for this method
553+ def logs_for_job (self , job_name , wait = False , poll = 10 ): # noqa: C901 - suppress complexity warning for this method
553554 """Display the logs for a given training job, optionally tailing them until the
554555 job is complete. If the output is a tty or a Jupyter cell, it will be color-coded
555556 based on which instance the log entry is from.
@@ -569,7 +570,11 @@ def logs_for_job(self, job_name, wait=False, poll=5): # noqa: C901 - suppress c
569570
570571 stream_names = [] # The list of log streams
571572 positions = {} # The current position in each stream, map of stream name -> position
572- client = self .boto_session .client ('logs' )
573+
574+ # Increase retries allowed (from default of 4), as we don't want waiting for a training job
575+ # to be interrupted by a transient exception.
576+ config = botocore .config .Config (retries = {'max_attempts' : 15 })
577+ client = self .boto_session .client ('logs' , config = config )
573578 log_group = '/aws/sagemaker/TrainingJobs'
574579
575580 job_already_completed = True if status == 'Completed' or status == 'Failed' else False
0 commit comments