File tree (expand / collapse): 1 file changed, +5 -1 lines changed
tensorflow/compiler/plugin/poplar/kernels/application_runtime (expand / collapse file tree): 1 file changed, +5 -1 lines changed
Original file line number | Diff line number | Diff line change
@@ -728,6 +728,8 @@ class EngineResource {
728728 tensorflow::ThreadOptions (), engine_name_ + " _execute_thread" , [this ] {
729729 Status runtime_status = Status::OK ();
730730 bool reset_engine = false ;
731+ static constexpr int num_of_reset_retries = 2 ;
732+ int reset_engine_retries_left = num_of_reset_retries;
731733 while (!communication_manager_.Exiting ()) {
732734 {
733735 // Prevent any new requests from being inserted whilst the engine
@@ -741,17 +743,19 @@ class EngineResource {
741743 communication_manager_.Abort (runtime_status);
742744 // If the exception raised only requires an engine reset, then
743745 // continue, otherwise we can't recover.
744- if (!reset_engine) {
746+ if (!reset_engine || reset_engine_retries_left <= 0 ) {
745747 communication_manager_.InitiateExit ();
746748 return ;
747749 }
748750 }
749751
750752 runtime_status = StartEngineAndConnectStreams (reset_engine);
751753 if (!runtime_status.ok ()) {
754+ --reset_engine_retries_left;
752755 continue ;
753756 }
754757
758+ reset_engine_retries_left = num_of_reset_retries;
755759 reset_engine = false ;
756760 runtime_status = Status::OK ();
757761 }
You can’t perform that action at this time.
0 commit comments