Skip to content

Commit 6cabb81

Browse files
committed
Fix handling of MEMORY and DIV BY 0 errors for C600
Summary: Ref T73341

Reviewers: #tensorflow, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, kamil.andrzejewski

Reviewed By: #tensorflow, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, kamil.andrzejewski

Maniphest Tasks: T73341

Differential Revision: https://phabricator.sourcevertex.net/D81663
1 parent 4501595 commit 6cabb81

File tree

1 file changed

+5
-1
lines changed

1 file changed

+5
-1
lines changed

tensorflow/compiler/plugin/poplar/kernels/application_runtime/application_runtime.cc

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -728,6 +728,8 @@ class EngineResource {
728728
tensorflow::ThreadOptions(), engine_name_ + "_execute_thread", [this] {
729729
Status runtime_status = Status::OK();
730730
bool reset_engine = false;
731+
static constexpr int num_of_reset_retries = 2;
732+
int reset_engine_retries_left = num_of_reset_retries;
731733
while (!communication_manager_.Exiting()) {
732734
{
733735
// Prevent any new requests from being inserted whilst the engine
@@ -741,17 +743,19 @@ class EngineResource {
741743
communication_manager_.Abort(runtime_status);
742744
// If the exception raised only requires an engine reset, then
743745
// continue, otherwise we can't recover.
744-
if (!reset_engine) {
746+
if (!reset_engine || reset_engine_retries_left <= 0) {
745747
communication_manager_.InitiateExit();
746748
return;
747749
}
748750
}
749751

750752
runtime_status = StartEngineAndConnectStreams(reset_engine);
751753
if (!runtime_status.ok()) {
754+
--reset_engine_retries_left;
752755
continue;
753756
}
754757

758+
reset_engine_retries_left = num_of_reset_retries;
755759
reset_engine = false;
756760
runtime_status = Status::OK();
757761
}

0 commit comments

Comments
 (0)