Skip to content

Commit bafaf3c

Browse files
committed
Handle Poplar Exceptions in a uniform manner
Summary: Recognise runtime errors. If an error can be recovered from by an IPU reset (which occurs when we reset an engine), return a status; otherwise fail. Fix T42980 Test Plan: CI Reviewers: #tensorflow, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, davidn, jakeh, jamiep Reviewed By: #tensorflow, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, jakeh Subscribers: jamiep Maniphest Tasks: T42980 Differential Revision: https://phabricator.sourcevertex.net/D49393
1 parent 4af82b3 commit bafaf3c

File tree

8 files changed

+98
-83
lines changed

8 files changed

+98
-83
lines changed

tensorflow/compiler/plugin/poplar/docs/device_selection.rst

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -386,3 +386,44 @@ have been targeted at the Poplar device. For example:
386386
387387
# Creates a session with log_device_placement set to True.
388388
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
389+
390+
Error Handling
391+
~~~~~~~~~~~~~~
392+
393+
Error and exception handling by TensorFlow is divided into two categories:
394+
395+
* Poplar graph construction and compilation errors which occur during
396+
construction and compilation of TensorFlow programs.
397+
* Poplar runtime errors which occur during the execution of the compiled
398+
program.
399+
400+
The following sections describe the actions you need to take when these errors
401+
occur.
402+
403+
Construction and compilation errors
404+
...................................
405+
406+
These errors are reported to the user using the TensorFlow Status error classes.
407+
The error messages contain information about why the error occurred and what
408+
action the user is required to take in order to stop the error from occurring.
409+
410+
Runtime errors
411+
..............
412+
413+
These errors and exceptions occur when running a Poplar program. The full list
414+
of all the exceptions and their meanings can be found in the Poplar
415+
documentation in the `Exceptions <https://docs.graphcore.ai/projects/poplar-api/en/latest/poplar_api.html#exceptions>`__
416+
section of the Poplar API reference manual.
417+
418+
These runtime errors are handled in the following manner:
419+
420+
* ``application_runtime_error`` - a ``tensorflow.errors.InternalError`` error
421+
is raised. The error message contains the reason why the error occurred. An
422+
IPU reset will be performed before the next execution of a Poplar program.
423+
* ``recoverable_runtime_error`` with the recovery action ``poplar::RecoveryAction::IPU_RESET`` - a ``tensorflow.errors.InternalError`` error
424+
is raised. The error message contains the reason why the error occurred. An
425+
IPU reset will be performed before the next execution of a Poplar program.
426+
* All other runtime errors - the process executing the Poplar program is
427+
terminated and the full error message is logged to the console. When these
428+
errors occur, manual intervention might be required before the system is
429+
operational again. The error message might contain a required recovery action.

tensorflow/compiler/plugin/poplar/driver/poplar_compiler.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -696,7 +696,7 @@ Status CreatePoplarGraphs(CompilerResources& resources, const HloModule* module,
696696
poplar_target,
697697
poplar::replication_factor(resources.replication_factor));
698698
} catch (const std::exception& e) {
699-
return PoplarExceptionToTensorflowStatus("[Create Graph] ", e);
699+
return PoplarExceptionToTensorflowStatus("[Create Graph]", e);
700700
}
701701

702702
if (resources.replication_factor > 1) {
@@ -1112,7 +1112,7 @@ StatusOr<std::unique_ptr<PoplarExecutableCore>> CompileEngine(
11121112
} catch (const std::exception& e) {
11131113
const std::string origin =
11141114
"[Deserialize][File: " + filenames.CachedExecutableFilename() +
1115-
"] ";
1115+
"]";
11161116
return PoplarExceptionToTensorflowStatus(origin, e);
11171117
}
11181118
}
@@ -1651,7 +1651,7 @@ StatusOr<std::unique_ptr<PoplarExecutableCore>> CompileEngine(
16511651
VLOG(1) << "End Poplar graph construction.";
16521652
resources.progress_bar->MoveToNextStage();
16531653
} catch (const std::exception& e) {
1654-
return PoplarExceptionToTensorflowStatus("[Build graph] ", e);
1654+
return PoplarExceptionToTensorflowStatus("[Build graph]", e);
16551655
}
16561656

16571657
poplar::program::Sequence main_program({}, {"MainProgram"});
@@ -1812,7 +1812,7 @@ StatusOr<std::unique_ptr<PoplarExecutableCore>> CompileEngine(
18121812
VLOG(1) << "End compiling Poplar engine.";
18131813

18141814
} catch (const std::exception& e) {
1815-
return PoplarExceptionToTensorflowStatus("[Compile engine] ", e);
1815+
return PoplarExceptionToTensorflowStatus("[Compile engine]", e);
18161816
}
18171817

18181818
if (enable_trace_events && compile) {

tensorflow/compiler/plugin/poplar/driver/poplar_executable.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -446,7 +446,7 @@ Status ExportInternal(
446446
TF_RETURN_IF_ERROR(file->Append(json_metadata));
447447
TF_RETURN_IF_ERROR(file->Close());
448448
} catch (const std::exception& e) {
449-
return PoplarExceptionToTensorflowStatus("[Serialize] ", e);
449+
return PoplarExceptionToTensorflowStatus("[Serialize]", e);
450450
}
451451
return Status::OK();
452452
}

tensorflow/compiler/plugin/poplar/driver/poplar_executor.cc

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2965,7 +2965,7 @@ Status PoplarExecutor::MoveDeviceToHost() {
29652965
TF_RETURN_IF_ERROR(ResetTensorControlState(tc));
29662966
}
29672967
} catch (const std::exception& e) {
2968-
return PoplarExceptionToTensorflowStatus("[Device to host] ", e);
2968+
return PoplarExceptionToTensorflowStatus("[Device to host]", e);
29692969
}
29702970
return Status::OK();
29712971
}
@@ -3125,7 +3125,7 @@ Status PoplarExecutor::MoveHostToDevice() {
31253125
tc->converted_data.clear();
31263126
}
31273127
} catch (const std::exception& e) {
3128-
return PoplarExceptionToTensorflowStatus("[Host to device] ", e);
3128+
return PoplarExceptionToTensorflowStatus("[Host to device]", e);
31293129
}
31303130

31313131
return Status::OK();
@@ -3600,6 +3600,11 @@ void PoplarExecutor::ExecuteEngine(se::DeviceMemoryBase* result_buffer,
36003600
}
36013601
current_status_ = ExecuteEngineImpl(result_buffer, executor, executable,
36023602
args_map, allocator, args);
3603+
if (!current_status_.ok()) {
3604+
StopIOThreads();
3605+
TF_CHECK_OK(ResetOnDeviceBuffers());
3606+
current_engine_ = nullptr;
3607+
}
36033608
}
36043609

36053610
Status PoplarExecutor::ExecuteEngineImpl(se::DeviceMemoryBase* result_buffer,
@@ -3710,7 +3715,7 @@ Status PoplarExecutor::ExecuteEngineImpl(se::DeviceMemoryBase* result_buffer,
37103715

37113716
executable.OnEngineLoaded();
37123717
} catch (const std::exception& e) {
3713-
return PoplarExceptionToTensorflowStatus("[Load engine] ", e);
3718+
return PoplarExceptionToTensorflowStatus("[Load engine]", e);
37143719
}
37153720
}
37163721

@@ -3847,10 +3852,7 @@ Status PoplarExecutor::ExecuteEngineImpl(se::DeviceMemoryBase* result_buffer,
38473852
// right format on the host
38483853
PostProcessStreamedVariablesDeviceToHost();
38493854
} catch (const std::exception& e) {
3850-
StopIOThreads();
3851-
TF_CHECK_OK(ResetOnDeviceBuffers());
3852-
current_engine_ = nullptr;
3853-
return PoplarExceptionToTensorflowStatus("[Execute engine] ", e);
3855+
return PoplarExceptionToTensorflowStatus("[Execute engine]", e);
38543856
}
38553857

38563858
try {
@@ -3868,7 +3870,7 @@ Status PoplarExecutor::ExecuteEngineImpl(se::DeviceMemoryBase* result_buffer,
38683870
AddExecuteEventRecord(executable.module().name());
38693871
}
38703872
} catch (const std::exception& e) {
3871-
return PoplarExceptionToTensorflowStatus("[Execute engine] ", e);
3873+
return PoplarExceptionToTensorflowStatus("[Execute engine]", e);
38723874
}
38733875
}
38743876

tensorflow/compiler/plugin/poplar/driver/tools/poplar_executable_binary_file.cc

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ Status PoplarExecutableBinaryFile::Write(
5959
try {
6060
serialize_executable(file);
6161
} catch (const std::exception& e) {
62-
return PoplarExceptionToTensorflowStatus("[Serialize] ", e);
62+
return PoplarExceptionToTensorflowStatus("[Serialize]", e);
6363
}
6464

6565
return Status::OK();
@@ -69,7 +69,7 @@ StatusOr<poplar::Executable> PoplarExecutableBinaryFile::Read(
6969
const std::string& file_name, ::tensorflow::protobuf::MessageLite* proto) {
7070
auto file = absl::make_unique<std::ifstream>(file_name, std::ios::binary);
7171
const std::string error_prefix =
72-
absl::StrCat("[Deserialize][File: ", file_name, "] ");
72+
absl::StrCat("[Deserialize][File: ", file_name, "]");
7373

7474
std::array<uint8, MAGIC_STRING.size()> magic_string;
7575
if (!file->read(reinterpret_cast<char*>(magic_string.data()),
@@ -90,7 +90,7 @@ StatusOr<poplar::Executable> PoplarExecutableBinaryFile::Read(
9090
if (!file->read(reinterpret_cast<char*>(proto_length_bytes.data()),
9191
proto_length_bytes.size())) {
9292
return InternalErrorStrCat(error_prefix,
93-
"Corrupted - Cannot read the metadata length.");
93+
" Corrupted - Cannot read the metadata length.");
9494
}
9595
uint64 metadata_length = 0;
9696
for (uint64 i = 0; i != proto_length_bytes.size(); ++i) {
@@ -101,11 +101,11 @@ StatusOr<poplar::Executable> PoplarExecutableBinaryFile::Read(
101101
std::vector<char> serialized(metadata_length);
102102
if (!file->read(serialized.data(), metadata_length)) {
103103
return InternalErrorStrCat(error_prefix,
104-
"Corrupted - Cannot read the metadata.");
104+
" Corrupted - Cannot read the metadata.");
105105
}
106106
if (!proto->ParseFromArray(serialized.data(), metadata_length)) {
107107
return InternalErrorStrCat(error_prefix,
108-
"Corrupted - Cannot parse the metadata.");
108+
" Corrupted - Cannot parse the metadata.");
109109
}
110110
serialized.clear();
111111

tensorflow/compiler/plugin/poplar/driver/tools/poplar_util.cc

Lines changed: 33 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -215,76 +215,47 @@ Status SetVertexField(poplar::Graph& graph, const poplar::FieldRef& field,
215215

216216
Status PoplarExceptionToTensorflowStatus(const std::string& origin,
217217
const std::exception& e) {
218-
const std::string prefix = "[Error]" + origin;
219-
/* NOTE: Reduce this list if/when Poplar errors are subclassed */
218+
const std::string prefix = "[Poplar]" + origin + " ";
220219
try {
221220
std::rethrow_exception(std::current_exception());
222-
} catch (const poplar::file_load_error& e) {
223-
return tensorflow::errors::NotFound(prefix, e.what());
224-
} catch (const poplar::missing_perf_estimate& e) {
225-
return tensorflow::errors::NotFound(prefix, e.what());
226-
} catch (const poplar::symbol_error& e) {
227-
return tensorflow::errors::NotFound(prefix, e.what());
228-
} catch (const poplar::unknown_field& e) {
229-
return tensorflow::errors::NotFound(prefix, e.what());
230-
} catch (const poplar::unknown_vertex_type& e) {
231-
return tensorflow::errors::NotFound(prefix, e.what());
232-
} catch (const poplar::no_environment& e) {
233-
return tensorflow::errors::NotFound(prefix, e.what());
234-
} catch (const poplar::parse_error& e) {
235-
return tensorflow::errors::InvalidArgument(prefix, e.what());
236-
} catch (const poplar::invalid_option& e) {
237-
return tensorflow::errors::InvalidArgument(prefix, e.what());
238-
} catch (const poplar::invalid_machine_model& e) {
239-
return tensorflow::errors::InvalidArgument(prefix, e.what());
240-
} catch (const poplar::stream_connection_error& e) {
241-
return tensorflow::errors::InvalidArgument(prefix, e.what());
242-
} catch (const poplar::graph_cycle_error& e) {
243-
return tensorflow::errors::InvalidArgument(prefix, e.what());
244-
} catch (const poplar::invalid_tile_mapping& e) {
245-
return tensorflow::errors::InvalidArgument(prefix, e.what());
246-
} catch (const poplar::type_error& e) {
247-
return tensorflow::errors::InvalidArgument(prefix, e.what());
248-
} catch (const poplar::no_size_specified& e) {
249-
return tensorflow::errors::InvalidArgument(prefix, e.what());
250-
} catch (const poplar::profiling_disabled& e) {
251-
return tensorflow::errors::InvalidArgument(prefix, e.what());
252-
} catch (const poplar::control_program_error& e) {
253-
return tensorflow::errors::InvalidArgument(prefix, e.what());
221+
} catch (const poplar::recoverable_runtime_error& e) {
222+
auto runtime_error =
223+
static_cast<const poplar::recoverable_runtime_error*>(&e);
224+
// Recoverable runtime error with IPU_RESET action is handled by resetting
225+
// the engine to nullptr, otherwise it's a fatal error.
226+
if (runtime_error->getRecoveryAction() ==
227+
poplar::RecoveryAction::IPU_RESET) {
228+
return tensorflow::errors::Internal(
229+
prefix, runtime_error->type, ": ", e.what(),
230+
". IPU will be reset the next time a program is executed.");
231+
} else {
232+
LOG(FATAL) << prefix << runtime_error->type << ": " << e.what()
233+
<< " Recovery action required: "
234+
<< poplar::toString(runtime_error->getRecoveryAction());
235+
}
236+
} catch (const poplar::application_runtime_error& e) {
237+
// Application errors require an engine reset.
238+
auto runtime_error =
239+
static_cast<const poplar::application_runtime_error*>(&e);
240+
return tensorflow::errors::Internal(prefix, runtime_error->type, ": ",
241+
e.what());
254242
} catch (const poplar::runtime_error& e) {
255-
return tensorflow::errors::Internal(prefix, e.what());
256-
} catch (const poplar::overflow_error& e) {
257-
return tensorflow::errors::Internal(prefix, e.what());
258-
} catch (const poplar::tensor_io_state_error& e) {
259-
return tensorflow::errors::Internal(prefix, e.what());
260-
} catch (const poplar::graph_connection_error& e) {
261-
return tensorflow::errors::Internal(prefix, e.what());
262-
} catch (const poplar::graph_object_load_error& e) {
263-
return tensorflow::errors::Internal(prefix, e.what());
264-
} catch (const poplar::graph_object_creation_error& e) {
265-
return tensorflow::errors::Internal(prefix, e.what());
266-
} catch (const poplar::graph_program_compilation_error& e) {
267-
return tensorflow::errors::Internal(prefix, e.what());
268-
} catch (const poputil::poplibs_error& e) {
269-
return tensorflow::errors::Internal(prefix, e.what());
243+
auto runtime_error = static_cast<const poplar::runtime_error*>(&e);
244+
// Default case for runtime errors which we can't recover from.
245+
LOG(FATAL) << prefix << runtime_error->type << ": " << e.what();
270246
} catch (const poplar::link_error& e) {
271-
return tensorflow::errors::Internal(prefix, e.what());
272-
} catch (const poplar::stream_memory_allocation_error& e) {
273-
return tensorflow::errors::Internal(prefix, e.what());
274-
} catch (const poplar::graph_memory_allocation_error& e) {
275-
return tensorflow::errors::Internal(prefix, e.what());
276-
} catch (const poplar::tensor_creation_error& e) {
277-
return tensorflow::errors::Internal(prefix, e.what());
278-
} catch (const poplar::memory_elem_constraints_error& e) {
279-
return tensorflow::errors::Internal(prefix, e.what());
280-
} catch (const poplar::index_error& e) {
281-
return tensorflow::errors::OutOfRange(prefix, e.what());
247+
auto link_error = static_cast<const poplar::link_error*>(&e);
248+
return tensorflow::errors::Internal(prefix, link_error->type, ": ",
249+
e.what(),
250+
" Output: ", link_error->output);
282251
} catch (const poplar::poplar_error& e) {
283-
return tensorflow::errors::Internal(prefix, e.what());
252+
auto poplar_error = static_cast<const poplar::poplar_error*>(&e);
253+
return tensorflow::errors::Internal(prefix, poplar_error->type, ": ",
254+
e.what());
284255
} catch (const std::exception& e) {
285256
}
286257

287-
return tensorflow::errors::Unknown(prefix, e.what());
258+
return tensorflow::errors::Unknown(e.what());
288259
}
289260

290261
void SetFlagIfNotPresent(poplar::OptionFlags& opts, const std::string& key,

tensorflow/compiler/plugin/poplar/tests/ipu_model_device_test.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -145,8 +145,8 @@ def testEngineCompilationOptions(self):
145145
cfg.configure_ipu_system()
146146

147147
fd = {pa: np.zeros([480]), pb: np.zeros([480])}
148-
with self.assertRaisesRegex(errors.InvalidArgumentError,
149-
"Unrecognised option"):
148+
with self.assertRaisesRegex(errors.InternalError,
149+
"invalid_option: Unrecognised"):
150150
sess.run(output, fd)
151151

152152
def testNamedOperations(self):

tensorflow/python/ipu/tests/multi_conv_test.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -440,7 +440,8 @@ def body(a, b):
440440
session.run(variables.global_variables_initializer())
441441
with self.assertRaisesRegex(
442442
Exception,
443-
r"\[Error\]\[Build graph\] Unrecognised option \'invalidFlag\'"):
443+
r"\[Poplar\]\[Build graph\] invalid_option: Unrecognised option "
444+
r"\'invalidFlag\'"):
444445
session.run(res, {x: np.ones(x.shape) for x in [a, b]})
445446

446447

0 commit comments

Comments
 (0)