Skip to content

Commit 11e7d64

Browse files
committed
Improve error message when IPUs are not found
Summary: The error message when TensorFlow fails to attach to a poplar device can be confusing and it is not immediately obvious to users that the number of devices available on the machine is the issue. Fix T51079 Reviewers: georgep, #tensorflow, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved Reviewed By: georgep, #tensorflow, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved Maniphest Tasks: T51079 Differential Revision: https://phabricator.sourcevertex.net/D56603
1 parent 7954103 commit 11e7d64

File tree

1 file changed

+8
-4
lines changed

1 file changed

+8
-4
lines changed

tensorflow/compiler/plugin/poplar/driver/poplar_executor.cc

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1605,10 +1605,14 @@ StatusOr<std::size_t> PoplarExecutor::AttachToPoplarDevice(
16051605
absl::Span<const poplar::Device> device_list, int32 ordinal,
16061606
bool wait_for_device) {
16071607
TENSORFLOW_TRACEPOINT();
1608+
const std::string error_probable_causes =
1609+
" Common causes of this error include: incorrect "
1610+
"configuration of your V-IPU partition, requesting more IPUs than are "
1611+
"available on your system, or IPUs being used by another program.";
16081612
if (device_list.empty()) {
16091613
return InvalidArgumentStrCat(
16101614
"No device matches the requested configuration for ordinal ", ordinal,
1611-
".");
1615+
".", error_probable_causes);
16121616
}
16131617

16141618
const uint64 on_demand_device_poll_time =
@@ -1617,7 +1621,6 @@ StatusOr<std::size_t> PoplarExecutor::AttachToPoplarDevice(
16171621
on_demand_device_poll_time * 1000ULL;
16181622
const uint64 on_demand_device_timeout =
16191623
PoplarXlaFlags::Get().on_demand_device_timeout;
1620-
16211624
tensorflow::Env* env = tensorflow::Env::Default();
16221625
auto start_time = std::chrono::steady_clock::now();
16231626

@@ -1643,7 +1646,7 @@ StatusOr<std::size_t> PoplarExecutor::AttachToPoplarDevice(
16431646
if (elapsed_time.count() > on_demand_device_timeout) {
16441647
return InternalErrorStrCat(
16451648
"Timed out trying to find an available device for ordinal ",
1646-
ordinal, ".");
1649+
ordinal, ".", error_probable_causes);
16471650
} else {
16481651
if (!logged_message) {
16491652
LOG(INFO) << "Currently there is no available device for ordinal ",
@@ -1654,7 +1657,8 @@ StatusOr<std::size_t> PoplarExecutor::AttachToPoplarDevice(
16541657
}
16551658
} else {
16561659
return InternalErrorStrCat(
1657-
"Could not find an available device for ordinal ", ordinal, ".");
1660+
"Could not find an available device for ordinal ", ordinal, ".",
1661+
error_probable_causes);
16581662
}
16591663
}
16601664
}

0 commit comments

Comments
 (0)