Skip to content

Commit db44f89

Browse files
whoozlegeorgepaw
authored and committed
Reallocate tensors used for device to host copy
Summary: This commit fixes a regression introduced in D55136. If a broadcast was passed to a device-to-host copy, it ends up on a single tile; rebalance the output tensors if they have aliases. Ref T51875. Ref T56588. Test Plan: CI, regressed yolo3 model Reviewers: #tensorflow, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, georgep, samuelh, babakk Reviewed By: #tensorflow, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, georgep Maniphest Tasks: T56588, T51875 Differential Revision: https://phabricator.sourcevertex.net/D62637
1 parent 4fafa44 commit db44f89

File tree

2 files changed

+38
-0
lines changed

2 files changed

+38
-0
lines changed

tensorflow/compiler/plugin/poplar/driver/visitors/entry_visitor.cc

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,14 @@ Status EntryVisitor::FinishDeferedAllocationVisit(HloInstruction* root) {
287287
++tuple_index) {
288288
poplar::Tensor out = ConvertFromDeviceLayout(
289289
layout_sub_shapes[tuple_index], out_tensors[tuple_index]);
290+
if (!out.isParallelWriteable()) {
291+
poplar::Tensor out_copy = TensorCloneAndRebalanceAliasing(
292+
graph, resources_, out, debug_name_and_id);
293+
294+
seq.add(
295+
poplar::program::Copy(out, out_copy, false, debug_name_and_id));
296+
out = out_copy;
297+
}
290298

291299
const std::string handle = out_info.Handles().at(tuple_index);
292300
auto fifo = graph.addDeviceToHostFIFO(handle, out.elementType(),

tensorflow/compiler/plugin/poplar/tests/size_speed_tests/matmul_size_test.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,36 @@ def testInference(self):
9494
report = pva.openReport(report_helper.find_report())
9595
self.assert_total_tile_memory(report, 12181182)
9696

97+
def testInferenceManyLayersRebalancedBroadcastInInitialisation(self):
98+
cfg = ipu.utils.IPUConfig()
99+
report_helper = tu.ReportHelper()
100+
report_helper.set_autoreport_options(cfg)
101+
cfg.ipu_model.compile_ipu_code = True
102+
cfg.ipu_model.tiles_per_ipu = 1472
103+
cfg.optimizations.math.dot_strength = False
104+
cfg.configure_ipu_system()
105+
106+
with self.session() as sess:
107+
x = array_ops.placeholder(datatype, shape=[2, 112 * 112 * 4])
108+
y = array_ops.placeholder(datatype, shape=[2, 4])
109+
110+
with ipu.scopes.ipu_scope("/device:IPU:0"):
111+
logits = x
112+
for idx in range(20):
113+
# Any value less than tiles per ipu would work here,
114+
# see ShouldRebalanceTensor().
115+
logits = layer(logits, 1400, "l%d" % idx)
116+
logits = layer(logits, 4, "out")
117+
118+
math_ops.reduce_mean(
119+
nn_ops.softmax_cross_entropy_with_logits_v2(logits=logits,
120+
labels=y))
121+
122+
sess.run(variables.global_variables_initializer())
123+
report = pva.openReport(report_helper.find_report())
124+
self.assert_max_tile_memory(report, 162410)
125+
# Skip the rest of the test, broadcast are returned for global initialisers.
126+
97127
def testTrainingBs1(self):
98128
cfg = ipu.utils.IPUConfig()
99129
report_helper = tu.ReportHelper()

0 commit comments

Comments
 (0)