
Commit d984f4e

Author: Sven Verdoolaege
insertCopiesUnder: drop iteration over tensor dimensions of size 1 up front
When mapping the code for copying arrays to/from shared memory to threads, the band members of the copying schedule that iterate over a tensor dimension of size 1 do not get mapped to a thread identifier. This was handled by code in mapCopiesToThreads that is relatively complicated and that needs to take into account situations that may never arise in practice. It is much simpler to make sure that those band members do not even appear in the copying schedule, by removing them while the copying schedule is being constructed in insertCopiesUnder.

Note that insertCopiesUnder now relies on promotedDecls_, so the call to insertCopiesUnder needs to be moved after the construction of promotedDecls_.
Parent: 71054ee
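To make the effect concrete, here is a minimal standalone sketch in plain C++ (no isl; the group sizes {1, 64, 1, 8} and the thread extents {32, 8} are made-up values, and the vectors merely stand in for the copy band and the mapping): dimensions of extent 1 are left out of the copy schedule up front, and the surviving members are mapped to thread identifiers in inverse order, which is what the diffs below implement.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  // Hypothetical promoted-group sizes; dimensions of extent 1 carry no
  // parallelism and are dropped from the copy schedule up front.
  std::vector<std::size_t> sizes = {1, 64, 1, 8};
  std::vector<std::size_t> copyBand;  // tensor dims of the surviving band members
  for (std::size_t i = 0; i < sizes.size(); ++i) {
    if (sizes[i] != 1) {
      copyBand.push_back(i);
    }
  }

  // Map the surviving members to thread ids in inverse order.
  std::vector<std::size_t> numThreads = {32, 8};  // made-up thread extents
  std::size_t nToMap = std::min(copyBand.size(), numThreads.size());
  for (std::size_t t = 0; t < nToMap; ++t) {
    std::size_t pos = copyBand.size() - 1 - t;
    std::cout << "band member over tensor dim " << copyBand[pos]
              << " -> thread id t" << t << " (extent " << numThreads[t]
              << ")\n";
  }
}

Any thread identifiers left over after this loop (when the copy band has fewer members than there are thread ids) are fixed to a single thread by mapRemaining, as the comment in mapCopiesToThreads states.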

3 files changed: +33 −24 lines


tc/core/polyhedral/cuda/memory_promotion_heuristic.cc

Lines changed: 5 additions & 23 deletions
@@ -99,36 +99,18 @@ void mapCopiesToThreads(MappedScop& mscop, bool unroll) {
 
   // Map band dimensions to threads, in inverse order since the last member
   // iterates over the last subscript and is likely to result in coalescing.
-  // Step over band members that iterate over size-1 arrays subscripts as
-  // they would have been executed by a single thread.
   // If not all available thread ids are used, fix remaining to 1 thread.
-  auto filter = node->elemAs<ScheduleTreeElemFilter>()->filter_;
-  auto filterSets = isl::UnionAsVector<isl::union_set>(filter);
-  size_t t = 0;
-  for (int i = band->nMember() - 1;
-       i >= 0 && t < mscop.numThreads.view.size();
-       --i) {
-    auto skip = std::all_of(
-        filterSets.begin(), filterSets.end(), [&mscop, i](isl::set s) {
-          auto groupId =
-              s.get_space().unwrap().get_tuple_id(isl::dim_type::out);
-          auto decl = mscop.scop().promotedDecl(groupId);
-          return static_cast<size_t>(i) >= decl.sizes.size() ||
-              decl.sizes[i] == 1;
-        });
-    if (skip) {
-      continue;
-    }
-
+  auto nToMap = std::min(band->nMember(), mscop.numThreads.view.size());
+  for (size_t t = 0; t < nToMap; ++t) {
+    auto pos = band->nMember() - 1 - t;
     mapToParameterWithExtent(
         root,
         bandNode,
-        i,
+        pos,
         mapping::ThreadId::makeId(t),
         mscop.numThreads.view[t]);
-    ++t;
   }
-  mscop.mapRemaining<mapping::ThreadId>(bandNode, t);
+  mscop.mapRemaining<mapping::ThreadId>(bandNode, nToMap);
 
   // Unroll if requested.
   if (unroll) {
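The comment retained in the hunk above gives the reason for mapping in inverse order; the arithmetic behind it can be seen in a tiny self-contained sketch (plain C++, made-up tile extents): with a row-major tile, the last band member advances the last subscript, so consecutive threads along t0 touch consecutive addresses.

#include <cstddef>
#include <iostream>

int main() {
  // Hypothetical row-major tile with innermost extent 8: element (i, j) lives
  // at linear offset i * 8 + j.  Mapping the last band member (the j loop) to
  // thread id t0 makes consecutive threads touch consecutive offsets, which
  // is the coalescing argument behind mapping band members in inverse order.
  const std::size_t cols = 8;
  const std::size_t i = 3;  // arbitrary fixed value of an outer band member
  for (std::size_t t0 = 0; t0 < 4; ++t0) {  // the first few threads along t0
    std::cout << "t0 = " << t0 << " touches offset " << i * cols + t0 << "\n";
  }
}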

tc/core/polyhedral/memory_promotion.cc

Lines changed: 27 additions & 0 deletions
@@ -422,6 +422,30 @@ isl::set tensorElementsSet(const Scop& scop, isl::id tensorId) {
   }
   return tensorElements;
 }
+
+/*
+ * "schedule" iterates over the elements of the tensor described by "decl".
+ * Remove the schedule dimensions that correspond to tensor dimensions
+ * of size 1.
+ * Note that this function drops the name of the target space of "schedule",
+ * but this space is irrelevant for the caller.
+ */
+isl::multi_aff dropDummyTensorDimensions(
+    isl::multi_aff schedule,
+    const Scop::PromotedDecl& decl) {
+  auto list = schedule.get_aff_list();
+  auto space = schedule.get_space().domain();
+
+  auto n = list.n();
+  for (int i = n - 1; i >= 0; --i) {
+    if (decl.sizes[i] == 1) {
+      list = list.drop(i, 1);
+    }
+  }
+
+  space = space.from_domain().add_dims(isl::dim_type::out, list.n());
+  return isl::multi_aff(space, list);
+}
 } // namespace
 
 ScheduleTree* insertCopiesUnder(
@@ -449,6 +473,9 @@ ScheduleTree* insertCopiesUnder(
       isl::multi_aff::identity(promotionSpace.range().map_from_set());
   identityCopySchedule =
       identityCopySchedule.pullback(isl::multi_aff::range_map(promotionSpace));
+  // Only iterate over significant tensor dimensions.
+  auto decl = scop.promotedDecl(groupId);
+  identityCopySchedule = dropDummyTensorDimensions(identityCopySchedule, decl);
   auto readSchedule = isl::multi_union_pw_aff(
       identityCopySchedule.set_tuple_id(isl::dim_type::in, readId));
   auto writeSchedule = isl::multi_union_pw_aff(
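The core of dropDummyTensorDimensions is the backward walk over the affine list; a reduced stand-alone analogue in plain C++ (a vector of placeholder expression names standing in for the isl aff list, with illustrative sizes) shows the same pattern. Iterating from the back means that dropping an element never shifts the indices that are still to be visited.

#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

// Illustrative stand-in for dropping size-1 dimensions from a copy schedule:
// "exprs" plays the role of the per-dimension affine expressions and "sizes"
// the per-dimension extents of the promoted declaration.
std::vector<std::string> dropDummyDims(
    std::vector<std::string> exprs,
    const std::vector<std::size_t>& sizes) {
  assert(exprs.size() == sizes.size());
  // Walk backwards so erasing an element does not shift indices yet to visit.
  for (int i = static_cast<int>(exprs.size()) - 1; i >= 0; --i) {
    if (sizes[i] == 1) {
      exprs.erase(exprs.begin() + i);
    }
  }
  return exprs;
}

int main() {
  // Hypothetical 4-d group of sizes {1, 64, 1, 8}: only dims 1 and 3 survive.
  auto kept = dropDummyDims({"c0", "c1", "c2", "c3"}, {1, 64, 1, 8});
  assert(kept == (std::vector<std::string>{"c1", "c3"}));
}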

tc/core/polyhedral/scop.cc

Lines changed: 1 addition & 1 deletion
@@ -201,12 +201,12 @@ void Scop::promoteGroup(
   }
 
   auto groupId = nextGroupIdForTensor(tensorId);
-  insertCopiesUnder(*this, tree, *gr, tensorId, groupId);
   auto sizes = gr->approximationSizes();
   if (sizes.size() > 0 && forceLastExtentOdd && (sizes.back() % 2) == 0) {
     sizes.back() += 1;
   }
   promotedDecls_[groupId] = PromotedDecl{tensorId, sizes, kind};
+  insertCopiesUnder(*this, tree, *gr, tensorId, groupId);
 
   // FIXME: we can now store a unique pointer...
   auto group = std::shared_ptr<TensorReferenceGroup>(std::move(gr));
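The scop.cc reordering exists because insertCopiesUnder now consults the promoted declaration for groupId (via scop.promotedDecl(groupId) in the hunk above) to learn which dimensions have size 1, so the entry must already be present in promotedDecls_. A reduced sketch of that dependency, using hypothetical names and a std::map in place of the real Scop state:

#include <map>
#include <string>
#include <vector>

// Hypothetical, reduced model of the ordering constraint: the declaration
// must be registered before the copy-insertion step looks it up.
struct PromotedDecl {
  std::vector<long> sizes;
};

std::map<std::string, PromotedDecl> promotedDecls;

// Stand-in for insertCopiesUnder: it now needs the sizes of the promoted
// declaration to drop size-1 dimensions from the copy schedule.
void insertCopies(const std::string& groupId) {
  const auto& decl = promotedDecls.at(groupId);  // throws if not registered yet
  (void)decl;
}

int main() {
  const std::string groupId = "_A_0";  // illustrative group identifier
  // Correct order after the change: register the declaration first...
  promotedDecls[groupId] = PromotedDecl{{1, 64, 1, 8}};
  // ...then insert the copies, which consult it.
  insertCopies(groupId);
}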
