@@ -2184,6 +2184,29 @@ class BoUpSLP {
21842184 const DataLayout &DL,
21852185 ScalarEvolution &SE,
21862186 const BoUpSLP &R);
2187+
2188+ // / Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
2189+ // / users of \p TE and collects the stores. It returns the map from the store
2190+ // / pointers to the collected stores.
2191+ DenseMap<Value *, SmallVector<StoreInst *, 4 >>
2192+ collectUserStores (const BoUpSLP::TreeEntry *TE) const ;
2193+
2194+ // / Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
2195+ // / stores in \p StoresVec can form a vector instruction. If so it returns true
2196+ // / and populates \p ReorderIndices with the shuffle indices of the stores
2197+ // / when compared to the sorted vector.
2198+ bool CanFormVector (const SmallVector<StoreInst *, 4 > &StoresVec,
2199+ OrdersType &ReorderIndices) const ;
2200+
2201+ // / Iterates through the users of \p TE, looking for scalar stores that can be
2202+ // / potentially vectorized in a future SLP-tree. If found, it keeps track of
2203+ // / their order and builds an order index vector for each store bundle. It
2204+ // / returns all these order vectors found.
2205+ // / We run this after the tree has formed, otherwise we may come across user
2206+ // / instructions that are not yet in the tree.
2207+ SmallVector<OrdersType, 1 >
2208+ findExternalStoreUsersReorderIndices (TreeEntry *TE) const ;
2209+
21872210 struct TreeEntry {
21882211 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8 >;
21892212 TreeEntry (VecTreeTy &Container) : Container(Container) {}
@@ -3584,11 +3607,25 @@ void BoUpSLP::reorderTopToBottom() {
35843607 // ExtractElement gather nodes which can be vectorized and need to handle
35853608 // their ordering.
35863609 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
3610+
3611+ // Maps a TreeEntry to the reorder indices of external users.
3612+ DenseMap<const TreeEntry *, SmallVector<OrdersType, 1 >>
3613+ ExternalUserReorderMap;
35873614 // Find all reorderable nodes with the given VF.
35883615 // Currently they are vectorized stores, loads, extracts + some gathering of
35893616 // extracts.
3590- for_each (VectorizableTree, [this , &VFToOrderedEntries, &GathersToOrders](
3617+ for_each (VectorizableTree, [this , &VFToOrderedEntries, &GathersToOrders,
3618+ &ExternalUserReorderMap](
35913619 const std::unique_ptr<TreeEntry> &TE) {
3620+ // Look for external users that will probably be vectorized.
3621+ SmallVector<OrdersType, 1 > ExternalUserReorderIndices =
3622+ findExternalStoreUsersReorderIndices (TE.get ());
3623+ if (!ExternalUserReorderIndices.empty ()) {
3624+ VFToOrderedEntries[TE->Scalars .size ()].insert (TE.get ());
3625+ ExternalUserReorderMap.try_emplace (TE.get (),
3626+ std::move (ExternalUserReorderIndices));
3627+ }
3628+
35923629 if (Optional<OrdersType> CurrentOrder =
35933630 getReorderingData (*TE, /* TopToBottom=*/ true )) {
35943631 // Do not include ordering for nodes used in the alt opcode vectorization,
@@ -3643,10 +3680,23 @@ void BoUpSLP::reorderTopToBottom() {
36433680 continue ;
36443681 // Count number of orders uses.
36453682 const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & {
3646- if (OpTE->State == TreeEntry::NeedToGather)
3647- return GathersToOrders.find (OpTE)->second ;
3683+ if (OpTE->State == TreeEntry::NeedToGather) {
3684+ auto It = GathersToOrders.find (OpTE);
3685+ if (It != GathersToOrders.end ())
3686+ return It->second ;
3687+ }
36483688 return OpTE->ReorderIndices ;
36493689 }();
3690+ // First consider the order of the external scalar users.
3691+ auto It = ExternalUserReorderMap.find (OpTE);
3692+ if (It != ExternalUserReorderMap.end ()) {
3693+ const auto &ExternalUserReorderIndices = It->second ;
3694+ for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
3695+ ++OrdersUses.insert (std::make_pair (ExtOrder, 0 )).first ->second ;
3696+ // No other useful reorder data in this entry.
3697+ if (Order.empty ())
3698+ continue ;
3699+ }
36503700 // Stores actually store the mask, not the order, need to invert.
36513701 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle () &&
36523702 OpTE->getOpcode () == Instruction::Store && !Order.empty ()) {
@@ -4078,6 +4128,152 @@ void BoUpSLP::buildExternalUses(
40784128 }
40794129}
40804130
4131+ DenseMap<Value *, SmallVector<StoreInst *, 4 >>
4132+ BoUpSLP::collectUserStores (const BoUpSLP::TreeEntry *TE) const {
4133+ DenseMap<Value *, SmallVector<StoreInst *, 4 >> PtrToStoresMap;
4134+ for (unsigned Lane : seq<unsigned >(0 , TE->Scalars .size ())) {
4135+ Value *V = TE->Scalars [Lane];
4136+ // To save compilation time we don't visit if we have too many users.
4137+ static constexpr unsigned UsersLimit = 4 ;
4138+ if (V->hasNUsesOrMore (UsersLimit))
4139+ break ;
4140+
4141+ // Collect stores per pointer object.
4142+ for (User *U : V->users ()) {
4143+ auto *SI = dyn_cast<StoreInst>(U);
4144+ if (SI == nullptr || !SI->isSimple () ||
4145+ !isValidElementType (SI->getValueOperand ()->getType ()))
4146+ continue ;
4147+ // Skip entry if already
4148+ if (getTreeEntry (U))
4149+ continue ;
4150+
4151+ Value *Ptr = getUnderlyingObject (SI->getPointerOperand ());
4152+ auto &StoresVec = PtrToStoresMap[Ptr];
4153+ // For now just keep one store per pointer object per lane.
4154+ // TODO: Extend this to support multiple stores per pointer per lane
4155+ if (StoresVec.size () > Lane)
4156+ continue ;
4157+ // Skip if in different BBs.
4158+ if (!StoresVec.empty () &&
4159+ SI->getParent () != StoresVec.back ()->getParent ())
4160+ continue ;
4161+ // Make sure that the stores are of the same type.
4162+ if (!StoresVec.empty () &&
4163+ SI->getValueOperand ()->getType () !=
4164+ StoresVec.back ()->getValueOperand ()->getType ())
4165+ continue ;
4166+ StoresVec.push_back (SI);
4167+ }
4168+ }
4169+ return PtrToStoresMap;
4170+ }
4171+
4172+ bool BoUpSLP::CanFormVector (const SmallVector<StoreInst *, 4 > &StoresVec,
4173+ OrdersType &ReorderIndices) const {
4174+ // We check whether the stores in StoreVec can form a vector by sorting them
4175+ // and checking whether they are consecutive.
4176+
4177+ // To avoid calling getPointersDiff() while sorting we create a vector of
4178+ // pairs {store, offset from first} and sort this instead.
4179+ SmallVector<std::pair<StoreInst *, int >, 4 > StoreOffsetVec (StoresVec.size ());
4180+ StoreInst *S0 = StoresVec[0 ];
4181+ StoreOffsetVec[0 ] = {S0, 0 };
4182+ Type *S0Ty = S0->getValueOperand ()->getType ();
4183+ Value *S0Ptr = S0->getPointerOperand ();
4184+ for (unsigned Idx : seq<unsigned >(1 , StoresVec.size ())) {
4185+ StoreInst *SI = StoresVec[Idx];
4186+ Optional<int > Diff =
4187+ getPointersDiff (S0Ty, S0Ptr, SI->getValueOperand ()->getType (),
4188+ SI->getPointerOperand (), *DL, *SE,
4189+ /* StrictCheck=*/ true );
4190+ // We failed to compare the pointers so just abandon this StoresVec.
4191+ if (!Diff)
4192+ return false ;
4193+ StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
4194+ }
4195+
4196+ // Sort the vector based on the pointers. We create a copy because we may
4197+ // need the original later for calculating the reorder (shuffle) indices.
4198+ stable_sort (StoreOffsetVec, [](const std::pair<StoreInst *, int > &Pair1,
4199+ const std::pair<StoreInst *, int > &Pair2) {
4200+ int Offset1 = Pair1.second ;
4201+ int Offset2 = Pair2.second ;
4202+ return Offset1 < Offset2;
4203+ });
4204+
4205+ // Check if the stores are consecutive by checking if last-first == size-1.
4206+ int LastOffset = StoreOffsetVec.back ().second ;
4207+ int FirstOffset = StoreOffsetVec.front ().second ;
4208+ if (LastOffset - FirstOffset != (int )StoreOffsetVec.size () - 1 )
4209+ return false ;
4210+
4211+ // Calculate the shuffle indices according to their offset against the sorted
4212+ // StoreOffsetVec.
4213+ ReorderIndices.reserve (StoresVec.size ());
4214+ for (StoreInst *SI : StoresVec) {
4215+ unsigned Idx = find_if (StoreOffsetVec,
4216+ [SI](const std::pair<StoreInst *, int > &Pair) {
4217+ return Pair.first == SI;
4218+ }) -
4219+ StoreOffsetVec.begin ();
4220+ ReorderIndices.push_back (Idx);
4221+ }
4222+ // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
4223+ // reorderTopToBottom() and reorderBottomToTop(), so we are following the
4224+ // same convention here.
4225+ auto IsIdentityOrder = [](const OrdersType &Order) {
4226+ for (unsigned Idx : seq<unsigned >(0 , Order.size ()))
4227+ if (Idx != Order[Idx])
4228+ return false ;
4229+ return true ;
4230+ };
4231+ if (IsIdentityOrder (ReorderIndices))
4232+ ReorderIndices.clear ();
4233+
4234+ return true ;
4235+ }
4236+
4237+ #ifndef NDEBUG
4238+ LLVM_DUMP_METHOD static void dumpOrder (const BoUpSLP::OrdersType &Order) {
4239+ for (unsigned Idx : Order)
4240+ dbgs () << Idx << " , " ;
4241+ dbgs () << " \n " ;
4242+ }
4243+ #endif
4244+
4245+ SmallVector<BoUpSLP::OrdersType, 1 >
4246+ BoUpSLP::findExternalStoreUsersReorderIndices (TreeEntry *TE) const {
4247+ unsigned NumLanes = TE->Scalars .size ();
4248+
4249+ DenseMap<Value *, SmallVector<StoreInst *, 4 >> PtrToStoresMap =
4250+ collectUserStores (TE);
4251+
4252+ // Holds the reorder indices for each candidate store vector that is a user of
4253+ // the current TreeEntry.
4254+ SmallVector<OrdersType, 1 > ExternalReorderIndices;
4255+
4256+ // Now inspect the stores collected per pointer and look for vectorization
4257+ // candidates. For each candidate calculate the reorder index vector and push
4258+ // it into `ExternalReorderIndices`
4259+ for (const auto &Pair : PtrToStoresMap) {
4260+ auto &StoresVec = Pair.second ;
4261+ // If we have fewer than NumLanes stores, then we can't form a vector.
4262+ if (StoresVec.size () != NumLanes)
4263+ continue ;
4264+
4265+ // If the stores are not consecutive then abandon this StoresVec.
4266+ OrdersType ReorderIndices;
4267+ if (!CanFormVector (StoresVec, ReorderIndices))
4268+ continue ;
4269+
4270+ // We now know that the scalars in StoresVec can form a vector instruction,
4271+ // so set the reorder indices.
4272+ ExternalReorderIndices.push_back (ReorderIndices);
4273+ }
4274+ return ExternalReorderIndices;
4275+ }
4276+
40814277void BoUpSLP::buildTree (ArrayRef<Value *> Roots,
40824278 ArrayRef<Value *> UserIgnoreLst) {
40834279 deleteTree ();
0 commit comments