1 //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 // This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10 // stores that can be put together into vector-stores. Next, it attempts to
11 // construct vectorizable tree using the use-def chains. If a profitable tree
12 // was found, the SLP vectorizer performs vectorization on the tree.
13 //
14 // The pass is inspired by the work described in the paper:
15 // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16 //
17 //===----------------------------------------------------------------------===//
18 #include "llvm/Transforms/Vectorize.h"
19 #include "llvm/ADT/MapVector.h"
20 #include "llvm/ADT/Optional.h"
21 #include "llvm/ADT/PostOrderIterator.h"
22 #include "llvm/ADT/SetVector.h"
23 #include "llvm/ADT/Statistic.h"
24 #include "llvm/Analysis/AliasAnalysis.h"
25 #include "llvm/Analysis/AssumptionCache.h"
26 #include "llvm/Analysis/CodeMetrics.h"
27 #include "llvm/Analysis/LoopInfo.h"
28 #include "llvm/Analysis/ScalarEvolution.h"
29 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
30 #include "llvm/Analysis/TargetTransformInfo.h"
31 #include "llvm/Analysis/ValueTracking.h"
32 #include "llvm/IR/DataLayout.h"
33 #include "llvm/IR/Dominators.h"
34 #include "llvm/IR/IRBuilder.h"
35 #include "llvm/IR/Instructions.h"
36 #include "llvm/IR/IntrinsicInst.h"
37 #include "llvm/IR/Module.h"
38 #include "llvm/IR/NoFolder.h"
39 #include "llvm/IR/Type.h"
40 #include "llvm/IR/Value.h"
41 #include "llvm/IR/Verifier.h"
42 #include "llvm/Pass.h"
43 #include "llvm/Support/CommandLine.h"
44 #include "llvm/Support/Debug.h"
45 #include "llvm/Support/raw_ostream.h"
46 #include "llvm/Analysis/VectorUtils.h"
47 #include <algorithm>
48 #include <map>
49 #include <memory>
50
51 using namespace llvm;
52
53 #define SV_NAME "slp-vectorizer"
54 #define DEBUG_TYPE "SLP"
55
56 STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
57
58 static cl::opt<int>
59 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
60 cl::desc("Only vectorize if you gain more than this "
61 "number "));
62
63 static cl::opt<bool>
64 ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden,
65 cl::desc("Attempt to vectorize horizontal reductions"));
66
67 static cl::opt<bool> ShouldStartVectorizeHorAtStore(
68 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
69 cl::desc(
70 "Attempt to vectorize horizontal reductions feeding into a store"));
71
72 static cl::opt<int>
73 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
74 cl::desc("Attempt to vectorize for this register size in bits"));
75
76 namespace {
77
// Minimum vector register size considered by the pass.
// NOTE(review): presumably in bits, mirroring -slp-max-reg-size -- confirm.
// FIXME: Set this via cl::opt to allow overriding.
static const unsigned MinVecRegSize = 128;

// NOTE(review): presumably the depth cut-off for the recursive tree
// construction (buildTree_rec takes a Depth argument) -- confirm at the use.
static const unsigned RecursionMaxDepth = 12;

// Upper bound on the number of alias checks. The value was chosen so that it
// has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;

// A second bound for the alias checks: the maximum distance between
// load/store instructions for which alias checks are still done. This keeps
// very large basic blocks manageable.
static const unsigned MaxMemDepDistance = 160;
91
92 /// \brief Predicate for the element types that the SLP vectorizer supports.
93 ///
94 /// The most important thing to filter here are types which are invalid in LLVM
95 /// vectors. We also filter target specific types which have absolutely no
96 /// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
97 /// avoids spending time checking the cost model and realizing that they will
98 /// be inevitably scalarized.
isValidElementType(Type * Ty)99 static bool isValidElementType(Type *Ty) {
100 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
101 !Ty->isPPC_FP128Ty();
102 }
103
104 /// \returns the parent basic block if all of the instructions in \p VL
105 /// are in the same block or null otherwise.
getSameBlock(ArrayRef<Value * > VL)106 static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
107 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
108 if (!I0)
109 return nullptr;
110 BasicBlock *BB = I0->getParent();
111 for (int i = 1, e = VL.size(); i < e; i++) {
112 Instruction *I = dyn_cast<Instruction>(VL[i]);
113 if (!I)
114 return nullptr;
115
116 if (BB != I->getParent())
117 return nullptr;
118 }
119 return BB;
120 }
121
122 /// \returns True if all of the values in \p VL are constants.
allConstant(ArrayRef<Value * > VL)123 static bool allConstant(ArrayRef<Value *> VL) {
124 for (unsigned i = 0, e = VL.size(); i < e; ++i)
125 if (!isa<Constant>(VL[i]))
126 return false;
127 return true;
128 }
129
130 /// \returns True if all of the values in \p VL are identical.
isSplat(ArrayRef<Value * > VL)131 static bool isSplat(ArrayRef<Value *> VL) {
132 for (unsigned i = 1, e = VL.size(); i < e; ++i)
133 if (VL[i] != VL[0])
134 return false;
135 return true;
136 }
137
138 ///\returns Opcode that can be clubbed with \p Op to create an alternate
139 /// sequence which can later be merged as a ShuffleVector instruction.
getAltOpcode(unsigned Op)140 static unsigned getAltOpcode(unsigned Op) {
141 switch (Op) {
142 case Instruction::FAdd:
143 return Instruction::FSub;
144 case Instruction::FSub:
145 return Instruction::FAdd;
146 case Instruction::Add:
147 return Instruction::Sub;
148 case Instruction::Sub:
149 return Instruction::Add;
150 default:
151 return 0;
152 }
153 }
154
155 ///\returns bool representing if Opcode \p Op can be part
156 /// of an alternate sequence which can later be merged as
157 /// a ShuffleVector instruction.
canCombineAsAltInst(unsigned Op)158 static bool canCombineAsAltInst(unsigned Op) {
159 if (Op == Instruction::FAdd || Op == Instruction::FSub ||
160 Op == Instruction::Sub || Op == Instruction::Add)
161 return true;
162 return false;
163 }
164
165 /// \returns ShuffleVector instruction if intructions in \p VL have
166 /// alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence.
167 /// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...)
isAltInst(ArrayRef<Value * > VL)168 static unsigned isAltInst(ArrayRef<Value *> VL) {
169 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
170 unsigned Opcode = I0->getOpcode();
171 unsigned AltOpcode = getAltOpcode(Opcode);
172 for (int i = 1, e = VL.size(); i < e; i++) {
173 Instruction *I = dyn_cast<Instruction>(VL[i]);
174 if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
175 return 0;
176 }
177 return Instruction::ShuffleVector;
178 }
179
180 /// \returns The opcode if all of the Instructions in \p VL have the same
181 /// opcode, or zero.
getSameOpcode(ArrayRef<Value * > VL)182 static unsigned getSameOpcode(ArrayRef<Value *> VL) {
183 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
184 if (!I0)
185 return 0;
186 unsigned Opcode = I0->getOpcode();
187 for (int i = 1, e = VL.size(); i < e; i++) {
188 Instruction *I = dyn_cast<Instruction>(VL[i]);
189 if (!I || Opcode != I->getOpcode()) {
190 if (canCombineAsAltInst(Opcode) && i == 1)
191 return isAltInst(VL);
192 return 0;
193 }
194 }
195 return Opcode;
196 }
197
198 /// Get the intersection (logical and) of all of the potential IR flags
199 /// of each scalar operation (VL) that will be converted into a vector (I).
200 /// Flag set: NSW, NUW, exact, and all of fast-math.
propagateIRFlags(Value * I,ArrayRef<Value * > VL)201 static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
202 if (auto *VecOp = dyn_cast<BinaryOperator>(I)) {
203 if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) {
204 // Intersection is initialized to the 0th scalar,
205 // so start counting from index '1'.
206 for (int i = 1, e = VL.size(); i < e; ++i) {
207 if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i]))
208 Intersection->andIRFlags(Scalar);
209 }
210 VecOp->copyIRFlags(Intersection);
211 }
212 }
213 }
214
215 /// \returns \p I after propagating metadata from \p VL.
propagateMetadata(Instruction * I,ArrayRef<Value * > VL)216 static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
217 Instruction *I0 = cast<Instruction>(VL[0]);
218 SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
219 I0->getAllMetadataOtherThanDebugLoc(Metadata);
220
221 for (unsigned i = 0, n = Metadata.size(); i != n; ++i) {
222 unsigned Kind = Metadata[i].first;
223 MDNode *MD = Metadata[i].second;
224
225 for (int i = 1, e = VL.size(); MD && i != e; i++) {
226 Instruction *I = cast<Instruction>(VL[i]);
227 MDNode *IMD = I->getMetadata(Kind);
228
229 switch (Kind) {
230 default:
231 MD = nullptr; // Remove unknown metadata
232 break;
233 case LLVMContext::MD_tbaa:
234 MD = MDNode::getMostGenericTBAA(MD, IMD);
235 break;
236 case LLVMContext::MD_alias_scope:
237 MD = MDNode::getMostGenericAliasScope(MD, IMD);
238 break;
239 case LLVMContext::MD_noalias:
240 MD = MDNode::intersect(MD, IMD);
241 break;
242 case LLVMContext::MD_fpmath:
243 MD = MDNode::getMostGenericFPMath(MD, IMD);
244 break;
245 }
246 }
247 I->setMetadata(Kind, MD);
248 }
249 return I;
250 }
251
252 /// \returns The type that all of the values in \p VL have or null if there
253 /// are different types.
getSameType(ArrayRef<Value * > VL)254 static Type* getSameType(ArrayRef<Value *> VL) {
255 Type *Ty = VL[0]->getType();
256 for (int i = 1, e = VL.size(); i < e; i++)
257 if (VL[i]->getType() != Ty)
258 return nullptr;
259
260 return Ty;
261 }
262
263 /// \returns True if the ExtractElement instructions in VL can be vectorized
264 /// to use the original vector.
CanReuseExtract(ArrayRef<Value * > VL)265 static bool CanReuseExtract(ArrayRef<Value *> VL) {
266 assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
267 // Check if all of the extracts come from the same vector and from the
268 // correct offset.
269 Value *VL0 = VL[0];
270 ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
271 Value *Vec = E0->getOperand(0);
272
273 // We have to extract from the same vector type.
274 unsigned NElts = Vec->getType()->getVectorNumElements();
275
276 if (NElts != VL.size())
277 return false;
278
279 // Check that all of the indices extract from the correct offset.
280 ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
281 if (!CI || CI->getZExtValue())
282 return false;
283
284 for (unsigned i = 1, e = VL.size(); i < e; ++i) {
285 ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
286 ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
287
288 if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
289 return false;
290 }
291
292 return true;
293 }
294
295 /// \returns True if in-tree use also needs extract. This refers to
296 /// possible scalar operand in vectorized instruction.
InTreeUserNeedToExtract(Value * Scalar,Instruction * UserInst,TargetLibraryInfo * TLI)297 static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
298 TargetLibraryInfo *TLI) {
299
300 unsigned Opcode = UserInst->getOpcode();
301 switch (Opcode) {
302 case Instruction::Load: {
303 LoadInst *LI = cast<LoadInst>(UserInst);
304 return (LI->getPointerOperand() == Scalar);
305 }
306 case Instruction::Store: {
307 StoreInst *SI = cast<StoreInst>(UserInst);
308 return (SI->getPointerOperand() == Scalar);
309 }
310 case Instruction::Call: {
311 CallInst *CI = cast<CallInst>(UserInst);
312 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
313 if (hasVectorInstrinsicScalarOpd(ID, 1)) {
314 return (CI->getArgOperand(1) == Scalar);
315 }
316 }
317 default:
318 return false;
319 }
320 }
321
322 /// \returns the AA location that is being access by the instruction.
getLocation(Instruction * I,AliasAnalysis * AA)323 static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
324 if (StoreInst *SI = dyn_cast<StoreInst>(I))
325 return MemoryLocation::get(SI);
326 if (LoadInst *LI = dyn_cast<LoadInst>(I))
327 return MemoryLocation::get(LI);
328 return MemoryLocation();
329 }
330
331 /// \returns True if the instruction is not a volatile or atomic load/store.
isSimple(Instruction * I)332 static bool isSimple(Instruction *I) {
333 if (LoadInst *LI = dyn_cast<LoadInst>(I))
334 return LI->isSimple();
335 if (StoreInst *SI = dyn_cast<StoreInst>(I))
336 return SI->isSimple();
337 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
338 return !MI->isVolatile();
339 return true;
340 }
341
342 /// Bottom Up SLP Vectorizer.
343 class BoUpSLP {
344 public:
345 typedef SmallVector<Value *, 8> ValueList;
346 typedef SmallVector<Instruction *, 16> InstrList;
347 typedef SmallPtrSet<Value *, 16> ValueSet;
348 typedef SmallVector<StoreInst *, 8> StoreList;
349
/// Set up a vectorizer for the function \p Func.
///
/// The analyses are stored as given. The load-order counters start at zero
/// and the IR builder is created on the context obtained from \p Se. The
/// ephemeral values of the function are collected once, right here, using
/// \p AC.
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
        TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
        DominatorTree *Dt, AssumptionCache *AC)
    : NumLoadsWantToKeepOrder(0),
      NumLoadsWantToChangeOrder(0),
      F(Func),
      SE(Se),
      TTI(Tti),
      TLI(TLi),
      AA(Aa),
      LI(Li),
      DT(Dt),
      Builder(Se->getContext()) {
  CodeMetrics::collectEphemeralValues(F, AC, EphValues);
}
358
359 /// \brief Vectorize the tree that starts with the elements in \p VL.
360 /// Returns the vectorized root.
361 Value *vectorizeTree();
362
363 /// \returns the cost incurred by unwanted spills and fills, caused by
364 /// holding live values over call sites.
365 int getSpillCost();
366
367 /// \returns the vectorization cost of the subtree that starts at \p VL.
368 /// A negative number means that this is profitable.
369 int getTreeCost();
370
371 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
372 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
373 void buildTree(ArrayRef<Value *> Roots,
374 ArrayRef<Value *> UserIgnoreLst = None);
375
376 /// Clear the internal data structures that are created by 'buildTree'.
deleteTree()377 void deleteTree() {
378 VectorizableTree.clear();
379 ScalarToTreeEntry.clear();
380 MustGather.clear();
381 ExternalUses.clear();
382 NumLoadsWantToKeepOrder = 0;
383 NumLoadsWantToChangeOrder = 0;
384 for (auto &Iter : BlocksSchedules) {
385 BlockScheduling *BS = Iter.second.get();
386 BS->clear();
387 }
388 }
389
390 /// \returns true if the memory operations A and B are consecutive.
391 bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL);
392
393 /// \brief Perform LICM and CSE on the newly generated gather sequences.
394 void optimizeGatherSequence();
395
396 /// \returns true if it is benefitial to reverse the vector order.
shouldReorder() const397 bool shouldReorder() const {
398 return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
399 }
400
401 private:
402 struct TreeEntry;
403
404 /// \returns the cost of the vectorizable entry.
405 int getEntryCost(TreeEntry *E);
406
407 /// This is the recursive part of buildTree.
408 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
409
410 /// Vectorize a single entry in the tree.
411 Value *vectorizeTree(TreeEntry *E);
412
413 /// Vectorize a single entry in the tree, starting in \p VL.
414 Value *vectorizeTree(ArrayRef<Value *> VL);
415
416 /// \returns the pointer to the vectorized value if \p VL is already
417 /// vectorized, or NULL. They may happen in cycles.
418 Value *alreadyVectorized(ArrayRef<Value *> VL) const;
419
420 /// \brief Take the pointer operand from the Load/Store instruction.
421 /// \returns NULL if this is not a valid Load/Store instruction.
422 static Value *getPointerOperand(Value *I);
423
424 /// \brief Take the address space operand from the Load/Store instruction.
425 /// \returns -1 if this is not a valid Load/Store instruction.
426 static unsigned getAddressSpaceOperand(Value *I);
427
428 /// \returns the scalarization cost for this type. Scalarization in this
429 /// context means the creation of vectors from a group of scalars.
430 int getGatherCost(Type *Ty);
431
432 /// \returns the scalarization cost for this list of values. Assuming that
433 /// this subtree gets vectorized, we may need to extract the values from the
434 /// roots. This method calculates the cost of extracting the values.
435 int getGatherCost(ArrayRef<Value *> VL);
436
437 /// \brief Set the Builder insert point to one after the last instruction in
438 /// the bundle
439 void setInsertPointAfterBundle(ArrayRef<Value *> VL);
440
441 /// \returns a vector from a collection of scalars in \p VL.
442 Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
443
/// \returns whether the VectorizableTree is fully vectorizable and will
/// be beneficial even if the tree height is tiny.
446 bool isFullyVectorizableTinyTree();
447
/// Reorder commutative operands in alt shuffle if they result in
/// vectorized code.
450 void reorderAltShuffleOperands(ArrayRef<Value *> VL,
451 SmallVectorImpl<Value *> &Left,
452 SmallVectorImpl<Value *> &Right);
/// Reorder commutative operands to get better probability of
/// generating vectorized code.
455 void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
456 SmallVectorImpl<Value *> &Left,
457 SmallVectorImpl<Value *> &Right);
458 struct TreeEntry {
TreeEntry__anonc602e1830111::BoUpSLP::TreeEntry459 TreeEntry() : Scalars(), VectorizedValue(nullptr),
460 NeedToGather(0) {}
461
462 /// \returns true if the scalars in VL are equal to this entry.
isSame__anonc602e1830111::BoUpSLP::TreeEntry463 bool isSame(ArrayRef<Value *> VL) const {
464 assert(VL.size() == Scalars.size() && "Invalid size");
465 return std::equal(VL.begin(), VL.end(), Scalars.begin());
466 }
467
468 /// A vector of scalars.
469 ValueList Scalars;
470
471 /// The Scalars are vectorized into this value. It is initialized to Null.
472 Value *VectorizedValue;
473
474 /// Do we need to gather this sequence ?
475 bool NeedToGather;
476 };
477
478 /// Create a new VectorizableTree entry.
newTreeEntry(ArrayRef<Value * > VL,bool Vectorized)479 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
480 VectorizableTree.emplace_back();
481 int idx = VectorizableTree.size() - 1;
482 TreeEntry *Last = &VectorizableTree[idx];
483 Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
484 Last->NeedToGather = !Vectorized;
485 if (Vectorized) {
486 for (int i = 0, e = VL.size(); i != e; ++i) {
487 assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
488 ScalarToTreeEntry[VL[i]] = idx;
489 }
490 } else {
491 MustGather.insert(VL.begin(), VL.end());
492 }
493 return Last;
494 }
495
496 /// -- Vectorization State --
497 /// Holds all of the tree entries.
498 std::vector<TreeEntry> VectorizableTree;
499
500 /// Maps a specific scalar to its tree entry.
501 SmallDenseMap<Value*, int> ScalarToTreeEntry;
502
503 /// A list of scalars that we found that we need to keep as scalars.
504 ValueSet MustGather;
505
506 /// This POD struct describes one external user in the vectorized tree.
507 struct ExternalUser {
ExternalUser__anonc602e1830111::BoUpSLP::ExternalUser508 ExternalUser (Value *S, llvm::User *U, int L) :
509 Scalar(S), User(U), Lane(L){};
510 // Which scalar in our function.
511 Value *Scalar;
512 // Which user that uses the scalar.
513 llvm::User *User;
514 // Which lane does the scalar belong to.
515 int Lane;
516 };
517 typedef SmallVector<ExternalUser, 16> UserList;
518
519 /// Checks if two instructions may access the same memory.
520 ///
521 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
522 /// is invariant in the calling loop.
isAliased(const MemoryLocation & Loc1,Instruction * Inst1,Instruction * Inst2)523 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
524 Instruction *Inst2) {
525
526 // First check if the result is already in the cache.
527 AliasCacheKey key = std::make_pair(Inst1, Inst2);
528 Optional<bool> &result = AliasCache[key];
529 if (result.hasValue()) {
530 return result.getValue();
531 }
532 MemoryLocation Loc2 = getLocation(Inst2, AA);
533 bool aliased = true;
534 if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
535 // Do the alias check.
536 aliased = AA->alias(Loc1, Loc2);
537 }
538 // Store the result in the cache.
539 result = aliased;
540 return aliased;
541 }
542
543 typedef std::pair<Instruction *, Instruction *> AliasCacheKey;
544
545 /// Cache for alias results.
546 /// TODO: consider moving this to the AliasAnalysis itself.
547 DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
548
549 /// Removes an instruction from its block and eventually deletes it.
550 /// It's like Instruction::eraseFromParent() except that the actual deletion
551 /// is delayed until BoUpSLP is destructed.
552 /// This is required to ensure that there are no incorrect collisions in the
553 /// AliasCache, which can happen if a new instruction is allocated at the
554 /// same address as a previously deleted instruction.
eraseInstruction(Instruction * I)555 void eraseInstruction(Instruction *I) {
556 I->removeFromParent();
557 I->dropAllReferences();
558 DeletedInstructions.push_back(std::unique_ptr<Instruction>(I));
559 }
560
561 /// Temporary store for deleted instructions. Instructions will be deleted
562 /// eventually when the BoUpSLP is destructed.
563 SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;
564
/// A list of values that need to be extracted out of the tree.
/// This list holds pairs of (Internal Scalar : External User).
567 UserList ExternalUses;
568
569 /// Values used only by @llvm.assume calls.
570 SmallPtrSet<const Value *, 32> EphValues;
571
572 /// Holds all of the instructions that we gathered.
573 SetVector<Instruction *> GatherSeq;
574 /// A list of blocks that we are going to CSE.
575 SetVector<BasicBlock *> CSEBlocks;
576
577 /// Contains all scheduling relevant data for an instruction.
578 /// A ScheduleData either represents a single instruction or a member of an
579 /// instruction bundle (= a group of instructions which is combined into a
580 /// vector instruction).
581 struct ScheduleData {
582
583 // The initial value for the dependency counters. It means that the
584 // dependencies are not calculated yet.
585 enum { InvalidDeps = -1 };
586
ScheduleData__anonc602e1830111::BoUpSLP::ScheduleData587 ScheduleData()
588 : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
589 NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
590 Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
591 UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}
592
init__anonc602e1830111::BoUpSLP::ScheduleData593 void init(int BlockSchedulingRegionID) {
594 FirstInBundle = this;
595 NextInBundle = nullptr;
596 NextLoadStore = nullptr;
597 IsScheduled = false;
598 SchedulingRegionID = BlockSchedulingRegionID;
599 UnscheduledDepsInBundle = UnscheduledDeps;
600 clearDependencies();
601 }
602
603 /// Returns true if the dependency information has been calculated.
hasValidDependencies__anonc602e1830111::BoUpSLP::ScheduleData604 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
605
606 /// Returns true for single instructions and for bundle representatives
607 /// (= the head of a bundle).
isSchedulingEntity__anonc602e1830111::BoUpSLP::ScheduleData608 bool isSchedulingEntity() const { return FirstInBundle == this; }
609
610 /// Returns true if it represents an instruction bundle and not only a
611 /// single instruction.
isPartOfBundle__anonc602e1830111::BoUpSLP::ScheduleData612 bool isPartOfBundle() const {
613 return NextInBundle != nullptr || FirstInBundle != this;
614 }
615
616 /// Returns true if it is ready for scheduling, i.e. it has no more
617 /// unscheduled depending instructions/bundles.
isReady__anonc602e1830111::BoUpSLP::ScheduleData618 bool isReady() const {
619 assert(isSchedulingEntity() &&
620 "can't consider non-scheduling entity for ready list");
621 return UnscheduledDepsInBundle == 0 && !IsScheduled;
622 }
623
624 /// Modifies the number of unscheduled dependencies, also updating it for
625 /// the whole bundle.
incrementUnscheduledDeps__anonc602e1830111::BoUpSLP::ScheduleData626 int incrementUnscheduledDeps(int Incr) {
627 UnscheduledDeps += Incr;
628 return FirstInBundle->UnscheduledDepsInBundle += Incr;
629 }
630
631 /// Sets the number of unscheduled dependencies to the number of
632 /// dependencies.
resetUnscheduledDeps__anonc602e1830111::BoUpSLP::ScheduleData633 void resetUnscheduledDeps() {
634 incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
635 }
636
637 /// Clears all dependency information.
clearDependencies__anonc602e1830111::BoUpSLP::ScheduleData638 void clearDependencies() {
639 Dependencies = InvalidDeps;
640 resetUnscheduledDeps();
641 MemoryDependencies.clear();
642 }
643
dump__anonc602e1830111::BoUpSLP::ScheduleData644 void dump(raw_ostream &os) const {
645 if (!isSchedulingEntity()) {
646 os << "/ " << *Inst;
647 } else if (NextInBundle) {
648 os << '[' << *Inst;
649 ScheduleData *SD = NextInBundle;
650 while (SD) {
651 os << ';' << *SD->Inst;
652 SD = SD->NextInBundle;
653 }
654 os << ']';
655 } else {
656 os << *Inst;
657 }
658 }
659
660 Instruction *Inst;
661
662 /// Points to the head in an instruction bundle (and always to this for
663 /// single instructions).
664 ScheduleData *FirstInBundle;
665
666 /// Single linked list of all instructions in a bundle. Null if it is a
667 /// single instruction.
668 ScheduleData *NextInBundle;
669
670 /// Single linked list of all memory instructions (e.g. load, store, call)
671 /// in the block - until the end of the scheduling region.
672 ScheduleData *NextLoadStore;
673
674 /// The dependent memory instructions.
675 /// This list is derived on demand in calculateDependencies().
676 SmallVector<ScheduleData *, 4> MemoryDependencies;
677
678 /// This ScheduleData is in the current scheduling region if this matches
679 /// the current SchedulingRegionID of BlockScheduling.
680 int SchedulingRegionID;
681
682 /// Used for getting a "good" final ordering of instructions.
683 int SchedulingPriority;
684
685 /// The number of dependencies. Constitutes of the number of users of the
686 /// instruction plus the number of dependent memory instructions (if any).
687 /// This value is calculated on demand.
688 /// If InvalidDeps, the number of dependencies is not calculated yet.
689 ///
690 int Dependencies;
691
692 /// The number of dependencies minus the number of dependencies of scheduled
693 /// instructions. As soon as this is zero, the instruction/bundle gets ready
694 /// for scheduling.
695 /// Note that this is negative as long as Dependencies is not calculated.
696 int UnscheduledDeps;
697
698 /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
699 /// single instructions.
700 int UnscheduledDepsInBundle;
701
702 /// True if this instruction is scheduled (or considered as scheduled in the
703 /// dry-run).
704 bool IsScheduled;
705 };
706
707 #ifndef NDEBUG
708 friend raw_ostream &operator<<(raw_ostream &os,
709 const BoUpSLP::ScheduleData &SD);
710 #endif
711
712 /// Contains all scheduling data for a basic block.
713 ///
714 struct BlockScheduling {
715
BlockScheduling__anonc602e1830111::BoUpSLP::BlockScheduling716 BlockScheduling(BasicBlock *BB)
717 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
718 ScheduleStart(nullptr), ScheduleEnd(nullptr),
719 FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
720 // Make sure that the initial SchedulingRegionID is greater than the
721 // initial SchedulingRegionID in ScheduleData (which is 0).
722 SchedulingRegionID(1) {}
723
clear__anonc602e1830111::BoUpSLP::BlockScheduling724 void clear() {
725 ReadyInsts.clear();
726 ScheduleStart = nullptr;
727 ScheduleEnd = nullptr;
728 FirstLoadStoreInRegion = nullptr;
729 LastLoadStoreInRegion = nullptr;
730
731 // Make a new scheduling region, i.e. all existing ScheduleData is not
732 // in the new region yet.
733 ++SchedulingRegionID;
734 }
735
getScheduleData__anonc602e1830111::BoUpSLP::BlockScheduling736 ScheduleData *getScheduleData(Value *V) {
737 ScheduleData *SD = ScheduleDataMap[V];
738 if (SD && SD->SchedulingRegionID == SchedulingRegionID)
739 return SD;
740 return nullptr;
741 }
742
isInSchedulingRegion__anonc602e1830111::BoUpSLP::BlockScheduling743 bool isInSchedulingRegion(ScheduleData *SD) {
744 return SD->SchedulingRegionID == SchedulingRegionID;
745 }
746
747 /// Marks an instruction as scheduled and puts all dependent ready
748 /// instructions into the ready-list.
749 template <typename ReadyListType>
schedule__anonc602e1830111::BoUpSLP::BlockScheduling750 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
751 SD->IsScheduled = true;
752 DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
753
754 ScheduleData *BundleMember = SD;
755 while (BundleMember) {
756 // Handle the def-use chain dependencies.
757 for (Use &U : BundleMember->Inst->operands()) {
758 ScheduleData *OpDef = getScheduleData(U.get());
759 if (OpDef && OpDef->hasValidDependencies() &&
760 OpDef->incrementUnscheduledDeps(-1) == 0) {
761 // There are no more unscheduled dependencies after decrementing,
762 // so we can put the dependent instruction into the ready list.
763 ScheduleData *DepBundle = OpDef->FirstInBundle;
764 assert(!DepBundle->IsScheduled &&
765 "already scheduled bundle gets ready");
766 ReadyList.insert(DepBundle);
767 DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n");
768 }
769 }
770 // Handle the memory dependencies.
771 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
772 if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
773 // There are no more unscheduled dependencies after decrementing,
774 // so we can put the dependent instruction into the ready list.
775 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
776 assert(!DepBundle->IsScheduled &&
777 "already scheduled bundle gets ready");
778 ReadyList.insert(DepBundle);
779 DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n");
780 }
781 }
782 BundleMember = BundleMember->NextInBundle;
783 }
784 }
785
786 /// Put all instructions into the ReadyList which are ready for scheduling.
787 template <typename ReadyListType>
initialFillReadyList__anonc602e1830111::BoUpSLP::BlockScheduling788 void initialFillReadyList(ReadyListType &ReadyList) {
789 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
790 ScheduleData *SD = getScheduleData(I);
791 if (SD->isSchedulingEntity() && SD->isReady()) {
792 ReadyList.insert(SD);
793 DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
794 }
795 }
796 }
797
/// Checks if a bundle of instructions can be scheduled, i.e. has no
/// cyclic dependencies. This is only a dry-run, no instructions are
/// actually moved at this stage.
bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP);

/// Un-bundles a group of instructions.
void cancelScheduling(ArrayRef<Value *> VL);

/// Extends the scheduling region so that V is inside the region.
void extendSchedulingRegion(Value *V);

/// Initialize the ScheduleData structures for new instructions in the
/// scheduling region.
void initScheduleData(Instruction *FromI, Instruction *ToI,
                      ScheduleData *PrevLoadStore,
                      ScheduleData *NextLoadStore);

/// Updates the dependency information of a bundle and of all instructions/
/// bundles which depend on the original bundle.
void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
                           BoUpSLP *SLP);

/// Sets all instructions in the scheduling region to un-scheduled.
void resetSchedule();

/// The basic block whose instructions this object schedules.
/// NOTE(review): presumably set from the constructor argument - confirm.
BasicBlock *BB;

/// Simple memory allocation for ScheduleData.
std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;

/// The size of a ScheduleData array in ScheduleDataChunks.
int ChunkSize;

/// The allocator position in the current chunk, which is the last entry
/// of ScheduleDataChunks.
int ChunkPos;

/// Attaches ScheduleData to Instruction.
/// Note that the mapping survives during all vectorization iterations, i.e.
/// ScheduleData structures are recycled.
DenseMap<Value *, ScheduleData *> ScheduleDataMap;

/// A plain vector of ScheduleData pointers that additionally offers
/// insert(), so that it can be passed to helpers which are templated on the
/// ready-list type (e.g. initialFillReadyList) and only call insert().
struct ReadyList : SmallVector<ScheduleData *, 8> {
  void insert(ScheduleData *SD) { push_back(SD); }
};

/// The ready-list for scheduling (only used for the dry-run).
ReadyList ReadyInsts;

/// The first instruction of the scheduling region.
Instruction *ScheduleStart;

/// The first instruction _after_ the scheduling region.
Instruction *ScheduleEnd;

/// The first memory accessing instruction in the scheduling region
/// (can be null).
ScheduleData *FirstLoadStoreInRegion;

/// The last memory accessing instruction in the scheduling region
/// (can be null).
ScheduleData *LastLoadStoreInRegion;

/// The ID of the scheduling region. For a new vectorization iteration this
/// is incremented which "removes" all ScheduleData from the region.
int SchedulingRegionID;
864 };
865
/// Attaches the BlockScheduling structures to basic blocks.
MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;

/// Performs the "real" scheduling. Done before vectorization is actually
/// performed in a basic block.
void scheduleBlock(BlockScheduling *BS);

/// List of users to ignore during scheduling and that don't need extracting.
/// This is a non-owning view; buildTree() stores the ArrayRef it was given.
ArrayRef<Value *> UserIgnoreList;

/// Number of load-bundles, which contain consecutive loads.
/// Incremented by buildTree_rec() for every load bundle it accepts.
int NumLoadsWantToKeepOrder;

/// Number of load-bundles of size 2, which are consecutive loads if reversed.
/// Incremented by buildTree_rec() when such a bundle is rejected.
int NumLoadsWantToChangeOrder;

// Analysis and block reference.
Function *F;
ScalarEvolution *SE;
TargetTransformInfo *TTI;
TargetLibraryInfo *TLI;
AliasAnalysis *AA;
LoopInfo *LI;
DominatorTree *DT;
/// Instruction builder to construct the vectorized tree.
IRBuilder<> Builder;
892 };
893
894 #ifndef NDEBUG
operator <<(raw_ostream & os,const BoUpSLP::ScheduleData & SD)895 raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) {
896 SD.dump(os);
897 return os;
898 }
899 #endif
900
buildTree(ArrayRef<Value * > Roots,ArrayRef<Value * > UserIgnoreLst)901 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
902 ArrayRef<Value *> UserIgnoreLst) {
903 deleteTree();
904 UserIgnoreList = UserIgnoreLst;
905 if (!getSameType(Roots))
906 return;
907 buildTree_rec(Roots, 0);
908
909 // Collect the values that we need to extract from the tree.
910 for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
911 TreeEntry *Entry = &VectorizableTree[EIdx];
912
913 // For each lane:
914 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
915 Value *Scalar = Entry->Scalars[Lane];
916
917 // No need to handle users of gathered values.
918 if (Entry->NeedToGather)
919 continue;
920
921 for (User *U : Scalar->users()) {
922 DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
923
924 Instruction *UserInst = dyn_cast<Instruction>(U);
925 if (!UserInst)
926 continue;
927
928 // Skip in-tree scalars that become vectors
929 if (ScalarToTreeEntry.count(U)) {
930 int Idx = ScalarToTreeEntry[U];
931 TreeEntry *UseEntry = &VectorizableTree[Idx];
932 Value *UseScalar = UseEntry->Scalars[0];
933 // Some in-tree scalars will remain as scalar in vectorized
934 // instructions. If that is the case, the one in Lane 0 will
935 // be used.
936 if (UseScalar != U ||
937 !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
938 DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
939 << ".\n");
940 assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
941 continue;
942 }
943 }
944
945 // Ignore users in the user ignore list.
946 if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) !=
947 UserIgnoreList.end())
948 continue;
949
950 DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
951 Lane << " from " << *Scalar << ".\n");
952 ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
953 }
954 }
955 }
956 }
957
958
buildTree_rec(ArrayRef<Value * > VL,unsigned Depth)959 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
960 bool SameTy = getSameType(VL); (void)SameTy;
961 bool isAltShuffle = false;
962 assert(SameTy && "Invalid types!");
963
964 if (Depth == RecursionMaxDepth) {
965 DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
966 newTreeEntry(VL, false);
967 return;
968 }
969
970 // Don't handle vectors.
971 if (VL[0]->getType()->isVectorTy()) {
972 DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
973 newTreeEntry(VL, false);
974 return;
975 }
976
977 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
978 if (SI->getValueOperand()->getType()->isVectorTy()) {
979 DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
980 newTreeEntry(VL, false);
981 return;
982 }
983 unsigned Opcode = getSameOpcode(VL);
984
985 // Check that this shuffle vector refers to the alternate
986 // sequence of opcodes.
987 if (Opcode == Instruction::ShuffleVector) {
988 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
989 unsigned Op = I0->getOpcode();
990 if (Op != Instruction::ShuffleVector)
991 isAltShuffle = true;
992 }
993
994 // If all of the operands are identical or constant we have a simple solution.
995 if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
996 DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
997 newTreeEntry(VL, false);
998 return;
999 }
1000
1001 // We now know that this is a vector of instructions of the same type from
1002 // the same block.
1003
1004 // Don't vectorize ephemeral values.
1005 for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1006 if (EphValues.count(VL[i])) {
1007 DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
1008 ") is ephemeral.\n");
1009 newTreeEntry(VL, false);
1010 return;
1011 }
1012 }
1013
1014 // Check if this is a duplicate of another entry.
1015 if (ScalarToTreeEntry.count(VL[0])) {
1016 int Idx = ScalarToTreeEntry[VL[0]];
1017 TreeEntry *E = &VectorizableTree[Idx];
1018 for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1019 DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
1020 if (E->Scalars[i] != VL[i]) {
1021 DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
1022 newTreeEntry(VL, false);
1023 return;
1024 }
1025 }
1026 DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
1027 return;
1028 }
1029
1030 // Check that none of the instructions in the bundle are already in the tree.
1031 for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1032 if (ScalarToTreeEntry.count(VL[i])) {
1033 DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
1034 ") is already in tree.\n");
1035 newTreeEntry(VL, false);
1036 return;
1037 }
1038 }
1039
1040 // If any of the scalars is marked as a value that needs to stay scalar then
1041 // we need to gather the scalars.
1042 for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1043 if (MustGather.count(VL[i])) {
1044 DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
1045 newTreeEntry(VL, false);
1046 return;
1047 }
1048 }
1049
1050 // Check that all of the users of the scalars that we want to vectorize are
1051 // schedulable.
1052 Instruction *VL0 = cast<Instruction>(VL[0]);
1053 BasicBlock *BB = cast<Instruction>(VL0)->getParent();
1054
1055 if (!DT->isReachableFromEntry(BB)) {
1056 // Don't go into unreachable blocks. They may contain instructions with
1057 // dependency cycles which confuse the final scheduling.
1058 DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
1059 newTreeEntry(VL, false);
1060 return;
1061 }
1062
1063 // Check that every instructions appears once in this bundle.
1064 for (unsigned i = 0, e = VL.size(); i < e; ++i)
1065 for (unsigned j = i+1; j < e; ++j)
1066 if (VL[i] == VL[j]) {
1067 DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
1068 newTreeEntry(VL, false);
1069 return;
1070 }
1071
1072 auto &BSRef = BlocksSchedules[BB];
1073 if (!BSRef) {
1074 BSRef = llvm::make_unique<BlockScheduling>(BB);
1075 }
1076 BlockScheduling &BS = *BSRef.get();
1077
1078 if (!BS.tryScheduleBundle(VL, this)) {
1079 DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
1080 BS.cancelScheduling(VL);
1081 newTreeEntry(VL, false);
1082 return;
1083 }
1084 DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
1085
1086 switch (Opcode) {
1087 case Instruction::PHI: {
1088 PHINode *PH = dyn_cast<PHINode>(VL0);
1089
1090 // Check for terminator values (e.g. invoke).
1091 for (unsigned j = 0; j < VL.size(); ++j)
1092 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
1093 TerminatorInst *Term = dyn_cast<TerminatorInst>(
1094 cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
1095 if (Term) {
1096 DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
1097 BS.cancelScheduling(VL);
1098 newTreeEntry(VL, false);
1099 return;
1100 }
1101 }
1102
1103 newTreeEntry(VL, true);
1104 DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
1105
1106 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
1107 ValueList Operands;
1108 // Prepare the operand vector.
1109 for (unsigned j = 0; j < VL.size(); ++j)
1110 Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock(
1111 PH->getIncomingBlock(i)));
1112
1113 buildTree_rec(Operands, Depth + 1);
1114 }
1115 return;
1116 }
1117 case Instruction::ExtractElement: {
1118 bool Reuse = CanReuseExtract(VL);
1119 if (Reuse) {
1120 DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
1121 } else {
1122 BS.cancelScheduling(VL);
1123 }
1124 newTreeEntry(VL, Reuse);
1125 return;
1126 }
1127 case Instruction::Load: {
1128 // Check if the loads are consecutive or of we need to swizzle them.
1129 for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
1130 LoadInst *L = cast<LoadInst>(VL[i]);
1131 if (!L->isSimple()) {
1132 BS.cancelScheduling(VL);
1133 newTreeEntry(VL, false);
1134 DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
1135 return;
1136 }
1137 const DataLayout &DL = F->getParent()->getDataLayout();
1138 if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
1139 if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) {
1140 ++NumLoadsWantToChangeOrder;
1141 }
1142 BS.cancelScheduling(VL);
1143 newTreeEntry(VL, false);
1144 DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
1145 return;
1146 }
1147 }
1148 ++NumLoadsWantToKeepOrder;
1149 newTreeEntry(VL, true);
1150 DEBUG(dbgs() << "SLP: added a vector of loads.\n");
1151 return;
1152 }
1153 case Instruction::ZExt:
1154 case Instruction::SExt:
1155 case Instruction::FPToUI:
1156 case Instruction::FPToSI:
1157 case Instruction::FPExt:
1158 case Instruction::PtrToInt:
1159 case Instruction::IntToPtr:
1160 case Instruction::SIToFP:
1161 case Instruction::UIToFP:
1162 case Instruction::Trunc:
1163 case Instruction::FPTrunc:
1164 case Instruction::BitCast: {
1165 Type *SrcTy = VL0->getOperand(0)->getType();
1166 for (unsigned i = 0; i < VL.size(); ++i) {
1167 Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
1168 if (Ty != SrcTy || !isValidElementType(Ty)) {
1169 BS.cancelScheduling(VL);
1170 newTreeEntry(VL, false);
1171 DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
1172 return;
1173 }
1174 }
1175 newTreeEntry(VL, true);
1176 DEBUG(dbgs() << "SLP: added a vector of casts.\n");
1177
1178 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1179 ValueList Operands;
1180 // Prepare the operand vector.
1181 for (unsigned j = 0; j < VL.size(); ++j)
1182 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1183
1184 buildTree_rec(Operands, Depth+1);
1185 }
1186 return;
1187 }
1188 case Instruction::ICmp:
1189 case Instruction::FCmp: {
1190 // Check that all of the compares have the same predicate.
1191 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
1192 Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
1193 for (unsigned i = 1, e = VL.size(); i < e; ++i) {
1194 CmpInst *Cmp = cast<CmpInst>(VL[i]);
1195 if (Cmp->getPredicate() != P0 ||
1196 Cmp->getOperand(0)->getType() != ComparedTy) {
1197 BS.cancelScheduling(VL);
1198 newTreeEntry(VL, false);
1199 DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
1200 return;
1201 }
1202 }
1203
1204 newTreeEntry(VL, true);
1205 DEBUG(dbgs() << "SLP: added a vector of compares.\n");
1206
1207 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1208 ValueList Operands;
1209 // Prepare the operand vector.
1210 for (unsigned j = 0; j < VL.size(); ++j)
1211 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1212
1213 buildTree_rec(Operands, Depth+1);
1214 }
1215 return;
1216 }
1217 case Instruction::Select:
1218 case Instruction::Add:
1219 case Instruction::FAdd:
1220 case Instruction::Sub:
1221 case Instruction::FSub:
1222 case Instruction::Mul:
1223 case Instruction::FMul:
1224 case Instruction::UDiv:
1225 case Instruction::SDiv:
1226 case Instruction::FDiv:
1227 case Instruction::URem:
1228 case Instruction::SRem:
1229 case Instruction::FRem:
1230 case Instruction::Shl:
1231 case Instruction::LShr:
1232 case Instruction::AShr:
1233 case Instruction::And:
1234 case Instruction::Or:
1235 case Instruction::Xor: {
1236 newTreeEntry(VL, true);
1237 DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
1238
1239 // Sort operands of the instructions so that each side is more likely to
1240 // have the same opcode.
1241 if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
1242 ValueList Left, Right;
1243 reorderInputsAccordingToOpcode(VL, Left, Right);
1244 buildTree_rec(Left, Depth + 1);
1245 buildTree_rec(Right, Depth + 1);
1246 return;
1247 }
1248
1249 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1250 ValueList Operands;
1251 // Prepare the operand vector.
1252 for (unsigned j = 0; j < VL.size(); ++j)
1253 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1254
1255 buildTree_rec(Operands, Depth+1);
1256 }
1257 return;
1258 }
1259 case Instruction::GetElementPtr: {
1260 // We don't combine GEPs with complicated (nested) indexing.
1261 for (unsigned j = 0; j < VL.size(); ++j) {
1262 if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
1263 DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
1264 BS.cancelScheduling(VL);
1265 newTreeEntry(VL, false);
1266 return;
1267 }
1268 }
1269
1270 // We can't combine several GEPs into one vector if they operate on
1271 // different types.
1272 Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
1273 for (unsigned j = 0; j < VL.size(); ++j) {
1274 Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
1275 if (Ty0 != CurTy) {
1276 DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
1277 BS.cancelScheduling(VL);
1278 newTreeEntry(VL, false);
1279 return;
1280 }
1281 }
1282
1283 // We don't combine GEPs with non-constant indexes.
1284 for (unsigned j = 0; j < VL.size(); ++j) {
1285 auto Op = cast<Instruction>(VL[j])->getOperand(1);
1286 if (!isa<ConstantInt>(Op)) {
1287 DEBUG(
1288 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
1289 BS.cancelScheduling(VL);
1290 newTreeEntry(VL, false);
1291 return;
1292 }
1293 }
1294
1295 newTreeEntry(VL, true);
1296 DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
1297 for (unsigned i = 0, e = 2; i < e; ++i) {
1298 ValueList Operands;
1299 // Prepare the operand vector.
1300 for (unsigned j = 0; j < VL.size(); ++j)
1301 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1302
1303 buildTree_rec(Operands, Depth + 1);
1304 }
1305 return;
1306 }
1307 case Instruction::Store: {
1308 const DataLayout &DL = F->getParent()->getDataLayout();
1309 // Check if the stores are consecutive or of we need to swizzle them.
1310 for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
1311 if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
1312 BS.cancelScheduling(VL);
1313 newTreeEntry(VL, false);
1314 DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
1315 return;
1316 }
1317
1318 newTreeEntry(VL, true);
1319 DEBUG(dbgs() << "SLP: added a vector of stores.\n");
1320
1321 ValueList Operands;
1322 for (unsigned j = 0; j < VL.size(); ++j)
1323 Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
1324
1325 buildTree_rec(Operands, Depth + 1);
1326 return;
1327 }
1328 case Instruction::Call: {
1329 // Check if the calls are all to the same vectorizable intrinsic.
1330 CallInst *CI = cast<CallInst>(VL[0]);
1331 // Check if this is an Intrinsic call or something that can be
1332 // represented by an intrinsic call
1333 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
1334 if (!isTriviallyVectorizable(ID)) {
1335 BS.cancelScheduling(VL);
1336 newTreeEntry(VL, false);
1337 DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
1338 return;
1339 }
1340 Function *Int = CI->getCalledFunction();
1341 Value *A1I = nullptr;
1342 if (hasVectorInstrinsicScalarOpd(ID, 1))
1343 A1I = CI->getArgOperand(1);
1344 for (unsigned i = 1, e = VL.size(); i != e; ++i) {
1345 CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
1346 if (!CI2 || CI2->getCalledFunction() != Int ||
1347 getIntrinsicIDForCall(CI2, TLI) != ID) {
1348 BS.cancelScheduling(VL);
1349 newTreeEntry(VL, false);
1350 DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
1351 << "\n");
1352 return;
1353 }
1354 // ctlz,cttz and powi are special intrinsics whose second argument
1355 // should be same in order for them to be vectorized.
1356 if (hasVectorInstrinsicScalarOpd(ID, 1)) {
1357 Value *A1J = CI2->getArgOperand(1);
1358 if (A1I != A1J) {
1359 BS.cancelScheduling(VL);
1360 newTreeEntry(VL, false);
1361 DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
1362 << " argument "<< A1I<<"!=" << A1J
1363 << "\n");
1364 return;
1365 }
1366 }
1367 }
1368
1369 newTreeEntry(VL, true);
1370 for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
1371 ValueList Operands;
1372 // Prepare the operand vector.
1373 for (unsigned j = 0; j < VL.size(); ++j) {
1374 CallInst *CI2 = dyn_cast<CallInst>(VL[j]);
1375 Operands.push_back(CI2->getArgOperand(i));
1376 }
1377 buildTree_rec(Operands, Depth + 1);
1378 }
1379 return;
1380 }
1381 case Instruction::ShuffleVector: {
1382 // If this is not an alternate sequence of opcode like add-sub
1383 // then do not vectorize this instruction.
1384 if (!isAltShuffle) {
1385 BS.cancelScheduling(VL);
1386 newTreeEntry(VL, false);
1387 DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
1388 return;
1389 }
1390 newTreeEntry(VL, true);
1391 DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
1392
1393 // Reorder operands if reordering would enable vectorization.
1394 if (isa<BinaryOperator>(VL0)) {
1395 ValueList Left, Right;
1396 reorderAltShuffleOperands(VL, Left, Right);
1397 buildTree_rec(Left, Depth + 1);
1398 buildTree_rec(Right, Depth + 1);
1399 return;
1400 }
1401
1402 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1403 ValueList Operands;
1404 // Prepare the operand vector.
1405 for (unsigned j = 0; j < VL.size(); ++j)
1406 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1407
1408 buildTree_rec(Operands, Depth + 1);
1409 }
1410 return;
1411 }
1412 default:
1413 BS.cancelScheduling(VL);
1414 newTreeEntry(VL, false);
1415 DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
1416 return;
1417 }
1418 }
1419
getEntryCost(TreeEntry * E)1420 int BoUpSLP::getEntryCost(TreeEntry *E) {
1421 ArrayRef<Value*> VL = E->Scalars;
1422
1423 Type *ScalarTy = VL[0]->getType();
1424 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1425 ScalarTy = SI->getValueOperand()->getType();
1426 VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
1427
1428 if (E->NeedToGather) {
1429 if (allConstant(VL))
1430 return 0;
1431 if (isSplat(VL)) {
1432 return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
1433 }
1434 return getGatherCost(E->Scalars);
1435 }
1436 unsigned Opcode = getSameOpcode(VL);
1437 assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
1438 Instruction *VL0 = cast<Instruction>(VL[0]);
1439 switch (Opcode) {
1440 case Instruction::PHI: {
1441 return 0;
1442 }
1443 case Instruction::ExtractElement: {
1444 if (CanReuseExtract(VL)) {
1445 int DeadCost = 0;
1446 for (unsigned i = 0, e = VL.size(); i < e; ++i) {
1447 ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
1448 if (E->hasOneUse())
1449 // Take credit for instruction that will become dead.
1450 DeadCost +=
1451 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
1452 }
1453 return -DeadCost;
1454 }
1455 return getGatherCost(VecTy);
1456 }
1457 case Instruction::ZExt:
1458 case Instruction::SExt:
1459 case Instruction::FPToUI:
1460 case Instruction::FPToSI:
1461 case Instruction::FPExt:
1462 case Instruction::PtrToInt:
1463 case Instruction::IntToPtr:
1464 case Instruction::SIToFP:
1465 case Instruction::UIToFP:
1466 case Instruction::Trunc:
1467 case Instruction::FPTrunc:
1468 case Instruction::BitCast: {
1469 Type *SrcTy = VL0->getOperand(0)->getType();
1470
1471 // Calculate the cost of this instruction.
1472 int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
1473 VL0->getType(), SrcTy);
1474
1475 VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
1476 int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
1477 return VecCost - ScalarCost;
1478 }
1479 case Instruction::FCmp:
1480 case Instruction::ICmp:
1481 case Instruction::Select:
1482 case Instruction::Add:
1483 case Instruction::FAdd:
1484 case Instruction::Sub:
1485 case Instruction::FSub:
1486 case Instruction::Mul:
1487 case Instruction::FMul:
1488 case Instruction::UDiv:
1489 case Instruction::SDiv:
1490 case Instruction::FDiv:
1491 case Instruction::URem:
1492 case Instruction::SRem:
1493 case Instruction::FRem:
1494 case Instruction::Shl:
1495 case Instruction::LShr:
1496 case Instruction::AShr:
1497 case Instruction::And:
1498 case Instruction::Or:
1499 case Instruction::Xor: {
1500 // Calculate the cost of this instruction.
1501 int ScalarCost = 0;
1502 int VecCost = 0;
1503 if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
1504 Opcode == Instruction::Select) {
1505 VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
1506 ScalarCost = VecTy->getNumElements() *
1507 TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
1508 VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
1509 } else {
1510 // Certain instructions can be cheaper to vectorize if they have a
1511 // constant second vector operand.
1512 TargetTransformInfo::OperandValueKind Op1VK =
1513 TargetTransformInfo::OK_AnyValue;
1514 TargetTransformInfo::OperandValueKind Op2VK =
1515 TargetTransformInfo::OK_UniformConstantValue;
1516 TargetTransformInfo::OperandValueProperties Op1VP =
1517 TargetTransformInfo::OP_None;
1518 TargetTransformInfo::OperandValueProperties Op2VP =
1519 TargetTransformInfo::OP_None;
1520
1521 // If all operands are exactly the same ConstantInt then set the
1522 // operand kind to OK_UniformConstantValue.
1523 // If instead not all operands are constants, then set the operand kind
1524 // to OK_AnyValue. If all operands are constants but not the same,
1525 // then set the operand kind to OK_NonUniformConstantValue.
1526 ConstantInt *CInt = nullptr;
1527 for (unsigned i = 0; i < VL.size(); ++i) {
1528 const Instruction *I = cast<Instruction>(VL[i]);
1529 if (!isa<ConstantInt>(I->getOperand(1))) {
1530 Op2VK = TargetTransformInfo::OK_AnyValue;
1531 break;
1532 }
1533 if (i == 0) {
1534 CInt = cast<ConstantInt>(I->getOperand(1));
1535 continue;
1536 }
1537 if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
1538 CInt != cast<ConstantInt>(I->getOperand(1)))
1539 Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
1540 }
1541 // FIXME: Currently cost of model modification for division by
1542 // power of 2 is handled only for X86. Add support for other targets.
1543 if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
1544 CInt->getValue().isPowerOf2())
1545 Op2VP = TargetTransformInfo::OP_PowerOf2;
1546
1547 ScalarCost = VecTy->getNumElements() *
1548 TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK,
1549 Op1VP, Op2VP);
1550 VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
1551 Op1VP, Op2VP);
1552 }
1553 return VecCost - ScalarCost;
1554 }
1555 case Instruction::GetElementPtr: {
1556 TargetTransformInfo::OperandValueKind Op1VK =
1557 TargetTransformInfo::OK_AnyValue;
1558 TargetTransformInfo::OperandValueKind Op2VK =
1559 TargetTransformInfo::OK_UniformConstantValue;
1560
1561 int ScalarCost =
1562 VecTy->getNumElements() *
1563 TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
1564 int VecCost =
1565 TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
1566
1567 return VecCost - ScalarCost;
1568 }
1569 case Instruction::Load: {
1570 // Cost of wide load - cost of scalar loads.
1571 int ScalarLdCost = VecTy->getNumElements() *
1572 TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
1573 int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
1574 return VecLdCost - ScalarLdCost;
1575 }
1576 case Instruction::Store: {
1577 // We know that we can merge the stores. Calculate the cost.
1578 int ScalarStCost = VecTy->getNumElements() *
1579 TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
1580 int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
1581 return VecStCost - ScalarStCost;
1582 }
1583 case Instruction::Call: {
1584 CallInst *CI = cast<CallInst>(VL0);
1585 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
1586
1587 // Calculate the cost of the scalar and vector calls.
1588 SmallVector<Type*, 4> ScalarTys, VecTys;
1589 for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) {
1590 ScalarTys.push_back(CI->getArgOperand(op)->getType());
1591 VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
1592 VecTy->getNumElements()));
1593 }
1594
1595 int ScalarCallCost = VecTy->getNumElements() *
1596 TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys);
1597
1598 int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys);
1599
1600 DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
1601 << " (" << VecCallCost << "-" << ScalarCallCost << ")"
1602 << " for " << *CI << "\n");
1603
1604 return VecCallCost - ScalarCallCost;
1605 }
1606 case Instruction::ShuffleVector: {
1607 TargetTransformInfo::OperandValueKind Op1VK =
1608 TargetTransformInfo::OK_AnyValue;
1609 TargetTransformInfo::OperandValueKind Op2VK =
1610 TargetTransformInfo::OK_AnyValue;
1611 int ScalarCost = 0;
1612 int VecCost = 0;
1613 for (unsigned i = 0; i < VL.size(); ++i) {
1614 Instruction *I = cast<Instruction>(VL[i]);
1615 if (!I)
1616 break;
1617 ScalarCost +=
1618 TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
1619 }
1620 // VecCost is equal to sum of the cost of creating 2 vectors
1621 // and the cost of creating shuffle.
1622 Instruction *I0 = cast<Instruction>(VL[0]);
1623 VecCost =
1624 TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
1625 Instruction *I1 = cast<Instruction>(VL[1]);
1626 VecCost +=
1627 TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
1628 VecCost +=
1629 TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
1630 return VecCost - ScalarCost;
1631 }
1632 default:
1633 llvm_unreachable("Unknown instruction");
1634 }
1635 }
1636
isFullyVectorizableTinyTree()1637 bool BoUpSLP::isFullyVectorizableTinyTree() {
1638 DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
1639 VectorizableTree.size() << " is fully vectorizable .\n");
1640
1641 // We only handle trees of height 2.
1642 if (VectorizableTree.size() != 2)
1643 return false;
1644
1645 // Handle splat stores.
1646 if (!VectorizableTree[0].NeedToGather && isSplat(VectorizableTree[1].Scalars))
1647 return true;
1648
1649 // Gathering cost would be too much for tiny trees.
1650 if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
1651 return false;
1652
1653 return true;
1654 }
1655
getSpillCost()1656 int BoUpSLP::getSpillCost() {
1657 // Walk from the bottom of the tree to the top, tracking which values are
1658 // live. When we see a call instruction that is not part of our tree,
1659 // query TTI to see if there is a cost to keeping values live over it
1660 // (for example, if spills and fills are required).
1661 unsigned BundleWidth = VectorizableTree.front().Scalars.size();
1662 int Cost = 0;
1663
1664 SmallPtrSet<Instruction*, 4> LiveValues;
1665 Instruction *PrevInst = nullptr;
1666
1667 for (unsigned N = 0; N < VectorizableTree.size(); ++N) {
1668 Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]);
1669 if (!Inst)
1670 continue;
1671
1672 if (!PrevInst) {
1673 PrevInst = Inst;
1674 continue;
1675 }
1676
1677 DEBUG(
1678 dbgs() << "SLP: #LV: " << LiveValues.size();
1679 for (auto *X : LiveValues)
1680 dbgs() << " " << X->getName();
1681 dbgs() << ", Looking at ";
1682 Inst->dump();
1683 );
1684
1685 // Update LiveValues.
1686 LiveValues.erase(PrevInst);
1687 for (auto &J : PrevInst->operands()) {
1688 if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
1689 LiveValues.insert(cast<Instruction>(&*J));
1690 }
1691
1692 // Now find the sequence of instructions between PrevInst and Inst.
1693 BasicBlock::reverse_iterator InstIt(Inst), PrevInstIt(PrevInst);
1694 --PrevInstIt;
1695 while (InstIt != PrevInstIt) {
1696 if (PrevInstIt == PrevInst->getParent()->rend()) {
1697 PrevInstIt = Inst->getParent()->rbegin();
1698 continue;
1699 }
1700
1701 if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
1702 SmallVector<Type*, 4> V;
1703 for (auto *II : LiveValues)
1704 V.push_back(VectorType::get(II->getType(), BundleWidth));
1705 Cost += TTI->getCostOfKeepingLiveOverCall(V);
1706 }
1707
1708 ++PrevInstIt;
1709 }
1710
1711 PrevInst = Inst;
1712 }
1713
1714 DEBUG(dbgs() << "SLP: SpillCost=" << Cost << "\n");
1715 return Cost;
1716 }
1717
getTreeCost()1718 int BoUpSLP::getTreeCost() {
1719 int Cost = 0;
1720 DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
1721 VectorizableTree.size() << ".\n");
1722
1723 // We only vectorize tiny trees if it is fully vectorizable.
1724 if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) {
1725 if (VectorizableTree.empty()) {
1726 assert(!ExternalUses.size() && "We should not have any external users");
1727 }
1728 return INT_MAX;
1729 }
1730
1731 unsigned BundleWidth = VectorizableTree[0].Scalars.size();
1732
1733 for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) {
1734 int C = getEntryCost(&VectorizableTree[i]);
1735 DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
1736 << *VectorizableTree[i].Scalars[0] << " .\n");
1737 Cost += C;
1738 }
1739
1740 SmallSet<Value *, 16> ExtractCostCalculated;
1741 int ExtractCost = 0;
1742 for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end();
1743 I != E; ++I) {
1744 // We only add extract cost once for the same scalar.
1745 if (!ExtractCostCalculated.insert(I->Scalar).second)
1746 continue;
1747
1748 // Uses by ephemeral values are free (because the ephemeral value will be
1749 // removed prior to code generation, and so the extraction will be
1750 // removed as well).
1751 if (EphValues.count(I->User))
1752 continue;
1753
1754 VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth);
1755 ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1756 I->Lane);
1757 }
1758
1759 Cost += getSpillCost();
1760
1761 DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n");
1762 return Cost + ExtractCost;
1763 }
1764
getGatherCost(Type * Ty)1765 int BoUpSLP::getGatherCost(Type *Ty) {
1766 int Cost = 0;
1767 for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
1768 Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
1769 return Cost;
1770 }
1771
getGatherCost(ArrayRef<Value * > VL)1772 int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
1773 // Find the type of the operands in VL.
1774 Type *ScalarTy = VL[0]->getType();
1775 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1776 ScalarTy = SI->getValueOperand()->getType();
1777 VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
1778 // Find the cost of inserting/extracting values from the vector.
1779 return getGatherCost(VecTy);
1780 }
1781
getPointerOperand(Value * I)1782 Value *BoUpSLP::getPointerOperand(Value *I) {
1783 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1784 return LI->getPointerOperand();
1785 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1786 return SI->getPointerOperand();
1787 return nullptr;
1788 }
1789
getAddressSpaceOperand(Value * I)1790 unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
1791 if (LoadInst *L = dyn_cast<LoadInst>(I))
1792 return L->getPointerAddressSpace();
1793 if (StoreInst *S = dyn_cast<StoreInst>(I))
1794 return S->getPointerAddressSpace();
1795 return -1;
1796 }
1797
isConsecutiveAccess(Value * A,Value * B,const DataLayout & DL)1798 bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL) {
1799 Value *PtrA = getPointerOperand(A);
1800 Value *PtrB = getPointerOperand(B);
1801 unsigned ASA = getAddressSpaceOperand(A);
1802 unsigned ASB = getAddressSpaceOperand(B);
1803
1804 // Check that the address spaces match and that the pointers are valid.
1805 if (!PtrA || !PtrB || (ASA != ASB))
1806 return false;
1807
1808 // Make sure that A and B are different pointers of the same type.
1809 if (PtrA == PtrB || PtrA->getType() != PtrB->getType())
1810 return false;
1811
1812 unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
1813 Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
1814 APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty));
1815
1816 APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
1817 PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
1818 PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
1819
1820 APInt OffsetDelta = OffsetB - OffsetA;
1821
1822 // Check if they are based on the same pointer. That makes the offsets
1823 // sufficient.
1824 if (PtrA == PtrB)
1825 return OffsetDelta == Size;
1826
1827 // Compute the necessary base pointer delta to have the necessary final delta
1828 // equal to the size.
1829 APInt BaseDelta = Size - OffsetDelta;
1830
1831 // Otherwise compute the distance with SCEV between the base pointers.
1832 const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
1833 const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
1834 const SCEV *C = SE->getConstant(BaseDelta);
1835 const SCEV *X = SE->getAddExpr(PtrSCEVA, C);
1836 return X == PtrSCEVB;
1837 }
1838
1839 // Reorder commutative operations in alternate shuffle if the resulting vectors
1840 // are consecutive loads. This would allow us to vectorize the tree.
1841 // If we have something like-
1842 // load a[0] - load b[0]
1843 // load b[1] + load a[1]
1844 // load a[2] - load b[2]
1845 // load a[3] + load b[3]
1846 // Reordering the second load b[1] load a[1] would allow us to vectorize this
1847 // code.
reorderAltShuffleOperands(ArrayRef<Value * > VL,SmallVectorImpl<Value * > & Left,SmallVectorImpl<Value * > & Right)1848 void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
1849 SmallVectorImpl<Value *> &Left,
1850 SmallVectorImpl<Value *> &Right) {
1851 const DataLayout &DL = F->getParent()->getDataLayout();
1852
1853 // Push left and right operands of binary operation into Left and Right
1854 for (unsigned i = 0, e = VL.size(); i < e; ++i) {
1855 Left.push_back(cast<Instruction>(VL[i])->getOperand(0));
1856 Right.push_back(cast<Instruction>(VL[i])->getOperand(1));
1857 }
1858
1859 // Reorder if we have a commutative operation and consecutive access
1860 // are on either side of the alternate instructions.
1861 for (unsigned j = 0; j < VL.size() - 1; ++j) {
1862 if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
1863 if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
1864 Instruction *VL1 = cast<Instruction>(VL[j]);
1865 Instruction *VL2 = cast<Instruction>(VL[j + 1]);
1866 if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) {
1867 std::swap(Left[j], Right[j]);
1868 continue;
1869 } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) {
1870 std::swap(Left[j + 1], Right[j + 1]);
1871 continue;
1872 }
1873 // else unchanged
1874 }
1875 }
1876 if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
1877 if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
1878 Instruction *VL1 = cast<Instruction>(VL[j]);
1879 Instruction *VL2 = cast<Instruction>(VL[j + 1]);
1880 if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) {
1881 std::swap(Left[j], Right[j]);
1882 continue;
1883 } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) {
1884 std::swap(Left[j + 1], Right[j + 1]);
1885 continue;
1886 }
1887 // else unchanged
1888 }
1889 }
1890 }
1891 }
1892
/// Chooses, per lane, which operand of each (two-operand) instruction in
/// \p VL goes into \p Left and which into \p Right.
///
/// The function works in three stages:
///   1. Per lane, order the two operands by opcode while trying to keep
///      values that repeat the previous lane on the same side (broadcasts).
///   2. If neither side ended up as a splat and one of the original sides
///      already had a uniform opcode, fall back to the original order.
///   3. Swap individual lanes where that places consecutive loads on the
///      same side.
///
/// \param VL    the bundle; every element is cast<> to Instruction and its
///              operands 0 and 1 are read.
/// \param Left  output: the chosen left operand of every lane (appended).
/// \param Right output: the chosen right operand of every lane (appended).
/// NOTE(review): Left[i - 1] / Right[i - 1] are indexed with the lane number,
/// which presumably requires both vectors to be empty on entry -- confirm
/// against callers.
void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
                                             SmallVectorImpl<Value *> &Left,
                                             SmallVectorImpl<Value *> &Right) {

  // The operands in their original order, kept for the fallback in stage 2.
  SmallVector<Value *, 16> OrigLeft, OrigRight;

  bool AllSameOpcodeLeft = true;
  bool AllSameOpcodeRight = true;
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
    Instruction *I = cast<Instruction>(VL[i]);
    Value *VLeft = I->getOperand(0);
    Value *VRight = I->getOperand(1);

    OrigLeft.push_back(VLeft);
    OrigRight.push_back(VRight);

    // Null when the operand is not an instruction (e.g. a constant or an
    // argument).
    Instruction *ILeft = dyn_cast<Instruction>(VLeft);
    Instruction *IRight = dyn_cast<Instruction>(VRight);

    // Check whether all operands on one side have the same opcode. In this case
    // we want to preserve the original order and not make things worse by
    // reordering.
    // Note that a side is only re-examined on lanes where its current operand
    // is an instruction; it is compared with the previous lane's original
    // operand on that side.
    if (i && AllSameOpcodeLeft && ILeft) {
      if (Instruction *PLeft = dyn_cast<Instruction>(OrigLeft[i - 1])) {
        if (PLeft->getOpcode() != ILeft->getOpcode())
          AllSameOpcodeLeft = false;
      } else
        AllSameOpcodeLeft = false;
    }
    if (i && AllSameOpcodeRight && IRight) {
      if (Instruction *PRight = dyn_cast<Instruction>(OrigRight[i - 1])) {
        if (PRight->getOpcode() != IRight->getOpcode())
          AllSameOpcodeRight = false;
      } else
        AllSameOpcodeRight = false;
    }

    // Sort two opcodes. In the code below we try to preserve the ability to use
    // broadcast of values instead of individual inserts.
    // vl1 = load
    // vl2 = phi
    // vr1 = load
    // vr2 = vr1
    //     = vl1 x vr1
    //     = vl2 x vr2
    // If we just sorted according to opcode we would leave the first line
    // intact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
    //     = vl1 x vr1
    //     = vr2 x vl2
    // Because vr2 and vr1 are from the same load we lose the opportunity of a
    // broadcast for the packed right side in the backend: we have [vr1, vl2]
    // instead of [vr1, vr2=vr1].
    if (ILeft && IRight) {
      if (!i && ILeft->getOpcode() > IRight->getOpcode()) {
        // First lane: plain opcode order, smaller opcode on the left.
        Left.push_back(IRight);
        Right.push_back(ILeft);
      } else if (i && ILeft->getOpcode() > IRight->getOpcode() &&
                 Right[i - 1] != IRight) {
        // Try not to destroy a broadcast for no apparent benefit.
        Left.push_back(IRight);
        Right.push_back(ILeft);
      } else if (i && ILeft->getOpcode() == IRight->getOpcode() &&
                 Right[i - 1] == ILeft) {
        // Try to preserve broadcasts.
        Left.push_back(IRight);
        Right.push_back(ILeft);
      } else if (i && ILeft->getOpcode() == IRight->getOpcode() &&
                 Left[i - 1] == IRight) {
        // Try to preserve broadcasts.
        Left.push_back(IRight);
        Right.push_back(ILeft);
      } else {
        // Keep the original order.
        Left.push_back(ILeft);
        Right.push_back(IRight);
      }
      continue;
    }
    // One opcode, put the instruction on the right.
    if (ILeft) {
      Left.push_back(VRight);
      Right.push_back(ILeft);
      continue;
    }
    // The left operand is not an instruction: keep the original order.
    Left.push_back(VLeft);
    Right.push_back(VRight);
  }

  bool LeftBroadcast = isSplat(Left);
  bool RightBroadcast = isSplat(Right);

  // If operands end up being broadcast return this operand order.
  if (LeftBroadcast || RightBroadcast)
    return;

  // Don't reorder if the operands were good to begin with.
  if (AllSameOpcodeRight || AllSameOpcodeLeft) {
    Left = OrigLeft;
    Right = OrigRight;
  }

  const DataLayout &DL = F->getParent()->getDataLayout();

  // Finally check if we can get longer vectorizable chain by reordering
  // without breaking the good operand order detected above.
  // E.g. If we have something like-
  // load a[0]  load b[0]
  // load b[1]  load a[1]
  // load a[2]  load b[2]
  // load a[3]  load b[3]
  // Reordering the second load b[1] load a[1] would allow us to vectorize
  // this code and we still retain AllSameOpcode property.
  // FIXME: This load reordering might break AllSameOpcode in some rare cases
  // such as-
  // add a[0],c[0]  load b[0]
  // add a[1],c[2]  load b[1]
  // b[2]           load b[2]
  // add a[3],c[3]  load b[3]
  for (unsigned j = 0; j < VL.size() - 1; ++j) {
    // A load on the left followed by its consecutive load on the right of the
    // next lane: swap the next lane.
    if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
      if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
        if (isConsecutiveAccess(L, L1, DL)) {
          std::swap(Left[j + 1], Right[j + 1]);
          continue;
        }
      }
    }
    // Mirror case: a load on the right followed by its consecutive load on
    // the left of the next lane.
    if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
      if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
        if (isConsecutiveAccess(L, L1, DL)) {
          std::swap(Left[j + 1], Right[j + 1]);
          continue;
        }
      }
    }
    // else unchanged
  }
}
2030
setInsertPointAfterBundle(ArrayRef<Value * > VL)2031 void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
2032 Instruction *VL0 = cast<Instruction>(VL[0]);
2033 BasicBlock::iterator NextInst = VL0;
2034 ++NextInst;
2035 Builder.SetInsertPoint(VL0->getParent(), NextInst);
2036 Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
2037 }
2038
Gather(ArrayRef<Value * > VL,VectorType * Ty)2039 Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
2040 Value *Vec = UndefValue::get(Ty);
2041 // Generate the 'InsertElement' instruction.
2042 for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
2043 Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
2044 if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
2045 GatherSeq.insert(Insrt);
2046 CSEBlocks.insert(Insrt->getParent());
2047
2048 // Add to our 'need-to-extract' list.
2049 if (ScalarToTreeEntry.count(VL[i])) {
2050 int Idx = ScalarToTreeEntry[VL[i]];
2051 TreeEntry *E = &VectorizableTree[Idx];
2052 // Find which lane we need to extract.
2053 int FoundLane = -1;
2054 for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
2055 // Is this the lane of the scalar that we are looking for ?
2056 if (E->Scalars[Lane] == VL[i]) {
2057 FoundLane = Lane;
2058 break;
2059 }
2060 }
2061 assert(FoundLane >= 0 && "Could not find the correct lane");
2062 ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
2063 }
2064 }
2065 }
2066
2067 return Vec;
2068 }
2069
alreadyVectorized(ArrayRef<Value * > VL) const2070 Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
2071 SmallDenseMap<Value*, int>::const_iterator Entry
2072 = ScalarToTreeEntry.find(VL[0]);
2073 if (Entry != ScalarToTreeEntry.end()) {
2074 int Idx = Entry->second;
2075 const TreeEntry *En = &VectorizableTree[Idx];
2076 if (En->isSame(VL) && En->VectorizedValue)
2077 return En->VectorizedValue;
2078 }
2079 return nullptr;
2080 }
2081
vectorizeTree(ArrayRef<Value * > VL)2082 Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
2083 if (ScalarToTreeEntry.count(VL[0])) {
2084 int Idx = ScalarToTreeEntry[VL[0]];
2085 TreeEntry *E = &VectorizableTree[Idx];
2086 if (E->isSame(VL))
2087 return vectorizeTree(E);
2088 }
2089
2090 Type *ScalarTy = VL[0]->getType();
2091 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
2092 ScalarTy = SI->getValueOperand()->getType();
2093 VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
2094
2095 return Gather(VL, VecTy);
2096 }
2097
vectorizeTree(TreeEntry * E)2098 Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
2099 IRBuilder<>::InsertPointGuard Guard(Builder);
2100
2101 if (E->VectorizedValue) {
2102 DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
2103 return E->VectorizedValue;
2104 }
2105
2106 Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
2107 Type *ScalarTy = VL0->getType();
2108 if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
2109 ScalarTy = SI->getValueOperand()->getType();
2110 VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
2111
2112 if (E->NeedToGather) {
2113 setInsertPointAfterBundle(E->Scalars);
2114 return Gather(E->Scalars, VecTy);
2115 }
2116
2117 const DataLayout &DL = F->getParent()->getDataLayout();
2118 unsigned Opcode = getSameOpcode(E->Scalars);
2119
2120 switch (Opcode) {
2121 case Instruction::PHI: {
2122 PHINode *PH = dyn_cast<PHINode>(VL0);
2123 Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
2124 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
2125 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
2126 E->VectorizedValue = NewPhi;
2127
2128 // PHINodes may have multiple entries from the same block. We want to
2129 // visit every block once.
2130 SmallSet<BasicBlock*, 4> VisitedBBs;
2131
2132 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
2133 ValueList Operands;
2134 BasicBlock *IBB = PH->getIncomingBlock(i);
2135
2136 if (!VisitedBBs.insert(IBB).second) {
2137 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
2138 continue;
2139 }
2140
2141 // Prepare the operand vector.
2142 for (Value *V : E->Scalars)
2143 Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB));
2144
2145 Builder.SetInsertPoint(IBB->getTerminator());
2146 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
2147 Value *Vec = vectorizeTree(Operands);
2148 NewPhi->addIncoming(Vec, IBB);
2149 }
2150
2151 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
2152 "Invalid number of incoming values");
2153 return NewPhi;
2154 }
2155
2156 case Instruction::ExtractElement: {
2157 if (CanReuseExtract(E->Scalars)) {
2158 Value *V = VL0->getOperand(0);
2159 E->VectorizedValue = V;
2160 return V;
2161 }
2162 return Gather(E->Scalars, VecTy);
2163 }
2164 case Instruction::ZExt:
2165 case Instruction::SExt:
2166 case Instruction::FPToUI:
2167 case Instruction::FPToSI:
2168 case Instruction::FPExt:
2169 case Instruction::PtrToInt:
2170 case Instruction::IntToPtr:
2171 case Instruction::SIToFP:
2172 case Instruction::UIToFP:
2173 case Instruction::Trunc:
2174 case Instruction::FPTrunc:
2175 case Instruction::BitCast: {
2176 ValueList INVL;
2177 for (Value *V : E->Scalars)
2178 INVL.push_back(cast<Instruction>(V)->getOperand(0));
2179
2180 setInsertPointAfterBundle(E->Scalars);
2181
2182 Value *InVec = vectorizeTree(INVL);
2183
2184 if (Value *V = alreadyVectorized(E->Scalars))
2185 return V;
2186
2187 CastInst *CI = dyn_cast<CastInst>(VL0);
2188 Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
2189 E->VectorizedValue = V;
2190 ++NumVectorInstructions;
2191 return V;
2192 }
2193 case Instruction::FCmp:
2194 case Instruction::ICmp: {
2195 ValueList LHSV, RHSV;
2196 for (Value *V : E->Scalars) {
2197 LHSV.push_back(cast<Instruction>(V)->getOperand(0));
2198 RHSV.push_back(cast<Instruction>(V)->getOperand(1));
2199 }
2200
2201 setInsertPointAfterBundle(E->Scalars);
2202
2203 Value *L = vectorizeTree(LHSV);
2204 Value *R = vectorizeTree(RHSV);
2205
2206 if (Value *V = alreadyVectorized(E->Scalars))
2207 return V;
2208
2209 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
2210 Value *V;
2211 if (Opcode == Instruction::FCmp)
2212 V = Builder.CreateFCmp(P0, L, R);
2213 else
2214 V = Builder.CreateICmp(P0, L, R);
2215
2216 E->VectorizedValue = V;
2217 ++NumVectorInstructions;
2218 return V;
2219 }
2220 case Instruction::Select: {
2221 ValueList TrueVec, FalseVec, CondVec;
2222 for (Value *V : E->Scalars) {
2223 CondVec.push_back(cast<Instruction>(V)->getOperand(0));
2224 TrueVec.push_back(cast<Instruction>(V)->getOperand(1));
2225 FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
2226 }
2227
2228 setInsertPointAfterBundle(E->Scalars);
2229
2230 Value *Cond = vectorizeTree(CondVec);
2231 Value *True = vectorizeTree(TrueVec);
2232 Value *False = vectorizeTree(FalseVec);
2233
2234 if (Value *V = alreadyVectorized(E->Scalars))
2235 return V;
2236
2237 Value *V = Builder.CreateSelect(Cond, True, False);
2238 E->VectorizedValue = V;
2239 ++NumVectorInstructions;
2240 return V;
2241 }
2242 case Instruction::Add:
2243 case Instruction::FAdd:
2244 case Instruction::Sub:
2245 case Instruction::FSub:
2246 case Instruction::Mul:
2247 case Instruction::FMul:
2248 case Instruction::UDiv:
2249 case Instruction::SDiv:
2250 case Instruction::FDiv:
2251 case Instruction::URem:
2252 case Instruction::SRem:
2253 case Instruction::FRem:
2254 case Instruction::Shl:
2255 case Instruction::LShr:
2256 case Instruction::AShr:
2257 case Instruction::And:
2258 case Instruction::Or:
2259 case Instruction::Xor: {
2260 ValueList LHSVL, RHSVL;
2261 if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
2262 reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
2263 else
2264 for (Value *V : E->Scalars) {
2265 LHSVL.push_back(cast<Instruction>(V)->getOperand(0));
2266 RHSVL.push_back(cast<Instruction>(V)->getOperand(1));
2267 }
2268
2269 setInsertPointAfterBundle(E->Scalars);
2270
2271 Value *LHS = vectorizeTree(LHSVL);
2272 Value *RHS = vectorizeTree(RHSVL);
2273
2274 if (LHS == RHS && isa<Instruction>(LHS)) {
2275 assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order");
2276 }
2277
2278 if (Value *V = alreadyVectorized(E->Scalars))
2279 return V;
2280
2281 BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
2282 Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
2283 E->VectorizedValue = V;
2284 propagateIRFlags(E->VectorizedValue, E->Scalars);
2285 ++NumVectorInstructions;
2286
2287 if (Instruction *I = dyn_cast<Instruction>(V))
2288 return propagateMetadata(I, E->Scalars);
2289
2290 return V;
2291 }
2292 case Instruction::Load: {
2293 // Loads are inserted at the head of the tree because we don't want to
2294 // sink them all the way down past store instructions.
2295 setInsertPointAfterBundle(E->Scalars);
2296
2297 LoadInst *LI = cast<LoadInst>(VL0);
2298 Type *ScalarLoadTy = LI->getType();
2299 unsigned AS = LI->getPointerAddressSpace();
2300
2301 Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
2302 VecTy->getPointerTo(AS));
2303
2304 // The pointer operand uses an in-tree scalar so we add the new BitCast to
2305 // ExternalUses list to make sure that an extract will be generated in the
2306 // future.
2307 if (ScalarToTreeEntry.count(LI->getPointerOperand()))
2308 ExternalUses.push_back(
2309 ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0));
2310
2311 unsigned Alignment = LI->getAlignment();
2312 LI = Builder.CreateLoad(VecPtr);
2313 if (!Alignment) {
2314 Alignment = DL.getABITypeAlignment(ScalarLoadTy);
2315 }
2316 LI->setAlignment(Alignment);
2317 E->VectorizedValue = LI;
2318 ++NumVectorInstructions;
2319 return propagateMetadata(LI, E->Scalars);
2320 }
2321 case Instruction::Store: {
2322 StoreInst *SI = cast<StoreInst>(VL0);
2323 unsigned Alignment = SI->getAlignment();
2324 unsigned AS = SI->getPointerAddressSpace();
2325
2326 ValueList ValueOp;
2327 for (Value *V : E->Scalars)
2328 ValueOp.push_back(cast<StoreInst>(V)->getValueOperand());
2329
2330 setInsertPointAfterBundle(E->Scalars);
2331
2332 Value *VecValue = vectorizeTree(ValueOp);
2333 Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
2334 VecTy->getPointerTo(AS));
2335 StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
2336
2337 // The pointer operand uses an in-tree scalar so we add the new BitCast to
2338 // ExternalUses list to make sure that an extract will be generated in the
2339 // future.
2340 if (ScalarToTreeEntry.count(SI->getPointerOperand()))
2341 ExternalUses.push_back(
2342 ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
2343
2344 if (!Alignment) {
2345 Alignment = DL.getABITypeAlignment(SI->getValueOperand()->getType());
2346 }
2347 S->setAlignment(Alignment);
2348 E->VectorizedValue = S;
2349 ++NumVectorInstructions;
2350 return propagateMetadata(S, E->Scalars);
2351 }
2352 case Instruction::GetElementPtr: {
2353 setInsertPointAfterBundle(E->Scalars);
2354
2355 ValueList Op0VL;
2356 for (Value *V : E->Scalars)
2357 Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));
2358
2359 Value *Op0 = vectorizeTree(Op0VL);
2360
2361 std::vector<Value *> OpVecs;
2362 for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
2363 ++j) {
2364 ValueList OpVL;
2365 for (Value *V : E->Scalars)
2366 OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));
2367
2368 Value *OpVec = vectorizeTree(OpVL);
2369 OpVecs.push_back(OpVec);
2370 }
2371
2372 Value *V = Builder.CreateGEP(
2373 cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
2374 E->VectorizedValue = V;
2375 ++NumVectorInstructions;
2376
2377 if (Instruction *I = dyn_cast<Instruction>(V))
2378 return propagateMetadata(I, E->Scalars);
2379
2380 return V;
2381 }
2382 case Instruction::Call: {
2383 CallInst *CI = cast<CallInst>(VL0);
2384 setInsertPointAfterBundle(E->Scalars);
2385 Function *FI;
2386 Intrinsic::ID IID = Intrinsic::not_intrinsic;
2387 Value *ScalarArg = nullptr;
2388 if (CI && (FI = CI->getCalledFunction())) {
2389 IID = FI->getIntrinsicID();
2390 }
2391 std::vector<Value *> OpVecs;
2392 for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
2393 ValueList OpVL;
2394 // ctlz,cttz and powi are special intrinsics whose second argument is
2395 // a scalar. This argument should not be vectorized.
2396 if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
2397 CallInst *CEI = cast<CallInst>(E->Scalars[0]);
2398 ScalarArg = CEI->getArgOperand(j);
2399 OpVecs.push_back(CEI->getArgOperand(j));
2400 continue;
2401 }
2402 for (Value *V : E->Scalars) {
2403 CallInst *CEI = cast<CallInst>(V);
2404 OpVL.push_back(CEI->getArgOperand(j));
2405 }
2406
2407 Value *OpVec = vectorizeTree(OpVL);
2408 DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
2409 OpVecs.push_back(OpVec);
2410 }
2411
2412 Module *M = F->getParent();
2413 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
2414 Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
2415 Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
2416 Value *V = Builder.CreateCall(CF, OpVecs);
2417
2418 // The scalar argument uses an in-tree scalar so we add the new vectorized
2419 // call to ExternalUses list to make sure that an extract will be
2420 // generated in the future.
2421 if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
2422 ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
2423
2424 E->VectorizedValue = V;
2425 ++NumVectorInstructions;
2426 return V;
2427 }
2428 case Instruction::ShuffleVector: {
2429 ValueList LHSVL, RHSVL;
2430 assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand");
2431 reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL);
2432 setInsertPointAfterBundle(E->Scalars);
2433
2434 Value *LHS = vectorizeTree(LHSVL);
2435 Value *RHS = vectorizeTree(RHSVL);
2436
2437 if (Value *V = alreadyVectorized(E->Scalars))
2438 return V;
2439
2440 // Create a vector of LHS op1 RHS
2441 BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
2442 Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
2443
2444 // Create a vector of LHS op2 RHS
2445 Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
2446 BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
2447 Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
2448
2449 // Create shuffle to take alternate operations from the vector.
2450 // Also, gather up odd and even scalar ops to propagate IR flags to
2451 // each vector operation.
2452 ValueList OddScalars, EvenScalars;
2453 unsigned e = E->Scalars.size();
2454 SmallVector<Constant *, 8> Mask(e);
2455 for (unsigned i = 0; i < e; ++i) {
2456 if (i & 1) {
2457 Mask[i] = Builder.getInt32(e + i);
2458 OddScalars.push_back(E->Scalars[i]);
2459 } else {
2460 Mask[i] = Builder.getInt32(i);
2461 EvenScalars.push_back(E->Scalars[i]);
2462 }
2463 }
2464
2465 Value *ShuffleMask = ConstantVector::get(Mask);
2466 propagateIRFlags(V0, EvenScalars);
2467 propagateIRFlags(V1, OddScalars);
2468
2469 Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
2470 E->VectorizedValue = V;
2471 ++NumVectorInstructions;
2472 if (Instruction *I = dyn_cast<Instruction>(V))
2473 return propagateMetadata(I, E->Scalars);
2474
2475 return V;
2476 }
2477 default:
2478 llvm_unreachable("unknown inst");
2479 }
2480 return nullptr;
2481 }
2482
vectorizeTree()2483 Value *BoUpSLP::vectorizeTree() {
2484
2485 // All blocks must be scheduled before any instructions are inserted.
2486 for (auto &BSIter : BlocksSchedules) {
2487 scheduleBlock(BSIter.second.get());
2488 }
2489
2490 Builder.SetInsertPoint(F->getEntryBlock().begin());
2491 vectorizeTree(&VectorizableTree[0]);
2492
2493 DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
2494
2495 // Extract all of the elements with the external uses.
2496 for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end();
2497 it != e; ++it) {
2498 Value *Scalar = it->Scalar;
2499 llvm::User *User = it->User;
2500
2501 // Skip users that we already RAUW. This happens when one instruction
2502 // has multiple uses of the same value.
2503 if (std::find(Scalar->user_begin(), Scalar->user_end(), User) ==
2504 Scalar->user_end())
2505 continue;
2506 assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
2507
2508 int Idx = ScalarToTreeEntry[Scalar];
2509 TreeEntry *E = &VectorizableTree[Idx];
2510 assert(!E->NeedToGather && "Extracting from a gather list");
2511
2512 Value *Vec = E->VectorizedValue;
2513 assert(Vec && "Can't find vectorizable value");
2514
2515 Value *Lane = Builder.getInt32(it->Lane);
2516 // Generate extracts for out-of-tree users.
2517 // Find the insertion point for the extractelement lane.
2518 if (isa<Instruction>(Vec)){
2519 if (PHINode *PH = dyn_cast<PHINode>(User)) {
2520 for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
2521 if (PH->getIncomingValue(i) == Scalar) {
2522 Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
2523 Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2524 CSEBlocks.insert(PH->getIncomingBlock(i));
2525 PH->setOperand(i, Ex);
2526 }
2527 }
2528 } else {
2529 Builder.SetInsertPoint(cast<Instruction>(User));
2530 Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2531 CSEBlocks.insert(cast<Instruction>(User)->getParent());
2532 User->replaceUsesOfWith(Scalar, Ex);
2533 }
2534 } else {
2535 Builder.SetInsertPoint(F->getEntryBlock().begin());
2536 Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2537 CSEBlocks.insert(&F->getEntryBlock());
2538 User->replaceUsesOfWith(Scalar, Ex);
2539 }
2540
2541 DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
2542 }
2543
2544 // For each vectorized value:
2545 for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
2546 TreeEntry *Entry = &VectorizableTree[EIdx];
2547
2548 // For each lane:
2549 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
2550 Value *Scalar = Entry->Scalars[Lane];
2551 // No need to handle users of gathered values.
2552 if (Entry->NeedToGather)
2553 continue;
2554
2555 assert(Entry->VectorizedValue && "Can't find vectorizable value");
2556
2557 Type *Ty = Scalar->getType();
2558 if (!Ty->isVoidTy()) {
2559 #ifndef NDEBUG
2560 for (User *U : Scalar->users()) {
2561 DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
2562
2563 assert((ScalarToTreeEntry.count(U) ||
2564 // It is legal to replace users in the ignorelist by undef.
2565 (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) !=
2566 UserIgnoreList.end())) &&
2567 "Replacing out-of-tree value with undef");
2568 }
2569 #endif
2570 Value *Undef = UndefValue::get(Ty);
2571 Scalar->replaceAllUsesWith(Undef);
2572 }
2573 DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
2574 eraseInstruction(cast<Instruction>(Scalar));
2575 }
2576 }
2577
2578 Builder.ClearInsertionPoint();
2579
2580 return VectorizableTree[0].VectorizedValue;
2581 }
2582
optimizeGatherSequence()2583 void BoUpSLP::optimizeGatherSequence() {
2584 DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
2585 << " gather sequences instructions.\n");
2586 // LICM InsertElementInst sequences.
2587 for (SetVector<Instruction *>::iterator it = GatherSeq.begin(),
2588 e = GatherSeq.end(); it != e; ++it) {
2589 InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it);
2590
2591 if (!Insert)
2592 continue;
2593
2594 // Check if this block is inside a loop.
2595 Loop *L = LI->getLoopFor(Insert->getParent());
2596 if (!L)
2597 continue;
2598
2599 // Check if it has a preheader.
2600 BasicBlock *PreHeader = L->getLoopPreheader();
2601 if (!PreHeader)
2602 continue;
2603
2604 // If the vector or the element that we insert into it are
2605 // instructions that are defined in this basic block then we can't
2606 // hoist this instruction.
2607 Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
2608 Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
2609 if (CurrVec && L->contains(CurrVec))
2610 continue;
2611 if (NewElem && L->contains(NewElem))
2612 continue;
2613
2614 // We can hoist this instruction. Move it to the pre-header.
2615 Insert->moveBefore(PreHeader->getTerminator());
2616 }
2617
2618 // Make a list of all reachable blocks in our CSE queue.
2619 SmallVector<const DomTreeNode *, 8> CSEWorkList;
2620 CSEWorkList.reserve(CSEBlocks.size());
2621 for (BasicBlock *BB : CSEBlocks)
2622 if (DomTreeNode *N = DT->getNode(BB)) {
2623 assert(DT->isReachableFromEntry(N));
2624 CSEWorkList.push_back(N);
2625 }
2626
2627 // Sort blocks by domination. This ensures we visit a block after all blocks
2628 // dominating it are visited.
2629 std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
2630 [this](const DomTreeNode *A, const DomTreeNode *B) {
2631 return DT->properlyDominates(A, B);
2632 });
2633
2634 // Perform O(N^2) search over the gather sequences and merge identical
2635 // instructions. TODO: We can further optimize this scan if we split the
2636 // instructions into different buckets based on the insert lane.
2637 SmallVector<Instruction *, 16> Visited;
2638 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
2639 assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
2640 "Worklist not sorted properly!");
2641 BasicBlock *BB = (*I)->getBlock();
2642 // For all instructions in blocks containing gather sequences:
2643 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
2644 Instruction *In = it++;
2645 if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
2646 continue;
2647
2648 // Check if we can replace this instruction with any of the
2649 // visited instructions.
2650 for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(),
2651 ve = Visited.end();
2652 v != ve; ++v) {
2653 if (In->isIdenticalTo(*v) &&
2654 DT->dominates((*v)->getParent(), In->getParent())) {
2655 In->replaceAllUsesWith(*v);
2656 eraseInstruction(In);
2657 In = nullptr;
2658 break;
2659 }
2660 }
2661 if (In) {
2662 assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end());
2663 Visited.push_back(In);
2664 }
2665 }
2666 }
2667 CSEBlocks.clear();
2668 GatherSeq.clear();
2669 }
2670
2671 // Groups the instructions to a bundle (which is then a single scheduling entity)
2672 // and schedules instructions until the bundle gets ready.
tryScheduleBundle(ArrayRef<Value * > VL,BoUpSLP * SLP)2673 bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
2674 BoUpSLP *SLP) {
2675 if (isa<PHINode>(VL[0]))
2676 return true;
2677
2678 // Initialize the instruction bundle.
2679 Instruction *OldScheduleEnd = ScheduleEnd;
2680 ScheduleData *PrevInBundle = nullptr;
2681 ScheduleData *Bundle = nullptr;
2682 bool ReSchedule = false;
2683 DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n");
2684 for (Value *V : VL) {
2685 extendSchedulingRegion(V);
2686 ScheduleData *BundleMember = getScheduleData(V);
2687 assert(BundleMember &&
2688 "no ScheduleData for bundle member (maybe not in same basic block)");
2689 if (BundleMember->IsScheduled) {
2690 // A bundle member was scheduled as single instruction before and now
2691 // needs to be scheduled as part of the bundle. We just get rid of the
2692 // existing schedule.
2693 DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
2694 << " was already scheduled\n");
2695 ReSchedule = true;
2696 }
2697 assert(BundleMember->isSchedulingEntity() &&
2698 "bundle member already part of other bundle");
2699 if (PrevInBundle) {
2700 PrevInBundle->NextInBundle = BundleMember;
2701 } else {
2702 Bundle = BundleMember;
2703 }
2704 BundleMember->UnscheduledDepsInBundle = 0;
2705 Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
2706
2707 // Group the instructions to a bundle.
2708 BundleMember->FirstInBundle = Bundle;
2709 PrevInBundle = BundleMember;
2710 }
2711 if (ScheduleEnd != OldScheduleEnd) {
2712 // The scheduling region got new instructions at the lower end (or it is a
2713 // new region for the first bundle). This makes it necessary to
2714 // recalculate all dependencies.
2715 // It is seldom that this needs to be done a second time after adding the
2716 // initial bundle to the region.
2717 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
2718 ScheduleData *SD = getScheduleData(I);
2719 SD->clearDependencies();
2720 }
2721 ReSchedule = true;
2722 }
2723 if (ReSchedule) {
2724 resetSchedule();
2725 initialFillReadyList(ReadyInsts);
2726 }
2727
2728 DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
2729 << BB->getName() << "\n");
2730
2731 calculateDependencies(Bundle, true, SLP);
2732
2733 // Now try to schedule the new bundle. As soon as the bundle is "ready" it
2734 // means that there are no cyclic dependencies and we can schedule it.
2735 // Note that's important that we don't "schedule" the bundle yet (see
2736 // cancelScheduling).
2737 while (!Bundle->isReady() && !ReadyInsts.empty()) {
2738
2739 ScheduleData *pickedSD = ReadyInsts.back();
2740 ReadyInsts.pop_back();
2741
2742 if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
2743 schedule(pickedSD, ReadyInsts);
2744 }
2745 }
2746 return Bundle->isReady();
2747 }
2748
cancelScheduling(ArrayRef<Value * > VL)2749 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
2750 if (isa<PHINode>(VL[0]))
2751 return;
2752
2753 ScheduleData *Bundle = getScheduleData(VL[0]);
2754 DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
2755 assert(!Bundle->IsScheduled &&
2756 "Can't cancel bundle which is already scheduled");
2757 assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
2758 "tried to unbundle something which is not a bundle");
2759
2760 // Un-bundle: make single instructions out of the bundle.
2761 ScheduleData *BundleMember = Bundle;
2762 while (BundleMember) {
2763 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
2764 BundleMember->FirstInBundle = BundleMember;
2765 ScheduleData *Next = BundleMember->NextInBundle;
2766 BundleMember->NextInBundle = nullptr;
2767 BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
2768 if (BundleMember->UnscheduledDepsInBundle == 0) {
2769 ReadyInsts.insert(BundleMember);
2770 }
2771 BundleMember = Next;
2772 }
2773 }
2774
extendSchedulingRegion(Value * V)2775 void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
2776 if (getScheduleData(V))
2777 return;
2778 Instruction *I = dyn_cast<Instruction>(V);
2779 assert(I && "bundle member must be an instruction");
2780 assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
2781 if (!ScheduleStart) {
2782 // It's the first instruction in the new region.
2783 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
2784 ScheduleStart = I;
2785 ScheduleEnd = I->getNextNode();
2786 assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
2787 DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
2788 return;
2789 }
2790 // Search up and down at the same time, because we don't know if the new
2791 // instruction is above or below the existing scheduling region.
2792 BasicBlock::reverse_iterator UpIter(ScheduleStart);
2793 BasicBlock::reverse_iterator UpperEnd = BB->rend();
2794 BasicBlock::iterator DownIter(ScheduleEnd);
2795 BasicBlock::iterator LowerEnd = BB->end();
2796 for (;;) {
2797 if (UpIter != UpperEnd) {
2798 if (&*UpIter == I) {
2799 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
2800 ScheduleStart = I;
2801 DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
2802 return;
2803 }
2804 UpIter++;
2805 }
2806 if (DownIter != LowerEnd) {
2807 if (&*DownIter == I) {
2808 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
2809 nullptr);
2810 ScheduleEnd = I->getNextNode();
2811 assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
2812 DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
2813 return;
2814 }
2815 DownIter++;
2816 }
2817 assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
2818 "instruction not found in block");
2819 }
2820 }
2821
initScheduleData(Instruction * FromI,Instruction * ToI,ScheduleData * PrevLoadStore,ScheduleData * NextLoadStore)2822 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
2823 Instruction *ToI,
2824 ScheduleData *PrevLoadStore,
2825 ScheduleData *NextLoadStore) {
2826 ScheduleData *CurrentLoadStore = PrevLoadStore;
2827 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
2828 ScheduleData *SD = ScheduleDataMap[I];
2829 if (!SD) {
2830 // Allocate a new ScheduleData for the instruction.
2831 if (ChunkPos >= ChunkSize) {
2832 ScheduleDataChunks.push_back(
2833 llvm::make_unique<ScheduleData[]>(ChunkSize));
2834 ChunkPos = 0;
2835 }
2836 SD = &(ScheduleDataChunks.back()[ChunkPos++]);
2837 ScheduleDataMap[I] = SD;
2838 SD->Inst = I;
2839 }
2840 assert(!isInSchedulingRegion(SD) &&
2841 "new ScheduleData already in scheduling region");
2842 SD->init(SchedulingRegionID);
2843
2844 if (I->mayReadOrWriteMemory()) {
2845 // Update the linked list of memory accessing instructions.
2846 if (CurrentLoadStore) {
2847 CurrentLoadStore->NextLoadStore = SD;
2848 } else {
2849 FirstLoadStoreInRegion = SD;
2850 }
2851 CurrentLoadStore = SD;
2852 }
2853 }
2854 if (NextLoadStore) {
2855 if (CurrentLoadStore)
2856 CurrentLoadStore->NextLoadStore = NextLoadStore;
2857 } else {
2858 LastLoadStoreInRegion = CurrentLoadStore;
2859 }
2860 }
2861
// Computes the dependency counters of the bundle headed by \p SD and, through
// a work list, of every destination bundle found along the way whose
// dependencies are not valid yet.
//
// For each bundle member two kinds of dependencies are counted:
//  - def-use: every instruction user that has ScheduleData inside the
//    scheduling region (non-instruction users are counted pessimistically);
//  - memory: memory-accessing instructions reachable through the
//    NextLoadStore chain, subject to the AliasedCheckLimit and
//    MaxMemDepDistance limits explained below.
//
// \p InsertInReadyList  if true, every processed bundle that is ready after
//                       the update is appended to ReadyInsts.
// \p SLP                supplies the alias analysis (SLP->AA) and the
//                       isAliased() query.
void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
                                                     bool InsertInReadyList,
                                                     BoUpSLP *SLP) {
  assert(SD->isSchedulingEntity());

  SmallVector<ScheduleData *, 10> WorkList;
  WorkList.push_back(SD);

  while (!WorkList.empty()) {
    // NOTE: this local deliberately shadows the parameter; from here on SD is
    // the bundle currently taken from the work list.
    ScheduleData *SD = WorkList.back();
    WorkList.pop_back();

    // Visit every member of the bundle.
    ScheduleData *BundleMember = SD;
    while (BundleMember) {
      assert(isInSchedulingRegion(BundleMember));
      if (!BundleMember->hasValidDependencies()) {

        DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
        BundleMember->Dependencies = 0;
        BundleMember->resetUnscheduledDeps();

        // Handle def-use chain dependencies.
        for (User *U : BundleMember->Inst->users()) {
          if (isa<Instruction>(U)) {
            ScheduleData *UseSD = getScheduleData(U);
            if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
              BundleMember->Dependencies++;
              ScheduleData *DestBundle = UseSD->FirstInBundle;
              if (!DestBundle->IsScheduled) {
                BundleMember->incrementUnscheduledDeps(1);
              }
              // The user's bundle needs its own dependencies computed too.
              if (!DestBundle->hasValidDependencies()) {
                WorkList.push_back(DestBundle);
              }
            }
          } else {
            // I'm not sure if this can ever happen. But we need to be safe.
            // This lets the instruction/bundle never be scheduled and
            // eventually disable vectorization.
            BundleMember->Dependencies++;
            BundleMember->incrementUnscheduledDeps(1);
          }
        }

        // Handle the memory dependencies.
        ScheduleData *DepDest = BundleMember->NextLoadStore;
        if (DepDest) {
          Instruction *SrcInst = BundleMember->Inst;
          MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
          bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
          // Number of dependencies recorded so far for this source.
          unsigned numAliased = 0;
          // Position of DepDest in the NextLoadStore chain, counted from the
          // source instruction.
          unsigned DistToSrc = 1;

          while (DepDest) {
            assert(isInSchedulingRegion(DepDest));

            // We have two limits to reduce the complexity:
            // 1) AliasedCheckLimit: It's a small limit to reduce calls to
            //    SLP->isAliased (which is the expensive part in this loop).
            // 2) MaxMemDepDistance: It's for very large blocks and it aborts
            //    the whole loop (even if the loop is fast, it's quadratic).
            //    It's important for the loop break condition (see below) to
            //    check this limit even between two read-only instructions.
            if (DistToSrc >= MaxMemDepDistance ||
                ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
                 (numAliased >= AliasedCheckLimit ||
                  SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {

              // We increment the counter only if the locations are aliased
              // (instead of counting all alias checks). This gives a better
              // balance between reduced runtime and accurate dependencies.
              numAliased++;

              DepDest->MemoryDependencies.push_back(BundleMember);
              BundleMember->Dependencies++;
              ScheduleData *DestBundle = DepDest->FirstInBundle;
              if (!DestBundle->IsScheduled) {
                BundleMember->incrementUnscheduledDeps(1);
              }
              if (!DestBundle->hasValidDependencies()) {
                WorkList.push_back(DestBundle);
              }
            }
            DepDest = DepDest->NextLoadStore;

            // Example, explaining the loop break condition: Let's assume our
            // starting instruction is i0 and MaxMemDepDistance = 3.
            //
            //                      +--------v--v--v
            //             i0,i1,i2,i3,i4,i5,i6,i7,i8
            //             +--------^--^--^
            //
            // MaxMemDepDistance let us stop alias-checking at i3 and we add
            // dependencies from i0 to i3,i4,.. (even if they are not aliased).
            // Previously we already added dependencies from i3 to i6,i7,i8
            // (because of MaxMemDepDistance). As we added a dependency from
            // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
            // and we can abort this loop at i6.
            if (DistToSrc >= 2 * MaxMemDepDistance)
              break;
            DistToSrc++;
          }
        }
      }
      BundleMember = BundleMember->NextInBundle;
    }
    // All members of this bundle are up to date now; publish it if requested.
    if (InsertInReadyList && SD->isReady()) {
      ReadyInsts.push_back(SD);
      DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n");
    }
  }
}
2974
resetSchedule()2975 void BoUpSLP::BlockScheduling::resetSchedule() {
2976 assert(ScheduleStart &&
2977 "tried to reset schedule on block which has not been scheduled");
2978 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
2979 ScheduleData *SD = getScheduleData(I);
2980 assert(isInSchedulingRegion(SD));
2981 SD->IsScheduled = false;
2982 SD->resetUnscheduledDeps();
2983 }
2984 ReadyInsts.clear();
2985 }
2986
scheduleBlock(BlockScheduling * BS)2987 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
2988
2989 if (!BS->ScheduleStart)
2990 return;
2991
2992 DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
2993
2994 BS->resetSchedule();
2995
2996 // For the real scheduling we use a more sophisticated ready-list: it is
2997 // sorted by the original instruction location. This lets the final schedule
2998 // be as close as possible to the original instruction order.
2999 struct ScheduleDataCompare {
3000 bool operator()(ScheduleData *SD1, ScheduleData *SD2) {
3001 return SD2->SchedulingPriority < SD1->SchedulingPriority;
3002 }
3003 };
3004 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
3005
3006 // Ensure that all depencency data is updated and fill the ready-list with
3007 // initial instructions.
3008 int Idx = 0;
3009 int NumToSchedule = 0;
3010 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
3011 I = I->getNextNode()) {
3012 ScheduleData *SD = BS->getScheduleData(I);
3013 assert(
3014 SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) &&
3015 "scheduler and vectorizer have different opinion on what is a bundle");
3016 SD->FirstInBundle->SchedulingPriority = Idx++;
3017 if (SD->isSchedulingEntity()) {
3018 BS->calculateDependencies(SD, false, this);
3019 NumToSchedule++;
3020 }
3021 }
3022 BS->initialFillReadyList(ReadyInsts);
3023
3024 Instruction *LastScheduledInst = BS->ScheduleEnd;
3025
3026 // Do the "real" scheduling.
3027 while (!ReadyInsts.empty()) {
3028 ScheduleData *picked = *ReadyInsts.begin();
3029 ReadyInsts.erase(ReadyInsts.begin());
3030
3031 // Move the scheduled instruction(s) to their dedicated places, if not
3032 // there yet.
3033 ScheduleData *BundleMember = picked;
3034 while (BundleMember) {
3035 Instruction *pickedInst = BundleMember->Inst;
3036 if (LastScheduledInst->getNextNode() != pickedInst) {
3037 BS->BB->getInstList().remove(pickedInst);
3038 BS->BB->getInstList().insert(LastScheduledInst, pickedInst);
3039 }
3040 LastScheduledInst = pickedInst;
3041 BundleMember = BundleMember->NextInBundle;
3042 }
3043
3044 BS->schedule(picked, ReadyInsts);
3045 NumToSchedule--;
3046 }
3047 assert(NumToSchedule == 0 && "could not schedule all instructions");
3048
3049 // Avoid duplicate scheduling of the block.
3050 BS->ScheduleStart = nullptr;
3051 }
3052
3053 /// The SLPVectorizer Pass.
3054 struct SLPVectorizer : public FunctionPass {
3055 typedef SmallVector<StoreInst *, 8> StoreList;
3056 typedef MapVector<Value *, StoreList> StoreListMap;
3057
3058 /// Pass identification, replacement for typeid
3059 static char ID;
3060
SLPVectorizer__anonc602e1830111::SLPVectorizer3061 explicit SLPVectorizer() : FunctionPass(ID) {
3062 initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
3063 }
3064
3065 ScalarEvolution *SE;
3066 TargetTransformInfo *TTI;
3067 TargetLibraryInfo *TLI;
3068 AliasAnalysis *AA;
3069 LoopInfo *LI;
3070 DominatorTree *DT;
3071 AssumptionCache *AC;
3072
runOnFunction__anonc602e1830111::SLPVectorizer3073 bool runOnFunction(Function &F) override {
3074 if (skipOptnoneFunction(F))
3075 return false;
3076
3077 SE = &getAnalysis<ScalarEvolution>();
3078 TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
3079 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
3080 TLI = TLIP ? &TLIP->getTLI() : nullptr;
3081 AA = &getAnalysis<AliasAnalysis>();
3082 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
3083 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
3084 AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
3085
3086 StoreRefs.clear();
3087 bool Changed = false;
3088
3089 // If the target claims to have no vector registers don't attempt
3090 // vectorization.
3091 if (!TTI->getNumberOfRegisters(true))
3092 return false;
3093
3094 // Use the vector register size specified by the target unless overridden
3095 // by a command-line option.
3096 // TODO: It would be better to limit the vectorization factor based on
3097 // data type rather than just register size. For example, x86 AVX has
3098 // 256-bit registers, but it does not support integer operations
3099 // at that width (that requires AVX2).
3100 if (MaxVectorRegSizeOption.getNumOccurrences())
3101 MaxVecRegSize = MaxVectorRegSizeOption;
3102 else
3103 MaxVecRegSize = TTI->getRegisterBitWidth(true);
3104
3105 // Don't vectorize when the attribute NoImplicitFloat is used.
3106 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
3107 return false;
3108
3109 DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
3110
3111 // Use the bottom up slp vectorizer to construct chains that start with
3112 // store instructions.
3113 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC);
3114
3115 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
3116 // delete instructions.
3117
3118 // Scan the blocks in the function in post order.
3119 for (auto BB : post_order(&F.getEntryBlock())) {
3120 // Vectorize trees that end at stores.
3121 if (unsigned count = collectStores(BB, R)) {
3122 (void)count;
3123 DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n");
3124 Changed |= vectorizeStoreChains(R);
3125 }
3126
3127 // Vectorize trees that end at reductions.
3128 Changed |= vectorizeChainsInBlock(BB, R);
3129 }
3130
3131 if (Changed) {
3132 R.optimizeGatherSequence();
3133 DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
3134 DEBUG(verifyFunction(F));
3135 }
3136 return Changed;
3137 }
3138
getAnalysisUsage__anonc602e1830111::SLPVectorizer3139 void getAnalysisUsage(AnalysisUsage &AU) const override {
3140 FunctionPass::getAnalysisUsage(AU);
3141 AU.addRequired<AssumptionCacheTracker>();
3142 AU.addRequired<ScalarEvolution>();
3143 AU.addRequired<AliasAnalysis>();
3144 AU.addRequired<TargetTransformInfoWrapperPass>();
3145 AU.addRequired<LoopInfoWrapperPass>();
3146 AU.addRequired<DominatorTreeWrapperPass>();
3147 AU.addPreserved<LoopInfoWrapperPass>();
3148 AU.addPreserved<DominatorTreeWrapperPass>();
3149 AU.setPreservesCFG();
3150 }
3151
3152 private:
3153
3154 /// \brief Collect memory references and sort them according to their base
3155 /// object. We sort the stores to their base objects to reduce the cost of the
3156 /// quadratic search on the stores. TODO: We can further reduce this cost
3157 /// if we flush the chain creation every time we run into a memory barrier.
3158 unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
3159
3160 /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
3161 bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
3162
3163 /// \brief Try to vectorize a list of operands.
3164 /// \@param BuildVector A list of users to ignore for the purpose of
3165 /// scheduling and that don't need extracting.
3166 /// \returns true if a value was vectorized.
3167 bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
3168 ArrayRef<Value *> BuildVector = None,
3169 bool allowReorder = false);
3170
3171 /// \brief Try to vectorize a chain that may start at the operands of \V;
3172 bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
3173
3174 /// \brief Vectorize the stores that were collected in StoreRefs.
3175 bool vectorizeStoreChains(BoUpSLP &R);
3176
3177 /// \brief Scan the basic block and look for patterns that are likely to start
3178 /// a vectorization chain.
3179 bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
3180
3181 bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
3182 BoUpSLP &R, unsigned VecRegSize);
3183
3184 bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
3185 BoUpSLP &R);
3186 private:
3187 StoreListMap StoreRefs;
3188 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
3189 };
3190
3191 /// \brief Check that the Values in the slice in VL array are still existent in
3192 /// the WeakVH array.
3193 /// Vectorization of part of the VL array may cause later values in the VL array
3194 /// to become invalid. We track when this has happened in the WeakVH array.
hasValueBeenRAUWed(ArrayRef<Value * > VL,ArrayRef<WeakVH> VH,unsigned SliceBegin,unsigned SliceSize)3195 static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
3196 unsigned SliceBegin, unsigned SliceSize) {
3197 VL = VL.slice(SliceBegin, SliceSize);
3198 VH = VH.slice(SliceBegin, SliceSize);
3199 return !std::equal(VL.begin(), VL.end(), VH.begin());
3200 }
3201
vectorizeStoreChain(ArrayRef<Value * > Chain,int CostThreshold,BoUpSLP & R,unsigned VecRegSize)3202 bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
3203 int CostThreshold, BoUpSLP &R,
3204 unsigned VecRegSize) {
3205 unsigned ChainLen = Chain.size();
3206 DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
3207 << "\n");
3208 Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
3209 auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout();
3210 unsigned Sz = DL.getTypeSizeInBits(StoreTy);
3211 unsigned VF = VecRegSize / Sz;
3212
3213 if (!isPowerOf2_32(Sz) || VF < 2)
3214 return false;
3215
3216 // Keep track of values that were deleted by vectorizing in the loop below.
3217 SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());
3218
3219 bool Changed = false;
3220 // Look for profitable vectorizable trees at all offsets, starting at zero.
3221 for (unsigned i = 0, e = ChainLen; i < e; ++i) {
3222 if (i + VF > e)
3223 break;
3224
3225 // Check that a previous iteration of this loop did not delete the Value.
3226 if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
3227 continue;
3228
3229 DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
3230 << "\n");
3231 ArrayRef<Value *> Operands = Chain.slice(i, VF);
3232
3233 R.buildTree(Operands);
3234
3235 int Cost = R.getTreeCost();
3236
3237 DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
3238 if (Cost < CostThreshold) {
3239 DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
3240 R.vectorizeTree();
3241
3242 // Move to the next bundle.
3243 i += VF - 1;
3244 Changed = true;
3245 }
3246 }
3247
3248 return Changed;
3249 }
3250
vectorizeStores(ArrayRef<StoreInst * > Stores,int costThreshold,BoUpSLP & R)3251 bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
3252 int costThreshold, BoUpSLP &R) {
3253 SetVector<StoreInst *> Heads, Tails;
3254 SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
3255
3256 // We may run into multiple chains that merge into a single chain. We mark the
3257 // stores that we vectorized so that we don't visit the same store twice.
3258 BoUpSLP::ValueSet VectorizedStores;
3259 bool Changed = false;
3260
3261 // Do a quadratic search on all of the given stores and find
3262 // all of the pairs of stores that follow each other.
3263 for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
3264 for (unsigned j = 0; j < e; ++j) {
3265 if (i == j)
3266 continue;
3267 const DataLayout &DL = Stores[i]->getModule()->getDataLayout();
3268 if (R.isConsecutiveAccess(Stores[i], Stores[j], DL)) {
3269 Tails.insert(Stores[j]);
3270 Heads.insert(Stores[i]);
3271 ConsecutiveChain[Stores[i]] = Stores[j];
3272 }
3273 }
3274 }
3275
3276 // For stores that start but don't end a link in the chain:
3277 for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
3278 it != e; ++it) {
3279 if (Tails.count(*it))
3280 continue;
3281
3282 // We found a store instr that starts a chain. Now follow the chain and try
3283 // to vectorize it.
3284 BoUpSLP::ValueList Operands;
3285 StoreInst *I = *it;
3286 // Collect the chain into a list.
3287 while (Tails.count(I) || Heads.count(I)) {
3288 if (VectorizedStores.count(I))
3289 break;
3290 Operands.push_back(I);
3291 // Move to the next value in the chain.
3292 I = ConsecutiveChain[I];
3293 }
3294
3295 // FIXME: Is division-by-2 the correct step? Should we assert that the
3296 // register size is a power-of-2?
3297 for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
3298 if (vectorizeStoreChain(Operands, costThreshold, R, Size)) {
3299 // Mark the vectorized stores so that we don't vectorize them again.
3300 VectorizedStores.insert(Operands.begin(), Operands.end());
3301 Changed = true;
3302 break;
3303 }
3304 }
3305 }
3306
3307 return Changed;
3308 }
3309
3310
collectStores(BasicBlock * BB,BoUpSLP & R)3311 unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
3312 unsigned count = 0;
3313 StoreRefs.clear();
3314 const DataLayout &DL = BB->getModule()->getDataLayout();
3315 for (Instruction &I : *BB) {
3316 StoreInst *SI = dyn_cast<StoreInst>(&I);
3317 if (!SI)
3318 continue;
3319
3320 // Don't touch volatile stores.
3321 if (!SI->isSimple())
3322 continue;
3323
3324 // Check that the pointer points to scalars.
3325 Type *Ty = SI->getValueOperand()->getType();
3326 if (!isValidElementType(Ty))
3327 continue;
3328
3329 // Find the base pointer.
3330 Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL);
3331
3332 // Save the store locations.
3333 StoreRefs[Ptr].push_back(SI);
3334 count++;
3335 }
3336 return count;
3337 }
3338
tryToVectorizePair(Value * A,Value * B,BoUpSLP & R)3339 bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
3340 if (!A || !B)
3341 return false;
3342 Value *VL[] = { A, B };
3343 return tryToVectorizeList(VL, R, None, true);
3344 }
3345
tryToVectorizeList(ArrayRef<Value * > VL,BoUpSLP & R,ArrayRef<Value * > BuildVector,bool allowReorder)3346 bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
3347 ArrayRef<Value *> BuildVector,
3348 bool allowReorder) {
3349 if (VL.size() < 2)
3350 return false;
3351
3352 DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n");
3353
3354 // Check that all of the parts are scalar instructions of the same type.
3355 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
3356 if (!I0)
3357 return false;
3358
3359 unsigned Opcode0 = I0->getOpcode();
3360 const DataLayout &DL = I0->getModule()->getDataLayout();
3361
3362 Type *Ty0 = I0->getType();
3363 unsigned Sz = DL.getTypeSizeInBits(Ty0);
3364 // FIXME: Register size should be a parameter to this function, so we can
3365 // try different vectorization factors.
3366 unsigned VF = MinVecRegSize / Sz;
3367
3368 for (Value *V : VL) {
3369 Type *Ty = V->getType();
3370 if (!isValidElementType(Ty))
3371 return false;
3372 Instruction *Inst = dyn_cast<Instruction>(V);
3373 if (!Inst || Inst->getOpcode() != Opcode0)
3374 return false;
3375 }
3376
3377 bool Changed = false;
3378
3379 // Keep track of values that were deleted by vectorizing in the loop below.
3380 SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
3381
3382 for (unsigned i = 0, e = VL.size(); i < e; ++i) {
3383 unsigned OpsWidth = 0;
3384
3385 if (i + VF > e)
3386 OpsWidth = e - i;
3387 else
3388 OpsWidth = VF;
3389
3390 if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
3391 break;
3392
3393 // Check that a previous iteration of this loop did not delete the Value.
3394 if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
3395 continue;
3396
3397 DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
3398 << "\n");
3399 ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
3400
3401 ArrayRef<Value *> BuildVectorSlice;
3402 if (!BuildVector.empty())
3403 BuildVectorSlice = BuildVector.slice(i, OpsWidth);
3404
3405 R.buildTree(Ops, BuildVectorSlice);
3406 // TODO: check if we can allow reordering also for other cases than
3407 // tryToVectorizePair()
3408 if (allowReorder && R.shouldReorder()) {
3409 assert(Ops.size() == 2);
3410 assert(BuildVectorSlice.empty());
3411 Value *ReorderedOps[] = { Ops[1], Ops[0] };
3412 R.buildTree(ReorderedOps, None);
3413 }
3414 int Cost = R.getTreeCost();
3415
3416 if (Cost < -SLPCostThreshold) {
3417 DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
3418 Value *VectorizedRoot = R.vectorizeTree();
3419
3420 // Reconstruct the build vector by extracting the vectorized root. This
3421 // way we handle the case where some elements of the vector are undefined.
3422 // (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2))
3423 if (!BuildVectorSlice.empty()) {
3424 // The insert point is the last build vector instruction. The vectorized
3425 // root will precede it. This guarantees that we get an instruction. The
3426 // vectorized tree could have been constant folded.
3427 Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
3428 unsigned VecIdx = 0;
3429 for (auto &V : BuildVectorSlice) {
3430 IRBuilder<true, NoFolder> Builder(
3431 ++BasicBlock::iterator(InsertAfter));
3432 InsertElementInst *IE = cast<InsertElementInst>(V);
3433 Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
3434 VectorizedRoot, Builder.getInt32(VecIdx++)));
3435 IE->setOperand(1, Extract);
3436 IE->removeFromParent();
3437 IE->insertAfter(Extract);
3438 InsertAfter = IE;
3439 }
3440 }
3441 // Move to the next bundle.
3442 i += VF - 1;
3443 Changed = true;
3444 }
3445 }
3446
3447 return Changed;
3448 }
3449
tryToVectorize(BinaryOperator * V,BoUpSLP & R)3450 bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
3451 if (!V)
3452 return false;
3453
3454 // Try to vectorize V.
3455 if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
3456 return true;
3457
3458 BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
3459 BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
3460 // Try to skip B.
3461 if (B && B->hasOneUse()) {
3462 BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
3463 BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
3464 if (tryToVectorizePair(A, B0, R)) {
3465 return true;
3466 }
3467 if (tryToVectorizePair(A, B1, R)) {
3468 return true;
3469 }
3470 }
3471
3472 // Try to skip A.
3473 if (A && A->hasOneUse()) {
3474 BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
3475 BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
3476 if (tryToVectorizePair(A0, B, R)) {
3477 return true;
3478 }
3479 if (tryToVectorizePair(A1, B, R)) {
3480 return true;
3481 }
3482 }
3483 return 0;
3484 }
3485
3486 /// \brief Generate a shuffle mask to be used in a reduction tree.
3487 ///
3488 /// \param VecLen The length of the vector to be reduced.
3489 /// \param NumEltsToRdx The number of elements that should be reduced in the
3490 /// vector.
3491 /// \param IsPairwise Whether the reduction is a pairwise or splitting
3492 /// reduction. A pairwise reduction will generate a mask of
3493 /// <0,2,...> or <1,3,..> while a splitting reduction will generate
3494 /// <2,3, undef,undef> for a vector of 4 and NumElts = 2.
3495 /// \param IsLeft True will generate a mask of even elements, odd otherwise.
createRdxShuffleMask(unsigned VecLen,unsigned NumEltsToRdx,bool IsPairwise,bool IsLeft,IRBuilder<> & Builder)3496 static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
3497 bool IsPairwise, bool IsLeft,
3498 IRBuilder<> &Builder) {
3499 assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
3500
3501 SmallVector<Constant *, 32> ShuffleMask(
3502 VecLen, UndefValue::get(Builder.getInt32Ty()));
3503
3504 if (IsPairwise)
3505 // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
3506 for (unsigned i = 0; i != NumEltsToRdx; ++i)
3507 ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
3508 else
3509 // Move the upper half of the vector to the lower half.
3510 for (unsigned i = 0; i != NumEltsToRdx; ++i)
3511 ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
3512
3513 return ConstantVector::get(ShuffleMask);
3514 }
3515
3516
3517 /// Model horizontal reductions.
3518 ///
3519 /// A horizontal reduction is a tree of reduction operations (currently add and
3520 /// fadd) that has operations that can be put into a vector as its leaf.
3521 /// For example, this tree:
3522 ///
3523 /// mul mul mul mul
3524 /// \ / \ /
3525 /// + +
3526 /// \ /
3527 /// +
3528 /// This tree has "mul" as its reduced values and "+" as its reduction
3529 /// operations. A reduction might be feeding into a store or a binary operation
3530 /// feeding a phi.
3531 /// ...
3532 /// \ /
3533 /// +
3534 /// |
3535 /// phi +=
3536 ///
3537 /// Or:
3538 /// ...
3539 /// \ /
3540 /// +
3541 /// |
3542 /// *p =
3543 ///
3544 class HorizontalReduction {
3545 SmallVector<Value *, 16> ReductionOps;
3546 SmallVector<Value *, 32> ReducedVals;
3547
3548 BinaryOperator *ReductionRoot;
3549 PHINode *ReductionPHI;
3550
3551 /// The opcode of the reduction.
3552 unsigned ReductionOpcode;
3553 /// The opcode of the values we perform a reduction on.
3554 unsigned ReducedValueOpcode;
3555 /// The width of one full horizontal reduction operation.
3556 unsigned ReduxWidth;
3557 /// Should we model this reduction as a pairwise reduction tree or a tree that
3558 /// splits the vector in halves and adds those halves.
3559 bool IsPairwiseReduction;
3560
3561 public:
HorizontalReduction()3562 HorizontalReduction()
3563 : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
3564 ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
3565
3566 /// \brief Try to find a reduction tree.
matchAssociativeReduction(PHINode * Phi,BinaryOperator * B)3567 bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
3568 assert((!Phi ||
3569 std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
3570 "Thi phi needs to use the binary operator");
3571
3572 // We could have a initial reductions that is not an add.
3573 // r *= v1 + v2 + v3 + v4
3574 // In such a case start looking for a tree rooted in the first '+'.
3575 if (Phi) {
3576 if (B->getOperand(0) == Phi) {
3577 Phi = nullptr;
3578 B = dyn_cast<BinaryOperator>(B->getOperand(1));
3579 } else if (B->getOperand(1) == Phi) {
3580 Phi = nullptr;
3581 B = dyn_cast<BinaryOperator>(B->getOperand(0));
3582 }
3583 }
3584
3585 if (!B)
3586 return false;
3587
3588 Type *Ty = B->getType();
3589 if (!isValidElementType(Ty))
3590 return false;
3591
3592 const DataLayout &DL = B->getModule()->getDataLayout();
3593 ReductionOpcode = B->getOpcode();
3594 ReducedValueOpcode = 0;
3595 // FIXME: Register size should be a parameter to this function, so we can
3596 // try different vectorization factors.
3597 ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
3598 ReductionRoot = B;
3599 ReductionPHI = Phi;
3600
3601 if (ReduxWidth < 4)
3602 return false;
3603
3604 // We currently only support adds.
3605 if (ReductionOpcode != Instruction::Add &&
3606 ReductionOpcode != Instruction::FAdd)
3607 return false;
3608
3609 // Post order traverse the reduction tree starting at B. We only handle true
3610 // trees containing only binary operators.
3611 SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack;
3612 Stack.push_back(std::make_pair(B, 0));
3613 while (!Stack.empty()) {
3614 BinaryOperator *TreeN = Stack.back().first;
3615 unsigned EdgeToVist = Stack.back().second++;
3616 bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
3617
3618 // Only handle trees in the current basic block.
3619 if (TreeN->getParent() != B->getParent())
3620 return false;
3621
3622 // Each tree node needs to have one user except for the ultimate
3623 // reduction.
3624 if (!TreeN->hasOneUse() && TreeN != B)
3625 return false;
3626
3627 // Postorder vist.
3628 if (EdgeToVist == 2 || IsReducedValue) {
3629 if (IsReducedValue) {
3630 // Make sure that the opcodes of the operations that we are going to
3631 // reduce match.
3632 if (!ReducedValueOpcode)
3633 ReducedValueOpcode = TreeN->getOpcode();
3634 else if (ReducedValueOpcode != TreeN->getOpcode())
3635 return false;
3636 ReducedVals.push_back(TreeN);
3637 } else {
3638 // We need to be able to reassociate the adds.
3639 if (!TreeN->isAssociative())
3640 return false;
3641 ReductionOps.push_back(TreeN);
3642 }
3643 // Retract.
3644 Stack.pop_back();
3645 continue;
3646 }
3647
3648 // Visit left or right.
3649 Value *NextV = TreeN->getOperand(EdgeToVist);
3650 BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV);
3651 if (Next)
3652 Stack.push_back(std::make_pair(Next, 0));
3653 else if (NextV != Phi)
3654 return false;
3655 }
3656 return true;
3657 }
3658
3659 /// \brief Attempt to vectorize the tree found by
3660 /// matchAssociativeReduction.
tryToReduce(BoUpSLP & V,TargetTransformInfo * TTI)3661 bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
3662 if (ReducedVals.empty())
3663 return false;
3664
3665 unsigned NumReducedVals = ReducedVals.size();
3666 if (NumReducedVals < ReduxWidth)
3667 return false;
3668
3669 Value *VectorizedTree = nullptr;
3670 IRBuilder<> Builder(ReductionRoot);
3671 FastMathFlags Unsafe;
3672 Unsafe.setUnsafeAlgebra();
3673 Builder.SetFastMathFlags(Unsafe);
3674 unsigned i = 0;
3675
3676 for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
3677 V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
3678
3679 // Estimate cost.
3680 int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
3681 if (Cost >= -SLPCostThreshold)
3682 break;
3683
3684 DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
3685 << ". (HorRdx)\n");
3686
3687 // Vectorize a tree.
3688 DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
3689 Value *VectorizedRoot = V.vectorizeTree();
3690
3691 // Emit a reduction.
3692 Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
3693 if (VectorizedTree) {
3694 Builder.SetCurrentDebugLocation(Loc);
3695 VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
3696 ReducedSubTree, "bin.rdx");
3697 } else
3698 VectorizedTree = ReducedSubTree;
3699 }
3700
3701 if (VectorizedTree) {
3702 // Finish the reduction.
3703 for (; i < NumReducedVals; ++i) {
3704 Builder.SetCurrentDebugLocation(
3705 cast<Instruction>(ReducedVals[i])->getDebugLoc());
3706 VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
3707 ReducedVals[i]);
3708 }
3709 // Update users.
3710 if (ReductionPHI) {
3711 assert(ReductionRoot && "Need a reduction operation");
3712 ReductionRoot->setOperand(0, VectorizedTree);
3713 ReductionRoot->setOperand(1, ReductionPHI);
3714 } else
3715 ReductionRoot->replaceAllUsesWith(VectorizedTree);
3716 }
3717 return VectorizedTree != nullptr;
3718 }
3719
3720 private:
3721
3722 /// \brief Calcuate the cost of a reduction.
getReductionCost(TargetTransformInfo * TTI,Value * FirstReducedVal)3723 int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
3724 Type *ScalarTy = FirstReducedVal->getType();
3725 Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
3726
3727 int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
3728 int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
3729
3730 IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
3731 int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
3732
3733 int ScalarReduxCost =
3734 ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);
3735
3736 DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
3737 << " for reduction that starts with " << *FirstReducedVal
3738 << " (It is a "
3739 << (IsPairwiseReduction ? "pairwise" : "splitting")
3740 << " reduction)\n");
3741
3742 return VecReduxCost - ScalarReduxCost;
3743 }
3744
createBinOp(IRBuilder<> & Builder,unsigned Opcode,Value * L,Value * R,const Twine & Name="")3745 static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
3746 Value *R, const Twine &Name = "") {
3747 if (Opcode == Instruction::FAdd)
3748 return Builder.CreateFAdd(L, R, Name);
3749 return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
3750 }
3751
3752 /// \brief Emit a horizontal reduction of the vectorized value.
emitReduction(Value * VectorizedValue,IRBuilder<> & Builder)3753 Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
3754 assert(VectorizedValue && "Need to have a vectorized tree node");
3755 assert(isPowerOf2_32(ReduxWidth) &&
3756 "We only handle power-of-two reductions for now");
3757
3758 Value *TmpVec = VectorizedValue;
3759 for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
3760 if (IsPairwiseReduction) {
3761 Value *LeftMask =
3762 createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
3763 Value *RightMask =
3764 createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
3765
3766 Value *LeftShuf = Builder.CreateShuffleVector(
3767 TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
3768 Value *RightShuf = Builder.CreateShuffleVector(
3769 TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
3770 "rdx.shuf.r");
3771 TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
3772 "bin.rdx");
3773 } else {
3774 Value *UpperHalf =
3775 createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
3776 Value *Shuf = Builder.CreateShuffleVector(
3777 TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
3778 TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
3779 }
3780 }
3781
3782 // The result is in the first element of the vector.
3783 return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
3784 }
3785 };
3786
3787 /// \brief Recognize construction of vectors like
3788 /// %ra = insertelement <4 x float> undef, float %s0, i32 0
3789 /// %rb = insertelement <4 x float> %ra, float %s1, i32 1
3790 /// %rc = insertelement <4 x float> %rb, float %s2, i32 2
3791 /// %rd = insertelement <4 x float> %rc, float %s3, i32 3
3792 ///
3793 /// Returns true if it matches
3794 ///
findBuildVector(InsertElementInst * FirstInsertElem,SmallVectorImpl<Value * > & BuildVector,SmallVectorImpl<Value * > & BuildVectorOpds)3795 static bool findBuildVector(InsertElementInst *FirstInsertElem,
3796 SmallVectorImpl<Value *> &BuildVector,
3797 SmallVectorImpl<Value *> &BuildVectorOpds) {
3798 if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
3799 return false;
3800
3801 InsertElementInst *IE = FirstInsertElem;
3802 while (true) {
3803 BuildVector.push_back(IE);
3804 BuildVectorOpds.push_back(IE->getOperand(1));
3805
3806 if (IE->use_empty())
3807 return false;
3808
3809 InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back());
3810 if (!NextUse)
3811 return true;
3812
3813 // If this isn't the final use, make sure the next insertelement is the only
3814 // use. It's OK if the final constructed vector is used multiple times
3815 if (!IE->hasOneUse())
3816 return false;
3817
3818 IE = NextUse;
3819 }
3820
3821 return false;
3822 }
3823
PhiTypeSorterFunc(Value * V,Value * V2)3824 static bool PhiTypeSorterFunc(Value *V, Value *V2) {
3825 return V->getType() < V2->getType();
3826 }
3827
vectorizeChainsInBlock(BasicBlock * BB,BoUpSLP & R)3828 bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
3829 bool Changed = false;
3830 SmallVector<Value *, 4> Incoming;
3831 SmallSet<Value *, 16> VisitedInstrs;
3832
3833 bool HaveVectorizedPhiNodes = true;
3834 while (HaveVectorizedPhiNodes) {
3835 HaveVectorizedPhiNodes = false;
3836
3837 // Collect the incoming values from the PHIs.
3838 Incoming.clear();
3839 for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie;
3840 ++instr) {
3841 PHINode *P = dyn_cast<PHINode>(instr);
3842 if (!P)
3843 break;
3844
3845 if (!VisitedInstrs.count(P))
3846 Incoming.push_back(P);
3847 }
3848
3849 // Sort by type.
3850 std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);
3851
3852 // Try to vectorize elements base on their type.
3853 for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
3854 E = Incoming.end();
3855 IncIt != E;) {
3856
3857 // Look for the next elements with the same type.
3858 SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
3859 while (SameTypeIt != E &&
3860 (*SameTypeIt)->getType() == (*IncIt)->getType()) {
3861 VisitedInstrs.insert(*SameTypeIt);
3862 ++SameTypeIt;
3863 }
3864
3865 // Try to vectorize them.
3866 unsigned NumElts = (SameTypeIt - IncIt);
3867 DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
3868 if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
3869 // Success start over because instructions might have been changed.
3870 HaveVectorizedPhiNodes = true;
3871 Changed = true;
3872 break;
3873 }
3874
3875 // Start over at the next instruction of a different type (or the end).
3876 IncIt = SameTypeIt;
3877 }
3878 }
3879
3880 VisitedInstrs.clear();
3881
3882 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
3883 // We may go through BB multiple times so skip the one we have checked.
3884 if (!VisitedInstrs.insert(it).second)
3885 continue;
3886
3887 if (isa<DbgInfoIntrinsic>(it))
3888 continue;
3889
3890 // Try to vectorize reductions that use PHINodes.
3891 if (PHINode *P = dyn_cast<PHINode>(it)) {
3892 // Check that the PHI is a reduction PHI.
3893 if (P->getNumIncomingValues() != 2)
3894 return Changed;
3895 Value *Rdx =
3896 (P->getIncomingBlock(0) == BB
3897 ? (P->getIncomingValue(0))
3898 : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1)
3899 : nullptr));
3900 // Check if this is a Binary Operator.
3901 BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
3902 if (!BI)
3903 continue;
3904
3905 // Try to match and vectorize a horizontal reduction.
3906 HorizontalReduction HorRdx;
3907 if (ShouldVectorizeHor && HorRdx.matchAssociativeReduction(P, BI) &&
3908 HorRdx.tryToReduce(R, TTI)) {
3909 Changed = true;
3910 it = BB->begin();
3911 e = BB->end();
3912 continue;
3913 }
3914
3915 Value *Inst = BI->getOperand(0);
3916 if (Inst == P)
3917 Inst = BI->getOperand(1);
3918
3919 if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
3920 // We would like to start over since some instructions are deleted
3921 // and the iterator may become invalid value.
3922 Changed = true;
3923 it = BB->begin();
3924 e = BB->end();
3925 continue;
3926 }
3927
3928 continue;
3929 }
3930
3931 // Try to vectorize horizontal reductions feeding into a store.
3932 if (ShouldStartVectorizeHorAtStore)
3933 if (StoreInst *SI = dyn_cast<StoreInst>(it))
3934 if (BinaryOperator *BinOp =
3935 dyn_cast<BinaryOperator>(SI->getValueOperand())) {
3936 HorizontalReduction HorRdx;
3937 if (((HorRdx.matchAssociativeReduction(nullptr, BinOp) &&
3938 HorRdx.tryToReduce(R, TTI)) ||
3939 tryToVectorize(BinOp, R))) {
3940 Changed = true;
3941 it = BB->begin();
3942 e = BB->end();
3943 continue;
3944 }
3945 }
3946
3947 // Try to vectorize horizontal reductions feeding into a return.
3948 if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
3949 if (RI->getNumOperands() != 0)
3950 if (BinaryOperator *BinOp =
3951 dyn_cast<BinaryOperator>(RI->getOperand(0))) {
3952 DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
3953 if (tryToVectorizePair(BinOp->getOperand(0),
3954 BinOp->getOperand(1), R)) {
3955 Changed = true;
3956 it = BB->begin();
3957 e = BB->end();
3958 continue;
3959 }
3960 }
3961
3962 // Try to vectorize trees that start at compare instructions.
3963 if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
3964 if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
3965 Changed = true;
3966 // We would like to start over since some instructions are deleted
3967 // and the iterator may become invalid value.
3968 it = BB->begin();
3969 e = BB->end();
3970 continue;
3971 }
3972
3973 for (int i = 0; i < 2; ++i) {
3974 if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
3975 if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
3976 Changed = true;
3977 // We would like to start over since some instructions are deleted
3978 // and the iterator may become invalid value.
3979 it = BB->begin();
3980 e = BB->end();
3981 break;
3982 }
3983 }
3984 }
3985 continue;
3986 }
3987
3988 // Try to vectorize trees that start at insertelement instructions.
3989 if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
3990 SmallVector<Value *, 16> BuildVector;
3991 SmallVector<Value *, 16> BuildVectorOpds;
3992 if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
3993 continue;
3994
3995 // Vectorize starting with the build vector operands ignoring the
3996 // BuildVector instructions for the purpose of scheduling and user
3997 // extraction.
3998 if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
3999 Changed = true;
4000 it = BB->begin();
4001 e = BB->end();
4002 }
4003
4004 continue;
4005 }
4006 }
4007
4008 return Changed;
4009 }
4010
vectorizeStoreChains(BoUpSLP & R)4011 bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
4012 bool Changed = false;
4013 // Attempt to sort and vectorize each of the store-groups.
4014 for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
4015 it != e; ++it) {
4016 if (it->second.size() < 2)
4017 continue;
4018
4019 DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
4020 << it->second.size() << ".\n");
4021
4022 // Process the stores in chunks of 16.
4023 // TODO: The limit of 16 inhibits greater vectorization factors.
4024 // For example, AVX2 supports v32i8. Increasing this limit, however,
4025 // may cause a significant compile-time increase.
4026 for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
4027 unsigned Len = std::min<unsigned>(CE - CI, 16);
4028 Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
4029 -SLPCostThreshold, R);
4030 }
4031 }
4032 return Changed;
4033 }
4034
4035 } // end anonymous namespace
4036
// Definition of the pass's static ID member and the human-readable pass name
// used in the registration below.
char SLPVectorizer::ID = 0;
static const char lv_name[] = "SLP Vectorizer";
// Register the pass under SV_NAME and list the analyses and passes named as
// its dependencies.
INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
4046
4047 namespace llvm {
createSLPVectorizerPass()4048 Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
4049 }
4050