1 //===--- SelectOptimize.cpp - Convert select to branches if profitable ---===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass converts selects to conditional jumps when profitable.
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "llvm/ADT/Optional.h"
14 #include "llvm/ADT/SmallVector.h"
15 #include "llvm/ADT/Statistic.h"
16 #include "llvm/Analysis/BlockFrequencyInfo.h"
17 #include "llvm/Analysis/BranchProbabilityInfo.h"
18 #include "llvm/Analysis/LoopInfo.h"
19 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
20 #include "llvm/Analysis/ProfileSummaryInfo.h"
21 #include "llvm/Analysis/TargetTransformInfo.h"
22 #include "llvm/CodeGen/Passes.h"
23 #include "llvm/CodeGen/TargetLowering.h"
24 #include "llvm/CodeGen/TargetPassConfig.h"
25 #include "llvm/CodeGen/TargetSchedule.h"
26 #include "llvm/CodeGen/TargetSubtargetInfo.h"
27 #include "llvm/IR/BasicBlock.h"
28 #include "llvm/IR/Dominators.h"
29 #include "llvm/IR/Function.h"
30 #include "llvm/IR/IRBuilder.h"
31 #include "llvm/IR/Instruction.h"
32 #include "llvm/InitializePasses.h"
33 #include "llvm/Pass.h"
34 #include "llvm/Support/ScaledNumber.h"
35 #include "llvm/Target/TargetMachine.h"
36 #include "llvm/Transforms/Utils/SizeOpts.h"
37 #include <algorithm>
38 #include <memory>
39 #include <queue>
40 #include <stack>
41 #include <string>
42
43 using namespace llvm;
44
45 #define DEBUG_TYPE "select-optimize"
46
47 STATISTIC(NumSelectOptAnalyzed,
48 "Number of select groups considered for conversion to branch");
49 STATISTIC(NumSelectConvertedExpColdOperand,
50 "Number of select groups converted due to expensive cold operand");
51 STATISTIC(NumSelectConvertedHighPred,
52 "Number of select groups converted due to high-predictability");
53 STATISTIC(NumSelectUnPred,
54 "Number of select groups not converted due to unpredictability");
55 STATISTIC(NumSelectColdBB,
56 "Number of select groups not converted due to cold basic block");
57 STATISTIC(NumSelectConvertedLoop,
58 "Number of select groups converted due to loop-level analysis");
59 STATISTIC(NumSelectsConverted, "Number of selects converted");
60
61 static cl::opt<unsigned> ColdOperandThreshold(
62 "cold-operand-threshold",
63 cl::desc("Maximum frequency of path for an operand to be considered cold."),
64 cl::init(20), cl::Hidden);
65
66 static cl::opt<unsigned> ColdOperandMaxCostMultiplier(
67 "cold-operand-max-cost-multiplier",
68 cl::desc("Maximum cost multiplier of TCC_expensive for the dependence "
69 "slice of a cold operand to be considered inexpensive."),
70 cl::init(1), cl::Hidden);
71
72 static cl::opt<unsigned>
73 GainGradientThreshold("select-opti-loop-gradient-gain-threshold",
74 cl::desc("Gradient gain threshold (%)."),
75 cl::init(25), cl::Hidden);
76
77 static cl::opt<unsigned>
78 GainCycleThreshold("select-opti-loop-cycle-gain-threshold",
79 cl::desc("Minimum gain per loop (in cycles) threshold."),
80 cl::init(4), cl::Hidden);
81
82 static cl::opt<unsigned> GainRelativeThreshold(
83 "select-opti-loop-relative-gain-threshold",
84 cl::desc(
85 "Minimum relative gain per loop threshold (1/X). Defaults to 12.5%"),
86 cl::init(8), cl::Hidden);
87
88 static cl::opt<unsigned> MispredictDefaultRate(
89 "mispredict-default-rate", cl::Hidden, cl::init(25),
90 cl::desc("Default mispredict rate (initialized to 25%)."));
91
92 static cl::opt<bool>
93 DisableLoopLevelHeuristics("disable-loop-level-heuristics", cl::Hidden,
94 cl::init(false),
95 cl::desc("Disable loop-level heuristics."));
96
97 namespace {
98
99 class SelectOptimize : public FunctionPass {
100 const TargetMachine *TM = nullptr;
101 const TargetSubtargetInfo *TSI;
102 const TargetLowering *TLI = nullptr;
103 const TargetTransformInfo *TTI = nullptr;
104 const LoopInfo *LI;
105 DominatorTree *DT;
106 std::unique_ptr<BlockFrequencyInfo> BFI;
107 std::unique_ptr<BranchProbabilityInfo> BPI;
108 ProfileSummaryInfo *PSI;
109 OptimizationRemarkEmitter *ORE;
110 TargetSchedModel TSchedModel;
111
112 public:
113 static char ID;
114
SelectOptimize()115 SelectOptimize() : FunctionPass(ID) {
116 initializeSelectOptimizePass(*PassRegistry::getPassRegistry());
117 }
118
119 bool runOnFunction(Function &F) override;
120
getAnalysisUsage(AnalysisUsage & AU) const121 void getAnalysisUsage(AnalysisUsage &AU) const override {
122 AU.addRequired<ProfileSummaryInfoWrapperPass>();
123 AU.addRequired<TargetPassConfig>();
124 AU.addRequired<TargetTransformInfoWrapperPass>();
125 AU.addRequired<DominatorTreeWrapperPass>();
126 AU.addRequired<LoopInfoWrapperPass>();
127 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
128 }
129
130 private:
131 // Select groups consist of consecutive select instructions with the same
132 // condition.
133 using SelectGroup = SmallVector<SelectInst *, 2>;
134 using SelectGroups = SmallVector<SelectGroup, 2>;
135
136 using Scaled64 = ScaledNumber<uint64_t>;
137
138 struct CostInfo {
139 /// Predicated cost (with selects as conditional moves).
140 Scaled64 PredCost;
141 /// Non-predicated cost (with selects converted to branches).
142 Scaled64 NonPredCost;
143 };
144
145 // Converts select instructions of a function to conditional jumps when deemed
146 // profitable. Returns true if at least one select was converted.
147 bool optimizeSelects(Function &F);
148
149 // Heuristics for determining which select instructions can be profitably
150 // conveted to branches. Separate heuristics for selects in inner-most loops
151 // and the rest of code regions (base heuristics for non-inner-most loop
152 // regions).
153 void optimizeSelectsBase(Function &F, SelectGroups &ProfSIGroups);
154 void optimizeSelectsInnerLoops(Function &F, SelectGroups &ProfSIGroups);
155
156 // Converts to branches the select groups that were deemed
157 // profitable-to-convert.
158 void convertProfitableSIGroups(SelectGroups &ProfSIGroups);
159
160 // Splits selects of a given basic block into select groups.
161 void collectSelectGroups(BasicBlock &BB, SelectGroups &SIGroups);
162
163 // Determines for which select groups it is profitable converting to branches
164 // (base and inner-most-loop heuristics).
165 void findProfitableSIGroupsBase(SelectGroups &SIGroups,
166 SelectGroups &ProfSIGroups);
167 void findProfitableSIGroupsInnerLoops(const Loop *L, SelectGroups &SIGroups,
168 SelectGroups &ProfSIGroups);
169
170 // Determines if a select group should be converted to a branch (base
171 // heuristics).
172 bool isConvertToBranchProfitableBase(const SmallVector<SelectInst *, 2> &ASI);
173
174 // Returns true if there are expensive instructions in the cold value
175 // operand's (if any) dependence slice of any of the selects of the given
176 // group.
177 bool hasExpensiveColdOperand(const SmallVector<SelectInst *, 2> &ASI);
178
179 // For a given source instruction, collect its backwards dependence slice
180 // consisting of instructions exclusively computed for producing the operands
181 // of the source instruction.
182 void getExclBackwardsSlice(Instruction *I, std::stack<Instruction *> &Slice,
183 bool ForSinking = false);
184
185 // Returns true if the condition of the select is highly predictable.
186 bool isSelectHighlyPredictable(const SelectInst *SI);
187
188 // Loop-level checks to determine if a non-predicated version (with branches)
189 // of the given loop is more profitable than its predicated version.
190 bool checkLoopHeuristics(const Loop *L, const CostInfo LoopDepth[2]);
191
192 // Computes instruction and loop-critical-path costs for both the predicated
193 // and non-predicated version of the given loop.
194 bool computeLoopCosts(const Loop *L, const SelectGroups &SIGroups,
195 DenseMap<const Instruction *, CostInfo> &InstCostMap,
196 CostInfo *LoopCost);
197
198 // Returns a set of all the select instructions in the given select groups.
199 SmallPtrSet<const Instruction *, 2> getSIset(const SelectGroups &SIGroups);
200
201 // Returns the latency cost of a given instruction.
202 Optional<uint64_t> computeInstCost(const Instruction *I);
203
204 // Returns the misprediction cost of a given select when converted to branch.
205 Scaled64 getMispredictionCost(const SelectInst *SI, const Scaled64 CondCost);
206
207 // Returns the cost of a branch when the prediction is correct.
208 Scaled64 getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
209 const SelectInst *SI);
210
211 // Returns true if the target architecture supports lowering a given select.
212 bool isSelectKindSupported(SelectInst *SI);
213 };
214 } // namespace
215
216 char SelectOptimize::ID = 0;
217
218 INITIALIZE_PASS_BEGIN(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
219 false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)220 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
221 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
222 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
223 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
224 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
225 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
226 INITIALIZE_PASS_END(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
227 false)
228
229 FunctionPass *llvm::createSelectOptimizePass() { return new SelectOptimize(); }
230
runOnFunction(Function & F)231 bool SelectOptimize::runOnFunction(Function &F) {
232 TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
233 TSI = TM->getSubtargetImpl(F);
234 TLI = TSI->getTargetLowering();
235
236 // If none of the select types is supported then skip this pass.
237 // This is an optimization pass. Legality issues will be handled by
238 // instruction selection.
239 if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) &&
240 !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) &&
241 !TLI->isSelectSupported(TargetLowering::VectorMaskSelect))
242 return false;
243
244 TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
245 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
246 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
247 BPI.reset(new BranchProbabilityInfo(F, *LI));
248 BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI));
249 PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
250 ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
251 TSchedModel.init(TSI);
252
253 // When optimizing for size, selects are preferable over branches.
254 if (F.hasOptSize() || llvm::shouldOptimizeForSize(&F, PSI, BFI.get()))
255 return false;
256
257 return optimizeSelects(F);
258 }
259
optimizeSelects(Function & F)260 bool SelectOptimize::optimizeSelects(Function &F) {
261 // Determine for which select groups it is profitable converting to branches.
262 SelectGroups ProfSIGroups;
263 // Base heuristics apply only to non-loops and outer loops.
264 optimizeSelectsBase(F, ProfSIGroups);
265 // Separate heuristics for inner-most loops.
266 optimizeSelectsInnerLoops(F, ProfSIGroups);
267
268 // Convert to branches the select groups that were deemed
269 // profitable-to-convert.
270 convertProfitableSIGroups(ProfSIGroups);
271
272 // Code modified if at least one select group was converted.
273 return !ProfSIGroups.empty();
274 }
275
optimizeSelectsBase(Function & F,SelectGroups & ProfSIGroups)276 void SelectOptimize::optimizeSelectsBase(Function &F,
277 SelectGroups &ProfSIGroups) {
278 // Collect all the select groups.
279 SelectGroups SIGroups;
280 for (BasicBlock &BB : F) {
281 // Base heuristics apply only to non-loops and outer loops.
282 Loop *L = LI->getLoopFor(&BB);
283 if (L && L->isInnermost())
284 continue;
285 collectSelectGroups(BB, SIGroups);
286 }
287
288 // Determine for which select groups it is profitable converting to branches.
289 findProfitableSIGroupsBase(SIGroups, ProfSIGroups);
290 }
291
optimizeSelectsInnerLoops(Function & F,SelectGroups & ProfSIGroups)292 void SelectOptimize::optimizeSelectsInnerLoops(Function &F,
293 SelectGroups &ProfSIGroups) {
294 SmallVector<Loop *, 4> Loops(LI->begin(), LI->end());
295 // Need to check size on each iteration as we accumulate child loops.
296 for (unsigned long i = 0; i < Loops.size(); ++i)
297 for (Loop *ChildL : Loops[i]->getSubLoops())
298 Loops.push_back(ChildL);
299
300 for (Loop *L : Loops) {
301 if (!L->isInnermost())
302 continue;
303
304 SelectGroups SIGroups;
305 for (BasicBlock *BB : L->getBlocks())
306 collectSelectGroups(*BB, SIGroups);
307
308 findProfitableSIGroupsInnerLoops(L, SIGroups, ProfSIGroups);
309 }
310 }
311
312 /// If \p isTrue is true, return the true value of \p SI, otherwise return
313 /// false value of \p SI. If the true/false value of \p SI is defined by any
314 /// select instructions in \p Selects, look through the defining select
315 /// instruction until the true/false value is not defined in \p Selects.
316 static Value *
getTrueOrFalseValue(SelectInst * SI,bool isTrue,const SmallPtrSet<const Instruction *,2> & Selects)317 getTrueOrFalseValue(SelectInst *SI, bool isTrue,
318 const SmallPtrSet<const Instruction *, 2> &Selects) {
319 Value *V = nullptr;
320 for (SelectInst *DefSI = SI; DefSI != nullptr && Selects.count(DefSI);
321 DefSI = dyn_cast<SelectInst>(V)) {
322 assert(DefSI->getCondition() == SI->getCondition() &&
323 "The condition of DefSI does not match with SI");
324 V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
325 }
326 assert(V && "Failed to get select true/false value");
327 return V;
328 }
329
convertProfitableSIGroups(SelectGroups & ProfSIGroups)330 void SelectOptimize::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
331 for (SelectGroup &ASI : ProfSIGroups) {
332 // The code transformation here is a modified version of the sinking
333 // transformation in CodeGenPrepare::optimizeSelectInst with a more
334 // aggressive strategy of which instructions to sink.
335 //
336 // TODO: eliminate the redundancy of logic transforming selects to branches
337 // by removing CodeGenPrepare::optimizeSelectInst and optimizing here
338 // selects for all cases (with and without profile information).
339
340 // Transform a sequence like this:
341 // start:
342 // %cmp = cmp uge i32 %a, %b
343 // %sel = select i1 %cmp, i32 %c, i32 %d
344 //
345 // Into:
346 // start:
347 // %cmp = cmp uge i32 %a, %b
348 // %cmp.frozen = freeze %cmp
349 // br i1 %cmp.frozen, label %select.true, label %select.false
350 // select.true:
351 // br label %select.end
352 // select.false:
353 // br label %select.end
354 // select.end:
355 // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
356 //
357 // %cmp should be frozen, otherwise it may introduce undefined behavior.
358 // In addition, we may sink instructions that produce %c or %d into the
359 // destination(s) of the new branch.
360 // If the true or false blocks do not contain a sunken instruction, that
361 // block and its branch may be optimized away. In that case, one side of the
362 // first branch will point directly to select.end, and the corresponding PHI
363 // predecessor block will be the start block.
364
365 // Find all the instructions that can be soundly sunk to the true/false
366 // blocks. These are instructions that are computed solely for producing the
367 // operands of the select instructions in the group and can be sunk without
368 // breaking the semantics of the LLVM IR (e.g., cannot sink instructions
369 // with side effects).
370 SmallVector<std::stack<Instruction *>, 2> TrueSlices, FalseSlices;
371 typedef std::stack<Instruction *>::size_type StackSizeType;
372 StackSizeType maxTrueSliceLen = 0, maxFalseSliceLen = 0;
373 for (SelectInst *SI : ASI) {
374 // For each select, compute the sinkable dependence chains of the true and
375 // false operands.
376 if (auto *TI = dyn_cast<Instruction>(SI->getTrueValue())) {
377 std::stack<Instruction *> TrueSlice;
378 getExclBackwardsSlice(TI, TrueSlice, true);
379 maxTrueSliceLen = std::max(maxTrueSliceLen, TrueSlice.size());
380 TrueSlices.push_back(TrueSlice);
381 }
382 if (auto *FI = dyn_cast<Instruction>(SI->getFalseValue())) {
383 std::stack<Instruction *> FalseSlice;
384 getExclBackwardsSlice(FI, FalseSlice, true);
385 maxFalseSliceLen = std::max(maxFalseSliceLen, FalseSlice.size());
386 FalseSlices.push_back(FalseSlice);
387 }
388 }
389 // In the case of multiple select instructions in the same group, the order
390 // of non-dependent instructions (instructions of different dependence
391 // slices) in the true/false blocks appears to affect performance.
392 // Interleaving the slices seems to experimentally be the optimal approach.
393 // This interleaving scheduling allows for more ILP (with a natural downside
394 // of increasing a bit register pressure) compared to a simple ordering of
395 // one whole chain after another. One would expect that this ordering would
396 // not matter since the scheduling in the backend of the compiler would
397 // take care of it, but apparently the scheduler fails to deliver optimal
398 // ILP with a naive ordering here.
399 SmallVector<Instruction *, 2> TrueSlicesInterleaved, FalseSlicesInterleaved;
400 for (StackSizeType IS = 0; IS < maxTrueSliceLen; ++IS) {
401 for (auto &S : TrueSlices) {
402 if (!S.empty()) {
403 TrueSlicesInterleaved.push_back(S.top());
404 S.pop();
405 }
406 }
407 }
408 for (StackSizeType IS = 0; IS < maxFalseSliceLen; ++IS) {
409 for (auto &S : FalseSlices) {
410 if (!S.empty()) {
411 FalseSlicesInterleaved.push_back(S.top());
412 S.pop();
413 }
414 }
415 }
416
417 // We split the block containing the select(s) into two blocks.
418 SelectInst *SI = ASI.front();
419 SelectInst *LastSI = ASI.back();
420 BasicBlock *StartBlock = SI->getParent();
421 BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI));
422 BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
423 BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock).getFrequency());
424 // Delete the unconditional branch that was just created by the split.
425 StartBlock->getTerminator()->eraseFromParent();
426
427 // Move any debug/pseudo instructions that were in-between the select
428 // group to the newly-created end block.
429 SmallVector<Instruction *, 2> DebugPseudoINS;
430 auto DIt = SI->getIterator();
431 while (&*DIt != LastSI) {
432 if (DIt->isDebugOrPseudoInst())
433 DebugPseudoINS.push_back(&*DIt);
434 DIt++;
435 }
436 for (auto *DI : DebugPseudoINS) {
437 DI->moveBefore(&*EndBlock->getFirstInsertionPt());
438 }
439
440 // These are the new basic blocks for the conditional branch.
441 // At least one will become an actual new basic block.
442 BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr;
443 BranchInst *TrueBranch = nullptr, *FalseBranch = nullptr;
444 if (!TrueSlicesInterleaved.empty()) {
445 TrueBlock = BasicBlock::Create(LastSI->getContext(), "select.true.sink",
446 EndBlock->getParent(), EndBlock);
447 TrueBranch = BranchInst::Create(EndBlock, TrueBlock);
448 TrueBranch->setDebugLoc(LastSI->getDebugLoc());
449 for (Instruction *TrueInst : TrueSlicesInterleaved)
450 TrueInst->moveBefore(TrueBranch);
451 }
452 if (!FalseSlicesInterleaved.empty()) {
453 FalseBlock = BasicBlock::Create(LastSI->getContext(), "select.false.sink",
454 EndBlock->getParent(), EndBlock);
455 FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
456 FalseBranch->setDebugLoc(LastSI->getDebugLoc());
457 for (Instruction *FalseInst : FalseSlicesInterleaved)
458 FalseInst->moveBefore(FalseBranch);
459 }
460 // If there was nothing to sink, then arbitrarily choose the 'false' side
461 // for a new input value to the PHI.
462 if (TrueBlock == FalseBlock) {
463 assert(TrueBlock == nullptr &&
464 "Unexpected basic block transform while optimizing select");
465
466 FalseBlock = BasicBlock::Create(SI->getContext(), "select.false",
467 EndBlock->getParent(), EndBlock);
468 auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
469 FalseBranch->setDebugLoc(SI->getDebugLoc());
470 }
471
472 // Insert the real conditional branch based on the original condition.
473 // If we did not create a new block for one of the 'true' or 'false' paths
474 // of the condition, it means that side of the branch goes to the end block
475 // directly and the path originates from the start block from the point of
476 // view of the new PHI.
477 BasicBlock *TT, *FT;
478 if (TrueBlock == nullptr) {
479 TT = EndBlock;
480 FT = FalseBlock;
481 TrueBlock = StartBlock;
482 } else if (FalseBlock == nullptr) {
483 TT = TrueBlock;
484 FT = EndBlock;
485 FalseBlock = StartBlock;
486 } else {
487 TT = TrueBlock;
488 FT = FalseBlock;
489 }
490 IRBuilder<> IB(SI);
491 auto *CondFr =
492 IB.CreateFreeze(SI->getCondition(), SI->getName() + ".frozen");
493 IB.CreateCondBr(CondFr, TT, FT, SI);
494
495 SmallPtrSet<const Instruction *, 2> INS;
496 INS.insert(ASI.begin(), ASI.end());
497 // Use reverse iterator because later select may use the value of the
498 // earlier select, and we need to propagate value through earlier select
499 // to get the PHI operand.
500 for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) {
501 SelectInst *SI = *It;
502 // The select itself is replaced with a PHI Node.
503 PHINode *PN = PHINode::Create(SI->getType(), 2, "", &EndBlock->front());
504 PN->takeName(SI);
505 PN->addIncoming(getTrueOrFalseValue(SI, true, INS), TrueBlock);
506 PN->addIncoming(getTrueOrFalseValue(SI, false, INS), FalseBlock);
507 PN->setDebugLoc(SI->getDebugLoc());
508
509 SI->replaceAllUsesWith(PN);
510 SI->eraseFromParent();
511 INS.erase(SI);
512 ++NumSelectsConverted;
513 }
514 }
515 }
516
collectSelectGroups(BasicBlock & BB,SelectGroups & SIGroups)517 void SelectOptimize::collectSelectGroups(BasicBlock &BB,
518 SelectGroups &SIGroups) {
519 BasicBlock::iterator BBIt = BB.begin();
520 while (BBIt != BB.end()) {
521 Instruction *I = &*BBIt++;
522 if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
523 SelectGroup SIGroup;
524 SIGroup.push_back(SI);
525 while (BBIt != BB.end()) {
526 Instruction *NI = &*BBIt;
527 SelectInst *NSI = dyn_cast<SelectInst>(NI);
528 if (NSI && SI->getCondition() == NSI->getCondition()) {
529 SIGroup.push_back(NSI);
530 } else if (!NI->isDebugOrPseudoInst()) {
531 // Debug/pseudo instructions should be skipped and not prevent the
532 // formation of a select group.
533 break;
534 }
535 ++BBIt;
536 }
537
538 // If the select type is not supported, no point optimizing it.
539 // Instruction selection will take care of it.
540 if (!isSelectKindSupported(SI))
541 continue;
542
543 SIGroups.push_back(SIGroup);
544 }
545 }
546 }
547
findProfitableSIGroupsBase(SelectGroups & SIGroups,SelectGroups & ProfSIGroups)548 void SelectOptimize::findProfitableSIGroupsBase(SelectGroups &SIGroups,
549 SelectGroups &ProfSIGroups) {
550 for (SelectGroup &ASI : SIGroups) {
551 ++NumSelectOptAnalyzed;
552 if (isConvertToBranchProfitableBase(ASI))
553 ProfSIGroups.push_back(ASI);
554 }
555 }
556
findProfitableSIGroupsInnerLoops(const Loop * L,SelectGroups & SIGroups,SelectGroups & ProfSIGroups)557 void SelectOptimize::findProfitableSIGroupsInnerLoops(
558 const Loop *L, SelectGroups &SIGroups, SelectGroups &ProfSIGroups) {
559 NumSelectOptAnalyzed += SIGroups.size();
560 // For each select group in an inner-most loop,
561 // a branch is more preferable than a select/conditional-move if:
562 // i) conversion to branches for all the select groups of the loop satisfies
563 // loop-level heuristics including reducing the loop's critical path by
564 // some threshold (see SelectOptimize::checkLoopHeuristics); and
565 // ii) the total cost of the select group is cheaper with a branch compared
566 // to its predicated version. The cost is in terms of latency and the cost
567 // of a select group is the cost of its most expensive select instruction
568 // (assuming infinite resources and thus fully leveraging available ILP).
569
570 DenseMap<const Instruction *, CostInfo> InstCostMap;
571 CostInfo LoopCost[2] = {{Scaled64::getZero(), Scaled64::getZero()},
572 {Scaled64::getZero(), Scaled64::getZero()}};
573 if (!computeLoopCosts(L, SIGroups, InstCostMap, LoopCost) ||
574 !checkLoopHeuristics(L, LoopCost)) {
575 return;
576 }
577
578 for (SelectGroup &ASI : SIGroups) {
579 // Assuming infinite resources, the cost of a group of instructions is the
580 // cost of the most expensive instruction of the group.
581 Scaled64 SelectCost = Scaled64::getZero(), BranchCost = Scaled64::getZero();
582 for (SelectInst *SI : ASI) {
583 SelectCost = std::max(SelectCost, InstCostMap[SI].PredCost);
584 BranchCost = std::max(BranchCost, InstCostMap[SI].NonPredCost);
585 }
586 if (BranchCost < SelectCost) {
587 OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", ASI.front());
588 OR << "Profitable to convert to branch (loop analysis). BranchCost="
589 << BranchCost.toString() << ", SelectCost=" << SelectCost.toString()
590 << ". ";
591 ORE->emit(OR);
592 ++NumSelectConvertedLoop;
593 ProfSIGroups.push_back(ASI);
594 } else {
595 OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", ASI.front());
596 ORmiss << "Select is more profitable (loop analysis). BranchCost="
597 << BranchCost.toString()
598 << ", SelectCost=" << SelectCost.toString() << ". ";
599 ORE->emit(ORmiss);
600 }
601 }
602 }
603
isConvertToBranchProfitableBase(const SmallVector<SelectInst *,2> & ASI)604 bool SelectOptimize::isConvertToBranchProfitableBase(
605 const SmallVector<SelectInst *, 2> &ASI) {
606 SelectInst *SI = ASI.front();
607 OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", SI);
608 OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", SI);
609
610 // Skip cold basic blocks. Better to optimize for size for cold blocks.
611 if (PSI->isColdBlock(SI->getParent(), BFI.get())) {
612 ++NumSelectColdBB;
613 ORmiss << "Not converted to branch because of cold basic block. ";
614 ORE->emit(ORmiss);
615 return false;
616 }
617
618 // If unpredictable, branch form is less profitable.
619 if (SI->getMetadata(LLVMContext::MD_unpredictable)) {
620 ++NumSelectUnPred;
621 ORmiss << "Not converted to branch because of unpredictable branch. ";
622 ORE->emit(ORmiss);
623 return false;
624 }
625
626 // If highly predictable, branch form is more profitable, unless a
627 // predictable select is inexpensive in the target architecture.
628 if (isSelectHighlyPredictable(SI) && TLI->isPredictableSelectExpensive()) {
629 ++NumSelectConvertedHighPred;
630 OR << "Converted to branch because of highly predictable branch. ";
631 ORE->emit(OR);
632 return true;
633 }
634
635 // Look for expensive instructions in the cold operand's (if any) dependence
636 // slice of any of the selects in the group.
637 if (hasExpensiveColdOperand(ASI)) {
638 ++NumSelectConvertedExpColdOperand;
639 OR << "Converted to branch because of expensive cold operand.";
640 ORE->emit(OR);
641 return true;
642 }
643
644 ORmiss << "Not profitable to convert to branch (base heuristic).";
645 ORE->emit(ORmiss);
646 return false;
647 }
648
divideNearest(InstructionCost Numerator,uint64_t Denominator)649 static InstructionCost divideNearest(InstructionCost Numerator,
650 uint64_t Denominator) {
651 return (Numerator + (Denominator / 2)) / Denominator;
652 }
653
hasExpensiveColdOperand(const SmallVector<SelectInst *,2> & ASI)654 bool SelectOptimize::hasExpensiveColdOperand(
655 const SmallVector<SelectInst *, 2> &ASI) {
656 bool ColdOperand = false;
657 uint64_t TrueWeight, FalseWeight, TotalWeight;
658 if (ASI.front()->extractProfMetadata(TrueWeight, FalseWeight)) {
659 uint64_t MinWeight = std::min(TrueWeight, FalseWeight);
660 TotalWeight = TrueWeight + FalseWeight;
661 // Is there a path with frequency <ColdOperandThreshold% (default:20%) ?
662 ColdOperand = TotalWeight * ColdOperandThreshold > 100 * MinWeight;
663 } else if (PSI->hasProfileSummary()) {
664 OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", ASI.front());
665 ORmiss << "Profile data available but missing branch-weights metadata for "
666 "select instruction. ";
667 ORE->emit(ORmiss);
668 }
669 if (!ColdOperand)
670 return false;
671 // Check if the cold path's dependence slice is expensive for any of the
672 // selects of the group.
673 for (SelectInst *SI : ASI) {
674 Instruction *ColdI = nullptr;
675 uint64_t HotWeight;
676 if (TrueWeight < FalseWeight) {
677 ColdI = dyn_cast<Instruction>(SI->getTrueValue());
678 HotWeight = FalseWeight;
679 } else {
680 ColdI = dyn_cast<Instruction>(SI->getFalseValue());
681 HotWeight = TrueWeight;
682 }
683 if (ColdI) {
684 std::stack<Instruction *> ColdSlice;
685 getExclBackwardsSlice(ColdI, ColdSlice);
686 InstructionCost SliceCost = 0;
687 while (!ColdSlice.empty()) {
688 SliceCost += TTI->getInstructionCost(ColdSlice.top(),
689 TargetTransformInfo::TCK_Latency);
690 ColdSlice.pop();
691 }
692 // The colder the cold value operand of the select is the more expensive
693 // the cmov becomes for computing the cold value operand every time. Thus,
694 // the colder the cold operand is the more its cost counts.
695 // Get nearest integer cost adjusted for coldness.
696 InstructionCost AdjSliceCost =
697 divideNearest(SliceCost * HotWeight, TotalWeight);
698 if (AdjSliceCost >=
699 ColdOperandMaxCostMultiplier * TargetTransformInfo::TCC_Expensive)
700 return true;
701 }
702 }
703 return false;
704 }
705
706 // For a given source instruction, collect its backwards dependence slice
707 // consisting of instructions exclusively computed for the purpose of producing
708 // the operands of the source instruction. As an approximation
709 // (sufficiently-accurate in practice), we populate this set with the
710 // instructions of the backwards dependence slice that only have one-use and
711 // form an one-use chain that leads to the source instruction.
getExclBackwardsSlice(Instruction * I,std::stack<Instruction * > & Slice,bool ForSinking)712 void SelectOptimize::getExclBackwardsSlice(Instruction *I,
713 std::stack<Instruction *> &Slice,
714 bool ForSinking) {
715 SmallPtrSet<Instruction *, 2> Visited;
716 std::queue<Instruction *> Worklist;
717 Worklist.push(I);
718 while (!Worklist.empty()) {
719 Instruction *II = Worklist.front();
720 Worklist.pop();
721
722 // Avoid cycles.
723 if (!Visited.insert(II).second)
724 continue;
725
726 if (!II->hasOneUse())
727 continue;
728
729 // Cannot soundly sink instructions with side-effects.
730 // Terminator or phi instructions cannot be sunk.
731 // Avoid sinking other select instructions (should be handled separetely).
732 if (ForSinking && (II->isTerminator() || II->mayHaveSideEffects() ||
733 isa<SelectInst>(II) || isa<PHINode>(II)))
734 continue;
735
736 // Avoid considering instructions with less frequency than the source
737 // instruction (i.e., avoid colder code regions of the dependence slice).
738 if (BFI->getBlockFreq(II->getParent()) < BFI->getBlockFreq(I->getParent()))
739 continue;
740
741 // Eligible one-use instruction added to the dependence slice.
742 Slice.push(II);
743
744 // Explore all the operands of the current instruction to expand the slice.
745 for (unsigned k = 0; k < II->getNumOperands(); ++k)
746 if (auto *OpI = dyn_cast<Instruction>(II->getOperand(k)))
747 Worklist.push(OpI);
748 }
749 }
750
isSelectHighlyPredictable(const SelectInst * SI)751 bool SelectOptimize::isSelectHighlyPredictable(const SelectInst *SI) {
752 uint64_t TrueWeight, FalseWeight;
753 if (SI->extractProfMetadata(TrueWeight, FalseWeight)) {
754 uint64_t Max = std::max(TrueWeight, FalseWeight);
755 uint64_t Sum = TrueWeight + FalseWeight;
756 if (Sum != 0) {
757 auto Probability = BranchProbability::getBranchProbability(Max, Sum);
758 if (Probability > TTI->getPredictableBranchThreshold())
759 return true;
760 }
761 }
762 return false;
763 }
764
checkLoopHeuristics(const Loop * L,const CostInfo LoopCost[2])765 bool SelectOptimize::checkLoopHeuristics(const Loop *L,
766 const CostInfo LoopCost[2]) {
767 // Loop-level checks to determine if a non-predicated version (with branches)
768 // of the loop is more profitable than its predicated version.
769
770 if (DisableLoopLevelHeuristics)
771 return true;
772
773 OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti",
774 L->getHeader()->getFirstNonPHI());
775
776 if (LoopCost[0].NonPredCost > LoopCost[0].PredCost ||
777 LoopCost[1].NonPredCost >= LoopCost[1].PredCost) {
778 ORmissL << "No select conversion in the loop due to no reduction of loop's "
779 "critical path. ";
780 ORE->emit(ORmissL);
781 return false;
782 }
783
784 Scaled64 Gain[2] = {LoopCost[0].PredCost - LoopCost[0].NonPredCost,
785 LoopCost[1].PredCost - LoopCost[1].NonPredCost};
786
787 // Profitably converting to branches need to reduce the loop's critical path
788 // by at least some threshold (absolute gain of GainCycleThreshold cycles and
789 // relative gain of 12.5%).
790 if (Gain[1] < Scaled64::get(GainCycleThreshold) ||
791 Gain[1] * Scaled64::get(GainRelativeThreshold) < LoopCost[1].PredCost) {
792 Scaled64 RelativeGain = Scaled64::get(100) * Gain[1] / LoopCost[1].PredCost;
793 ORmissL << "No select conversion in the loop due to small reduction of "
794 "loop's critical path. Gain="
795 << Gain[1].toString()
796 << ", RelativeGain=" << RelativeGain.toString() << "%. ";
797 ORE->emit(ORmissL);
798 return false;
799 }
800
801 // If the loop's critical path involves loop-carried dependences, the gradient
802 // of the gain needs to be at least GainGradientThreshold% (defaults to 25%).
803 // This check ensures that the latency reduction for the loop's critical path
804 // keeps decreasing with sufficient rate beyond the two analyzed loop
805 // iterations.
806 if (Gain[1] > Gain[0]) {
807 Scaled64 GradientGain = Scaled64::get(100) * (Gain[1] - Gain[0]) /
808 (LoopCost[1].PredCost - LoopCost[0].PredCost);
809 if (GradientGain < Scaled64::get(GainGradientThreshold)) {
810 ORmissL << "No select conversion in the loop due to small gradient gain. "
811 "GradientGain="
812 << GradientGain.toString() << "%. ";
813 ORE->emit(ORmissL);
814 return false;
815 }
816 }
817 // If the gain decreases it is not profitable to convert.
818 else if (Gain[1] < Gain[0]) {
819 ORmissL
820 << "No select conversion in the loop due to negative gradient gain. ";
821 ORE->emit(ORmissL);
822 return false;
823 }
824
825 // Non-predicated version of the loop is more profitable than its
826 // predicated version.
827 return true;
828 }
829
830 // Computes instruction and loop-critical-path costs for both the predicated
831 // and non-predicated version of the given loop.
832 // Returns false if unable to compute these costs due to invalid cost of loop
833 // instruction(s).
computeLoopCosts(const Loop * L,const SelectGroups & SIGroups,DenseMap<const Instruction *,CostInfo> & InstCostMap,CostInfo * LoopCost)834 bool SelectOptimize::computeLoopCosts(
835 const Loop *L, const SelectGroups &SIGroups,
836 DenseMap<const Instruction *, CostInfo> &InstCostMap, CostInfo *LoopCost) {
837 const auto &SIset = getSIset(SIGroups);
838 // Compute instruction and loop-critical-path costs across two iterations for
839 // both predicated and non-predicated version.
840 const unsigned Iterations = 2;
841 for (unsigned Iter = 0; Iter < Iterations; ++Iter) {
842 // Cost of the loop's critical path.
843 CostInfo &MaxCost = LoopCost[Iter];
844 for (BasicBlock *BB : L->getBlocks()) {
845 for (const Instruction &I : *BB) {
846 if (I.isDebugOrPseudoInst())
847 continue;
848 // Compute the predicated and non-predicated cost of the instruction.
849 Scaled64 IPredCost = Scaled64::getZero(),
850 INonPredCost = Scaled64::getZero();
851
852 // Assume infinite resources that allow to fully exploit the available
853 // instruction-level parallelism.
854 // InstCost = InstLatency + max(Op1Cost, Op2Cost, … OpNCost)
855 for (const Use &U : I.operands()) {
856 auto UI = dyn_cast<Instruction>(U.get());
857 if (!UI)
858 continue;
859 if (InstCostMap.count(UI)) {
860 IPredCost = std::max(IPredCost, InstCostMap[UI].PredCost);
861 INonPredCost = std::max(INonPredCost, InstCostMap[UI].NonPredCost);
862 }
863 }
864 auto ILatency = computeInstCost(&I);
865 if (!ILatency) {
866 OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti", &I);
867 ORmissL << "Invalid instruction cost preventing analysis and "
868 "optimization of the inner-most loop containing this "
869 "instruction. ";
870 ORE->emit(ORmissL);
871 return false;
872 }
873 IPredCost += Scaled64::get(ILatency.value());
874 INonPredCost += Scaled64::get(ILatency.value());
875
876 // For a select that can be converted to branch,
877 // compute its cost as a branch (non-predicated cost).
878 //
879 // BranchCost = PredictedPathCost + MispredictCost
880 // PredictedPathCost = TrueOpCost * TrueProb + FalseOpCost * FalseProb
881 // MispredictCost = max(MispredictPenalty, CondCost) * MispredictRate
882 if (SIset.contains(&I)) {
883 auto SI = dyn_cast<SelectInst>(&I);
884
885 Scaled64 TrueOpCost = Scaled64::getZero(),
886 FalseOpCost = Scaled64::getZero();
887 if (auto *TI = dyn_cast<Instruction>(SI->getTrueValue()))
888 if (InstCostMap.count(TI))
889 TrueOpCost = InstCostMap[TI].NonPredCost;
890 if (auto *FI = dyn_cast<Instruction>(SI->getFalseValue()))
891 if (InstCostMap.count(FI))
892 FalseOpCost = InstCostMap[FI].NonPredCost;
893 Scaled64 PredictedPathCost =
894 getPredictedPathCost(TrueOpCost, FalseOpCost, SI);
895
896 Scaled64 CondCost = Scaled64::getZero();
897 if (auto *CI = dyn_cast<Instruction>(SI->getCondition()))
898 if (InstCostMap.count(CI))
899 CondCost = InstCostMap[CI].NonPredCost;
900 Scaled64 MispredictCost = getMispredictionCost(SI, CondCost);
901
902 INonPredCost = PredictedPathCost + MispredictCost;
903 }
904
905 InstCostMap[&I] = {IPredCost, INonPredCost};
906 MaxCost.PredCost = std::max(MaxCost.PredCost, IPredCost);
907 MaxCost.NonPredCost = std::max(MaxCost.NonPredCost, INonPredCost);
908 }
909 }
910 }
911 return true;
912 }
913
914 SmallPtrSet<const Instruction *, 2>
getSIset(const SelectGroups & SIGroups)915 SelectOptimize::getSIset(const SelectGroups &SIGroups) {
916 SmallPtrSet<const Instruction *, 2> SIset;
917 for (const SelectGroup &ASI : SIGroups)
918 for (const SelectInst *SI : ASI)
919 SIset.insert(SI);
920 return SIset;
921 }
922
computeInstCost(const Instruction * I)923 Optional<uint64_t> SelectOptimize::computeInstCost(const Instruction *I) {
924 InstructionCost ICost =
925 TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency);
926 if (auto OC = ICost.getValue())
927 return Optional<uint64_t>(*OC);
928 return Optional<uint64_t>(None);
929 }
930
931 ScaledNumber<uint64_t>
getMispredictionCost(const SelectInst * SI,const Scaled64 CondCost)932 SelectOptimize::getMispredictionCost(const SelectInst *SI,
933 const Scaled64 CondCost) {
934 uint64_t MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
935
936 // Account for the default misprediction rate when using a branch
937 // (conservatively set to 25% by default).
938 uint64_t MispredictRate = MispredictDefaultRate;
939 // If the select condition is obviously predictable, then the misprediction
940 // rate is zero.
941 if (isSelectHighlyPredictable(SI))
942 MispredictRate = 0;
943
944 // CondCost is included to account for cases where the computation of the
945 // condition is part of a long dependence chain (potentially loop-carried)
946 // that would delay detection of a misprediction and increase its cost.
947 Scaled64 MispredictCost =
948 std::max(Scaled64::get(MispredictPenalty), CondCost) *
949 Scaled64::get(MispredictRate);
950 MispredictCost /= Scaled64::get(100);
951
952 return MispredictCost;
953 }
954
955 // Returns the cost of a branch when the prediction is correct.
956 // TrueCost * TrueProbability + FalseCost * FalseProbability.
957 ScaledNumber<uint64_t>
getPredictedPathCost(Scaled64 TrueCost,Scaled64 FalseCost,const SelectInst * SI)958 SelectOptimize::getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
959 const SelectInst *SI) {
960 Scaled64 PredPathCost;
961 uint64_t TrueWeight, FalseWeight;
962 if (SI->extractProfMetadata(TrueWeight, FalseWeight)) {
963 uint64_t SumWeight = TrueWeight + FalseWeight;
964 if (SumWeight != 0) {
965 PredPathCost = TrueCost * Scaled64::get(TrueWeight) +
966 FalseCost * Scaled64::get(FalseWeight);
967 PredPathCost /= Scaled64::get(SumWeight);
968 return PredPathCost;
969 }
970 }
971 // Without branch weight metadata, we assume 75% for the one path and 25% for
972 // the other, and pick the result with the biggest cost.
973 PredPathCost = std::max(TrueCost * Scaled64::get(3) + FalseCost,
974 FalseCost * Scaled64::get(3) + TrueCost);
975 PredPathCost /= Scaled64::get(4);
976 return PredPathCost;
977 }
978
isSelectKindSupported(SelectInst * SI)979 bool SelectOptimize::isSelectKindSupported(SelectInst *SI) {
980 bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);
981 if (VectorCond)
982 return false;
983 TargetLowering::SelectSupportKind SelectKind;
984 if (SI->getType()->isVectorTy())
985 SelectKind = TargetLowering::ScalarCondVectorVal;
986 else
987 SelectKind = TargetLowering::ScalarValSelect;
988 return TLI->isSelectSupported(SelectKind);
989 }
990