//===-- AtomicExpandPass.cpp - Expand atomic instructions -------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass (at IR level) to replace atomic instructions with
// target-specific instructions which implement the same semantics in a way
// which better fits the target backend.  This can include the use of either
// (intrinsic-based) load-linked/store-conditional loops, AtomicCmpXchg, or
// type coercions.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/AtomicExpandUtils.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSubtargetInfo.h"

using namespace llvm;

#define DEBUG_TYPE "atomic-expand"

namespace {
  class AtomicExpand : public FunctionPass {
    const TargetMachine *TM;
    const TargetLowering *TLI;
  public:
    static char ID; // Pass identification, replacement for typeid
    explicit AtomicExpand(const TargetMachine *TM = nullptr)
      : FunctionPass(ID), TM(TM), TLI(nullptr) {
      initializeAtomicExpandPass(*PassRegistry::getPassRegistry());
    }

    bool runOnFunction(Function &F) override;

  private:
    bool bracketInstWithFences(Instruction *I, AtomicOrdering Order,
                               bool IsStore, bool IsLoad);
    IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
    LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
    bool tryExpandAtomicLoad(LoadInst *LI);
    bool expandAtomicLoadToLL(LoadInst *LI);
    bool expandAtomicLoadToCmpXchg(LoadInst *LI);
    StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI);
    bool expandAtomicStore(StoreInst *SI);
    bool tryExpandAtomicRMW(AtomicRMWInst *AI);
    bool expandAtomicOpToLLSC(
        Instruction *I, Value *Addr, AtomicOrdering MemOpOrder,
        std::function<Value *(IRBuilder<> &, Value *)> PerformOp);
    AtomicCmpXchgInst *convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI);
    bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
    bool isIdempotentRMW(AtomicRMWInst *AI);
    bool simplifyIdempotentRMW(AtomicRMWInst *AI);
  };
}

char AtomicExpand::ID = 0;
char &llvm::AtomicExpandID = AtomicExpand::ID;
INITIALIZE_TM_PASS(AtomicExpand, "atomic-expand",
    "Expand Atomic calls in terms of either load-linked & store-conditional or cmpxchg",
    false, false)

FunctionPass *llvm::createAtomicExpandPass(const TargetMachine *TM) {
  return new AtomicExpand(TM);
}

bool AtomicExpand::runOnFunction(Function &F) {
  if (!TM || !TM->getSubtargetImpl(F)->enableAtomicExpand())
    return false;
  TLI = TM->getSubtargetImpl(F)->getTargetLowering();

  SmallVector<Instruction *, 1> AtomicInsts;

  // Changing control-flow while iterating through it is a bad idea, so gather a
  // list of all atomic instructions before we start.
  for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) {
    Instruction *I = &*II;
    if (I->isAtomic() && !isa<FenceInst>(I))
      AtomicInsts.push_back(I);
  }

  bool MadeChange = false;
  for (auto I : AtomicInsts) {
    auto LI = dyn_cast<LoadInst>(I);
    auto SI = dyn_cast<StoreInst>(I);
    auto RMWI = dyn_cast<AtomicRMWInst>(I);
    auto CASI = dyn_cast<AtomicCmpXchgInst>(I);
    assert((LI || SI || RMWI || CASI) && "Unknown atomic instruction");

    if (TLI->shouldInsertFencesForAtomic(I)) {
      auto FenceOrdering = AtomicOrdering::Monotonic;
      bool IsStore, IsLoad;
      if (LI && isAcquireOrStronger(LI->getOrdering())) {
        FenceOrdering = LI->getOrdering();
        LI->setOrdering(AtomicOrdering::Monotonic);
        IsStore = false;
        IsLoad = true;
      } else if (SI && isReleaseOrStronger(SI->getOrdering())) {
        FenceOrdering = SI->getOrdering();
        SI->setOrdering(AtomicOrdering::Monotonic);
        IsStore = true;
        IsLoad = false;
      } else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) ||
                          isAcquireOrStronger(RMWI->getOrdering()))) {
        FenceOrdering = RMWI->getOrdering();
        RMWI->setOrdering(AtomicOrdering::Monotonic);
        IsStore = IsLoad = true;
      } else if (CASI && !TLI->shouldExpandAtomicCmpXchgInIR(CASI) &&
                 (isReleaseOrStronger(CASI->getSuccessOrdering()) ||
                  isAcquireOrStronger(CASI->getSuccessOrdering()))) {
        // If a compare and swap is lowered to LL/SC, we can do smarter fence
        // insertion, with a stronger one on the success path than on the
        // failure path. As a result, fence insertion is directly done by
        // expandAtomicCmpXchg in that case.
        FenceOrdering = CASI->getSuccessOrdering();
        CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
        CASI->setFailureOrdering(AtomicOrdering::Monotonic);
        IsStore = IsLoad = true;
      }

      if (FenceOrdering != AtomicOrdering::Monotonic) {
        MadeChange |= bracketInstWithFences(I, FenceOrdering, IsStore, IsLoad);
      }
    }

    if (LI) {
      if (LI->getType()->isFloatingPointTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        LI = convertAtomicLoadToIntegerType(LI);
        assert(LI->getType()->isIntegerTy() && "invariant broken");
        MadeChange = true;
      }

      MadeChange |= tryExpandAtomicLoad(LI);
    } else if (SI) {
      if (SI->getValueOperand()->getType()->isFloatingPointTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        SI = convertAtomicStoreToIntegerType(SI);
        assert(SI->getValueOperand()->getType()->isIntegerTy() &&
               "invariant broken");
        MadeChange = true;
      }

      if (TLI->shouldExpandAtomicStoreInIR(SI))
        MadeChange |= expandAtomicStore(SI);
    } else if (RMWI) {
      // There are two different ways of expanding RMW instructions:
      // - into a load if it is idempotent
      // - into a Cmpxchg/LL-SC loop otherwise
      // We try them in that order.

      if (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) {
        MadeChange = true;
      } else {
        MadeChange |= tryExpandAtomicRMW(RMWI);
      }
    } else if (CASI) {
      // TODO: when we're ready to make the change at the IR level, we can
      // extend convertCmpXchgToInteger for floating point too.
      assert(!CASI->getCompareOperand()->getType()->isFloatingPointTy() &&
             "unimplemented - floating point not legal at IR level");
      if (CASI->getCompareOperand()->getType()->isPointerTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        CASI = convertCmpXchgToIntegerType(CASI);
        assert(CASI->getCompareOperand()->getType()->isIntegerTy() &&
               "invariant broken");
        MadeChange = true;
      }

      if (TLI->shouldExpandAtomicCmpXchgInIR(CASI))
        MadeChange |= expandAtomicCmpXchg(CASI);
    }
  }
  return MadeChange;
}

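/// Bracket \p I with target-chosen leading/trailing fences.  As a rough
/// illustration only (which fences are emitted is entirely up to the target,
/// and either may be omitted), a seq_cst load that runOnFunction has already
/// downgraded to monotonic could end up as:
///     fence seq_cst                                  ; leading fence
///     %v = load atomic i32, i32* %p monotonic, align 4
///     fence seq_cst                                  ; trailing fence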
bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order,
                                         bool IsStore, bool IsLoad) {
  IRBuilder<> Builder(I);

  auto LeadingFence = TLI->emitLeadingFence(Builder, Order, IsStore, IsLoad);

  auto TrailingFence = TLI->emitTrailingFence(Builder, Order, IsStore, IsLoad);
  // The trailing fence is emitted before the instruction instead of after
  // because there is no easy way of setting Builder insertion point after
  // an instruction. So we must erase it from the BB, and insert it back
  // in the right place.
  // We have a guard here because not every atomic operation generates a
  // trailing fence.
  if (TrailingFence) {
    TrailingFence->removeFromParent();
    TrailingFence->insertAfter(I);
  }

  return (LeadingFence || TrailingFence);
}

/// Get the iX type with the same bitwidth as T.
IntegerType *AtomicExpand::getCorrespondingIntegerType(Type *T,
                                                       const DataLayout &DL) {
  EVT VT = TLI->getValueType(DL, T);
  unsigned BitWidth = VT.getStoreSizeInBits();
  assert(BitWidth == VT.getSizeInBits() && "must be a power of two");
  return IntegerType::get(T->getContext(), BitWidth);
}

/// Convert an atomic load of a non-integral type to an integer load of the
/// equivalent bitwidth.  See the function comment on
/// convertAtomicStoreToIntegerType for background.
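/// As an illustrative sketch only (value names are immaterial),
///     %v = load atomic float, float* %fptr seq_cst, align 4
/// becomes roughly:
///     %iptr = bitcast float* %fptr to i32*
///     %iv   = load atomic i32, i32* %iptr seq_cst, align 4
///     %v    = bitcast i32 %iv to float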
LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) {
  auto *M = LI->getModule();
  Type *NewTy = getCorrespondingIntegerType(LI->getType(),
                                            M->getDataLayout());

  IRBuilder<> Builder(LI);

  Value *Addr = LI->getPointerOperand();
  Type *PT = PointerType::get(NewTy,
                              Addr->getType()->getPointerAddressSpace());
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);

  auto *NewLI = Builder.CreateLoad(NewAddr);
  NewLI->setAlignment(LI->getAlignment());
  NewLI->setVolatile(LI->isVolatile());
  NewLI->setAtomic(LI->getOrdering(), LI->getSynchScope());
  DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");

  Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType());
  LI->replaceAllUsesWith(NewVal);
  LI->eraseFromParent();
  return NewLI;
}

bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) {
  switch (TLI->shouldExpandAtomicLoadInIR(LI)) {
  case TargetLoweringBase::AtomicExpansionKind::None:
    return false;
  case TargetLoweringBase::AtomicExpansionKind::LLSC:
    return expandAtomicOpToLLSC(
        LI, LI->getPointerOperand(), LI->getOrdering(),
        [](IRBuilder<> &Builder, Value *Loaded) { return Loaded; });
  case TargetLoweringBase::AtomicExpansionKind::LLOnly:
    return expandAtomicLoadToLL(LI);
  case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
    return expandAtomicLoadToCmpXchg(LI);
  }
  llvm_unreachable("Unhandled case in tryExpandAtomicLoad");
}

bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) {
  IRBuilder<> Builder(LI);

  // On some architectures, load-linked instructions are atomic for larger
  // sizes than normal loads. For example, the only 64-bit load guaranteed
  // to be single-copy atomic by ARM is an ldrexd (A3.5.3).
  Value *Val =
      TLI->emitLoadLinked(Builder, LI->getPointerOperand(), LI->getOrdering());
  TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);

  LI->replaceAllUsesWith(Val);
  LI->eraseFromParent();

  return true;
}

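/// Lower an atomic load by reusing cmpxchg: comparing-and-exchanging a dummy
/// zero value never changes the stored contents (on a miss nothing is
/// written; on a hit the same zero is written back), and the returned old
/// value is exactly what the load needs.  Roughly:
///     %pair = cmpxchg iN* %addr, iN 0, iN 0 <order> <strongest-failure-order>
///     %loaded = extractvalue { iN, i1 } %pair, 0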
bool AtomicExpand::expandAtomicLoadToCmpXchg(LoadInst *LI) {
  IRBuilder<> Builder(LI);
  AtomicOrdering Order = LI->getOrdering();
  Value *Addr = LI->getPointerOperand();
  Type *Ty = cast<PointerType>(Addr->getType())->getElementType();
  Constant *DummyVal = Constant::getNullValue(Ty);

  Value *Pair = Builder.CreateAtomicCmpXchg(
      Addr, DummyVal, DummyVal, Order,
      AtomicCmpXchgInst::getStrongestFailureOrdering(Order));
  Value *Loaded = Builder.CreateExtractValue(Pair, 0, "loaded");

  LI->replaceAllUsesWith(Loaded);
  LI->eraseFromParent();

  return true;
}

/// Convert an atomic store of a non-integral type to an integer store of the
/// equivalent bitwidth.  We used to not support floating point or vector
/// atomics in the IR at all.  The backends learned to deal with the bitcast
/// idiom because that was the only way of expressing the notion of an atomic
/// float or vector store.  The long term plan is to teach each backend to
/// instruction select from the original atomic store, but as a migration
/// mechanism, we convert back to the old format which the backends understand.
/// Each backend will need individual work to recognize the new format.
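/// As an illustrative sketch only (value names are immaterial),
///     store atomic float %f, float* %fptr release, align 4
/// becomes roughly:
///     %if   = bitcast float %f to i32
///     %iptr = bitcast float* %fptr to i32*
///     store atomic i32 %if, i32* %iptr release, align 4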
StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) {
  IRBuilder<> Builder(SI);
  auto *M = SI->getModule();
  Type *NewTy = getCorrespondingIntegerType(SI->getValueOperand()->getType(),
                                            M->getDataLayout());
  Value *NewVal = Builder.CreateBitCast(SI->getValueOperand(), NewTy);

  Value *Addr = SI->getPointerOperand();
  Type *PT = PointerType::get(NewTy,
                              Addr->getType()->getPointerAddressSpace());
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);

  StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr);
  NewSI->setAlignment(SI->getAlignment());
  NewSI->setVolatile(SI->isVolatile());
  NewSI->setAtomic(SI->getOrdering(), SI->getSynchScope());
  DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n");
  SI->eraseFromParent();
  return NewSI;
}

bool AtomicExpand::expandAtomicStore(StoreInst *SI) {
  // This function is only called on atomic stores that are too large to be
  // atomic if implemented as a native store. So we replace them with an
  // atomic swap, which can be implemented for example as a ldrex/strex on ARM
  // or lock cmpxchg8/16b on X86, as these are atomic for larger sizes.
  // It is the responsibility of the target to only signal expansion via
  // shouldExpandAtomicRMW in cases where this is required and possible.
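  // Illustrative sketch only: "store atomic i64 %v, i64* %p seq_cst, align 8"
  // becomes "atomicrmw xchg i64* %p, i64 %v seq_cst" (with the result unused),
  // and tryExpandAtomicRMW below then lowers that swap to whatever loop the
  // target requests.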
  IRBuilder<> Builder(SI);
  AtomicRMWInst *AI =
      Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(),
                              SI->getValueOperand(), SI->getOrdering());
  SI->eraseFromParent();

  // Now that we have an appropriate swap instruction, lower it as usual.
  return tryExpandAtomicRMW(AI);
}

static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr,
                                 Value *Loaded, Value *NewVal,
                                 AtomicOrdering MemOpOrder,
                                 Value *&Success, Value *&NewLoaded) {
  Value *Pair = Builder.CreateAtomicCmpXchg(
      Addr, Loaded, NewVal, MemOpOrder,
      AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
  Success = Builder.CreateExtractValue(Pair, 1, "success");
  NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
}

/// Emit IR to implement the given atomicrmw operation on values in registers,
/// returning the new value.
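/// For example, AtomicRMWInst::Max is emitted as a compare-and-select pair,
/// roughly:
///     %cmp = icmp sgt iN %loaded, %inc
///     %new = select i1 %cmp, iN %loaded, iN %inc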
static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder,
                              Value *Loaded, Value *Inc) {
  Value *NewVal;
  switch (Op) {
  case AtomicRMWInst::Xchg:
    return Inc;
  case AtomicRMWInst::Add:
    return Builder.CreateAdd(Loaded, Inc, "new");
  case AtomicRMWInst::Sub:
    return Builder.CreateSub(Loaded, Inc, "new");
  case AtomicRMWInst::And:
    return Builder.CreateAnd(Loaded, Inc, "new");
  case AtomicRMWInst::Nand:
    return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new");
  case AtomicRMWInst::Or:
    return Builder.CreateOr(Loaded, Inc, "new");
  case AtomicRMWInst::Xor:
    return Builder.CreateXor(Loaded, Inc, "new");
  case AtomicRMWInst::Max:
    NewVal = Builder.CreateICmpSGT(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::Min:
    NewVal = Builder.CreateICmpSLE(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::UMax:
    NewVal = Builder.CreateICmpUGT(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::UMin:
    NewVal = Builder.CreateICmpULE(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  default:
    llvm_unreachable("Unknown atomic op");
  }
}

bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
  switch (TLI->shouldExpandAtomicRMWInIR(AI)) {
  case TargetLoweringBase::AtomicExpansionKind::None:
    return false;
  case TargetLoweringBase::AtomicExpansionKind::LLSC:
    return expandAtomicOpToLLSC(AI, AI->getPointerOperand(), AI->getOrdering(),
                                [&](IRBuilder<> &Builder, Value *Loaded) {
                                  return performAtomicOp(AI->getOperation(),
                                                         Builder, Loaded,
                                                         AI->getValOperand());
                                });
  case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
    return expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun);
  default:
    llvm_unreachable("Unhandled case in tryExpandAtomicRMW");
  }
}

bool AtomicExpand::expandAtomicOpToLLSC(
    Instruction *I, Value *Addr, AtomicOrdering MemOpOrder,
    std::function<Value *(IRBuilder<> &, Value *)> PerformOp) {
  BasicBlock *BB = I->getParent();
  Function *F = BB->getParent();
  LLVMContext &Ctx = F->getContext();

  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
  //
  // The standard expansion we produce is:
  //     [...]
  //     fence?
  // atomicrmw.start:
  //     %loaded = @load.linked(%addr)
  //     %new = some_op iN %loaded, %incr
  //     %stored = @store_conditional(%new, %addr)
  //     %try_again = icmp ne i32 %stored, 0
  //     br i1 %try_again, label %atomicrmw.start, label %atomicrmw.end
  // atomicrmw.end:
  //     fence?
  //     [...]
  BasicBlock *ExitBB = BB->splitBasicBlock(I->getIterator(), "atomicrmw.end");
  BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);

  // This grabs the DebugLoc from I.
  IRBuilder<> Builder(I);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we might want a fence too. It's easiest to just remove
  // the branch entirely.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  Builder.CreateBr(LoopBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(LoopBB);
  Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);

  Value *NewVal = PerformOp(Builder, Loaded);

  Value *StoreSuccess =
      TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder);
  Value *TryAgain = Builder.CreateICmpNE(
      StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain");
  Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);

  Builder.SetInsertPoint(ExitBB, ExitBB->begin());

  I->replaceAllUsesWith(Loaded);
  I->eraseFromParent();

  return true;
}

/// Convert an atomic cmpxchg of a non-integral type to an integer cmpxchg of
/// the equivalent bitwidth.  We used to not support pointer cmpxchg in the
/// IR.  As a migration step, we convert back to what used to be the standard
/// way to represent a pointer cmpxchg so that we can update backends one by
/// one.
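/// As an illustrative sketch (assuming 64-bit pointers), a cmpxchg of i8*
/// values becomes roughly:
///     %iaddr = bitcast i8** %addr to i64*
///     %icmp  = ptrtoint i8* %cmp to i64
///     %inew  = ptrtoint i8* %new to i64
///     %pair  = cmpxchg i64* %iaddr, i64 %icmp, i64 %inew <orders>
/// with the extracted old value converted back to i8* via inttoptr.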
AtomicCmpXchgInst *AtomicExpand::convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI) {
  auto *M = CI->getModule();
  Type *NewTy = getCorrespondingIntegerType(CI->getCompareOperand()->getType(),
                                            M->getDataLayout());

  IRBuilder<> Builder(CI);

  Value *Addr = CI->getPointerOperand();
  Type *PT = PointerType::get(NewTy,
                              Addr->getType()->getPointerAddressSpace());
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);

  Value *NewCmp = Builder.CreatePtrToInt(CI->getCompareOperand(), NewTy);
  Value *NewNewVal = Builder.CreatePtrToInt(CI->getNewValOperand(), NewTy);

  auto *NewCI = Builder.CreateAtomicCmpXchg(NewAddr, NewCmp, NewNewVal,
                                            CI->getSuccessOrdering(),
                                            CI->getFailureOrdering(),
                                            CI->getSynchScope());
  NewCI->setVolatile(CI->isVolatile());
  NewCI->setWeak(CI->isWeak());
  DEBUG(dbgs() << "Replaced " << *CI << " with " << *NewCI << "\n");

  Value *OldVal = Builder.CreateExtractValue(NewCI, 0);
  Value *Succ = Builder.CreateExtractValue(NewCI, 1);

  OldVal = Builder.CreateIntToPtr(OldVal, CI->getCompareOperand()->getType());

  Value *Res = UndefValue::get(CI->getType());
  Res = Builder.CreateInsertValue(Res, OldVal, 0);
  Res = Builder.CreateInsertValue(Res, Succ, 1);

  CI->replaceAllUsesWith(Res);
  CI->eraseFromParent();
  return NewCI;
}

bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
  AtomicOrdering SuccessOrder = CI->getSuccessOrdering();
  AtomicOrdering FailureOrder = CI->getFailureOrdering();
  Value *Addr = CI->getPointerOperand();
  BasicBlock *BB = CI->getParent();
  Function *F = BB->getParent();
  LLVMContext &Ctx = F->getContext();
  // If shouldInsertFencesForAtomic() returns true, then the target does not
  // want to deal with memory orders, and emitLeading/TrailingFence should take
  // care of everything. Otherwise, emitLeading/TrailingFence are no-op and we
  // should preserve the ordering.
  bool ShouldInsertFencesForAtomic = TLI->shouldInsertFencesForAtomic(CI);
  AtomicOrdering MemOpOrder =
      ShouldInsertFencesForAtomic ? AtomicOrdering::Monotonic : SuccessOrder;

  // In implementations which use a barrier to achieve release semantics, we can
  // delay emitting this barrier until we know a store is actually going to be
  // attempted. The cost of this delay is that we need 2 copies of the block
  // emitting the load-linked, affecting code size.
  //
  // Ideally, this logic would be unconditional except for the minsize check
  // since in other cases the extra blocks naturally collapse down to the
  // minimal loop. Unfortunately, this puts too much stress on later
  // optimisations so we avoid emitting the extra logic in those cases too.
  bool HasReleasedLoadBB = !CI->isWeak() && ShouldInsertFencesForAtomic &&
                           SuccessOrder != AtomicOrdering::Monotonic &&
                           SuccessOrder != AtomicOrdering::Acquire &&
                           !F->optForMinSize();

  // There's no overhead for sinking the release barrier in a weak cmpxchg, so
  // do it even on minsize.
  bool UseUnconditionalReleaseBarrier = F->optForMinSize() && !CI->isWeak();

  // Given: cmpxchg iN* %addr, iN %desired, iN %new success_ord fail_ord
  //
  // The full expansion we produce is:
  //     [...]
  // cmpxchg.start:
  //     %unreleasedload = @load.linked(%addr)
  //     %should_store = icmp eq %unreleasedload, %desired
  //     br i1 %should_store, label %cmpxchg.fencedstore,
  //                          label %cmpxchg.nostore
  // cmpxchg.fencedstore:
  //     fence?
  //     br label %cmpxchg.trystore
  // cmpxchg.trystore:
  //     %loaded.trystore = phi [%unreleasedload, %cmpxchg.fencedstore],
  //                            [%releasedload, %cmpxchg.releasedload]
  //     %stored = @store_conditional(%new, %addr)
  //     %success = icmp eq i32 %stored, 0
  //     br i1 %success, label %cmpxchg.success,
  //                     label %cmpxchg.releasedload/%cmpxchg.failure
  // cmpxchg.releasedload:
  //     %releasedload = @load.linked(%addr)
  //     %should_store = icmp eq %releasedload, %desired
  //     br i1 %should_store, label %cmpxchg.trystore,
  //                          label %cmpxchg.failure
  // cmpxchg.success:
  //     fence?
  //     br label %cmpxchg.end
  // cmpxchg.nostore:
  //     %loaded.nostore = phi [%unreleasedload, %cmpxchg.start],
  //                           [%releasedload,
  //                               %cmpxchg.releasedload/%cmpxchg.trystore]
  //     @load_linked_fail_balance()?
  //     br label %cmpxchg.failure
  // cmpxchg.failure:
  //     fence?
  //     br label %cmpxchg.end
  // cmpxchg.end:
  //     %loaded = phi [%loaded.nostore, %cmpxchg.failure],
  //                   [%loaded.trystore, %cmpxchg.trystore]
  //     %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure]
  //     %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0
  //     %res = insertvalue { iN, i1 } %restmp, i1 %success, 1
  //     [...]
  BasicBlock *ExitBB = BB->splitBasicBlock(CI->getIterator(), "cmpxchg.end");
  auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB);
  auto NoStoreBB = BasicBlock::Create(Ctx, "cmpxchg.nostore", F, FailureBB);
  auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, NoStoreBB);
  auto ReleasedLoadBB =
      BasicBlock::Create(Ctx, "cmpxchg.releasedload", F, SuccessBB);
  auto TryStoreBB =
      BasicBlock::Create(Ctx, "cmpxchg.trystore", F, ReleasedLoadBB);
  auto ReleasingStoreBB =
      BasicBlock::Create(Ctx, "cmpxchg.fencedstore", F, TryStoreBB);
  auto StartBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, ReleasingStoreBB);

  // This grabs the DebugLoc from CI.
  IRBuilder<> Builder(CI);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we might want a fence too. It's easiest to just remove
  // the branch entirely.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier)
    TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
                          /*IsLoad=*/true);
  Builder.CreateBr(StartBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(StartBB);
  Value *UnreleasedLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
  Value *ShouldStore = Builder.CreateICmpEQ(
      UnreleasedLoad, CI->getCompareOperand(), "should_store");

  // If the cmpxchg doesn't actually need any ordering when it fails, we can
  // jump straight past that fence instruction (if it exists).
  Builder.CreateCondBr(ShouldStore, ReleasingStoreBB, NoStoreBB);

  Builder.SetInsertPoint(ReleasingStoreBB);
  if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier)
    TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
                          /*IsLoad=*/true);
  Builder.CreateBr(TryStoreBB);

  Builder.SetInsertPoint(TryStoreBB);
  Value *StoreSuccess = TLI->emitStoreConditional(
      Builder, CI->getNewValOperand(), Addr, MemOpOrder);
  StoreSuccess = Builder.CreateICmpEQ(
      StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
  BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB;
  Builder.CreateCondBr(StoreSuccess, SuccessBB,
                       CI->isWeak() ? FailureBB : RetryBB);

  Builder.SetInsertPoint(ReleasedLoadBB);
  Value *SecondLoad;
  if (HasReleasedLoadBB) {
    SecondLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
    ShouldStore = Builder.CreateICmpEQ(SecondLoad, CI->getCompareOperand(),
                                       "should_store");

    // If the cmpxchg doesn't actually need any ordering when it fails, we can
    // jump straight past that fence instruction (if it exists).
    Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB);
  } else
    Builder.CreateUnreachable();

  // Make sure later instructions don't get reordered with a fence if
  // necessary.
  Builder.SetInsertPoint(SuccessBB);
  if (ShouldInsertFencesForAtomic)
    TLI->emitTrailingFence(Builder, SuccessOrder, /*IsStore=*/true,
                           /*IsLoad=*/true);
  Builder.CreateBr(ExitBB);

  Builder.SetInsertPoint(NoStoreBB);
  // In the failing case, where we don't execute the store-conditional, the
  // target might want to balance out the load-linked with a dedicated
  // instruction (e.g., on ARM, clearing the exclusive monitor).
  TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
  Builder.CreateBr(FailureBB);

  Builder.SetInsertPoint(FailureBB);
  if (ShouldInsertFencesForAtomic)
    TLI->emitTrailingFence(Builder, FailureOrder, /*IsStore=*/true,
                           /*IsLoad=*/true);
  Builder.CreateBr(ExitBB);

  // Finally, we have control-flow based knowledge of whether the cmpxchg
  // succeeded or not. We expose this to later passes by converting any
  // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate
  // PHI.
  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
  PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2);
  Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB);
  Success->addIncoming(ConstantInt::getFalse(Ctx), FailureBB);

  // Setup the builder so we can create any PHIs we need.
  Value *Loaded;
  if (!HasReleasedLoadBB)
    Loaded = UnreleasedLoad;
  else {
    Builder.SetInsertPoint(TryStoreBB, TryStoreBB->begin());
    PHINode *TryStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
    TryStoreLoaded->addIncoming(UnreleasedLoad, ReleasingStoreBB);
    TryStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);

    Builder.SetInsertPoint(NoStoreBB, NoStoreBB->begin());
    PHINode *NoStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
    NoStoreLoaded->addIncoming(UnreleasedLoad, StartBB);
    NoStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);

    Builder.SetInsertPoint(ExitBB, ++ExitBB->begin());
    PHINode *ExitLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
    ExitLoaded->addIncoming(TryStoreLoaded, SuccessBB);
    ExitLoaded->addIncoming(NoStoreLoaded, FailureBB);

    Loaded = ExitLoaded;
  }

  // Look for any users of the cmpxchg that are just comparing the loaded value
  // against the desired one, and replace them with the CFG-derived version.
  SmallVector<ExtractValueInst *, 2> PrunedInsts;
  for (auto User : CI->users()) {
    ExtractValueInst *EV = dyn_cast<ExtractValueInst>(User);
    if (!EV)
      continue;

    assert(EV->getNumIndices() == 1 && EV->getIndices()[0] <= 1 &&
           "weird extraction from { iN, i1 }");

    if (EV->getIndices()[0] == 0)
      EV->replaceAllUsesWith(Loaded);
    else
      EV->replaceAllUsesWith(Success);

    PrunedInsts.push_back(EV);
  }

  // We can remove the instructions now that we're no longer iterating through
  // them.
  for (auto EV : PrunedInsts)
    EV->eraseFromParent();

  if (!CI->use_empty()) {
    // Some use of the full struct return that we don't understand has happened,
    // so we've got to reconstruct it properly.
    Value *Res;
    Res = Builder.CreateInsertValue(UndefValue::get(CI->getType()), Loaded, 0);
    Res = Builder.CreateInsertValue(Res, Success, 1);

    CI->replaceAllUsesWith(Res);
  }

  CI->eraseFromParent();
  return true;
}

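/// Return true if the given atomicrmw is known to leave memory unchanged,
/// e.g. "atomicrmw add %p, 0", "atomicrmw sub %p, 0", "atomicrmw or %p, 0",
/// "atomicrmw xor %p, 0" or "atomicrmw and %p, -1".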
bool AtomicExpand::isIdempotentRMW(AtomicRMWInst *RMWI) {
  auto C = dyn_cast<ConstantInt>(RMWI->getValOperand());
  if (!C)
    return false;

  AtomicRMWInst::BinOp Op = RMWI->getOperation();
  switch (Op) {
    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Xor:
      return C->isZero();
    case AtomicRMWInst::And:
      return C->isMinusOne();
    // FIXME: we could also handle Min/Max/UMin/UMax with the corresponding
    // INT_MIN/INT_MAX/... operands.
    default:
      return false;
  }
}

bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst *RMWI) {
  if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) {
    tryExpandAtomicLoad(ResultingLoad);
    return true;
  }
  return false;
}

bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
                                    CreateCmpXchgInstFun CreateCmpXchg) {
  assert(AI);

  AtomicOrdering MemOpOrder = AI->getOrdering() == AtomicOrdering::Unordered
                                  ? AtomicOrdering::Monotonic
                                  : AI->getOrdering();
  Value *Addr = AI->getPointerOperand();
  BasicBlock *BB = AI->getParent();
  Function *F = BB->getParent();
  LLVMContext &Ctx = F->getContext();

  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
  //
  // The standard expansion we produce is:
  //     [...]
  //     %init_loaded = load atomic iN* %addr
  //     br label %loop
  // loop:
  //     %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ]
  //     %new = some_op iN %loaded, %incr
  //     %pair = cmpxchg iN* %addr, iN %loaded, iN %new
  //     %new_loaded = extractvalue { iN, i1 } %pair, 0
  //     %success = extractvalue { iN, i1 } %pair, 1
  //     br i1 %success, label %atomicrmw.end, label %loop
  // atomicrmw.end:
  //     [...]
  BasicBlock *ExitBB = BB->splitBasicBlock(AI->getIterator(), "atomicrmw.end");
  BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);

  // This grabs the DebugLoc from AI.
  IRBuilder<> Builder(AI);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we want a load. It's easiest to just remove
  // the branch entirely.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  LoadInst *InitLoaded = Builder.CreateLoad(Addr);
  // Atomics require at least natural alignment.
  InitLoaded->setAlignment(AI->getType()->getPrimitiveSizeInBits() / 8);
  Builder.CreateBr(LoopBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(LoopBB);
  PHINode *Loaded = Builder.CreatePHI(AI->getType(), 2, "loaded");
  Loaded->addIncoming(InitLoaded, BB);

  Value *NewVal =
      performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand());

  Value *NewLoaded = nullptr;
  Value *Success = nullptr;

  CreateCmpXchg(Builder, Addr, Loaded, NewVal, MemOpOrder,
                Success, NewLoaded);
  assert(Success && NewLoaded);

  Loaded->addIncoming(NewLoaded, LoopBB);

  Builder.CreateCondBr(Success, ExitBB, LoopBB);

  Builder.SetInsertPoint(ExitBB, ExitBB->begin());

  AI->replaceAllUsesWith(NewLoaded);
  AI->eraseFromParent();

  return true;
}