//===-- AtomicExpandPass.cpp - Expand atomic instructions -------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass (at IR level) to replace atomic instructions with
// target specific instructions which implement the same semantics in a way
// which better fits the target backend.  This can include the use of
// (intrinsic-based) load-linked/store-conditional loops, AtomicCmpXchg, or
// type coercions.
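//
// For example (the choice is driven entirely by the target's TargetLowering
// hooks), a target that opts into LL/SC expansion has its atomicrmw
// instructions rewritten into explicit load-linked/store-conditional retry
// loops (expandAtomicOpToLLSC below), while a target that prefers
// compare-and-swap gets a cmpxchg-based loop instead
// (expandAtomicRMWToCmpXchg).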
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/AtomicExpandUtils.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSubtargetInfo.h"

using namespace llvm;

#define DEBUG_TYPE "atomic-expand"

namespace {
  class AtomicExpand : public FunctionPass {
    const TargetMachine *TM;
    const TargetLowering *TLI;
  public:
    static char ID; // Pass identification, replacement for typeid
    explicit AtomicExpand(const TargetMachine *TM = nullptr)
      : FunctionPass(ID), TM(TM), TLI(nullptr) {
      initializeAtomicExpandPass(*PassRegistry::getPassRegistry());
    }

    bool runOnFunction(Function &F) override;

  private:
    bool bracketInstWithFences(Instruction *I, AtomicOrdering Order,
                               bool IsStore, bool IsLoad);
    IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
    LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
    bool tryExpandAtomicLoad(LoadInst *LI);
    bool expandAtomicLoadToLL(LoadInst *LI);
    bool expandAtomicLoadToCmpXchg(LoadInst *LI);
    StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI);
    bool expandAtomicStore(StoreInst *SI);
    bool tryExpandAtomicRMW(AtomicRMWInst *AI);
    bool expandAtomicOpToLLSC(
        Instruction *I, Value *Addr, AtomicOrdering MemOpOrder,
        std::function<Value *(IRBuilder<> &, Value *)> PerformOp);
    AtomicCmpXchgInst *convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI);
    bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
    bool isIdempotentRMW(AtomicRMWInst *AI);
    bool simplifyIdempotentRMW(AtomicRMWInst *AI);
  };
}

char AtomicExpand::ID = 0;
char &llvm::AtomicExpandID = AtomicExpand::ID;
INITIALIZE_TM_PASS(AtomicExpand, "atomic-expand",
    "Expand Atomic calls in terms of either load-linked & store-conditional or cmpxchg",
    false, false)

FunctionPass *llvm::createAtomicExpandPass(const TargetMachine *TM) {
  return new AtomicExpand(TM);
}

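// Collect every atomic instruction in the function up front, then expand each
// one according to the hooks provided by the target's TargetLowering.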
bool AtomicExpand::runOnFunction(Function &F) {
  if (!TM || !TM->getSubtargetImpl(F)->enableAtomicExpand())
    return false;
  TLI = TM->getSubtargetImpl(F)->getTargetLowering();

  SmallVector<Instruction *, 1> AtomicInsts;

  // Changing control-flow while iterating through it is a bad idea, so gather a
  // list of all atomic instructions before we start.
  for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
    if (I->isAtomic())
      AtomicInsts.push_back(&*I);
  }

  bool MadeChange = false;
  for (auto I : AtomicInsts) {
    auto LI = dyn_cast<LoadInst>(I);
    auto SI = dyn_cast<StoreInst>(I);
    auto RMWI = dyn_cast<AtomicRMWInst>(I);
    auto CASI = dyn_cast<AtomicCmpXchgInst>(I);
    assert((LI || SI || RMWI || CASI || isa<FenceInst>(I)) &&
           "Unknown atomic instruction");

    if (TLI->getInsertFencesForAtomic()) {
      auto FenceOrdering = Monotonic;
      bool IsStore, IsLoad;
      if (LI && isAtLeastAcquire(LI->getOrdering())) {
        FenceOrdering = LI->getOrdering();
        LI->setOrdering(Monotonic);
        IsStore = false;
        IsLoad = true;
      } else if (SI && isAtLeastRelease(SI->getOrdering())) {
        FenceOrdering = SI->getOrdering();
        SI->setOrdering(Monotonic);
        IsStore = true;
        IsLoad = false;
      } else if (RMWI && (isAtLeastRelease(RMWI->getOrdering()) ||
                          isAtLeastAcquire(RMWI->getOrdering()))) {
        FenceOrdering = RMWI->getOrdering();
        RMWI->setOrdering(Monotonic);
        IsStore = IsLoad = true;
      } else if (CASI && !TLI->shouldExpandAtomicCmpXchgInIR(CASI) &&
                 (isAtLeastRelease(CASI->getSuccessOrdering()) ||
                  isAtLeastAcquire(CASI->getSuccessOrdering()))) {
        // If a compare and swap is lowered to LL/SC, we can do smarter fence
        // insertion, with a stronger one on the success path than on the
        // failure path. As a result, fence insertion is directly done by
        // expandAtomicCmpXchg in that case.
        FenceOrdering = CASI->getSuccessOrdering();
        CASI->setSuccessOrdering(Monotonic);
        CASI->setFailureOrdering(Monotonic);
        IsStore = IsLoad = true;
      }

      if (FenceOrdering != Monotonic) {
        MadeChange |= bracketInstWithFences(I, FenceOrdering, IsStore, IsLoad);
      }
    }

    if (LI) {
      if (LI->getType()->isFloatingPointTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        LI = convertAtomicLoadToIntegerType(LI);
        assert(LI->getType()->isIntegerTy() && "invariant broken");
        MadeChange = true;
      }

      MadeChange |= tryExpandAtomicLoad(LI);
    } else if (SI) {
      if (SI->getValueOperand()->getType()->isFloatingPointTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        SI = convertAtomicStoreToIntegerType(SI);
        assert(SI->getValueOperand()->getType()->isIntegerTy() &&
               "invariant broken");
        MadeChange = true;
      }

      if (TLI->shouldExpandAtomicStoreInIR(SI))
        MadeChange |= expandAtomicStore(SI);
    } else if (RMWI) {
      // There are two different ways of expanding RMW instructions:
      // - into a load if it is idempotent
      // - into a Cmpxchg/LL-SC loop otherwise
      // We try them in that order.

      if (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) {
        MadeChange = true;
      } else {
        MadeChange |= tryExpandAtomicRMW(RMWI);
      }
    } else if (CASI) {
      // TODO: when we're ready to make the change at the IR level, we can
      // extend convertCmpXchgToInteger for floating point too.
      assert(!CASI->getCompareOperand()->getType()->isFloatingPointTy() &&
             "unimplemented - floating point not legal at IR level");
      if (CASI->getCompareOperand()->getType()->isPointerTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        CASI = convertCmpXchgToIntegerType(CASI);
        assert(CASI->getCompareOperand()->getType()->isIntegerTy() &&
               "invariant broken");
        MadeChange = true;
      }

      if (TLI->shouldExpandAtomicCmpXchgInIR(CASI))
        MadeChange |= expandAtomicCmpXchg(CASI);
    }
  }
  return MadeChange;
}

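/// Surround \p I with the leading and trailing fences the target requests for
/// the given ordering. Returns true if any fence was actually emitted.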
bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order,
                                         bool IsStore, bool IsLoad) {
  IRBuilder<> Builder(I);

  auto LeadingFence = TLI->emitLeadingFence(Builder, Order, IsStore, IsLoad);

  auto TrailingFence = TLI->emitTrailingFence(Builder, Order, IsStore, IsLoad);
  // The trailing fence is emitted before the instruction instead of after
  // because there is no easy way of setting Builder insertion point after
  // an instruction. So we must erase it from the BB, and insert it back
  // in the right place.
  // We have a guard here because not every atomic operation generates a
  // trailing fence.
  if (TrailingFence) {
    TrailingFence->removeFromParent();
    TrailingFence->insertAfter(I);
  }

  return (LeadingFence || TrailingFence);
}

/// Get the iX type with the same bitwidth as T.
IntegerType *AtomicExpand::getCorrespondingIntegerType(Type *T,
                                                       const DataLayout &DL) {
  EVT VT = TLI->getValueType(DL, T);
  unsigned BitWidth = VT.getStoreSizeInBits();
  assert(BitWidth == VT.getSizeInBits() && "must be a power of two");
  return IntegerType::get(T->getContext(), BitWidth);
}

/// Convert an atomic load of a non-integral type to an integer load of the
/// equivalent bitwidth.  See the function comment on
/// convertAtomicStoreToIntegerType for background.
LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) {
  auto *M = LI->getModule();
  Type *NewTy = getCorrespondingIntegerType(LI->getType(),
                                            M->getDataLayout());

  IRBuilder<> Builder(LI);

  Value *Addr = LI->getPointerOperand();
  Type *PT = PointerType::get(NewTy,
                              Addr->getType()->getPointerAddressSpace());
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);

  auto *NewLI = Builder.CreateLoad(NewAddr);
  NewLI->setAlignment(LI->getAlignment());
  NewLI->setVolatile(LI->isVolatile());
  NewLI->setAtomic(LI->getOrdering(), LI->getSynchScope());
  DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");

  Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType());
  LI->replaceAllUsesWith(NewVal);
  LI->eraseFromParent();
  return NewLI;
}

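/// Expand an atomic load using the strategy requested by the target via
/// shouldExpandAtomicLoadInIR. Returns true if any expansion was performed.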
bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) {
  switch (TLI->shouldExpandAtomicLoadInIR(LI)) {
  case TargetLoweringBase::AtomicExpansionKind::None:
    return false;
  case TargetLoweringBase::AtomicExpansionKind::LLSC:
    return expandAtomicOpToLLSC(
        LI, LI->getPointerOperand(), LI->getOrdering(),
        [](IRBuilder<> &Builder, Value *Loaded) { return Loaded; });
  case TargetLoweringBase::AtomicExpansionKind::LLOnly:
    return expandAtomicLoadToLL(LI);
  case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
    return expandAtomicLoadToCmpXchg(LI);
  }
  llvm_unreachable("Unhandled case in tryExpandAtomicLoad");
}

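/// Expand an atomic load into a bare load-linked instruction; no paired
/// store-conditional is emitted, so the target is given a chance to balance
/// the outstanding exclusive access via emitAtomicCmpXchgNoStoreLLBalance.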
bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) {
  IRBuilder<> Builder(LI);

  // On some architectures, load-linked instructions are atomic for larger
  // sizes than normal loads. For example, the only 64-bit load guaranteed
  // to be single-copy atomic by ARM is an ldrexd (A3.5.3).
  Value *Val =
      TLI->emitLoadLinked(Builder, LI->getPointerOperand(), LI->getOrdering());
  TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);

  LI->replaceAllUsesWith(Val);
  LI->eraseFromParent();

  return true;
}

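/// Expand an atomic load into a cmpxchg of a dummy value against itself; the
/// "loaded" result of that cmpxchg is the value we were after.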
bool AtomicExpand::expandAtomicLoadToCmpXchg(LoadInst *LI) {
  IRBuilder<> Builder(LI);
  AtomicOrdering Order = LI->getOrdering();
  Value *Addr = LI->getPointerOperand();
  Type *Ty = cast<PointerType>(Addr->getType())->getElementType();
  Constant *DummyVal = Constant::getNullValue(Ty);

  Value *Pair = Builder.CreateAtomicCmpXchg(
      Addr, DummyVal, DummyVal, Order,
      AtomicCmpXchgInst::getStrongestFailureOrdering(Order));
  Value *Loaded = Builder.CreateExtractValue(Pair, 0, "loaded");

  LI->replaceAllUsesWith(Loaded);
  LI->eraseFromParent();

  return true;
}

/// Convert an atomic store of a non-integral type to an integer store of the
/// equivalent bitwidth.  We used to not support floating point or vector
/// atomics in the IR at all.  The backends learned to deal with the bitcast
/// idiom because that was the only way of expressing the notion of an atomic
/// float or vector store.  The long term plan is to teach each backend to
/// instruction select from the original atomic store, but as a migration
/// mechanism, we convert back to the old format which the backends understand.
/// Each backend will need individual work to recognize the new format.
StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) {
  IRBuilder<> Builder(SI);
  auto *M = SI->getModule();
  Type *NewTy = getCorrespondingIntegerType(SI->getValueOperand()->getType(),
                                            M->getDataLayout());
  Value *NewVal = Builder.CreateBitCast(SI->getValueOperand(), NewTy);

  Value *Addr = SI->getPointerOperand();
  Type *PT = PointerType::get(NewTy,
                              Addr->getType()->getPointerAddressSpace());
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);

  StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr);
  NewSI->setAlignment(SI->getAlignment());
  NewSI->setVolatile(SI->isVolatile());
  NewSI->setAtomic(SI->getOrdering(), SI->getSynchScope());
  DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n");
  SI->eraseFromParent();
  return NewSI;
}

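/// Expand an atomic store the target cannot handle natively by rewriting it
/// as an atomic xchg whose result is unused, then expanding that RMW as
/// usual.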
bool AtomicExpand::expandAtomicStore(StoreInst *SI) {
  // This function is only called on atomic stores that are too large to be
  // atomic if implemented as a native store. So we replace them with an
  // atomic swap, which can be implemented for example as a ldrex/strex on ARM
  // or lock cmpxchg8b/cmpxchg16b on X86, as these are atomic for larger sizes.
  // It is the responsibility of the target to only signal expansion via
  // shouldExpandAtomicStoreInIR in cases where this is required and possible.
  IRBuilder<> Builder(SI);
  AtomicRMWInst *AI =
      Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(),
                              SI->getValueOperand(), SI->getOrdering());
  SI->eraseFromParent();

  // Now that we have an appropriate swap instruction, lower it as usual.
  return tryExpandAtomicRMW(AI);
}

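/// Helper passed to expandAtomicRMWToCmpXchg: emits a cmpxchg of NewVal
/// against Loaded at Addr, returning the success flag and the value actually
/// loaded through the Success/NewLoaded out-parameters.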
static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr,
                                 Value *Loaded, Value *NewVal,
                                 AtomicOrdering MemOpOrder,
                                 Value *&Success, Value *&NewLoaded) {
  Value *Pair = Builder.CreateAtomicCmpXchg(
      Addr, Loaded, NewVal, MemOpOrder,
      AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
  Success = Builder.CreateExtractValue(Pair, 1, "success");
  NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
}

/// Emit IR to implement the given atomicrmw operation on values in registers,
/// returning the new value.
static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder,
                              Value *Loaded, Value *Inc) {
  Value *NewVal;
  switch (Op) {
  case AtomicRMWInst::Xchg:
    return Inc;
  case AtomicRMWInst::Add:
    return Builder.CreateAdd(Loaded, Inc, "new");
  case AtomicRMWInst::Sub:
    return Builder.CreateSub(Loaded, Inc, "new");
  case AtomicRMWInst::And:
    return Builder.CreateAnd(Loaded, Inc, "new");
  case AtomicRMWInst::Nand:
    return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new");
  case AtomicRMWInst::Or:
    return Builder.CreateOr(Loaded, Inc, "new");
  case AtomicRMWInst::Xor:
    return Builder.CreateXor(Loaded, Inc, "new");
  case AtomicRMWInst::Max:
    NewVal = Builder.CreateICmpSGT(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::Min:
    NewVal = Builder.CreateICmpSLE(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::UMax:
    NewVal = Builder.CreateICmpUGT(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::UMin:
    NewVal = Builder.CreateICmpULE(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  default:
    llvm_unreachable("Unknown atomic op");
  }
}

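/// Expand an atomicrmw using the strategy requested by the target via
/// shouldExpandAtomicRMWInIR. Returns true if any expansion was performed.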
bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
  switch (TLI->shouldExpandAtomicRMWInIR(AI)) {
  case TargetLoweringBase::AtomicExpansionKind::None:
    return false;
  case TargetLoweringBase::AtomicExpansionKind::LLSC:
    return expandAtomicOpToLLSC(AI, AI->getPointerOperand(), AI->getOrdering(),
                                [&](IRBuilder<> &Builder, Value *Loaded) {
                                  return performAtomicOp(AI->getOperation(),
                                                         Builder, Loaded,
                                                         AI->getValOperand());
                                });
  case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
    return expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun);
  default:
    llvm_unreachable("Unhandled case in tryExpandAtomicRMW");
  }
}

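/// Expand an atomic operation into a load-linked/store-conditional retry
/// loop, using \p PerformOp to compute the new value to store from the value
/// that was loaded.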
bool AtomicExpand::expandAtomicOpToLLSC(
    Instruction *I, Value *Addr, AtomicOrdering MemOpOrder,
    std::function<Value *(IRBuilder<> &, Value *)> PerformOp) {
  BasicBlock *BB = I->getParent();
  Function *F = BB->getParent();
  LLVMContext &Ctx = F->getContext();

  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
  //
  // The standard expansion we produce is:
  //     [...]
  //     fence?
  // atomicrmw.start:
  //     %loaded = @load.linked(%addr)
  //     %new = some_op iN %loaded, %incr
  //     %stored = @store_conditional(%new, %addr)
  //     %try_again = icmp ne i32 %stored, 0
  //     br i1 %try_again, label %atomicrmw.start, label %atomicrmw.end
  // atomicrmw.end:
  //     fence?
  //     [...]
  BasicBlock *ExitBB = BB->splitBasicBlock(I->getIterator(), "atomicrmw.end");
  BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);

  // This grabs the DebugLoc from I.
  IRBuilder<> Builder(I);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we might want a fence too. It's easiest to just remove
  // the branch entirely.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  Builder.CreateBr(LoopBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(LoopBB);
  Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);

  Value *NewVal = PerformOp(Builder, Loaded);

  Value *StoreSuccess =
      TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder);
  Value *TryAgain = Builder.CreateICmpNE(
      StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain");
  Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);

  Builder.SetInsertPoint(ExitBB, ExitBB->begin());

  I->replaceAllUsesWith(Loaded);
  I->eraseFromParent();

  return true;
}

465 
466 /// Convert an atomic cmpxchg of a non-integral type to an integer cmpxchg of
467 /// the equivalent bitwidth.  We used to not support pointer cmpxchg in the
468 /// IR.  As a migration step, we convert back to what use to be the standard
469 /// way to represent a pointer cmpxchg so that we can update backends one by
470 /// one.
471 AtomicCmpXchgInst *AtomicExpand::convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI) {
472   auto *M = CI->getModule();
473   Type *NewTy = getCorrespondingIntegerType(CI->getCompareOperand()->getType(),
474                                             M->getDataLayout());
475 
476   IRBuilder<> Builder(CI);
477 
478   Value *Addr = CI->getPointerOperand();
479   Type *PT = PointerType::get(NewTy,
480                               Addr->getType()->getPointerAddressSpace());
481   Value *NewAddr = Builder.CreateBitCast(Addr, PT);
482 
483   Value *NewCmp = Builder.CreatePtrToInt(CI->getCompareOperand(), NewTy);
484   Value *NewNewVal = Builder.CreatePtrToInt(CI->getNewValOperand(), NewTy);
485 
486 
487   auto *NewCI = Builder.CreateAtomicCmpXchg(NewAddr, NewCmp, NewNewVal,
488                                             CI->getSuccessOrdering(),
489                                             CI->getFailureOrdering(),
490                                             CI->getSynchScope());
491   NewCI->setVolatile(CI->isVolatile());
492   NewCI->setWeak(CI->isWeak());
493   DEBUG(dbgs() << "Replaced " << *CI << " with " << *NewCI << "\n");
494 
495   Value *OldVal = Builder.CreateExtractValue(NewCI, 0);
496   Value *Succ = Builder.CreateExtractValue(NewCI, 1);
497 
498   OldVal = Builder.CreateIntToPtr(OldVal, CI->getCompareOperand()->getType());
499 
500   Value *Res = UndefValue::get(CI->getType());
501   Res = Builder.CreateInsertValue(Res, OldVal, 0);
502   Res = Builder.CreateInsertValue(Res, Succ, 1);
503 
504   CI->replaceAllUsesWith(Res);
505   CI->eraseFromParent();
506   return NewCI;
507 }
508 
509 
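/// Expand an atomic cmpxchg into an explicit control-flow graph built from
/// load-linked/store-conditional operations, inserting the target's fences on
/// the success and failure paths as required.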
bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
  AtomicOrdering SuccessOrder = CI->getSuccessOrdering();
  AtomicOrdering FailureOrder = CI->getFailureOrdering();
  Value *Addr = CI->getPointerOperand();
  BasicBlock *BB = CI->getParent();
  Function *F = BB->getParent();
  LLVMContext &Ctx = F->getContext();
  // If getInsertFencesForAtomic() returns true, then the target does not want
  // to deal with memory orders, and emitLeading/TrailingFence should take care
  // of everything. Otherwise, emitLeading/TrailingFence are no-ops and we
  // should preserve the ordering.
  AtomicOrdering MemOpOrder =
      TLI->getInsertFencesForAtomic() ? Monotonic : SuccessOrder;

  // In implementations which use a barrier to achieve release semantics, we can
  // delay emitting this barrier until we know a store is actually going to be
  // attempted. The cost of this delay is that we need 2 copies of the block
  // emitting the load-linked, affecting code size.
  //
  // Ideally, this logic would be unconditional except for the minsize check
  // since in other cases the extra blocks naturally collapse down to the
  // minimal loop. Unfortunately, this puts too much stress on later
  // optimisations so we avoid emitting the extra logic in those cases too.
  bool HasReleasedLoadBB = !CI->isWeak() && TLI->getInsertFencesForAtomic() &&
                           SuccessOrder != Monotonic &&
                           SuccessOrder != Acquire && !F->optForMinSize();

  // There's no overhead for sinking the release barrier in a weak cmpxchg, so
  // do it even on minsize.
  bool UseUnconditionalReleaseBarrier = F->optForMinSize() && !CI->isWeak();

  // Given: cmpxchg some_op iN* %addr, iN %desired, iN %new success_ord fail_ord
  //
  // The full expansion we produce is:
  //     [...]
  // cmpxchg.start:
  //     %unreleasedload = @load.linked(%addr)
  //     %should_store = icmp eq %unreleasedload, %desired
  //     br i1 %should_store, label %cmpxchg.fencedstore,
  //                          label %cmpxchg.nostore
  // cmpxchg.fencedstore:
  //     fence?
  //     br label %cmpxchg.trystore
  // cmpxchg.trystore:
  //     %loaded.trystore = phi [%unreleasedload, %cmpxchg.fencedstore],
  //                            [%releasedload, %cmpxchg.releasedload]
  //     %stored = @store_conditional(%new, %addr)
  //     %success = icmp eq i32 %stored, 0
  //     br i1 %success, label %cmpxchg.success,
  //                     label %cmpxchg.releasedload/%cmpxchg.failure
  // cmpxchg.releasedload:
  //     %releasedload = @load.linked(%addr)
  //     %should_store = icmp eq %releasedload, %desired
  //     br i1 %should_store, label %cmpxchg.trystore,
  //                          label %cmpxchg.failure
  // cmpxchg.success:
  //     fence?
  //     br label %cmpxchg.end
  // cmpxchg.nostore:
  //     %loaded.nostore = phi [%unreleasedload, %cmpxchg.start],
  //                           [%releasedload,
  //                               %cmpxchg.releasedload/%cmpxchg.trystore]
  //     @load_linked_fail_balance()?
  //     br label %cmpxchg.failure
  // cmpxchg.failure:
  //     fence?
  //     br label %cmpxchg.end
  // cmpxchg.end:
  //     %loaded = phi [%loaded.nostore, %cmpxchg.failure],
  //                   [%loaded.trystore, %cmpxchg.trystore]
  //     %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure]
  //     %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0
  //     %res = insertvalue { iN, i1 } %restmp, i1 %success, 1
  //     [...]
  BasicBlock *ExitBB = BB->splitBasicBlock(CI->getIterator(), "cmpxchg.end");
  auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB);
  auto NoStoreBB = BasicBlock::Create(Ctx, "cmpxchg.nostore", F, FailureBB);
  auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, NoStoreBB);
  auto ReleasedLoadBB =
      BasicBlock::Create(Ctx, "cmpxchg.releasedload", F, SuccessBB);
  auto TryStoreBB =
      BasicBlock::Create(Ctx, "cmpxchg.trystore", F, ReleasedLoadBB);
  auto ReleasingStoreBB =
      BasicBlock::Create(Ctx, "cmpxchg.fencedstore", F, TryStoreBB);
  auto StartBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, ReleasingStoreBB);

  // This grabs the DebugLoc from CI
  IRBuilder<> Builder(CI);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we might want a fence too. It's easiest to just remove
  // the branch entirely.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  if (UseUnconditionalReleaseBarrier)
    TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
                          /*IsLoad=*/true);
  Builder.CreateBr(StartBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(StartBB);
  Value *UnreleasedLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
  Value *ShouldStore = Builder.CreateICmpEQ(
      UnreleasedLoad, CI->getCompareOperand(), "should_store");

  // If the cmpxchg doesn't actually need any ordering when it fails, we can
  // jump straight past that fence instruction (if it exists).
  Builder.CreateCondBr(ShouldStore, ReleasingStoreBB, NoStoreBB);

  Builder.SetInsertPoint(ReleasingStoreBB);
  if (!UseUnconditionalReleaseBarrier)
    TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
                          /*IsLoad=*/true);
  Builder.CreateBr(TryStoreBB);

  Builder.SetInsertPoint(TryStoreBB);
  Value *StoreSuccess = TLI->emitStoreConditional(
      Builder, CI->getNewValOperand(), Addr, MemOpOrder);
  StoreSuccess = Builder.CreateICmpEQ(
      StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
  BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB;
  Builder.CreateCondBr(StoreSuccess, SuccessBB,
                       CI->isWeak() ? FailureBB : RetryBB);

  Builder.SetInsertPoint(ReleasedLoadBB);
  Value *SecondLoad;
  if (HasReleasedLoadBB) {
    SecondLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
    ShouldStore = Builder.CreateICmpEQ(SecondLoad, CI->getCompareOperand(),
                                       "should_store");

    // If the cmpxchg doesn't actually need any ordering when it fails, we can
    // jump straight past that fence instruction (if it exists).
    Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB);
  } else
    Builder.CreateUnreachable();

  // Make sure later instructions don't get reordered with a fence if
  // necessary.
  Builder.SetInsertPoint(SuccessBB);
  TLI->emitTrailingFence(Builder, SuccessOrder, /*IsStore=*/true,
                         /*IsLoad=*/true);
  Builder.CreateBr(ExitBB);

  Builder.SetInsertPoint(NoStoreBB);
  // In the failing case, where we don't execute the store-conditional, the
  // target might want to balance out the load-linked with a dedicated
  // instruction (e.g., on ARM, clearing the exclusive monitor).
  TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
  Builder.CreateBr(FailureBB);

  Builder.SetInsertPoint(FailureBB);
  TLI->emitTrailingFence(Builder, FailureOrder, /*IsStore=*/true,
                         /*IsLoad=*/true);
  Builder.CreateBr(ExitBB);

  // Finally, we have control-flow based knowledge of whether the cmpxchg
  // succeeded or not. We expose this to later passes by converting any
  // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate
  // PHI.
  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
  PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2);
  Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB);
  Success->addIncoming(ConstantInt::getFalse(Ctx), FailureBB);

  // Set up the builder so we can create any PHIs we need.
  Value *Loaded;
  if (!HasReleasedLoadBB)
    Loaded = UnreleasedLoad;
  else {
    Builder.SetInsertPoint(TryStoreBB, TryStoreBB->begin());
    PHINode *TryStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
    TryStoreLoaded->addIncoming(UnreleasedLoad, ReleasingStoreBB);
    TryStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);

    Builder.SetInsertPoint(NoStoreBB, NoStoreBB->begin());
    PHINode *NoStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
    NoStoreLoaded->addIncoming(UnreleasedLoad, StartBB);
    NoStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);

    Builder.SetInsertPoint(ExitBB, ++ExitBB->begin());
    PHINode *ExitLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
    ExitLoaded->addIncoming(TryStoreLoaded, SuccessBB);
    ExitLoaded->addIncoming(NoStoreLoaded, FailureBB);

    Loaded = ExitLoaded;
  }

  // Look for any users of the cmpxchg that are just comparing the loaded value
  // against the desired one, and replace them with the CFG-derived version.
  SmallVector<ExtractValueInst *, 2> PrunedInsts;
  for (auto User : CI->users()) {
    ExtractValueInst *EV = dyn_cast<ExtractValueInst>(User);
    if (!EV)
      continue;

    assert(EV->getNumIndices() == 1 && EV->getIndices()[0] <= 1 &&
           "weird extraction from { iN, i1 }");

    if (EV->getIndices()[0] == 0)
      EV->replaceAllUsesWith(Loaded);
    else
      EV->replaceAllUsesWith(Success);

    PrunedInsts.push_back(EV);
  }

  // We can remove the instructions now that we're no longer iterating through
  // them.
  for (auto EV : PrunedInsts)
    EV->eraseFromParent();

  if (!CI->use_empty()) {
    // Some use of the full struct return that we don't understand has
    // happened, so we've got to reconstruct it properly.
    Value *Res;
    Res = Builder.CreateInsertValue(UndefValue::get(CI->getType()), Loaded, 0);
    Res = Builder.CreateInsertValue(Res, Success, 1);

    CI->replaceAllUsesWith(Res);
  }

  CI->eraseFromParent();
  return true;
}

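/// Return true if the given atomicrmw provably leaves the memory location
/// unchanged (e.g. add/sub/or/xor of 0, or and of -1), so it can be treated
/// as a plain atomic load.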
bool AtomicExpand::isIdempotentRMW(AtomicRMWInst *RMWI) {
  auto C = dyn_cast<ConstantInt>(RMWI->getValOperand());
  if (!C)
    return false;

  AtomicRMWInst::BinOp Op = RMWI->getOperation();
  switch (Op) {
    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Xor:
      return C->isZero();
    case AtomicRMWInst::And:
      return C->isMinusOne();
    // FIXME: we could also treat Min/Max/UMin/UMax as idempotent when the
    // operand is INT_MAX/INT_MIN/UINT_MAX/0 respectively.
    default:
      return false;
  }
}

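/// Ask the target to lower an idempotent atomicrmw into a fenced load, and if
/// it does, expand the resulting load as well. Returns true on success.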
bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst *RMWI) {
  if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) {
    tryExpandAtomicLoad(ResultingLoad);
    return true;
  }
  return false;
}

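/// Expand an atomicrmw into a loop that loads the current value, applies the
/// operation in registers, and retries via the cmpxchg emitted by
/// \p CreateCmpXchg until the exchange succeeds.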
bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
                                    CreateCmpXchgInstFun CreateCmpXchg) {
  assert(AI);

  AtomicOrdering MemOpOrder =
      AI->getOrdering() == Unordered ? Monotonic : AI->getOrdering();
  Value *Addr = AI->getPointerOperand();
  BasicBlock *BB = AI->getParent();
  Function *F = BB->getParent();
  LLVMContext &Ctx = F->getContext();

  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
  //
  // The standard expansion we produce is:
  //     [...]
  //     %init_loaded = load atomic iN* %addr
  //     br label %loop
  // loop:
  //     %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ]
  //     %new = some_op iN %loaded, %incr
  //     %pair = cmpxchg iN* %addr, iN %loaded, iN %new
  //     %new_loaded = extractvalue { iN, i1 } %pair, 0
  //     %success = extractvalue { iN, i1 } %pair, 1
  //     br i1 %success, label %atomicrmw.end, label %loop
  // atomicrmw.end:
  //     [...]
  BasicBlock *ExitBB = BB->splitBasicBlock(AI->getIterator(), "atomicrmw.end");
  BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);

  // This grabs the DebugLoc from AI.
  IRBuilder<> Builder(AI);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we want a load. It's easiest to just remove
  // the branch entirely.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  LoadInst *InitLoaded = Builder.CreateLoad(Addr);
  // Atomics require at least natural alignment.
  InitLoaded->setAlignment(AI->getType()->getPrimitiveSizeInBits() / 8);
  Builder.CreateBr(LoopBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(LoopBB);
  PHINode *Loaded = Builder.CreatePHI(AI->getType(), 2, "loaded");
  Loaded->addIncoming(InitLoaded, BB);

  Value *NewVal =
      performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand());

  Value *NewLoaded = nullptr;
  Value *Success = nullptr;

  CreateCmpXchg(Builder, Addr, Loaded, NewVal, MemOpOrder,
                Success, NewLoaded);
  assert(Success && NewLoaded);

  Loaded->addIncoming(NewLoaded, LoopBB);

  Builder.CreateCondBr(Success, ExitBB, LoopBB);

  Builder.SetInsertPoint(ExitBB, ExitBB->begin());

  AI->replaceAllUsesWith(NewLoaded);
  AI->eraseFromParent();

  return true;
}