//===-- AtomicExpandPass.cpp - Expand atomic instructions -------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass (at IR level) to replace atomic instructions with
// target specific instructions which implement the same semantics in a way
// which better fits the target backend. This can include the use of either
// (intrinsic-based) load-linked/store-conditional loops, AtomicCmpXchg, or
// type coercions.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/AtomicExpandUtils.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSubtargetInfo.h"

using namespace llvm;

#define DEBUG_TYPE "atomic-expand"

namespace {
class AtomicExpand: public FunctionPass {
  const TargetMachine *TM;
  const TargetLowering *TLI;

public:
  static char ID; // Pass identification, replacement for typeid
  explicit AtomicExpand(const TargetMachine *TM = nullptr)
      : FunctionPass(ID), TM(TM), TLI(nullptr) {
    initializeAtomicExpandPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

private:
  bool bracketInstWithFences(Instruction *I, AtomicOrdering Order,
                             bool IsStore, bool IsLoad);
  IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
  LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
  bool tryExpandAtomicLoad(LoadInst *LI);
  bool expandAtomicLoadToLL(LoadInst *LI);
  bool expandAtomicLoadToCmpXchg(LoadInst *LI);
  StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI);
  bool expandAtomicStore(StoreInst *SI);
  bool tryExpandAtomicRMW(AtomicRMWInst *AI);
  bool expandAtomicOpToLLSC(
      Instruction *I, Value *Addr, AtomicOrdering MemOpOrder,
      std::function<Value *(IRBuilder<> &, Value *)> PerformOp);
  AtomicCmpXchgInst *convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI);
  bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
  bool isIdempotentRMW(AtomicRMWInst *AI);
  bool simplifyIdempotentRMW(AtomicRMWInst *AI);
};
}

char AtomicExpand::ID = 0;
char &llvm::AtomicExpandID = AtomicExpand::ID;
INITIALIZE_TM_PASS(AtomicExpand, "atomic-expand",
    "Expand Atomic calls in terms of either load-linked & store-conditional or cmpxchg",
    false, false)

FunctionPass *llvm::createAtomicExpandPass(const TargetMachine *TM) {
  return new AtomicExpand(TM);
}

bool AtomicExpand::runOnFunction(Function &F) {
  if (!TM || !TM->getSubtargetImpl(F)->enableAtomicExpand())
    return false;
  TLI = TM->getSubtargetImpl(F)->getTargetLowering();

  SmallVector<Instruction *, 1> AtomicInsts;

  // Changing control-flow while iterating through it is a bad idea, so gather a
  // list of all atomic instructions before we start.
  for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) {
    Instruction *I = &*II;
    if (I->isAtomic() && !isa<FenceInst>(I))
      AtomicInsts.push_back(I);
  }

  bool MadeChange = false;
  for (auto I : AtomicInsts) {
    auto LI = dyn_cast<LoadInst>(I);
    auto SI = dyn_cast<StoreInst>(I);
    auto RMWI = dyn_cast<AtomicRMWInst>(I);
    auto CASI = dyn_cast<AtomicCmpXchgInst>(I);
    assert((LI || SI || RMWI || CASI) && "Unknown atomic instruction");

    if (TLI->shouldInsertFencesForAtomic(I)) {
      auto FenceOrdering = AtomicOrdering::Monotonic;
      bool IsStore, IsLoad;
      if (LI && isAcquireOrStronger(LI->getOrdering())) {
        FenceOrdering = LI->getOrdering();
        LI->setOrdering(AtomicOrdering::Monotonic);
        IsStore = false;
        IsLoad = true;
      } else if (SI && isReleaseOrStronger(SI->getOrdering())) {
        FenceOrdering = SI->getOrdering();
        SI->setOrdering(AtomicOrdering::Monotonic);
        IsStore = true;
        IsLoad = false;
      } else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) ||
                          isAcquireOrStronger(RMWI->getOrdering()))) {
        FenceOrdering = RMWI->getOrdering();
        RMWI->setOrdering(AtomicOrdering::Monotonic);
        IsStore = IsLoad = true;
      } else if (CASI && !TLI->shouldExpandAtomicCmpXchgInIR(CASI) &&
                 (isReleaseOrStronger(CASI->getSuccessOrdering()) ||
                  isAcquireOrStronger(CASI->getSuccessOrdering()))) {
        // If a compare and swap is lowered to LL/SC, we can do smarter fence
        // insertion, with a stronger one on the success path than on the
        // failure path. As a result, fence insertion is directly done by
        // expandAtomicCmpXchg in that case.
        FenceOrdering = CASI->getSuccessOrdering();
        CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
        CASI->setFailureOrdering(AtomicOrdering::Monotonic);
        IsStore = IsLoad = true;
      }

      if (FenceOrdering != AtomicOrdering::Monotonic) {
        MadeChange |= bracketInstWithFences(I, FenceOrdering, IsStore, IsLoad);
      }
    }

    if (LI) {
      if (LI->getType()->isFloatingPointTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        LI = convertAtomicLoadToIntegerType(LI);
        assert(LI->getType()->isIntegerTy() && "invariant broken");
        MadeChange = true;
      }

      MadeChange |= tryExpandAtomicLoad(LI);
    } else if (SI) {
      if (SI->getValueOperand()->getType()->isFloatingPointTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        SI = convertAtomicStoreToIntegerType(SI);
        assert(SI->getValueOperand()->getType()->isIntegerTy() &&
               "invariant broken");
        MadeChange = true;
      }

      if (TLI->shouldExpandAtomicStoreInIR(SI))
        MadeChange |= expandAtomicStore(SI);
    } else if (RMWI) {
      // There are two different ways of expanding RMW instructions:
      // - into a load if it is idempotent
      // - into a Cmpxchg/LL-SC loop otherwise
      // we try them in that order.

      if (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) {
        MadeChange = true;
      } else {
        MadeChange |= tryExpandAtomicRMW(RMWI);
      }
    } else if (CASI) {
      // TODO: when we're ready to make the change at the IR level, we can
      // extend convertCmpXchgToInteger for floating point too.
      assert(!CASI->getCompareOperand()->getType()->isFloatingPointTy() &&
             "unimplemented - floating point not legal at IR level");
      if (CASI->getCompareOperand()->getType()->isPointerTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        CASI = convertCmpXchgToIntegerType(CASI);
        assert(CASI->getCompareOperand()->getType()->isIntegerTy() &&
               "invariant broken");
        MadeChange = true;
      }

      if (TLI->shouldExpandAtomicCmpXchgInIR(CASI))
        MadeChange |= expandAtomicCmpXchg(CASI);
    }
  }
  return MadeChange;
}

bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order,
                                         bool IsStore, bool IsLoad) {
  IRBuilder<> Builder(I);

  auto LeadingFence = TLI->emitLeadingFence(Builder, Order, IsStore, IsLoad);

  auto TrailingFence = TLI->emitTrailingFence(Builder, Order, IsStore, IsLoad);
  // The trailing fence is emitted before the instruction instead of after
  // because there is no easy way of setting Builder insertion point after
  // an instruction. So we must erase it from the BB, and insert it back
  // in the right place.
  // We have a guard here because not every atomic operation generates a
  // trailing fence.
  if (TrailingFence) {
    TrailingFence->removeFromParent();
    TrailingFence->insertAfter(I);
  }

  return (LeadingFence || TrailingFence);
}

/// Get the iX type with the same bitwidth as T.
IntegerType *AtomicExpand::getCorrespondingIntegerType(Type *T,
                                                       const DataLayout &DL) {
  EVT VT = TLI->getValueType(DL, T);
  unsigned BitWidth = VT.getStoreSizeInBits();
  assert(BitWidth == VT.getSizeInBits() && "must be a power of two");
  return IntegerType::get(T->getContext(), BitWidth);
}

/// Convert an atomic load of a non-integral type to an integer load of the
/// equivalent bitwidth. See the function comment on
/// convertAtomicStoreToIntegerType for background.
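///
/// As a rough sketch of the rewrite (value names here are invented for
/// illustration only), a load such as
///   %f = load atomic float, float* %fptr seq_cst, align 4
/// is turned into
///   %iptr = bitcast float* %fptr to i32*
///   %i = load atomic i32, i32* %iptr seq_cst, align 4
///   %f.cast = bitcast i32 %i to float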
LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) {
  auto *M = LI->getModule();
  Type *NewTy = getCorrespondingIntegerType(LI->getType(),
                                            M->getDataLayout());

  IRBuilder<> Builder(LI);

  Value *Addr = LI->getPointerOperand();
  Type *PT = PointerType::get(NewTy,
                              Addr->getType()->getPointerAddressSpace());
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);

  auto *NewLI = Builder.CreateLoad(NewAddr);
  NewLI->setAlignment(LI->getAlignment());
  NewLI->setVolatile(LI->isVolatile());
  NewLI->setAtomic(LI->getOrdering(), LI->getSynchScope());
  DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");

  Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType());
  LI->replaceAllUsesWith(NewVal);
  LI->eraseFromParent();
  return NewLI;
}

bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) {
  switch (TLI->shouldExpandAtomicLoadInIR(LI)) {
  case TargetLoweringBase::AtomicExpansionKind::None:
    return false;
  case TargetLoweringBase::AtomicExpansionKind::LLSC:
    return expandAtomicOpToLLSC(
        LI, LI->getPointerOperand(), LI->getOrdering(),
        [](IRBuilder<> &Builder, Value *Loaded) { return Loaded; });
  case TargetLoweringBase::AtomicExpansionKind::LLOnly:
    return expandAtomicLoadToLL(LI);
  case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
    return expandAtomicLoadToCmpXchg(LI);
  }
  llvm_unreachable("Unhandled case in tryExpandAtomicLoad");
}

bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) {
  IRBuilder<> Builder(LI);

  // On some architectures, load-linked instructions are atomic for larger
  // sizes than normal loads. For example, the only 64-bit load guaranteed
  // to be single-copy atomic by ARM is an ldrexd (A3.5.3).
  Value *Val =
      TLI->emitLoadLinked(Builder, LI->getPointerOperand(), LI->getOrdering());
  TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);

  LI->replaceAllUsesWith(Val);
  LI->eraseFromParent();

  return true;
}

bool AtomicExpand::expandAtomicLoadToCmpXchg(LoadInst *LI) {
  IRBuilder<> Builder(LI);
  AtomicOrdering Order = LI->getOrdering();
  Value *Addr = LI->getPointerOperand();
  Type *Ty = cast<PointerType>(Addr->getType())->getElementType();
  Constant *DummyVal = Constant::getNullValue(Ty);

  Value *Pair = Builder.CreateAtomicCmpXchg(
      Addr, DummyVal, DummyVal, Order,
      AtomicCmpXchgInst::getStrongestFailureOrdering(Order));
  Value *Loaded = Builder.CreateExtractValue(Pair, 0, "loaded");

  LI->replaceAllUsesWith(Loaded);
  LI->eraseFromParent();

  return true;
}

/// Convert an atomic store of a non-integral type to an integer store of the
/// equivalent bitwidth. We used to not support floating point or vector
/// atomics in the IR at all. The backends learned to deal with the bitcast
/// idiom because that was the only way of expressing the notion of an atomic
/// float or vector store. The long term plan is to teach each backend to
/// instruction select from the original atomic store, but as a migration
/// mechanism, we convert back to the old format which the backends understand.
/// Each backend will need individual work to recognize the new format.
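///
/// As a rough sketch (again with invented value names), a store such as
///   store atomic float %f, float* %fptr seq_cst, align 4
/// becomes
///   %i = bitcast float %f to i32
///   %iptr = bitcast float* %fptr to i32*
///   store atomic i32 %i, i32* %iptr seq_cst, align 4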
StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) {
  IRBuilder<> Builder(SI);
  auto *M = SI->getModule();
  Type *NewTy = getCorrespondingIntegerType(SI->getValueOperand()->getType(),
                                            M->getDataLayout());
  Value *NewVal = Builder.CreateBitCast(SI->getValueOperand(), NewTy);

  Value *Addr = SI->getPointerOperand();
  Type *PT = PointerType::get(NewTy,
                              Addr->getType()->getPointerAddressSpace());
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);

  StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr);
  NewSI->setAlignment(SI->getAlignment());
  NewSI->setVolatile(SI->isVolatile());
  NewSI->setAtomic(SI->getOrdering(), SI->getSynchScope());
  DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n");
  SI->eraseFromParent();
  return NewSI;
}

bool AtomicExpand::expandAtomicStore(StoreInst *SI) {
  // This function is only called on atomic stores that are too large to be
  // atomic if implemented as a native store. So we replace them by an
  // atomic swap, which can be implemented for example as a ldrex/strex on ARM
  // or lock cmpxchg8/16b on X86, as these are atomic for larger sizes.
  // It is the responsibility of the target to only signal expansion via
  // shouldExpandAtomicRMW in cases where this is required and possible.
  IRBuilder<> Builder(SI);
  AtomicRMWInst *AI =
      Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(),
                              SI->getValueOperand(), SI->getOrdering());
  SI->eraseFromParent();

  // Now that we have an appropriate swap instruction, lower it as usual.
  return tryExpandAtomicRMW(AI);
}

static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr,
                                 Value *Loaded, Value *NewVal,
                                 AtomicOrdering MemOpOrder,
                                 Value *&Success, Value *&NewLoaded) {
  Value *Pair = Builder.CreateAtomicCmpXchg(
      Addr, Loaded, NewVal, MemOpOrder,
      AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
  Success = Builder.CreateExtractValue(Pair, 1, "success");
  NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
}

/// Emit IR to implement the given atomicrmw operation on values in registers,
/// returning the new value.
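///
/// For example, Nand is emitted as ~(Loaded & Inc), and the signed/unsigned
/// min/max operations are emitted as an icmp followed by a select.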
static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder,
                              Value *Loaded, Value *Inc) {
  Value *NewVal;
  switch (Op) {
  case AtomicRMWInst::Xchg:
    return Inc;
  case AtomicRMWInst::Add:
    return Builder.CreateAdd(Loaded, Inc, "new");
  case AtomicRMWInst::Sub:
    return Builder.CreateSub(Loaded, Inc, "new");
  case AtomicRMWInst::And:
    return Builder.CreateAnd(Loaded, Inc, "new");
  case AtomicRMWInst::Nand:
    return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new");
  case AtomicRMWInst::Or:
    return Builder.CreateOr(Loaded, Inc, "new");
  case AtomicRMWInst::Xor:
    return Builder.CreateXor(Loaded, Inc, "new");
  case AtomicRMWInst::Max:
    NewVal = Builder.CreateICmpSGT(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::Min:
    NewVal = Builder.CreateICmpSLE(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::UMax:
    NewVal = Builder.CreateICmpUGT(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::UMin:
    NewVal = Builder.CreateICmpULE(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  default:
    llvm_unreachable("Unknown atomic op");
  }
}

bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
  switch (TLI->shouldExpandAtomicRMWInIR(AI)) {
  case TargetLoweringBase::AtomicExpansionKind::None:
    return false;
  case TargetLoweringBase::AtomicExpansionKind::LLSC:
    return expandAtomicOpToLLSC(AI, AI->getPointerOperand(), AI->getOrdering(),
                                [&](IRBuilder<> &Builder, Value *Loaded) {
                                  return performAtomicOp(AI->getOperation(),
                                                         Builder, Loaded,
                                                         AI->getValOperand());
                                });
  case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
    return expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun);
  default:
    llvm_unreachable("Unhandled case in tryExpandAtomicRMW");
  }
}

bool AtomicExpand::expandAtomicOpToLLSC(
    Instruction *I, Value *Addr, AtomicOrdering MemOpOrder,
    std::function<Value *(IRBuilder<> &, Value *)> PerformOp) {
  BasicBlock *BB = I->getParent();
  Function *F = BB->getParent();
  LLVMContext &Ctx = F->getContext();

  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
  //
  // The standard expansion we produce is:
  //     [...]
  //     fence?
  // atomicrmw.start:
  //     %loaded = @load.linked(%addr)
  //     %new = some_op iN %loaded, %incr
  //     %stored = @store_conditional(%new, %addr)
  //     %try_again = icmp i32 ne %stored, 0
  //     br i1 %try_again, label %loop, label %atomicrmw.end
  // atomicrmw.end:
  //     fence?
  //     [...]
  BasicBlock *ExitBB = BB->splitBasicBlock(I->getIterator(), "atomicrmw.end");
  BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);

  // This grabs the DebugLoc from I.
  IRBuilder<> Builder(I);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we might want a fence too. It's easiest to just remove
  // the branch entirely.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  Builder.CreateBr(LoopBB);

  // Start the main loop block now that we've taken care of the preliminaries.
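  // The loop body matches the sketch above: load-linked the current value,
  // apply PerformOp to compute the new value, then attempt the
  // store-conditional and branch back to the loop if it failed (nonzero).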
  Builder.SetInsertPoint(LoopBB);
  Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);

  Value *NewVal = PerformOp(Builder, Loaded);

  Value *StoreSuccess =
      TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder);
  Value *TryAgain = Builder.CreateICmpNE(
      StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain");
  Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);

  Builder.SetInsertPoint(ExitBB, ExitBB->begin());

  I->replaceAllUsesWith(Loaded);
  I->eraseFromParent();

  return true;
}

/// Convert an atomic cmpxchg of a non-integral type to an integer cmpxchg of
/// the equivalent bitwidth. We used to not support pointer cmpxchg in the
/// IR. As a migration step, we convert back to what used to be the standard
/// way to represent a pointer cmpxchg so that we can update backends one by
/// one.
AtomicCmpXchgInst *AtomicExpand::convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI) {
  auto *M = CI->getModule();
  Type *NewTy = getCorrespondingIntegerType(CI->getCompareOperand()->getType(),
                                            M->getDataLayout());

  IRBuilder<> Builder(CI);

  Value *Addr = CI->getPointerOperand();
  Type *PT = PointerType::get(NewTy,
                              Addr->getType()->getPointerAddressSpace());
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);

  Value *NewCmp = Builder.CreatePtrToInt(CI->getCompareOperand(), NewTy);
  Value *NewNewVal = Builder.CreatePtrToInt(CI->getNewValOperand(), NewTy);

  auto *NewCI = Builder.CreateAtomicCmpXchg(NewAddr, NewCmp, NewNewVal,
                                            CI->getSuccessOrdering(),
                                            CI->getFailureOrdering(),
                                            CI->getSynchScope());
  NewCI->setVolatile(CI->isVolatile());
  NewCI->setWeak(CI->isWeak());
  DEBUG(dbgs() << "Replaced " << *CI << " with " << *NewCI << "\n");

  Value *OldVal = Builder.CreateExtractValue(NewCI, 0);
  Value *Succ = Builder.CreateExtractValue(NewCI, 1);

  OldVal = Builder.CreateIntToPtr(OldVal, CI->getCompareOperand()->getType());

  Value *Res = UndefValue::get(CI->getType());
  Res = Builder.CreateInsertValue(Res, OldVal, 0);
  Res = Builder.CreateInsertValue(Res, Succ, 1);

  CI->replaceAllUsesWith(Res);
  CI->eraseFromParent();
  return NewCI;
}

bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
  AtomicOrdering SuccessOrder = CI->getSuccessOrdering();
  AtomicOrdering FailureOrder = CI->getFailureOrdering();
  Value *Addr = CI->getPointerOperand();
  BasicBlock *BB = CI->getParent();
  Function *F = BB->getParent();
  LLVMContext &Ctx = F->getContext();
  // If shouldInsertFencesForAtomic() returns true, then the target does not
  // want to deal with memory orders, and emitLeading/TrailingFence should take
  // care of everything. Otherwise, emitLeading/TrailingFence are no-op and we
  // should preserve the ordering.
  bool ShouldInsertFencesForAtomic = TLI->shouldInsertFencesForAtomic(CI);
  AtomicOrdering MemOpOrder =
      ShouldInsertFencesForAtomic ? AtomicOrdering::Monotonic : SuccessOrder;

  // In implementations which use a barrier to achieve release semantics, we can
  // delay emitting this barrier until we know a store is actually going to be
  // attempted. The cost of this delay is that we need 2 copies of the block
  // emitting the load-linked, affecting code size.
  //
  // Ideally, this logic would be unconditional except for the minsize check
  // since in other cases the extra blocks naturally collapse down to the
  // minimal loop. Unfortunately, this puts too much stress on later
  // optimisations so we avoid emitting the extra logic in those cases too.
  bool HasReleasedLoadBB = !CI->isWeak() && ShouldInsertFencesForAtomic &&
                           SuccessOrder != AtomicOrdering::Monotonic &&
                           SuccessOrder != AtomicOrdering::Acquire &&
                           !F->optForMinSize();

  // There's no overhead for sinking the release barrier in a weak cmpxchg, so
  // do it even on minsize.
  bool UseUnconditionalReleaseBarrier = F->optForMinSize() && !CI->isWeak();

  // Given: cmpxchg some_op iN* %addr, iN %desired, iN %new success_ord fail_ord
  //
  // The full expansion we produce is:
  //     [...]
  // cmpxchg.start:
  //     %unreleasedload = @load.linked(%addr)
  //     %should_store = icmp eq %unreleasedload, %desired
  //     br i1 %should_store, label %cmpxchg.fencedstore,
  //                          label %cmpxchg.nostore
  // cmpxchg.releasingstore:
  //     fence?
  //     br label cmpxchg.trystore
  // cmpxchg.trystore:
  //     %loaded.trystore = phi [%unreleasedload, %releasingstore],
  //                            [%releasedload, %cmpxchg.releasedload]
  //     %stored = @store_conditional(%new, %addr)
  //     %success = icmp eq i32 %stored, 0
  //     br i1 %success, label %cmpxchg.success,
  //                     label %cmpxchg.releasedload/%cmpxchg.failure
  // cmpxchg.releasedload:
  //     %releasedload = @load.linked(%addr)
  //     %should_store = icmp eq %releasedload, %desired
  //     br i1 %should_store, label %cmpxchg.trystore,
  //                          label %cmpxchg.failure
  // cmpxchg.success:
  //     fence?
  //     br label %cmpxchg.end
  // cmpxchg.nostore:
  //     %loaded.nostore = phi [%unreleasedload, %cmpxchg.start],
  //                           [%releasedload,
  //                               %cmpxchg.releasedload/%cmpxchg.trystore]
  //     @load_linked_fail_balance()?
  //     br label %cmpxchg.failure
  // cmpxchg.failure:
  //     fence?
  //     br label %cmpxchg.end
  // cmpxchg.end:
  //     %loaded = phi [%loaded.nostore, %cmpxchg.failure],
  //                   [%loaded.trystore, %cmpxchg.trystore]
  //     %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure]
  //     %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0
  //     %res = insertvalue { iN, i1 } %restmp, i1 %success, 1
  //     [...]
  BasicBlock *ExitBB = BB->splitBasicBlock(CI->getIterator(), "cmpxchg.end");
  auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB);
  auto NoStoreBB = BasicBlock::Create(Ctx, "cmpxchg.nostore", F, FailureBB);
  auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, NoStoreBB);
  auto ReleasedLoadBB =
      BasicBlock::Create(Ctx, "cmpxchg.releasedload", F, SuccessBB);
  auto TryStoreBB =
      BasicBlock::Create(Ctx, "cmpxchg.trystore", F, ReleasedLoadBB);
  auto ReleasingStoreBB =
      BasicBlock::Create(Ctx, "cmpxchg.fencedstore", F, TryStoreBB);
  auto StartBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, ReleasingStoreBB);

  // This grabs the DebugLoc from CI.
  IRBuilder<> Builder(CI);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we might want a fence too. It's easiest to just remove
  // the branch entirely.
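  // When the release barrier cannot be sunk (UseUnconditionalReleaseBarrier),
  // it is emitted here in BB; otherwise it is emitted later in
  // ReleasingStoreBB, on the path that will actually attempt the store.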
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier)
    TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
                          /*IsLoad=*/true);
  Builder.CreateBr(StartBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(StartBB);
  Value *UnreleasedLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
  Value *ShouldStore = Builder.CreateICmpEQ(
      UnreleasedLoad, CI->getCompareOperand(), "should_store");

  // If the cmpxchg doesn't actually need any ordering when it fails, we can
  // jump straight past that fence instruction (if it exists).
  Builder.CreateCondBr(ShouldStore, ReleasingStoreBB, NoStoreBB);

  Builder.SetInsertPoint(ReleasingStoreBB);
  if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier)
    TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
                          /*IsLoad=*/true);
  Builder.CreateBr(TryStoreBB);

  Builder.SetInsertPoint(TryStoreBB);
  Value *StoreSuccess = TLI->emitStoreConditional(
      Builder, CI->getNewValOperand(), Addr, MemOpOrder);
  StoreSuccess = Builder.CreateICmpEQ(
      StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
  BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB;
  Builder.CreateCondBr(StoreSuccess, SuccessBB,
                       CI->isWeak() ? FailureBB : RetryBB);

  Builder.SetInsertPoint(ReleasedLoadBB);
  Value *SecondLoad;
  if (HasReleasedLoadBB) {
    SecondLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
    ShouldStore = Builder.CreateICmpEQ(SecondLoad, CI->getCompareOperand(),
                                       "should_store");

    // If the cmpxchg doesn't actually need any ordering when it fails, we can
    // jump straight past that fence instruction (if it exists).
    Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB);
  } else
    Builder.CreateUnreachable();

  // Make sure later instructions don't get reordered with a fence if
  // necessary.
  Builder.SetInsertPoint(SuccessBB);
  if (ShouldInsertFencesForAtomic)
    TLI->emitTrailingFence(Builder, SuccessOrder, /*IsStore=*/true,
                           /*IsLoad=*/true);
  Builder.CreateBr(ExitBB);

  Builder.SetInsertPoint(NoStoreBB);
  // In the failing case, where we don't execute the store-conditional, the
  // target might want to balance out the load-linked with a dedicated
  // instruction (e.g., on ARM, clearing the exclusive monitor).
  TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
  Builder.CreateBr(FailureBB);

  Builder.SetInsertPoint(FailureBB);
  if (ShouldInsertFencesForAtomic)
    TLI->emitTrailingFence(Builder, FailureOrder, /*IsStore=*/true,
                           /*IsLoad=*/true);
  Builder.CreateBr(ExitBB);

  // Finally, we have control-flow based knowledge of whether the cmpxchg
  // succeeded or not. We expose this to later passes by converting any
  // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate
  // PHI.
  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
  PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2);
  Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB);
  Success->addIncoming(ConstantInt::getFalse(Ctx), FailureBB);

  // Setup the builder so we can create any PHIs we need.
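  // If the released-load block is in use, there are two definitions of the
  // loaded value, so PHIs are needed in TryStoreBB, NoStoreBB and ExitBB;
  // otherwise the single load-linked result can be used directly.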
  Value *Loaded;
  if (!HasReleasedLoadBB)
    Loaded = UnreleasedLoad;
  else {
    Builder.SetInsertPoint(TryStoreBB, TryStoreBB->begin());
    PHINode *TryStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
    TryStoreLoaded->addIncoming(UnreleasedLoad, ReleasingStoreBB);
    TryStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);

    Builder.SetInsertPoint(NoStoreBB, NoStoreBB->begin());
    PHINode *NoStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
    NoStoreLoaded->addIncoming(UnreleasedLoad, StartBB);
    NoStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);

    Builder.SetInsertPoint(ExitBB, ++ExitBB->begin());
    PHINode *ExitLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
    ExitLoaded->addIncoming(TryStoreLoaded, SuccessBB);
    ExitLoaded->addIncoming(NoStoreLoaded, FailureBB);

    Loaded = ExitLoaded;
  }

  // Look for any users of the cmpxchg that are just comparing the loaded value
  // against the desired one, and replace them with the CFG-derived version.
  SmallVector<ExtractValueInst *, 2> PrunedInsts;
  for (auto User : CI->users()) {
    ExtractValueInst *EV = dyn_cast<ExtractValueInst>(User);
    if (!EV)
      continue;

    assert(EV->getNumIndices() == 1 && EV->getIndices()[0] <= 1 &&
           "weird extraction from { iN, i1 }");

    if (EV->getIndices()[0] == 0)
      EV->replaceAllUsesWith(Loaded);
    else
      EV->replaceAllUsesWith(Success);

    PrunedInsts.push_back(EV);
  }

  // We can remove the instructions now that we're no longer iterating through
  // them.
  for (auto EV : PrunedInsts)
    EV->eraseFromParent();

  if (!CI->use_empty()) {
    // Some use of the full struct return that we don't understand has happened,
    // so we've got to reconstruct it properly.
    Value *Res;
    Res = Builder.CreateInsertValue(UndefValue::get(CI->getType()), Loaded, 0);
    Res = Builder.CreateInsertValue(Res, Success, 1);

    CI->replaceAllUsesWith(Res);
  }

  CI->eraseFromParent();
  return true;
}

bool AtomicExpand::isIdempotentRMW(AtomicRMWInst* RMWI) {
  auto C = dyn_cast<ConstantInt>(RMWI->getValOperand());
  if (!C)
    return false;

  AtomicRMWInst::BinOp Op = RMWI->getOperation();
  switch (Op) {
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
    return C->isZero();
  case AtomicRMWInst::And:
    return C->isMinusOne();
  // FIXME: we could also treat Min/Max/UMin/UMax by the INT_MIN/INT_MAX/...
  default:
    return false;
  }
}

bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst* RMWI) {
  if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) {
    tryExpandAtomicLoad(ResultingLoad);
    return true;
  }
  return false;
}

bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
                                    CreateCmpXchgInstFun CreateCmpXchg) {
  assert(AI);

  AtomicOrdering MemOpOrder = AI->getOrdering() == AtomicOrdering::Unordered
                                  ? AtomicOrdering::Monotonic
                                  : AI->getOrdering();
  Value *Addr = AI->getPointerOperand();
  BasicBlock *BB = AI->getParent();
  Function *F = BB->getParent();
  LLVMContext &Ctx = F->getContext();

  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
  //
  // The standard expansion we produce is:
  //     [...]
  //     %init_loaded = load atomic iN* %addr
  //     br label %loop
  // loop:
  //     %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ]
  //     %new = some_op iN %loaded, %incr
  //     %pair = cmpxchg iN* %addr, iN %loaded, iN %new
  //     %new_loaded = extractvalue { iN, i1 } %pair, 0
  //     %success = extractvalue { iN, i1 } %pair, 1
  //     br i1 %success, label %atomicrmw.end, label %loop
  // atomicrmw.end:
  //     [...]
  BasicBlock *ExitBB = BB->splitBasicBlock(AI->getIterator(), "atomicrmw.end");
  BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);

  // This grabs the DebugLoc from AI.
  IRBuilder<> Builder(AI);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we want a load. It's easiest to just remove
  // the branch entirely.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  LoadInst *InitLoaded = Builder.CreateLoad(Addr);
  // Atomics require at least natural alignment.
  InitLoaded->setAlignment(AI->getType()->getPrimitiveSizeInBits() / 8);
  Builder.CreateBr(LoopBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(LoopBB);
  PHINode *Loaded = Builder.CreatePHI(AI->getType(), 2, "loaded");
  Loaded->addIncoming(InitLoaded, BB);

  Value *NewVal =
      performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand());

  Value *NewLoaded = nullptr;
  Value *Success = nullptr;

  CreateCmpXchg(Builder, Addr, Loaded, NewVal, MemOpOrder,
                Success, NewLoaded);
  assert(Success && NewLoaded);

  Loaded->addIncoming(NewLoaded, LoopBB);

  Builder.CreateCondBr(Success, ExitBB, LoopBB);

  Builder.SetInsertPoint(ExitBB, ExitBB->begin());

  AI->replaceAllUsesWith(NewLoaded);
  AI->eraseFromParent();

  return true;
}