//===-- AtomicExpandPass.cpp - Expand atomic instructions -------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass (at IR level) to replace atomic instructions with
// target specific instructions which implement the same semantics in a way
// which better fits the target backend. This can include the use of either
// (intrinsic-based) load-linked/store-conditional loops, AtomicCmpXchg, or
// type coercions.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/AtomicExpandUtils.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSubtargetInfo.h"

using namespace llvm;

#define DEBUG_TYPE "atomic-expand"

namespace {
  class AtomicExpand: public FunctionPass {
    const TargetMachine *TM;
    const TargetLowering *TLI;
  public:
    static char ID; // Pass identification, replacement for typeid
    explicit AtomicExpand(const TargetMachine *TM = nullptr)
        : FunctionPass(ID), TM(TM), TLI(nullptr) {
      initializeAtomicExpandPass(*PassRegistry::getPassRegistry());
    }

    bool runOnFunction(Function &F) override;

  private:
    bool bracketInstWithFences(Instruction *I, AtomicOrdering Order,
                               bool IsStore, bool IsLoad);
    IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
    LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
    bool tryExpandAtomicLoad(LoadInst *LI);
    bool expandAtomicLoadToLL(LoadInst *LI);
    bool expandAtomicLoadToCmpXchg(LoadInst *LI);
    StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI);
    bool expandAtomicStore(StoreInst *SI);
    bool tryExpandAtomicRMW(AtomicRMWInst *AI);
    bool expandAtomicOpToLLSC(
        Instruction *I, Value *Addr, AtomicOrdering MemOpOrder,
        std::function<Value *(IRBuilder<> &, Value *)> PerformOp);
    AtomicCmpXchgInst *convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI);
    bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
    bool isIdempotentRMW(AtomicRMWInst *AI);
    bool simplifyIdempotentRMW(AtomicRMWInst *AI);
  };
}

char AtomicExpand::ID = 0;
char &llvm::AtomicExpandID = AtomicExpand::ID;
INITIALIZE_TM_PASS(AtomicExpand, "atomic-expand",
                   "Expand Atomic calls in terms of either load-linked & store-conditional or cmpxchg",
                   false, false)

FunctionPass *llvm::createAtomicExpandPass(const TargetMachine *TM) {
  return new AtomicExpand(TM);
}

bool AtomicExpand::runOnFunction(Function &F) {
  if (!TM || !TM->getSubtargetImpl(F)->enableAtomicExpand())
    return false;
  TLI = TM->getSubtargetImpl(F)->getTargetLowering();

  SmallVector<Instruction *, 1> AtomicInsts;

  // Changing control-flow while iterating through it is a bad idea, so gather a
  // list of all atomic instructions before we start.
  for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
    if (I->isAtomic())
      AtomicInsts.push_back(&*I);
  }

  bool MadeChange = false;
  for (auto I : AtomicInsts) {
    auto LI = dyn_cast<LoadInst>(I);
    auto SI = dyn_cast<StoreInst>(I);
    auto RMWI = dyn_cast<AtomicRMWInst>(I);
    auto CASI = dyn_cast<AtomicCmpXchgInst>(I);
    assert((LI || SI || RMWI || CASI || isa<FenceInst>(I)) &&
           "Unknown atomic instruction");

    if (TLI->getInsertFencesForAtomic()) {
      auto FenceOrdering = Monotonic;
      bool IsStore, IsLoad;
      if (LI && isAtLeastAcquire(LI->getOrdering())) {
        FenceOrdering = LI->getOrdering();
        LI->setOrdering(Monotonic);
        IsStore = false;
        IsLoad = true;
      } else if (SI && isAtLeastRelease(SI->getOrdering())) {
        FenceOrdering = SI->getOrdering();
        SI->setOrdering(Monotonic);
        IsStore = true;
        IsLoad = false;
      } else if (RMWI && (isAtLeastRelease(RMWI->getOrdering()) ||
                          isAtLeastAcquire(RMWI->getOrdering()))) {
        FenceOrdering = RMWI->getOrdering();
        RMWI->setOrdering(Monotonic);
        IsStore = IsLoad = true;
      } else if (CASI && !TLI->shouldExpandAtomicCmpXchgInIR(CASI) &&
                 (isAtLeastRelease(CASI->getSuccessOrdering()) ||
                  isAtLeastAcquire(CASI->getSuccessOrdering()))) {
        // If a compare and swap is lowered to LL/SC, we can do smarter fence
        // insertion, with a stronger one on the success path than on the
        // failure path. As a result, fence insertion is directly done by
        // expandAtomicCmpXchg in that case.
        FenceOrdering = CASI->getSuccessOrdering();
        CASI->setSuccessOrdering(Monotonic);
        CASI->setFailureOrdering(Monotonic);
        IsStore = IsLoad = true;
      }

      if (FenceOrdering != Monotonic) {
        MadeChange |= bracketInstWithFences(I, FenceOrdering, IsStore, IsLoad);
      }
    }

    if (LI) {
      if (LI->getType()->isFloatingPointTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        LI = convertAtomicLoadToIntegerType(LI);
        assert(LI->getType()->isIntegerTy() && "invariant broken");
        MadeChange = true;
      }

      MadeChange |= tryExpandAtomicLoad(LI);
    } else if (SI) {
      if (SI->getValueOperand()->getType()->isFloatingPointTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        SI = convertAtomicStoreToIntegerType(SI);
        assert(SI->getValueOperand()->getType()->isIntegerTy() &&
               "invariant broken");
        MadeChange = true;
      }

      if (TLI->shouldExpandAtomicStoreInIR(SI))
        MadeChange |= expandAtomicStore(SI);
    } else if (RMWI) {
      // There are two different ways of expanding RMW instructions:
      // - into a load if it is idempotent
      // - into a Cmpxchg/LL-SC loop otherwise
      // we try them in that order.

      if (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) {
        MadeChange = true;
      } else {
        MadeChange |= tryExpandAtomicRMW(RMWI);
      }
    } else if (CASI) {
      // TODO: when we're ready to make the change at the IR level, we can
      // extend convertCmpXchgToInteger for floating point too.
      assert(!CASI->getCompareOperand()->getType()->isFloatingPointTy() &&
             "unimplemented - floating point not legal at IR level");
      if (CASI->getCompareOperand()->getType()->isPointerTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        CASI = convertCmpXchgToIntegerType(CASI);
        assert(CASI->getCompareOperand()->getType()->isIntegerTy() &&
               "invariant broken");
        MadeChange = true;
      }

      if (TLI->shouldExpandAtomicCmpXchgInIR(CASI))
        MadeChange |= expandAtomicCmpXchg(CASI);
    }
  }
  return MadeChange;
}

bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order,
                                         bool IsStore, bool IsLoad) {
  IRBuilder<> Builder(I);

  auto LeadingFence = TLI->emitLeadingFence(Builder, Order, IsStore, IsLoad);

  auto TrailingFence = TLI->emitTrailingFence(Builder, Order, IsStore, IsLoad);
  // The trailing fence is emitted before the instruction instead of after
  // because there is no easy way of setting the Builder's insertion point
  // after an instruction. So we must erase it from the BB, and insert it back
  // in the right place.
  // We have a guard here because not every atomic operation generates a
  // trailing fence.
  if (TrailingFence) {
    TrailingFence->removeFromParent();
    TrailingFence->insertAfter(I);
  }

  return (LeadingFence || TrailingFence);
}

/// Get the iX type with the same bitwidth as T.
IntegerType *AtomicExpand::getCorrespondingIntegerType(Type *T,
                                                       const DataLayout &DL) {
  EVT VT = TLI->getValueType(DL, T);
  unsigned BitWidth = VT.getStoreSizeInBits();
  assert(BitWidth == VT.getSizeInBits() && "must be a power of two");
  return IntegerType::get(T->getContext(), BitWidth);
}

/// Convert an atomic load of a non-integral type to an integer load of the
/// equivalent bitwidth. See the function comment on
/// convertAtomicStoreToIntegerType for background.
LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) {
  auto *M = LI->getModule();
  Type *NewTy = getCorrespondingIntegerType(LI->getType(),
                                            M->getDataLayout());

  IRBuilder<> Builder(LI);

  Value *Addr = LI->getPointerOperand();
  Type *PT = PointerType::get(NewTy,
                              Addr->getType()->getPointerAddressSpace());
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);

  auto *NewLI = Builder.CreateLoad(NewAddr);
  NewLI->setAlignment(LI->getAlignment());
  NewLI->setVolatile(LI->isVolatile());
  NewLI->setAtomic(LI->getOrdering(), LI->getSynchScope());
  DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");

  Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType());
  LI->replaceAllUsesWith(NewVal);
  LI->eraseFromParent();
  return NewLI;
}

bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) {
  switch (TLI->shouldExpandAtomicLoadInIR(LI)) {
  case TargetLoweringBase::AtomicExpansionKind::None:
    return false;
  case TargetLoweringBase::AtomicExpansionKind::LLSC:
    return expandAtomicOpToLLSC(
        LI, LI->getPointerOperand(), LI->getOrdering(),
        [](IRBuilder<> &Builder, Value *Loaded) { return Loaded; });
  case TargetLoweringBase::AtomicExpansionKind::LLOnly:
    return expandAtomicLoadToLL(LI);
  case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
    return expandAtomicLoadToCmpXchg(LI);
  }
  llvm_unreachable("Unhandled case in tryExpandAtomicLoad");
}

bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) {
  IRBuilder<> Builder(LI);

  // On some architectures, load-linked instructions are atomic for larger
  // sizes than normal loads. For example, the only 64-bit load guaranteed
  // to be single-copy atomic by ARM is an ldrexd (A3.5.3).
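  // Note that no store-conditional is paired with this load-linked; the
  // emitAtomicCmpXchgNoStoreLLBalance call below gives the target a chance to
  // balance the bare load-linked if it needs to (e.g. by clearing the
  // exclusive monitor).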
  Value *Val =
      TLI->emitLoadLinked(Builder, LI->getPointerOperand(), LI->getOrdering());
  TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);

  LI->replaceAllUsesWith(Val);
  LI->eraseFromParent();

  return true;
}

bool AtomicExpand::expandAtomicLoadToCmpXchg(LoadInst *LI) {
  IRBuilder<> Builder(LI);
  AtomicOrdering Order = LI->getOrdering();
  Value *Addr = LI->getPointerOperand();
  Type *Ty = cast<PointerType>(Addr->getType())->getElementType();
  Constant *DummyVal = Constant::getNullValue(Ty);

  Value *Pair = Builder.CreateAtomicCmpXchg(
      Addr, DummyVal, DummyVal, Order,
      AtomicCmpXchgInst::getStrongestFailureOrdering(Order));
  Value *Loaded = Builder.CreateExtractValue(Pair, 0, "loaded");

  LI->replaceAllUsesWith(Loaded);
  LI->eraseFromParent();

  return true;
}

/// Convert an atomic store of a non-integral type to an integer store of the
/// equivalent bitwidth. We used to not support floating point or vector
/// atomics in the IR at all. The backends learned to deal with the bitcast
/// idiom because that was the only way of expressing the notion of an atomic
/// float or vector store. The long term plan is to teach each backend to
/// instruction select from the original atomic store, but as a migration
/// mechanism, we convert back to the old format which the backends understand.
/// Each backend will need individual work to recognize the new format.
StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) {
  IRBuilder<> Builder(SI);
  auto *M = SI->getModule();
  Type *NewTy = getCorrespondingIntegerType(SI->getValueOperand()->getType(),
                                            M->getDataLayout());
  Value *NewVal = Builder.CreateBitCast(SI->getValueOperand(), NewTy);

  Value *Addr = SI->getPointerOperand();
  Type *PT = PointerType::get(NewTy,
                              Addr->getType()->getPointerAddressSpace());
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);

  StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr);
  NewSI->setAlignment(SI->getAlignment());
  NewSI->setVolatile(SI->isVolatile());
  NewSI->setAtomic(SI->getOrdering(), SI->getSynchScope());
  DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n");
  SI->eraseFromParent();
  return NewSI;
}

bool AtomicExpand::expandAtomicStore(StoreInst *SI) {
  // This function is only called on atomic stores that are too large to be
  // atomic if implemented as a native store. So we replace them by an
  // atomic swap, which can be implemented for example as a ldrex/strex on ARM
  // or lock cmpxchg8/16b on X86, as these are atomic for larger sizes.
  // It is the responsibility of the target to only signal expansion via
  // shouldExpandAtomicRMW in cases where this is required and possible.
  IRBuilder<> Builder(SI);
  AtomicRMWInst *AI =
      Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(),
                              SI->getValueOperand(), SI->getOrdering());
  SI->eraseFromParent();

  // Now that we have an appropriate swap instruction, lower it as usual.
  return tryExpandAtomicRMW(AI);
}

static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr,
                                 Value *Loaded, Value *NewVal,
                                 AtomicOrdering MemOpOrder,
                                 Value *&Success, Value *&NewLoaded) {
  Value *Pair = Builder.CreateAtomicCmpXchg(
      Addr, Loaded, NewVal, MemOpOrder,
      AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
  Success = Builder.CreateExtractValue(Pair, 1, "success");
  NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
}

/// Emit IR to implement the given atomicrmw operation on values in registers,
/// returning the new value.
static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder,
                              Value *Loaded, Value *Inc) {
  Value *NewVal;
  switch (Op) {
  case AtomicRMWInst::Xchg:
    return Inc;
  case AtomicRMWInst::Add:
    return Builder.CreateAdd(Loaded, Inc, "new");
  case AtomicRMWInst::Sub:
    return Builder.CreateSub(Loaded, Inc, "new");
  case AtomicRMWInst::And:
    return Builder.CreateAnd(Loaded, Inc, "new");
  case AtomicRMWInst::Nand:
    return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new");
  case AtomicRMWInst::Or:
    return Builder.CreateOr(Loaded, Inc, "new");
  case AtomicRMWInst::Xor:
    return Builder.CreateXor(Loaded, Inc, "new");
  case AtomicRMWInst::Max:
    NewVal = Builder.CreateICmpSGT(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::Min:
    NewVal = Builder.CreateICmpSLE(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::UMax:
    NewVal = Builder.CreateICmpUGT(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::UMin:
    NewVal = Builder.CreateICmpULE(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  default:
    llvm_unreachable("Unknown atomic op");
  }
}

bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
  switch (TLI->shouldExpandAtomicRMWInIR(AI)) {
  case TargetLoweringBase::AtomicExpansionKind::None:
    return false;
  case TargetLoweringBase::AtomicExpansionKind::LLSC:
    return expandAtomicOpToLLSC(AI, AI->getPointerOperand(), AI->getOrdering(),
                                [&](IRBuilder<> &Builder, Value *Loaded) {
                                  return performAtomicOp(AI->getOperation(),
                                                         Builder, Loaded,
                                                         AI->getValOperand());
                                });
  case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
    return expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun);
  default:
    llvm_unreachable("Unhandled case in tryExpandAtomicRMW");
  }
}

bool AtomicExpand::expandAtomicOpToLLSC(
    Instruction *I, Value *Addr, AtomicOrdering MemOpOrder,
    std::function<Value *(IRBuilder<> &, Value *)> PerformOp) {
  BasicBlock *BB = I->getParent();
  Function *F = BB->getParent();
  LLVMContext &Ctx = F->getContext();

  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
  //
  // The standard expansion we produce is:
  //     [...]
  //     fence?
  // atomicrmw.start:
  //     %loaded = @load.linked(%addr)
  //     %new = some_op iN %loaded, %incr
  //     %stored = @store_conditional(%new, %addr)
  //     %try_again = icmp i32 ne %stored, 0
  //     br i1 %try_again, label %atomicrmw.start, label %atomicrmw.end
  // atomicrmw.end:
  //     fence?
  //     [...]
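  //
  // The "fence?" markers are not emitted by this function: when the target
  // asks for IR-level fences (getInsertFencesForAtomic), runOnFunction has
  // already bracketed the original instruction via bracketInstWithFences;
  // otherwise MemOpOrder is passed straight to emitLoadLinked and
  // emitStoreConditional so the target can pick suitably ordered variants.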
  BasicBlock *ExitBB = BB->splitBasicBlock(I->getIterator(), "atomicrmw.end");
  BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);

  // This grabs the DebugLoc from I.
  IRBuilder<> Builder(I);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we might want a fence too. It's easiest to just remove
  // the branch entirely.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  Builder.CreateBr(LoopBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(LoopBB);
  Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);

  Value *NewVal = PerformOp(Builder, Loaded);

  Value *StoreSuccess =
      TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder);
  Value *TryAgain = Builder.CreateICmpNE(
      StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain");
  Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);

  Builder.SetInsertPoint(ExitBB, ExitBB->begin());

  I->replaceAllUsesWith(Loaded);
  I->eraseFromParent();

  return true;
}

/// Convert an atomic cmpxchg of a non-integral type to an integer cmpxchg of
/// the equivalent bitwidth. We used to not support pointer cmpxchg in the
/// IR. As a migration step, we convert back to what used to be the standard
/// way to represent a pointer cmpxchg so that we can update backends one by
/// one.
AtomicCmpXchgInst *AtomicExpand::convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI) {
  auto *M = CI->getModule();
  Type *NewTy = getCorrespondingIntegerType(CI->getCompareOperand()->getType(),
                                            M->getDataLayout());

  IRBuilder<> Builder(CI);

  Value *Addr = CI->getPointerOperand();
  Type *PT = PointerType::get(NewTy,
                              Addr->getType()->getPointerAddressSpace());
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);

  Value *NewCmp = Builder.CreatePtrToInt(CI->getCompareOperand(), NewTy);
  Value *NewNewVal = Builder.CreatePtrToInt(CI->getNewValOperand(), NewTy);

  auto *NewCI = Builder.CreateAtomicCmpXchg(NewAddr, NewCmp, NewNewVal,
                                            CI->getSuccessOrdering(),
                                            CI->getFailureOrdering(),
                                            CI->getSynchScope());
  NewCI->setVolatile(CI->isVolatile());
  NewCI->setWeak(CI->isWeak());
  DEBUG(dbgs() << "Replaced " << *CI << " with " << *NewCI << "\n");

  Value *OldVal = Builder.CreateExtractValue(NewCI, 0);
  Value *Succ = Builder.CreateExtractValue(NewCI, 1);

  OldVal = Builder.CreateIntToPtr(OldVal, CI->getCompareOperand()->getType());

  Value *Res = UndefValue::get(CI->getType());
  Res = Builder.CreateInsertValue(Res, OldVal, 0);
  Res = Builder.CreateInsertValue(Res, Succ, 1);

  CI->replaceAllUsesWith(Res);
  CI->eraseFromParent();
  return NewCI;
}

bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
  AtomicOrdering SuccessOrder = CI->getSuccessOrdering();
  AtomicOrdering FailureOrder = CI->getFailureOrdering();
  Value *Addr = CI->getPointerOperand();
  BasicBlock *BB = CI->getParent();
  Function *F = BB->getParent();
  LLVMContext &Ctx = F->getContext();
  // If getInsertFencesForAtomic() returns true, then the target does not want
  // to deal with memory orders, and emitLeading/TrailingFence should take care
  // of everything. Otherwise, emitLeading/TrailingFence are no-op and we
  // should preserve the ordering.
  AtomicOrdering MemOpOrder =
      TLI->getInsertFencesForAtomic() ? Monotonic : SuccessOrder;

  // In implementations which use a barrier to achieve release semantics, we can
  // delay emitting this barrier until we know a store is actually going to be
  // attempted. The cost of this delay is that we need 2 copies of the block
  // emitting the load-linked, affecting code size.
  //
  // Ideally, this logic would be unconditional except for the minsize check
  // since in other cases the extra blocks naturally collapse down to the
  // minimal loop. Unfortunately, this puts too much stress on later
  // optimisations so we avoid emitting the extra logic in those cases too.
  bool HasReleasedLoadBB = !CI->isWeak() && TLI->getInsertFencesForAtomic() &&
                           SuccessOrder != Monotonic &&
                           SuccessOrder != Acquire && !F->optForMinSize();

  // There's no overhead for sinking the release barrier in a weak cmpxchg, so
  // do it even on minsize.
  bool UseUnconditionalReleaseBarrier = F->optForMinSize() && !CI->isWeak();

  // Given: cmpxchg some_op iN* %addr, iN %desired, iN %new success_ord fail_ord
  //
  // The full expansion we produce is:
  //     [...]
  // cmpxchg.start:
  //     %unreleasedload = @load.linked(%addr)
  //     %should_store = icmp eq %unreleasedload, %desired
  //     br i1 %should_store, label %cmpxchg.fencedstore,
  //                          label %cmpxchg.nostore
  // cmpxchg.fencedstore:
  //     fence?
  //     br label %cmpxchg.trystore
  // cmpxchg.trystore:
  //     %loaded.trystore = phi [%unreleasedload, %cmpxchg.fencedstore],
  //                            [%releasedload, %cmpxchg.releasedload]
  //     %stored = @store_conditional(%new, %addr)
  //     %success = icmp eq i32 %stored, 0
  //     br i1 %success, label %cmpxchg.success,
  //                     label %cmpxchg.releasedload/%cmpxchg.failure
  // cmpxchg.releasedload:
  //     %releasedload = @load.linked(%addr)
  //     %should_store = icmp eq %releasedload, %desired
  //     br i1 %should_store, label %cmpxchg.trystore,
  //                          label %cmpxchg.failure
  // cmpxchg.success:
  //     fence?
  //     br label %cmpxchg.end
  // cmpxchg.nostore:
  //     %loaded.nostore = phi [%unreleasedload, %cmpxchg.start],
  //                           [%releasedload,
  //                               %cmpxchg.releasedload/%cmpxchg.trystore]
  //     @load_linked_fail_balance()?
  //     br label %cmpxchg.failure
  // cmpxchg.failure:
  //     fence?
  //     br label %cmpxchg.end
  // cmpxchg.end:
  //     %loaded = phi [%loaded.nostore, %cmpxchg.failure],
  //                   [%loaded.trystore, %cmpxchg.trystore]
  //     %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure]
  //     %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0
  //     %res = insertvalue { iN, i1 } %restmp, i1 %success, 1
  //     [...]
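  //
  // When HasReleasedLoadBB is false, cmpxchg.releasedload is still created but
  // is left unreachable, and a failed store-conditional instead retries from
  // cmpxchg.start.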
  BasicBlock *ExitBB = BB->splitBasicBlock(CI->getIterator(), "cmpxchg.end");
  auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB);
  auto NoStoreBB = BasicBlock::Create(Ctx, "cmpxchg.nostore", F, FailureBB);
  auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, NoStoreBB);
  auto ReleasedLoadBB =
      BasicBlock::Create(Ctx, "cmpxchg.releasedload", F, SuccessBB);
  auto TryStoreBB =
      BasicBlock::Create(Ctx, "cmpxchg.trystore", F, ReleasedLoadBB);
  auto ReleasingStoreBB =
      BasicBlock::Create(Ctx, "cmpxchg.fencedstore", F, TryStoreBB);
  auto StartBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, ReleasingStoreBB);

  // This grabs the DebugLoc from CI.
  IRBuilder<> Builder(CI);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we might want a fence too. It's easiest to just remove
  // the branch entirely.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  if (UseUnconditionalReleaseBarrier)
    TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
                          /*IsLoad=*/true);
  Builder.CreateBr(StartBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(StartBB);
  Value *UnreleasedLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
  Value *ShouldStore = Builder.CreateICmpEQ(
      UnreleasedLoad, CI->getCompareOperand(), "should_store");

  // If the cmpxchg doesn't actually need any ordering when it fails, we can
  // jump straight past that fence instruction (if it exists).
  Builder.CreateCondBr(ShouldStore, ReleasingStoreBB, NoStoreBB);

  Builder.SetInsertPoint(ReleasingStoreBB);
  if (!UseUnconditionalReleaseBarrier)
    TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
                          /*IsLoad=*/true);
  Builder.CreateBr(TryStoreBB);

  Builder.SetInsertPoint(TryStoreBB);
  Value *StoreSuccess = TLI->emitStoreConditional(
      Builder, CI->getNewValOperand(), Addr, MemOpOrder);
  StoreSuccess = Builder.CreateICmpEQ(
      StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
  BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB;
  Builder.CreateCondBr(StoreSuccess, SuccessBB,
                       CI->isWeak() ? FailureBB : RetryBB);

  Builder.SetInsertPoint(ReleasedLoadBB);
  Value *SecondLoad;
  if (HasReleasedLoadBB) {
    SecondLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
    ShouldStore = Builder.CreateICmpEQ(SecondLoad, CI->getCompareOperand(),
                                       "should_store");

    // If the cmpxchg doesn't actually need any ordering when it fails, we can
    // jump straight past that fence instruction (if it exists).
    Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB);
  } else
    Builder.CreateUnreachable();

  // Make sure later instructions don't get reordered with a fence if
  // necessary.
  Builder.SetInsertPoint(SuccessBB);
  TLI->emitTrailingFence(Builder, SuccessOrder, /*IsStore=*/true,
                         /*IsLoad=*/true);
  Builder.CreateBr(ExitBB);

  Builder.SetInsertPoint(NoStoreBB);
  // In the failing case, where we don't execute the store-conditional, the
  // target might want to balance out the load-linked with a dedicated
  // instruction (e.g., on ARM, clearing the exclusive monitor).
  TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
  Builder.CreateBr(FailureBB);

  Builder.SetInsertPoint(FailureBB);
  TLI->emitTrailingFence(Builder, FailureOrder, /*IsStore=*/true,
                         /*IsLoad=*/true);
  Builder.CreateBr(ExitBB);

  // Finally, we have control-flow based knowledge of whether the cmpxchg
  // succeeded or not. We expose this to later passes by converting any
  // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate
  // PHI.
  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
  PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2);
  Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB);
  Success->addIncoming(ConstantInt::getFalse(Ctx), FailureBB);

  // Set up the builder so we can create any PHIs we need.
  Value *Loaded;
  if (!HasReleasedLoadBB)
    Loaded = UnreleasedLoad;
  else {
    Builder.SetInsertPoint(TryStoreBB, TryStoreBB->begin());
    PHINode *TryStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
    TryStoreLoaded->addIncoming(UnreleasedLoad, ReleasingStoreBB);
    TryStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);

    Builder.SetInsertPoint(NoStoreBB, NoStoreBB->begin());
    PHINode *NoStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
    NoStoreLoaded->addIncoming(UnreleasedLoad, StartBB);
    NoStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);

    Builder.SetInsertPoint(ExitBB, ++ExitBB->begin());
    PHINode *ExitLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
    ExitLoaded->addIncoming(TryStoreLoaded, SuccessBB);
    ExitLoaded->addIncoming(NoStoreLoaded, FailureBB);

    Loaded = ExitLoaded;
  }

  // Look for any users of the cmpxchg that are just comparing the loaded value
  // against the desired one, and replace them with the CFG-derived version.
  SmallVector<ExtractValueInst *, 2> PrunedInsts;
  for (auto User : CI->users()) {
    ExtractValueInst *EV = dyn_cast<ExtractValueInst>(User);
    if (!EV)
      continue;

    assert(EV->getNumIndices() == 1 && EV->getIndices()[0] <= 1 &&
           "weird extraction from { iN, i1 }");

    if (EV->getIndices()[0] == 0)
      EV->replaceAllUsesWith(Loaded);
    else
      EV->replaceAllUsesWith(Success);

    PrunedInsts.push_back(EV);
  }

  // We can remove the instructions now that we're no longer iterating through
  // them.
  for (auto EV : PrunedInsts)
    EV->eraseFromParent();

  if (!CI->use_empty()) {
    // Some use of the full struct return that we don't understand has
    // happened, so we've got to reconstruct it properly.
    Value *Res;
    Res = Builder.CreateInsertValue(UndefValue::get(CI->getType()), Loaded, 0);
    Res = Builder.CreateInsertValue(Res, Success, 1);

    CI->replaceAllUsesWith(Res);
  }

  CI->eraseFromParent();
  return true;
}

bool AtomicExpand::isIdempotentRMW(AtomicRMWInst *RMWI) {
  auto C = dyn_cast<ConstantInt>(RMWI->getValOperand());
  if (!C)
    return false;

  AtomicRMWInst::BinOp Op = RMWI->getOperation();
  switch (Op) {
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
    return C->isZero();
  case AtomicRMWInst::And:
    return C->isMinusOne();
  // FIXME: we could also treat Min/Max/UMin/UMax by the INT_MIN/INT_MAX/...
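  // (For example, a max with INT_MIN or a umin with UINT_MAX would also leave
  // the stored value unchanged.)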
  default:
    return false;
  }
}

bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst *RMWI) {
  if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) {
    tryExpandAtomicLoad(ResultingLoad);
    return true;
  }
  return false;
}

bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
                                    CreateCmpXchgInstFun CreateCmpXchg) {
  assert(AI);

  AtomicOrdering MemOpOrder =
      AI->getOrdering() == Unordered ? Monotonic : AI->getOrdering();
  Value *Addr = AI->getPointerOperand();
  BasicBlock *BB = AI->getParent();
  Function *F = BB->getParent();
  LLVMContext &Ctx = F->getContext();

  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
  //
  // The standard expansion we produce is:
  //     [...]
  //     %init_loaded = load atomic iN* %addr
  //     br label %loop
  // loop:
  //     %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ]
  //     %new = some_op iN %loaded, %incr
  //     %pair = cmpxchg iN* %addr, iN %loaded, iN %new
  //     %new_loaded = extractvalue { iN, i1 } %pair, 0
  //     %success = extractvalue { iN, i1 } %pair, 1
  //     br i1 %success, label %atomicrmw.end, label %loop
  // atomicrmw.end:
  //     [...]
  BasicBlock *ExitBB = BB->splitBasicBlock(AI->getIterator(), "atomicrmw.end");
  BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);

  // This grabs the DebugLoc from AI.
  IRBuilder<> Builder(AI);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we want a load. It's easiest to just remove
  // the branch entirely.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  LoadInst *InitLoaded = Builder.CreateLoad(Addr);
  // Atomics require at least natural alignment.
  InitLoaded->setAlignment(AI->getType()->getPrimitiveSizeInBits() / 8);
  Builder.CreateBr(LoopBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(LoopBB);
  PHINode *Loaded = Builder.CreatePHI(AI->getType(), 2, "loaded");
  Loaded->addIncoming(InitLoaded, BB);

  Value *NewVal =
      performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand());

  Value *NewLoaded = nullptr;
  Value *Success = nullptr;

  CreateCmpXchg(Builder, Addr, Loaded, NewVal, MemOpOrder,
                Success, NewLoaded);
  assert(Success && NewLoaded);

  Loaded->addIncoming(NewLoaded, LoopBB);

  Builder.CreateCondBr(Success, ExitBB, LoopBB);

  Builder.SetInsertPoint(ExitBB, ExitBB->begin());

  AI->replaceAllUsesWith(NewLoaded);
  AI->eraseFromParent();

  return true;
}