//===-- AtomicExpandPass.cpp - Expand atomic instructions -------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass (at IR level) to replace atomic instructions with
// __atomic_* library calls, or target-specific instructions which implement
// the same semantics in a way which better fits the target backend. This can
// include the use of (intrinsic-based) load-linked/store-conditional loops,
// AtomicCmpXchg, or type coercions.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/AtomicExpandUtils.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSubtargetInfo.h"

using namespace llvm;

#define DEBUG_TYPE "atomic-expand"

namespace {
  class AtomicExpand: public FunctionPass {
    const TargetMachine *TM;
    const TargetLowering *TLI;

  public:
    static char ID; // Pass identification, replacement for typeid
    explicit AtomicExpand(const TargetMachine *TM = nullptr)
        : FunctionPass(ID), TM(TM), TLI(nullptr) {
      initializeAtomicExpandPass(*PassRegistry::getPassRegistry());
    }

    bool runOnFunction(Function &F) override;

  private:
    bool bracketInstWithFences(Instruction *I, AtomicOrdering Order,
                               bool IsStore, bool IsLoad);
    IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
    LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
    bool tryExpandAtomicLoad(LoadInst *LI);
    bool expandAtomicLoadToLL(LoadInst *LI);
    bool expandAtomicLoadToCmpXchg(LoadInst *LI);
    StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI);
    bool expandAtomicStore(StoreInst *SI);
    bool tryExpandAtomicRMW(AtomicRMWInst *AI);
    bool expandAtomicOpToLLSC(
        Instruction *I, Value *Addr, AtomicOrdering MemOpOrder,
        std::function<Value *(IRBuilder<> &, Value *)> PerformOp);
    AtomicCmpXchgInst *convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI);
    bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
    bool isIdempotentRMW(AtomicRMWInst *AI);
    bool simplifyIdempotentRMW(AtomicRMWInst *AI);

    bool expandAtomicOpToLibcall(Instruction *I, unsigned Size, unsigned Align,
                                 Value *PointerOperand, Value *ValueOperand,
                                 Value *CASExpected, AtomicOrdering Ordering,
                                 AtomicOrdering Ordering2,
                                 ArrayRef<RTLIB::Libcall> Libcalls);
    void expandAtomicLoadToLibcall(LoadInst *LI);
    void expandAtomicStoreToLibcall(StoreInst *LI);
    void expandAtomicRMWToLibcall(AtomicRMWInst *I);
    void expandAtomicCASToLibcall(AtomicCmpXchgInst *I);
  };
}

char AtomicExpand::ID = 0;
char &llvm::AtomicExpandID = AtomicExpand::ID;
INITIALIZE_TM_PASS(AtomicExpand, "atomic-expand", "Expand Atomic instructions",
                   false, false)

FunctionPass *llvm::createAtomicExpandPass(const TargetMachine *TM) {
  return new AtomicExpand(TM);
}

namespace {
// Helper functions to retrieve the size of atomic instructions.
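// Each overload returns the store size, in bytes, of the value the
// instruction loads or stores, as computed from the module's DataLayout.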
unsigned getAtomicOpSize(LoadInst *LI) {
  const DataLayout &DL = LI->getModule()->getDataLayout();
  return DL.getTypeStoreSize(LI->getType());
}

unsigned getAtomicOpSize(StoreInst *SI) {
  const DataLayout &DL = SI->getModule()->getDataLayout();
  return DL.getTypeStoreSize(SI->getValueOperand()->getType());
}

unsigned getAtomicOpSize(AtomicRMWInst *RMWI) {
  const DataLayout &DL = RMWI->getModule()->getDataLayout();
  return DL.getTypeStoreSize(RMWI->getValOperand()->getType());
}

unsigned getAtomicOpSize(AtomicCmpXchgInst *CASI) {
  const DataLayout &DL = CASI->getModule()->getDataLayout();
  return DL.getTypeStoreSize(CASI->getCompareOperand()->getType());
}

// Helper functions to retrieve the alignment of atomic instructions.
unsigned getAtomicOpAlign(LoadInst *LI) {
  unsigned Align = LI->getAlignment();
  // In the future, if this IR restriction is relaxed, we should
  // return DataLayout::getABITypeAlignment when there's no align
  // value.
  assert(Align != 0 && "An atomic LoadInst always has an explicit alignment");
  return Align;
}

unsigned getAtomicOpAlign(StoreInst *SI) {
  unsigned Align = SI->getAlignment();
  // In the future, if this IR restriction is relaxed, we should
  // return DataLayout::getABITypeAlignment when there's no align
  // value.
  assert(Align != 0 && "An atomic StoreInst always has an explicit alignment");
  return Align;
}

unsigned getAtomicOpAlign(AtomicRMWInst *RMWI) {
  // TODO(PR27168): This instruction has no alignment attribute, but unlike the
  // default alignment for load/store, the default here is to assume
  // it has NATURAL alignment, not DataLayout-specified alignment.
  const DataLayout &DL = RMWI->getModule()->getDataLayout();
  return DL.getTypeStoreSize(RMWI->getValOperand()->getType());
}

unsigned getAtomicOpAlign(AtomicCmpXchgInst *CASI) {
  // TODO(PR27168): same comment as above.
  const DataLayout &DL = CASI->getModule()->getDataLayout();
  return DL.getTypeStoreSize(CASI->getCompareOperand()->getType());
}

// Determine if a particular atomic operation has a supported size,
// and is of appropriate alignment, to be passed through for target
// lowering. (Versus turning into a __atomic libcall.)
template <typename Inst>
bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {
  unsigned Size = getAtomicOpSize(I);
  unsigned Align = getAtomicOpAlign(I);
  return Align >= Size && Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
}

} // end anonymous namespace

bool AtomicExpand::runOnFunction(Function &F) {
  if (!TM || !TM->getSubtargetImpl(F)->enableAtomicExpand())
    return false;
  TLI = TM->getSubtargetImpl(F)->getTargetLowering();

  SmallVector<Instruction *, 1> AtomicInsts;

  // Changing control-flow while iterating through it is a bad idea, so gather
  // a list of all atomic instructions before we start.
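  // (The expansions below may split basic blocks and erase the original
  // instructions, either of which would invalidate an instruction iterator.)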
  for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) {
    Instruction *I = &*II;
    if (I->isAtomic() && !isa<FenceInst>(I))
      AtomicInsts.push_back(I);
  }

  bool MadeChange = false;
  for (auto I : AtomicInsts) {
    auto LI = dyn_cast<LoadInst>(I);
    auto SI = dyn_cast<StoreInst>(I);
    auto RMWI = dyn_cast<AtomicRMWInst>(I);
    auto CASI = dyn_cast<AtomicCmpXchgInst>(I);
    assert((LI || SI || RMWI || CASI) && "Unknown atomic instruction");

    // If the Size/Alignment is not supported, replace with a libcall.
    if (LI) {
      if (!atomicSizeSupported(TLI, LI)) {
        expandAtomicLoadToLibcall(LI);
        MadeChange = true;
        continue;
      }
    } else if (SI) {
      if (!atomicSizeSupported(TLI, SI)) {
        expandAtomicStoreToLibcall(SI);
        MadeChange = true;
        continue;
      }
    } else if (RMWI) {
      if (!atomicSizeSupported(TLI, RMWI)) {
        expandAtomicRMWToLibcall(RMWI);
        MadeChange = true;
        continue;
      }
    } else if (CASI) {
      if (!atomicSizeSupported(TLI, CASI)) {
        expandAtomicCASToLibcall(CASI);
        MadeChange = true;
        continue;
      }
    }

    if (TLI->shouldInsertFencesForAtomic(I)) {
      auto FenceOrdering = AtomicOrdering::Monotonic;
      bool IsStore, IsLoad;
      if (LI && isAcquireOrStronger(LI->getOrdering())) {
        FenceOrdering = LI->getOrdering();
        LI->setOrdering(AtomicOrdering::Monotonic);
        IsStore = false;
        IsLoad = true;
      } else if (SI && isReleaseOrStronger(SI->getOrdering())) {
        FenceOrdering = SI->getOrdering();
        SI->setOrdering(AtomicOrdering::Monotonic);
        IsStore = true;
        IsLoad = false;
      } else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) ||
                          isAcquireOrStronger(RMWI->getOrdering()))) {
        FenceOrdering = RMWI->getOrdering();
        RMWI->setOrdering(AtomicOrdering::Monotonic);
        IsStore = IsLoad = true;
      } else if (CASI && !TLI->shouldExpandAtomicCmpXchgInIR(CASI) &&
                 (isReleaseOrStronger(CASI->getSuccessOrdering()) ||
                  isAcquireOrStronger(CASI->getSuccessOrdering()))) {
        // If a compare and swap is lowered to LL/SC, we can do smarter fence
        // insertion, with a stronger one on the success path than on the
        // failure path. As a result, fence insertion is directly done by
        // expandAtomicCmpXchg in that case.
        FenceOrdering = CASI->getSuccessOrdering();
        CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
        CASI->setFailureOrdering(AtomicOrdering::Monotonic);
        IsStore = IsLoad = true;
      }

      if (FenceOrdering != AtomicOrdering::Monotonic) {
        MadeChange |= bracketInstWithFences(I, FenceOrdering, IsStore, IsLoad);
      }
    }

    if (LI) {
      if (LI->getType()->isFloatingPointTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        LI = convertAtomicLoadToIntegerType(LI);
        assert(LI->getType()->isIntegerTy() && "invariant broken");
        MadeChange = true;
      }

      MadeChange |= tryExpandAtomicLoad(LI);
    } else if (SI) {
      if (SI->getValueOperand()->getType()->isFloatingPointTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        SI = convertAtomicStoreToIntegerType(SI);
        assert(SI->getValueOperand()->getType()->isIntegerTy() &&
               "invariant broken");
        MadeChange = true;
      }

      if (TLI->shouldExpandAtomicStoreInIR(SI))
        MadeChange |= expandAtomicStore(SI);
    } else if (RMWI) {
      // There are two different ways of expanding RMW instructions:
      // - into a load if it is idempotent
      // - into a Cmpxchg/LL-SC loop otherwise
      // We try them in that order.

      if (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) {
        MadeChange = true;
      } else {
        MadeChange |= tryExpandAtomicRMW(RMWI);
      }
    } else if (CASI) {
      // TODO: when we're ready to make the change at the IR level, we can
      // extend convertCmpXchgToInteger for floating point too.
      assert(!CASI->getCompareOperand()->getType()->isFloatingPointTy() &&
             "unimplemented - floating point not legal at IR level");
      if (CASI->getCompareOperand()->getType()->isPointerTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        CASI = convertCmpXchgToIntegerType(CASI);
        assert(CASI->getCompareOperand()->getType()->isIntegerTy() &&
               "invariant broken");
        MadeChange = true;
      }

      if (TLI->shouldExpandAtomicCmpXchgInIR(CASI))
        MadeChange |= expandAtomicCmpXchg(CASI);
    }
  }
  return MadeChange;
}

bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order,
                                         bool IsStore, bool IsLoad) {
  IRBuilder<> Builder(I);

  auto LeadingFence = TLI->emitLeadingFence(Builder, Order, IsStore, IsLoad);

  auto TrailingFence = TLI->emitTrailingFence(Builder, Order, IsStore, IsLoad);
  // The trailing fence is emitted before the instruction instead of after
  // because there is no easy way of setting Builder insertion point after
  // an instruction. So we must erase it from the BB, and insert it back
  // in the right place.
  // We have a guard here because not every atomic operation generates a
  // trailing fence.
  if (TrailingFence) {
    TrailingFence->removeFromParent();
    TrailingFence->insertAfter(I);
  }

  return (LeadingFence || TrailingFence);
}

/// Get the iX type with the same bitwidth as T.
IntegerType *AtomicExpand::getCorrespondingIntegerType(Type *T,
                                                       const DataLayout &DL) {
  EVT VT = TLI->getValueType(DL, T);
  unsigned BitWidth = VT.getStoreSizeInBits();
  assert(BitWidth == VT.getSizeInBits() && "must be a power of two");
  return IntegerType::get(T->getContext(), BitWidth);
}

/// Convert an atomic load of a non-integral type to an integer load of the
/// equivalent bitwidth. See the function comment on
/// convertAtomicStoreToIntegerType for background.
LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) {
  auto *M = LI->getModule();
  Type *NewTy = getCorrespondingIntegerType(LI->getType(),
                                            M->getDataLayout());

  IRBuilder<> Builder(LI);

  Value *Addr = LI->getPointerOperand();
  Type *PT = PointerType::get(NewTy,
                              Addr->getType()->getPointerAddressSpace());
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);

  auto *NewLI = Builder.CreateLoad(NewAddr);
  NewLI->setAlignment(LI->getAlignment());
  NewLI->setVolatile(LI->isVolatile());
  NewLI->setAtomic(LI->getOrdering(), LI->getSynchScope());
  DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");

  Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType());
  LI->replaceAllUsesWith(NewVal);
  LI->eraseFromParent();
  return NewLI;
}

bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) {
  switch (TLI->shouldExpandAtomicLoadInIR(LI)) {
  case TargetLoweringBase::AtomicExpansionKind::None:
    return false;
  case TargetLoweringBase::AtomicExpansionKind::LLSC:
    return expandAtomicOpToLLSC(
        LI, LI->getPointerOperand(), LI->getOrdering(),
        [](IRBuilder<> &Builder, Value *Loaded) { return Loaded; });
  case TargetLoweringBase::AtomicExpansionKind::LLOnly:
    return expandAtomicLoadToLL(LI);
  case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
    return expandAtomicLoadToCmpXchg(LI);
  }
  llvm_unreachable("Unhandled case in tryExpandAtomicLoad");
}

bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) {
  IRBuilder<> Builder(LI);

  // On some architectures, load-linked instructions are atomic for larger
  // sizes than normal loads. For example, the only 64-bit load guaranteed
  // to be single-copy atomic by ARM is an ldrexd (A3.5.3).
  Value *Val =
      TLI->emitLoadLinked(Builder, LI->getPointerOperand(), LI->getOrdering());
  TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);

  LI->replaceAllUsesWith(Val);
  LI->eraseFromParent();

  return true;
}

bool AtomicExpand::expandAtomicLoadToCmpXchg(LoadInst *LI) {
  IRBuilder<> Builder(LI);
  AtomicOrdering Order = LI->getOrdering();
  Value *Addr = LI->getPointerOperand();
  Type *Ty = cast<PointerType>(Addr->getType())->getElementType();
  Constant *DummyVal = Constant::getNullValue(Ty);

  Value *Pair = Builder.CreateAtomicCmpXchg(
      Addr, DummyVal, DummyVal, Order,
      AtomicCmpXchgInst::getStrongestFailureOrdering(Order));
  Value *Loaded = Builder.CreateExtractValue(Pair, 0, "loaded");

  LI->replaceAllUsesWith(Loaded);
  LI->eraseFromParent();

  return true;
}

/// Convert an atomic store of a non-integral type to an integer store of the
/// equivalent bitwidth. We used to not support floating point or vector
/// atomics in the IR at all. The backends learned to deal with the bitcast
/// idiom because that was the only way of expressing the notion of an atomic
/// float or vector store. The long term plan is to teach each backend to
/// instruction select from the original atomic store, but as a migration
/// mechanism, we convert back to the old format which the backends understand.
/// Each backend will need individual work to recognize the new format.
StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) {
  IRBuilder<> Builder(SI);
  auto *M = SI->getModule();
  Type *NewTy = getCorrespondingIntegerType(SI->getValueOperand()->getType(),
                                            M->getDataLayout());
  Value *NewVal = Builder.CreateBitCast(SI->getValueOperand(), NewTy);

  Value *Addr = SI->getPointerOperand();
  Type *PT = PointerType::get(NewTy,
                              Addr->getType()->getPointerAddressSpace());
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);

  StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr);
  NewSI->setAlignment(SI->getAlignment());
  NewSI->setVolatile(SI->isVolatile());
  NewSI->setAtomic(SI->getOrdering(), SI->getSynchScope());
  DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n");
  SI->eraseFromParent();
  return NewSI;
}

bool AtomicExpand::expandAtomicStore(StoreInst *SI) {
  // This function is only called on atomic stores that are too large to be
  // atomic if implemented as a native store. So we replace them with an
  // atomic swap, which can be implemented for example as a ldrex/strex on ARM
  // or lock cmpxchg8/16b on X86, as these are atomic for larger sizes.
  // It is the responsibility of the target to only signal expansion via
  // shouldExpandAtomicRMW in cases where this is required and possible.
  IRBuilder<> Builder(SI);
  AtomicRMWInst *AI =
      Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(),
                              SI->getValueOperand(), SI->getOrdering());
  SI->eraseFromParent();

  // Now that we have an appropriate swap instruction, lower it as usual.
  return tryExpandAtomicRMW(AI);
}

static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr,
                                 Value *Loaded, Value *NewVal,
                                 AtomicOrdering MemOpOrder,
                                 Value *&Success, Value *&NewLoaded) {
  Value *Pair = Builder.CreateAtomicCmpXchg(
      Addr, Loaded, NewVal, MemOpOrder,
      AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
  Success = Builder.CreateExtractValue(Pair, 1, "success");
  NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
}

/// Emit IR to implement the given atomicrmw operation on values in registers,
/// returning the new value.
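/// Min/Max/UMin/UMax have no single IR instruction, so they are emitted as a
/// compare feeding a select.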
static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder,
                              Value *Loaded, Value *Inc) {
  Value *NewVal;
  switch (Op) {
  case AtomicRMWInst::Xchg:
    return Inc;
  case AtomicRMWInst::Add:
    return Builder.CreateAdd(Loaded, Inc, "new");
  case AtomicRMWInst::Sub:
    return Builder.CreateSub(Loaded, Inc, "new");
  case AtomicRMWInst::And:
    return Builder.CreateAnd(Loaded, Inc, "new");
  case AtomicRMWInst::Nand:
    return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new");
  case AtomicRMWInst::Or:
    return Builder.CreateOr(Loaded, Inc, "new");
  case AtomicRMWInst::Xor:
    return Builder.CreateXor(Loaded, Inc, "new");
  case AtomicRMWInst::Max:
    NewVal = Builder.CreateICmpSGT(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::Min:
    NewVal = Builder.CreateICmpSLE(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::UMax:
    NewVal = Builder.CreateICmpUGT(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::UMin:
    NewVal = Builder.CreateICmpULE(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  default:
    llvm_unreachable("Unknown atomic op");
  }
}

bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
  switch (TLI->shouldExpandAtomicRMWInIR(AI)) {
  case TargetLoweringBase::AtomicExpansionKind::None:
    return false;
  case TargetLoweringBase::AtomicExpansionKind::LLSC:
    return expandAtomicOpToLLSC(AI, AI->getPointerOperand(), AI->getOrdering(),
                                [&](IRBuilder<> &Builder, Value *Loaded) {
                                  return performAtomicOp(AI->getOperation(),
                                                         Builder, Loaded,
                                                         AI->getValOperand());
                                });
  case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
    return expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun);
  default:
    llvm_unreachable("Unhandled case in tryExpandAtomicRMW");
  }
}

bool AtomicExpand::expandAtomicOpToLLSC(
    Instruction *I, Value *Addr, AtomicOrdering MemOpOrder,
    std::function<Value *(IRBuilder<> &, Value *)> PerformOp) {
  BasicBlock *BB = I->getParent();
  Function *F = BB->getParent();
  LLVMContext &Ctx = F->getContext();

  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
  //
  // The standard expansion we produce is:
  //     [...]
  //     fence?
  // atomicrmw.start:
  //     %loaded = @load.linked(%addr)
  //     %new = some_op iN %loaded, %incr
  //     %stored = @store_conditional(%new, %addr)
  //     %try_again = icmp i32 ne %stored, 0
  //     br i1 %try_again, label %loop, label %atomicrmw.end
  // atomicrmw.end:
  //     fence?
  //     [...]
  BasicBlock *ExitBB = BB->splitBasicBlock(I->getIterator(), "atomicrmw.end");
  BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);

  // This grabs the DebugLoc from I.
  IRBuilder<> Builder(I);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we might want a fence too. It's easiest to just remove
  // the branch entirely.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  Builder.CreateBr(LoopBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(LoopBB);
  Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);

  Value *NewVal = PerformOp(Builder, Loaded);

  Value *StoreSuccess =
      TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder);
  Value *TryAgain = Builder.CreateICmpNE(
      StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain");
  Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);

  Builder.SetInsertPoint(ExitBB, ExitBB->begin());

  I->replaceAllUsesWith(Loaded);
  I->eraseFromParent();

  return true;
}

/// Convert an atomic cmpxchg of a non-integral type to an integer cmpxchg of
/// the equivalent bitwidth. We used to not support pointer cmpxchg in the
/// IR. As a migration step, we convert back to what used to be the standard
/// way to represent a pointer cmpxchg so that we can update backends one by
/// one.
AtomicCmpXchgInst *
AtomicExpand::convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI) {
  auto *M = CI->getModule();
  Type *NewTy = getCorrespondingIntegerType(CI->getCompareOperand()->getType(),
                                            M->getDataLayout());

  IRBuilder<> Builder(CI);

  Value *Addr = CI->getPointerOperand();
  Type *PT = PointerType::get(NewTy,
                              Addr->getType()->getPointerAddressSpace());
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);

  Value *NewCmp = Builder.CreatePtrToInt(CI->getCompareOperand(), NewTy);
  Value *NewNewVal = Builder.CreatePtrToInt(CI->getNewValOperand(), NewTy);

  auto *NewCI = Builder.CreateAtomicCmpXchg(NewAddr, NewCmp, NewNewVal,
                                            CI->getSuccessOrdering(),
                                            CI->getFailureOrdering(),
                                            CI->getSynchScope());
  NewCI->setVolatile(CI->isVolatile());
  NewCI->setWeak(CI->isWeak());
  DEBUG(dbgs() << "Replaced " << *CI << " with " << *NewCI << "\n");

  Value *OldVal = Builder.CreateExtractValue(NewCI, 0);
  Value *Succ = Builder.CreateExtractValue(NewCI, 1);

  OldVal = Builder.CreateIntToPtr(OldVal, CI->getCompareOperand()->getType());

  Value *Res = UndefValue::get(CI->getType());
  Res = Builder.CreateInsertValue(Res, OldVal, 0);
  Res = Builder.CreateInsertValue(Res, Succ, 1);

  CI->replaceAllUsesWith(Res);
  CI->eraseFromParent();
  return NewCI;
}

bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
  AtomicOrdering SuccessOrder = CI->getSuccessOrdering();
  AtomicOrdering FailureOrder = CI->getFailureOrdering();
  Value *Addr = CI->getPointerOperand();
  BasicBlock *BB = CI->getParent();
  Function *F = BB->getParent();
  LLVMContext &Ctx = F->getContext();
  // If shouldInsertFencesForAtomic() returns true, then the target does not
  // want to deal with memory orders, and emitLeading/TrailingFence should take
  // care of everything. Otherwise, emitLeading/TrailingFence are no-ops and we
  // should preserve the ordering.
  bool ShouldInsertFencesForAtomic = TLI->shouldInsertFencesForAtomic(CI);
  AtomicOrdering MemOpOrder =
      ShouldInsertFencesForAtomic ? AtomicOrdering::Monotonic : SuccessOrder;

  // In implementations which use a barrier to achieve release semantics, we
  // can delay emitting this barrier until we know a store is actually going to
  // be attempted. The cost of this delay is that we need 2 copies of the block
  // emitting the load-linked, affecting code size.
  //
  // Ideally, this logic would be unconditional except for the minsize check
  // since in other cases the extra blocks naturally collapse down to the
  // minimal loop. Unfortunately, this puts too much stress on later
  // optimisations so we avoid emitting the extra logic in those cases too.
  bool HasReleasedLoadBB = !CI->isWeak() && ShouldInsertFencesForAtomic &&
                           SuccessOrder != AtomicOrdering::Monotonic &&
                           SuccessOrder != AtomicOrdering::Acquire &&
                           !F->optForMinSize();

  // There's no overhead for sinking the release barrier in a weak cmpxchg, so
  // do it even on minsize.
  bool UseUnconditionalReleaseBarrier = F->optForMinSize() && !CI->isWeak();

  // Given: cmpxchg some_op iN* %addr, iN %desired, iN %new success_ord fail_ord
  //
  // The full expansion we produce is:
  //     [...]
  // cmpxchg.start:
  //     %unreleasedload = @load.linked(%addr)
  //     %should_store = icmp eq %unreleasedload, %desired
  //     br i1 %should_store, label %cmpxchg.fencedstore,
  //                          label %cmpxchg.nostore
  // cmpxchg.releasingstore:
  //     fence?
  //     br label cmpxchg.trystore
  // cmpxchg.trystore:
  //     %loaded.trystore = phi [%unreleasedload, %releasingstore],
  //                            [%releasedload, %cmpxchg.releasedload]
  //     %stored = @store_conditional(%new, %addr)
  //     %success = icmp eq i32 %stored, 0
  //     br i1 %success, label %cmpxchg.success,
  //                     label %cmpxchg.releasedload/%cmpxchg.failure
  // cmpxchg.releasedload:
  //     %releasedload = @load.linked(%addr)
  //     %should_store = icmp eq %releasedload, %desired
  //     br i1 %should_store, label %cmpxchg.trystore,
  //                          label %cmpxchg.failure
  // cmpxchg.success:
  //     fence?
  //     br label %cmpxchg.end
  // cmpxchg.nostore:
  //     %loaded.nostore = phi [%unreleasedload, %cmpxchg.start],
  //                           [%releasedload,
  //                            %cmpxchg.releasedload/%cmpxchg.trystore]
  //     @load_linked_fail_balance()?
  //     br label %cmpxchg.failure
  // cmpxchg.failure:
  //     fence?
  //     br label %cmpxchg.end
  // cmpxchg.end:
  //     %loaded = phi [%loaded.nostore, %cmpxchg.failure],
  //                   [%loaded.trystore, %cmpxchg.trystore]
  //     %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure]
  //     %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0
  //     %res = insertvalue { iN, i1 } %restmp, i1 %success, 1
  //     [...]
  BasicBlock *ExitBB = BB->splitBasicBlock(CI->getIterator(), "cmpxchg.end");
  auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB);
  auto NoStoreBB = BasicBlock::Create(Ctx, "cmpxchg.nostore", F, FailureBB);
  auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, NoStoreBB);
  auto ReleasedLoadBB =
      BasicBlock::Create(Ctx, "cmpxchg.releasedload", F, SuccessBB);
  auto TryStoreBB =
      BasicBlock::Create(Ctx, "cmpxchg.trystore", F, ReleasedLoadBB);
  auto ReleasingStoreBB =
      BasicBlock::Create(Ctx, "cmpxchg.fencedstore", F, TryStoreBB);
  auto StartBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, ReleasingStoreBB);

  // This grabs the DebugLoc from CI.
  IRBuilder<> Builder(CI);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we might want a fence too. It's easiest to just remove
  // the branch entirely.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier)
    TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
                          /*IsLoad=*/true);
  Builder.CreateBr(StartBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(StartBB);
  Value *UnreleasedLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
  Value *ShouldStore = Builder.CreateICmpEQ(
      UnreleasedLoad, CI->getCompareOperand(), "should_store");

  // If the cmpxchg doesn't actually need any ordering when it fails, we can
  // jump straight past that fence instruction (if it exists).
  Builder.CreateCondBr(ShouldStore, ReleasingStoreBB, NoStoreBB);

  Builder.SetInsertPoint(ReleasingStoreBB);
  if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier)
    TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
                          /*IsLoad=*/true);
  Builder.CreateBr(TryStoreBB);

  Builder.SetInsertPoint(TryStoreBB);
  Value *StoreSuccess = TLI->emitStoreConditional(
      Builder, CI->getNewValOperand(), Addr, MemOpOrder);
  StoreSuccess = Builder.CreateICmpEQ(
      StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
  BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB;
  Builder.CreateCondBr(StoreSuccess, SuccessBB,
                       CI->isWeak() ? FailureBB : RetryBB);

  Builder.SetInsertPoint(ReleasedLoadBB);
  Value *SecondLoad;
  if (HasReleasedLoadBB) {
    SecondLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
    ShouldStore = Builder.CreateICmpEQ(SecondLoad, CI->getCompareOperand(),
                                       "should_store");

    // If the cmpxchg doesn't actually need any ordering when it fails, we can
    // jump straight past that fence instruction (if it exists).
    Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB);
  } else
    Builder.CreateUnreachable();

  // Make sure later instructions don't get reordered with a fence if
  // necessary.
  Builder.SetInsertPoint(SuccessBB);
  if (ShouldInsertFencesForAtomic)
    TLI->emitTrailingFence(Builder, SuccessOrder, /*IsStore=*/true,
                           /*IsLoad=*/true);
  Builder.CreateBr(ExitBB);

  Builder.SetInsertPoint(NoStoreBB);
  // In the failing case, where we don't execute the store-conditional, the
  // target might want to balance out the load-linked with a dedicated
  // instruction (e.g., on ARM, clearing the exclusive monitor).
  TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
  Builder.CreateBr(FailureBB);

  Builder.SetInsertPoint(FailureBB);
  if (ShouldInsertFencesForAtomic)
    TLI->emitTrailingFence(Builder, FailureOrder, /*IsStore=*/true,
                           /*IsLoad=*/true);
  Builder.CreateBr(ExitBB);

  // Finally, we have control-flow based knowledge of whether the cmpxchg
  // succeeded or not. We expose this to later passes by converting any
  // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate
  // PHI.
  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
  PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2);
  Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB);
  Success->addIncoming(ConstantInt::getFalse(Ctx), FailureBB);

  // Setup the builder so we can create any PHIs we need.
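  // When the released-load block exists, the value observed before the
  // store-conditional (or on the failure paths) may come from either load, so
  // it is threaded through PHIs in trystore/nostore and merged again at the
  // exit.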
  Value *Loaded;
  if (!HasReleasedLoadBB)
    Loaded = UnreleasedLoad;
  else {
    Builder.SetInsertPoint(TryStoreBB, TryStoreBB->begin());
    PHINode *TryStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
    TryStoreLoaded->addIncoming(UnreleasedLoad, ReleasingStoreBB);
    TryStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);

    Builder.SetInsertPoint(NoStoreBB, NoStoreBB->begin());
    PHINode *NoStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
    NoStoreLoaded->addIncoming(UnreleasedLoad, StartBB);
    NoStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);

    Builder.SetInsertPoint(ExitBB, ++ExitBB->begin());
    PHINode *ExitLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
    ExitLoaded->addIncoming(TryStoreLoaded, SuccessBB);
    ExitLoaded->addIncoming(NoStoreLoaded, FailureBB);

    Loaded = ExitLoaded;
  }

  // Look for any users of the cmpxchg that are just comparing the loaded value
  // against the desired one, and replace them with the CFG-derived version.
  SmallVector<ExtractValueInst *, 2> PrunedInsts;
  for (auto User : CI->users()) {
    ExtractValueInst *EV = dyn_cast<ExtractValueInst>(User);
    if (!EV)
      continue;

    assert(EV->getNumIndices() == 1 && EV->getIndices()[0] <= 1 &&
           "weird extraction from { iN, i1 }");

    if (EV->getIndices()[0] == 0)
      EV->replaceAllUsesWith(Loaded);
    else
      EV->replaceAllUsesWith(Success);

    PrunedInsts.push_back(EV);
  }

  // We can remove the instructions now that we're no longer iterating through
  // them.
  for (auto EV : PrunedInsts)
    EV->eraseFromParent();

  if (!CI->use_empty()) {
    // Some use of the full struct return that we don't understand has
    // happened, so we've got to reconstruct it properly.
    Value *Res;
    Res = Builder.CreateInsertValue(UndefValue::get(CI->getType()), Loaded, 0);
    Res = Builder.CreateInsertValue(Res, Success, 1);

    CI->replaceAllUsesWith(Res);
  }

  CI->eraseFromParent();
  return true;
}

bool AtomicExpand::isIdempotentRMW(AtomicRMWInst *RMWI) {
  auto C = dyn_cast<ConstantInt>(RMWI->getValOperand());
  if (!C)
    return false;

  AtomicRMWInst::BinOp Op = RMWI->getOperation();
  switch (Op) {
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
    return C->isZero();
  case AtomicRMWInst::And:
    return C->isMinusOne();
  // FIXME: we could also treat Min/Max/UMin/UMax by the INT_MIN/INT_MAX/...
  default:
    return false;
  }
}

bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst *RMWI) {
  if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) {
    tryExpandAtomicLoad(ResultingLoad);
    return true;
  }
  return false;
}

bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
                                    CreateCmpXchgInstFun CreateCmpXchg) {
  assert(AI);

  AtomicOrdering MemOpOrder = AI->getOrdering() == AtomicOrdering::Unordered
                                  ? AtomicOrdering::Monotonic
                                  : AI->getOrdering();
  Value *Addr = AI->getPointerOperand();
  BasicBlock *BB = AI->getParent();
  Function *F = BB->getParent();
  LLVMContext &Ctx = F->getContext();

  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
  //
  // The standard expansion we produce is:
  //     [...]
  //     %init_loaded = load atomic iN* %addr
  //     br label %loop
  // loop:
  //     %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ]
  //     %new = some_op iN %loaded, %incr
  //     %pair = cmpxchg iN* %addr, iN %loaded, iN %new
  //     %new_loaded = extractvalue { iN, i1 } %pair, 0
  //     %success = extractvalue { iN, i1 } %pair, 1
  //     br i1 %success, label %atomicrmw.end, label %loop
  // atomicrmw.end:
  //     [...]
  BasicBlock *ExitBB = BB->splitBasicBlock(AI->getIterator(), "atomicrmw.end");
  BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);

  // This grabs the DebugLoc from AI.
  IRBuilder<> Builder(AI);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we want a load. It's easiest to just remove
  // the branch entirely.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  LoadInst *InitLoaded = Builder.CreateLoad(Addr);
  // Atomics require at least natural alignment.
  InitLoaded->setAlignment(AI->getType()->getPrimitiveSizeInBits() / 8);
  Builder.CreateBr(LoopBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(LoopBB);
  PHINode *Loaded = Builder.CreatePHI(AI->getType(), 2, "loaded");
  Loaded->addIncoming(InitLoaded, BB);

  Value *NewVal =
      performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand());

  Value *NewLoaded = nullptr;
  Value *Success = nullptr;

  CreateCmpXchg(Builder, Addr, Loaded, NewVal, MemOpOrder,
                Success, NewLoaded);
  assert(Success && NewLoaded);

  Loaded->addIncoming(NewLoaded, LoopBB);

  Builder.CreateCondBr(Success, ExitBB, LoopBB);

  Builder.SetInsertPoint(ExitBB, ExitBB->begin());

  AI->replaceAllUsesWith(NewLoaded);
  AI->eraseFromParent();

  return true;
}

// This converts from LLVM's internal AtomicOrdering enum to the
// memory_order_* value required by the __atomic_* libcalls.
static int libcallAtomicModel(AtomicOrdering AO) {
  enum {
    AO_ABI_memory_order_relaxed = 0,
    AO_ABI_memory_order_consume = 1,
    AO_ABI_memory_order_acquire = 2,
    AO_ABI_memory_order_release = 3,
    AO_ABI_memory_order_acq_rel = 4,
    AO_ABI_memory_order_seq_cst = 5
  };

  switch (AO) {
  case AtomicOrdering::NotAtomic:
    llvm_unreachable("Expected atomic memory order.");
  case AtomicOrdering::Unordered:
  case AtomicOrdering::Monotonic:
    return AO_ABI_memory_order_relaxed;
  // Not implemented yet in llvm:
  // case AtomicOrdering::Consume:
  //   return AO_ABI_memory_order_consume;
  case AtomicOrdering::Acquire:
    return AO_ABI_memory_order_acquire;
  case AtomicOrdering::Release:
    return AO_ABI_memory_order_release;
  case AtomicOrdering::AcquireRelease:
    return AO_ABI_memory_order_acq_rel;
  case AtomicOrdering::SequentiallyConsistent:
    return AO_ABI_memory_order_seq_cst;
  }
  llvm_unreachable("Unknown atomic memory order.");
}

// In order to use one of the sized library calls such as
// __atomic_fetch_add_4, the alignment must be sufficient, the size
// must be one of the potentially-specialized sizes, and the value
// type must actually exist in C on the target (otherwise, the
// function wouldn't actually be defined.)
static bool canUseSizedAtomicCall(unsigned Size, unsigned Align,
                                  const DataLayout &DL) {
  // TODO: "LargestSize" is an approximation for "largest type that
  // you can express in C". It seems to be the case that int128 is
  // supported on all 64-bit platforms, otherwise only up to 64-bit
  // integers are supported. If we get this wrong, then we'll try to
  // call a sized libcall that doesn't actually exist. There should
  // really be some more reliable way in LLVM of determining integer
  // sizes which are valid in the target's C ABI...
  unsigned LargestSize = DL.getLargestLegalIntTypeSize() >= 64 ? 16 : 8;
  return Align >= Size &&
         (Size == 1 || Size == 2 || Size == 4 || Size == 8 || Size == 16) &&
         Size <= LargestSize;
}

void AtomicExpand::expandAtomicLoadToLibcall(LoadInst *I) {
  static const RTLIB::Libcall Libcalls[6] = {
      RTLIB::ATOMIC_LOAD,   RTLIB::ATOMIC_LOAD_1, RTLIB::ATOMIC_LOAD_2,
      RTLIB::ATOMIC_LOAD_4, RTLIB::ATOMIC_LOAD_8, RTLIB::ATOMIC_LOAD_16};
  unsigned Size = getAtomicOpSize(I);
  unsigned Align = getAtomicOpAlign(I);

  bool expanded = expandAtomicOpToLibcall(
      I, Size, Align, I->getPointerOperand(), nullptr, nullptr,
      I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
  (void)expanded;
  assert(expanded && "expandAtomicOpToLibcall shouldn't fail for Load");
}

void AtomicExpand::expandAtomicStoreToLibcall(StoreInst *I) {
  static const RTLIB::Libcall Libcalls[6] = {
      RTLIB::ATOMIC_STORE,   RTLIB::ATOMIC_STORE_1, RTLIB::ATOMIC_STORE_2,
      RTLIB::ATOMIC_STORE_4, RTLIB::ATOMIC_STORE_8, RTLIB::ATOMIC_STORE_16};
  unsigned Size = getAtomicOpSize(I);
  unsigned Align = getAtomicOpAlign(I);

  bool expanded = expandAtomicOpToLibcall(
      I, Size, Align, I->getPointerOperand(), I->getValueOperand(), nullptr,
      I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
  (void)expanded;
  assert(expanded && "expandAtomicOpToLibcall shouldn't fail for Store");
}

void AtomicExpand::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) {
  static const RTLIB::Libcall Libcalls[6] = {
      RTLIB::ATOMIC_COMPARE_EXCHANGE,   RTLIB::ATOMIC_COMPARE_EXCHANGE_1,
      RTLIB::ATOMIC_COMPARE_EXCHANGE_2, RTLIB::ATOMIC_COMPARE_EXCHANGE_4,
      RTLIB::ATOMIC_COMPARE_EXCHANGE_8, RTLIB::ATOMIC_COMPARE_EXCHANGE_16};
  unsigned Size = getAtomicOpSize(I);
  unsigned Align = getAtomicOpAlign(I);

  bool expanded = expandAtomicOpToLibcall(
      I, Size, Align, I->getPointerOperand(), I->getNewValOperand(),
      I->getCompareOperand(), I->getSuccessOrdering(), I->getFailureOrdering(),
      Libcalls);
  (void)expanded;
  assert(expanded && "expandAtomicOpToLibcall shouldn't fail for CAS");
}

static ArrayRef<RTLIB::Libcall> GetRMWLibcall(AtomicRMWInst::BinOp Op) {
  static const RTLIB::Libcall LibcallsXchg[6] = {
      RTLIB::ATOMIC_EXCHANGE,   RTLIB::ATOMIC_EXCHANGE_1,
      RTLIB::ATOMIC_EXCHANGE_2, RTLIB::ATOMIC_EXCHANGE_4,
      RTLIB::ATOMIC_EXCHANGE_8, RTLIB::ATOMIC_EXCHANGE_16};
  static const RTLIB::Libcall LibcallsAdd[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_ADD_1,
      RTLIB::ATOMIC_FETCH_ADD_2, RTLIB::ATOMIC_FETCH_ADD_4,
      RTLIB::ATOMIC_FETCH_ADD_8, RTLIB::ATOMIC_FETCH_ADD_16};
  static const RTLIB::Libcall LibcallsSub[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_SUB_1,
      RTLIB::ATOMIC_FETCH_SUB_2, RTLIB::ATOMIC_FETCH_SUB_4,
      RTLIB::ATOMIC_FETCH_SUB_8, RTLIB::ATOMIC_FETCH_SUB_16};
  static const RTLIB::Libcall LibcallsAnd[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_AND_1,
      RTLIB::ATOMIC_FETCH_AND_2, RTLIB::ATOMIC_FETCH_AND_4,
      RTLIB::ATOMIC_FETCH_AND_8, RTLIB::ATOMIC_FETCH_AND_16};
  static const RTLIB::Libcall LibcallsOr[6] = {
      RTLIB::UNKNOWN_LIBCALL,   RTLIB::ATOMIC_FETCH_OR_1,
      RTLIB::ATOMIC_FETCH_OR_2, RTLIB::ATOMIC_FETCH_OR_4,
      RTLIB::ATOMIC_FETCH_OR_8, RTLIB::ATOMIC_FETCH_OR_16};
  static const RTLIB::Libcall LibcallsXor[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_XOR_1,
      RTLIB::ATOMIC_FETCH_XOR_2, RTLIB::ATOMIC_FETCH_XOR_4,
      RTLIB::ATOMIC_FETCH_XOR_8, RTLIB::ATOMIC_FETCH_XOR_16};
  static const RTLIB::Libcall LibcallsNand[6] = {
      RTLIB::UNKNOWN_LIBCALL,     RTLIB::ATOMIC_FETCH_NAND_1,
      RTLIB::ATOMIC_FETCH_NAND_2, RTLIB::ATOMIC_FETCH_NAND_4,
      RTLIB::ATOMIC_FETCH_NAND_8, RTLIB::ATOMIC_FETCH_NAND_16};

  switch (Op) {
  case AtomicRMWInst::BAD_BINOP:
    llvm_unreachable("Should not have BAD_BINOP.");
  case AtomicRMWInst::Xchg:
    return makeArrayRef(LibcallsXchg);
  case AtomicRMWInst::Add:
    return makeArrayRef(LibcallsAdd);
  case AtomicRMWInst::Sub:
    return makeArrayRef(LibcallsSub);
  case AtomicRMWInst::And:
    return makeArrayRef(LibcallsAnd);
  case AtomicRMWInst::Or:
    return makeArrayRef(LibcallsOr);
  case AtomicRMWInst::Xor:
    return makeArrayRef(LibcallsXor);
  case AtomicRMWInst::Nand:
    return makeArrayRef(LibcallsNand);
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    // No atomic libcalls are available for max/min/umax/umin.
    return {};
  }
  llvm_unreachable("Unexpected AtomicRMW operation.");
}

void AtomicExpand::expandAtomicRMWToLibcall(AtomicRMWInst *I) {
  ArrayRef<RTLIB::Libcall> Libcalls = GetRMWLibcall(I->getOperation());

  unsigned Size = getAtomicOpSize(I);
  unsigned Align = getAtomicOpAlign(I);

  bool Success = false;
  if (!Libcalls.empty())
    Success = expandAtomicOpToLibcall(
        I, Size, Align, I->getPointerOperand(), I->getValOperand(), nullptr,
        I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);

  // The expansion failed: either there were no libcalls at all for
  // the operation (min/max), or there were only size-specialized
  // libcalls (add/sub/etc) and we needed a generic. So, expand to a
  // CAS libcall, via a CAS loop, instead.
  if (!Success) {
    expandAtomicRMWToCmpXchg(I, [this](IRBuilder<> &Builder, Value *Addr,
                                       Value *Loaded, Value *NewVal,
                                       AtomicOrdering MemOpOrder,
                                       Value *&Success, Value *&NewLoaded) {
      // Create the CAS instruction normally...
      AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
          Addr, Loaded, NewVal, MemOpOrder,
          AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
      Success = Builder.CreateExtractValue(Pair, 1, "success");
      NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");

      // ...and then expand the CAS into a libcall.
      expandAtomicCASToLibcall(Pair);
    });
  }
}

// A helper routine for the above expandAtomic*ToLibcall functions.
//
// 'Libcalls' contains an array of enum values for the particular
// ATOMIC libcalls to be emitted. All of the other arguments besides
// 'I' are extracted from the Instruction subclass by the
// caller. Depending on the particular call, some will be null.
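//
// Returns false if no suitable libcall could be used (e.g. the sized variants
// do not apply and no generic variant exists), in which case the caller must
// fall back to another expansion.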
bool AtomicExpand::expandAtomicOpToLibcall(
    Instruction *I, unsigned Size, unsigned Align, Value *PointerOperand,
    Value *ValueOperand, Value *CASExpected, AtomicOrdering Ordering,
    AtomicOrdering Ordering2, ArrayRef<RTLIB::Libcall> Libcalls) {
  assert(Libcalls.size() == 6);

  LLVMContext &Ctx = I->getContext();
  Module *M = I->getModule();
  const DataLayout &DL = M->getDataLayout();
  IRBuilder<> Builder(I);
  IRBuilder<> AllocaBuilder(&I->getFunction()->getEntryBlock().front());

  bool UseSizedLibcall = canUseSizedAtomicCall(Size, Align, DL);
  Type *SizedIntTy = Type::getIntNTy(Ctx, Size * 8);

  unsigned AllocaAlignment = DL.getPrefTypeAlignment(SizedIntTy);

  // TODO: the "order" argument type is "int", not int32. So
  // getInt32Ty may be wrong if the arch uses e.g. 16-bit ints.
  ConstantInt *SizeVal64 = ConstantInt::get(Type::getInt64Ty(Ctx), Size);
  Constant *OrderingVal =
      ConstantInt::get(Type::getInt32Ty(Ctx), libcallAtomicModel(Ordering));
  Constant *Ordering2Val = CASExpected
                               ? ConstantInt::get(Type::getInt32Ty(Ctx),
                                                  libcallAtomicModel(Ordering2))
                               : nullptr;
  bool HasResult = I->getType() != Type::getVoidTy(Ctx);

  RTLIB::Libcall RTLibType;
  if (UseSizedLibcall) {
    switch (Size) {
    case 1: RTLibType = Libcalls[1]; break;
    case 2: RTLibType = Libcalls[2]; break;
    case 4: RTLibType = Libcalls[3]; break;
    case 8: RTLibType = Libcalls[4]; break;
    case 16: RTLibType = Libcalls[5]; break;
    }
  } else if (Libcalls[0] != RTLIB::UNKNOWN_LIBCALL) {
    RTLibType = Libcalls[0];
  } else {
    // Can't use sized function, and there's no generic for this
    // operation, so give up.
    return false;
  }

  // Build up the function call. There are two kinds. First, the sized
  // variants. These calls are going to be one of the following (with
  // N=1,2,4,8,16):
  //  iN __atomic_load_N(iN *ptr, int ordering)
  //  void __atomic_store_N(iN *ptr, iN val, int ordering)
  //  iN __atomic_{exchange|fetch_*}_N(iN *ptr, iN val, int ordering)
  //  bool __atomic_compare_exchange_N(iN *ptr, iN *expected, iN desired,
  //                                   int success_order, int failure_order)
  //
  // Note that these functions can be used for non-integer atomic
  // operations; the values just need to be bitcast to integers on the
  // way in and out.
  //
  // And, then, the generic variants. They look like the following:
  //  void __atomic_load(size_t size, void *ptr, void *ret, int ordering)
  //  void __atomic_store(size_t size, void *ptr, void *val, int ordering)
  //  void __atomic_exchange(size_t size, void *ptr, void *val, void *ret,
  //                         int ordering)
  //  bool __atomic_compare_exchange(size_t size, void *ptr, void *expected,
  //                                 void *desired, int success_order,
  //                                 int failure_order)
  //
  // The different signatures are built up depending on the
  // 'UseSizedLibcall', 'CASExpected', 'ValueOperand', and 'HasResult'
  // variables.

  AllocaInst *AllocaCASExpected = nullptr;
  Value *AllocaCASExpected_i8 = nullptr;
  AllocaInst *AllocaValue = nullptr;
  Value *AllocaValue_i8 = nullptr;
  AllocaInst *AllocaResult = nullptr;
  Value *AllocaResult_i8 = nullptr;

  Type *ResultTy;
  SmallVector<Value *, 6> Args;
  AttributeSet Attr;

  // 'size' argument.
  if (!UseSizedLibcall) {
    // Note, getIntPtrType is assumed equivalent to size_t.
    Args.push_back(ConstantInt::get(DL.getIntPtrType(Ctx), Size));
  }

  // 'ptr' argument.
  Value *PtrVal =
      Builder.CreateBitCast(PointerOperand, Type::getInt8PtrTy(Ctx));
  Args.push_back(PtrVal);

  // 'expected' argument, if present.
  if (CASExpected) {
    AllocaCASExpected = AllocaBuilder.CreateAlloca(CASExpected->getType());
    AllocaCASExpected->setAlignment(AllocaAlignment);
    AllocaCASExpected_i8 =
        Builder.CreateBitCast(AllocaCASExpected, Type::getInt8PtrTy(Ctx));
    Builder.CreateLifetimeStart(AllocaCASExpected_i8, SizeVal64);
    Builder.CreateAlignedStore(CASExpected, AllocaCASExpected, AllocaAlignment);
    Args.push_back(AllocaCASExpected_i8);
  }

  // 'val' argument ('desired' for cas), if present.
  if (ValueOperand) {
    if (UseSizedLibcall) {
      Value *IntValue =
          Builder.CreateBitOrPointerCast(ValueOperand, SizedIntTy);
      Args.push_back(IntValue);
    } else {
      AllocaValue = AllocaBuilder.CreateAlloca(ValueOperand->getType());
      AllocaValue->setAlignment(AllocaAlignment);
      AllocaValue_i8 =
          Builder.CreateBitCast(AllocaValue, Type::getInt8PtrTy(Ctx));
      Builder.CreateLifetimeStart(AllocaValue_i8, SizeVal64);
      Builder.CreateAlignedStore(ValueOperand, AllocaValue, AllocaAlignment);
      Args.push_back(AllocaValue_i8);
    }
  }

  // 'ret' argument.
  if (!CASExpected && HasResult && !UseSizedLibcall) {
    AllocaResult = AllocaBuilder.CreateAlloca(I->getType());
    AllocaResult->setAlignment(AllocaAlignment);
    AllocaResult_i8 =
        Builder.CreateBitCast(AllocaResult, Type::getInt8PtrTy(Ctx));
    Builder.CreateLifetimeStart(AllocaResult_i8, SizeVal64);
    Args.push_back(AllocaResult_i8);
  }

  // 'ordering' ('success_order' for cas) argument.
  Args.push_back(OrderingVal);

  // 'failure_order' argument, if present.
  if (Ordering2Val)
    Args.push_back(Ordering2Val);

  // Now, the return type.
  if (CASExpected) {
    ResultTy = Type::getInt1Ty(Ctx);
    Attr = Attr.addAttribute(Ctx, AttributeSet::ReturnIndex, Attribute::ZExt);
  } else if (HasResult && UseSizedLibcall)
    ResultTy = SizedIntTy;
  else
    ResultTy = Type::getVoidTy(Ctx);

  // Done with setting up arguments and return types, create the call:
  SmallVector<Type *, 6> ArgTys;
  for (Value *Arg : Args)
    ArgTys.push_back(Arg->getType());
  FunctionType *FnType = FunctionType::get(ResultTy, ArgTys, false);
  Constant *LibcallFn =
      M->getOrInsertFunction(TLI->getLibcallName(RTLibType), FnType, Attr);
  CallInst *Call = Builder.CreateCall(LibcallFn, Args);
  Call->setAttributes(Attr);
  Value *Result = Call;

  // And then, extract the results...
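  // For the generic (non-sized) calls, the value operand was passed indirectly
  // through a temporary alloca, whose lifetime can end now that the call has
  // been emitted.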
  if (ValueOperand && !UseSizedLibcall)
    Builder.CreateLifetimeEnd(AllocaValue_i8, SizeVal64);

  if (CASExpected) {
    // The final result from the CAS is {load of 'expected' alloca, bool result
    // from call}.
    Type *FinalResultTy = I->getType();
    Value *V = UndefValue::get(FinalResultTy);
    Value *ExpectedOut =
        Builder.CreateAlignedLoad(AllocaCASExpected, AllocaAlignment);
    Builder.CreateLifetimeEnd(AllocaCASExpected_i8, SizeVal64);
    V = Builder.CreateInsertValue(V, ExpectedOut, 0);
    V = Builder.CreateInsertValue(V, Result, 1);
    I->replaceAllUsesWith(V);
  } else if (HasResult) {
    Value *V;
    if (UseSizedLibcall)
      V = Builder.CreateBitOrPointerCast(Result, I->getType());
    else {
      V = Builder.CreateAlignedLoad(AllocaResult, AllocaAlignment);
      Builder.CreateLifetimeEnd(AllocaResult_i8, SizeVal64);
    }
    I->replaceAllUsesWith(V);
  }
  I->eraseFromParent();
  return true;
}