//===-- AtomicExpandPass.cpp - Expand atomic instructions -------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass (at IR level) to replace atomic instructions with
// __atomic_* library calls, or target-specific instructions which implement
// the same semantics in a way which better fits the target backend. This can
// include the use of (intrinsic-based) load-linked/store-conditional loops,
// AtomicCmpXchg, or type coercions.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/AtomicExpandUtils.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSubtargetInfo.h"

using namespace llvm;

#define DEBUG_TYPE "atomic-expand"

namespace {
  class AtomicExpand: public FunctionPass {
    const TargetLowering *TLI;
  public:
    static char ID; // Pass identification, replacement for typeid
    AtomicExpand() : FunctionPass(ID), TLI(nullptr) {
      initializeAtomicExpandPass(*PassRegistry::getPassRegistry());
    }

    bool runOnFunction(Function &F) override;

  private:
    bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);
    IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
    LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
    bool tryExpandAtomicLoad(LoadInst *LI);
    bool expandAtomicLoadToLL(LoadInst *LI);
    bool expandAtomicLoadToCmpXchg(LoadInst *LI);
    StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI);
    bool expandAtomicStore(StoreInst *SI);
    bool tryExpandAtomicRMW(AtomicRMWInst *AI);
    Value *
    insertRMWLLSCLoop(IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
                      AtomicOrdering MemOpOrder,
                      function_ref<Value *(IRBuilder<> &, Value *)> PerformOp);
    void expandAtomicOpToLLSC(
        Instruction *I, Type *ResultTy, Value *Addr, AtomicOrdering MemOpOrder,
        function_ref<Value *(IRBuilder<> &, Value *)> PerformOp);
    void expandPartwordAtomicRMW(
        AtomicRMWInst *I,
        TargetLoweringBase::AtomicExpansionKind ExpansionKind);
    void expandPartwordCmpXchg(AtomicCmpXchgInst *I);

    AtomicCmpXchgInst *convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI);
    static Value *insertRMWCmpXchgLoop(
        IRBuilder<> &Builder, Type *ResultType, Value *Addr,
        AtomicOrdering MemOpOrder,
        function_ref<Value *(IRBuilder<> &, Value *)> PerformOp,
        CreateCmpXchgInstFun CreateCmpXchg);

    bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
    bool isIdempotentRMW(AtomicRMWInst *AI);
    bool simplifyIdempotentRMW(AtomicRMWInst *AI);

    bool expandAtomicOpToLibcall(Instruction *I, unsigned Size, unsigned Align,
                                 Value *PointerOperand, Value *ValueOperand,
                                 Value *CASExpected, AtomicOrdering Ordering,
                                 AtomicOrdering Ordering2,
                                 ArrayRef<RTLIB::Libcall> Libcalls);
    void expandAtomicLoadToLibcall(LoadInst *LI);
    void expandAtomicStoreToLibcall(StoreInst *LI);
    void expandAtomicRMWToLibcall(AtomicRMWInst *I);
    void expandAtomicCASToLibcall(AtomicCmpXchgInst *I);

    friend bool
    llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
                                   CreateCmpXchgInstFun CreateCmpXchg);
  };
}

char AtomicExpand::ID = 0;
char &llvm::AtomicExpandID = AtomicExpand::ID;
INITIALIZE_PASS(AtomicExpand, DEBUG_TYPE, "Expand Atomic instructions",
                false, false)

FunctionPass *llvm::createAtomicExpandPass() { return new AtomicExpand(); }

namespace {
// Helper functions to retrieve the size of atomic instructions.
unsigned getAtomicOpSize(LoadInst *LI) {
  const DataLayout &DL = LI->getModule()->getDataLayout();
  return DL.getTypeStoreSize(LI->getType());
}

unsigned getAtomicOpSize(StoreInst *SI) {
  const DataLayout &DL = SI->getModule()->getDataLayout();
  return DL.getTypeStoreSize(SI->getValueOperand()->getType());
}

unsigned getAtomicOpSize(AtomicRMWInst *RMWI) {
  const DataLayout &DL = RMWI->getModule()->getDataLayout();
  return DL.getTypeStoreSize(RMWI->getValOperand()->getType());
}

unsigned getAtomicOpSize(AtomicCmpXchgInst *CASI) {
  const DataLayout &DL = CASI->getModule()->getDataLayout();
  return DL.getTypeStoreSize(CASI->getCompareOperand()->getType());
}

// Helper functions to retrieve the alignment of atomic instructions.
unsigned getAtomicOpAlign(LoadInst *LI) {
  unsigned Align = LI->getAlignment();
  // In the future, if this IR restriction is relaxed, we should
  // return DataLayout::getABITypeAlignment when there's no align
  // value.
  assert(Align != 0 && "An atomic LoadInst always has an explicit alignment");
  return Align;
}

unsigned getAtomicOpAlign(StoreInst *SI) {
  unsigned Align = SI->getAlignment();
  // In the future, if this IR restriction is relaxed, we should
  // return DataLayout::getABITypeAlignment when there's no align
  // value.
  assert(Align != 0 && "An atomic StoreInst always has an explicit alignment");
  return Align;
}

unsigned getAtomicOpAlign(AtomicRMWInst *RMWI) {
  // TODO(PR27168): This instruction has no alignment attribute, but unlike the
  // default alignment for load/store, the default here is to assume
  // it has NATURAL alignment, not DataLayout-specified alignment.
  const DataLayout &DL = RMWI->getModule()->getDataLayout();
  return DL.getTypeStoreSize(RMWI->getValOperand()->getType());
}

unsigned getAtomicOpAlign(AtomicCmpXchgInst *CASI) {
  // TODO(PR27168): same comment as above.
  const DataLayout &DL = CASI->getModule()->getDataLayout();
  return DL.getTypeStoreSize(CASI->getCompareOperand()->getType());
}

// Determine if a particular atomic operation has a supported size,
// and is of appropriate alignment, to be passed through for target
// lowering. (Versus turning into a __atomic libcall)
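//
// For illustration only (example values assumed, not taken from this file):
// on a target whose getMaxAtomicSizeInBitsSupported() is 64, an i32 load with
// align 4 passes the check below (4 >= 4 and 4 <= 8), while an i64 load with
// align 4 or any 16-byte operation fails it and is expanded to a libcall.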
template <typename Inst>
bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {
  unsigned Size = getAtomicOpSize(I);
  unsigned Align = getAtomicOpAlign(I);
  return Align >= Size && Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
}

} // end anonymous namespace

bool AtomicExpand::runOnFunction(Function &F) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  auto &TM = TPC->getTM<TargetMachine>();
  if (!TM.getSubtargetImpl(F)->enableAtomicExpand())
    return false;
  TLI = TM.getSubtargetImpl(F)->getTargetLowering();

  SmallVector<Instruction *, 1> AtomicInsts;

  // Changing control-flow while iterating through it is a bad idea, so gather a
  // list of all atomic instructions before we start.
  for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) {
    Instruction *I = &*II;
    if (I->isAtomic() && !isa<FenceInst>(I))
      AtomicInsts.push_back(I);
  }

  bool MadeChange = false;
  for (auto I : AtomicInsts) {
    auto LI = dyn_cast<LoadInst>(I);
    auto SI = dyn_cast<StoreInst>(I);
    auto RMWI = dyn_cast<AtomicRMWInst>(I);
    auto CASI = dyn_cast<AtomicCmpXchgInst>(I);
    assert((LI || SI || RMWI || CASI) && "Unknown atomic instruction");

    // If the Size/Alignment is not supported, replace with a libcall.
    if (LI) {
      if (!atomicSizeSupported(TLI, LI)) {
        expandAtomicLoadToLibcall(LI);
        MadeChange = true;
        continue;
      }
    } else if (SI) {
      if (!atomicSizeSupported(TLI, SI)) {
        expandAtomicStoreToLibcall(SI);
        MadeChange = true;
        continue;
      }
    } else if (RMWI) {
      if (!atomicSizeSupported(TLI, RMWI)) {
        expandAtomicRMWToLibcall(RMWI);
        MadeChange = true;
        continue;
      }
    } else if (CASI) {
      if (!atomicSizeSupported(TLI, CASI)) {
        expandAtomicCASToLibcall(CASI);
        MadeChange = true;
        continue;
      }
    }

    if (TLI->shouldInsertFencesForAtomic(I)) {
      auto FenceOrdering = AtomicOrdering::Monotonic;
      if (LI && isAcquireOrStronger(LI->getOrdering())) {
        FenceOrdering = LI->getOrdering();
        LI->setOrdering(AtomicOrdering::Monotonic);
      } else if (SI && isReleaseOrStronger(SI->getOrdering())) {
        FenceOrdering = SI->getOrdering();
        SI->setOrdering(AtomicOrdering::Monotonic);
      } else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) ||
                          isAcquireOrStronger(RMWI->getOrdering()))) {
        FenceOrdering = RMWI->getOrdering();
        RMWI->setOrdering(AtomicOrdering::Monotonic);
      } else if (CASI && !TLI->shouldExpandAtomicCmpXchgInIR(CASI) &&
                 (isReleaseOrStronger(CASI->getSuccessOrdering()) ||
                  isAcquireOrStronger(CASI->getSuccessOrdering()))) {
        // If a compare and swap is lowered to LL/SC, we can do smarter fence
        // insertion, with a stronger one on the success path than on the
        // failure path. As a result, fence insertion is directly done by
        // expandAtomicCmpXchg in that case.
        FenceOrdering = CASI->getSuccessOrdering();
        CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
        CASI->setFailureOrdering(AtomicOrdering::Monotonic);
      }

      if (FenceOrdering != AtomicOrdering::Monotonic) {
        MadeChange |= bracketInstWithFences(I, FenceOrdering);
      }
    }

    if (LI) {
      if (LI->getType()->isFloatingPointTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        LI = convertAtomicLoadToIntegerType(LI);
        assert(LI->getType()->isIntegerTy() && "invariant broken");
        MadeChange = true;
      }

      MadeChange |= tryExpandAtomicLoad(LI);
    } else if (SI) {
      if (SI->getValueOperand()->getType()->isFloatingPointTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        SI = convertAtomicStoreToIntegerType(SI);
        assert(SI->getValueOperand()->getType()->isIntegerTy() &&
               "invariant broken");
        MadeChange = true;
      }

      if (TLI->shouldExpandAtomicStoreInIR(SI))
        MadeChange |= expandAtomicStore(SI);
    } else if (RMWI) {
      // There are two different ways of expanding RMW instructions:
      // - into a load if it is idempotent
      // - into a Cmpxchg/LL-SC loop otherwise
      // we try them in that order.

      if (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) {
        MadeChange = true;
      } else {
        MadeChange |= tryExpandAtomicRMW(RMWI);
      }
    } else if (CASI) {
      // TODO: when we're ready to make the change at the IR level, we can
      // extend convertCmpXchgToInteger for floating point too.
      assert(!CASI->getCompareOperand()->getType()->isFloatingPointTy() &&
             "unimplemented - floating point not legal at IR level");
      if (CASI->getCompareOperand()->getType()->isPointerTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        CASI = convertCmpXchgToIntegerType(CASI);
        assert(CASI->getCompareOperand()->getType()->isIntegerTy() &&
               "invariant broken");
        MadeChange = true;
      }

      unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
      unsigned ValueSize = getAtomicOpSize(CASI);
      if (ValueSize < MinCASSize) {
        assert(!TLI->shouldExpandAtomicCmpXchgInIR(CASI) &&
               "MinCmpXchgSizeInBits not yet supported for LL/SC expansions.");
        expandPartwordCmpXchg(CASI);
      } else {
        if (TLI->shouldExpandAtomicCmpXchgInIR(CASI))
          MadeChange |= expandAtomicCmpXchg(CASI);
      }
    }
  }
  return MadeChange;
}

bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order) {
  IRBuilder<> Builder(I);

  auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order);

  auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order);
  // The trailing fence is emitted before the instruction instead of after
  // because there is no easy way of setting Builder insertion point after
  // an instruction. So we must erase it from the BB, and insert it back
  // in the right place.
  // We have a guard here because not every atomic operation generates a
  // trailing fence.
  if (TrailingFence) {
    TrailingFence->removeFromParent();
    TrailingFence->insertAfter(I);
  }

  return (LeadingFence || TrailingFence);
}

/// Get the iX type with the same bitwidth as T.
IntegerType *AtomicExpand::getCorrespondingIntegerType(Type *T,
                                                       const DataLayout &DL) {
  EVT VT = TLI->getValueType(DL, T);
  unsigned BitWidth = VT.getStoreSizeInBits();
  assert(BitWidth == VT.getSizeInBits() && "must be a power of two");
  return IntegerType::get(T->getContext(), BitWidth);
}

/// Convert an atomic load of a non-integral type to an integer load of the
/// equivalent bitwidth. See the function comment on
/// convertAtomicStoreToIntegerType for background.
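///
/// Purely as an illustration (the IR below is assumed, not taken from a
/// test): "%v = load atomic float, float* %p seq_cst, align 4" becomes a
/// bitcast of %p to i32*, an atomic i32 load with the same ordering and
/// alignment, and a bitcast of the loaded i32 back to float.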
LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) {
  auto *M = LI->getModule();
  Type *NewTy = getCorrespondingIntegerType(LI->getType(),
                                            M->getDataLayout());

  IRBuilder<> Builder(LI);

  Value *Addr = LI->getPointerOperand();
  Type *PT = PointerType::get(NewTy,
                              Addr->getType()->getPointerAddressSpace());
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);

  auto *NewLI = Builder.CreateLoad(NewAddr);
  NewLI->setAlignment(LI->getAlignment());
  NewLI->setVolatile(LI->isVolatile());
  NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID());
  DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");

  Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType());
  LI->replaceAllUsesWith(NewVal);
  LI->eraseFromParent();
  return NewLI;
}

bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) {
  switch (TLI->shouldExpandAtomicLoadInIR(LI)) {
  case TargetLoweringBase::AtomicExpansionKind::None:
    return false;
  case TargetLoweringBase::AtomicExpansionKind::LLSC:
    expandAtomicOpToLLSC(
        LI, LI->getType(), LI->getPointerOperand(), LI->getOrdering(),
        [](IRBuilder<> &Builder, Value *Loaded) { return Loaded; });
    return true;
  case TargetLoweringBase::AtomicExpansionKind::LLOnly:
    return expandAtomicLoadToLL(LI);
  case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
    return expandAtomicLoadToCmpXchg(LI);
  }
  llvm_unreachable("Unhandled case in tryExpandAtomicLoad");
}

bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) {
  IRBuilder<> Builder(LI);

  // On some architectures, load-linked instructions are atomic for larger
  // sizes than normal loads. For example, the only 64-bit load guaranteed
  // to be single-copy atomic by ARM is an ldrexd (A3.5.3).
  Value *Val =
      TLI->emitLoadLinked(Builder, LI->getPointerOperand(), LI->getOrdering());
  TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);

  LI->replaceAllUsesWith(Val);
  LI->eraseFromParent();

  return true;
}

bool AtomicExpand::expandAtomicLoadToCmpXchg(LoadInst *LI) {
  IRBuilder<> Builder(LI);
  AtomicOrdering Order = LI->getOrdering();
  Value *Addr = LI->getPointerOperand();
  Type *Ty = cast<PointerType>(Addr->getType())->getElementType();
  Constant *DummyVal = Constant::getNullValue(Ty);

  Value *Pair = Builder.CreateAtomicCmpXchg(
      Addr, DummyVal, DummyVal, Order,
      AtomicCmpXchgInst::getStrongestFailureOrdering(Order));
  Value *Loaded = Builder.CreateExtractValue(Pair, 0, "loaded");

  LI->replaceAllUsesWith(Loaded);
  LI->eraseFromParent();

  return true;
}

/// Convert an atomic store of a non-integral type to an integer store of the
/// equivalent bitwidth. We used to not support floating point or vector
/// atomics in the IR at all. The backends learned to deal with the bitcast
/// idiom because that was the only way of expressing the notion of an atomic
/// float or vector store. The long term plan is to teach each backend to
/// instruction select from the original atomic store, but as a migration
/// mechanism, we convert back to the old format which the backends understand.
/// Each backend will need individual work to recognize the new format.
StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) {
  IRBuilder<> Builder(SI);
  auto *M = SI->getModule();
  Type *NewTy = getCorrespondingIntegerType(SI->getValueOperand()->getType(),
                                            M->getDataLayout());
  Value *NewVal = Builder.CreateBitCast(SI->getValueOperand(), NewTy);

  Value *Addr = SI->getPointerOperand();
  Type *PT = PointerType::get(NewTy,
                              Addr->getType()->getPointerAddressSpace());
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);

  StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr);
  NewSI->setAlignment(SI->getAlignment());
  NewSI->setVolatile(SI->isVolatile());
  NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID());
  DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n");
  SI->eraseFromParent();
  return NewSI;
}

bool AtomicExpand::expandAtomicStore(StoreInst *SI) {
  // This function is only called on atomic stores that are too large to be
  // atomic if implemented as a native store. So we replace them by an
  // atomic swap, that can be implemented for example as a ldrex/strex on ARM
  // or lock cmpxchg8/16b on X86, as these are atomic for larger sizes.
  // It is the responsibility of the target to only signal expansion via
  // shouldExpandAtomicRMW in cases where this is required and possible.
  IRBuilder<> Builder(SI);
  AtomicRMWInst *AI =
      Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(),
                              SI->getValueOperand(), SI->getOrdering());
  SI->eraseFromParent();

  // Now we have an appropriate swap instruction, lower it as usual.
  return tryExpandAtomicRMW(AI);
}

static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr,
                                 Value *Loaded, Value *NewVal,
                                 AtomicOrdering MemOpOrder,
                                 Value *&Success, Value *&NewLoaded) {
  Value *Pair = Builder.CreateAtomicCmpXchg(
      Addr, Loaded, NewVal, MemOpOrder,
      AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
  Success = Builder.CreateExtractValue(Pair, 1, "success");
  NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
}

/// Emit IR to implement the given atomicrmw operation on values in registers,
/// returning the new value.
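///
/// As an illustration (restating the switch below, not new behaviour): for
/// 'nand' this emits ~(Loaded & Inc), and for the min/max variants it emits
/// an icmp of Loaded against Inc followed by a select of the two.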
static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder,
                              Value *Loaded, Value *Inc) {
  Value *NewVal;
  switch (Op) {
  case AtomicRMWInst::Xchg:
    return Inc;
  case AtomicRMWInst::Add:
    return Builder.CreateAdd(Loaded, Inc, "new");
  case AtomicRMWInst::Sub:
    return Builder.CreateSub(Loaded, Inc, "new");
  case AtomicRMWInst::And:
    return Builder.CreateAnd(Loaded, Inc, "new");
  case AtomicRMWInst::Nand:
    return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new");
  case AtomicRMWInst::Or:
    return Builder.CreateOr(Loaded, Inc, "new");
  case AtomicRMWInst::Xor:
    return Builder.CreateXor(Loaded, Inc, "new");
  case AtomicRMWInst::Max:
    NewVal = Builder.CreateICmpSGT(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::Min:
    NewVal = Builder.CreateICmpSLE(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::UMax:
    NewVal = Builder.CreateICmpUGT(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::UMin:
    NewVal = Builder.CreateICmpULE(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  default:
    llvm_unreachable("Unknown atomic op");
  }
}

bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
  switch (TLI->shouldExpandAtomicRMWInIR(AI)) {
  case TargetLoweringBase::AtomicExpansionKind::None:
    return false;
  case TargetLoweringBase::AtomicExpansionKind::LLSC: {
    unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
    unsigned ValueSize = getAtomicOpSize(AI);
    if (ValueSize < MinCASSize) {
      llvm_unreachable(
          "MinCmpXchgSizeInBits not yet supported for LL/SC architectures.");
    } else {
      auto PerformOp = [&](IRBuilder<> &Builder, Value *Loaded) {
        return performAtomicOp(AI->getOperation(), Builder, Loaded,
                               AI->getValOperand());
      };
      expandAtomicOpToLLSC(AI, AI->getType(), AI->getPointerOperand(),
                           AI->getOrdering(), PerformOp);
    }
    return true;
  }
  case TargetLoweringBase::AtomicExpansionKind::CmpXChg: {
    unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
    unsigned ValueSize = getAtomicOpSize(AI);
    if (ValueSize < MinCASSize) {
      expandPartwordAtomicRMW(AI,
                              TargetLoweringBase::AtomicExpansionKind::CmpXChg);
    } else {
      expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun);
    }
    return true;
  }
  default:
    llvm_unreachable("Unhandled case in tryExpandAtomicRMW");
  }
}

namespace {

/// Result values from createMaskInstrs helper.
struct PartwordMaskValues {
  Type *WordType;
  Type *ValueType;
  Value *AlignedAddr;
  Value *ShiftAmt;
  Value *Mask;
  Value *Inv_Mask;
};
} // end anonymous namespace

/// This is a helper function which builds instructions to provide
/// values necessary for partword atomic operations. It takes an
/// incoming address, Addr, and ValueType, and constructs the address,
/// shift-amounts and masks needed to work with a larger value of size
/// WordSize.
///
/// AlignedAddr: Addr rounded down to a multiple of WordSize
///
/// ShiftAmt: Number of bits to right-shift a WordSize value loaded
///           from AlignAddr for it to have the same value as if
///           ValueType was loaded from Addr.
///
/// Mask: Value to mask with the value loaded from AlignAddr to
///       include only the part that would've been loaded from Addr.
///
/// Inv_Mask: The inverse of Mask.
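///
/// A worked example, purely for illustration (the concrete numbers are
/// assumed, not derived from any in-tree target): for an i8 operation on
/// address 0x1003 with a 4-byte WordSize on a little-endian target,
/// AlignedAddr is 0x1000, ShiftAmt is 24, Mask is 0xff000000 and Inv_Mask
/// is 0x00ffffff; on a big-endian target ShiftAmt would instead be 0.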
static PartwordMaskValues createMaskInstrs(IRBuilder<> &Builder, Instruction *I,
                                           Type *ValueType, Value *Addr,
                                           unsigned WordSize) {
  PartwordMaskValues Ret;

  BasicBlock *BB = I->getParent();
  Function *F = BB->getParent();
  Module *M = I->getModule();

  LLVMContext &Ctx = F->getContext();
  const DataLayout &DL = M->getDataLayout();

  unsigned ValueSize = DL.getTypeStoreSize(ValueType);

  assert(ValueSize < WordSize);

  Ret.ValueType = ValueType;
  Ret.WordType = Type::getIntNTy(Ctx, WordSize * 8);

  Type *WordPtrType =
      Ret.WordType->getPointerTo(Addr->getType()->getPointerAddressSpace());

  Value *AddrInt = Builder.CreatePtrToInt(Addr, DL.getIntPtrType(Ctx));
  Ret.AlignedAddr = Builder.CreateIntToPtr(
      Builder.CreateAnd(AddrInt, ~(uint64_t)(WordSize - 1)), WordPtrType,
      "AlignedAddr");

  Value *PtrLSB = Builder.CreateAnd(AddrInt, WordSize - 1, "PtrLSB");
  if (DL.isLittleEndian()) {
    // turn bytes into bits
    Ret.ShiftAmt = Builder.CreateShl(PtrLSB, 3);
  } else {
    // turn bytes into bits, and count from the other side.
    Ret.ShiftAmt =
        Builder.CreateShl(Builder.CreateXor(PtrLSB, WordSize - ValueSize), 3);
  }

  Ret.ShiftAmt = Builder.CreateTrunc(Ret.ShiftAmt, Ret.WordType, "ShiftAmt");
  Ret.Mask = Builder.CreateShl(
      ConstantInt::get(Ret.WordType, (1 << ValueSize * 8) - 1), Ret.ShiftAmt,
      "Mask");
  Ret.Inv_Mask = Builder.CreateNot(Ret.Mask, "Inv_Mask");

  return Ret;
}

/// Emit IR to implement a masked version of a given atomicrmw
/// operation. (That is, only the bits under the Mask should be
/// affected by the operation)
static Value *performMaskedAtomicOp(AtomicRMWInst::BinOp Op,
                                    IRBuilder<> &Builder, Value *Loaded,
                                    Value *Shifted_Inc, Value *Inc,
                                    const PartwordMaskValues &PMV) {
  switch (Op) {
  case AtomicRMWInst::Xchg: {
    Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
    Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, Shifted_Inc);
    return FinalVal;
  }
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
    // Or/Xor won't affect any other bits, so can just be done
    // directly.
    return performAtomicOp(Op, Builder, Loaded, Shifted_Inc);
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Nand: {
    // The other arithmetic ops need to be masked into place.
    Value *NewVal = performAtomicOp(Op, Builder, Loaded, Shifted_Inc);
    Value *NewVal_Masked = Builder.CreateAnd(NewVal, PMV.Mask);
    Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
    Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Masked);
    return FinalVal;
  }
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin: {
    // Finally, comparison ops will operate on the full value, so
    // truncate down to the original size, and expand out again after
    // doing the operation.
    Value *Loaded_Shiftdown = Builder.CreateTrunc(
        Builder.CreateLShr(Loaded, PMV.ShiftAmt), PMV.ValueType);
    Value *NewVal = performAtomicOp(Op, Builder, Loaded_Shiftdown, Inc);
    Value *NewVal_Shiftup = Builder.CreateShl(
        Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt);
    Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
    Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shiftup);
    return FinalVal;
  }
  default:
    llvm_unreachable("Unknown atomic op");
  }
}

/// Expand a sub-word atomicrmw operation into an appropriate
/// word-sized operation.
///
/// It will create an LL/SC or cmpxchg loop, as appropriate, the same
/// way as a typical atomicrmw expansion. The only difference here is
/// that the operation inside of the loop must operate only upon a
/// part of the value.
void AtomicExpand::expandPartwordAtomicRMW(
    AtomicRMWInst *AI, TargetLoweringBase::AtomicExpansionKind ExpansionKind) {

  assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg);

  AtomicOrdering MemOpOrder = AI->getOrdering();

  IRBuilder<> Builder(AI);

  PartwordMaskValues PMV =
      createMaskInstrs(Builder, AI, AI->getType(), AI->getPointerOperand(),
                       TLI->getMinCmpXchgSizeInBits() / 8);

  Value *ValOperand_Shifted =
      Builder.CreateShl(Builder.CreateZExt(AI->getValOperand(), PMV.WordType),
                        PMV.ShiftAmt, "ValOperand_Shifted");

  auto PerformPartwordOp = [&](IRBuilder<> &Builder, Value *Loaded) {
    return performMaskedAtomicOp(AI->getOperation(), Builder, Loaded,
                                 ValOperand_Shifted, AI->getValOperand(), PMV);
  };

  // TODO: When we're ready to support LLSC conversions too, use
  // insertRMWLLSCLoop here for ExpansionKind==LLSC.
  Value *OldResult =
      insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr, MemOpOrder,
                           PerformPartwordOp, createCmpXchgInstFun);
  Value *FinalOldResult = Builder.CreateTrunc(
      Builder.CreateLShr(OldResult, PMV.ShiftAmt), PMV.ValueType);
  AI->replaceAllUsesWith(FinalOldResult);
  AI->eraseFromParent();
}

void AtomicExpand::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) {
  // The basic idea here is that we're expanding a cmpxchg of a
  // smaller memory size up to a word-sized cmpxchg. To do this, we
  // need to add a retry-loop for strong cmpxchg, so that
  // modifications to other parts of the word don't cause a spurious
  // failure.
  // This generates code like the following:
  //     [[Setup mask values PMV.*]]
  //     %NewVal_Shifted = shl i32 %NewVal, %PMV.ShiftAmt
  //     %Cmp_Shifted = shl i32 %Cmp, %PMV.ShiftAmt
  //     %InitLoaded = load i32* %addr
  //     %InitLoaded_MaskOut = and i32 %InitLoaded, %PMV.Inv_Mask
  //     br partword.cmpxchg.loop
  // partword.cmpxchg.loop:
  //     %Loaded_MaskOut = phi i32 [ %InitLoaded_MaskOut, %entry ],
  //        [ %OldVal_MaskOut, %partword.cmpxchg.failure ]
  //     %FullWord_NewVal = or i32 %Loaded_MaskOut, %NewVal_Shifted
  //     %FullWord_Cmp = or i32 %Loaded_MaskOut, %Cmp_Shifted
  //     %NewCI = cmpxchg i32* %PMV.AlignedAddr, i32 %FullWord_Cmp,
  //        i32 %FullWord_NewVal success_ordering failure_ordering
  //     %OldVal = extractvalue { i32, i1 } %NewCI, 0
  //     %Success = extractvalue { i32, i1 } %NewCI, 1
  //     br i1 %Success, label %partword.cmpxchg.end,
  //        label %partword.cmpxchg.failure
  // partword.cmpxchg.failure:
  //     %OldVal_MaskOut = and i32 %OldVal, %PMV.Inv_Mask
  //     %ShouldContinue = icmp ne i32 %Loaded_MaskOut, %OldVal_MaskOut
  //     br i1 %ShouldContinue, label %partword.cmpxchg.loop,
  //        label %partword.cmpxchg.end
  // partword.cmpxchg.end:
  //     %tmp1 = lshr i32 %OldVal, %PMV.ShiftAmt
  //     %FinalOldVal = trunc i32 %tmp1 to i8
  //     %tmp2 = insertvalue { i8, i1 } undef, i8 %FinalOldVal, 0
  //     %Res = insertvalue { i8, i1 } %tmp2, i1 %Success, 1

  Value *Addr = CI->getPointerOperand();
  Value *Cmp = CI->getCompareOperand();
  Value *NewVal = CI->getNewValOperand();

  BasicBlock *BB = CI->getParent();
  Function *F = BB->getParent();
  IRBuilder<> Builder(CI);
  LLVMContext &Ctx = Builder.getContext();

  const int WordSize = TLI->getMinCmpXchgSizeInBits() / 8;

  BasicBlock *EndBB =
      BB->splitBasicBlock(CI->getIterator(), "partword.cmpxchg.end");
  auto FailureBB =
      BasicBlock::Create(Ctx, "partword.cmpxchg.failure", F, EndBB);
  auto LoopBB = BasicBlock::Create(Ctx, "partword.cmpxchg.loop", F, FailureBB);

  // The split call above "helpfully" added a branch at the end of BB
  // (to the wrong place).
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);

  PartwordMaskValues PMV = createMaskInstrs(
      Builder, CI, CI->getCompareOperand()->getType(), Addr, WordSize);

  // Shift the incoming values over, into the right location in the word.
  Value *NewVal_Shifted =
      Builder.CreateShl(Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt);
  Value *Cmp_Shifted =
      Builder.CreateShl(Builder.CreateZExt(Cmp, PMV.WordType), PMV.ShiftAmt);

  // Load the entire current word, and mask into place the expected and new
  // values
  LoadInst *InitLoaded = Builder.CreateLoad(PMV.WordType, PMV.AlignedAddr);
  InitLoaded->setVolatile(CI->isVolatile());
  Value *InitLoaded_MaskOut = Builder.CreateAnd(InitLoaded, PMV.Inv_Mask);
  Builder.CreateBr(LoopBB);

  // partword.cmpxchg.loop:
  Builder.SetInsertPoint(LoopBB);
  PHINode *Loaded_MaskOut = Builder.CreatePHI(PMV.WordType, 2);
  Loaded_MaskOut->addIncoming(InitLoaded_MaskOut, BB);

  // Mask/Or the expected and new values into place in the loaded word.
  Value *FullWord_NewVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shifted);
  Value *FullWord_Cmp = Builder.CreateOr(Loaded_MaskOut, Cmp_Shifted);
  AtomicCmpXchgInst *NewCI = Builder.CreateAtomicCmpXchg(
      PMV.AlignedAddr, FullWord_Cmp, FullWord_NewVal, CI->getSuccessOrdering(),
      CI->getFailureOrdering(), CI->getSyncScopeID());
  NewCI->setVolatile(CI->isVolatile());
  // When we're building a strong cmpxchg, we need a loop, so you
  // might think we could use a weak cmpxchg inside. But, using strong
  // allows the below comparison for ShouldContinue, and we're
  // expecting the underlying cmpxchg to be a machine instruction,
  // which is strong anyways.
  NewCI->setWeak(CI->isWeak());

  Value *OldVal = Builder.CreateExtractValue(NewCI, 0);
  Value *Success = Builder.CreateExtractValue(NewCI, 1);

  if (CI->isWeak())
    Builder.CreateBr(EndBB);
  else
    Builder.CreateCondBr(Success, EndBB, FailureBB);

  // partword.cmpxchg.failure:
  Builder.SetInsertPoint(FailureBB);
  // Upon failure, verify that the masked-out part of the loaded value
  // has been modified. If it didn't, abort the cmpxchg, since the
  // masked-in part must've.
  Value *OldVal_MaskOut = Builder.CreateAnd(OldVal, PMV.Inv_Mask);
  Value *ShouldContinue = Builder.CreateICmpNE(Loaded_MaskOut, OldVal_MaskOut);
  Builder.CreateCondBr(ShouldContinue, LoopBB, EndBB);

  // Add the second value to the phi from above
  Loaded_MaskOut->addIncoming(OldVal_MaskOut, FailureBB);

  // partword.cmpxchg.end:
  Builder.SetInsertPoint(CI);

  Value *FinalOldVal = Builder.CreateTrunc(
      Builder.CreateLShr(OldVal, PMV.ShiftAmt), PMV.ValueType);
  Value *Res = UndefValue::get(CI->getType());
  Res = Builder.CreateInsertValue(Res, FinalOldVal, 0);
  Res = Builder.CreateInsertValue(Res, Success, 1);

  CI->replaceAllUsesWith(Res);
  CI->eraseFromParent();
}

void AtomicExpand::expandAtomicOpToLLSC(
    Instruction *I, Type *ResultType, Value *Addr, AtomicOrdering MemOpOrder,
    function_ref<Value *(IRBuilder<> &, Value *)> PerformOp) {
  IRBuilder<> Builder(I);
  Value *Loaded =
      insertRMWLLSCLoop(Builder, ResultType, Addr, MemOpOrder, PerformOp);

  I->replaceAllUsesWith(Loaded);
  I->eraseFromParent();
}

Value *AtomicExpand::insertRMWLLSCLoop(
    IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
    AtomicOrdering MemOpOrder,
    function_ref<Value *(IRBuilder<> &, Value *)> PerformOp) {
  LLVMContext &Ctx = Builder.getContext();
  BasicBlock *BB = Builder.GetInsertBlock();
  Function *F = BB->getParent();

  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
  //
  // The standard expansion we produce is:
  //     [...]
  // atomicrmw.start:
  //     %loaded = @load.linked(%addr)
  //     %new = some_op iN %loaded, %incr
  //     %stored = @store_conditional(%new, %addr)
  //     %try_again = icmp i32 ne %stored, 0
  //     br i1 %try_again, label %loop, label %atomicrmw.end
  // atomicrmw.end:
  //     [...]
  BasicBlock *ExitBB =
      BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
  BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place).
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  Builder.CreateBr(LoopBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(LoopBB);
  Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);

  Value *NewVal = PerformOp(Builder, Loaded);

  Value *StoreSuccess =
      TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder);
  Value *TryAgain = Builder.CreateICmpNE(
      StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain");
  Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);

  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
  return Loaded;
}

/// Convert an atomic cmpxchg of a non-integral type to an integer cmpxchg of
/// the equivalent bitwidth. We used to not support pointer cmpxchg in the
/// IR. As a migration step, we convert back to what used to be the standard
/// way to represent a pointer cmpxchg so that we can update backends one by
/// one.
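///
/// Illustrative example only (assumed IR, 64-bit pointers): a
/// "cmpxchg i8** %p, i8* %old, i8* %new seq_cst seq_cst" becomes ptrtoint
/// casts of %old and %new to i64, an i64 cmpxchg on the bitcast pointer, and
/// an inttoptr of the loaded result back to i8*.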
AtomicCmpXchgInst *AtomicExpand::convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI) {
  auto *M = CI->getModule();
  Type *NewTy = getCorrespondingIntegerType(CI->getCompareOperand()->getType(),
                                            M->getDataLayout());

  IRBuilder<> Builder(CI);

  Value *Addr = CI->getPointerOperand();
  Type *PT = PointerType::get(NewTy,
                              Addr->getType()->getPointerAddressSpace());
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);

  Value *NewCmp = Builder.CreatePtrToInt(CI->getCompareOperand(), NewTy);
  Value *NewNewVal = Builder.CreatePtrToInt(CI->getNewValOperand(), NewTy);

  auto *NewCI = Builder.CreateAtomicCmpXchg(NewAddr, NewCmp, NewNewVal,
                                            CI->getSuccessOrdering(),
                                            CI->getFailureOrdering(),
                                            CI->getSyncScopeID());
  NewCI->setVolatile(CI->isVolatile());
  NewCI->setWeak(CI->isWeak());
  DEBUG(dbgs() << "Replaced " << *CI << " with " << *NewCI << "\n");

  Value *OldVal = Builder.CreateExtractValue(NewCI, 0);
  Value *Succ = Builder.CreateExtractValue(NewCI, 1);

  OldVal = Builder.CreateIntToPtr(OldVal, CI->getCompareOperand()->getType());

  Value *Res = UndefValue::get(CI->getType());
  Res = Builder.CreateInsertValue(Res, OldVal, 0);
  Res = Builder.CreateInsertValue(Res, Succ, 1);

  CI->replaceAllUsesWith(Res);
  CI->eraseFromParent();
  return NewCI;
}

bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
  AtomicOrdering SuccessOrder = CI->getSuccessOrdering();
  AtomicOrdering FailureOrder = CI->getFailureOrdering();
  Value *Addr = CI->getPointerOperand();
  BasicBlock *BB = CI->getParent();
  Function *F = BB->getParent();
  LLVMContext &Ctx = F->getContext();
  // If shouldInsertFencesForAtomic() returns true, then the target does not
  // want to deal with memory orders, and emitLeading/TrailingFence should take
  // care of everything. Otherwise, emitLeading/TrailingFence are no-op and we
  // should preserve the ordering.
  bool ShouldInsertFencesForAtomic = TLI->shouldInsertFencesForAtomic(CI);
  AtomicOrdering MemOpOrder =
      ShouldInsertFencesForAtomic ? AtomicOrdering::Monotonic : SuccessOrder;

  // In implementations which use a barrier to achieve release semantics, we can
  // delay emitting this barrier until we know a store is actually going to be
  // attempted. The cost of this delay is that we need 2 copies of the block
  // emitting the load-linked, affecting code size.
  //
  // Ideally, this logic would be unconditional except for the minsize check
  // since in other cases the extra blocks naturally collapse down to the
  // minimal loop. Unfortunately, this puts too much stress on later
  // optimisations so we avoid emitting the extra logic in those cases too.
  bool HasReleasedLoadBB = !CI->isWeak() && ShouldInsertFencesForAtomic &&
                           SuccessOrder != AtomicOrdering::Monotonic &&
                           SuccessOrder != AtomicOrdering::Acquire &&
                           !F->optForMinSize();

  // There's no overhead for sinking the release barrier in a weak cmpxchg, so
  // do it even on minsize.
  bool UseUnconditionalReleaseBarrier = F->optForMinSize() && !CI->isWeak();

  // Given: cmpxchg some_op iN* %addr, iN %desired, iN %new success_ord fail_ord
  //
  // The full expansion we produce is:
  //     [...]
  // cmpxchg.start:
  //     %unreleasedload = @load.linked(%addr)
  //     %should_store = icmp eq %unreleasedload, %desired
  //     br i1 %should_store, label %cmpxchg.fencedstore,
  //                          label %cmpxchg.nostore
  // cmpxchg.releasingstore:
  //     fence?
  //     br label cmpxchg.trystore
  // cmpxchg.trystore:
  //     %loaded.trystore = phi [%unreleasedload, %releasingstore],
  //                            [%releasedload, %cmpxchg.releasedload]
  //     %stored = @store_conditional(%new, %addr)
  //     %success = icmp eq i32 %stored, 0
  //     br i1 %success, label %cmpxchg.success,
  //                     label %cmpxchg.releasedload/%cmpxchg.failure
  // cmpxchg.releasedload:
  //     %releasedload = @load.linked(%addr)
  //     %should_store = icmp eq %releasedload, %desired
  //     br i1 %should_store, label %cmpxchg.trystore,
  //                          label %cmpxchg.failure
  // cmpxchg.success:
  //     fence?
  //     br label %cmpxchg.end
  // cmpxchg.nostore:
  //     %loaded.nostore = phi [%unreleasedload, %cmpxchg.start],
  //                           [%releasedload,
  //                               %cmpxchg.releasedload/%cmpxchg.trystore]
  //     @load_linked_fail_balance()?
  //     br label %cmpxchg.failure
  // cmpxchg.failure:
  //     fence?
  //     br label %cmpxchg.end
  // cmpxchg.end:
  //     %loaded = phi [%loaded.nostore, %cmpxchg.failure],
  //                   [%loaded.trystore, %cmpxchg.trystore]
  //     %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure]
  //     %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0
  //     %res = insertvalue { iN, i1 } %restmp, i1 %success, 1
  //     [...]
  BasicBlock *ExitBB = BB->splitBasicBlock(CI->getIterator(), "cmpxchg.end");
  auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB);
  auto NoStoreBB = BasicBlock::Create(Ctx, "cmpxchg.nostore", F, FailureBB);
  auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, NoStoreBB);
  auto ReleasedLoadBB =
      BasicBlock::Create(Ctx, "cmpxchg.releasedload", F, SuccessBB);
  auto TryStoreBB =
      BasicBlock::Create(Ctx, "cmpxchg.trystore", F, ReleasedLoadBB);
  auto ReleasingStoreBB =
      BasicBlock::Create(Ctx, "cmpxchg.fencedstore", F, TryStoreBB);
  auto StartBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, ReleasingStoreBB);

  // This grabs the DebugLoc from CI
  IRBuilder<> Builder(CI);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we might want a fence too. It's easiest to just remove
  // the branch entirely.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier)
    TLI->emitLeadingFence(Builder, CI, SuccessOrder);
  Builder.CreateBr(StartBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(StartBB);
  Value *UnreleasedLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
  Value *ShouldStore = Builder.CreateICmpEQ(
      UnreleasedLoad, CI->getCompareOperand(), "should_store");

  // If the cmpxchg doesn't actually need any ordering when it fails, we can
  // jump straight past that fence instruction (if it exists).
  Builder.CreateCondBr(ShouldStore, ReleasingStoreBB, NoStoreBB);

  Builder.SetInsertPoint(ReleasingStoreBB);
  if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier)
    TLI->emitLeadingFence(Builder, CI, SuccessOrder);
  Builder.CreateBr(TryStoreBB);

  Builder.SetInsertPoint(TryStoreBB);
  Value *StoreSuccess = TLI->emitStoreConditional(
      Builder, CI->getNewValOperand(), Addr, MemOpOrder);
  StoreSuccess = Builder.CreateICmpEQ(
      StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
  BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB;
  Builder.CreateCondBr(StoreSuccess, SuccessBB,
                       CI->isWeak() ? FailureBB : RetryBB);

  Builder.SetInsertPoint(ReleasedLoadBB);
  Value *SecondLoad;
  if (HasReleasedLoadBB) {
    SecondLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
    ShouldStore = Builder.CreateICmpEQ(SecondLoad, CI->getCompareOperand(),
                                       "should_store");

    // If the cmpxchg doesn't actually need any ordering when it fails, we can
    // jump straight past that fence instruction (if it exists).
    Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB);
  } else
    Builder.CreateUnreachable();

  // Make sure later instructions don't get reordered with a fence if
  // necessary.
  Builder.SetInsertPoint(SuccessBB);
  if (ShouldInsertFencesForAtomic)
    TLI->emitTrailingFence(Builder, CI, SuccessOrder);
  Builder.CreateBr(ExitBB);

  Builder.SetInsertPoint(NoStoreBB);
  // In the failing case, where we don't execute the store-conditional, the
  // target might want to balance out the load-linked with a dedicated
  // instruction (e.g., on ARM, clearing the exclusive monitor).
  TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
  Builder.CreateBr(FailureBB);

  Builder.SetInsertPoint(FailureBB);
  if (ShouldInsertFencesForAtomic)
    TLI->emitTrailingFence(Builder, CI, FailureOrder);
  Builder.CreateBr(ExitBB);

  // Finally, we have control-flow based knowledge of whether the cmpxchg
  // succeeded or not. We expose this to later passes by converting any
  // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate
  // PHI.
  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
  PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2);
  Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB);
  Success->addIncoming(ConstantInt::getFalse(Ctx), FailureBB);

  // Setup the builder so we can create any PHIs we need.
  Value *Loaded;
  if (!HasReleasedLoadBB)
    Loaded = UnreleasedLoad;
  else {
    Builder.SetInsertPoint(TryStoreBB, TryStoreBB->begin());
    PHINode *TryStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
    TryStoreLoaded->addIncoming(UnreleasedLoad, ReleasingStoreBB);
    TryStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);

    Builder.SetInsertPoint(NoStoreBB, NoStoreBB->begin());
    PHINode *NoStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
    NoStoreLoaded->addIncoming(UnreleasedLoad, StartBB);
    NoStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);

    Builder.SetInsertPoint(ExitBB, ++ExitBB->begin());
    PHINode *ExitLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
    ExitLoaded->addIncoming(TryStoreLoaded, SuccessBB);
    ExitLoaded->addIncoming(NoStoreLoaded, FailureBB);

    Loaded = ExitLoaded;
  }

  // Look for any users of the cmpxchg that are just comparing the loaded value
  // against the desired one, and replace them with the CFG-derived version.
  SmallVector<ExtractValueInst *, 2> PrunedInsts;
  for (auto User : CI->users()) {
    ExtractValueInst *EV = dyn_cast<ExtractValueInst>(User);
    if (!EV)
      continue;

    assert(EV->getNumIndices() == 1 && EV->getIndices()[0] <= 1 &&
           "weird extraction from { iN, i1 }");

    if (EV->getIndices()[0] == 0)
      EV->replaceAllUsesWith(Loaded);
    else
      EV->replaceAllUsesWith(Success);

    PrunedInsts.push_back(EV);
  }

  // We can remove the instructions now we're no longer iterating through them.
  for (auto EV : PrunedInsts)
    EV->eraseFromParent();

  if (!CI->use_empty()) {
    // Some use of the full struct return that we don't understand has happened,
    // so we've got to reconstruct it properly.
    Value *Res;
    Res = Builder.CreateInsertValue(UndefValue::get(CI->getType()), Loaded, 0);
    Res = Builder.CreateInsertValue(Res, Success, 1);

    CI->replaceAllUsesWith(Res);
  }

  CI->eraseFromParent();
  return true;
}

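/// Return true when this atomicrmw cannot change the value in memory. For
/// illustration (examples assumed, not from the original comments): adding,
/// or'ing or xor'ing zero, subtracting zero, and and'ing all-ones leave the
/// value unchanged, so e.g. "atomicrmw or i32* %p, i32 0" qualifies and can
/// be lowered to a fenced load by simplifyIdempotentRMW below.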
bool AtomicExpand::isIdempotentRMW(AtomicRMWInst* RMWI) {
  auto C = dyn_cast<ConstantInt>(RMWI->getValOperand());
  if(!C)
    return false;

  AtomicRMWInst::BinOp Op = RMWI->getOperation();
  switch(Op) {
    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Xor:
      return C->isZero();
    case AtomicRMWInst::And:
      return C->isMinusOne();
    // FIXME: we could also treat Min/Max/UMin/UMax by the INT_MIN/INT_MAX/...
    default:
      return false;
  }
}

bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst* RMWI) {
  if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) {
    tryExpandAtomicLoad(ResultingLoad);
    return true;
  }
  return false;
}

Value *AtomicExpand::insertRMWCmpXchgLoop(
    IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
    AtomicOrdering MemOpOrder,
    function_ref<Value *(IRBuilder<> &, Value *)> PerformOp,
    CreateCmpXchgInstFun CreateCmpXchg) {
  LLVMContext &Ctx = Builder.getContext();
  BasicBlock *BB = Builder.GetInsertBlock();
  Function *F = BB->getParent();

  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
  //
  // The standard expansion we produce is:
  //     [...]
  //     %init_loaded = load atomic iN* %addr
  //     br label %loop
  // loop:
  //     %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ]
  //     %new = some_op iN %loaded, %incr
  //     %pair = cmpxchg iN* %addr, iN %loaded, iN %new
  //     %new_loaded = extractvalue { iN, i1 } %pair, 0
  //     %success = extractvalue { iN, i1 } %pair, 1
  //     br i1 %success, label %atomicrmw.end, label %loop
  // atomicrmw.end:
  //     [...]
  BasicBlock *ExitBB =
      BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
  BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we want a load. It's easiest to just remove
  // the branch entirely.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  LoadInst *InitLoaded = Builder.CreateLoad(ResultTy, Addr);
  // Atomics require at least natural alignment.
  InitLoaded->setAlignment(ResultTy->getPrimitiveSizeInBits() / 8);
  Builder.CreateBr(LoopBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(LoopBB);
  PHINode *Loaded = Builder.CreatePHI(ResultTy, 2, "loaded");
  Loaded->addIncoming(InitLoaded, BB);

  Value *NewVal = PerformOp(Builder, Loaded);

  Value *NewLoaded = nullptr;
  Value *Success = nullptr;

  CreateCmpXchg(Builder, Addr, Loaded, NewVal,
                MemOpOrder == AtomicOrdering::Unordered
                    ? AtomicOrdering::Monotonic
                    : MemOpOrder,
                Success, NewLoaded);
  assert(Success && NewLoaded);

  Loaded->addIncoming(NewLoaded, LoopBB);

  Builder.CreateCondBr(Success, ExitBB, LoopBB);

  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
  return NewLoaded;
}

// Note: This function is exposed externally by AtomicExpandUtils.h
bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
                                    CreateCmpXchgInstFun CreateCmpXchg) {
  IRBuilder<> Builder(AI);
  Value *Loaded = AtomicExpand::insertRMWCmpXchgLoop(
      Builder, AI->getType(), AI->getPointerOperand(), AI->getOrdering(),
      [&](IRBuilder<> &Builder, Value *Loaded) {
        return performAtomicOp(AI->getOperation(), Builder, Loaded,
                               AI->getValOperand());
      },
      CreateCmpXchg);

  AI->replaceAllUsesWith(Loaded);
  AI->eraseFromParent();
  return true;
}

// In order to use one of the sized library calls such as
// __atomic_fetch_add_4, the alignment must be sufficient, the size
// must be one of the potentially-specialized sizes, and the value
// type must actually exist in C on the target (otherwise, the
// function wouldn't actually be defined.)
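//
// For illustration only (example values assumed): a 4-byte operation aligned
// to 4 bytes can use __atomic_fetch_add_4, while a 4-byte operation aligned
// to 2, or a 3-byte operation, fails this check; expandAtomicOpToLibcall then
// either uses the generic (size_t-taking) __atomic_* call or reports failure
// so the caller can fall back to a cmpxchg loop.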
static bool canUseSizedAtomicCall(unsigned Size, unsigned Align,
                                  const DataLayout &DL) {
  // TODO: "LargestSize" is an approximation for "largest type that
  // you can express in C". It seems to be the case that int128 is
  // supported on all 64-bit platforms, otherwise only up to 64-bit
  // integers are supported. If we get this wrong, then we'll try to
  // call a sized libcall that doesn't actually exist. There should
  // really be some more reliable way in LLVM of determining integer
  // sizes which are valid in the target's C ABI...
  unsigned LargestSize = DL.getLargestLegalIntTypeSizeInBits() >= 64 ? 16 : 8;
  return Align >= Size &&
         (Size == 1 || Size == 2 || Size == 4 || Size == 8 || Size == 16) &&
         Size <= LargestSize;
}

void AtomicExpand::expandAtomicLoadToLibcall(LoadInst *I) {
  static const RTLIB::Libcall Libcalls[6] = {
      RTLIB::ATOMIC_LOAD,   RTLIB::ATOMIC_LOAD_1, RTLIB::ATOMIC_LOAD_2,
      RTLIB::ATOMIC_LOAD_4, RTLIB::ATOMIC_LOAD_8, RTLIB::ATOMIC_LOAD_16};
  unsigned Size = getAtomicOpSize(I);
  unsigned Align = getAtomicOpAlign(I);

  bool expanded = expandAtomicOpToLibcall(
      I, Size, Align, I->getPointerOperand(), nullptr, nullptr,
      I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
  (void)expanded;
  assert(expanded && "expandAtomicOpToLibcall shouldn't fail for Load");
}

void AtomicExpand::expandAtomicStoreToLibcall(StoreInst *I) {
  static const RTLIB::Libcall Libcalls[6] = {
      RTLIB::ATOMIC_STORE,   RTLIB::ATOMIC_STORE_1, RTLIB::ATOMIC_STORE_2,
      RTLIB::ATOMIC_STORE_4, RTLIB::ATOMIC_STORE_8, RTLIB::ATOMIC_STORE_16};
  unsigned Size = getAtomicOpSize(I);
  unsigned Align = getAtomicOpAlign(I);

  bool expanded = expandAtomicOpToLibcall(
      I, Size, Align, I->getPointerOperand(), I->getValueOperand(), nullptr,
      I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
  (void)expanded;
  assert(expanded && "expandAtomicOpToLibcall shouldn't fail for Store");
}

void AtomicExpand::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) {
  static const RTLIB::Libcall Libcalls[6] = {
      RTLIB::ATOMIC_COMPARE_EXCHANGE,   RTLIB::ATOMIC_COMPARE_EXCHANGE_1,
      RTLIB::ATOMIC_COMPARE_EXCHANGE_2, RTLIB::ATOMIC_COMPARE_EXCHANGE_4,
      RTLIB::ATOMIC_COMPARE_EXCHANGE_8, RTLIB::ATOMIC_COMPARE_EXCHANGE_16};
  unsigned Size = getAtomicOpSize(I);
  unsigned Align = getAtomicOpAlign(I);

  bool expanded = expandAtomicOpToLibcall(
      I, Size, Align, I->getPointerOperand(), I->getNewValOperand(),
      I->getCompareOperand(), I->getSuccessOrdering(), I->getFailureOrdering(),
      Libcalls);
  (void)expanded;
  assert(expanded && "expandAtomicOpToLibcall shouldn't fail for CAS");
}

static ArrayRef<RTLIB::Libcall> GetRMWLibcall(AtomicRMWInst::BinOp Op) {
  static const RTLIB::Libcall LibcallsXchg[6] = {
      RTLIB::ATOMIC_EXCHANGE,   RTLIB::ATOMIC_EXCHANGE_1,
      RTLIB::ATOMIC_EXCHANGE_2, RTLIB::ATOMIC_EXCHANGE_4,
      RTLIB::ATOMIC_EXCHANGE_8, RTLIB::ATOMIC_EXCHANGE_16};
  static const RTLIB::Libcall LibcallsAdd[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_ADD_1,
      RTLIB::ATOMIC_FETCH_ADD_2, RTLIB::ATOMIC_FETCH_ADD_4,
      RTLIB::ATOMIC_FETCH_ADD_8, RTLIB::ATOMIC_FETCH_ADD_16};
  static const RTLIB::Libcall LibcallsSub[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_SUB_1,
      RTLIB::ATOMIC_FETCH_SUB_2, RTLIB::ATOMIC_FETCH_SUB_4,
      RTLIB::ATOMIC_FETCH_SUB_8, RTLIB::ATOMIC_FETCH_SUB_16};
  static const RTLIB::Libcall LibcallsAnd[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_AND_1,
      RTLIB::ATOMIC_FETCH_AND_2, RTLIB::ATOMIC_FETCH_AND_4,
      RTLIB::ATOMIC_FETCH_AND_8, RTLIB::ATOMIC_FETCH_AND_16};
  static const RTLIB::Libcall LibcallsOr[6] = {
      RTLIB::UNKNOWN_LIBCALL,   RTLIB::ATOMIC_FETCH_OR_1,
      RTLIB::ATOMIC_FETCH_OR_2, RTLIB::ATOMIC_FETCH_OR_4,
      RTLIB::ATOMIC_FETCH_OR_8, RTLIB::ATOMIC_FETCH_OR_16};
  static const RTLIB::Libcall LibcallsXor[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_XOR_1,
      RTLIB::ATOMIC_FETCH_XOR_2, RTLIB::ATOMIC_FETCH_XOR_4,
      RTLIB::ATOMIC_FETCH_XOR_8, RTLIB::ATOMIC_FETCH_XOR_16};
  static const RTLIB::Libcall LibcallsNand[6] = {
      RTLIB::UNKNOWN_LIBCALL,     RTLIB::ATOMIC_FETCH_NAND_1,
      RTLIB::ATOMIC_FETCH_NAND_2, RTLIB::ATOMIC_FETCH_NAND_4,
      RTLIB::ATOMIC_FETCH_NAND_8, RTLIB::ATOMIC_FETCH_NAND_16};

  switch (Op) {
  case AtomicRMWInst::BAD_BINOP:
    llvm_unreachable("Should not have BAD_BINOP.");
  case AtomicRMWInst::Xchg:
    return makeArrayRef(LibcallsXchg);
  case AtomicRMWInst::Add:
    return makeArrayRef(LibcallsAdd);
  case AtomicRMWInst::Sub:
    return makeArrayRef(LibcallsSub);
  case AtomicRMWInst::And:
    return makeArrayRef(LibcallsAnd);
  case AtomicRMWInst::Or:
    return makeArrayRef(LibcallsOr);
  case AtomicRMWInst::Xor:
    return makeArrayRef(LibcallsXor);
  case AtomicRMWInst::Nand:
    return makeArrayRef(LibcallsNand);
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    // No atomic libcalls are available for max/min/umax/umin.
    return {};
  }
  llvm_unreachable("Unexpected AtomicRMW operation.");
}

void AtomicExpand::expandAtomicRMWToLibcall(AtomicRMWInst *I) {
  ArrayRef<RTLIB::Libcall> Libcalls = GetRMWLibcall(I->getOperation());

  unsigned Size = getAtomicOpSize(I);
  unsigned Align = getAtomicOpAlign(I);

  bool Success = false;
  if (!Libcalls.empty())
    Success = expandAtomicOpToLibcall(
        I, Size, Align, I->getPointerOperand(), I->getValOperand(), nullptr,
        I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);

  // The expansion failed: either there were no libcalls at all for
  // the operation (min/max), or there were only size-specialized
  // libcalls (add/sub/etc) and we needed a generic. So, expand to a
  // CAS libcall, via a CAS loop, instead.
  if (!Success) {
    expandAtomicRMWToCmpXchg(I, [this](IRBuilder<> &Builder, Value *Addr,
                                       Value *Loaded, Value *NewVal,
                                       AtomicOrdering MemOpOrder,
                                       Value *&Success, Value *&NewLoaded) {
      // Create the CAS instruction normally...
      AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
          Addr, Loaded, NewVal, MemOpOrder,
          AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
      Success = Builder.CreateExtractValue(Pair, 1, "success");
      NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");

      // ...and then expand the CAS into a libcall.
      expandAtomicCASToLibcall(Pair);
    });
  }
}

// A helper routine for the above expandAtomic*ToLibcall functions.
//
// 'Libcalls' contains an array of enum values for the particular
// ATOMIC libcalls to be emitted. All of the other arguments besides
// 'I' are extracted from the Instruction subclass by the
// caller. Depending on the particular call, some will be null.
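//
// For example (restating the call sites above, not new behaviour): an atomic
// load passes null for ValueOperand and CASExpected with Ordering2 set to
// NotAtomic, while a cmpxchg passes its new value as ValueOperand, its
// compare operand as CASExpected, and the failure ordering as Ordering2.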
bool AtomicExpand::expandAtomicOpToLibcall(
    Instruction *I, unsigned Size, unsigned Align, Value *PointerOperand,
    Value *ValueOperand, Value *CASExpected, AtomicOrdering Ordering,
    AtomicOrdering Ordering2, ArrayRef<RTLIB::Libcall> Libcalls) {
  assert(Libcalls.size() == 6);

  LLVMContext &Ctx = I->getContext();
  Module *M = I->getModule();
  const DataLayout &DL = M->getDataLayout();
  IRBuilder<> Builder(I);
  IRBuilder<> AllocaBuilder(&I->getFunction()->getEntryBlock().front());

  bool UseSizedLibcall = canUseSizedAtomicCall(Size, Align, DL);
  Type *SizedIntTy = Type::getIntNTy(Ctx, Size * 8);

  unsigned AllocaAlignment = DL.getPrefTypeAlignment(SizedIntTy);

  // TODO: the "order" argument type is "int", not int32. So
  // getInt32Ty may be wrong if the arch uses e.g. 16-bit ints.
  ConstantInt *SizeVal64 = ConstantInt::get(Type::getInt64Ty(Ctx), Size);
  assert(Ordering != AtomicOrdering::NotAtomic && "expect atomic MO");
  Constant *OrderingVal =
      ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering));
  Constant *Ordering2Val = nullptr;
  if (CASExpected) {
    assert(Ordering2 != AtomicOrdering::NotAtomic && "expect atomic MO");
    Ordering2Val =
        ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering2));
  }
  bool HasResult = I->getType() != Type::getVoidTy(Ctx);

  RTLIB::Libcall RTLibType;
  if (UseSizedLibcall) {
    switch (Size) {
    case 1: RTLibType = Libcalls[1]; break;
    case 2: RTLibType = Libcalls[2]; break;
    case 4: RTLibType = Libcalls[3]; break;
    case 8: RTLibType = Libcalls[4]; break;
    case 16: RTLibType = Libcalls[5]; break;
    }
  } else if (Libcalls[0] != RTLIB::UNKNOWN_LIBCALL) {
    RTLibType = Libcalls[0];
  } else {
    // Can't use a sized function, and there's no generic for this
    // operation, so give up.
    return false;
  }

  // Build up the function call. There are two kinds. First, the sized
  // variants. These calls are going to be one of the following (with
  // N=1,2,4,8,16):
  //  iN    __atomic_load_N(iN *ptr, int ordering)
  //  void  __atomic_store_N(iN *ptr, iN val, int ordering)
  //  iN    __atomic_{exchange|fetch_*}_N(iN *ptr, iN val, int ordering)
  //  bool  __atomic_compare_exchange_N(iN *ptr, iN *expected, iN desired,
  //                                    int success_order, int failure_order)
  //
  // Note that these functions can be used for non-integer atomic
  // operations; the values just need to be bitcast to integers on the
  // way in and out.
  //
  // And then the generic variants. They look like the following:
  //  void  __atomic_load(size_t size, void *ptr, void *ret, int ordering)
  //  void  __atomic_store(size_t size, void *ptr, void *val, int ordering)
  //  void  __atomic_exchange(size_t size, void *ptr, void *val, void *ret,
  //                          int ordering)
  //  bool  __atomic_compare_exchange(size_t size, void *ptr, void *expected,
  //                                  void *desired, int success_order,
  //                                  int failure_order)
  //
  // The different signatures are built up depending on the
  // 'UseSizedLibcall', 'CASExpected', 'ValueOperand', and 'HasResult'
  // variables.
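  //
  // Hedged sketch of the generic path (hypothetical IR, assuming a 32-bit
  // target whose largest legal integer is narrower than 64 bits, so the
  // sized forms are unavailable for a 16-byte operation): a seq_cst atomic
  // exchange of an i128 would be lowered to roughly
  //   %val.addr = alloca i128            ; created in the entry block
  //   %ret.addr = alloca i128
  //   store i128 %val, i128* %val.addr
  //   call void @__atomic_exchange(i32 16, i8* %ptr, i8* %val.addr,
  //                                i8* %ret.addr, i32 5)
  //   %res = load i128, i128* %ret.addr
  // where i32 5 is the C ABI encoding of seq_cst; the bitcasts to i8*,
  // lifetime markers, and alloca alignments emitted by the code below are
  // elided from this sketch.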

  AllocaInst *AllocaCASExpected = nullptr;
  Value *AllocaCASExpected_i8 = nullptr;
  AllocaInst *AllocaValue = nullptr;
  Value *AllocaValue_i8 = nullptr;
  AllocaInst *AllocaResult = nullptr;
  Value *AllocaResult_i8 = nullptr;

  Type *ResultTy;
  SmallVector<Value *, 6> Args;
  AttributeList Attr;

  // 'size' argument.
  if (!UseSizedLibcall) {
    // Note, getIntPtrType is assumed equivalent to size_t.
    Args.push_back(ConstantInt::get(DL.getIntPtrType(Ctx), Size));
  }

  // 'ptr' argument.
  Value *PtrVal =
      Builder.CreateBitCast(PointerOperand, Type::getInt8PtrTy(Ctx));
  Args.push_back(PtrVal);

  // 'expected' argument, if present.
  if (CASExpected) {
    AllocaCASExpected = AllocaBuilder.CreateAlloca(CASExpected->getType());
    AllocaCASExpected->setAlignment(AllocaAlignment);
    AllocaCASExpected_i8 =
        Builder.CreateBitCast(AllocaCASExpected, Type::getInt8PtrTy(Ctx));
    Builder.CreateLifetimeStart(AllocaCASExpected_i8, SizeVal64);
    Builder.CreateAlignedStore(CASExpected, AllocaCASExpected, AllocaAlignment);
    Args.push_back(AllocaCASExpected_i8);
  }

  // 'val' argument ('desired' for cas), if present.
  if (ValueOperand) {
    if (UseSizedLibcall) {
      Value *IntValue =
          Builder.CreateBitOrPointerCast(ValueOperand, SizedIntTy);
      Args.push_back(IntValue);
    } else {
      AllocaValue = AllocaBuilder.CreateAlloca(ValueOperand->getType());
      AllocaValue->setAlignment(AllocaAlignment);
      AllocaValue_i8 =
          Builder.CreateBitCast(AllocaValue, Type::getInt8PtrTy(Ctx));
      Builder.CreateLifetimeStart(AllocaValue_i8, SizeVal64);
      Builder.CreateAlignedStore(ValueOperand, AllocaValue, AllocaAlignment);
      Args.push_back(AllocaValue_i8);
    }
  }

  // 'ret' argument.
  if (!CASExpected && HasResult && !UseSizedLibcall) {
    AllocaResult = AllocaBuilder.CreateAlloca(I->getType());
    AllocaResult->setAlignment(AllocaAlignment);
    AllocaResult_i8 =
        Builder.CreateBitCast(AllocaResult, Type::getInt8PtrTy(Ctx));
    Builder.CreateLifetimeStart(AllocaResult_i8, SizeVal64);
    Args.push_back(AllocaResult_i8);
  }

  // 'ordering' ('success_order' for cas) argument.
  Args.push_back(OrderingVal);

  // 'failure_order' argument, if present.
  if (Ordering2Val)
    Args.push_back(Ordering2Val);

  // Now, the return type.
  if (CASExpected) {
    ResultTy = Type::getInt1Ty(Ctx);
    Attr = Attr.addAttribute(Ctx, AttributeList::ReturnIndex, Attribute::ZExt);
  } else if (HasResult && UseSizedLibcall)
    ResultTy = SizedIntTy;
  else
    ResultTy = Type::getVoidTy(Ctx);

  // Done with setting up arguments and return types, create the call:
  SmallVector<Type *, 6> ArgTys;
  for (Value *Arg : Args)
    ArgTys.push_back(Arg->getType());
  FunctionType *FnType = FunctionType::get(ResultTy, ArgTys, false);
  Constant *LibcallFn =
      M->getOrInsertFunction(TLI->getLibcallName(RTLibType), FnType, Attr);
  CallInst *Call = Builder.CreateCall(LibcallFn, Args);
  Call->setAttributes(Attr);
  Value *Result = Call;

  // And then, extract the results...
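  //
  // Summary of the cases handled below (restating this function's own logic,
  // nothing new): for a CAS, the updated 'expected' value is reloaded from
  // AllocaCASExpected and combined with the call's i1 result into the
  // {iN, i1} pair the original cmpxchg produced; for a sized libcall with a
  // result, the returned integer is cast back to the original type; for a
  // generic libcall with a result, the value is loaded back from
  // AllocaResult.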
  if (ValueOperand && !UseSizedLibcall)
    Builder.CreateLifetimeEnd(AllocaValue_i8, SizeVal64);

  if (CASExpected) {
    // The final result from the CAS is {load of 'expected' alloca, bool
    // result from call}.
    Type *FinalResultTy = I->getType();
    Value *V = UndefValue::get(FinalResultTy);
    Value *ExpectedOut =
        Builder.CreateAlignedLoad(AllocaCASExpected, AllocaAlignment);
    Builder.CreateLifetimeEnd(AllocaCASExpected_i8, SizeVal64);
    V = Builder.CreateInsertValue(V, ExpectedOut, 0);
    V = Builder.CreateInsertValue(V, Result, 1);
    I->replaceAllUsesWith(V);
  } else if (HasResult) {
    Value *V;
    if (UseSizedLibcall)
      V = Builder.CreateBitOrPointerCast(Result, I->getType());
    else {
      V = Builder.CreateAlignedLoad(AllocaResult, AllocaAlignment);
      Builder.CreateLifetimeEnd(AllocaResult_i8, SizeVal64);
    }
    I->replaceAllUsesWith(V);
  }
  I->eraseFromParent();
  return true;
}