//===-- AtomicExpandPass.cpp - Expand atomic instructions -------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass (at IR level) to replace atomic instructions with
// __atomic_* library calls, or target-specific instructions which implement
// the same semantics in a way which better fits the target backend. This can
// include the use of (intrinsic-based) load-linked/store-conditional loops,
// AtomicCmpXchg, or type coercions.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/AtomicExpandUtils.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSubtargetInfo.h"

using namespace llvm;

#define DEBUG_TYPE "atomic-expand"

namespace {
class AtomicExpand : public FunctionPass {
  const TargetLowering *TLI;

public:
  static char ID; // Pass identification, replacement for typeid

  AtomicExpand() : FunctionPass(ID), TLI(nullptr) {
    initializeAtomicExpandPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

private:
  bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);
  IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
  LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
  bool tryExpandAtomicLoad(LoadInst *LI);
  bool expandAtomicLoadToLL(LoadInst *LI);
  bool expandAtomicLoadToCmpXchg(LoadInst *LI);
  StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI);
  bool expandAtomicStore(StoreInst *SI);
  bool tryExpandAtomicRMW(AtomicRMWInst *AI);
  Value *
  insertRMWLLSCLoop(IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
                    AtomicOrdering MemOpOrder,
                    function_ref<Value *(IRBuilder<> &, Value *)> PerformOp);
  void expandAtomicOpToLLSC(
      Instruction *I, Type *ResultTy, Value *Addr, AtomicOrdering MemOpOrder,
      function_ref<Value *(IRBuilder<> &, Value *)> PerformOp);
  void expandPartwordAtomicRMW(
      AtomicRMWInst *I,
      TargetLoweringBase::AtomicExpansionKind ExpansionKind);
  void expandPartwordCmpXchg(AtomicCmpXchgInst *I);

  AtomicCmpXchgInst *convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI);
  static Value *insertRMWCmpXchgLoop(
      IRBuilder<> &Builder, Type *ResultType, Value *Addr,
      AtomicOrdering MemOpOrder,
      function_ref<Value *(IRBuilder<> &, Value *)> PerformOp,
      CreateCmpXchgInstFun CreateCmpXchg);

  bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
  bool isIdempotentRMW(AtomicRMWInst *AI);
  bool simplifyIdempotentRMW(AtomicRMWInst *AI);

  bool expandAtomicOpToLibcall(Instruction *I, unsigned Size, unsigned Align,
                               Value *PointerOperand, Value *ValueOperand,
                               Value *CASExpected, AtomicOrdering Ordering,
                               AtomicOrdering Ordering2,
                               ArrayRef<RTLIB::Libcall> Libcalls);
  void expandAtomicLoadToLibcall(LoadInst *LI);
  void expandAtomicStoreToLibcall(StoreInst *LI);
  void expandAtomicRMWToLibcall(AtomicRMWInst *I);
  void expandAtomicCASToLibcall(AtomicCmpXchgInst *I);

  friend bool
  llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
                                 CreateCmpXchgInstFun CreateCmpXchg);
};
}

char AtomicExpand::ID = 0;
char &llvm::AtomicExpandID = AtomicExpand::ID;
INITIALIZE_PASS(AtomicExpand, DEBUG_TYPE, "Expand Atomic instructions",
                false, false)

FunctionPass *llvm::createAtomicExpandPass() { return new AtomicExpand(); }

namespace {
// Helper functions to retrieve the size of atomic instructions.
unsigned getAtomicOpSize(LoadInst *LI) {
  const DataLayout &DL = LI->getModule()->getDataLayout();
  return DL.getTypeStoreSize(LI->getType());
}

unsigned getAtomicOpSize(StoreInst *SI) {
  const DataLayout &DL = SI->getModule()->getDataLayout();
  return DL.getTypeStoreSize(SI->getValueOperand()->getType());
}

unsigned getAtomicOpSize(AtomicRMWInst *RMWI) {
  const DataLayout &DL = RMWI->getModule()->getDataLayout();
  return DL.getTypeStoreSize(RMWI->getValOperand()->getType());
}

unsigned getAtomicOpSize(AtomicCmpXchgInst *CASI) {
  const DataLayout &DL = CASI->getModule()->getDataLayout();
  return DL.getTypeStoreSize(CASI->getCompareOperand()->getType());
}

// Helper functions to retrieve the alignment of atomic instructions.
unsigned getAtomicOpAlign(LoadInst *LI) {
  unsigned Align = LI->getAlignment();
  // In the future, if this IR restriction is relaxed, we should
  // return DataLayout::getABITypeAlignment when there's no align
  // value.
  assert(Align != 0 && "An atomic LoadInst always has an explicit alignment");
  return Align;
}

unsigned getAtomicOpAlign(StoreInst *SI) {
  unsigned Align = SI->getAlignment();
  // In the future, if this IR restriction is relaxed, we should
  // return DataLayout::getABITypeAlignment when there's no align
  // value.
  assert(Align != 0 && "An atomic StoreInst always has an explicit alignment");
  return Align;
}

unsigned getAtomicOpAlign(AtomicRMWInst *RMWI) {
  // TODO(PR27168): This instruction has no alignment attribute, but unlike the
  // default alignment for load/store, the default here is to assume
  // it has NATURAL alignment, not DataLayout-specified alignment.
  const DataLayout &DL = RMWI->getModule()->getDataLayout();
  return DL.getTypeStoreSize(RMWI->getValOperand()->getType());
}

unsigned getAtomicOpAlign(AtomicCmpXchgInst *CASI) {
  // TODO(PR27168): same comment as above.
  const DataLayout &DL = CASI->getModule()->getDataLayout();
  return DL.getTypeStoreSize(CASI->getCompareOperand()->getType());
}

// Determine if a particular atomic operation has a supported size,
// and is of appropriate alignment, to be passed through for target
// lowering. (Versus turning into a __atomic libcall)
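//
// For example (illustrative): with getMaxAtomicSizeInBitsSupported() == 64,
// a naturally aligned atomic i32 access satisfies this check and is handed
// to target lowering, while a 16-byte or under-aligned access fails it and
// is routed to an __atomic_* libcall by runOnFunction below.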
template <typename Inst>
bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {
  unsigned Size = getAtomicOpSize(I);
  unsigned Align = getAtomicOpAlign(I);
  return Align >= Size && Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
}

} // end anonymous namespace

bool AtomicExpand::runOnFunction(Function &F) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  auto &TM = TPC->getTM<TargetMachine>();
  if (!TM.getSubtargetImpl(F)->enableAtomicExpand())
    return false;
  TLI = TM.getSubtargetImpl(F)->getTargetLowering();

  SmallVector<Instruction *, 1> AtomicInsts;

  // Changing control-flow while iterating through it is a bad idea, so gather
  // a list of all atomic instructions before we start.
  for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) {
    Instruction *I = &*II;
    if (I->isAtomic() && !isa<FenceInst>(I))
      AtomicInsts.push_back(I);
  }

  bool MadeChange = false;
  for (auto I : AtomicInsts) {
    auto LI = dyn_cast<LoadInst>(I);
    auto SI = dyn_cast<StoreInst>(I);
    auto RMWI = dyn_cast<AtomicRMWInst>(I);
    auto CASI = dyn_cast<AtomicCmpXchgInst>(I);
    assert((LI || SI || RMWI || CASI) && "Unknown atomic instruction");

    // If the Size/Alignment is not supported, replace with a libcall.
    if (LI) {
      if (!atomicSizeSupported(TLI, LI)) {
        expandAtomicLoadToLibcall(LI);
        MadeChange = true;
        continue;
      }
    } else if (SI) {
      if (!atomicSizeSupported(TLI, SI)) {
        expandAtomicStoreToLibcall(SI);
        MadeChange = true;
        continue;
      }
    } else if (RMWI) {
      if (!atomicSizeSupported(TLI, RMWI)) {
        expandAtomicRMWToLibcall(RMWI);
        MadeChange = true;
        continue;
      }
    } else if (CASI) {
      if (!atomicSizeSupported(TLI, CASI)) {
        expandAtomicCASToLibcall(CASI);
        MadeChange = true;
        continue;
      }
    }

    if (TLI->shouldInsertFencesForAtomic(I)) {
      auto FenceOrdering = AtomicOrdering::Monotonic;
      if (LI && isAcquireOrStronger(LI->getOrdering())) {
        FenceOrdering = LI->getOrdering();
        LI->setOrdering(AtomicOrdering::Monotonic);
      } else if (SI && isReleaseOrStronger(SI->getOrdering())) {
        FenceOrdering = SI->getOrdering();
        SI->setOrdering(AtomicOrdering::Monotonic);
      } else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) ||
                          isAcquireOrStronger(RMWI->getOrdering()))) {
        FenceOrdering = RMWI->getOrdering();
        RMWI->setOrdering(AtomicOrdering::Monotonic);
      } else if (CASI && !TLI->shouldExpandAtomicCmpXchgInIR(CASI) &&
                 (isReleaseOrStronger(CASI->getSuccessOrdering()) ||
                  isAcquireOrStronger(CASI->getSuccessOrdering()))) {
        // If a compare and swap is lowered to LL/SC, we can do smarter fence
        // insertion, with a stronger one on the success path than on the
        // failure path. As a result, fence insertion is directly done by
        // expandAtomicCmpXchg in that case.
        FenceOrdering = CASI->getSuccessOrdering();
        CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
        CASI->setFailureOrdering(AtomicOrdering::Monotonic);
      }

      if (FenceOrdering != AtomicOrdering::Monotonic) {
        MadeChange |= bracketInstWithFences(I, FenceOrdering);
      }
    }

    if (LI) {
      if (LI->getType()->isFloatingPointTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        LI = convertAtomicLoadToIntegerType(LI);
        assert(LI->getType()->isIntegerTy() && "invariant broken");
        MadeChange = true;
      }

      MadeChange |= tryExpandAtomicLoad(LI);
    } else if (SI) {
      if (SI->getValueOperand()->getType()->isFloatingPointTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        SI = convertAtomicStoreToIntegerType(SI);
        assert(SI->getValueOperand()->getType()->isIntegerTy() &&
               "invariant broken");
        MadeChange = true;
      }

      if (TLI->shouldExpandAtomicStoreInIR(SI))
        MadeChange |= expandAtomicStore(SI);
    } else if (RMWI) {
      // There are two different ways of expanding RMW instructions:
      // - into a load if it is idempotent
      // - into a Cmpxchg/LL-SC loop otherwise
      // we try them in that order.

      if (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) {
        MadeChange = true;
      } else {
        MadeChange |= tryExpandAtomicRMW(RMWI);
      }
    } else if (CASI) {
      // TODO: when we're ready to make the change at the IR level, we can
      // extend convertCmpXchgToInteger for floating point too.
      assert(!CASI->getCompareOperand()->getType()->isFloatingPointTy() &&
             "unimplemented - floating point not legal at IR level");
      if (CASI->getCompareOperand()->getType()->isPointerTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        CASI = convertCmpXchgToIntegerType(CASI);
        assert(CASI->getCompareOperand()->getType()->isIntegerTy() &&
               "invariant broken");
        MadeChange = true;
      }

      unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
      unsigned ValueSize = getAtomicOpSize(CASI);
      if (ValueSize < MinCASSize) {
        assert(!TLI->shouldExpandAtomicCmpXchgInIR(CASI) &&
               "MinCmpXchgSizeInBits not yet supported for LL/SC expansions.");
        expandPartwordCmpXchg(CASI);
      } else {
        if (TLI->shouldExpandAtomicCmpXchgInIR(CASI))
          MadeChange |= expandAtomicCmpXchg(CASI);
      }
    }
  }
  return MadeChange;
}

bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order) {
  IRBuilder<> Builder(I);

  auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order);

  auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order);
  // We have a guard here because not every atomic operation generates a
  // trailing fence.
  if (TrailingFence)
    TrailingFence->moveAfter(I);

  return (LeadingFence || TrailingFence);
}

/// Get the iX type with the same bitwidth as T.
IntegerType *AtomicExpand::getCorrespondingIntegerType(Type *T,
                                                       const DataLayout &DL) {
  EVT VT = TLI->getValueType(DL, T);
  unsigned BitWidth = VT.getStoreSizeInBits();
  assert(BitWidth == VT.getSizeInBits() && "must be a power of two");
  return IntegerType::get(T->getContext(), BitWidth);
}

/// Convert an atomic load of a non-integral type to an integer load of the
/// equivalent bitwidth. See the function comment on
/// convertAtomicStoreToIntegerType for background.
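///
/// Illustrative sketch (assuming a 32-bit float): a load such as
///   %v = load atomic float, float* %p seq_cst, align 4
/// becomes, roughly,
///   %p.cast = bitcast float* %p to i32*
///   %v.int  = load atomic i32, i32* %p.cast seq_cst, align 4
///   %v      = bitcast i32 %v.int to float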
LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) {
  auto *M = LI->getModule();
  Type *NewTy = getCorrespondingIntegerType(LI->getType(),
                                            M->getDataLayout());

  IRBuilder<> Builder(LI);

  Value *Addr = LI->getPointerOperand();
  Type *PT = PointerType::get(NewTy,
                              Addr->getType()->getPointerAddressSpace());
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);

  auto *NewLI = Builder.CreateLoad(NewAddr);
  NewLI->setAlignment(LI->getAlignment());
  NewLI->setVolatile(LI->isVolatile());
  NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID());
  DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");

  Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType());
  LI->replaceAllUsesWith(NewVal);
  LI->eraseFromParent();
  return NewLI;
}

bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) {
  switch (TLI->shouldExpandAtomicLoadInIR(LI)) {
  case TargetLoweringBase::AtomicExpansionKind::None:
    return false;
  case TargetLoweringBase::AtomicExpansionKind::LLSC:
    expandAtomicOpToLLSC(
        LI, LI->getType(), LI->getPointerOperand(), LI->getOrdering(),
        [](IRBuilder<> &Builder, Value *Loaded) { return Loaded; });
    return true;
  case TargetLoweringBase::AtomicExpansionKind::LLOnly:
    return expandAtomicLoadToLL(LI);
  case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
    return expandAtomicLoadToCmpXchg(LI);
  }
  llvm_unreachable("Unhandled case in tryExpandAtomicLoad");
}

bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) {
  IRBuilder<> Builder(LI);

  // On some architectures, load-linked instructions are atomic for larger
  // sizes than normal loads. For example, the only 64-bit load guaranteed
  // to be single-copy atomic by ARM is an ldrexd (A3.5.3).
  Value *Val =
      TLI->emitLoadLinked(Builder, LI->getPointerOperand(), LI->getOrdering());
  TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);

  LI->replaceAllUsesWith(Val);
  LI->eraseFromParent();

  return true;
}

bool AtomicExpand::expandAtomicLoadToCmpXchg(LoadInst *LI) {
  IRBuilder<> Builder(LI);
  AtomicOrdering Order = LI->getOrdering();
  Value *Addr = LI->getPointerOperand();
  Type *Ty = cast<PointerType>(Addr->getType())->getElementType();
  Constant *DummyVal = Constant::getNullValue(Ty);

  Value *Pair = Builder.CreateAtomicCmpXchg(
      Addr, DummyVal, DummyVal, Order,
      AtomicCmpXchgInst::getStrongestFailureOrdering(Order));
  Value *Loaded = Builder.CreateExtractValue(Pair, 0, "loaded");

  LI->replaceAllUsesWith(Loaded);
  LI->eraseFromParent();

  return true;
}

/// Convert an atomic store of a non-integral type to an integer store of the
/// equivalent bitwidth. We used to not support floating point or vector
/// atomics in the IR at all. The backends learned to deal with the bitcast
/// idiom because that was the only way of expressing the notion of an atomic
/// float or vector store. The long term plan is to teach each backend to
/// instruction select from the original atomic store, but as a migration
/// mechanism, we convert back to the old format which the backends understand.
/// Each backend will need individual work to recognize the new format.
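///
/// Illustrative sketch: `store atomic float %f, float* %p seq_cst, align 4`
/// becomes a bitcast of %f to i32 followed by an atomic i32 store through a
/// bitcast pointer (the mirror image of the load conversion above).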
StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) {
  IRBuilder<> Builder(SI);
  auto *M = SI->getModule();
  Type *NewTy = getCorrespondingIntegerType(SI->getValueOperand()->getType(),
                                            M->getDataLayout());
  Value *NewVal = Builder.CreateBitCast(SI->getValueOperand(), NewTy);

  Value *Addr = SI->getPointerOperand();
  Type *PT = PointerType::get(NewTy,
                              Addr->getType()->getPointerAddressSpace());
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);

  StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr);
  NewSI->setAlignment(SI->getAlignment());
  NewSI->setVolatile(SI->isVolatile());
  NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID());
  DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n");
  SI->eraseFromParent();
  return NewSI;
}

bool AtomicExpand::expandAtomicStore(StoreInst *SI) {
  // This function is only called on atomic stores that are too large to be
  // atomic if implemented as a native store. So we replace them by an
  // atomic swap, which can be implemented for example as a ldrex/strex on ARM
  // or lock cmpxchg8/16b on X86, as these are atomic for larger sizes.
  // It is the responsibility of the target to only signal expansion via
  // shouldExpandAtomicRMW in cases where this is required and possible.
  IRBuilder<> Builder(SI);
  AtomicRMWInst *AI =
      Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(),
                              SI->getValueOperand(), SI->getOrdering());
  SI->eraseFromParent();

  // Now we have an appropriate swap instruction, lower it as usual.
  return tryExpandAtomicRMW(AI);
}

static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr,
                                 Value *Loaded, Value *NewVal,
                                 AtomicOrdering MemOpOrder,
                                 Value *&Success, Value *&NewLoaded) {
  Value *Pair = Builder.CreateAtomicCmpXchg(
      Addr, Loaded, NewVal, MemOpOrder,
      AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
  Success = Builder.CreateExtractValue(Pair, 1, "success");
  NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
}

/// Emit IR to implement the given atomicrmw operation on values in registers,
/// returning the new value.
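///
/// For example (an illustrative restatement of the switch below): 'nand' is
/// computed as not(and(Loaded, Inc)), and the signed/unsigned min/max
/// variants as an icmp followed by a select.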
static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder,
                              Value *Loaded, Value *Inc) {
  Value *NewVal;
  switch (Op) {
  case AtomicRMWInst::Xchg:
    return Inc;
  case AtomicRMWInst::Add:
    return Builder.CreateAdd(Loaded, Inc, "new");
  case AtomicRMWInst::Sub:
    return Builder.CreateSub(Loaded, Inc, "new");
  case AtomicRMWInst::And:
    return Builder.CreateAnd(Loaded, Inc, "new");
  case AtomicRMWInst::Nand:
    return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new");
  case AtomicRMWInst::Or:
    return Builder.CreateOr(Loaded, Inc, "new");
  case AtomicRMWInst::Xor:
    return Builder.CreateXor(Loaded, Inc, "new");
  case AtomicRMWInst::Max:
    NewVal = Builder.CreateICmpSGT(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::Min:
    NewVal = Builder.CreateICmpSLE(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::UMax:
    NewVal = Builder.CreateICmpUGT(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::UMin:
    NewVal = Builder.CreateICmpULE(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  default:
    llvm_unreachable("Unknown atomic op");
  }
}

bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
  switch (TLI->shouldExpandAtomicRMWInIR(AI)) {
  case TargetLoweringBase::AtomicExpansionKind::None:
    return false;
  case TargetLoweringBase::AtomicExpansionKind::LLSC: {
    unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
    unsigned ValueSize = getAtomicOpSize(AI);
    if (ValueSize < MinCASSize) {
      llvm_unreachable(
          "MinCmpXchgSizeInBits not yet supported for LL/SC architectures.");
    } else {
      auto PerformOp = [&](IRBuilder<> &Builder, Value *Loaded) {
        return performAtomicOp(AI->getOperation(), Builder, Loaded,
                               AI->getValOperand());
      };
      expandAtomicOpToLLSC(AI, AI->getType(), AI->getPointerOperand(),
                           AI->getOrdering(), PerformOp);
    }
    return true;
  }
  case TargetLoweringBase::AtomicExpansionKind::CmpXChg: {
    unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
    unsigned ValueSize = getAtomicOpSize(AI);
    if (ValueSize < MinCASSize) {
      expandPartwordAtomicRMW(AI,
                              TargetLoweringBase::AtomicExpansionKind::CmpXChg);
    } else {
      expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun);
    }
    return true;
  }
  default:
    llvm_unreachable("Unhandled case in tryExpandAtomicRMW");
  }
}

namespace {

/// Result values from createMaskInstrs helper.
struct PartwordMaskValues {
  Type *WordType;
  Type *ValueType;
  Value *AlignedAddr;
  Value *ShiftAmt;
  Value *Mask;
  Value *Inv_Mask;
};
} // end anonymous namespace

/// This is a helper function which builds instructions to provide
/// values necessary for partword atomic operations. It takes an
/// incoming address, Addr, and ValueType, and constructs the address,
/// shift-amounts and masks needed to work with a larger value of size
/// WordSize.
///
/// AlignedAddr: Addr rounded down to a multiple of WordSize
///
/// ShiftAmt: Number of bits to right-shift a WordSize value loaded
///           from AlignAddr for it to have the same value as if
///           ValueType was loaded from Addr.
///
/// Mask: Value to mask with the value loaded from AlignAddr to
///       include only the part that would've been loaded from Addr.
///
/// Inv_Mask: The inverse of Mask.

static PartwordMaskValues createMaskInstrs(IRBuilder<> &Builder, Instruction *I,
                                           Type *ValueType, Value *Addr,
                                           unsigned WordSize) {
  PartwordMaskValues Ret;

  BasicBlock *BB = I->getParent();
  Function *F = BB->getParent();
  Module *M = I->getModule();

  LLVMContext &Ctx = F->getContext();
  const DataLayout &DL = M->getDataLayout();

  unsigned ValueSize = DL.getTypeStoreSize(ValueType);

  assert(ValueSize < WordSize);

  Ret.ValueType = ValueType;
  Ret.WordType = Type::getIntNTy(Ctx, WordSize * 8);

  Type *WordPtrType =
      Ret.WordType->getPointerTo(Addr->getType()->getPointerAddressSpace());

  Value *AddrInt = Builder.CreatePtrToInt(Addr, DL.getIntPtrType(Ctx));
  Ret.AlignedAddr = Builder.CreateIntToPtr(
      Builder.CreateAnd(AddrInt, ~(uint64_t)(WordSize - 1)), WordPtrType,
      "AlignedAddr");

  Value *PtrLSB = Builder.CreateAnd(AddrInt, WordSize - 1, "PtrLSB");
  if (DL.isLittleEndian()) {
    // turn bytes into bits
    Ret.ShiftAmt = Builder.CreateShl(PtrLSB, 3);
  } else {
    // turn bytes into bits, and count from the other side.
    Ret.ShiftAmt =
        Builder.CreateShl(Builder.CreateXor(PtrLSB, WordSize - ValueSize), 3);
  }

  Ret.ShiftAmt = Builder.CreateTrunc(Ret.ShiftAmt, Ret.WordType, "ShiftAmt");
  Ret.Mask = Builder.CreateShl(
      ConstantInt::get(Ret.WordType, (1 << ValueSize * 8) - 1), Ret.ShiftAmt,
      "Mask");
  Ret.Inv_Mask = Builder.CreateNot(Ret.Mask, "Inv_Mask");

  return Ret;
}

/// Emit IR to implement a masked version of a given atomicrmw
/// operation. (That is, only the bits under the Mask should be
/// affected by the operation)
static Value *performMaskedAtomicOp(AtomicRMWInst::BinOp Op,
                                    IRBuilder<> &Builder, Value *Loaded,
                                    Value *Shifted_Inc, Value *Inc,
                                    const PartwordMaskValues &PMV) {
  switch (Op) {
  case AtomicRMWInst::Xchg: {
    Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
    Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, Shifted_Inc);
    return FinalVal;
  }
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
    // Or/Xor won't affect any other bits, so can just be done
    // directly.
    return performAtomicOp(Op, Builder, Loaded, Shifted_Inc);
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Nand: {
    // The other arithmetic ops need to be masked into place.
    Value *NewVal = performAtomicOp(Op, Builder, Loaded, Shifted_Inc);
    Value *NewVal_Masked = Builder.CreateAnd(NewVal, PMV.Mask);
    Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
    Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Masked);
    return FinalVal;
  }
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin: {
    // Finally, comparison ops will operate on the full value, so
    // truncate down to the original size, and expand out again after
    // doing the operation.
    Value *Loaded_Shiftdown = Builder.CreateTrunc(
        Builder.CreateLShr(Loaded, PMV.ShiftAmt), PMV.ValueType);
    Value *NewVal = performAtomicOp(Op, Builder, Loaded_Shiftdown, Inc);
    Value *NewVal_Shiftup = Builder.CreateShl(
        Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt);
    Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
    Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shiftup);
    return FinalVal;
  }
  default:
    llvm_unreachable("Unknown atomic op");
  }
}

/// Expand a sub-word atomicrmw operation into an appropriate
/// word-sized operation.
///
/// It will create an LL/SC or cmpxchg loop, as appropriate, the same
/// way as a typical atomicrmw expansion. The only difference here is
/// that the operation inside of the loop must operate only upon a
/// part of the value.
void AtomicExpand::expandPartwordAtomicRMW(
    AtomicRMWInst *AI, TargetLoweringBase::AtomicExpansionKind ExpansionKind) {

  assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg);

  AtomicOrdering MemOpOrder = AI->getOrdering();

  IRBuilder<> Builder(AI);

  PartwordMaskValues PMV =
      createMaskInstrs(Builder, AI, AI->getType(), AI->getPointerOperand(),
                       TLI->getMinCmpXchgSizeInBits() / 8);

  Value *ValOperand_Shifted =
      Builder.CreateShl(Builder.CreateZExt(AI->getValOperand(), PMV.WordType),
                        PMV.ShiftAmt, "ValOperand_Shifted");

  auto PerformPartwordOp = [&](IRBuilder<> &Builder, Value *Loaded) {
    return performMaskedAtomicOp(AI->getOperation(), Builder, Loaded,
                                 ValOperand_Shifted, AI->getValOperand(), PMV);
  };

  // TODO: When we're ready to support LLSC conversions too, use
  // insertRMWLLSCLoop here for ExpansionKind==LLSC.
  Value *OldResult =
      insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr, MemOpOrder,
                           PerformPartwordOp, createCmpXchgInstFun);
  Value *FinalOldResult = Builder.CreateTrunc(
      Builder.CreateLShr(OldResult, PMV.ShiftAmt), PMV.ValueType);
  AI->replaceAllUsesWith(FinalOldResult);
  AI->eraseFromParent();
}

void AtomicExpand::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) {
  // The basic idea here is that we're expanding a cmpxchg of a
  // smaller memory size up to a word-sized cmpxchg. To do this, we
  // need to add a retry-loop for strong cmpxchg, so that
  // modifications to other parts of the word don't cause a spurious
  // failure.
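  //
  // For example (illustrative): with a 4-byte minimum cmpxchg width on a
  // little-endian target, an i8 cmpxchg whose address is at byte offset 1
  // within its word gets AlignedAddr = Addr & ~3, ShiftAmt = 8,
  // Mask = 0xff00 and Inv_Mask = 0xffff00ff from createMaskInstrs.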

  // This generates code like the following:
  //     [[Setup mask values PMV.*]]
  //     %NewVal_Shifted = shl i32 %NewVal, %PMV.ShiftAmt
  //     %Cmp_Shifted = shl i32 %Cmp, %PMV.ShiftAmt
  //     %InitLoaded = load i32* %addr
  //     %InitLoaded_MaskOut = and i32 %InitLoaded, %PMV.Inv_Mask
  //     br partword.cmpxchg.loop
  // partword.cmpxchg.loop:
  //     %Loaded_MaskOut = phi i32 [ %InitLoaded_MaskOut, %entry ],
  //        [ %OldVal_MaskOut, %partword.cmpxchg.failure ]
  //     %FullWord_NewVal = or i32 %Loaded_MaskOut, %NewVal_Shifted
  //     %FullWord_Cmp = or i32 %Loaded_MaskOut, %Cmp_Shifted
  //     %NewCI = cmpxchg i32* %PMV.AlignedAddr, i32 %FullWord_Cmp,
  //        i32 %FullWord_NewVal success_ordering failure_ordering
  //     %OldVal = extractvalue { i32, i1 } %NewCI, 0
  //     %Success = extractvalue { i32, i1 } %NewCI, 1
  //     br i1 %Success, label %partword.cmpxchg.end,
  //        label %partword.cmpxchg.failure
  // partword.cmpxchg.failure:
  //     %OldVal_MaskOut = and i32 %OldVal, %PMV.Inv_Mask
  //     %ShouldContinue = icmp ne i32 %Loaded_MaskOut, %OldVal_MaskOut
  //     br i1 %ShouldContinue, label %partword.cmpxchg.loop,
  //        label %partword.cmpxchg.end
  // partword.cmpxchg.end:
  //     %tmp1 = lshr i32 %OldVal, %PMV.ShiftAmt
  //     %FinalOldVal = trunc i32 %tmp1 to i8
  //     %tmp2 = insertvalue { i8, i1 } undef, i8 %FinalOldVal, 0
  //     %Res = insertvalue { i8, i1 } %tmp2, i1 %Success, 1

  Value *Addr = CI->getPointerOperand();
  Value *Cmp = CI->getCompareOperand();
  Value *NewVal = CI->getNewValOperand();

  BasicBlock *BB = CI->getParent();
  Function *F = BB->getParent();
  IRBuilder<> Builder(CI);
  LLVMContext &Ctx = Builder.getContext();

  const int WordSize = TLI->getMinCmpXchgSizeInBits() / 8;

  BasicBlock *EndBB =
      BB->splitBasicBlock(CI->getIterator(), "partword.cmpxchg.end");
  auto FailureBB =
      BasicBlock::Create(Ctx, "partword.cmpxchg.failure", F, EndBB);
  auto LoopBB = BasicBlock::Create(Ctx, "partword.cmpxchg.loop", F, FailureBB);

  // The split call above "helpfully" added a branch at the end of BB
  // (to the wrong place).
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);

  PartwordMaskValues PMV = createMaskInstrs(
      Builder, CI, CI->getCompareOperand()->getType(), Addr, WordSize);

  // Shift the incoming values over, into the right location in the word.
  Value *NewVal_Shifted =
      Builder.CreateShl(Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt);
  Value *Cmp_Shifted =
      Builder.CreateShl(Builder.CreateZExt(Cmp, PMV.WordType), PMV.ShiftAmt);

  // Load the entire current word, and mask into place the expected and new
  // values
  LoadInst *InitLoaded = Builder.CreateLoad(PMV.WordType, PMV.AlignedAddr);
  InitLoaded->setVolatile(CI->isVolatile());
  Value *InitLoaded_MaskOut = Builder.CreateAnd(InitLoaded, PMV.Inv_Mask);
  Builder.CreateBr(LoopBB);

  // partword.cmpxchg.loop:
  Builder.SetInsertPoint(LoopBB);
  PHINode *Loaded_MaskOut = Builder.CreatePHI(PMV.WordType, 2);
  Loaded_MaskOut->addIncoming(InitLoaded_MaskOut, BB);

  // Mask/Or the expected and new values into place in the loaded word.
  Value *FullWord_NewVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shifted);
  Value *FullWord_Cmp = Builder.CreateOr(Loaded_MaskOut, Cmp_Shifted);
  AtomicCmpXchgInst *NewCI = Builder.CreateAtomicCmpXchg(
      PMV.AlignedAddr, FullWord_Cmp, FullWord_NewVal, CI->getSuccessOrdering(),
      CI->getFailureOrdering(), CI->getSyncScopeID());
  NewCI->setVolatile(CI->isVolatile());
  // When we're building a strong cmpxchg, we need a loop, so you
  // might think we could use a weak cmpxchg inside. But, using strong
  // allows the below comparison for ShouldContinue, and we're
  // expecting the underlying cmpxchg to be a machine instruction,
  // which is strong anyways.
  NewCI->setWeak(CI->isWeak());

  Value *OldVal = Builder.CreateExtractValue(NewCI, 0);
  Value *Success = Builder.CreateExtractValue(NewCI, 1);

  if (CI->isWeak())
    Builder.CreateBr(EndBB);
  else
    Builder.CreateCondBr(Success, EndBB, FailureBB);

  // partword.cmpxchg.failure:
  Builder.SetInsertPoint(FailureBB);
  // Upon failure, check whether the masked-out part of the loaded value has
  // been modified. If it hasn't, the failure must have come from the
  // masked-in part, so abort the cmpxchg; otherwise retry.
  Value *OldVal_MaskOut = Builder.CreateAnd(OldVal, PMV.Inv_Mask);
  Value *ShouldContinue = Builder.CreateICmpNE(Loaded_MaskOut, OldVal_MaskOut);
  Builder.CreateCondBr(ShouldContinue, LoopBB, EndBB);

  // Add the second value to the phi from above
  Loaded_MaskOut->addIncoming(OldVal_MaskOut, FailureBB);

  // partword.cmpxchg.end:
  Builder.SetInsertPoint(CI);

  Value *FinalOldVal = Builder.CreateTrunc(
      Builder.CreateLShr(OldVal, PMV.ShiftAmt), PMV.ValueType);
  Value *Res = UndefValue::get(CI->getType());
  Res = Builder.CreateInsertValue(Res, FinalOldVal, 0);
  Res = Builder.CreateInsertValue(Res, Success, 1);

  CI->replaceAllUsesWith(Res);
  CI->eraseFromParent();
}

void AtomicExpand::expandAtomicOpToLLSC(
    Instruction *I, Type *ResultType, Value *Addr, AtomicOrdering MemOpOrder,
    function_ref<Value *(IRBuilder<> &, Value *)> PerformOp) {
  IRBuilder<> Builder(I);
  Value *Loaded =
      insertRMWLLSCLoop(Builder, ResultType, Addr, MemOpOrder, PerformOp);

  I->replaceAllUsesWith(Loaded);
  I->eraseFromParent();
}

Value *AtomicExpand::insertRMWLLSCLoop(
    IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
    AtomicOrdering MemOpOrder,
    function_ref<Value *(IRBuilder<> &, Value *)> PerformOp) {
  LLVMContext &Ctx = Builder.getContext();
  BasicBlock *BB = Builder.GetInsertBlock();
  Function *F = BB->getParent();

  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
  //
  // The standard expansion we produce is:
  //     [...]
  // atomicrmw.start:
  //     %loaded = @load.linked(%addr)
  //     %new = some_op iN %loaded, %incr
  //     %stored = @store_conditional(%new, %addr)
  //     %try_again = icmp i32 ne %stored, 0
  //     br i1 %try_again, label %loop, label %atomicrmw.end
  // atomicrmw.end:
  //     [...]
  BasicBlock *ExitBB =
      BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
  BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place).
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  Builder.CreateBr(LoopBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(LoopBB);
  Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);

  Value *NewVal = PerformOp(Builder, Loaded);

  Value *StoreSuccess =
      TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder);
  Value *TryAgain = Builder.CreateICmpNE(
      StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain");
  Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);

  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
  return Loaded;
}

/// Convert an atomic cmpxchg of a non-integral type to an integer cmpxchg of
/// the equivalent bitwidth. We used to not support pointer cmpxchg in the
/// IR. As a migration step, we convert back to what used to be the standard
/// way to represent a pointer cmpxchg so that we can update backends one by
/// one.
AtomicCmpXchgInst *AtomicExpand::convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI) {
  auto *M = CI->getModule();
  Type *NewTy = getCorrespondingIntegerType(CI->getCompareOperand()->getType(),
                                            M->getDataLayout());

  IRBuilder<> Builder(CI);

  Value *Addr = CI->getPointerOperand();
  Type *PT = PointerType::get(NewTy,
                              Addr->getType()->getPointerAddressSpace());
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);

  Value *NewCmp = Builder.CreatePtrToInt(CI->getCompareOperand(), NewTy);
  Value *NewNewVal = Builder.CreatePtrToInt(CI->getNewValOperand(), NewTy);

  auto *NewCI = Builder.CreateAtomicCmpXchg(NewAddr, NewCmp, NewNewVal,
                                            CI->getSuccessOrdering(),
                                            CI->getFailureOrdering(),
                                            CI->getSyncScopeID());
  NewCI->setVolatile(CI->isVolatile());
  NewCI->setWeak(CI->isWeak());
  DEBUG(dbgs() << "Replaced " << *CI << " with " << *NewCI << "\n");

  Value *OldVal = Builder.CreateExtractValue(NewCI, 0);
  Value *Succ = Builder.CreateExtractValue(NewCI, 1);

  OldVal = Builder.CreateIntToPtr(OldVal, CI->getCompareOperand()->getType());

  Value *Res = UndefValue::get(CI->getType());
  Res = Builder.CreateInsertValue(Res, OldVal, 0);
  Res = Builder.CreateInsertValue(Res, Succ, 1);

  CI->replaceAllUsesWith(Res);
  CI->eraseFromParent();
  return NewCI;
}

bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
  AtomicOrdering SuccessOrder = CI->getSuccessOrdering();
  AtomicOrdering FailureOrder = CI->getFailureOrdering();
  Value *Addr = CI->getPointerOperand();
  BasicBlock *BB = CI->getParent();
  Function *F = BB->getParent();
  LLVMContext &Ctx = F->getContext();
  // If shouldInsertFencesForAtomic() returns true, then the target does not
  // want to deal with memory orders, and emitLeading/TrailingFence should take
  // care of everything. Otherwise, emitLeading/TrailingFence are no-op and we
  // should preserve the ordering.
  bool ShouldInsertFencesForAtomic = TLI->shouldInsertFencesForAtomic(CI);
  AtomicOrdering MemOpOrder =
      ShouldInsertFencesForAtomic ? AtomicOrdering::Monotonic : SuccessOrder;

  // In implementations which use a barrier to achieve release semantics, we
  // can delay emitting this barrier until we know a store is actually going
  // to be attempted. The cost of this delay is that we need 2 copies of the
  // block emitting the load-linked, affecting code size.
  //
  // Ideally, this logic would be unconditional except for the minsize check
  // since in other cases the extra blocks naturally collapse down to the
  // minimal loop. Unfortunately, this puts too much stress on later
  // optimisations so we avoid emitting the extra logic in those cases too.
  bool HasReleasedLoadBB = !CI->isWeak() && ShouldInsertFencesForAtomic &&
                           SuccessOrder != AtomicOrdering::Monotonic &&
                           SuccessOrder != AtomicOrdering::Acquire &&
                           !F->optForMinSize();

  // There's no overhead for sinking the release barrier in a weak cmpxchg, so
  // do it even on minsize.
  bool UseUnconditionalReleaseBarrier = F->optForMinSize() && !CI->isWeak();

  // Given: cmpxchg some_op iN* %addr, iN %desired, iN %new success_ord fail_ord
  //
  // The full expansion we produce is:
  //     [...]
  // cmpxchg.start:
  //     %unreleasedload = @load.linked(%addr)
  //     %should_store = icmp eq %unreleasedload, %desired
  //     br i1 %should_store, label %cmpxchg.fencedstore,
  //                          label %cmpxchg.nostore
  // cmpxchg.releasingstore:
  //     fence?
  //     br label cmpxchg.trystore
  // cmpxchg.trystore:
  //     %loaded.trystore = phi [%unreleasedload, %releasingstore],
  //                            [%releasedload, %cmpxchg.releasedload]
  //     %stored = @store_conditional(%new, %addr)
  //     %success = icmp eq i32 %stored, 0
  //     br i1 %success, label %cmpxchg.success,
  //                     label %cmpxchg.releasedload/%cmpxchg.failure
  // cmpxchg.releasedload:
  //     %releasedload = @load.linked(%addr)
  //     %should_store = icmp eq %releasedload, %desired
  //     br i1 %should_store, label %cmpxchg.trystore,
  //                          label %cmpxchg.failure
  // cmpxchg.success:
  //     fence?
  //     br label %cmpxchg.end
  // cmpxchg.nostore:
  //     %loaded.nostore = phi [%unreleasedload, %cmpxchg.start],
  //                           [%releasedload,
  //                               %cmpxchg.releasedload/%cmpxchg.trystore]
  //     @load_linked_fail_balance()?
  //     br label %cmpxchg.failure
  // cmpxchg.failure:
  //     fence?
  //     br label %cmpxchg.end
  // cmpxchg.end:
  //     %loaded = phi [%loaded.nostore, %cmpxchg.failure],
  //                   [%loaded.trystore, %cmpxchg.trystore]
  //     %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure]
  //     %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0
  //     %res = insertvalue { iN, i1 } %restmp, i1 %success, 1
  //     [...]
  BasicBlock *ExitBB = BB->splitBasicBlock(CI->getIterator(), "cmpxchg.end");
  auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB);
  auto NoStoreBB = BasicBlock::Create(Ctx, "cmpxchg.nostore", F, FailureBB);
  auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, NoStoreBB);
  auto ReleasedLoadBB =
      BasicBlock::Create(Ctx, "cmpxchg.releasedload", F, SuccessBB);
  auto TryStoreBB =
      BasicBlock::Create(Ctx, "cmpxchg.trystore", F, ReleasedLoadBB);
  auto ReleasingStoreBB =
      BasicBlock::Create(Ctx, "cmpxchg.fencedstore", F, TryStoreBB);
  auto StartBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, ReleasingStoreBB);

  // This grabs the DebugLoc from CI
  IRBuilder<> Builder(CI);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we might want a fence too. It's easiest to just remove
  // the branch entirely.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier)
    TLI->emitLeadingFence(Builder, CI, SuccessOrder);
  Builder.CreateBr(StartBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(StartBB);
  Value *UnreleasedLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
  Value *ShouldStore = Builder.CreateICmpEQ(
      UnreleasedLoad, CI->getCompareOperand(), "should_store");

  // If the cmpxchg doesn't actually need any ordering when it fails, we can
  // jump straight past that fence instruction (if it exists).
  Builder.CreateCondBr(ShouldStore, ReleasingStoreBB, NoStoreBB);

  Builder.SetInsertPoint(ReleasingStoreBB);
  if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier)
    TLI->emitLeadingFence(Builder, CI, SuccessOrder);
  Builder.CreateBr(TryStoreBB);

  Builder.SetInsertPoint(TryStoreBB);
  Value *StoreSuccess = TLI->emitStoreConditional(
      Builder, CI->getNewValOperand(), Addr, MemOpOrder);
  StoreSuccess = Builder.CreateICmpEQ(
      StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
  BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB;
  Builder.CreateCondBr(StoreSuccess, SuccessBB,
                       CI->isWeak() ? FailureBB : RetryBB);

  Builder.SetInsertPoint(ReleasedLoadBB);
  Value *SecondLoad;
  if (HasReleasedLoadBB) {
    SecondLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
    ShouldStore = Builder.CreateICmpEQ(SecondLoad, CI->getCompareOperand(),
                                       "should_store");

    // If the cmpxchg doesn't actually need any ordering when it fails, we can
    // jump straight past that fence instruction (if it exists).
    Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB);
  } else
    Builder.CreateUnreachable();

  // Make sure later instructions don't get reordered with a fence if
  // necessary.
  Builder.SetInsertPoint(SuccessBB);
  if (ShouldInsertFencesForAtomic)
    TLI->emitTrailingFence(Builder, CI, SuccessOrder);
  Builder.CreateBr(ExitBB);

  Builder.SetInsertPoint(NoStoreBB);
  // In the failing case, where we don't execute the store-conditional, the
  // target might want to balance out the load-linked with a dedicated
  // instruction (e.g., on ARM, clearing the exclusive monitor).
  TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
  Builder.CreateBr(FailureBB);

  Builder.SetInsertPoint(FailureBB);
  if (ShouldInsertFencesForAtomic)
    TLI->emitTrailingFence(Builder, CI, FailureOrder);
  Builder.CreateBr(ExitBB);

  // Finally, we have control-flow based knowledge of whether the cmpxchg
  // succeeded or not. We expose this to later passes by converting any
  // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate
  // PHI.
  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
  PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2);
  Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB);
  Success->addIncoming(ConstantInt::getFalse(Ctx), FailureBB);

  // Setup the builder so we can create any PHIs we need.
  Value *Loaded;
  if (!HasReleasedLoadBB)
    Loaded = UnreleasedLoad;
  else {
    Builder.SetInsertPoint(TryStoreBB, TryStoreBB->begin());
    PHINode *TryStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
    TryStoreLoaded->addIncoming(UnreleasedLoad, ReleasingStoreBB);
    TryStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);

    Builder.SetInsertPoint(NoStoreBB, NoStoreBB->begin());
    PHINode *NoStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
    NoStoreLoaded->addIncoming(UnreleasedLoad, StartBB);
    NoStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);

    Builder.SetInsertPoint(ExitBB, ++ExitBB->begin());
    PHINode *ExitLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
    ExitLoaded->addIncoming(TryStoreLoaded, SuccessBB);
    ExitLoaded->addIncoming(NoStoreLoaded, FailureBB);

    Loaded = ExitLoaded;
  }

  // Look for any users of the cmpxchg that are just comparing the loaded value
  // against the desired one, and replace them with the CFG-derived version.
  SmallVector<ExtractValueInst *, 2> PrunedInsts;
  for (auto User : CI->users()) {
    ExtractValueInst *EV = dyn_cast<ExtractValueInst>(User);
    if (!EV)
      continue;

    assert(EV->getNumIndices() == 1 && EV->getIndices()[0] <= 1 &&
           "weird extraction from { iN, i1 }");

    if (EV->getIndices()[0] == 0)
      EV->replaceAllUsesWith(Loaded);
    else
      EV->replaceAllUsesWith(Success);

    PrunedInsts.push_back(EV);
  }

  // We can remove the instructions now we're no longer iterating through them.
  for (auto EV : PrunedInsts)
    EV->eraseFromParent();

  if (!CI->use_empty()) {
    // Some use of the full struct return that we don't understand has
    // happened, so we've got to reconstruct it properly.
    Value *Res;
    Res = Builder.CreateInsertValue(UndefValue::get(CI->getType()), Loaded, 0);
    Res = Builder.CreateInsertValue(Res, Success, 1);

    CI->replaceAllUsesWith(Res);
  }

  CI->eraseFromParent();
  return true;
}

bool AtomicExpand::isIdempotentRMW(AtomicRMWInst *RMWI) {
  auto C = dyn_cast<ConstantInt>(RMWI->getValOperand());
  if (!C)
    return false;

  AtomicRMWInst::BinOp Op = RMWI->getOperation();
  switch (Op) {
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
    return C->isZero();
  case AtomicRMWInst::And:
    return C->isMinusOne();
  // FIXME: we could also treat Min/Max/UMin/UMax by the INT_MIN/INT_MAX/...
  default:
    return false;
  }
}

bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst *RMWI) {
  if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) {
    tryExpandAtomicLoad(ResultingLoad);
    return true;
  }
  return false;
}

Value *AtomicExpand::insertRMWCmpXchgLoop(
    IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
    AtomicOrdering MemOpOrder,
    function_ref<Value *(IRBuilder<> &, Value *)> PerformOp,
    CreateCmpXchgInstFun CreateCmpXchg) {
  LLVMContext &Ctx = Builder.getContext();
  BasicBlock *BB = Builder.GetInsertBlock();
  Function *F = BB->getParent();

  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
  //
  // The standard expansion we produce is:
  //     [...]
  //     %init_loaded = load atomic iN* %addr
  //     br label %loop
  // loop:
  //     %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ]
  //     %new = some_op iN %loaded, %incr
  //     %pair = cmpxchg iN* %addr, iN %loaded, iN %new
  //     %new_loaded = extractvalue { iN, i1 } %pair, 0
  //     %success = extractvalue { iN, i1 } %pair, 1
  //     br i1 %success, label %atomicrmw.end, label %loop
  // atomicrmw.end:
  //     [...]
  BasicBlock *ExitBB =
      BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
  BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we want a load. It's easiest to just remove
  // the branch entirely.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  LoadInst *InitLoaded = Builder.CreateLoad(ResultTy, Addr);
  // Atomics require at least natural alignment.
  InitLoaded->setAlignment(ResultTy->getPrimitiveSizeInBits() / 8);
  Builder.CreateBr(LoopBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(LoopBB);
  PHINode *Loaded = Builder.CreatePHI(ResultTy, 2, "loaded");
  Loaded->addIncoming(InitLoaded, BB);

  Value *NewVal = PerformOp(Builder, Loaded);

  Value *NewLoaded = nullptr;
  Value *Success = nullptr;

  CreateCmpXchg(Builder, Addr, Loaded, NewVal,
                MemOpOrder == AtomicOrdering::Unordered
                    ? AtomicOrdering::Monotonic
                    : MemOpOrder,
                Success, NewLoaded);
  assert(Success && NewLoaded);

  Loaded->addIncoming(NewLoaded, LoopBB);

  Builder.CreateCondBr(Success, ExitBB, LoopBB);

  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
  return NewLoaded;
}

// Note: This function is exposed externally by AtomicExpandUtils.h
bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
                                    CreateCmpXchgInstFun CreateCmpXchg) {
  IRBuilder<> Builder(AI);
  Value *Loaded = AtomicExpand::insertRMWCmpXchgLoop(
      Builder, AI->getType(), AI->getPointerOperand(), AI->getOrdering(),
      [&](IRBuilder<> &Builder, Value *Loaded) {
        return performAtomicOp(AI->getOperation(), Builder, Loaded,
                               AI->getValOperand());
      },
      CreateCmpXchg);

  AI->replaceAllUsesWith(Loaded);
  AI->eraseFromParent();
  return true;
}

// In order to use one of the sized library calls such as
// __atomic_fetch_add_4, the alignment must be sufficient, the size
// must be one of the potentially-specialized sizes, and the value
// type must actually exist in C on the target (otherwise, the
// function wouldn't actually be defined.)
static bool canUseSizedAtomicCall(unsigned Size, unsigned Align,
                                  const DataLayout &DL) {
  // TODO: "LargestSize" is an approximation for "largest type that
  // you can express in C". It seems to be the case that int128 is
  // supported on all 64-bit platforms, otherwise only up to 64-bit
  // integers are supported. If we get this wrong, then we'll try to
  // call a sized libcall that doesn't actually exist. There should
  // really be some more reliable way in LLVM of determining integer
  // sizes which are valid in the target's C ABI...
  unsigned LargestSize = DL.getLargestLegalIntTypeSizeInBits() >= 64 ? 16 : 8;
  return Align >= Size &&
         (Size == 1 || Size == 2 || Size == 4 || Size == 8 || Size == 16) &&
         Size <= LargestSize;
}

void AtomicExpand::expandAtomicLoadToLibcall(LoadInst *I) {
  static const RTLIB::Libcall Libcalls[6] = {
      RTLIB::ATOMIC_LOAD,   RTLIB::ATOMIC_LOAD_1, RTLIB::ATOMIC_LOAD_2,
      RTLIB::ATOMIC_LOAD_4, RTLIB::ATOMIC_LOAD_8, RTLIB::ATOMIC_LOAD_16};
  unsigned Size = getAtomicOpSize(I);
  unsigned Align = getAtomicOpAlign(I);

  bool expanded = expandAtomicOpToLibcall(
      I, Size, Align, I->getPointerOperand(), nullptr, nullptr,
      I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
  (void)expanded;
  assert(expanded && "expandAtomicOpToLibcall shouldn't fail for Load");
}

void AtomicExpand::expandAtomicStoreToLibcall(StoreInst *I) {
  static const RTLIB::Libcall Libcalls[6] = {
      RTLIB::ATOMIC_STORE,   RTLIB::ATOMIC_STORE_1, RTLIB::ATOMIC_STORE_2,
      RTLIB::ATOMIC_STORE_4, RTLIB::ATOMIC_STORE_8, RTLIB::ATOMIC_STORE_16};
  unsigned Size = getAtomicOpSize(I);
  unsigned Align = getAtomicOpAlign(I);

  bool expanded = expandAtomicOpToLibcall(
      I, Size, Align, I->getPointerOperand(), I->getValueOperand(), nullptr,
      I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
  (void)expanded;
  assert(expanded && "expandAtomicOpToLibcall shouldn't fail for Store");
}

void AtomicExpand::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) {
  static const RTLIB::Libcall Libcalls[6] = {
      RTLIB::ATOMIC_COMPARE_EXCHANGE,   RTLIB::ATOMIC_COMPARE_EXCHANGE_1,
      RTLIB::ATOMIC_COMPARE_EXCHANGE_2, RTLIB::ATOMIC_COMPARE_EXCHANGE_4,
      RTLIB::ATOMIC_COMPARE_EXCHANGE_8, RTLIB::ATOMIC_COMPARE_EXCHANGE_16};
  unsigned Size = getAtomicOpSize(I);
  unsigned Align = getAtomicOpAlign(I);

  bool expanded = expandAtomicOpToLibcall(
      I, Size, Align, I->getPointerOperand(), I->getNewValOperand(),
      I->getCompareOperand(), I->getSuccessOrdering(), I->getFailureOrdering(),
      Libcalls);
  (void)expanded;
  assert(expanded && "expandAtomicOpToLibcall shouldn't fail for CAS");
}

static ArrayRef<RTLIB::Libcall> GetRMWLibcall(AtomicRMWInst::BinOp Op) {
  static const RTLIB::Libcall LibcallsXchg[6] = {
      RTLIB::ATOMIC_EXCHANGE,   RTLIB::ATOMIC_EXCHANGE_1,
      RTLIB::ATOMIC_EXCHANGE_2, RTLIB::ATOMIC_EXCHANGE_4,
      RTLIB::ATOMIC_EXCHANGE_8, RTLIB::ATOMIC_EXCHANGE_16};
  static const RTLIB::Libcall LibcallsAdd[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_ADD_1,
      RTLIB::ATOMIC_FETCH_ADD_2, RTLIB::ATOMIC_FETCH_ADD_4,
      RTLIB::ATOMIC_FETCH_ADD_8, RTLIB::ATOMIC_FETCH_ADD_16};
  static const RTLIB::Libcall LibcallsSub[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_SUB_1,
      RTLIB::ATOMIC_FETCH_SUB_2, RTLIB::ATOMIC_FETCH_SUB_4,
      RTLIB::ATOMIC_FETCH_SUB_8, RTLIB::ATOMIC_FETCH_SUB_16};
  static const RTLIB::Libcall LibcallsAnd[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_AND_1,
      RTLIB::ATOMIC_FETCH_AND_2, RTLIB::ATOMIC_FETCH_AND_4,
      RTLIB::ATOMIC_FETCH_AND_8, RTLIB::ATOMIC_FETCH_AND_16};
  static const RTLIB::Libcall LibcallsOr[6] = {
      RTLIB::UNKNOWN_LIBCALL,   RTLIB::ATOMIC_FETCH_OR_1,
      RTLIB::ATOMIC_FETCH_OR_2, RTLIB::ATOMIC_FETCH_OR_4,
      RTLIB::ATOMIC_FETCH_OR_8, RTLIB::ATOMIC_FETCH_OR_16};
  static const RTLIB::Libcall LibcallsXor[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_XOR_1,
      RTLIB::ATOMIC_FETCH_XOR_2, RTLIB::ATOMIC_FETCH_XOR_4,
      RTLIB::ATOMIC_FETCH_XOR_8, RTLIB::ATOMIC_FETCH_XOR_16};
  static const RTLIB::Libcall LibcallsNand[6] = {
      RTLIB::UNKNOWN_LIBCALL,     RTLIB::ATOMIC_FETCH_NAND_1,
      RTLIB::ATOMIC_FETCH_NAND_2, RTLIB::ATOMIC_FETCH_NAND_4,
      RTLIB::ATOMIC_FETCH_NAND_8, RTLIB::ATOMIC_FETCH_NAND_16};

  switch (Op) {
  case AtomicRMWInst::BAD_BINOP:
    llvm_unreachable("Should not have BAD_BINOP.");
  case AtomicRMWInst::Xchg:
    return makeArrayRef(LibcallsXchg);
  case AtomicRMWInst::Add:
    return makeArrayRef(LibcallsAdd);
  case AtomicRMWInst::Sub:
    return makeArrayRef(LibcallsSub);
  case AtomicRMWInst::And:
    return makeArrayRef(LibcallsAnd);
  case AtomicRMWInst::Or:
    return makeArrayRef(LibcallsOr);
  case AtomicRMWInst::Xor:
    return makeArrayRef(LibcallsXor);
  case AtomicRMWInst::Nand:
    return makeArrayRef(LibcallsNand);
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    // No atomic libcalls are available for max/min/umax/umin.
    return {};
  }
  llvm_unreachable("Unexpected AtomicRMW operation.");
}

void AtomicExpand::expandAtomicRMWToLibcall(AtomicRMWInst *I) {
  ArrayRef<RTLIB::Libcall> Libcalls = GetRMWLibcall(I->getOperation());

  unsigned Size = getAtomicOpSize(I);
  unsigned Align = getAtomicOpAlign(I);

  bool Success = false;
  if (!Libcalls.empty())
    Success = expandAtomicOpToLibcall(
        I, Size, Align, I->getPointerOperand(), I->getValOperand(), nullptr,
        I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);

  // The expansion failed: either there were no libcalls at all for
  // the operation (min/max), or there were only size-specialized
  // libcalls (add/sub/etc) and we needed a generic. So, expand to a
  // CAS libcall, via a CAS loop, instead.
  if (!Success) {
    expandAtomicRMWToCmpXchg(I, [this](IRBuilder<> &Builder, Value *Addr,
                                       Value *Loaded, Value *NewVal,
                                       AtomicOrdering MemOpOrder,
                                       Value *&Success, Value *&NewLoaded) {
      // Create the CAS instruction normally...
      AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
          Addr, Loaded, NewVal, MemOpOrder,
          AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
      Success = Builder.CreateExtractValue(Pair, 1, "success");
      NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");

      // ...and then expand the CAS into a libcall.
      expandAtomicCASToLibcall(Pair);
    });
  }
}

// A helper routine for the above expandAtomic*ToLibcall functions.
//
// 'Libcalls' contains an array of enum values for the particular
// ATOMIC libcalls to be emitted. All of the other arguments besides
// 'I' are extracted from the Instruction subclass by the
// caller. Depending on the particular call, some will be null.
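//
// For example (illustrative): an atomicrmw add on a sufficiently aligned i32
// is emitted as a call to __atomic_fetch_add_4, whereas an under-aligned one
// has no generic __atomic_fetch_add entry point (Libcalls[0] is
// UNKNOWN_LIBCALL), so this routine returns false and expandAtomicRMWToLibcall
// falls back to a CAS loop built on __atomic_compare_exchange.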

// A helper routine for the above expandAtomic*ToLibcall functions.
//
// 'Libcalls' contains an array of enum values for the particular
// ATOMIC libcalls to be emitted. All of the other arguments besides
// 'I' are extracted from the Instruction subclass by the
// caller. Depending on the particular call, some will be null.
bool AtomicExpand::expandAtomicOpToLibcall(
    Instruction *I, unsigned Size, unsigned Align, Value *PointerOperand,
    Value *ValueOperand, Value *CASExpected, AtomicOrdering Ordering,
    AtomicOrdering Ordering2, ArrayRef<RTLIB::Libcall> Libcalls) {
  assert(Libcalls.size() == 6);

  LLVMContext &Ctx = I->getContext();
  Module *M = I->getModule();
  const DataLayout &DL = M->getDataLayout();
  IRBuilder<> Builder(I);
  IRBuilder<> AllocaBuilder(&I->getFunction()->getEntryBlock().front());

  bool UseSizedLibcall = canUseSizedAtomicCall(Size, Align, DL);
  Type *SizedIntTy = Type::getIntNTy(Ctx, Size * 8);

  unsigned AllocaAlignment = DL.getPrefTypeAlignment(SizedIntTy);

  // TODO: the "order" argument type is "int", not int32. So
  // getInt32Ty may be wrong if the arch uses e.g. 16-bit ints.
  ConstantInt *SizeVal64 = ConstantInt::get(Type::getInt64Ty(Ctx), Size);
  assert(Ordering != AtomicOrdering::NotAtomic && "expect atomic MO");
  Constant *OrderingVal =
      ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering));
  Constant *Ordering2Val = nullptr;
  if (CASExpected) {
    assert(Ordering2 != AtomicOrdering::NotAtomic && "expect atomic MO");
    Ordering2Val =
        ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering2));
  }
  bool HasResult = I->getType() != Type::getVoidTy(Ctx);

  RTLIB::Libcall RTLibType;
  if (UseSizedLibcall) {
    switch (Size) {
    case 1: RTLibType = Libcalls[1]; break;
    case 2: RTLibType = Libcalls[2]; break;
    case 4: RTLibType = Libcalls[3]; break;
    case 8: RTLibType = Libcalls[4]; break;
    case 16: RTLibType = Libcalls[5]; break;
    }
  } else if (Libcalls[0] != RTLIB::UNKNOWN_LIBCALL) {
    RTLibType = Libcalls[0];
  } else {
    // Can't use a sized function, and there's no generic for this
    // operation, so give up.
    return false;
  }

  // Build up the function call. There are two kinds. First, the sized
  // variants. These calls are going to be one of the following (with
  // N=1,2,4,8,16):
  //  iN    __atomic_load_N(iN *ptr, int ordering)
  //  void  __atomic_store_N(iN *ptr, iN val, int ordering)
  //  iN    __atomic_{exchange|fetch_*}_N(iN *ptr, iN val, int ordering)
  //  bool  __atomic_compare_exchange_N(iN *ptr, iN *expected, iN desired,
  //                                    int success_order, int failure_order)
  //
  // Note that these functions can be used for non-integer atomic
  // operations; the values just need to be bitcast to integers on the
  // way in and out.
  //
  // And then, the generic variants. They look like the following:
  //  void  __atomic_load(size_t size, void *ptr, void *ret, int ordering)
  //  void  __atomic_store(size_t size, void *ptr, void *val, int ordering)
  //  void  __atomic_exchange(size_t size, void *ptr, void *val, void *ret,
  //                          int ordering)
  //  bool  __atomic_compare_exchange(size_t size, void *ptr, void *expected,
  //                                  void *desired, int success_order,
  //                                  int failure_order)
  //
  // The different signatures are built up depending on the
  // 'UseSizedLibcall', 'CASExpected', 'ValueOperand', and 'HasResult'
  // variables.
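
  // For instance (a sketch under the assumptions above, not a guaranteed
  // shape): a naturally aligned 4-byte atomicrmw xchg takes the sized path
  // and becomes a direct by-value call,
  //   %old = call i32 @__atomic_exchange_4(i8* %ptr, i32 %val, i32 5)
  // whereas an under-aligned or oddly sized exchange takes the generic path,
  // which passes the byte size explicitly and moves the value and result
  // through stack temporaries:
  //   call void @__atomic_exchange(i64 %size, i8* %ptr, i8* %valtmp,
  //                                i8* %rettmp, i32 5)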

  AllocaInst *AllocaCASExpected = nullptr;
  Value *AllocaCASExpected_i8 = nullptr;
  AllocaInst *AllocaValue = nullptr;
  Value *AllocaValue_i8 = nullptr;
  AllocaInst *AllocaResult = nullptr;
  Value *AllocaResult_i8 = nullptr;

  Type *ResultTy;
  SmallVector<Value *, 6> Args;
  AttributeList Attr;

  // 'size' argument.
  if (!UseSizedLibcall) {
    // Note, getIntPtrType is assumed equivalent to size_t.
    Args.push_back(ConstantInt::get(DL.getIntPtrType(Ctx), Size));
  }

  // 'ptr' argument.
  Value *PtrVal =
      Builder.CreateBitCast(PointerOperand, Type::getInt8PtrTy(Ctx));
  Args.push_back(PtrVal);

  // 'expected' argument, if present.
  if (CASExpected) {
    AllocaCASExpected = AllocaBuilder.CreateAlloca(CASExpected->getType());
    AllocaCASExpected->setAlignment(AllocaAlignment);
    AllocaCASExpected_i8 =
        Builder.CreateBitCast(AllocaCASExpected, Type::getInt8PtrTy(Ctx));
    Builder.CreateLifetimeStart(AllocaCASExpected_i8, SizeVal64);
    Builder.CreateAlignedStore(CASExpected, AllocaCASExpected, AllocaAlignment);
    Args.push_back(AllocaCASExpected_i8);
  }

  // 'val' argument ('desired' for cas), if present.
  if (ValueOperand) {
    if (UseSizedLibcall) {
      Value *IntValue =
          Builder.CreateBitOrPointerCast(ValueOperand, SizedIntTy);
      Args.push_back(IntValue);
    } else {
      AllocaValue = AllocaBuilder.CreateAlloca(ValueOperand->getType());
      AllocaValue->setAlignment(AllocaAlignment);
      AllocaValue_i8 =
          Builder.CreateBitCast(AllocaValue, Type::getInt8PtrTy(Ctx));
      Builder.CreateLifetimeStart(AllocaValue_i8, SizeVal64);
      Builder.CreateAlignedStore(ValueOperand, AllocaValue, AllocaAlignment);
      Args.push_back(AllocaValue_i8);
    }
  }

  // 'ret' argument.
  if (!CASExpected && HasResult && !UseSizedLibcall) {
    AllocaResult = AllocaBuilder.CreateAlloca(I->getType());
    AllocaResult->setAlignment(AllocaAlignment);
    AllocaResult_i8 =
        Builder.CreateBitCast(AllocaResult, Type::getInt8PtrTy(Ctx));
    Builder.CreateLifetimeStart(AllocaResult_i8, SizeVal64);
    Args.push_back(AllocaResult_i8);
  }

  // 'ordering' ('success_order' for cas) argument.
  Args.push_back(OrderingVal);

  // 'failure_order' argument, if present.
  if (Ordering2Val)
    Args.push_back(Ordering2Val);

  // Now, the return type.
  if (CASExpected) {
    ResultTy = Type::getInt1Ty(Ctx);
    Attr = Attr.addAttribute(Ctx, AttributeList::ReturnIndex, Attribute::ZExt);
  } else if (HasResult && UseSizedLibcall)
    ResultTy = SizedIntTy;
  else
    ResultTy = Type::getVoidTy(Ctx);

  // Done with setting up arguments and return types, create the call:
  SmallVector<Type *, 6> ArgTys;
  for (Value *Arg : Args)
    ArgTys.push_back(Arg->getType());
  FunctionType *FnType = FunctionType::get(ResultTy, ArgTys, false);
  Constant *LibcallFn =
      M->getOrInsertFunction(TLI->getLibcallName(RTLibType), FnType, Attr);
  CallInst *Call = Builder.CreateCall(LibcallFn, Args);
  Call->setAttributes(Attr);
  Value *Result = Call;
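
  // Illustrative result shapes (assumed, for orientation only): for a cmpxchg
  // the call returns only the i1 success flag, and the previously stored
  // 'expected' alloca is re-loaded so the pair
  //   { iN loaded-from-expected-alloca, i1 %call }
  // can be reassembled below to stand in for the original cmpxchg result; for
  // a generic load/exchange the result is instead loaded back out of the
  // 'ret' alloca.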

  // And then, extract the results...
  if (ValueOperand && !UseSizedLibcall)
    Builder.CreateLifetimeEnd(AllocaValue_i8, SizeVal64);

  if (CASExpected) {
    // The final result from the CAS is {load of 'expected' alloca, bool result
    // from call}
    Type *FinalResultTy = I->getType();
    Value *V = UndefValue::get(FinalResultTy);
    Value *ExpectedOut =
        Builder.CreateAlignedLoad(AllocaCASExpected, AllocaAlignment);
    Builder.CreateLifetimeEnd(AllocaCASExpected_i8, SizeVal64);
    V = Builder.CreateInsertValue(V, ExpectedOut, 0);
    V = Builder.CreateInsertValue(V, Result, 1);
    I->replaceAllUsesWith(V);
  } else if (HasResult) {
    Value *V;
    if (UseSizedLibcall)
      V = Builder.CreateBitOrPointerCast(Result, I->getType());
    else {
      V = Builder.CreateAlignedLoad(AllocaResult, AllocaAlignment);
      Builder.CreateLifetimeEnd(AllocaResult_i8, SizeVal64);
    }
    I->replaceAllUsesWith(V);
  }
  I->eraseFromParent();
  return true;
}