1 //===- AtomicExpandPass.cpp - Expand atomic instructions ------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file contains a pass (at IR level) to replace atomic instructions with 11 // __atomic_* library calls, or target specific instruction which implement the 12 // same semantics in a way which better fits the target backend. This can 13 // include the use of (intrinsic-based) load-linked/store-conditional loops, 14 // AtomicCmpXchg, or type coercions. 15 // 16 //===----------------------------------------------------------------------===// 17 18 #include "llvm/ADT/ArrayRef.h" 19 #include "llvm/ADT/STLExtras.h" 20 #include "llvm/ADT/SmallVector.h" 21 #include "llvm/CodeGen/AtomicExpandUtils.h" 22 #include "llvm/CodeGen/RuntimeLibcalls.h" 23 #include "llvm/CodeGen/TargetLowering.h" 24 #include "llvm/CodeGen/TargetPassConfig.h" 25 #include "llvm/CodeGen/TargetSubtargetInfo.h" 26 #include "llvm/CodeGen/ValueTypes.h" 27 #include "llvm/IR/Attributes.h" 28 #include "llvm/IR/BasicBlock.h" 29 #include "llvm/IR/Constant.h" 30 #include "llvm/IR/Constants.h" 31 #include "llvm/IR/DataLayout.h" 32 #include "llvm/IR/DerivedTypes.h" 33 #include "llvm/IR/Function.h" 34 #include "llvm/IR/IRBuilder.h" 35 #include "llvm/IR/InstIterator.h" 36 #include "llvm/IR/Instruction.h" 37 #include "llvm/IR/Instructions.h" 38 #include "llvm/IR/Module.h" 39 #include "llvm/IR/Type.h" 40 #include "llvm/IR/User.h" 41 #include "llvm/IR/Value.h" 42 #include "llvm/Pass.h" 43 #include "llvm/Support/AtomicOrdering.h" 44 #include "llvm/Support/Casting.h" 45 #include "llvm/Support/Debug.h" 46 #include "llvm/Support/ErrorHandling.h" 47 #include "llvm/Support/raw_ostream.h" 48 #include "llvm/Target/TargetMachine.h" 49 #include <cassert> 50 #include <cstdint> 51 #include <iterator> 52 53 using namespace llvm; 54 55 #define DEBUG_TYPE "atomic-expand" 56 57 namespace { 58 59 class AtomicExpand: public FunctionPass { 60 const TargetLowering *TLI = nullptr; 61 62 public: 63 static char ID; // Pass identification, replacement for typeid 64 65 AtomicExpand() : FunctionPass(ID) { 66 initializeAtomicExpandPass(*PassRegistry::getPassRegistry()); 67 } 68 69 bool runOnFunction(Function &F) override; 70 71 private: 72 bool bracketInstWithFences(Instruction *I, AtomicOrdering Order); 73 IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL); 74 LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI); 75 bool tryExpandAtomicLoad(LoadInst *LI); 76 bool expandAtomicLoadToLL(LoadInst *LI); 77 bool expandAtomicLoadToCmpXchg(LoadInst *LI); 78 StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI); 79 bool expandAtomicStore(StoreInst *SI); 80 bool tryExpandAtomicRMW(AtomicRMWInst *AI); 81 Value * 82 insertRMWLLSCLoop(IRBuilder<> &Builder, Type *ResultTy, Value *Addr, 83 AtomicOrdering MemOpOrder, 84 function_ref<Value *(IRBuilder<> &, Value *)> PerformOp); 85 void expandAtomicOpToLLSC( 86 Instruction *I, Type *ResultTy, Value *Addr, AtomicOrdering MemOpOrder, 87 function_ref<Value *(IRBuilder<> &, Value *)> PerformOp); 88 void expandPartwordAtomicRMW( 89 AtomicRMWInst *I, 90 TargetLoweringBase::AtomicExpansionKind ExpansionKind); 91 AtomicRMWInst *widenPartwordAtomicRMW(AtomicRMWInst *AI); 92 void expandPartwordCmpXchg(AtomicCmpXchgInst *I); 93 94 AtomicCmpXchgInst 
*convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI); 95 static Value *insertRMWCmpXchgLoop( 96 IRBuilder<> &Builder, Type *ResultType, Value *Addr, 97 AtomicOrdering MemOpOrder, 98 function_ref<Value *(IRBuilder<> &, Value *)> PerformOp, 99 CreateCmpXchgInstFun CreateCmpXchg); 100 101 bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI); 102 bool isIdempotentRMW(AtomicRMWInst *RMWI); 103 bool simplifyIdempotentRMW(AtomicRMWInst *RMWI); 104 105 bool expandAtomicOpToLibcall(Instruction *I, unsigned Size, unsigned Align, 106 Value *PointerOperand, Value *ValueOperand, 107 Value *CASExpected, AtomicOrdering Ordering, 108 AtomicOrdering Ordering2, 109 ArrayRef<RTLIB::Libcall> Libcalls); 110 void expandAtomicLoadToLibcall(LoadInst *LI); 111 void expandAtomicStoreToLibcall(StoreInst *LI); 112 void expandAtomicRMWToLibcall(AtomicRMWInst *I); 113 void expandAtomicCASToLibcall(AtomicCmpXchgInst *I); 114 115 friend bool 116 llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, 117 CreateCmpXchgInstFun CreateCmpXchg); 118 }; 119 120 } // end anonymous namespace 121 122 char AtomicExpand::ID = 0; 123 124 char &llvm::AtomicExpandID = AtomicExpand::ID; 125 126 INITIALIZE_PASS(AtomicExpand, DEBUG_TYPE, "Expand Atomic instructions", 127 false, false) 128 129 FunctionPass *llvm::createAtomicExpandPass() { return new AtomicExpand(); } 130 131 // Helper functions to retrieve the size of atomic instructions. 132 static unsigned getAtomicOpSize(LoadInst *LI) { 133 const DataLayout &DL = LI->getModule()->getDataLayout(); 134 return DL.getTypeStoreSize(LI->getType()); 135 } 136 137 static unsigned getAtomicOpSize(StoreInst *SI) { 138 const DataLayout &DL = SI->getModule()->getDataLayout(); 139 return DL.getTypeStoreSize(SI->getValueOperand()->getType()); 140 } 141 142 static unsigned getAtomicOpSize(AtomicRMWInst *RMWI) { 143 const DataLayout &DL = RMWI->getModule()->getDataLayout(); 144 return DL.getTypeStoreSize(RMWI->getValOperand()->getType()); 145 } 146 147 static unsigned getAtomicOpSize(AtomicCmpXchgInst *CASI) { 148 const DataLayout &DL = CASI->getModule()->getDataLayout(); 149 return DL.getTypeStoreSize(CASI->getCompareOperand()->getType()); 150 } 151 152 // Helper functions to retrieve the alignment of atomic instructions. 153 static unsigned getAtomicOpAlign(LoadInst *LI) { 154 unsigned Align = LI->getAlignment(); 155 // In the future, if this IR restriction is relaxed, we should 156 // return DataLayout::getABITypeAlignment when there's no align 157 // value. 158 assert(Align != 0 && "An atomic LoadInst always has an explicit alignment"); 159 return Align; 160 } 161 162 static unsigned getAtomicOpAlign(StoreInst *SI) { 163 unsigned Align = SI->getAlignment(); 164 // In the future, if this IR restriction is relaxed, we should 165 // return DataLayout::getABITypeAlignment when there's no align 166 // value. 167 assert(Align != 0 && "An atomic StoreInst always has an explicit alignment"); 168 return Align; 169 } 170 171 static unsigned getAtomicOpAlign(AtomicRMWInst *RMWI) { 172 // TODO(PR27168): This instruction has no alignment attribute, but unlike the 173 // default alignment for load/store, the default here is to assume 174 // it has NATURAL alignment, not DataLayout-specified alignment. 175 const DataLayout &DL = RMWI->getModule()->getDataLayout(); 176 return DL.getTypeStoreSize(RMWI->getValOperand()->getType()); 177 } 178 179 static unsigned getAtomicOpAlign(AtomicCmpXchgInst *CASI) { 180 // TODO(PR27168): same comment as above. 
181 const DataLayout &DL = CASI->getModule()->getDataLayout(); 182 return DL.getTypeStoreSize(CASI->getCompareOperand()->getType()); 183 } 184 185 // Determine if a particular atomic operation has a supported size, 186 // and is of appropriate alignment, to be passed through for target 187 // lowering. (Versus turning into a __atomic libcall) 188 template <typename Inst> 189 static bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) { 190 unsigned Size = getAtomicOpSize(I); 191 unsigned Align = getAtomicOpAlign(I); 192 return Align >= Size && Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8; 193 } 194 195 bool AtomicExpand::runOnFunction(Function &F) { 196 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); 197 if (!TPC) 198 return false; 199 200 auto &TM = TPC->getTM<TargetMachine>(); 201 if (!TM.getSubtargetImpl(F)->enableAtomicExpand()) 202 return false; 203 TLI = TM.getSubtargetImpl(F)->getTargetLowering(); 204 205 SmallVector<Instruction *, 1> AtomicInsts; 206 207 // Changing control-flow while iterating through it is a bad idea, so gather a 208 // list of all atomic instructions before we start. 209 for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) { 210 Instruction *I = &*II; 211 if (I->isAtomic() && !isa<FenceInst>(I)) 212 AtomicInsts.push_back(I); 213 } 214 215 bool MadeChange = false; 216 for (auto I : AtomicInsts) { 217 auto LI = dyn_cast<LoadInst>(I); 218 auto SI = dyn_cast<StoreInst>(I); 219 auto RMWI = dyn_cast<AtomicRMWInst>(I); 220 auto CASI = dyn_cast<AtomicCmpXchgInst>(I); 221 assert((LI || SI || RMWI || CASI) && "Unknown atomic instruction"); 222 223 // If the Size/Alignment is not supported, replace with a libcall. 224 if (LI) { 225 if (!atomicSizeSupported(TLI, LI)) { 226 expandAtomicLoadToLibcall(LI); 227 MadeChange = true; 228 continue; 229 } 230 } else if (SI) { 231 if (!atomicSizeSupported(TLI, SI)) { 232 expandAtomicStoreToLibcall(SI); 233 MadeChange = true; 234 continue; 235 } 236 } else if (RMWI) { 237 if (!atomicSizeSupported(TLI, RMWI)) { 238 expandAtomicRMWToLibcall(RMWI); 239 MadeChange = true; 240 continue; 241 } 242 } else if (CASI) { 243 if (!atomicSizeSupported(TLI, CASI)) { 244 expandAtomicCASToLibcall(CASI); 245 MadeChange = true; 246 continue; 247 } 248 } 249 250 if (TLI->shouldInsertFencesForAtomic(I)) { 251 auto FenceOrdering = AtomicOrdering::Monotonic; 252 if (LI && isAcquireOrStronger(LI->getOrdering())) { 253 FenceOrdering = LI->getOrdering(); 254 LI->setOrdering(AtomicOrdering::Monotonic); 255 } else if (SI && isReleaseOrStronger(SI->getOrdering())) { 256 FenceOrdering = SI->getOrdering(); 257 SI->setOrdering(AtomicOrdering::Monotonic); 258 } else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) || 259 isAcquireOrStronger(RMWI->getOrdering()))) { 260 FenceOrdering = RMWI->getOrdering(); 261 RMWI->setOrdering(AtomicOrdering::Monotonic); 262 } else if (CASI && !TLI->shouldExpandAtomicCmpXchgInIR(CASI) && 263 (isReleaseOrStronger(CASI->getSuccessOrdering()) || 264 isAcquireOrStronger(CASI->getSuccessOrdering()))) { 265 // If a compare and swap is lowered to LL/SC, we can do smarter fence 266 // insertion, with a stronger one on the success path than on the 267 // failure path. As a result, fence insertion is directly done by 268 // expandAtomicCmpXchg in that case. 
269 FenceOrdering = CASI->getSuccessOrdering(); 270 CASI->setSuccessOrdering(AtomicOrdering::Monotonic); 271 CASI->setFailureOrdering(AtomicOrdering::Monotonic); 272 } 273 274 if (FenceOrdering != AtomicOrdering::Monotonic) { 275 MadeChange |= bracketInstWithFences(I, FenceOrdering); 276 } 277 } 278 279 if (LI) { 280 if (LI->getType()->isFloatingPointTy()) { 281 // TODO: add a TLI hook to control this so that each target can 282 // convert to lowering the original type one at a time. 283 LI = convertAtomicLoadToIntegerType(LI); 284 assert(LI->getType()->isIntegerTy() && "invariant broken"); 285 MadeChange = true; 286 } 287 288 MadeChange |= tryExpandAtomicLoad(LI); 289 } else if (SI) { 290 if (SI->getValueOperand()->getType()->isFloatingPointTy()) { 291 // TODO: add a TLI hook to control this so that each target can 292 // convert to lowering the original type one at a time. 293 SI = convertAtomicStoreToIntegerType(SI); 294 assert(SI->getValueOperand()->getType()->isIntegerTy() && 295 "invariant broken"); 296 MadeChange = true; 297 } 298 299 if (TLI->shouldExpandAtomicStoreInIR(SI)) 300 MadeChange |= expandAtomicStore(SI); 301 } else if (RMWI) { 302 // There are two different ways of expanding RMW instructions: 303 // - into a load if it is idempotent 304 // - into a Cmpxchg/LL-SC loop otherwise 305 // we try them in that order. 306 307 if (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) { 308 MadeChange = true; 309 } else { 310 unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8; 311 unsigned ValueSize = getAtomicOpSize(RMWI); 312 AtomicRMWInst::BinOp Op = RMWI->getOperation(); 313 if (ValueSize < MinCASSize && 314 (Op == AtomicRMWInst::Or || Op == AtomicRMWInst::Xor || 315 Op == AtomicRMWInst::And)) { 316 RMWI = widenPartwordAtomicRMW(RMWI); 317 MadeChange = true; 318 } 319 320 MadeChange |= tryExpandAtomicRMW(RMWI); 321 } 322 } else if (CASI) { 323 // TODO: when we're ready to make the change at the IR level, we can 324 // extend convertCmpXchgToInteger for floating point too. 325 assert(!CASI->getCompareOperand()->getType()->isFloatingPointTy() && 326 "unimplemented - floating point not legal at IR level"); 327 if (CASI->getCompareOperand()->getType()->isPointerTy() ) { 328 // TODO: add a TLI hook to control this so that each target can 329 // convert to lowering the original type one at a time. 330 CASI = convertCmpXchgToIntegerType(CASI); 331 assert(CASI->getCompareOperand()->getType()->isIntegerTy() && 332 "invariant broken"); 333 MadeChange = true; 334 } 335 336 unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8; 337 unsigned ValueSize = getAtomicOpSize(CASI); 338 if (ValueSize < MinCASSize) { 339 assert(!TLI->shouldExpandAtomicCmpXchgInIR(CASI) && 340 "MinCmpXchgSizeInBits not yet supported for LL/SC expansions."); 341 expandPartwordCmpXchg(CASI); 342 } else { 343 if (TLI->shouldExpandAtomicCmpXchgInIR(CASI)) 344 MadeChange |= expandAtomicCmpXchg(CASI); 345 } 346 } 347 } 348 return MadeChange; 349 } 350 351 bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order) { 352 IRBuilder<> Builder(I); 353 354 auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order); 355 356 auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order); 357 // We have a guard here because not every atomic operation generates a 358 // trailing fence. 359 if (TrailingFence) 360 TrailingFence->moveAfter(I); 361 362 return (LeadingFence || TrailingFence); 363 } 364 365 /// Get the iX type with the same bitwidth as T. 
366 IntegerType *AtomicExpand::getCorrespondingIntegerType(Type *T, 367 const DataLayout &DL) { 368 EVT VT = TLI->getValueType(DL, T); 369 unsigned BitWidth = VT.getStoreSizeInBits(); 370 assert(BitWidth == VT.getSizeInBits() && "must be a power of two"); 371 return IntegerType::get(T->getContext(), BitWidth); 372 } 373 374 /// Convert an atomic load of a non-integral type to an integer load of the 375 /// equivalent bitwidth. See the function comment on 376 /// convertAtomicStoreToIntegerType for background. 377 LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) { 378 auto *M = LI->getModule(); 379 Type *NewTy = getCorrespondingIntegerType(LI->getType(), 380 M->getDataLayout()); 381 382 IRBuilder<> Builder(LI); 383 384 Value *Addr = LI->getPointerOperand(); 385 Type *PT = PointerType::get(NewTy, 386 Addr->getType()->getPointerAddressSpace()); 387 Value *NewAddr = Builder.CreateBitCast(Addr, PT); 388 389 auto *NewLI = Builder.CreateLoad(NewAddr); 390 NewLI->setAlignment(LI->getAlignment()); 391 NewLI->setVolatile(LI->isVolatile()); 392 NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID()); 393 LLVM_DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n"); 394 395 Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType()); 396 LI->replaceAllUsesWith(NewVal); 397 LI->eraseFromParent(); 398 return NewLI; 399 } 400 401 bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) { 402 switch (TLI->shouldExpandAtomicLoadInIR(LI)) { 403 case TargetLoweringBase::AtomicExpansionKind::None: 404 return false; 405 case TargetLoweringBase::AtomicExpansionKind::LLSC: 406 expandAtomicOpToLLSC( 407 LI, LI->getType(), LI->getPointerOperand(), LI->getOrdering(), 408 [](IRBuilder<> &Builder, Value *Loaded) { return Loaded; }); 409 return true; 410 case TargetLoweringBase::AtomicExpansionKind::LLOnly: 411 return expandAtomicLoadToLL(LI); 412 case TargetLoweringBase::AtomicExpansionKind::CmpXChg: 413 return expandAtomicLoadToCmpXchg(LI); 414 } 415 llvm_unreachable("Unhandled case in tryExpandAtomicLoad"); 416 } 417 418 bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) { 419 IRBuilder<> Builder(LI); 420 421 // On some architectures, load-linked instructions are atomic for larger 422 // sizes than normal loads. For example, the only 64-bit load guaranteed 423 // to be single-copy atomic by ARM is an ldrexd (A3.5.3). 424 Value *Val = 425 TLI->emitLoadLinked(Builder, LI->getPointerOperand(), LI->getOrdering()); 426 TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder); 427 428 LI->replaceAllUsesWith(Val); 429 LI->eraseFromParent(); 430 431 return true; 432 } 433 434 bool AtomicExpand::expandAtomicLoadToCmpXchg(LoadInst *LI) { 435 IRBuilder<> Builder(LI); 436 AtomicOrdering Order = LI->getOrdering(); 437 Value *Addr = LI->getPointerOperand(); 438 Type *Ty = cast<PointerType>(Addr->getType())->getElementType(); 439 Constant *DummyVal = Constant::getNullValue(Ty); 440 441 Value *Pair = Builder.CreateAtomicCmpXchg( 442 Addr, DummyVal, DummyVal, Order, 443 AtomicCmpXchgInst::getStrongestFailureOrdering(Order)); 444 Value *Loaded = Builder.CreateExtractValue(Pair, 0, "loaded"); 445 446 LI->replaceAllUsesWith(Loaded); 447 LI->eraseFromParent(); 448 449 return true; 450 } 451 452 /// Convert an atomic store of a non-integral type to an integer store of the 453 /// equivalent bitwidth. We used to not support floating point or vector 454 /// atomics in the IR at all. 
The backends learned to deal with the bitcast 455 /// idiom because that was the only way of expressing the notion of a atomic 456 /// float or vector store. The long term plan is to teach each backend to 457 /// instruction select from the original atomic store, but as a migration 458 /// mechanism, we convert back to the old format which the backends understand. 459 /// Each backend will need individual work to recognize the new format. 460 StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) { 461 IRBuilder<> Builder(SI); 462 auto *M = SI->getModule(); 463 Type *NewTy = getCorrespondingIntegerType(SI->getValueOperand()->getType(), 464 M->getDataLayout()); 465 Value *NewVal = Builder.CreateBitCast(SI->getValueOperand(), NewTy); 466 467 Value *Addr = SI->getPointerOperand(); 468 Type *PT = PointerType::get(NewTy, 469 Addr->getType()->getPointerAddressSpace()); 470 Value *NewAddr = Builder.CreateBitCast(Addr, PT); 471 472 StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr); 473 NewSI->setAlignment(SI->getAlignment()); 474 NewSI->setVolatile(SI->isVolatile()); 475 NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID()); 476 LLVM_DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n"); 477 SI->eraseFromParent(); 478 return NewSI; 479 } 480 481 bool AtomicExpand::expandAtomicStore(StoreInst *SI) { 482 // This function is only called on atomic stores that are too large to be 483 // atomic if implemented as a native store. So we replace them by an 484 // atomic swap, that can be implemented for example as a ldrex/strex on ARM 485 // or lock cmpxchg8/16b on X86, as these are atomic for larger sizes. 486 // It is the responsibility of the target to only signal expansion via 487 // shouldExpandAtomicRMW in cases where this is required and possible. 488 IRBuilder<> Builder(SI); 489 AtomicRMWInst *AI = 490 Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(), 491 SI->getValueOperand(), SI->getOrdering()); 492 SI->eraseFromParent(); 493 494 // Now we have an appropriate swap instruction, lower it as usual. 495 return tryExpandAtomicRMW(AI); 496 } 497 498 static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr, 499 Value *Loaded, Value *NewVal, 500 AtomicOrdering MemOpOrder, 501 Value *&Success, Value *&NewLoaded) { 502 Value* Pair = Builder.CreateAtomicCmpXchg( 503 Addr, Loaded, NewVal, MemOpOrder, 504 AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder)); 505 Success = Builder.CreateExtractValue(Pair, 1, "success"); 506 NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded"); 507 } 508 509 /// Emit IR to implement the given atomicrmw operation on values in registers, 510 /// returning the new value. 
511 static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder, 512 Value *Loaded, Value *Inc) { 513 Value *NewVal; 514 switch (Op) { 515 case AtomicRMWInst::Xchg: 516 return Inc; 517 case AtomicRMWInst::Add: 518 return Builder.CreateAdd(Loaded, Inc, "new"); 519 case AtomicRMWInst::Sub: 520 return Builder.CreateSub(Loaded, Inc, "new"); 521 case AtomicRMWInst::And: 522 return Builder.CreateAnd(Loaded, Inc, "new"); 523 case AtomicRMWInst::Nand: 524 return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new"); 525 case AtomicRMWInst::Or: 526 return Builder.CreateOr(Loaded, Inc, "new"); 527 case AtomicRMWInst::Xor: 528 return Builder.CreateXor(Loaded, Inc, "new"); 529 case AtomicRMWInst::Max: 530 NewVal = Builder.CreateICmpSGT(Loaded, Inc); 531 return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); 532 case AtomicRMWInst::Min: 533 NewVal = Builder.CreateICmpSLE(Loaded, Inc); 534 return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); 535 case AtomicRMWInst::UMax: 536 NewVal = Builder.CreateICmpUGT(Loaded, Inc); 537 return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); 538 case AtomicRMWInst::UMin: 539 NewVal = Builder.CreateICmpULE(Loaded, Inc); 540 return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); 541 default: 542 llvm_unreachable("Unknown atomic op"); 543 } 544 } 545 546 bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) { 547 switch (TLI->shouldExpandAtomicRMWInIR(AI)) { 548 case TargetLoweringBase::AtomicExpansionKind::None: 549 return false; 550 case TargetLoweringBase::AtomicExpansionKind::LLSC: { 551 unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8; 552 unsigned ValueSize = getAtomicOpSize(AI); 553 if (ValueSize < MinCASSize) { 554 llvm_unreachable( 555 "MinCmpXchgSizeInBits not yet supported for LL/SC architectures."); 556 } else { 557 auto PerformOp = [&](IRBuilder<> &Builder, Value *Loaded) { 558 return performAtomicOp(AI->getOperation(), Builder, Loaded, 559 AI->getValOperand()); 560 }; 561 expandAtomicOpToLLSC(AI, AI->getType(), AI->getPointerOperand(), 562 AI->getOrdering(), PerformOp); 563 } 564 return true; 565 } 566 case TargetLoweringBase::AtomicExpansionKind::CmpXChg: { 567 unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8; 568 unsigned ValueSize = getAtomicOpSize(AI); 569 if (ValueSize < MinCASSize) { 570 expandPartwordAtomicRMW(AI, 571 TargetLoweringBase::AtomicExpansionKind::CmpXChg); 572 } else { 573 expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun); 574 } 575 return true; 576 } 577 default: 578 llvm_unreachable("Unhandled case in tryExpandAtomicRMW"); 579 } 580 } 581 582 namespace { 583 584 /// Result values from createMaskInstrs helper. 585 struct PartwordMaskValues { 586 Type *WordType; 587 Type *ValueType; 588 Value *AlignedAddr; 589 Value *ShiftAmt; 590 Value *Mask; 591 Value *Inv_Mask; 592 }; 593 594 } // end anonymous namespace 595 596 /// This is a helper function which builds instructions to provide 597 /// values necessary for partword atomic operations. It takes an 598 /// incoming address, Addr, and ValueType, and constructs the address, 599 /// shift-amounts and masks needed to work with a larger value of size 600 /// WordSize. 601 /// 602 /// AlignedAddr: Addr rounded down to a multiple of WordSize 603 /// 604 /// ShiftAmt: Number of bits to right-shift a WordSize value loaded 605 /// from AlignAddr for it to have the same value as if 606 /// ValueType was loaded from Addr. 
607 /// 608 /// Mask: Value to mask with the value loaded from AlignAddr to 609 /// include only the part that would've been loaded from Addr. 610 /// 611 /// Inv_Mask: The inverse of Mask. 612 static PartwordMaskValues createMaskInstrs(IRBuilder<> &Builder, Instruction *I, 613 Type *ValueType, Value *Addr, 614 unsigned WordSize) { 615 PartwordMaskValues Ret; 616 617 BasicBlock *BB = I->getParent(); 618 Function *F = BB->getParent(); 619 Module *M = I->getModule(); 620 621 LLVMContext &Ctx = F->getContext(); 622 const DataLayout &DL = M->getDataLayout(); 623 624 unsigned ValueSize = DL.getTypeStoreSize(ValueType); 625 626 assert(ValueSize < WordSize); 627 628 Ret.ValueType = ValueType; 629 Ret.WordType = Type::getIntNTy(Ctx, WordSize * 8); 630 631 Type *WordPtrType = 632 Ret.WordType->getPointerTo(Addr->getType()->getPointerAddressSpace()); 633 634 Value *AddrInt = Builder.CreatePtrToInt(Addr, DL.getIntPtrType(Ctx)); 635 Ret.AlignedAddr = Builder.CreateIntToPtr( 636 Builder.CreateAnd(AddrInt, ~(uint64_t)(WordSize - 1)), WordPtrType, 637 "AlignedAddr"); 638 639 Value *PtrLSB = Builder.CreateAnd(AddrInt, WordSize - 1, "PtrLSB"); 640 if (DL.isLittleEndian()) { 641 // turn bytes into bits 642 Ret.ShiftAmt = Builder.CreateShl(PtrLSB, 3); 643 } else { 644 // turn bytes into bits, and count from the other side. 645 Ret.ShiftAmt = 646 Builder.CreateShl(Builder.CreateXor(PtrLSB, WordSize - ValueSize), 3); 647 } 648 649 Ret.ShiftAmt = Builder.CreateTrunc(Ret.ShiftAmt, Ret.WordType, "ShiftAmt"); 650 Ret.Mask = Builder.CreateShl( 651 ConstantInt::get(Ret.WordType, (1 << ValueSize * 8) - 1), Ret.ShiftAmt, 652 "Mask"); 653 Ret.Inv_Mask = Builder.CreateNot(Ret.Mask, "Inv_Mask"); 654 655 return Ret; 656 } 657 658 /// Emit IR to implement a masked version of a given atomicrmw 659 /// operation. (That is, only the bits under the Mask should be 660 /// affected by the operation) 661 static Value *performMaskedAtomicOp(AtomicRMWInst::BinOp Op, 662 IRBuilder<> &Builder, Value *Loaded, 663 Value *Shifted_Inc, Value *Inc, 664 const PartwordMaskValues &PMV) { 665 switch (Op) { 666 case AtomicRMWInst::Xchg: { 667 Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask); 668 Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, Shifted_Inc); 669 return FinalVal; 670 } 671 case AtomicRMWInst::Or: 672 case AtomicRMWInst::Xor: 673 case AtomicRMWInst::And: 674 llvm_unreachable("Or/Xor/And handled by widenPartwordAtomicRMW"); 675 case AtomicRMWInst::Add: 676 case AtomicRMWInst::Sub: 677 case AtomicRMWInst::Nand: { 678 // The other arithmetic ops need to be masked into place. 679 Value *NewVal = performAtomicOp(Op, Builder, Loaded, Shifted_Inc); 680 Value *NewVal_Masked = Builder.CreateAnd(NewVal, PMV.Mask); 681 Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask); 682 Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Masked); 683 return FinalVal; 684 } 685 case AtomicRMWInst::Max: 686 case AtomicRMWInst::Min: 687 case AtomicRMWInst::UMax: 688 case AtomicRMWInst::UMin: { 689 // Finally, comparison ops will operate on the full value, so 690 // truncate down to the original size, and expand out again after 691 // doing the operation. 
692 Value *Loaded_Shiftdown = Builder.CreateTrunc( 693 Builder.CreateLShr(Loaded, PMV.ShiftAmt), PMV.ValueType); 694 Value *NewVal = performAtomicOp(Op, Builder, Loaded_Shiftdown, Inc); 695 Value *NewVal_Shiftup = Builder.CreateShl( 696 Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt); 697 Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask); 698 Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shiftup); 699 return FinalVal; 700 } 701 default: 702 llvm_unreachable("Unknown atomic op"); 703 } 704 } 705 706 /// Expand a sub-word atomicrmw operation into an appropriate 707 /// word-sized operation. 708 /// 709 /// It will create an LL/SC or cmpxchg loop, as appropriate, the same 710 /// way as a typical atomicrmw expansion. The only difference here is 711 /// that the operation inside of the loop must operate only upon a 712 /// part of the value. 713 void AtomicExpand::expandPartwordAtomicRMW( 714 AtomicRMWInst *AI, TargetLoweringBase::AtomicExpansionKind ExpansionKind) { 715 assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg); 716 717 AtomicOrdering MemOpOrder = AI->getOrdering(); 718 719 IRBuilder<> Builder(AI); 720 721 PartwordMaskValues PMV = 722 createMaskInstrs(Builder, AI, AI->getType(), AI->getPointerOperand(), 723 TLI->getMinCmpXchgSizeInBits() / 8); 724 725 Value *ValOperand_Shifted = 726 Builder.CreateShl(Builder.CreateZExt(AI->getValOperand(), PMV.WordType), 727 PMV.ShiftAmt, "ValOperand_Shifted"); 728 729 auto PerformPartwordOp = [&](IRBuilder<> &Builder, Value *Loaded) { 730 return performMaskedAtomicOp(AI->getOperation(), Builder, Loaded, 731 ValOperand_Shifted, AI->getValOperand(), PMV); 732 }; 733 734 // TODO: When we're ready to support LLSC conversions too, use 735 // insertRMWLLSCLoop here for ExpansionKind==LLSC. 736 Value *OldResult = 737 insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr, MemOpOrder, 738 PerformPartwordOp, createCmpXchgInstFun); 739 Value *FinalOldResult = Builder.CreateTrunc( 740 Builder.CreateLShr(OldResult, PMV.ShiftAmt), PMV.ValueType); 741 AI->replaceAllUsesWith(FinalOldResult); 742 AI->eraseFromParent(); 743 } 744 745 // Widen the bitwise atomicrmw (or/xor/and) to the minimum supported width. 746 AtomicRMWInst *AtomicExpand::widenPartwordAtomicRMW(AtomicRMWInst *AI) { 747 IRBuilder<> Builder(AI); 748 AtomicRMWInst::BinOp Op = AI->getOperation(); 749 750 assert((Op == AtomicRMWInst::Or || Op == AtomicRMWInst::Xor || 751 Op == AtomicRMWInst::And) && 752 "Unable to widen operation"); 753 754 PartwordMaskValues PMV = 755 createMaskInstrs(Builder, AI, AI->getType(), AI->getPointerOperand(), 756 TLI->getMinCmpXchgSizeInBits() / 8); 757 758 Value *ValOperand_Shifted = 759 Builder.CreateShl(Builder.CreateZExt(AI->getValOperand(), PMV.WordType), 760 PMV.ShiftAmt, "ValOperand_Shifted"); 761 762 Value *NewOperand; 763 764 if (Op == AtomicRMWInst::And) 765 NewOperand = 766 Builder.CreateOr(PMV.Inv_Mask, ValOperand_Shifted, "AndOperand"); 767 else 768 NewOperand = ValOperand_Shifted; 769 770 AtomicRMWInst *NewAI = Builder.CreateAtomicRMW(Op, PMV.AlignedAddr, 771 NewOperand, AI->getOrdering()); 772 773 Value *FinalOldResult = Builder.CreateTrunc( 774 Builder.CreateLShr(NewAI, PMV.ShiftAmt), PMV.ValueType); 775 AI->replaceAllUsesWith(FinalOldResult); 776 AI->eraseFromParent(); 777 return NewAI; 778 } 779 780 void AtomicExpand::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) { 781 // The basic idea here is that we're expanding a cmpxchg of a 782 // smaller memory size up to a word-sized cmpxchg. 
To do this, we 783 // need to add a retry-loop for strong cmpxchg, so that 784 // modifications to other parts of the word don't cause a spurious 785 // failure. 786 787 // This generates code like the following: 788 // [[Setup mask values PMV.*]] 789 // %NewVal_Shifted = shl i32 %NewVal, %PMV.ShiftAmt 790 // %Cmp_Shifted = shl i32 %Cmp, %PMV.ShiftAmt 791 // %InitLoaded = load i32* %addr 792 // %InitLoaded_MaskOut = and i32 %InitLoaded, %PMV.Inv_Mask 793 // br partword.cmpxchg.loop 794 // partword.cmpxchg.loop: 795 // %Loaded_MaskOut = phi i32 [ %InitLoaded_MaskOut, %entry ], 796 // [ %OldVal_MaskOut, %partword.cmpxchg.failure ] 797 // %FullWord_NewVal = or i32 %Loaded_MaskOut, %NewVal_Shifted 798 // %FullWord_Cmp = or i32 %Loaded_MaskOut, %Cmp_Shifted 799 // %NewCI = cmpxchg i32* %PMV.AlignedAddr, i32 %FullWord_Cmp, 800 // i32 %FullWord_NewVal success_ordering failure_ordering 801 // %OldVal = extractvalue { i32, i1 } %NewCI, 0 802 // %Success = extractvalue { i32, i1 } %NewCI, 1 803 // br i1 %Success, label %partword.cmpxchg.end, 804 // label %partword.cmpxchg.failure 805 // partword.cmpxchg.failure: 806 // %OldVal_MaskOut = and i32 %OldVal, %PMV.Inv_Mask 807 // %ShouldContinue = icmp ne i32 %Loaded_MaskOut, %OldVal_MaskOut 808 // br i1 %ShouldContinue, label %partword.cmpxchg.loop, 809 // label %partword.cmpxchg.end 810 // partword.cmpxchg.end: 811 // %tmp1 = lshr i32 %OldVal, %PMV.ShiftAmt 812 // %FinalOldVal = trunc i32 %tmp1 to i8 813 // %tmp2 = insertvalue { i8, i1 } undef, i8 %FinalOldVal, 0 814 // %Res = insertvalue { i8, i1 } %25, i1 %Success, 1 815 816 Value *Addr = CI->getPointerOperand(); 817 Value *Cmp = CI->getCompareOperand(); 818 Value *NewVal = CI->getNewValOperand(); 819 820 BasicBlock *BB = CI->getParent(); 821 Function *F = BB->getParent(); 822 IRBuilder<> Builder(CI); 823 LLVMContext &Ctx = Builder.getContext(); 824 825 const int WordSize = TLI->getMinCmpXchgSizeInBits() / 8; 826 827 BasicBlock *EndBB = 828 BB->splitBasicBlock(CI->getIterator(), "partword.cmpxchg.end"); 829 auto FailureBB = 830 BasicBlock::Create(Ctx, "partword.cmpxchg.failure", F, EndBB); 831 auto LoopBB = BasicBlock::Create(Ctx, "partword.cmpxchg.loop", F, FailureBB); 832 833 // The split call above "helpfully" added a branch at the end of BB 834 // (to the wrong place). 835 std::prev(BB->end())->eraseFromParent(); 836 Builder.SetInsertPoint(BB); 837 838 PartwordMaskValues PMV = createMaskInstrs( 839 Builder, CI, CI->getCompareOperand()->getType(), Addr, WordSize); 840 841 // Shift the incoming values over, into the right location in the word. 842 Value *NewVal_Shifted = 843 Builder.CreateShl(Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt); 844 Value *Cmp_Shifted = 845 Builder.CreateShl(Builder.CreateZExt(Cmp, PMV.WordType), PMV.ShiftAmt); 846 847 // Load the entire current word, and mask into place the expected and new 848 // values 849 LoadInst *InitLoaded = Builder.CreateLoad(PMV.WordType, PMV.AlignedAddr); 850 InitLoaded->setVolatile(CI->isVolatile()); 851 Value *InitLoaded_MaskOut = Builder.CreateAnd(InitLoaded, PMV.Inv_Mask); 852 Builder.CreateBr(LoopBB); 853 854 // partword.cmpxchg.loop: 855 Builder.SetInsertPoint(LoopBB); 856 PHINode *Loaded_MaskOut = Builder.CreatePHI(PMV.WordType, 2); 857 Loaded_MaskOut->addIncoming(InitLoaded_MaskOut, BB); 858 859 // Mask/Or the expected and new values into place in the loaded word. 
860 Value *FullWord_NewVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shifted); 861 Value *FullWord_Cmp = Builder.CreateOr(Loaded_MaskOut, Cmp_Shifted); 862 AtomicCmpXchgInst *NewCI = Builder.CreateAtomicCmpXchg( 863 PMV.AlignedAddr, FullWord_Cmp, FullWord_NewVal, CI->getSuccessOrdering(), 864 CI->getFailureOrdering(), CI->getSyncScopeID()); 865 NewCI->setVolatile(CI->isVolatile()); 866 // When we're building a strong cmpxchg, we need a loop, so you 867 // might think we could use a weak cmpxchg inside. But, using strong 868 // allows the below comparison for ShouldContinue, and we're 869 // expecting the underlying cmpxchg to be a machine instruction, 870 // which is strong anyways. 871 NewCI->setWeak(CI->isWeak()); 872 873 Value *OldVal = Builder.CreateExtractValue(NewCI, 0); 874 Value *Success = Builder.CreateExtractValue(NewCI, 1); 875 876 if (CI->isWeak()) 877 Builder.CreateBr(EndBB); 878 else 879 Builder.CreateCondBr(Success, EndBB, FailureBB); 880 881 // partword.cmpxchg.failure: 882 Builder.SetInsertPoint(FailureBB); 883 // Upon failure, verify that the masked-out part of the loaded value 884 // has been modified. If it didn't, abort the cmpxchg, since the 885 // masked-in part must've. 886 Value *OldVal_MaskOut = Builder.CreateAnd(OldVal, PMV.Inv_Mask); 887 Value *ShouldContinue = Builder.CreateICmpNE(Loaded_MaskOut, OldVal_MaskOut); 888 Builder.CreateCondBr(ShouldContinue, LoopBB, EndBB); 889 890 // Add the second value to the phi from above 891 Loaded_MaskOut->addIncoming(OldVal_MaskOut, FailureBB); 892 893 // partword.cmpxchg.end: 894 Builder.SetInsertPoint(CI); 895 896 Value *FinalOldVal = Builder.CreateTrunc( 897 Builder.CreateLShr(OldVal, PMV.ShiftAmt), PMV.ValueType); 898 Value *Res = UndefValue::get(CI->getType()); 899 Res = Builder.CreateInsertValue(Res, FinalOldVal, 0); 900 Res = Builder.CreateInsertValue(Res, Success, 1); 901 902 CI->replaceAllUsesWith(Res); 903 CI->eraseFromParent(); 904 } 905 906 void AtomicExpand::expandAtomicOpToLLSC( 907 Instruction *I, Type *ResultType, Value *Addr, AtomicOrdering MemOpOrder, 908 function_ref<Value *(IRBuilder<> &, Value *)> PerformOp) { 909 IRBuilder<> Builder(I); 910 Value *Loaded = 911 insertRMWLLSCLoop(Builder, ResultType, Addr, MemOpOrder, PerformOp); 912 913 I->replaceAllUsesWith(Loaded); 914 I->eraseFromParent(); 915 } 916 917 Value *AtomicExpand::insertRMWLLSCLoop( 918 IRBuilder<> &Builder, Type *ResultTy, Value *Addr, 919 AtomicOrdering MemOpOrder, 920 function_ref<Value *(IRBuilder<> &, Value *)> PerformOp) { 921 LLVMContext &Ctx = Builder.getContext(); 922 BasicBlock *BB = Builder.GetInsertBlock(); 923 Function *F = BB->getParent(); 924 925 // Given: atomicrmw some_op iN* %addr, iN %incr ordering 926 // 927 // The standard expansion we produce is: 928 // [...] 929 // atomicrmw.start: 930 // %loaded = @load.linked(%addr) 931 // %new = some_op iN %loaded, %incr 932 // %stored = @store_conditional(%new, %addr) 933 // %try_again = icmp i32 ne %stored, 0 934 // br i1 %try_again, label %loop, label %atomicrmw.end 935 // atomicrmw.end: 936 // [...] 937 BasicBlock *ExitBB = 938 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end"); 939 BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); 940 941 // The split call above "helpfully" added a branch at the end of BB (to the 942 // wrong place). 943 std::prev(BB->end())->eraseFromParent(); 944 Builder.SetInsertPoint(BB); 945 Builder.CreateBr(LoopBB); 946 947 // Start the main loop block now that we've taken care of the preliminaries. 
948 Builder.SetInsertPoint(LoopBB); 949 Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder); 950 951 Value *NewVal = PerformOp(Builder, Loaded); 952 953 Value *StoreSuccess = 954 TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder); 955 Value *TryAgain = Builder.CreateICmpNE( 956 StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain"); 957 Builder.CreateCondBr(TryAgain, LoopBB, ExitBB); 958 959 Builder.SetInsertPoint(ExitBB, ExitBB->begin()); 960 return Loaded; 961 } 962 963 /// Convert an atomic cmpxchg of a non-integral type to an integer cmpxchg of 964 /// the equivalent bitwidth. We used to not support pointer cmpxchg in the 965 /// IR. As a migration step, we convert back to what use to be the standard 966 /// way to represent a pointer cmpxchg so that we can update backends one by 967 /// one. 968 AtomicCmpXchgInst *AtomicExpand::convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI) { 969 auto *M = CI->getModule(); 970 Type *NewTy = getCorrespondingIntegerType(CI->getCompareOperand()->getType(), 971 M->getDataLayout()); 972 973 IRBuilder<> Builder(CI); 974 975 Value *Addr = CI->getPointerOperand(); 976 Type *PT = PointerType::get(NewTy, 977 Addr->getType()->getPointerAddressSpace()); 978 Value *NewAddr = Builder.CreateBitCast(Addr, PT); 979 980 Value *NewCmp = Builder.CreatePtrToInt(CI->getCompareOperand(), NewTy); 981 Value *NewNewVal = Builder.CreatePtrToInt(CI->getNewValOperand(), NewTy); 982 983 984 auto *NewCI = Builder.CreateAtomicCmpXchg(NewAddr, NewCmp, NewNewVal, 985 CI->getSuccessOrdering(), 986 CI->getFailureOrdering(), 987 CI->getSyncScopeID()); 988 NewCI->setVolatile(CI->isVolatile()); 989 NewCI->setWeak(CI->isWeak()); 990 LLVM_DEBUG(dbgs() << "Replaced " << *CI << " with " << *NewCI << "\n"); 991 992 Value *OldVal = Builder.CreateExtractValue(NewCI, 0); 993 Value *Succ = Builder.CreateExtractValue(NewCI, 1); 994 995 OldVal = Builder.CreateIntToPtr(OldVal, CI->getCompareOperand()->getType()); 996 997 Value *Res = UndefValue::get(CI->getType()); 998 Res = Builder.CreateInsertValue(Res, OldVal, 0); 999 Res = Builder.CreateInsertValue(Res, Succ, 1); 1000 1001 CI->replaceAllUsesWith(Res); 1002 CI->eraseFromParent(); 1003 return NewCI; 1004 } 1005 1006 bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { 1007 AtomicOrdering SuccessOrder = CI->getSuccessOrdering(); 1008 AtomicOrdering FailureOrder = CI->getFailureOrdering(); 1009 Value *Addr = CI->getPointerOperand(); 1010 BasicBlock *BB = CI->getParent(); 1011 Function *F = BB->getParent(); 1012 LLVMContext &Ctx = F->getContext(); 1013 // If shouldInsertFencesForAtomic() returns true, then the target does not 1014 // want to deal with memory orders, and emitLeading/TrailingFence should take 1015 // care of everything. Otherwise, emitLeading/TrailingFence are no-op and we 1016 // should preserve the ordering. 1017 bool ShouldInsertFencesForAtomic = TLI->shouldInsertFencesForAtomic(CI); 1018 AtomicOrdering MemOpOrder = 1019 ShouldInsertFencesForAtomic ? AtomicOrdering::Monotonic : SuccessOrder; 1020 1021 // In implementations which use a barrier to achieve release semantics, we can 1022 // delay emitting this barrier until we know a store is actually going to be 1023 // attempted. The cost of this delay is that we need 2 copies of the block 1024 // emitting the load-linked, affecting code size. 1025 // 1026 // Ideally, this logic would be unconditional except for the minsize check 1027 // since in other cases the extra blocks naturally collapse down to the 1028 // minimal loop. 
Unfortunately, this puts too much stress on later 1029 // optimisations so we avoid emitting the extra logic in those cases too. 1030 bool HasReleasedLoadBB = !CI->isWeak() && ShouldInsertFencesForAtomic && 1031 SuccessOrder != AtomicOrdering::Monotonic && 1032 SuccessOrder != AtomicOrdering::Acquire && 1033 !F->optForMinSize(); 1034 1035 // There's no overhead for sinking the release barrier in a weak cmpxchg, so 1036 // do it even on minsize. 1037 bool UseUnconditionalReleaseBarrier = F->optForMinSize() && !CI->isWeak(); 1038 1039 // Given: cmpxchg some_op iN* %addr, iN %desired, iN %new success_ord fail_ord 1040 // 1041 // The full expansion we produce is: 1042 // [...] 1043 // cmpxchg.start: 1044 // %unreleasedload = @load.linked(%addr) 1045 // %should_store = icmp eq %unreleasedload, %desired 1046 // br i1 %should_store, label %cmpxchg.fencedstore, 1047 // label %cmpxchg.nostore 1048 // cmpxchg.releasingstore: 1049 // fence? 1050 // br label cmpxchg.trystore 1051 // cmpxchg.trystore: 1052 // %loaded.trystore = phi [%unreleasedload, %releasingstore], 1053 // [%releasedload, %cmpxchg.releasedload] 1054 // %stored = @store_conditional(%new, %addr) 1055 // %success = icmp eq i32 %stored, 0 1056 // br i1 %success, label %cmpxchg.success, 1057 // label %cmpxchg.releasedload/%cmpxchg.failure 1058 // cmpxchg.releasedload: 1059 // %releasedload = @load.linked(%addr) 1060 // %should_store = icmp eq %releasedload, %desired 1061 // br i1 %should_store, label %cmpxchg.trystore, 1062 // label %cmpxchg.failure 1063 // cmpxchg.success: 1064 // fence? 1065 // br label %cmpxchg.end 1066 // cmpxchg.nostore: 1067 // %loaded.nostore = phi [%unreleasedload, %cmpxchg.start], 1068 // [%releasedload, 1069 // %cmpxchg.releasedload/%cmpxchg.trystore] 1070 // @load_linked_fail_balance()? 1071 // br label %cmpxchg.failure 1072 // cmpxchg.failure: 1073 // fence? 1074 // br label %cmpxchg.end 1075 // cmpxchg.end: 1076 // %loaded = phi [%loaded.nostore, %cmpxchg.failure], 1077 // [%loaded.trystore, %cmpxchg.trystore] 1078 // %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure] 1079 // %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0 1080 // %res = insertvalue { iN, i1 } %restmp, i1 %success, 1 1081 // [...] 1082 BasicBlock *ExitBB = BB->splitBasicBlock(CI->getIterator(), "cmpxchg.end"); 1083 auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB); 1084 auto NoStoreBB = BasicBlock::Create(Ctx, "cmpxchg.nostore", F, FailureBB); 1085 auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, NoStoreBB); 1086 auto ReleasedLoadBB = 1087 BasicBlock::Create(Ctx, "cmpxchg.releasedload", F, SuccessBB); 1088 auto TryStoreBB = 1089 BasicBlock::Create(Ctx, "cmpxchg.trystore", F, ReleasedLoadBB); 1090 auto ReleasingStoreBB = 1091 BasicBlock::Create(Ctx, "cmpxchg.fencedstore", F, TryStoreBB); 1092 auto StartBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, ReleasingStoreBB); 1093 1094 // This grabs the DebugLoc from CI 1095 IRBuilder<> Builder(CI); 1096 1097 // The split call above "helpfully" added a branch at the end of BB (to the 1098 // wrong place), but we might want a fence too. It's easiest to just remove 1099 // the branch entirely. 1100 std::prev(BB->end())->eraseFromParent(); 1101 Builder.SetInsertPoint(BB); 1102 if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier) 1103 TLI->emitLeadingFence(Builder, CI, SuccessOrder); 1104 Builder.CreateBr(StartBB); 1105 1106 // Start the main loop block now that we've taken care of the preliminaries. 
1107 Builder.SetInsertPoint(StartBB); 1108 Value *UnreleasedLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder); 1109 Value *ShouldStore = Builder.CreateICmpEQ( 1110 UnreleasedLoad, CI->getCompareOperand(), "should_store"); 1111 1112 // If the cmpxchg doesn't actually need any ordering when it fails, we can 1113 // jump straight past that fence instruction (if it exists). 1114 Builder.CreateCondBr(ShouldStore, ReleasingStoreBB, NoStoreBB); 1115 1116 Builder.SetInsertPoint(ReleasingStoreBB); 1117 if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier) 1118 TLI->emitLeadingFence(Builder, CI, SuccessOrder); 1119 Builder.CreateBr(TryStoreBB); 1120 1121 Builder.SetInsertPoint(TryStoreBB); 1122 Value *StoreSuccess = TLI->emitStoreConditional( 1123 Builder, CI->getNewValOperand(), Addr, MemOpOrder); 1124 StoreSuccess = Builder.CreateICmpEQ( 1125 StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success"); 1126 BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB; 1127 Builder.CreateCondBr(StoreSuccess, SuccessBB, 1128 CI->isWeak() ? FailureBB : RetryBB); 1129 1130 Builder.SetInsertPoint(ReleasedLoadBB); 1131 Value *SecondLoad; 1132 if (HasReleasedLoadBB) { 1133 SecondLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder); 1134 ShouldStore = Builder.CreateICmpEQ(SecondLoad, CI->getCompareOperand(), 1135 "should_store"); 1136 1137 // If the cmpxchg doesn't actually need any ordering when it fails, we can 1138 // jump straight past that fence instruction (if it exists). 1139 Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB); 1140 } else 1141 Builder.CreateUnreachable(); 1142 1143 // Make sure later instructions don't get reordered with a fence if 1144 // necessary. 1145 Builder.SetInsertPoint(SuccessBB); 1146 if (ShouldInsertFencesForAtomic) 1147 TLI->emitTrailingFence(Builder, CI, SuccessOrder); 1148 Builder.CreateBr(ExitBB); 1149 1150 Builder.SetInsertPoint(NoStoreBB); 1151 // In the failing case, where we don't execute the store-conditional, the 1152 // target might want to balance out the load-linked with a dedicated 1153 // instruction (e.g., on ARM, clearing the exclusive monitor). 1154 TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder); 1155 Builder.CreateBr(FailureBB); 1156 1157 Builder.SetInsertPoint(FailureBB); 1158 if (ShouldInsertFencesForAtomic) 1159 TLI->emitTrailingFence(Builder, CI, FailureOrder); 1160 Builder.CreateBr(ExitBB); 1161 1162 // Finally, we have control-flow based knowledge of whether the cmpxchg 1163 // succeeded or not. We expose this to later passes by converting any 1164 // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate 1165 // PHI. 1166 Builder.SetInsertPoint(ExitBB, ExitBB->begin()); 1167 PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2); 1168 Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB); 1169 Success->addIncoming(ConstantInt::getFalse(Ctx), FailureBB); 1170 1171 // Setup the builder so we can create any PHIs we need. 
1172 Value *Loaded; 1173 if (!HasReleasedLoadBB) 1174 Loaded = UnreleasedLoad; 1175 else { 1176 Builder.SetInsertPoint(TryStoreBB, TryStoreBB->begin()); 1177 PHINode *TryStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2); 1178 TryStoreLoaded->addIncoming(UnreleasedLoad, ReleasingStoreBB); 1179 TryStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB); 1180 1181 Builder.SetInsertPoint(NoStoreBB, NoStoreBB->begin()); 1182 PHINode *NoStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2); 1183 NoStoreLoaded->addIncoming(UnreleasedLoad, StartBB); 1184 NoStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB); 1185 1186 Builder.SetInsertPoint(ExitBB, ++ExitBB->begin()); 1187 PHINode *ExitLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2); 1188 ExitLoaded->addIncoming(TryStoreLoaded, SuccessBB); 1189 ExitLoaded->addIncoming(NoStoreLoaded, FailureBB); 1190 1191 Loaded = ExitLoaded; 1192 } 1193 1194 // Look for any users of the cmpxchg that are just comparing the loaded value 1195 // against the desired one, and replace them with the CFG-derived version. 1196 SmallVector<ExtractValueInst *, 2> PrunedInsts; 1197 for (auto User : CI->users()) { 1198 ExtractValueInst *EV = dyn_cast<ExtractValueInst>(User); 1199 if (!EV) 1200 continue; 1201 1202 assert(EV->getNumIndices() == 1 && EV->getIndices()[0] <= 1 && 1203 "weird extraction from { iN, i1 }"); 1204 1205 if (EV->getIndices()[0] == 0) 1206 EV->replaceAllUsesWith(Loaded); 1207 else 1208 EV->replaceAllUsesWith(Success); 1209 1210 PrunedInsts.push_back(EV); 1211 } 1212 1213 // We can remove the instructions now we're no longer iterating through them. 1214 for (auto EV : PrunedInsts) 1215 EV->eraseFromParent(); 1216 1217 if (!CI->use_empty()) { 1218 // Some use of the full struct return that we don't understand has happened, 1219 // so we've got to reconstruct it properly. 1220 Value *Res; 1221 Res = Builder.CreateInsertValue(UndefValue::get(CI->getType()), Loaded, 0); 1222 Res = Builder.CreateInsertValue(Res, Success, 1); 1223 1224 CI->replaceAllUsesWith(Res); 1225 } 1226 1227 CI->eraseFromParent(); 1228 return true; 1229 } 1230 1231 bool AtomicExpand::isIdempotentRMW(AtomicRMWInst* RMWI) { 1232 auto C = dyn_cast<ConstantInt>(RMWI->getValOperand()); 1233 if(!C) 1234 return false; 1235 1236 AtomicRMWInst::BinOp Op = RMWI->getOperation(); 1237 switch(Op) { 1238 case AtomicRMWInst::Add: 1239 case AtomicRMWInst::Sub: 1240 case AtomicRMWInst::Or: 1241 case AtomicRMWInst::Xor: 1242 return C->isZero(); 1243 case AtomicRMWInst::And: 1244 return C->isMinusOne(); 1245 // FIXME: we could also treat Min/Max/UMin/UMax by the INT_MIN/INT_MAX/... 1246 default: 1247 return false; 1248 } 1249 } 1250 1251 bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst* RMWI) { 1252 if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) { 1253 tryExpandAtomicLoad(ResultingLoad); 1254 return true; 1255 } 1256 return false; 1257 } 1258 1259 Value *AtomicExpand::insertRMWCmpXchgLoop( 1260 IRBuilder<> &Builder, Type *ResultTy, Value *Addr, 1261 AtomicOrdering MemOpOrder, 1262 function_ref<Value *(IRBuilder<> &, Value *)> PerformOp, 1263 CreateCmpXchgInstFun CreateCmpXchg) { 1264 LLVMContext &Ctx = Builder.getContext(); 1265 BasicBlock *BB = Builder.GetInsertBlock(); 1266 Function *F = BB->getParent(); 1267 1268 // Given: atomicrmw some_op iN* %addr, iN %incr ordering 1269 // 1270 // The standard expansion we produce is: 1271 // [...] 
1272 // %init_loaded = load atomic iN* %addr 1273 // br label %loop 1274 // loop: 1275 // %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ] 1276 // %new = some_op iN %loaded, %incr 1277 // %pair = cmpxchg iN* %addr, iN %loaded, iN %new 1278 // %new_loaded = extractvalue { iN, i1 } %pair, 0 1279 // %success = extractvalue { iN, i1 } %pair, 1 1280 // br i1 %success, label %atomicrmw.end, label %loop 1281 // atomicrmw.end: 1282 // [...] 1283 BasicBlock *ExitBB = 1284 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end"); 1285 BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); 1286 1287 // The split call above "helpfully" added a branch at the end of BB (to the 1288 // wrong place), but we want a load. It's easiest to just remove 1289 // the branch entirely. 1290 std::prev(BB->end())->eraseFromParent(); 1291 Builder.SetInsertPoint(BB); 1292 LoadInst *InitLoaded = Builder.CreateLoad(ResultTy, Addr); 1293 // Atomics require at least natural alignment. 1294 InitLoaded->setAlignment(ResultTy->getPrimitiveSizeInBits() / 8); 1295 Builder.CreateBr(LoopBB); 1296 1297 // Start the main loop block now that we've taken care of the preliminaries. 1298 Builder.SetInsertPoint(LoopBB); 1299 PHINode *Loaded = Builder.CreatePHI(ResultTy, 2, "loaded"); 1300 Loaded->addIncoming(InitLoaded, BB); 1301 1302 Value *NewVal = PerformOp(Builder, Loaded); 1303 1304 Value *NewLoaded = nullptr; 1305 Value *Success = nullptr; 1306 1307 CreateCmpXchg(Builder, Addr, Loaded, NewVal, 1308 MemOpOrder == AtomicOrdering::Unordered 1309 ? AtomicOrdering::Monotonic 1310 : MemOpOrder, 1311 Success, NewLoaded); 1312 assert(Success && NewLoaded); 1313 1314 Loaded->addIncoming(NewLoaded, LoopBB); 1315 1316 Builder.CreateCondBr(Success, ExitBB, LoopBB); 1317 1318 Builder.SetInsertPoint(ExitBB, ExitBB->begin()); 1319 return NewLoaded; 1320 } 1321 1322 // Note: This function is exposed externally by AtomicExpandUtils.h 1323 bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, 1324 CreateCmpXchgInstFun CreateCmpXchg) { 1325 IRBuilder<> Builder(AI); 1326 Value *Loaded = AtomicExpand::insertRMWCmpXchgLoop( 1327 Builder, AI->getType(), AI->getPointerOperand(), AI->getOrdering(), 1328 [&](IRBuilder<> &Builder, Value *Loaded) { 1329 return performAtomicOp(AI->getOperation(), Builder, Loaded, 1330 AI->getValOperand()); 1331 }, 1332 CreateCmpXchg); 1333 1334 AI->replaceAllUsesWith(Loaded); 1335 AI->eraseFromParent(); 1336 return true; 1337 } 1338 1339 // In order to use one of the sized library calls such as 1340 // __atomic_fetch_add_4, the alignment must be sufficient, the size 1341 // must be one of the potentially-specialized sizes, and the value 1342 // type must actually exist in C on the target (otherwise, the 1343 // function wouldn't actually be defined.) 1344 static bool canUseSizedAtomicCall(unsigned Size, unsigned Align, 1345 const DataLayout &DL) { 1346 // TODO: "LargestSize" is an approximation for "largest type that 1347 // you can express in C". It seems to be the case that int128 is 1348 // supported on all 64-bit platforms, otherwise only up to 64-bit 1349 // integers are supported. If we get this wrong, then we'll try to 1350 // call a sized libcall that doesn't actually exist. There should 1351 // really be some more reliable way in LLVM of determining integer 1352 // sizes which are valid in the target's C ABI... 1353 unsigned LargestSize = DL.getLargestLegalIntTypeSizeInBits() >= 64 ? 
16 : 8; 1354 return Align >= Size && 1355 (Size == 1 || Size == 2 || Size == 4 || Size == 8 || Size == 16) && 1356 Size <= LargestSize; 1357 } 1358 1359 void AtomicExpand::expandAtomicLoadToLibcall(LoadInst *I) { 1360 static const RTLIB::Libcall Libcalls[6] = { 1361 RTLIB::ATOMIC_LOAD, RTLIB::ATOMIC_LOAD_1, RTLIB::ATOMIC_LOAD_2, 1362 RTLIB::ATOMIC_LOAD_4, RTLIB::ATOMIC_LOAD_8, RTLIB::ATOMIC_LOAD_16}; 1363 unsigned Size = getAtomicOpSize(I); 1364 unsigned Align = getAtomicOpAlign(I); 1365 1366 bool expanded = expandAtomicOpToLibcall( 1367 I, Size, Align, I->getPointerOperand(), nullptr, nullptr, 1368 I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls); 1369 (void)expanded; 1370 assert(expanded && "expandAtomicOpToLibcall shouldn't fail tor Load"); 1371 } 1372 1373 void AtomicExpand::expandAtomicStoreToLibcall(StoreInst *I) { 1374 static const RTLIB::Libcall Libcalls[6] = { 1375 RTLIB::ATOMIC_STORE, RTLIB::ATOMIC_STORE_1, RTLIB::ATOMIC_STORE_2, 1376 RTLIB::ATOMIC_STORE_4, RTLIB::ATOMIC_STORE_8, RTLIB::ATOMIC_STORE_16}; 1377 unsigned Size = getAtomicOpSize(I); 1378 unsigned Align = getAtomicOpAlign(I); 1379 1380 bool expanded = expandAtomicOpToLibcall( 1381 I, Size, Align, I->getPointerOperand(), I->getValueOperand(), nullptr, 1382 I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls); 1383 (void)expanded; 1384 assert(expanded && "expandAtomicOpToLibcall shouldn't fail tor Store"); 1385 } 1386 1387 void AtomicExpand::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) { 1388 static const RTLIB::Libcall Libcalls[6] = { 1389 RTLIB::ATOMIC_COMPARE_EXCHANGE, RTLIB::ATOMIC_COMPARE_EXCHANGE_1, 1390 RTLIB::ATOMIC_COMPARE_EXCHANGE_2, RTLIB::ATOMIC_COMPARE_EXCHANGE_4, 1391 RTLIB::ATOMIC_COMPARE_EXCHANGE_8, RTLIB::ATOMIC_COMPARE_EXCHANGE_16}; 1392 unsigned Size = getAtomicOpSize(I); 1393 unsigned Align = getAtomicOpAlign(I); 1394 1395 bool expanded = expandAtomicOpToLibcall( 1396 I, Size, Align, I->getPointerOperand(), I->getNewValOperand(), 1397 I->getCompareOperand(), I->getSuccessOrdering(), I->getFailureOrdering(), 1398 Libcalls); 1399 (void)expanded; 1400 assert(expanded && "expandAtomicOpToLibcall shouldn't fail tor CAS"); 1401 } 1402 1403 static ArrayRef<RTLIB::Libcall> GetRMWLibcall(AtomicRMWInst::BinOp Op) { 1404 static const RTLIB::Libcall LibcallsXchg[6] = { 1405 RTLIB::ATOMIC_EXCHANGE, RTLIB::ATOMIC_EXCHANGE_1, 1406 RTLIB::ATOMIC_EXCHANGE_2, RTLIB::ATOMIC_EXCHANGE_4, 1407 RTLIB::ATOMIC_EXCHANGE_8, RTLIB::ATOMIC_EXCHANGE_16}; 1408 static const RTLIB::Libcall LibcallsAdd[6] = { 1409 RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_ADD_1, 1410 RTLIB::ATOMIC_FETCH_ADD_2, RTLIB::ATOMIC_FETCH_ADD_4, 1411 RTLIB::ATOMIC_FETCH_ADD_8, RTLIB::ATOMIC_FETCH_ADD_16}; 1412 static const RTLIB::Libcall LibcallsSub[6] = { 1413 RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_SUB_1, 1414 RTLIB::ATOMIC_FETCH_SUB_2, RTLIB::ATOMIC_FETCH_SUB_4, 1415 RTLIB::ATOMIC_FETCH_SUB_8, RTLIB::ATOMIC_FETCH_SUB_16}; 1416 static const RTLIB::Libcall LibcallsAnd[6] = { 1417 RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_AND_1, 1418 RTLIB::ATOMIC_FETCH_AND_2, RTLIB::ATOMIC_FETCH_AND_4, 1419 RTLIB::ATOMIC_FETCH_AND_8, RTLIB::ATOMIC_FETCH_AND_16}; 1420 static const RTLIB::Libcall LibcallsOr[6] = { 1421 RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_OR_1, 1422 RTLIB::ATOMIC_FETCH_OR_2, RTLIB::ATOMIC_FETCH_OR_4, 1423 RTLIB::ATOMIC_FETCH_OR_8, RTLIB::ATOMIC_FETCH_OR_16}; 1424 static const RTLIB::Libcall LibcallsXor[6] = { 1425 RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_XOR_1, 1426 RTLIB::ATOMIC_FETCH_XOR_2, RTLIB::ATOMIC_FETCH_XOR_4, 1427 
static ArrayRef<RTLIB::Libcall> GetRMWLibcall(AtomicRMWInst::BinOp Op) {
  static const RTLIB::Libcall LibcallsXchg[6] = {
      RTLIB::ATOMIC_EXCHANGE,   RTLIB::ATOMIC_EXCHANGE_1,
      RTLIB::ATOMIC_EXCHANGE_2, RTLIB::ATOMIC_EXCHANGE_4,
      RTLIB::ATOMIC_EXCHANGE_8, RTLIB::ATOMIC_EXCHANGE_16};
  static const RTLIB::Libcall LibcallsAdd[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_ADD_1,
      RTLIB::ATOMIC_FETCH_ADD_2, RTLIB::ATOMIC_FETCH_ADD_4,
      RTLIB::ATOMIC_FETCH_ADD_8, RTLIB::ATOMIC_FETCH_ADD_16};
  static const RTLIB::Libcall LibcallsSub[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_SUB_1,
      RTLIB::ATOMIC_FETCH_SUB_2, RTLIB::ATOMIC_FETCH_SUB_4,
      RTLIB::ATOMIC_FETCH_SUB_8, RTLIB::ATOMIC_FETCH_SUB_16};
  static const RTLIB::Libcall LibcallsAnd[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_AND_1,
      RTLIB::ATOMIC_FETCH_AND_2, RTLIB::ATOMIC_FETCH_AND_4,
      RTLIB::ATOMIC_FETCH_AND_8, RTLIB::ATOMIC_FETCH_AND_16};
  static const RTLIB::Libcall LibcallsOr[6] = {
      RTLIB::UNKNOWN_LIBCALL,   RTLIB::ATOMIC_FETCH_OR_1,
      RTLIB::ATOMIC_FETCH_OR_2, RTLIB::ATOMIC_FETCH_OR_4,
      RTLIB::ATOMIC_FETCH_OR_8, RTLIB::ATOMIC_FETCH_OR_16};
  static const RTLIB::Libcall LibcallsXor[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_XOR_1,
      RTLIB::ATOMIC_FETCH_XOR_2, RTLIB::ATOMIC_FETCH_XOR_4,
      RTLIB::ATOMIC_FETCH_XOR_8, RTLIB::ATOMIC_FETCH_XOR_16};
  static const RTLIB::Libcall LibcallsNand[6] = {
      RTLIB::UNKNOWN_LIBCALL,     RTLIB::ATOMIC_FETCH_NAND_1,
      RTLIB::ATOMIC_FETCH_NAND_2, RTLIB::ATOMIC_FETCH_NAND_4,
      RTLIB::ATOMIC_FETCH_NAND_8, RTLIB::ATOMIC_FETCH_NAND_16};

  switch (Op) {
  case AtomicRMWInst::BAD_BINOP:
    llvm_unreachable("Should not have BAD_BINOP.");
  case AtomicRMWInst::Xchg:
    return makeArrayRef(LibcallsXchg);
  case AtomicRMWInst::Add:
    return makeArrayRef(LibcallsAdd);
  case AtomicRMWInst::Sub:
    return makeArrayRef(LibcallsSub);
  case AtomicRMWInst::And:
    return makeArrayRef(LibcallsAnd);
  case AtomicRMWInst::Or:
    return makeArrayRef(LibcallsOr);
  case AtomicRMWInst::Xor:
    return makeArrayRef(LibcallsXor);
  case AtomicRMWInst::Nand:
    return makeArrayRef(LibcallsNand);
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    // No atomic libcalls are available for max/min/umax/umin.
    return {};
  }
  llvm_unreachable("Unexpected AtomicRMW operation.");
}

void AtomicExpand::expandAtomicRMWToLibcall(AtomicRMWInst *I) {
  ArrayRef<RTLIB::Libcall> Libcalls = GetRMWLibcall(I->getOperation());

  unsigned Size = getAtomicOpSize(I);
  unsigned Align = getAtomicOpAlign(I);

  bool Success = false;
  if (!Libcalls.empty())
    Success = expandAtomicOpToLibcall(
        I, Size, Align, I->getPointerOperand(), I->getValOperand(), nullptr,
        I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);

  // The expansion failed: either there were no libcalls at all for
  // the operation (min/max), or there were only size-specialized
  // libcalls (add/sub/etc) and we needed a generic. So, expand to a
  // CAS libcall, via a CAS loop, instead.
  if (!Success) {
    expandAtomicRMWToCmpXchg(I, [this](IRBuilder<> &Builder, Value *Addr,
                                       Value *Loaded, Value *NewVal,
                                       AtomicOrdering MemOpOrder,
                                       Value *&Success, Value *&NewLoaded) {
      // Create the CAS instruction normally...
      AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
          Addr, Loaded, NewVal, MemOpOrder,
          AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
      Success = Builder.CreateExtractValue(Pair, 1, "success");
      NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");

      // ...and then expand the CAS into a libcall.
      expandAtomicCASToLibcall(Pair);
    });
  }
}
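// Illustrative examples of the RMW-to-libcall mapping above (not emitted
// verbatim; the exact IR depends on the target). Assuming a sufficiently
// aligned i32 on a target that requests libcall expansion:
//
//   atomicrmw add i32* %p, i32 %v monotonic
//     --> %old = call i32 @__atomic_fetch_add_4(i8* %p.i8, i32 %v, i32 0)
//
//   atomicrmw max i32* %p, i32 %v seq_cst
//     --> no fetch_max libcall exists, so a compare-exchange loop is emitted
//         via expandAtomicRMWToCmpXchg, and each cmpxchg in that loop is then
//         itself expanded to a call to __atomic_compare_exchange_4.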
// A helper routine for the above expandAtomic*ToLibcall functions.
//
// 'Libcalls' contains an array of enum values for the particular
// ATOMIC libcalls to be emitted. All of the other arguments besides
// 'I' are extracted from the Instruction subclass by the
// caller. Depending on the particular call, some will be null.
bool AtomicExpand::expandAtomicOpToLibcall(
    Instruction *I, unsigned Size, unsigned Align, Value *PointerOperand,
    Value *ValueOperand, Value *CASExpected, AtomicOrdering Ordering,
    AtomicOrdering Ordering2, ArrayRef<RTLIB::Libcall> Libcalls) {
  assert(Libcalls.size() == 6);

  LLVMContext &Ctx = I->getContext();
  Module *M = I->getModule();
  const DataLayout &DL = M->getDataLayout();
  IRBuilder<> Builder(I);
  IRBuilder<> AllocaBuilder(&I->getFunction()->getEntryBlock().front());

  bool UseSizedLibcall = canUseSizedAtomicCall(Size, Align, DL);
  Type *SizedIntTy = Type::getIntNTy(Ctx, Size * 8);

  unsigned AllocaAlignment = DL.getPrefTypeAlignment(SizedIntTy);

  // TODO: the "order" argument type is "int", not int32. So
  // getInt32Ty may be wrong if the arch uses e.g. 16-bit ints.
  ConstantInt *SizeVal64 = ConstantInt::get(Type::getInt64Ty(Ctx), Size);
  assert(Ordering != AtomicOrdering::NotAtomic && "expect atomic MO");
  Constant *OrderingVal =
      ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering));
  Constant *Ordering2Val = nullptr;
  if (CASExpected) {
    assert(Ordering2 != AtomicOrdering::NotAtomic && "expect atomic MO");
    Ordering2Val =
        ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering2));
  }
  bool HasResult = I->getType() != Type::getVoidTy(Ctx);

  RTLIB::Libcall RTLibType;
  if (UseSizedLibcall) {
    switch (Size) {
    case 1:  RTLibType = Libcalls[1]; break;
    case 2:  RTLibType = Libcalls[2]; break;
    case 4:  RTLibType = Libcalls[3]; break;
    case 8:  RTLibType = Libcalls[4]; break;
    case 16: RTLibType = Libcalls[5]; break;
    default:
      // canUseSizedAtomicCall only returns true for these sizes.
      llvm_unreachable("Unexpected integer size for sized atomic libcall");
    }
  } else if (Libcalls[0] != RTLIB::UNKNOWN_LIBCALL) {
    RTLibType = Libcalls[0];
  } else {
    // Can't use sized function, and there's no generic for this
    // operation, so give up.
    return false;
  }

  // Build up the function call. There are two kinds. First, the sized
  // variants. These calls are going to be one of the following (with
  // N=1,2,4,8,16):
  //  iN    __atomic_load_N(iN *ptr, int ordering)
  //  void  __atomic_store_N(iN *ptr, iN val, int ordering)
  //  iN    __atomic_{exchange|fetch_*}_N(iN *ptr, iN val, int ordering)
  //  bool  __atomic_compare_exchange_N(iN *ptr, iN *expected, iN desired,
  //                                    int success_order, int failure_order)
  //
  // Note that these functions can be used for non-integer atomic
  // operations; the values just need to be bitcast to integers on the
  // way in and out.
  //
  // And, then, the generic variants. They look like the following:
  //  void  __atomic_load(size_t size, void *ptr, void *ret, int ordering)
  //  void  __atomic_store(size_t size, void *ptr, void *val, int ordering)
  //  void  __atomic_exchange(size_t size, void *ptr, void *val, void *ret,
  //                          int ordering)
  //  bool  __atomic_compare_exchange(size_t size, void *ptr, void *expected,
  //                                  void *desired, int success_order,
  //                                  int failure_order)
  //
  // The different signatures are built up depending on the
  // 'UseSizedLibcall', 'CASExpected', 'ValueOperand', and 'HasResult'
  // variables.
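  // For illustration (not emitted verbatim): an 'atomicrmw xchg' for which no
  // sized call is usable (e.g. under-aligned or an odd size) ends up roughly
  // as:
  //
  //   %val.addr = alloca iN                      ; in the entry block
  //   %ret.addr = alloca iN                      ; in the entry block
  //   store iN %val, iN* %val.addr
  //   call void @__atomic_exchange(<size_t> N, i8* %ptr.i8, i8* %val.i8,
  //                                i8* %ret.i8, i32 <order>)
  //   %res = load iN, iN* %ret.addr
  //
  // with llvm.lifetime.start/end markers around the temporaries.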

  AllocaInst *AllocaCASExpected = nullptr;
  Value *AllocaCASExpected_i8 = nullptr;
  AllocaInst *AllocaValue = nullptr;
  Value *AllocaValue_i8 = nullptr;
  AllocaInst *AllocaResult = nullptr;
  Value *AllocaResult_i8 = nullptr;

  Type *ResultTy;
  SmallVector<Value *, 6> Args;
  AttributeList Attr;

  // 'size' argument.
  if (!UseSizedLibcall) {
    // Note, getIntPtrType is assumed equivalent to size_t.
    Args.push_back(ConstantInt::get(DL.getIntPtrType(Ctx), Size));
  }

  // 'ptr' argument.
  Value *PtrVal =
      Builder.CreateBitCast(PointerOperand, Type::getInt8PtrTy(Ctx));
  Args.push_back(PtrVal);

  // 'expected' argument, if present.
  if (CASExpected) {
    AllocaCASExpected = AllocaBuilder.CreateAlloca(CASExpected->getType());
    AllocaCASExpected->setAlignment(AllocaAlignment);
    AllocaCASExpected_i8 =
        Builder.CreateBitCast(AllocaCASExpected, Type::getInt8PtrTy(Ctx));
    Builder.CreateLifetimeStart(AllocaCASExpected_i8, SizeVal64);
    Builder.CreateAlignedStore(CASExpected, AllocaCASExpected, AllocaAlignment);
    Args.push_back(AllocaCASExpected_i8);
  }

  // 'val' argument ('desired' for cas), if present.
  if (ValueOperand) {
    if (UseSizedLibcall) {
      Value *IntValue =
          Builder.CreateBitOrPointerCast(ValueOperand, SizedIntTy);
      Args.push_back(IntValue);
    } else {
      AllocaValue = AllocaBuilder.CreateAlloca(ValueOperand->getType());
      AllocaValue->setAlignment(AllocaAlignment);
      AllocaValue_i8 =
          Builder.CreateBitCast(AllocaValue, Type::getInt8PtrTy(Ctx));
      Builder.CreateLifetimeStart(AllocaValue_i8, SizeVal64);
      Builder.CreateAlignedStore(ValueOperand, AllocaValue, AllocaAlignment);
      Args.push_back(AllocaValue_i8);
    }
  }

  // 'ret' argument.
  if (!CASExpected && HasResult && !UseSizedLibcall) {
    AllocaResult = AllocaBuilder.CreateAlloca(I->getType());
    AllocaResult->setAlignment(AllocaAlignment);
    AllocaResult_i8 =
        Builder.CreateBitCast(AllocaResult, Type::getInt8PtrTy(Ctx));
    Builder.CreateLifetimeStart(AllocaResult_i8, SizeVal64);
    Args.push_back(AllocaResult_i8);
  }

  // 'ordering' ('success_order' for cas) argument.
  Args.push_back(OrderingVal);

  // 'failure_order' argument, if present.
  if (Ordering2Val)
    Args.push_back(Ordering2Val);

  // Now, the return type.
  if (CASExpected) {
    ResultTy = Type::getInt1Ty(Ctx);
    Attr = Attr.addAttribute(Ctx, AttributeList::ReturnIndex, Attribute::ZExt);
  } else if (HasResult && UseSizedLibcall)
    ResultTy = SizedIntTy;
  else
    ResultTy = Type::getVoidTy(Ctx);

  // Done with setting up arguments and return types, create the call:
  SmallVector<Type *, 6> ArgTys;
  for (Value *Arg : Args)
    ArgTys.push_back(Arg->getType());
  FunctionType *FnType = FunctionType::get(ResultTy, ArgTys, false);
  Constant *LibcallFn =
      M->getOrInsertFunction(TLI->getLibcallName(RTLibType), FnType, Attr);
  CallInst *Call = Builder.CreateCall(LibcallFn, Args);
  Call->setAttributes(Attr);
  Value *Result = Call;

  // And then, extract the results...
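  // For example, for a cmpxchg lowered to __atomic_compare_exchange_N, the
  // original instruction's { iN, i1 } result is rebuilt roughly as
  // (illustrative only):
  //
  //   %expected.out = load iN, iN* %expected.addr
  //   %r0 = insertvalue { iN, i1 } undef, iN %expected.out, 0
  //   %r1 = insertvalue { iN, i1 } %r0, i1 %call, 1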
  if (ValueOperand && !UseSizedLibcall)
    Builder.CreateLifetimeEnd(AllocaValue_i8, SizeVal64);

  if (CASExpected) {
    // The final result from the CAS is {load of 'expected' alloca, bool result
    // from call}
    Type *FinalResultTy = I->getType();
    Value *V = UndefValue::get(FinalResultTy);
    Value *ExpectedOut =
        Builder.CreateAlignedLoad(AllocaCASExpected, AllocaAlignment);
    Builder.CreateLifetimeEnd(AllocaCASExpected_i8, SizeVal64);
    V = Builder.CreateInsertValue(V, ExpectedOut, 0);
    V = Builder.CreateInsertValue(V, Result, 1);
    I->replaceAllUsesWith(V);
  } else if (HasResult) {
    Value *V;
    if (UseSizedLibcall)
      V = Builder.CreateBitOrPointerCast(Result, I->getType());
    else {
      V = Builder.CreateAlignedLoad(AllocaResult, AllocaAlignment);
      Builder.CreateLifetimeEnd(AllocaResult_i8, SizeVal64);
    }
    I->replaceAllUsesWith(V);
  }
  I->eraseFromParent();
  return true;
}