1 //===- AtomicExpandPass.cpp - Expand atomic instructions ------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file contains a pass (at IR level) to replace atomic instructions with 10 // __atomic_* library calls, or target specific instruction which implement the 11 // same semantics in a way which better fits the target backend. This can 12 // include the use of (intrinsic-based) load-linked/store-conditional loops, 13 // AtomicCmpXchg, or type coercions. 14 // 15 //===----------------------------------------------------------------------===// 16 17 #include "llvm/ADT/ArrayRef.h" 18 #include "llvm/ADT/STLExtras.h" 19 #include "llvm/ADT/SmallVector.h" 20 #include "llvm/CodeGen/AtomicExpandUtils.h" 21 #include "llvm/CodeGen/RuntimeLibcalls.h" 22 #include "llvm/CodeGen/TargetLowering.h" 23 #include "llvm/CodeGen/TargetPassConfig.h" 24 #include "llvm/CodeGen/TargetSubtargetInfo.h" 25 #include "llvm/CodeGen/ValueTypes.h" 26 #include "llvm/IR/Attributes.h" 27 #include "llvm/IR/BasicBlock.h" 28 #include "llvm/IR/Constant.h" 29 #include "llvm/IR/Constants.h" 30 #include "llvm/IR/DataLayout.h" 31 #include "llvm/IR/DerivedTypes.h" 32 #include "llvm/IR/Function.h" 33 #include "llvm/IR/IRBuilder.h" 34 #include "llvm/IR/InstIterator.h" 35 #include "llvm/IR/Instruction.h" 36 #include "llvm/IR/Instructions.h" 37 #include "llvm/IR/Module.h" 38 #include "llvm/IR/Type.h" 39 #include "llvm/IR/User.h" 40 #include "llvm/IR/Value.h" 41 #include "llvm/InitializePasses.h" 42 #include "llvm/Pass.h" 43 #include "llvm/Support/AtomicOrdering.h" 44 #include "llvm/Support/Casting.h" 45 #include "llvm/Support/Debug.h" 46 #include "llvm/Support/ErrorHandling.h" 47 #include "llvm/Support/raw_ostream.h" 48 #include "llvm/Target/TargetMachine.h" 49 #include <cassert> 50 #include <cstdint> 51 #include <iterator> 52 53 using namespace llvm; 54 55 #define DEBUG_TYPE "atomic-expand" 56 57 namespace { 58 59 class AtomicExpand: public FunctionPass { 60 const TargetLowering *TLI = nullptr; 61 62 public: 63 static char ID; // Pass identification, replacement for typeid 64 65 AtomicExpand() : FunctionPass(ID) { 66 initializeAtomicExpandPass(*PassRegistry::getPassRegistry()); 67 } 68 69 bool runOnFunction(Function &F) override; 70 71 private: 72 bool bracketInstWithFences(Instruction *I, AtomicOrdering Order); 73 IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL); 74 LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI); 75 bool tryExpandAtomicLoad(LoadInst *LI); 76 bool expandAtomicLoadToLL(LoadInst *LI); 77 bool expandAtomicLoadToCmpXchg(LoadInst *LI); 78 StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI); 79 bool expandAtomicStore(StoreInst *SI); 80 bool tryExpandAtomicRMW(AtomicRMWInst *AI); 81 Value * 82 insertRMWLLSCLoop(IRBuilder<> &Builder, Type *ResultTy, Value *Addr, 83 AtomicOrdering MemOpOrder, 84 function_ref<Value *(IRBuilder<> &, Value *)> PerformOp); 85 void expandAtomicOpToLLSC( 86 Instruction *I, Type *ResultTy, Value *Addr, AtomicOrdering MemOpOrder, 87 function_ref<Value *(IRBuilder<> &, Value *)> PerformOp); 88 void expandPartwordAtomicRMW( 89 AtomicRMWInst *I, 90 TargetLoweringBase::AtomicExpansionKind ExpansionKind); 91 AtomicRMWInst *widenPartwordAtomicRMW(AtomicRMWInst *AI); 92 void 
expandPartwordCmpXchg(AtomicCmpXchgInst *I); 93 void expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI); 94 void expandAtomicCmpXchgToMaskedIntrinsic(AtomicCmpXchgInst *CI); 95 96 AtomicCmpXchgInst *convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI); 97 static Value *insertRMWCmpXchgLoop( 98 IRBuilder<> &Builder, Type *ResultType, Value *Addr, 99 AtomicOrdering MemOpOrder, 100 function_ref<Value *(IRBuilder<> &, Value *)> PerformOp, 101 CreateCmpXchgInstFun CreateCmpXchg); 102 bool tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI); 103 104 bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI); 105 bool isIdempotentRMW(AtomicRMWInst *RMWI); 106 bool simplifyIdempotentRMW(AtomicRMWInst *RMWI); 107 108 bool expandAtomicOpToLibcall(Instruction *I, unsigned Size, Align Alignment, 109 Value *PointerOperand, Value *ValueOperand, 110 Value *CASExpected, AtomicOrdering Ordering, 111 AtomicOrdering Ordering2, 112 ArrayRef<RTLIB::Libcall> Libcalls); 113 void expandAtomicLoadToLibcall(LoadInst *LI); 114 void expandAtomicStoreToLibcall(StoreInst *LI); 115 void expandAtomicRMWToLibcall(AtomicRMWInst *I); 116 void expandAtomicCASToLibcall(AtomicCmpXchgInst *I); 117 118 friend bool 119 llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, 120 CreateCmpXchgInstFun CreateCmpXchg); 121 }; 122 123 } // end anonymous namespace 124 125 char AtomicExpand::ID = 0; 126 127 char &llvm::AtomicExpandID = AtomicExpand::ID; 128 129 INITIALIZE_PASS(AtomicExpand, DEBUG_TYPE, "Expand Atomic instructions", 130 false, false) 131 132 FunctionPass *llvm::createAtomicExpandPass() { return new AtomicExpand(); } 133 134 // Helper functions to retrieve the size of atomic instructions. 135 static unsigned getAtomicOpSize(LoadInst *LI) { 136 const DataLayout &DL = LI->getModule()->getDataLayout(); 137 return DL.getTypeStoreSize(LI->getType()); 138 } 139 140 static unsigned getAtomicOpSize(StoreInst *SI) { 141 const DataLayout &DL = SI->getModule()->getDataLayout(); 142 return DL.getTypeStoreSize(SI->getValueOperand()->getType()); 143 } 144 145 static unsigned getAtomicOpSize(AtomicRMWInst *RMWI) { 146 const DataLayout &DL = RMWI->getModule()->getDataLayout(); 147 return DL.getTypeStoreSize(RMWI->getValOperand()->getType()); 148 } 149 150 static unsigned getAtomicOpSize(AtomicCmpXchgInst *CASI) { 151 const DataLayout &DL = CASI->getModule()->getDataLayout(); 152 return DL.getTypeStoreSize(CASI->getCompareOperand()->getType()); 153 } 154 155 // Determine if a particular atomic operation has a supported size, 156 // and is of appropriate alignment, to be passed through for target 157 // lowering. (Versus turning into a __atomic libcall) 158 template <typename Inst> 159 static bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) { 160 unsigned Size = getAtomicOpSize(I); 161 Align Alignment = I->getAlign(); 162 return Alignment >= Size && 163 Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8; 164 } 165 166 bool AtomicExpand::runOnFunction(Function &F) { 167 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); 168 if (!TPC) 169 return false; 170 171 auto &TM = TPC->getTM<TargetMachine>(); 172 if (!TM.getSubtargetImpl(F)->enableAtomicExpand()) 173 return false; 174 TLI = TM.getSubtargetImpl(F)->getTargetLowering(); 175 176 SmallVector<Instruction *, 1> AtomicInsts; 177 178 // Changing control-flow while iterating through it is a bad idea, so gather a 179 // list of all atomic instructions before we start. 
180 for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) { 181 Instruction *I = &*II; 182 if (I->isAtomic() && !isa<FenceInst>(I)) 183 AtomicInsts.push_back(I); 184 } 185 186 bool MadeChange = false; 187 for (auto I : AtomicInsts) { 188 auto LI = dyn_cast<LoadInst>(I); 189 auto SI = dyn_cast<StoreInst>(I); 190 auto RMWI = dyn_cast<AtomicRMWInst>(I); 191 auto CASI = dyn_cast<AtomicCmpXchgInst>(I); 192 assert((LI || SI || RMWI || CASI) && "Unknown atomic instruction"); 193 194 // If the Size/Alignment is not supported, replace with a libcall. 195 if (LI) { 196 if (!atomicSizeSupported(TLI, LI)) { 197 expandAtomicLoadToLibcall(LI); 198 MadeChange = true; 199 continue; 200 } 201 } else if (SI) { 202 if (!atomicSizeSupported(TLI, SI)) { 203 expandAtomicStoreToLibcall(SI); 204 MadeChange = true; 205 continue; 206 } 207 } else if (RMWI) { 208 if (!atomicSizeSupported(TLI, RMWI)) { 209 expandAtomicRMWToLibcall(RMWI); 210 MadeChange = true; 211 continue; 212 } 213 } else if (CASI) { 214 if (!atomicSizeSupported(TLI, CASI)) { 215 expandAtomicCASToLibcall(CASI); 216 MadeChange = true; 217 continue; 218 } 219 } 220 221 if (TLI->shouldInsertFencesForAtomic(I)) { 222 auto FenceOrdering = AtomicOrdering::Monotonic; 223 if (LI && isAcquireOrStronger(LI->getOrdering())) { 224 FenceOrdering = LI->getOrdering(); 225 LI->setOrdering(AtomicOrdering::Monotonic); 226 } else if (SI && isReleaseOrStronger(SI->getOrdering())) { 227 FenceOrdering = SI->getOrdering(); 228 SI->setOrdering(AtomicOrdering::Monotonic); 229 } else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) || 230 isAcquireOrStronger(RMWI->getOrdering()))) { 231 FenceOrdering = RMWI->getOrdering(); 232 RMWI->setOrdering(AtomicOrdering::Monotonic); 233 } else if (CASI && 234 TLI->shouldExpandAtomicCmpXchgInIR(CASI) == 235 TargetLoweringBase::AtomicExpansionKind::None && 236 (isReleaseOrStronger(CASI->getSuccessOrdering()) || 237 isAcquireOrStronger(CASI->getSuccessOrdering()))) { 238 // If a compare and swap is lowered to LL/SC, we can do smarter fence 239 // insertion, with a stronger one on the success path than on the 240 // failure path. As a result, fence insertion is directly done by 241 // expandAtomicCmpXchg in that case. 242 FenceOrdering = CASI->getSuccessOrdering(); 243 CASI->setSuccessOrdering(AtomicOrdering::Monotonic); 244 CASI->setFailureOrdering(AtomicOrdering::Monotonic); 245 } 246 247 if (FenceOrdering != AtomicOrdering::Monotonic) { 248 MadeChange |= bracketInstWithFences(I, FenceOrdering); 249 } 250 } 251 252 if (LI) { 253 if (LI->getType()->isFloatingPointTy()) { 254 // TODO: add a TLI hook to control this so that each target can 255 // convert to lowering the original type one at a time. 256 LI = convertAtomicLoadToIntegerType(LI); 257 assert(LI->getType()->isIntegerTy() && "invariant broken"); 258 MadeChange = true; 259 } 260 261 MadeChange |= tryExpandAtomicLoad(LI); 262 } else if (SI) { 263 if (SI->getValueOperand()->getType()->isFloatingPointTy()) { 264 // TODO: add a TLI hook to control this so that each target can 265 // convert to lowering the original type one at a time. 
266 SI = convertAtomicStoreToIntegerType(SI); 267 assert(SI->getValueOperand()->getType()->isIntegerTy() && 268 "invariant broken"); 269 MadeChange = true; 270 } 271 272 if (TLI->shouldExpandAtomicStoreInIR(SI)) 273 MadeChange |= expandAtomicStore(SI); 274 } else if (RMWI) { 275 // There are two different ways of expanding RMW instructions: 276 // - into a load if it is idempotent 277 // - into a Cmpxchg/LL-SC loop otherwise 278 // we try them in that order. 279 280 if (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) { 281 MadeChange = true; 282 } else { 283 unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8; 284 unsigned ValueSize = getAtomicOpSize(RMWI); 285 AtomicRMWInst::BinOp Op = RMWI->getOperation(); 286 if (ValueSize < MinCASSize && 287 (Op == AtomicRMWInst::Or || Op == AtomicRMWInst::Xor || 288 Op == AtomicRMWInst::And)) { 289 RMWI = widenPartwordAtomicRMW(RMWI); 290 MadeChange = true; 291 } 292 293 MadeChange |= tryExpandAtomicRMW(RMWI); 294 } 295 } else if (CASI) { 296 // TODO: when we're ready to make the change at the IR level, we can 297 // extend convertCmpXchgToInteger for floating point too. 298 assert(!CASI->getCompareOperand()->getType()->isFloatingPointTy() && 299 "unimplemented - floating point not legal at IR level"); 300 if (CASI->getCompareOperand()->getType()->isPointerTy() ) { 301 // TODO: add a TLI hook to control this so that each target can 302 // convert to lowering the original type one at a time. 303 CASI = convertCmpXchgToIntegerType(CASI); 304 assert(CASI->getCompareOperand()->getType()->isIntegerTy() && 305 "invariant broken"); 306 MadeChange = true; 307 } 308 309 MadeChange |= tryExpandAtomicCmpXchg(CASI); 310 } 311 } 312 return MadeChange; 313 } 314 315 bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order) { 316 IRBuilder<> Builder(I); 317 318 auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order); 319 320 auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order); 321 // We have a guard here because not every atomic operation generates a 322 // trailing fence. 323 if (TrailingFence) 324 TrailingFence->moveAfter(I); 325 326 return (LeadingFence || TrailingFence); 327 } 328 329 /// Get the iX type with the same bitwidth as T. 330 IntegerType *AtomicExpand::getCorrespondingIntegerType(Type *T, 331 const DataLayout &DL) { 332 EVT VT = TLI->getMemValueType(DL, T); 333 unsigned BitWidth = VT.getStoreSizeInBits(); 334 assert(BitWidth == VT.getSizeInBits() && "must be a power of two"); 335 return IntegerType::get(T->getContext(), BitWidth); 336 } 337 338 /// Convert an atomic load of a non-integral type to an integer load of the 339 /// equivalent bitwidth. See the function comment on 340 /// convertAtomicStoreToIntegerType for background. 
341 LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) { 342 auto *M = LI->getModule(); 343 Type *NewTy = getCorrespondingIntegerType(LI->getType(), 344 M->getDataLayout()); 345 346 IRBuilder<> Builder(LI); 347 348 Value *Addr = LI->getPointerOperand(); 349 Type *PT = PointerType::get(NewTy, 350 Addr->getType()->getPointerAddressSpace()); 351 Value *NewAddr = Builder.CreateBitCast(Addr, PT); 352 353 auto *NewLI = Builder.CreateLoad(NewTy, NewAddr); 354 NewLI->setAlignment(LI->getAlign()); 355 NewLI->setVolatile(LI->isVolatile()); 356 NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID()); 357 LLVM_DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n"); 358 359 Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType()); 360 LI->replaceAllUsesWith(NewVal); 361 LI->eraseFromParent(); 362 return NewLI; 363 } 364 365 bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) { 366 switch (TLI->shouldExpandAtomicLoadInIR(LI)) { 367 case TargetLoweringBase::AtomicExpansionKind::None: 368 return false; 369 case TargetLoweringBase::AtomicExpansionKind::LLSC: 370 expandAtomicOpToLLSC( 371 LI, LI->getType(), LI->getPointerOperand(), LI->getOrdering(), 372 [](IRBuilder<> &Builder, Value *Loaded) { return Loaded; }); 373 return true; 374 case TargetLoweringBase::AtomicExpansionKind::LLOnly: 375 return expandAtomicLoadToLL(LI); 376 case TargetLoweringBase::AtomicExpansionKind::CmpXChg: 377 return expandAtomicLoadToCmpXchg(LI); 378 default: 379 llvm_unreachable("Unhandled case in tryExpandAtomicLoad"); 380 } 381 } 382 383 bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) { 384 IRBuilder<> Builder(LI); 385 386 // On some architectures, load-linked instructions are atomic for larger 387 // sizes than normal loads. For example, the only 64-bit load guaranteed 388 // to be single-copy atomic by ARM is an ldrexd (A3.5.3). 389 Value *Val = 390 TLI->emitLoadLinked(Builder, LI->getPointerOperand(), LI->getOrdering()); 391 TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder); 392 393 LI->replaceAllUsesWith(Val); 394 LI->eraseFromParent(); 395 396 return true; 397 } 398 399 bool AtomicExpand::expandAtomicLoadToCmpXchg(LoadInst *LI) { 400 IRBuilder<> Builder(LI); 401 AtomicOrdering Order = LI->getOrdering(); 402 if (Order == AtomicOrdering::Unordered) 403 Order = AtomicOrdering::Monotonic; 404 405 Value *Addr = LI->getPointerOperand(); 406 Type *Ty = cast<PointerType>(Addr->getType())->getElementType(); 407 Constant *DummyVal = Constant::getNullValue(Ty); 408 409 Value *Pair = Builder.CreateAtomicCmpXchg( 410 Addr, DummyVal, DummyVal, Order, 411 AtomicCmpXchgInst::getStrongestFailureOrdering(Order)); 412 Value *Loaded = Builder.CreateExtractValue(Pair, 0, "loaded"); 413 414 LI->replaceAllUsesWith(Loaded); 415 LI->eraseFromParent(); 416 417 return true; 418 } 419 420 /// Convert an atomic store of a non-integral type to an integer store of the 421 /// equivalent bitwidth. We used to not support floating point or vector 422 /// atomics in the IR at all. The backends learned to deal with the bitcast 423 /// idiom because that was the only way of expressing the notion of a atomic 424 /// float or vector store. The long term plan is to teach each backend to 425 /// instruction select from the original atomic store, but as a migration 426 /// mechanism, we convert back to the old format which the backends understand. 427 /// Each backend will need individual work to recognize the new format. 
428 StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) { 429 IRBuilder<> Builder(SI); 430 auto *M = SI->getModule(); 431 Type *NewTy = getCorrespondingIntegerType(SI->getValueOperand()->getType(), 432 M->getDataLayout()); 433 Value *NewVal = Builder.CreateBitCast(SI->getValueOperand(), NewTy); 434 435 Value *Addr = SI->getPointerOperand(); 436 Type *PT = PointerType::get(NewTy, 437 Addr->getType()->getPointerAddressSpace()); 438 Value *NewAddr = Builder.CreateBitCast(Addr, PT); 439 440 StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr); 441 NewSI->setAlignment(SI->getAlign()); 442 NewSI->setVolatile(SI->isVolatile()); 443 NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID()); 444 LLVM_DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n"); 445 SI->eraseFromParent(); 446 return NewSI; 447 } 448 449 bool AtomicExpand::expandAtomicStore(StoreInst *SI) { 450 // This function is only called on atomic stores that are too large to be 451 // atomic if implemented as a native store. So we replace them by an 452 // atomic swap, that can be implemented for example as a ldrex/strex on ARM 453 // or lock cmpxchg8/16b on X86, as these are atomic for larger sizes. 454 // It is the responsibility of the target to only signal expansion via 455 // shouldExpandAtomicRMW in cases where this is required and possible. 456 IRBuilder<> Builder(SI); 457 AtomicRMWInst *AI = 458 Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(), 459 SI->getValueOperand(), SI->getOrdering()); 460 SI->eraseFromParent(); 461 462 // Now we have an appropriate swap instruction, lower it as usual. 463 return tryExpandAtomicRMW(AI); 464 } 465 466 static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr, 467 Value *Loaded, Value *NewVal, 468 AtomicOrdering MemOpOrder, 469 Value *&Success, Value *&NewLoaded) { 470 Type *OrigTy = NewVal->getType(); 471 472 // This code can go away when cmpxchg supports FP types. 473 bool NeedBitcast = OrigTy->isFloatingPointTy(); 474 if (NeedBitcast) { 475 IntegerType *IntTy = Builder.getIntNTy(OrigTy->getPrimitiveSizeInBits()); 476 unsigned AS = Addr->getType()->getPointerAddressSpace(); 477 Addr = Builder.CreateBitCast(Addr, IntTy->getPointerTo(AS)); 478 NewVal = Builder.CreateBitCast(NewVal, IntTy); 479 Loaded = Builder.CreateBitCast(Loaded, IntTy); 480 } 481 482 Value* Pair = Builder.CreateAtomicCmpXchg( 483 Addr, Loaded, NewVal, MemOpOrder, 484 AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder)); 485 Success = Builder.CreateExtractValue(Pair, 1, "success"); 486 NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded"); 487 488 if (NeedBitcast) 489 NewLoaded = Builder.CreateBitCast(NewLoaded, OrigTy); 490 } 491 492 /// Emit IR to implement the given atomicrmw operation on values in registers, 493 /// returning the new value. 
494 static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder, 495 Value *Loaded, Value *Inc) { 496 Value *NewVal; 497 switch (Op) { 498 case AtomicRMWInst::Xchg: 499 return Inc; 500 case AtomicRMWInst::Add: 501 return Builder.CreateAdd(Loaded, Inc, "new"); 502 case AtomicRMWInst::Sub: 503 return Builder.CreateSub(Loaded, Inc, "new"); 504 case AtomicRMWInst::And: 505 return Builder.CreateAnd(Loaded, Inc, "new"); 506 case AtomicRMWInst::Nand: 507 return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new"); 508 case AtomicRMWInst::Or: 509 return Builder.CreateOr(Loaded, Inc, "new"); 510 case AtomicRMWInst::Xor: 511 return Builder.CreateXor(Loaded, Inc, "new"); 512 case AtomicRMWInst::Max: 513 NewVal = Builder.CreateICmpSGT(Loaded, Inc); 514 return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); 515 case AtomicRMWInst::Min: 516 NewVal = Builder.CreateICmpSLE(Loaded, Inc); 517 return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); 518 case AtomicRMWInst::UMax: 519 NewVal = Builder.CreateICmpUGT(Loaded, Inc); 520 return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); 521 case AtomicRMWInst::UMin: 522 NewVal = Builder.CreateICmpULE(Loaded, Inc); 523 return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); 524 case AtomicRMWInst::FAdd: 525 return Builder.CreateFAdd(Loaded, Inc, "new"); 526 case AtomicRMWInst::FSub: 527 return Builder.CreateFSub(Loaded, Inc, "new"); 528 default: 529 llvm_unreachable("Unknown atomic op"); 530 } 531 } 532 533 bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) { 534 switch (TLI->shouldExpandAtomicRMWInIR(AI)) { 535 case TargetLoweringBase::AtomicExpansionKind::None: 536 return false; 537 case TargetLoweringBase::AtomicExpansionKind::LLSC: { 538 unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8; 539 unsigned ValueSize = getAtomicOpSize(AI); 540 if (ValueSize < MinCASSize) { 541 expandPartwordAtomicRMW(AI, 542 TargetLoweringBase::AtomicExpansionKind::LLSC); 543 } else { 544 auto PerformOp = [&](IRBuilder<> &Builder, Value *Loaded) { 545 return performAtomicOp(AI->getOperation(), Builder, Loaded, 546 AI->getValOperand()); 547 }; 548 expandAtomicOpToLLSC(AI, AI->getType(), AI->getPointerOperand(), 549 AI->getOrdering(), PerformOp); 550 } 551 return true; 552 } 553 case TargetLoweringBase::AtomicExpansionKind::CmpXChg: { 554 unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8; 555 unsigned ValueSize = getAtomicOpSize(AI); 556 if (ValueSize < MinCASSize) { 557 // TODO: Handle atomicrmw fadd/fsub 558 if (AI->getType()->isFloatingPointTy()) 559 return false; 560 561 expandPartwordAtomicRMW(AI, 562 TargetLoweringBase::AtomicExpansionKind::CmpXChg); 563 } else { 564 expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun); 565 } 566 return true; 567 } 568 case TargetLoweringBase::AtomicExpansionKind::MaskedIntrinsic: { 569 expandAtomicRMWToMaskedIntrinsic(AI); 570 return true; 571 } 572 default: 573 llvm_unreachable("Unhandled case in tryExpandAtomicRMW"); 574 } 575 } 576 577 namespace { 578 579 struct PartwordMaskValues { 580 // These three fields are guaranteed to be set by createMaskInstrs. 581 Type *WordType = nullptr; 582 Type *ValueType = nullptr; 583 Value *AlignedAddr = nullptr; 584 // The remaining fields can be null. 
585 Value *ShiftAmt = nullptr; 586 Value *Mask = nullptr; 587 Value *Inv_Mask = nullptr; 588 }; 589 590 LLVM_ATTRIBUTE_UNUSED 591 raw_ostream &operator<<(raw_ostream &O, const PartwordMaskValues &PMV) { 592 auto PrintObj = [&O](auto *V) { 593 if (V) 594 O << *V; 595 else 596 O << "nullptr"; 597 O << '\n'; 598 }; 599 O << "PartwordMaskValues {\n"; 600 O << " WordType: "; 601 PrintObj(PMV.WordType); 602 O << " ValueType: "; 603 PrintObj(PMV.ValueType); 604 O << " AlignedAddr: "; 605 PrintObj(PMV.AlignedAddr); 606 O << " ShiftAmt: "; 607 PrintObj(PMV.ShiftAmt); 608 O << " Mask: "; 609 PrintObj(PMV.Mask); 610 O << " Inv_Mask: "; 611 PrintObj(PMV.Inv_Mask); 612 O << "}\n"; 613 return O; 614 } 615 616 } // end anonymous namespace 617 618 /// This is a helper function which builds instructions to provide 619 /// values necessary for partword atomic operations. It takes an 620 /// incoming address, Addr, and ValueType, and constructs the address, 621 /// shift-amounts and masks needed to work with a larger value of size 622 /// WordSize. 623 /// 624 /// AlignedAddr: Addr rounded down to a multiple of WordSize 625 /// 626 /// ShiftAmt: Number of bits to right-shift a WordSize value loaded 627 /// from AlignAddr for it to have the same value as if 628 /// ValueType was loaded from Addr. 629 /// 630 /// Mask: Value to mask with the value loaded from AlignAddr to 631 /// include only the part that would've been loaded from Addr. 632 /// 633 /// Inv_Mask: The inverse of Mask. 634 static PartwordMaskValues createMaskInstrs(IRBuilder<> &Builder, Instruction *I, 635 Type *ValueType, Value *Addr, 636 unsigned MinWordSize) { 637 PartwordMaskValues PMV; 638 639 Module *M = I->getModule(); 640 LLVMContext &Ctx = M->getContext(); 641 const DataLayout &DL = M->getDataLayout(); 642 unsigned ValueSize = DL.getTypeStoreSize(ValueType); 643 644 PMV.ValueType = ValueType; 645 PMV.WordType = MinWordSize > ValueSize ? Type::getIntNTy(Ctx, MinWordSize * 8) 646 : ValueType; 647 if (PMV.ValueType == PMV.WordType) { 648 PMV.AlignedAddr = Addr; 649 return PMV; 650 } 651 652 assert(ValueSize < MinWordSize); 653 654 Type *WordPtrType = 655 PMV.WordType->getPointerTo(Addr->getType()->getPointerAddressSpace()); 656 657 Value *AddrInt = Builder.CreatePtrToInt(Addr, DL.getIntPtrType(Ctx)); 658 PMV.AlignedAddr = Builder.CreateIntToPtr( 659 Builder.CreateAnd(AddrInt, ~(uint64_t)(MinWordSize - 1)), WordPtrType, 660 "AlignedAddr"); 661 662 Value *PtrLSB = Builder.CreateAnd(AddrInt, MinWordSize - 1, "PtrLSB"); 663 if (DL.isLittleEndian()) { 664 // turn bytes into bits 665 PMV.ShiftAmt = Builder.CreateShl(PtrLSB, 3); 666 } else { 667 // turn bytes into bits, and count from the other side. 
668 PMV.ShiftAmt = Builder.CreateShl( 669 Builder.CreateXor(PtrLSB, MinWordSize - ValueSize), 3); 670 } 671 672 PMV.ShiftAmt = Builder.CreateTrunc(PMV.ShiftAmt, PMV.WordType, "ShiftAmt"); 673 PMV.Mask = Builder.CreateShl( 674 ConstantInt::get(PMV.WordType, (1 << (ValueSize * 8)) - 1), PMV.ShiftAmt, 675 "Mask"); 676 PMV.Inv_Mask = Builder.CreateNot(PMV.Mask, "Inv_Mask"); 677 return PMV; 678 } 679 680 static Value *extractMaskedValue(IRBuilder<> &Builder, Value *WideWord, 681 const PartwordMaskValues &PMV) { 682 assert(WideWord->getType() == PMV.WordType && "Widened type mismatch"); 683 if (PMV.WordType == PMV.ValueType) 684 return WideWord; 685 686 Value *Shift = Builder.CreateLShr(WideWord, PMV.ShiftAmt, "shifted"); 687 Value *Trunc = Builder.CreateTrunc(Shift, PMV.ValueType, "extracted"); 688 return Trunc; 689 } 690 691 static Value *insertMaskedValue(IRBuilder<> &Builder, Value *WideWord, 692 Value *Updated, const PartwordMaskValues &PMV) { 693 assert(WideWord->getType() == PMV.WordType && "Widened type mismatch"); 694 assert(Updated->getType() == PMV.ValueType && "Value type mismatch"); 695 if (PMV.WordType == PMV.ValueType) 696 return Updated; 697 698 Value *ZExt = Builder.CreateZExt(Updated, PMV.WordType, "extended"); 699 Value *Shift = 700 Builder.CreateShl(ZExt, PMV.ShiftAmt, "shifted", /*HasNUW*/ true); 701 Value *And = Builder.CreateAnd(WideWord, PMV.Inv_Mask, "unmasked"); 702 Value *Or = Builder.CreateOr(And, Shift, "inserted"); 703 return Or; 704 } 705 706 /// Emit IR to implement a masked version of a given atomicrmw 707 /// operation. (That is, only the bits under the Mask should be 708 /// affected by the operation) 709 static Value *performMaskedAtomicOp(AtomicRMWInst::BinOp Op, 710 IRBuilder<> &Builder, Value *Loaded, 711 Value *Shifted_Inc, Value *Inc, 712 const PartwordMaskValues &PMV) { 713 // TODO: update to use 714 // https://graphics.stanford.edu/~seander/bithacks.html#MaskedMerge in order 715 // to merge bits from two values without requiring PMV.Inv_Mask. 716 switch (Op) { 717 case AtomicRMWInst::Xchg: { 718 Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask); 719 Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, Shifted_Inc); 720 return FinalVal; 721 } 722 case AtomicRMWInst::Or: 723 case AtomicRMWInst::Xor: 724 case AtomicRMWInst::And: 725 llvm_unreachable("Or/Xor/And handled by widenPartwordAtomicRMW"); 726 case AtomicRMWInst::Add: 727 case AtomicRMWInst::Sub: 728 case AtomicRMWInst::Nand: { 729 // The other arithmetic ops need to be masked into place. 730 Value *NewVal = performAtomicOp(Op, Builder, Loaded, Shifted_Inc); 731 Value *NewVal_Masked = Builder.CreateAnd(NewVal, PMV.Mask); 732 Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask); 733 Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Masked); 734 return FinalVal; 735 } 736 case AtomicRMWInst::Max: 737 case AtomicRMWInst::Min: 738 case AtomicRMWInst::UMax: 739 case AtomicRMWInst::UMin: { 740 // Finally, comparison ops will operate on the full value, so 741 // truncate down to the original size, and expand out again after 742 // doing the operation. 743 Value *Loaded_Extract = extractMaskedValue(Builder, Loaded, PMV); 744 Value *NewVal = performAtomicOp(Op, Builder, Loaded_Extract, Inc); 745 Value *FinalVal = insertMaskedValue(Builder, Loaded, NewVal, PMV); 746 return FinalVal; 747 } 748 default: 749 llvm_unreachable("Unknown atomic op"); 750 } 751 } 752 753 /// Expand a sub-word atomicrmw operation into an appropriate 754 /// word-sized operation. 
755 /// 756 /// It will create an LL/SC or cmpxchg loop, as appropriate, the same 757 /// way as a typical atomicrmw expansion. The only difference here is 758 /// that the operation inside of the loop may operate upon only a 759 /// part of the value. 760 void AtomicExpand::expandPartwordAtomicRMW( 761 AtomicRMWInst *AI, TargetLoweringBase::AtomicExpansionKind ExpansionKind) { 762 AtomicOrdering MemOpOrder = AI->getOrdering(); 763 764 IRBuilder<> Builder(AI); 765 766 PartwordMaskValues PMV = 767 createMaskInstrs(Builder, AI, AI->getType(), AI->getPointerOperand(), 768 TLI->getMinCmpXchgSizeInBits() / 8); 769 770 Value *ValOperand_Shifted = 771 Builder.CreateShl(Builder.CreateZExt(AI->getValOperand(), PMV.WordType), 772 PMV.ShiftAmt, "ValOperand_Shifted"); 773 774 auto PerformPartwordOp = [&](IRBuilder<> &Builder, Value *Loaded) { 775 return performMaskedAtomicOp(AI->getOperation(), Builder, Loaded, 776 ValOperand_Shifted, AI->getValOperand(), PMV); 777 }; 778 779 Value *OldResult; 780 if (ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg) { 781 OldResult = 782 insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr, MemOpOrder, 783 PerformPartwordOp, createCmpXchgInstFun); 784 } else { 785 assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::LLSC); 786 OldResult = insertRMWLLSCLoop(Builder, PMV.WordType, PMV.AlignedAddr, 787 MemOpOrder, PerformPartwordOp); 788 } 789 790 Value *FinalOldResult = extractMaskedValue(Builder, OldResult, PMV); 791 AI->replaceAllUsesWith(FinalOldResult); 792 AI->eraseFromParent(); 793 } 794 795 // Widen the bitwise atomicrmw (or/xor/and) to the minimum supported width. 796 AtomicRMWInst *AtomicExpand::widenPartwordAtomicRMW(AtomicRMWInst *AI) { 797 IRBuilder<> Builder(AI); 798 AtomicRMWInst::BinOp Op = AI->getOperation(); 799 800 assert((Op == AtomicRMWInst::Or || Op == AtomicRMWInst::Xor || 801 Op == AtomicRMWInst::And) && 802 "Unable to widen operation"); 803 804 PartwordMaskValues PMV = 805 createMaskInstrs(Builder, AI, AI->getType(), AI->getPointerOperand(), 806 TLI->getMinCmpXchgSizeInBits() / 8); 807 808 Value *ValOperand_Shifted = 809 Builder.CreateShl(Builder.CreateZExt(AI->getValOperand(), PMV.WordType), 810 PMV.ShiftAmt, "ValOperand_Shifted"); 811 812 Value *NewOperand; 813 814 if (Op == AtomicRMWInst::And) 815 NewOperand = 816 Builder.CreateOr(PMV.Inv_Mask, ValOperand_Shifted, "AndOperand"); 817 else 818 NewOperand = ValOperand_Shifted; 819 820 AtomicRMWInst *NewAI = Builder.CreateAtomicRMW(Op, PMV.AlignedAddr, 821 NewOperand, AI->getOrdering()); 822 823 Value *FinalOldResult = extractMaskedValue(Builder, NewAI, PMV); 824 AI->replaceAllUsesWith(FinalOldResult); 825 AI->eraseFromParent(); 826 return NewAI; 827 } 828 829 void AtomicExpand::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) { 830 // The basic idea here is that we're expanding a cmpxchg of a 831 // smaller memory size up to a word-sized cmpxchg. To do this, we 832 // need to add a retry-loop for strong cmpxchg, so that 833 // modifications to other parts of the word don't cause a spurious 834 // failure. 
835 836 // This generates code like the following: 837 // [[Setup mask values PMV.*]] 838 // %NewVal_Shifted = shl i32 %NewVal, %PMV.ShiftAmt 839 // %Cmp_Shifted = shl i32 %Cmp, %PMV.ShiftAmt 840 // %InitLoaded = load i32* %addr 841 // %InitLoaded_MaskOut = and i32 %InitLoaded, %PMV.Inv_Mask 842 // br partword.cmpxchg.loop 843 // partword.cmpxchg.loop: 844 // %Loaded_MaskOut = phi i32 [ %InitLoaded_MaskOut, %entry ], 845 // [ %OldVal_MaskOut, %partword.cmpxchg.failure ] 846 // %FullWord_NewVal = or i32 %Loaded_MaskOut, %NewVal_Shifted 847 // %FullWord_Cmp = or i32 %Loaded_MaskOut, %Cmp_Shifted 848 // %NewCI = cmpxchg i32* %PMV.AlignedAddr, i32 %FullWord_Cmp, 849 // i32 %FullWord_NewVal success_ordering failure_ordering 850 // %OldVal = extractvalue { i32, i1 } %NewCI, 0 851 // %Success = extractvalue { i32, i1 } %NewCI, 1 852 // br i1 %Success, label %partword.cmpxchg.end, 853 // label %partword.cmpxchg.failure 854 // partword.cmpxchg.failure: 855 // %OldVal_MaskOut = and i32 %OldVal, %PMV.Inv_Mask 856 // %ShouldContinue = icmp ne i32 %Loaded_MaskOut, %OldVal_MaskOut 857 // br i1 %ShouldContinue, label %partword.cmpxchg.loop, 858 // label %partword.cmpxchg.end 859 // partword.cmpxchg.end: 860 // %tmp1 = lshr i32 %OldVal, %PMV.ShiftAmt 861 // %FinalOldVal = trunc i32 %tmp1 to i8 862 // %tmp2 = insertvalue { i8, i1 } undef, i8 %FinalOldVal, 0 863 // %Res = insertvalue { i8, i1 } %25, i1 %Success, 1 864 865 Value *Addr = CI->getPointerOperand(); 866 Value *Cmp = CI->getCompareOperand(); 867 Value *NewVal = CI->getNewValOperand(); 868 869 BasicBlock *BB = CI->getParent(); 870 Function *F = BB->getParent(); 871 IRBuilder<> Builder(CI); 872 LLVMContext &Ctx = Builder.getContext(); 873 874 const int WordSize = TLI->getMinCmpXchgSizeInBits() / 8; 875 876 BasicBlock *EndBB = 877 BB->splitBasicBlock(CI->getIterator(), "partword.cmpxchg.end"); 878 auto FailureBB = 879 BasicBlock::Create(Ctx, "partword.cmpxchg.failure", F, EndBB); 880 auto LoopBB = BasicBlock::Create(Ctx, "partword.cmpxchg.loop", F, FailureBB); 881 882 // The split call above "helpfully" added a branch at the end of BB 883 // (to the wrong place). 884 std::prev(BB->end())->eraseFromParent(); 885 Builder.SetInsertPoint(BB); 886 887 PartwordMaskValues PMV = createMaskInstrs( 888 Builder, CI, CI->getCompareOperand()->getType(), Addr, WordSize); 889 890 // Shift the incoming values over, into the right location in the word. 891 Value *NewVal_Shifted = 892 Builder.CreateShl(Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt); 893 Value *Cmp_Shifted = 894 Builder.CreateShl(Builder.CreateZExt(Cmp, PMV.WordType), PMV.ShiftAmt); 895 896 // Load the entire current word, and mask into place the expected and new 897 // values 898 LoadInst *InitLoaded = Builder.CreateLoad(PMV.WordType, PMV.AlignedAddr); 899 InitLoaded->setVolatile(CI->isVolatile()); 900 Value *InitLoaded_MaskOut = Builder.CreateAnd(InitLoaded, PMV.Inv_Mask); 901 Builder.CreateBr(LoopBB); 902 903 // partword.cmpxchg.loop: 904 Builder.SetInsertPoint(LoopBB); 905 PHINode *Loaded_MaskOut = Builder.CreatePHI(PMV.WordType, 2); 906 Loaded_MaskOut->addIncoming(InitLoaded_MaskOut, BB); 907 908 // Mask/Or the expected and new values into place in the loaded word. 
909 Value *FullWord_NewVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shifted); 910 Value *FullWord_Cmp = Builder.CreateOr(Loaded_MaskOut, Cmp_Shifted); 911 AtomicCmpXchgInst *NewCI = Builder.CreateAtomicCmpXchg( 912 PMV.AlignedAddr, FullWord_Cmp, FullWord_NewVal, CI->getSuccessOrdering(), 913 CI->getFailureOrdering(), CI->getSyncScopeID()); 914 NewCI->setVolatile(CI->isVolatile()); 915 // When we're building a strong cmpxchg, we need a loop, so you 916 // might think we could use a weak cmpxchg inside. But, using strong 917 // allows the below comparison for ShouldContinue, and we're 918 // expecting the underlying cmpxchg to be a machine instruction, 919 // which is strong anyways. 920 NewCI->setWeak(CI->isWeak()); 921 922 Value *OldVal = Builder.CreateExtractValue(NewCI, 0); 923 Value *Success = Builder.CreateExtractValue(NewCI, 1); 924 925 if (CI->isWeak()) 926 Builder.CreateBr(EndBB); 927 else 928 Builder.CreateCondBr(Success, EndBB, FailureBB); 929 930 // partword.cmpxchg.failure: 931 Builder.SetInsertPoint(FailureBB); 932 // Upon failure, verify that the masked-out part of the loaded value 933 // has been modified. If it didn't, abort the cmpxchg, since the 934 // masked-in part must've. 935 Value *OldVal_MaskOut = Builder.CreateAnd(OldVal, PMV.Inv_Mask); 936 Value *ShouldContinue = Builder.CreateICmpNE(Loaded_MaskOut, OldVal_MaskOut); 937 Builder.CreateCondBr(ShouldContinue, LoopBB, EndBB); 938 939 // Add the second value to the phi from above 940 Loaded_MaskOut->addIncoming(OldVal_MaskOut, FailureBB); 941 942 // partword.cmpxchg.end: 943 Builder.SetInsertPoint(CI); 944 945 Value *FinalOldVal = extractMaskedValue(Builder, OldVal, PMV); 946 Value *Res = UndefValue::get(CI->getType()); 947 Res = Builder.CreateInsertValue(Res, FinalOldVal, 0); 948 Res = Builder.CreateInsertValue(Res, Success, 1); 949 950 CI->replaceAllUsesWith(Res); 951 CI->eraseFromParent(); 952 } 953 954 void AtomicExpand::expandAtomicOpToLLSC( 955 Instruction *I, Type *ResultType, Value *Addr, AtomicOrdering MemOpOrder, 956 function_ref<Value *(IRBuilder<> &, Value *)> PerformOp) { 957 IRBuilder<> Builder(I); 958 Value *Loaded = 959 insertRMWLLSCLoop(Builder, ResultType, Addr, MemOpOrder, PerformOp); 960 961 I->replaceAllUsesWith(Loaded); 962 I->eraseFromParent(); 963 } 964 965 void AtomicExpand::expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI) { 966 IRBuilder<> Builder(AI); 967 968 PartwordMaskValues PMV = 969 createMaskInstrs(Builder, AI, AI->getType(), AI->getPointerOperand(), 970 TLI->getMinCmpXchgSizeInBits() / 8); 971 972 // The value operand must be sign-extended for signed min/max so that the 973 // target's signed comparison instructions can be used. Otherwise, just 974 // zero-ext. 
975 Instruction::CastOps CastOp = Instruction::ZExt; 976 AtomicRMWInst::BinOp RMWOp = AI->getOperation(); 977 if (RMWOp == AtomicRMWInst::Max || RMWOp == AtomicRMWInst::Min) 978 CastOp = Instruction::SExt; 979 980 Value *ValOperand_Shifted = Builder.CreateShl( 981 Builder.CreateCast(CastOp, AI->getValOperand(), PMV.WordType), 982 PMV.ShiftAmt, "ValOperand_Shifted"); 983 Value *OldResult = TLI->emitMaskedAtomicRMWIntrinsic( 984 Builder, AI, PMV.AlignedAddr, ValOperand_Shifted, PMV.Mask, PMV.ShiftAmt, 985 AI->getOrdering()); 986 Value *FinalOldResult = extractMaskedValue(Builder, OldResult, PMV); 987 AI->replaceAllUsesWith(FinalOldResult); 988 AI->eraseFromParent(); 989 } 990 991 void AtomicExpand::expandAtomicCmpXchgToMaskedIntrinsic(AtomicCmpXchgInst *CI) { 992 IRBuilder<> Builder(CI); 993 994 PartwordMaskValues PMV = createMaskInstrs( 995 Builder, CI, CI->getCompareOperand()->getType(), CI->getPointerOperand(), 996 TLI->getMinCmpXchgSizeInBits() / 8); 997 998 Value *CmpVal_Shifted = Builder.CreateShl( 999 Builder.CreateZExt(CI->getCompareOperand(), PMV.WordType), PMV.ShiftAmt, 1000 "CmpVal_Shifted"); 1001 Value *NewVal_Shifted = Builder.CreateShl( 1002 Builder.CreateZExt(CI->getNewValOperand(), PMV.WordType), PMV.ShiftAmt, 1003 "NewVal_Shifted"); 1004 Value *OldVal = TLI->emitMaskedAtomicCmpXchgIntrinsic( 1005 Builder, CI, PMV.AlignedAddr, CmpVal_Shifted, NewVal_Shifted, PMV.Mask, 1006 CI->getSuccessOrdering()); 1007 Value *FinalOldVal = extractMaskedValue(Builder, OldVal, PMV); 1008 Value *Res = UndefValue::get(CI->getType()); 1009 Res = Builder.CreateInsertValue(Res, FinalOldVal, 0); 1010 Value *Success = Builder.CreateICmpEQ( 1011 CmpVal_Shifted, Builder.CreateAnd(OldVal, PMV.Mask), "Success"); 1012 Res = Builder.CreateInsertValue(Res, Success, 1); 1013 1014 CI->replaceAllUsesWith(Res); 1015 CI->eraseFromParent(); 1016 } 1017 1018 Value *AtomicExpand::insertRMWLLSCLoop( 1019 IRBuilder<> &Builder, Type *ResultTy, Value *Addr, 1020 AtomicOrdering MemOpOrder, 1021 function_ref<Value *(IRBuilder<> &, Value *)> PerformOp) { 1022 LLVMContext &Ctx = Builder.getContext(); 1023 BasicBlock *BB = Builder.GetInsertBlock(); 1024 Function *F = BB->getParent(); 1025 1026 // Given: atomicrmw some_op iN* %addr, iN %incr ordering 1027 // 1028 // The standard expansion we produce is: 1029 // [...] 1030 // atomicrmw.start: 1031 // %loaded = @load.linked(%addr) 1032 // %new = some_op iN %loaded, %incr 1033 // %stored = @store_conditional(%new, %addr) 1034 // %try_again = icmp i32 ne %stored, 0 1035 // br i1 %try_again, label %loop, label %atomicrmw.end 1036 // atomicrmw.end: 1037 // [...] 1038 BasicBlock *ExitBB = 1039 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end"); 1040 BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); 1041 1042 // The split call above "helpfully" added a branch at the end of BB (to the 1043 // wrong place). 1044 std::prev(BB->end())->eraseFromParent(); 1045 Builder.SetInsertPoint(BB); 1046 Builder.CreateBr(LoopBB); 1047 1048 // Start the main loop block now that we've taken care of the preliminaries. 
1049 Builder.SetInsertPoint(LoopBB); 1050 Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder); 1051 1052 Value *NewVal = PerformOp(Builder, Loaded); 1053 1054 Value *StoreSuccess = 1055 TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder); 1056 Value *TryAgain = Builder.CreateICmpNE( 1057 StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain"); 1058 Builder.CreateCondBr(TryAgain, LoopBB, ExitBB); 1059 1060 Builder.SetInsertPoint(ExitBB, ExitBB->begin()); 1061 return Loaded; 1062 } 1063 1064 /// Convert an atomic cmpxchg of a non-integral type to an integer cmpxchg of 1065 /// the equivalent bitwidth. We used to not support pointer cmpxchg in the 1066 /// IR. As a migration step, we convert back to what use to be the standard 1067 /// way to represent a pointer cmpxchg so that we can update backends one by 1068 /// one. 1069 AtomicCmpXchgInst *AtomicExpand::convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI) { 1070 auto *M = CI->getModule(); 1071 Type *NewTy = getCorrespondingIntegerType(CI->getCompareOperand()->getType(), 1072 M->getDataLayout()); 1073 1074 IRBuilder<> Builder(CI); 1075 1076 Value *Addr = CI->getPointerOperand(); 1077 Type *PT = PointerType::get(NewTy, 1078 Addr->getType()->getPointerAddressSpace()); 1079 Value *NewAddr = Builder.CreateBitCast(Addr, PT); 1080 1081 Value *NewCmp = Builder.CreatePtrToInt(CI->getCompareOperand(), NewTy); 1082 Value *NewNewVal = Builder.CreatePtrToInt(CI->getNewValOperand(), NewTy); 1083 1084 1085 auto *NewCI = Builder.CreateAtomicCmpXchg(NewAddr, NewCmp, NewNewVal, 1086 CI->getSuccessOrdering(), 1087 CI->getFailureOrdering(), 1088 CI->getSyncScopeID()); 1089 NewCI->setVolatile(CI->isVolatile()); 1090 NewCI->setWeak(CI->isWeak()); 1091 LLVM_DEBUG(dbgs() << "Replaced " << *CI << " with " << *NewCI << "\n"); 1092 1093 Value *OldVal = Builder.CreateExtractValue(NewCI, 0); 1094 Value *Succ = Builder.CreateExtractValue(NewCI, 1); 1095 1096 OldVal = Builder.CreateIntToPtr(OldVal, CI->getCompareOperand()->getType()); 1097 1098 Value *Res = UndefValue::get(CI->getType()); 1099 Res = Builder.CreateInsertValue(Res, OldVal, 0); 1100 Res = Builder.CreateInsertValue(Res, Succ, 1); 1101 1102 CI->replaceAllUsesWith(Res); 1103 CI->eraseFromParent(); 1104 return NewCI; 1105 } 1106 1107 bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { 1108 AtomicOrdering SuccessOrder = CI->getSuccessOrdering(); 1109 AtomicOrdering FailureOrder = CI->getFailureOrdering(); 1110 Value *Addr = CI->getPointerOperand(); 1111 BasicBlock *BB = CI->getParent(); 1112 Function *F = BB->getParent(); 1113 LLVMContext &Ctx = F->getContext(); 1114 // If shouldInsertFencesForAtomic() returns true, then the target does not 1115 // want to deal with memory orders, and emitLeading/TrailingFence should take 1116 // care of everything. Otherwise, emitLeading/TrailingFence are no-op and we 1117 // should preserve the ordering. 1118 bool ShouldInsertFencesForAtomic = TLI->shouldInsertFencesForAtomic(CI); 1119 AtomicOrdering MemOpOrder = 1120 ShouldInsertFencesForAtomic ? AtomicOrdering::Monotonic : SuccessOrder; 1121 1122 // In implementations which use a barrier to achieve release semantics, we can 1123 // delay emitting this barrier until we know a store is actually going to be 1124 // attempted. The cost of this delay is that we need 2 copies of the block 1125 // emitting the load-linked, affecting code size. 
1126 // 1127 // Ideally, this logic would be unconditional except for the minsize check 1128 // since in other cases the extra blocks naturally collapse down to the 1129 // minimal loop. Unfortunately, this puts too much stress on later 1130 // optimisations so we avoid emitting the extra logic in those cases too. 1131 bool HasReleasedLoadBB = !CI->isWeak() && ShouldInsertFencesForAtomic && 1132 SuccessOrder != AtomicOrdering::Monotonic && 1133 SuccessOrder != AtomicOrdering::Acquire && 1134 !F->hasMinSize(); 1135 1136 // There's no overhead for sinking the release barrier in a weak cmpxchg, so 1137 // do it even on minsize. 1138 bool UseUnconditionalReleaseBarrier = F->hasMinSize() && !CI->isWeak(); 1139 1140 // Given: cmpxchg some_op iN* %addr, iN %desired, iN %new success_ord fail_ord 1141 // 1142 // The full expansion we produce is: 1143 // [...] 1144 // %aligned.addr = ... 1145 // cmpxchg.start: 1146 // %unreleasedload = @load.linked(%aligned.addr) 1147 // %unreleasedload.extract = extract value from %unreleasedload 1148 // %should_store = icmp eq %unreleasedload.extract, %desired 1149 // br i1 %should_store, label %cmpxchg.releasingstore, 1150 // label %cmpxchg.nostore 1151 // cmpxchg.releasingstore: 1152 // fence? 1153 // br label cmpxchg.trystore 1154 // cmpxchg.trystore: 1155 // %loaded.trystore = phi [%unreleasedload, %cmpxchg.releasingstore], 1156 // [%releasedload, %cmpxchg.releasedload] 1157 // %updated.new = insert %new into %loaded.trystore 1158 // %stored = @store_conditional(%updated.new, %aligned.addr) 1159 // %success = icmp eq i32 %stored, 0 1160 // br i1 %success, label %cmpxchg.success, 1161 // label %cmpxchg.releasedload/%cmpxchg.failure 1162 // cmpxchg.releasedload: 1163 // %releasedload = @load.linked(%aligned.addr) 1164 // %releasedload.extract = extract value from %releasedload 1165 // %should_store = icmp eq %releasedload.extract, %desired 1166 // br i1 %should_store, label %cmpxchg.trystore, 1167 // label %cmpxchg.failure 1168 // cmpxchg.success: 1169 // fence? 1170 // br label %cmpxchg.end 1171 // cmpxchg.nostore: 1172 // %loaded.nostore = phi [%unreleasedload, %cmpxchg.start], 1173 // [%releasedload, 1174 // %cmpxchg.releasedload/%cmpxchg.trystore] 1175 // @load_linked_fail_balance()? 1176 // br label %cmpxchg.failure 1177 // cmpxchg.failure: 1178 // fence? 1179 // br label %cmpxchg.end 1180 // cmpxchg.end: 1181 // %loaded.exit = phi [%loaded.nostore, %cmpxchg.failure], 1182 // [%loaded.trystore, %cmpxchg.trystore] 1183 // %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure] 1184 // %loaded = extract value from %loaded.exit 1185 // %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0 1186 // %res = insertvalue { iN, i1 } %restmp, i1 %success, 1 1187 // [...] 
1188 BasicBlock *ExitBB = BB->splitBasicBlock(CI->getIterator(), "cmpxchg.end"); 1189 auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB); 1190 auto NoStoreBB = BasicBlock::Create(Ctx, "cmpxchg.nostore", F, FailureBB); 1191 auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, NoStoreBB); 1192 auto ReleasedLoadBB = 1193 BasicBlock::Create(Ctx, "cmpxchg.releasedload", F, SuccessBB); 1194 auto TryStoreBB = 1195 BasicBlock::Create(Ctx, "cmpxchg.trystore", F, ReleasedLoadBB); 1196 auto ReleasingStoreBB = 1197 BasicBlock::Create(Ctx, "cmpxchg.fencedstore", F, TryStoreBB); 1198 auto StartBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, ReleasingStoreBB); 1199 1200 // This grabs the DebugLoc from CI 1201 IRBuilder<> Builder(CI); 1202 1203 // The split call above "helpfully" added a branch at the end of BB (to the 1204 // wrong place), but we might want a fence too. It's easiest to just remove 1205 // the branch entirely. 1206 std::prev(BB->end())->eraseFromParent(); 1207 Builder.SetInsertPoint(BB); 1208 if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier) 1209 TLI->emitLeadingFence(Builder, CI, SuccessOrder); 1210 1211 PartwordMaskValues PMV = 1212 createMaskInstrs(Builder, CI, CI->getCompareOperand()->getType(), Addr, 1213 TLI->getMinCmpXchgSizeInBits() / 8); 1214 Builder.CreateBr(StartBB); 1215 1216 // Start the main loop block now that we've taken care of the preliminaries. 1217 Builder.SetInsertPoint(StartBB); 1218 Value *UnreleasedLoad = 1219 TLI->emitLoadLinked(Builder, PMV.AlignedAddr, MemOpOrder); 1220 Value *UnreleasedLoadExtract = 1221 extractMaskedValue(Builder, UnreleasedLoad, PMV); 1222 Value *ShouldStore = Builder.CreateICmpEQ( 1223 UnreleasedLoadExtract, CI->getCompareOperand(), "should_store"); 1224 1225 // If the cmpxchg doesn't actually need any ordering when it fails, we can 1226 // jump straight past that fence instruction (if it exists). 1227 Builder.CreateCondBr(ShouldStore, ReleasingStoreBB, NoStoreBB); 1228 1229 Builder.SetInsertPoint(ReleasingStoreBB); 1230 if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier) 1231 TLI->emitLeadingFence(Builder, CI, SuccessOrder); 1232 Builder.CreateBr(TryStoreBB); 1233 1234 Builder.SetInsertPoint(TryStoreBB); 1235 PHINode *LoadedTryStore = 1236 Builder.CreatePHI(PMV.WordType, 2, "loaded.trystore"); 1237 LoadedTryStore->addIncoming(UnreleasedLoad, ReleasingStoreBB); 1238 Value *NewValueInsert = 1239 insertMaskedValue(Builder, LoadedTryStore, CI->getNewValOperand(), PMV); 1240 Value *StoreSuccess = 1241 TLI->emitStoreConditional(Builder, NewValueInsert, Addr, MemOpOrder); 1242 StoreSuccess = Builder.CreateICmpEQ( 1243 StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success"); 1244 BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB; 1245 Builder.CreateCondBr(StoreSuccess, SuccessBB, 1246 CI->isWeak() ? FailureBB : RetryBB); 1247 1248 Builder.SetInsertPoint(ReleasedLoadBB); 1249 Value *SecondLoad; 1250 if (HasReleasedLoadBB) { 1251 SecondLoad = TLI->emitLoadLinked(Builder, PMV.AlignedAddr, MemOpOrder); 1252 Value *SecondLoadExtract = extractMaskedValue(Builder, SecondLoad, PMV); 1253 ShouldStore = Builder.CreateICmpEQ(SecondLoadExtract, 1254 CI->getCompareOperand(), "should_store"); 1255 1256 // If the cmpxchg doesn't actually need any ordering when it fails, we can 1257 // jump straight past that fence instruction (if it exists). 1258 Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB); 1259 // Update PHI node in TryStoreBB. 
1260 LoadedTryStore->addIncoming(SecondLoad, ReleasedLoadBB); 1261 } else 1262 Builder.CreateUnreachable(); 1263 1264 // Make sure later instructions don't get reordered with a fence if 1265 // necessary. 1266 Builder.SetInsertPoint(SuccessBB); 1267 if (ShouldInsertFencesForAtomic) 1268 TLI->emitTrailingFence(Builder, CI, SuccessOrder); 1269 Builder.CreateBr(ExitBB); 1270 1271 Builder.SetInsertPoint(NoStoreBB); 1272 PHINode *LoadedNoStore = 1273 Builder.CreatePHI(UnreleasedLoad->getType(), 2, "loaded.nostore"); 1274 LoadedNoStore->addIncoming(UnreleasedLoad, StartBB); 1275 if (HasReleasedLoadBB) 1276 LoadedNoStore->addIncoming(SecondLoad, ReleasedLoadBB); 1277 1278 // In the failing case, where we don't execute the store-conditional, the 1279 // target might want to balance out the load-linked with a dedicated 1280 // instruction (e.g., on ARM, clearing the exclusive monitor). 1281 TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder); 1282 Builder.CreateBr(FailureBB); 1283 1284 Builder.SetInsertPoint(FailureBB); 1285 PHINode *LoadedFailure = 1286 Builder.CreatePHI(UnreleasedLoad->getType(), 2, "loaded.failure"); 1287 LoadedFailure->addIncoming(LoadedNoStore, NoStoreBB); 1288 if (CI->isWeak()) 1289 LoadedFailure->addIncoming(LoadedTryStore, TryStoreBB); 1290 if (ShouldInsertFencesForAtomic) 1291 TLI->emitTrailingFence(Builder, CI, FailureOrder); 1292 Builder.CreateBr(ExitBB); 1293 1294 // Finally, we have control-flow based knowledge of whether the cmpxchg 1295 // succeeded or not. We expose this to later passes by converting any 1296 // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate 1297 // PHI. 1298 Builder.SetInsertPoint(ExitBB, ExitBB->begin()); 1299 PHINode *LoadedExit = 1300 Builder.CreatePHI(UnreleasedLoad->getType(), 2, "loaded.exit"); 1301 LoadedExit->addIncoming(LoadedTryStore, SuccessBB); 1302 LoadedExit->addIncoming(LoadedFailure, FailureBB); 1303 PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2, "success"); 1304 Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB); 1305 Success->addIncoming(ConstantInt::getFalse(Ctx), FailureBB); 1306 1307 // This is the "exit value" from the cmpxchg expansion. It may be of 1308 // a type wider than the one in the cmpxchg instruction. 1309 Value *LoadedFull = LoadedExit; 1310 1311 Builder.SetInsertPoint(ExitBB, std::next(Success->getIterator())); 1312 Value *Loaded = extractMaskedValue(Builder, LoadedFull, PMV); 1313 1314 // Look for any users of the cmpxchg that are just comparing the loaded value 1315 // against the desired one, and replace them with the CFG-derived version. 1316 SmallVector<ExtractValueInst *, 2> PrunedInsts; 1317 for (auto User : CI->users()) { 1318 ExtractValueInst *EV = dyn_cast<ExtractValueInst>(User); 1319 if (!EV) 1320 continue; 1321 1322 assert(EV->getNumIndices() == 1 && EV->getIndices()[0] <= 1 && 1323 "weird extraction from { iN, i1 }"); 1324 1325 if (EV->getIndices()[0] == 0) 1326 EV->replaceAllUsesWith(Loaded); 1327 else 1328 EV->replaceAllUsesWith(Success); 1329 1330 PrunedInsts.push_back(EV); 1331 } 1332 1333 // We can remove the instructions now we're no longer iterating through them. 1334 for (auto EV : PrunedInsts) 1335 EV->eraseFromParent(); 1336 1337 if (!CI->use_empty()) { 1338 // Some use of the full struct return that we don't understand has happened, 1339 // so we've got to reconstruct it properly. 
1340 Value *Res; 1341 Res = Builder.CreateInsertValue(UndefValue::get(CI->getType()), Loaded, 0); 1342 Res = Builder.CreateInsertValue(Res, Success, 1); 1343 1344 CI->replaceAllUsesWith(Res); 1345 } 1346 1347 CI->eraseFromParent(); 1348 return true; 1349 } 1350 1351 bool AtomicExpand::isIdempotentRMW(AtomicRMWInst* RMWI) { 1352 auto C = dyn_cast<ConstantInt>(RMWI->getValOperand()); 1353 if(!C) 1354 return false; 1355 1356 AtomicRMWInst::BinOp Op = RMWI->getOperation(); 1357 switch(Op) { 1358 case AtomicRMWInst::Add: 1359 case AtomicRMWInst::Sub: 1360 case AtomicRMWInst::Or: 1361 case AtomicRMWInst::Xor: 1362 return C->isZero(); 1363 case AtomicRMWInst::And: 1364 return C->isMinusOne(); 1365 // FIXME: we could also treat Min/Max/UMin/UMax by the INT_MIN/INT_MAX/... 1366 default: 1367 return false; 1368 } 1369 } 1370 1371 bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst* RMWI) { 1372 if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) { 1373 tryExpandAtomicLoad(ResultingLoad); 1374 return true; 1375 } 1376 return false; 1377 } 1378 1379 Value *AtomicExpand::insertRMWCmpXchgLoop( 1380 IRBuilder<> &Builder, Type *ResultTy, Value *Addr, 1381 AtomicOrdering MemOpOrder, 1382 function_ref<Value *(IRBuilder<> &, Value *)> PerformOp, 1383 CreateCmpXchgInstFun CreateCmpXchg) { 1384 LLVMContext &Ctx = Builder.getContext(); 1385 BasicBlock *BB = Builder.GetInsertBlock(); 1386 Function *F = BB->getParent(); 1387 1388 // Given: atomicrmw some_op iN* %addr, iN %incr ordering 1389 // 1390 // The standard expansion we produce is: 1391 // [...] 1392 // %init_loaded = load atomic iN* %addr 1393 // br label %loop 1394 // loop: 1395 // %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ] 1396 // %new = some_op iN %loaded, %incr 1397 // %pair = cmpxchg iN* %addr, iN %loaded, iN %new 1398 // %new_loaded = extractvalue { iN, i1 } %pair, 0 1399 // %success = extractvalue { iN, i1 } %pair, 1 1400 // br i1 %success, label %atomicrmw.end, label %loop 1401 // atomicrmw.end: 1402 // [...] 1403 BasicBlock *ExitBB = 1404 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end"); 1405 BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); 1406 1407 // The split call above "helpfully" added a branch at the end of BB (to the 1408 // wrong place), but we want a load. It's easiest to just remove 1409 // the branch entirely. 1410 std::prev(BB->end())->eraseFromParent(); 1411 Builder.SetInsertPoint(BB); 1412 LoadInst *InitLoaded = Builder.CreateLoad(ResultTy, Addr); 1413 // Atomics require at least natural alignment. 1414 InitLoaded->setAlignment(Align(ResultTy->getPrimitiveSizeInBits() / 8)); 1415 Builder.CreateBr(LoopBB); 1416 1417 // Start the main loop block now that we've taken care of the preliminaries. 1418 Builder.SetInsertPoint(LoopBB); 1419 PHINode *Loaded = Builder.CreatePHI(ResultTy, 2, "loaded"); 1420 Loaded->addIncoming(InitLoaded, BB); 1421 1422 Value *NewVal = PerformOp(Builder, Loaded); 1423 1424 Value *NewLoaded = nullptr; 1425 Value *Success = nullptr; 1426 1427 CreateCmpXchg(Builder, Addr, Loaded, NewVal, 1428 MemOpOrder == AtomicOrdering::Unordered 1429 ? 

Value *AtomicExpand::insertRMWCmpXchgLoop(
    IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
    AtomicOrdering MemOpOrder,
    function_ref<Value *(IRBuilder<> &, Value *)> PerformOp,
    CreateCmpXchgInstFun CreateCmpXchg) {
  LLVMContext &Ctx = Builder.getContext();
  BasicBlock *BB = Builder.GetInsertBlock();
  Function *F = BB->getParent();

  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
  //
  // The standard expansion we produce is:
  //     [...]
  //     %init_loaded = load atomic iN* %addr
  //     br label %loop
  // loop:
  //     %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ]
  //     %new = some_op iN %loaded, %incr
  //     %pair = cmpxchg iN* %addr, iN %loaded, iN %new
  //     %new_loaded = extractvalue { iN, i1 } %pair, 0
  //     %success = extractvalue { iN, i1 } %pair, 1
  //     br i1 %success, label %atomicrmw.end, label %loop
  // atomicrmw.end:
  //     [...]
  BasicBlock *ExitBB =
      BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
  BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we want a load. It's easiest to just remove
  // the branch entirely.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  LoadInst *InitLoaded = Builder.CreateLoad(ResultTy, Addr);
  // Atomics require at least natural alignment.
  InitLoaded->setAlignment(Align(ResultTy->getPrimitiveSizeInBits() / 8));
  Builder.CreateBr(LoopBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(LoopBB);
  PHINode *Loaded = Builder.CreatePHI(ResultTy, 2, "loaded");
  Loaded->addIncoming(InitLoaded, BB);

  Value *NewVal = PerformOp(Builder, Loaded);

  Value *NewLoaded = nullptr;
  Value *Success = nullptr;

  CreateCmpXchg(Builder, Addr, Loaded, NewVal,
                MemOpOrder == AtomicOrdering::Unordered
                    ? AtomicOrdering::Monotonic
                    : MemOpOrder,
                Success, NewLoaded);
  assert(Success && NewLoaded);

  Loaded->addIncoming(NewLoaded, LoopBB);

  Builder.CreateCondBr(Success, ExitBB, LoopBB);

  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
  return NewLoaded;
}

bool AtomicExpand::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
  unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
  unsigned ValueSize = getAtomicOpSize(CI);

  switch (TLI->shouldExpandAtomicCmpXchgInIR(CI)) {
  default:
    llvm_unreachable("Unhandled case in tryExpandAtomicCmpXchg");
  case TargetLoweringBase::AtomicExpansionKind::None:
    if (ValueSize < MinCASSize)
      expandPartwordCmpXchg(CI);
    return false;
  case TargetLoweringBase::AtomicExpansionKind::LLSC:
    return expandAtomicCmpXchg(CI);
  case TargetLoweringBase::AtomicExpansionKind::MaskedIntrinsic:
    expandAtomicCmpXchgToMaskedIntrinsic(CI);
    return true;
  }
}

// Note: This function is exposed externally by AtomicExpandUtils.h
bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
                                    CreateCmpXchgInstFun CreateCmpXchg) {
  IRBuilder<> Builder(AI);
  Value *Loaded = AtomicExpand::insertRMWCmpXchgLoop(
      Builder, AI->getType(), AI->getPointerOperand(), AI->getOrdering(),
      [&](IRBuilder<> &Builder, Value *Loaded) {
        return performAtomicOp(AI->getOperation(), Builder, Loaded,
                               AI->getValOperand());
      },
      CreateCmpXchg);

  AI->replaceAllUsesWith(Loaded);
  AI->eraseFromParent();
  return true;
}

// In order to use one of the sized library calls such as
// __atomic_fetch_add_4, the alignment must be sufficient, the size
// must be one of the potentially-specialized sizes, and the value
// type must actually exist in C on the target (otherwise, the
// function wouldn't actually be defined.)
static bool canUseSizedAtomicCall(unsigned Size, Align Alignment,
                                  const DataLayout &DL) {
  // TODO: "LargestSize" is an approximation for "largest type that
  // you can express in C". It seems to be the case that int128 is
  // supported on all 64-bit platforms, otherwise only up to 64-bit
  // integers are supported. If we get this wrong, then we'll try to
  // call a sized libcall that doesn't actually exist. There should
  // really be some more reliable way in LLVM of determining integer
  // sizes which are valid in the target's C ABI...
  unsigned LargestSize = DL.getLargestLegalIntTypeSizeInBits() >= 64 ? 16 : 8;
  return Alignment >= Size &&
         (Size == 1 || Size == 2 || Size == 4 || Size == 8 || Size == 16) &&
         Size <= LargestSize;
}
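
// For example (illustrative values only), on a typical 64-bit target where
// getLargestLegalIntTypeSizeInBits() >= 64:
//   Size = 4,  Alignment = 4  -> true  (e.g. __atomic_fetch_add_4 is usable)
//   Size = 4,  Alignment = 2  -> false (under-aligned; use the generic call)
//   Size = 3,  any alignment  -> false (not a specialized size)
//   Size = 16, Alignment = 16 -> true here, but false on targets whose
//                                largest legal C integer is 64 bits.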

void AtomicExpand::expandAtomicLoadToLibcall(LoadInst *I) {
  static const RTLIB::Libcall Libcalls[6] = {
      RTLIB::ATOMIC_LOAD,   RTLIB::ATOMIC_LOAD_1, RTLIB::ATOMIC_LOAD_2,
      RTLIB::ATOMIC_LOAD_4, RTLIB::ATOMIC_LOAD_8, RTLIB::ATOMIC_LOAD_16};
  unsigned Size = getAtomicOpSize(I);

  bool expanded = expandAtomicOpToLibcall(
      I, Size, I->getAlign(), I->getPointerOperand(), nullptr, nullptr,
      I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
  (void)expanded;
  assert(expanded && "expandAtomicOpToLibcall shouldn't fail for Load");
}

void AtomicExpand::expandAtomicStoreToLibcall(StoreInst *I) {
  static const RTLIB::Libcall Libcalls[6] = {
      RTLIB::ATOMIC_STORE,   RTLIB::ATOMIC_STORE_1, RTLIB::ATOMIC_STORE_2,
      RTLIB::ATOMIC_STORE_4, RTLIB::ATOMIC_STORE_8, RTLIB::ATOMIC_STORE_16};
  unsigned Size = getAtomicOpSize(I);

  bool expanded = expandAtomicOpToLibcall(
      I, Size, I->getAlign(), I->getPointerOperand(), I->getValueOperand(),
      nullptr, I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
  (void)expanded;
  assert(expanded && "expandAtomicOpToLibcall shouldn't fail for Store");
}

void AtomicExpand::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) {
  static const RTLIB::Libcall Libcalls[6] = {
      RTLIB::ATOMIC_COMPARE_EXCHANGE,   RTLIB::ATOMIC_COMPARE_EXCHANGE_1,
      RTLIB::ATOMIC_COMPARE_EXCHANGE_2, RTLIB::ATOMIC_COMPARE_EXCHANGE_4,
      RTLIB::ATOMIC_COMPARE_EXCHANGE_8, RTLIB::ATOMIC_COMPARE_EXCHANGE_16};
  unsigned Size = getAtomicOpSize(I);

  bool expanded = expandAtomicOpToLibcall(
      I, Size, I->getAlign(), I->getPointerOperand(), I->getNewValOperand(),
      I->getCompareOperand(), I->getSuccessOrdering(), I->getFailureOrdering(),
      Libcalls);
  (void)expanded;
  assert(expanded && "expandAtomicOpToLibcall shouldn't fail for CAS");
}
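
// As an illustration (a sketch; the exact pointer casts and attributes are
// produced by expandAtomicOpToLibcall below), an aligned 4-byte atomic load
// such as
//
//   %v = load atomic i32, i32* %p seq_cst, align 4
//
// becomes a sized libcall along the lines of
//
//   %v = call i32 @__atomic_load_4(i8* %p.i8, i32 5)
//
// where 5 is the C ABI encoding of seq_cst, while an under-aligned or
// oddly-sized load falls back to the generic
// __atomic_load(size, ptr, ret, order) form, returning its result through a
// temporary alloca.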

static ArrayRef<RTLIB::Libcall> GetRMWLibcall(AtomicRMWInst::BinOp Op) {
  static const RTLIB::Libcall LibcallsXchg[6] = {
      RTLIB::ATOMIC_EXCHANGE,   RTLIB::ATOMIC_EXCHANGE_1,
      RTLIB::ATOMIC_EXCHANGE_2, RTLIB::ATOMIC_EXCHANGE_4,
      RTLIB::ATOMIC_EXCHANGE_8, RTLIB::ATOMIC_EXCHANGE_16};
  static const RTLIB::Libcall LibcallsAdd[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_ADD_1,
      RTLIB::ATOMIC_FETCH_ADD_2, RTLIB::ATOMIC_FETCH_ADD_4,
      RTLIB::ATOMIC_FETCH_ADD_8, RTLIB::ATOMIC_FETCH_ADD_16};
  static const RTLIB::Libcall LibcallsSub[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_SUB_1,
      RTLIB::ATOMIC_FETCH_SUB_2, RTLIB::ATOMIC_FETCH_SUB_4,
      RTLIB::ATOMIC_FETCH_SUB_8, RTLIB::ATOMIC_FETCH_SUB_16};
  static const RTLIB::Libcall LibcallsAnd[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_AND_1,
      RTLIB::ATOMIC_FETCH_AND_2, RTLIB::ATOMIC_FETCH_AND_4,
      RTLIB::ATOMIC_FETCH_AND_8, RTLIB::ATOMIC_FETCH_AND_16};
  static const RTLIB::Libcall LibcallsOr[6] = {
      RTLIB::UNKNOWN_LIBCALL,   RTLIB::ATOMIC_FETCH_OR_1,
      RTLIB::ATOMIC_FETCH_OR_2, RTLIB::ATOMIC_FETCH_OR_4,
      RTLIB::ATOMIC_FETCH_OR_8, RTLIB::ATOMIC_FETCH_OR_16};
  static const RTLIB::Libcall LibcallsXor[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_XOR_1,
      RTLIB::ATOMIC_FETCH_XOR_2, RTLIB::ATOMIC_FETCH_XOR_4,
      RTLIB::ATOMIC_FETCH_XOR_8, RTLIB::ATOMIC_FETCH_XOR_16};
  static const RTLIB::Libcall LibcallsNand[6] = {
      RTLIB::UNKNOWN_LIBCALL,     RTLIB::ATOMIC_FETCH_NAND_1,
      RTLIB::ATOMIC_FETCH_NAND_2, RTLIB::ATOMIC_FETCH_NAND_4,
      RTLIB::ATOMIC_FETCH_NAND_8, RTLIB::ATOMIC_FETCH_NAND_16};

  switch (Op) {
  case AtomicRMWInst::BAD_BINOP:
    llvm_unreachable("Should not have BAD_BINOP.");
  case AtomicRMWInst::Xchg:
    return makeArrayRef(LibcallsXchg);
  case AtomicRMWInst::Add:
    return makeArrayRef(LibcallsAdd);
  case AtomicRMWInst::Sub:
    return makeArrayRef(LibcallsSub);
  case AtomicRMWInst::And:
    return makeArrayRef(LibcallsAnd);
  case AtomicRMWInst::Or:
    return makeArrayRef(LibcallsOr);
  case AtomicRMWInst::Xor:
    return makeArrayRef(LibcallsXor);
  case AtomicRMWInst::Nand:
    return makeArrayRef(LibcallsNand);
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
    // No atomic libcalls are available for max/min/umax/umin or for the
    // floating-point operations.
    return {};
  }
  llvm_unreachable("Unexpected AtomicRMW operation.");
}

void AtomicExpand::expandAtomicRMWToLibcall(AtomicRMWInst *I) {
  ArrayRef<RTLIB::Libcall> Libcalls = GetRMWLibcall(I->getOperation());

  unsigned Size = getAtomicOpSize(I);

  bool Success = false;
  if (!Libcalls.empty())
    Success = expandAtomicOpToLibcall(
        I, Size, I->getAlign(), I->getPointerOperand(), I->getValOperand(),
        nullptr, I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);

  // The expansion failed: either there were no libcalls at all for
  // the operation (min/max), or there were only size-specialized
  // libcalls (add/sub/etc) and we needed a generic one. So, expand to a
  // CAS libcall, via a CAS loop, instead.
  if (!Success) {
    expandAtomicRMWToCmpXchg(I, [this](IRBuilder<> &Builder, Value *Addr,
                                       Value *Loaded, Value *NewVal,
                                       AtomicOrdering MemOpOrder,
                                       Value *&Success, Value *&NewLoaded) {
      // Create the CAS instruction normally...
      AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
          Addr, Loaded, NewVal, MemOpOrder,
          AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
      Success = Builder.CreateExtractValue(Pair, 1, "success");
      NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");

      // ...and then expand the CAS into a libcall.
      expandAtomicCASToLibcall(Pair);
    });
  }
}
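
// For instance (sketch only; names are illustrative), an operation with no
// __atomic_* RMW libcall at all, such as
//
//   %old = atomicrmw max i32* %p, i32 %v seq_cst
//
// is rewritten by the lambda above into a retry loop whose body computes the
// max and attempts a cmpxchg, and that cmpxchg is in turn lowered to
// __atomic_compare_exchange_4 (or the generic __atomic_compare_exchange) by
// expandAtomicCASToLibcall:
//
//   atomicrmw.start:
//     %loaded = phi i32 [ %init, %entry ], [ %newloaded, %atomicrmw.start ]
//     %cmp = icmp sgt i32 %loaded, %v
//     %new = select i1 %cmp, i32 %loaded, i32 %v
//     ; cmpxchg of %loaded -> %new, itself expanded to the CAS libcall
//     br i1 %success, label %atomicrmw.end, label %atomicrmw.start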

// A helper routine for the above expandAtomic*ToLibcall functions.
//
// 'Libcalls' contains an array of enum values for the particular
// ATOMIC libcalls to be emitted. All of the other arguments besides
// 'I' are extracted from the Instruction subclass by the
// caller. Depending on the particular call, some will be null.
bool AtomicExpand::expandAtomicOpToLibcall(
    Instruction *I, unsigned Size, Align Alignment, Value *PointerOperand,
    Value *ValueOperand, Value *CASExpected, AtomicOrdering Ordering,
    AtomicOrdering Ordering2, ArrayRef<RTLIB::Libcall> Libcalls) {
  assert(Libcalls.size() == 6);

  LLVMContext &Ctx = I->getContext();
  Module *M = I->getModule();
  const DataLayout &DL = M->getDataLayout();
  IRBuilder<> Builder(I);
  IRBuilder<> AllocaBuilder(&I->getFunction()->getEntryBlock().front());

  bool UseSizedLibcall = canUseSizedAtomicCall(Size, Alignment, DL);
  Type *SizedIntTy = Type::getIntNTy(Ctx, Size * 8);

  const Align AllocaAlignment = DL.getPrefTypeAlign(SizedIntTy);

  // TODO: the "order" argument type is "int", not int32. So
  // getInt32Ty may be wrong if the arch uses e.g. 16-bit ints.
  ConstantInt *SizeVal64 = ConstantInt::get(Type::getInt64Ty(Ctx), Size);
  assert(Ordering != AtomicOrdering::NotAtomic && "expect atomic MO");
  Constant *OrderingVal =
      ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering));
  Constant *Ordering2Val = nullptr;
  if (CASExpected) {
    assert(Ordering2 != AtomicOrdering::NotAtomic && "expect atomic MO");
    Ordering2Val =
        ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering2));
  }
  bool HasResult = I->getType() != Type::getVoidTy(Ctx);

  RTLIB::Libcall RTLibType;
  if (UseSizedLibcall) {
    switch (Size) {
    case 1: RTLibType = Libcalls[1]; break;
    case 2: RTLibType = Libcalls[2]; break;
    case 4: RTLibType = Libcalls[3]; break;
    case 8: RTLibType = Libcalls[4]; break;
    case 16: RTLibType = Libcalls[5]; break;
    }
  } else if (Libcalls[0] != RTLIB::UNKNOWN_LIBCALL) {
    RTLibType = Libcalls[0];
  } else {
    // Can't use a sized function, and there's no generic for this
    // operation, so give up.
    return false;
  }

  // Build up the function call. There are two kinds. First, the sized
  // variants. These calls are going to be one of the following (with
  // N=1,2,4,8,16):
  //  iN    __atomic_load_N(iN *ptr, int ordering)
  //  void  __atomic_store_N(iN *ptr, iN val, int ordering)
  //  iN    __atomic_{exchange|fetch_*}_N(iN *ptr, iN val, int ordering)
  //  bool  __atomic_compare_exchange_N(iN *ptr, iN *expected, iN desired,
  //                                    int success_order, int failure_order)
  //
  // Note that these functions can be used for non-integer atomic
  // operations; the values just need to be bitcast to integers on the
  // way in and out.
  //
  // And, then, the generic variants. They look like the following:
  //  void __atomic_load(size_t size, void *ptr, void *ret, int ordering)
  //  void __atomic_store(size_t size, void *ptr, void *val, int ordering)
  //  void __atomic_exchange(size_t size, void *ptr, void *val, void *ret,
  //                         int ordering)
  //  bool __atomic_compare_exchange(size_t size, void *ptr, void *expected,
  //                                 void *desired, int success_order,
  //                                 int failure_order)
  //
  // The different signatures are built up depending on the
  // 'UseSizedLibcall', 'CASExpected', 'ValueOperand', and 'HasResult'
  // variables.
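  //
  // As a concrete sketch (names are illustrative, and the size argument is
  // shown for a 64-bit size_t): an i32 cmpxchg that cannot use a sized call
  // would end up roughly as
  //
  //   %expected.addr = alloca i32   ; seeded with the 'expected' value
  //   %desired.addr  = alloca i32   ; seeded with the 'desired' value
  //   %ok = call zeroext i1 @__atomic_compare_exchange(
  //             i64 4, i8* %ptr.i8, i8* %expected.i8, i8* %desired.i8,
  //             i32 %success_order, i32 %failure_order)
  //
  // with the loaded value read back out of %expected.addr afterwards. The
  // code below assembles these pieces one argument at a time.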

  AllocaInst *AllocaCASExpected = nullptr;
  Value *AllocaCASExpected_i8 = nullptr;
  AllocaInst *AllocaValue = nullptr;
  Value *AllocaValue_i8 = nullptr;
  AllocaInst *AllocaResult = nullptr;
  Value *AllocaResult_i8 = nullptr;

  Type *ResultTy;
  SmallVector<Value *, 6> Args;
  AttributeList Attr;

  // 'size' argument.
  if (!UseSizedLibcall) {
    // Note, getIntPtrType is assumed equivalent to size_t.
    Args.push_back(ConstantInt::get(DL.getIntPtrType(Ctx), Size));
  }

  // 'ptr' argument.
  // Note: This assumes all address spaces share a common libfunc
  // implementation and that addresses are convertible. For systems without
  // that property, we'd need to extend this mechanism to support AS-specific
  // families of atomic intrinsics.
  auto PtrTypeAS = PointerOperand->getType()->getPointerAddressSpace();
  Value *PtrVal =
      Builder.CreateBitCast(PointerOperand, Type::getInt8PtrTy(Ctx, PtrTypeAS));
  PtrVal = Builder.CreateAddrSpaceCast(PtrVal, Type::getInt8PtrTy(Ctx));
  Args.push_back(PtrVal);

  // 'expected' argument, if present.
  if (CASExpected) {
    AllocaCASExpected = AllocaBuilder.CreateAlloca(CASExpected->getType());
    AllocaCASExpected->setAlignment(AllocaAlignment);
    unsigned AllocaAS = AllocaCASExpected->getType()->getPointerAddressSpace();

    AllocaCASExpected_i8 = Builder.CreateBitCast(
        AllocaCASExpected, Type::getInt8PtrTy(Ctx, AllocaAS));
    Builder.CreateLifetimeStart(AllocaCASExpected_i8, SizeVal64);
    Builder.CreateAlignedStore(CASExpected, AllocaCASExpected, AllocaAlignment);
    Args.push_back(AllocaCASExpected_i8);
  }

  // 'val' argument ('desired' for cas), if present.
  if (ValueOperand) {
    if (UseSizedLibcall) {
      Value *IntValue =
          Builder.CreateBitOrPointerCast(ValueOperand, SizedIntTy);
      Args.push_back(IntValue);
    } else {
      AllocaValue = AllocaBuilder.CreateAlloca(ValueOperand->getType());
      AllocaValue->setAlignment(AllocaAlignment);
      AllocaValue_i8 =
          Builder.CreateBitCast(AllocaValue, Type::getInt8PtrTy(Ctx));
      Builder.CreateLifetimeStart(AllocaValue_i8, SizeVal64);
      Builder.CreateAlignedStore(ValueOperand, AllocaValue, AllocaAlignment);
      Args.push_back(AllocaValue_i8);
    }
  }

  // 'ret' argument.
  if (!CASExpected && HasResult && !UseSizedLibcall) {
    AllocaResult = AllocaBuilder.CreateAlloca(I->getType());
    AllocaResult->setAlignment(AllocaAlignment);
    unsigned AllocaAS = AllocaResult->getType()->getPointerAddressSpace();
    AllocaResult_i8 =
        Builder.CreateBitCast(AllocaResult, Type::getInt8PtrTy(Ctx, AllocaAS));
    Builder.CreateLifetimeStart(AllocaResult_i8, SizeVal64);
    Args.push_back(AllocaResult_i8);
  }

  // 'ordering' ('success_order' for cas) argument.
  Args.push_back(OrderingVal);

  // 'failure_order' argument, if present.
  if (Ordering2Val)
    Args.push_back(Ordering2Val);

  // Now, the return type.
  if (CASExpected) {
    ResultTy = Type::getInt1Ty(Ctx);
    Attr = Attr.addAttribute(Ctx, AttributeList::ReturnIndex, Attribute::ZExt);
  } else if (HasResult && UseSizedLibcall)
    ResultTy = SizedIntTy;
  else
    ResultTy = Type::getVoidTy(Ctx);

  // Done with setting up arguments and return types; create the call:
  SmallVector<Type *, 6> ArgTys;
  for (Value *Arg : Args)
    ArgTys.push_back(Arg->getType());
  FunctionType *FnType = FunctionType::get(ResultTy, ArgTys, false);
  FunctionCallee LibcallFn =
      M->getOrInsertFunction(TLI->getLibcallName(RTLibType), FnType, Attr);
  CallInst *Call = Builder.CreateCall(LibcallFn, Args);
  Call->setAttributes(Attr);
  Value *Result = Call;

  // And then, extract the results...
  if (ValueOperand && !UseSizedLibcall)
    Builder.CreateLifetimeEnd(AllocaValue_i8, SizeVal64);

  if (CASExpected) {
    // The final result from the CAS is {load of 'expected' alloca, bool result
    // from call}.
    Type *FinalResultTy = I->getType();
    Value *V = UndefValue::get(FinalResultTy);
    Value *ExpectedOut = Builder.CreateAlignedLoad(
        CASExpected->getType(), AllocaCASExpected, AllocaAlignment);
    Builder.CreateLifetimeEnd(AllocaCASExpected_i8, SizeVal64);
    V = Builder.CreateInsertValue(V, ExpectedOut, 0);
    V = Builder.CreateInsertValue(V, Result, 1);
    I->replaceAllUsesWith(V);
  } else if (HasResult) {
    Value *V;
    if (UseSizedLibcall)
      V = Builder.CreateBitOrPointerCast(Result, I->getType());
    else {
      V = Builder.CreateAlignedLoad(I->getType(), AllocaResult,
                                    AllocaAlignment);
      Builder.CreateLifetimeEnd(AllocaResult_i8, SizeVal64);
    }
    I->replaceAllUsesWith(V);
  }
  I->eraseFromParent();
  return true;
}
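
// One more illustrative sketch (not verbatim output): because the sized
// libcalls traffic in plain iN values, a non-integer atomic such as
//
//   %f = load atomic float, float* %p seq_cst, align 4
//
// takes the HasResult/UseSizedLibcall path above and becomes roughly
//
//   %r = call i32 @__atomic_load_4(i8* %p.i8, i32 5)
//   %f = bitcast i32 %r to float
//
// i.e. the value is only reinterpreted on the way in and out, as the
// signature comment in expandAtomicOpToLibcall notes.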