1 //===- InstCombineLoadStoreAlloca.cpp -------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the visit functions for load, store and alloca.
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "InstCombineInternal.h"
14 #include "llvm/ADT/MapVector.h"
15 #include "llvm/ADT/SmallString.h"
16 #include "llvm/ADT/Statistic.h"
17 #include "llvm/Analysis/AliasAnalysis.h"
18 #include "llvm/Analysis/Loads.h"
19 #include "llvm/IR/DataLayout.h"
20 #include "llvm/IR/DebugInfoMetadata.h"
21 #include "llvm/IR/IntrinsicInst.h"
22 #include "llvm/IR/LLVMContext.h"
23 #include "llvm/IR/PatternMatch.h"
24 #include "llvm/Transforms/InstCombine/InstCombiner.h"
25 #include "llvm/Transforms/Utils/Local.h"
26 using namespace llvm;
27 using namespace PatternMatch;
28
29 #define DEBUG_TYPE "instcombine"
30
31 STATISTIC(NumDeadStore, "Number of dead stores eliminated");
32 STATISTIC(NumGlobalCopies, "Number of allocas copied from constant global");
33
34 /// isOnlyCopiedFromConstantMemory - Recursively walk the uses of a (derived)
35 /// pointer to an alloca. Ignore any reads of the pointer, return false if we
36 /// see any stores or other unknown uses. If we see pointer arithmetic, keep
37 /// track of whether it moves the pointer (with IsOffset) but otherwise traverse
38 /// the uses. If we see a memcpy/memmove that targets an unoffseted pointer to
39 /// the alloca, and if the source pointer is a pointer to a constant global, we
40 /// can optimize this.
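///
/// A minimal illustration (hypothetical IR, not taken from the sources):
///   @G = private constant [4 x i32] [i32 1, i32 2, i32 3, i32 4]
///   %A = alloca [4 x i32]
///   call void @llvm.memcpy... (%A as destination, @G as source, 16 bytes)
///   %v = load i32, ...              ; only reads of %A after the copy
/// The walk sees only the memcpy and loads, so TheCopy is set and the caller
/// may forward uses of %A to @G.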
41 static bool
42 isOnlyCopiedFromConstantMemory(AAResults *AA,
43 Value *V, MemTransferInst *&TheCopy,
44 SmallVectorImpl<Instruction *> &ToDelete) {
45 // We track lifetime intrinsics as we encounter them. If we decide to go
46 // ahead and replace the value with the global, this lets the caller quickly
47 // eliminate the markers.
48
49 SmallVector<std::pair<Value *, bool>, 35> ValuesToInspect;
50 ValuesToInspect.emplace_back(V, false);
51 while (!ValuesToInspect.empty()) {
52 auto ValuePair = ValuesToInspect.pop_back_val();
53 const bool IsOffset = ValuePair.second;
54 for (auto &U : ValuePair.first->uses()) {
55 auto *I = cast<Instruction>(U.getUser());
56
57 if (auto *LI = dyn_cast<LoadInst>(I)) {
58 // Ignore non-volatile loads; they are always ok.
59 if (!LI->isSimple()) return false;
60 continue;
61 }
62
63 if (isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I)) {
64 // If uses of the bitcast are ok, we are ok.
65 ValuesToInspect.emplace_back(I, IsOffset);
66 continue;
67 }
68 if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
69 // If the GEP has all zero indices, it doesn't offset the pointer. If any
70 // index is non-zero, it does.
71 ValuesToInspect.emplace_back(I, IsOffset || !GEP->hasAllZeroIndices());
72 continue;
73 }
74
75 if (auto *Call = dyn_cast<CallBase>(I)) {
76 // If this is the function being called then we treat it like a load and
77 // ignore it.
78 if (Call->isCallee(&U))
79 continue;
80
81 unsigned DataOpNo = Call->getDataOperandNo(&U);
82 bool IsArgOperand = Call->isArgOperand(&U);
83
84 // Inalloca arguments are clobbered by the call.
85 if (IsArgOperand && Call->isInAllocaArgument(DataOpNo))
86 return false;
87
88 // If this is a readonly/readnone call site, then we know it is just a
89 // load (but one that potentially returns the value itself), so we can
90 // ignore it if we know that the value isn't captured.
91 if (Call->onlyReadsMemory() &&
92 (Call->use_empty() || Call->doesNotCapture(DataOpNo)))
93 continue;
94
95 // If this is being passed as a byval argument, the caller is making a
96 // copy, so it is only a read of the alloca.
97 if (IsArgOperand && Call->isByValArgument(DataOpNo))
98 continue;
99 }
100
101 // Lifetime intrinsics can be handled by the caller.
102 if (I->isLifetimeStartOrEnd()) {
103 assert(I->use_empty() && "Lifetime markers have no result to use!");
104 ToDelete.push_back(I);
105 continue;
106 }
107
108 // If this isn't our memcpy/memmove, reject it as something we can't
109 // handle.
110 MemTransferInst *MI = dyn_cast<MemTransferInst>(I);
111 if (!MI)
112 return false;
113
114 // If the transfer is using the alloca as its source, then ignore it since
115 // it is a load (unless the transfer is volatile).
116 if (U.getOperandNo() == 1) {
117 if (MI->isVolatile()) return false;
118 continue;
119 }
120
121 // If we already have seen a copy, reject the second one.
122 if (TheCopy) return false;
123
124 // If the pointer has been offset from the start of the alloca, we can't
125 // safely handle this.
126 if (IsOffset) return false;
127
128 // If the memintrinsic isn't using the alloca as the dest, reject it.
129 if (U.getOperandNo() != 0) return false;
130
131 // If the source of the memcpy/move is not a constant global, reject it.
132 if (!AA->pointsToConstantMemory(MI->getSource()))
133 return false;
134
135 // Otherwise, the transform is safe. Remember the copy instruction.
136 TheCopy = MI;
137 }
138 }
139 return true;
140 }
141
142 /// isOnlyCopiedFromConstantMemory - Return the copy instruction if the specified
143 /// alloca is only modified by a copy from constant memory. If we can prove this,
144 /// we can replace any uses of the alloca with uses of the source directly.
145 static MemTransferInst *
146 isOnlyCopiedFromConstantMemory(AAResults *AA,
147 AllocaInst *AI,
148 SmallVectorImpl<Instruction *> &ToDelete) {
149 MemTransferInst *TheCopy = nullptr;
150 if (isOnlyCopiedFromConstantMemory(AA, AI, TheCopy, ToDelete))
151 return TheCopy;
152 return nullptr;
153 }
154
155 /// Returns true if V is dereferenceable for size of alloca.
156 static bool isDereferenceableForAllocaSize(const Value *V, const AllocaInst *AI,
157 const DataLayout &DL) {
158 if (AI->isArrayAllocation())
159 return false;
160 uint64_t AllocaSize = DL.getTypeStoreSize(AI->getAllocatedType());
161 if (!AllocaSize)
162 return false;
163 return isDereferenceableAndAlignedPointer(V, AI->getAlign(),
164 APInt(64, AllocaSize), DL);
165 }
166
167 static Instruction *simplifyAllocaArraySize(InstCombinerImpl &IC,
168 AllocaInst &AI) {
169 // Check for array size of 1 (scalar allocation).
170 if (!AI.isArrayAllocation()) {
171 // i32 1 is the canonical array size for scalar allocations.
172 if (AI.getArraySize()->getType()->isIntegerTy(32))
173 return nullptr;
174
175 // Canonicalize it.
176 return IC.replaceOperand(AI, 0, IC.Builder.getInt32(1));
177 }
178
179 // Convert: alloca Ty, C - where C is a constant != 1 into: alloca [C x Ty], 1
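// For example (illustrative IR, hypothetical names):
//   %buf = alloca i32, i32 8, align 4
// becomes roughly
//   %buf1 = alloca [8 x i32], align 4
//   %buf1.sub = getelementptr inbounds [8 x i32], [8 x i32]* %buf1, i64 0, i64 0
// and all uses of the original %buf are pointed at the ".sub" GEP.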
180 if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) {
181 if (C->getValue().getActiveBits() <= 64) {
182 Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue());
183 AllocaInst *New = IC.Builder.CreateAlloca(NewTy, AI.getAddressSpace(),
184 nullptr, AI.getName());
185 New->setAlignment(AI.getAlign());
186
187 // Scan to the end of the allocation instructions, to skip over a block of
188 // allocas if possible...also skip interleaved debug info
189 //
190 BasicBlock::iterator It(New);
191 while (isa<AllocaInst>(*It) || isa<DbgInfoIntrinsic>(*It))
192 ++It;
193
194 // Now that It is pointing to the first non-allocation-inst in the block,
195 // insert our getelementptr instruction...
196 //
197 Type *IdxTy = IC.getDataLayout().getIntPtrType(AI.getType());
198 Value *NullIdx = Constant::getNullValue(IdxTy);
199 Value *Idx[2] = {NullIdx, NullIdx};
200 Instruction *GEP = GetElementPtrInst::CreateInBounds(
201 NewTy, New, Idx, New->getName() + ".sub");
202 IC.InsertNewInstBefore(GEP, *It);
203
204 // Now make everything use the getelementptr instead of the original
205 // allocation.
206 return IC.replaceInstUsesWith(AI, GEP);
207 }
208 }
209
210 if (isa<UndefValue>(AI.getArraySize()))
211 return IC.replaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));
212
213 // Ensure that the alloca array size argument has type intptr_t, so that
214 // any casting is exposed early.
215 Type *IntPtrTy = IC.getDataLayout().getIntPtrType(AI.getType());
216 if (AI.getArraySize()->getType() != IntPtrTy) {
217 Value *V = IC.Builder.CreateIntCast(AI.getArraySize(), IntPtrTy, false);
218 return IC.replaceOperand(AI, 0, V);
219 }
220
221 return nullptr;
222 }
223
224 namespace {
225 // If I and V are pointers in different address spaces, it is not allowed to
226 // use replaceAllUsesWith since I and V have different types. A
227 // non-target-specific transformation should not use addrspacecast on V since
228 // the two address spaces may be disjoint depending on the target.
229 //
230 // This class chases down uses of the old pointer until reaching the load
231 // instructions, then replaces the old pointer in the load instructions with
232 // the new pointer. If during the chasing it sees bitcast or GEP, it will
233 // create new bitcast or GEP with the new pointer and use them in the load
234 // instruction.
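//
// For instance (illustrative, hypothetical IR): when an alloca in addrspace(5)
// is found to be a copy of a constant in addrspace(4), a user such as
//   %v = load i32, i32 addrspace(5)* %gep
// is recreated as a load through a new GEP of the addrspace(4) pointer rather
// than inserting an addrspacecast.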
235 class PointerReplacer {
236 public:
237 PointerReplacer(InstCombinerImpl &IC) : IC(IC) {}
238
239 bool collectUsers(Instruction &I);
240 void replacePointer(Instruction &I, Value *V);
241
242 private:
243 void replace(Instruction *I);
244 Value *getReplacement(Value *I);
245
246 SmallSetVector<Instruction *, 4> Worklist;
247 MapVector<Value *, Value *> WorkMap;
248 InstCombinerImpl &IC;
249 };
250 } // end anonymous namespace
251
252 bool PointerReplacer::collectUsers(Instruction &I) {
253 for (auto U : I.users()) {
254 auto *Inst = cast<Instruction>(&*U);
255 if (auto *Load = dyn_cast<LoadInst>(Inst)) {
256 if (Load->isVolatile())
257 return false;
258 Worklist.insert(Load);
259 } else if (isa<GetElementPtrInst>(Inst) || isa<BitCastInst>(Inst)) {
260 Worklist.insert(Inst);
261 if (!collectUsers(*Inst))
262 return false;
263 } else if (auto *MI = dyn_cast<MemTransferInst>(Inst)) {
264 if (MI->isVolatile())
265 return false;
266 Worklist.insert(Inst);
267 } else if (Inst->isLifetimeStartOrEnd()) {
268 continue;
269 } else {
270 LLVM_DEBUG(dbgs() << "Cannot handle pointer user: " << *U << '\n');
271 return false;
272 }
273 }
274
275 return true;
276 }
277
278 Value *PointerReplacer::getReplacement(Value *V) { return WorkMap.lookup(V); }
279
280 void PointerReplacer::replace(Instruction *I) {
281 if (getReplacement(I))
282 return;
283
284 if (auto *LT = dyn_cast<LoadInst>(I)) {
285 auto *V = getReplacement(LT->getPointerOperand());
286 assert(V && "Operand not replaced");
287 auto *NewI = new LoadInst(LT->getType(), V, "", LT->isVolatile(),
288 LT->getAlign(), LT->getOrdering(),
289 LT->getSyncScopeID());
290 NewI->takeName(LT);
291 copyMetadataForLoad(*NewI, *LT);
292
293 IC.InsertNewInstWith(NewI, *LT);
294 IC.replaceInstUsesWith(*LT, NewI);
295 WorkMap[LT] = NewI;
296 } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
297 auto *V = getReplacement(GEP->getPointerOperand());
298 assert(V && "Operand not replaced");
299 SmallVector<Value *, 8> Indices;
300 Indices.append(GEP->idx_begin(), GEP->idx_end());
301 auto *NewI =
302 GetElementPtrInst::Create(GEP->getSourceElementType(), V, Indices);
303 IC.InsertNewInstWith(NewI, *GEP);
304 NewI->takeName(GEP);
305 WorkMap[GEP] = NewI;
306 } else if (auto *BC = dyn_cast<BitCastInst>(I)) {
307 auto *V = getReplacement(BC->getOperand(0));
308 assert(V && "Operand not replaced");
309 auto *NewT = PointerType::getWithSamePointeeType(
310 cast<PointerType>(BC->getType()),
311 V->getType()->getPointerAddressSpace());
312 auto *NewI = new BitCastInst(V, NewT);
313 IC.InsertNewInstWith(NewI, *BC);
314 NewI->takeName(BC);
315 WorkMap[BC] = NewI;
316 } else if (auto *MemCpy = dyn_cast<MemTransferInst>(I)) {
317 auto *SrcV = getReplacement(MemCpy->getRawSource());
318 // The pointer may appear in the destination of a copy, but we don't want to
319 // replace it.
320 if (!SrcV) {
321 assert(getReplacement(MemCpy->getRawDest()) &&
322 "destination not in replace list");
323 return;
324 }
325
326 IC.Builder.SetInsertPoint(MemCpy);
327 auto *NewI = IC.Builder.CreateMemTransferInst(
328 MemCpy->getIntrinsicID(), MemCpy->getRawDest(), MemCpy->getDestAlign(),
329 SrcV, MemCpy->getSourceAlign(), MemCpy->getLength(),
330 MemCpy->isVolatile());
331 AAMDNodes AAMD = MemCpy->getAAMetadata();
332 if (AAMD)
333 NewI->setAAMetadata(AAMD);
334
335 IC.eraseInstFromFunction(*MemCpy);
336 WorkMap[MemCpy] = NewI;
337 } else {
338 llvm_unreachable("should never reach here");
339 }
340 }
341
342 void PointerReplacer::replacePointer(Instruction &I, Value *V) {
343 #ifndef NDEBUG
344 auto *PT = cast<PointerType>(I.getType());
345 auto *NT = cast<PointerType>(V->getType());
346 assert(PT != NT && PT->hasSameElementTypeAs(NT) && "Invalid usage");
347 #endif
348 WorkMap[&I] = V;
349
350 for (Instruction *Workitem : Worklist)
351 replace(Workitem);
352 }
353
354 Instruction *InstCombinerImpl::visitAllocaInst(AllocaInst &AI) {
355 if (auto *I = simplifyAllocaArraySize(*this, AI))
356 return I;
357
358 if (AI.getAllocatedType()->isSized()) {
359 // Move all allocas of zero-byte objects to the entry block and merge them
360 // together. Note that we only do this for allocas, because malloc should
361 // allocate and return a unique pointer, even for a zero-byte allocation.
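// For instance (illustrative): every "%e = alloca {}" or "%z = alloca [0 x i32]"
// in the function ends up sharing a single alloca at the top of the entry block.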
362 if (DL.getTypeAllocSize(AI.getAllocatedType()).getKnownMinSize() == 0) {
363 // For a zero sized alloca there is no point in doing an array allocation.
364 // This is helpful if the array size is a complicated expression not used
365 // elsewhere.
366 if (AI.isArrayAllocation())
367 return replaceOperand(AI, 0,
368 ConstantInt::get(AI.getArraySize()->getType(), 1));
369
370 // Get the first instruction in the entry block.
371 BasicBlock &EntryBlock = AI.getParent()->getParent()->getEntryBlock();
372 Instruction *FirstInst = EntryBlock.getFirstNonPHIOrDbg();
373 if (FirstInst != &AI) {
374 // If the entry block doesn't start with a zero-size alloca then move
375 // this one to the start of the entry block. There is no problem with
376 // dominance as the array size was forced to a constant earlier already.
377 AllocaInst *EntryAI = dyn_cast<AllocaInst>(FirstInst);
378 if (!EntryAI || !EntryAI->getAllocatedType()->isSized() ||
379 DL.getTypeAllocSize(EntryAI->getAllocatedType())
380 .getKnownMinSize() != 0) {
381 AI.moveBefore(FirstInst);
382 return &AI;
383 }
384
385 // Replace this zero-sized alloca with the one at the start of the entry
386 // block after ensuring that the address will be aligned enough for both
387 // types.
388 const Align MaxAlign = std::max(EntryAI->getAlign(), AI.getAlign());
389 EntryAI->setAlignment(MaxAlign);
390 if (AI.getType() != EntryAI->getType())
391 return new BitCastInst(EntryAI, AI.getType());
392 return replaceInstUsesWith(AI, EntryAI);
393 }
394 }
395 }
396
397 // Check to see if this allocation is only modified by a memcpy/memmove from
398 // a constant whose alignment is equal to or exceeds that of the allocation.
399 // If this is the case, we can change all users to use the constant global
400 // instead. This is commonly produced by the CFE by constructs like "void
401 // foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A' is only subsequently
402 // read.
403 SmallVector<Instruction *, 4> ToDelete;
404 if (MemTransferInst *Copy = isOnlyCopiedFromConstantMemory(AA, &AI, ToDelete)) {
405 Value *TheSrc = Copy->getSource();
406 Align AllocaAlign = AI.getAlign();
407 Align SourceAlign = getOrEnforceKnownAlignment(
408 TheSrc, AllocaAlign, DL, &AI, &AC, &DT);
409 if (AllocaAlign <= SourceAlign &&
410 isDereferenceableForAllocaSize(TheSrc, &AI, DL) &&
411 !isa<Instruction>(TheSrc)) {
412 // FIXME: Can we sink instructions without violating dominance when TheSrc
413 // is an instruction instead of a constant or argument?
414 LLVM_DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');
415 LLVM_DEBUG(dbgs() << " memcpy = " << *Copy << '\n');
416 unsigned SrcAddrSpace = TheSrc->getType()->getPointerAddressSpace();
417 auto *DestTy = PointerType::get(AI.getAllocatedType(), SrcAddrSpace);
418 if (AI.getType()->getAddressSpace() == SrcAddrSpace) {
419 for (Instruction *Delete : ToDelete)
420 eraseInstFromFunction(*Delete);
421
422 Value *Cast = Builder.CreateBitCast(TheSrc, DestTy);
423 Instruction *NewI = replaceInstUsesWith(AI, Cast);
424 eraseInstFromFunction(*Copy);
425 ++NumGlobalCopies;
426 return NewI;
427 }
428
429 PointerReplacer PtrReplacer(*this);
430 if (PtrReplacer.collectUsers(AI)) {
431 for (Instruction *Delete : ToDelete)
432 eraseInstFromFunction(*Delete);
433
434 Value *Cast = Builder.CreateBitCast(TheSrc, DestTy);
435 PtrReplacer.replacePointer(AI, Cast);
436 ++NumGlobalCopies;
437 }
438 }
439 }
440
441 // At last, use the generic allocation site handler to aggressively remove
442 // unused allocas.
443 return visitAllocSite(AI);
444 }
445
446 // Are we allowed to form an atomic load or store of this type?
447 static bool isSupportedAtomicType(Type *Ty) {
448 return Ty->isIntOrPtrTy() || Ty->isFloatingPointTy();
449 }
450
451 /// Helper to combine a load to a new type.
452 ///
453 /// This just does the work of combining a load to a new type. It handles
454 /// metadata, etc., and returns the new instruction. The \c NewTy should be the
455 /// loaded *value* type. This will convert it to a pointer, cast the operand to
456 /// that pointer type, load it, etc.
457 ///
458 /// Note that this will create all of the instructions with whatever insert
459 /// point the \c InstCombinerImpl currently is using.
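///
/// For example, requesting NewTy == float for (illustrative, hypothetical IR)
///   %v = load i32, i32* %p
/// produces roughly
///   %p.cast = bitcast i32* %p to float*
///   %v.suffix = load float, float* %p.cast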
460 LoadInst *InstCombinerImpl::combineLoadToNewType(LoadInst &LI, Type *NewTy,
461 const Twine &Suffix) {
462 assert((!LI.isAtomic() || isSupportedAtomicType(NewTy)) &&
463 "can't fold an atomic load to requested type");
464
465 Value *Ptr = LI.getPointerOperand();
466 unsigned AS = LI.getPointerAddressSpace();
467 Type *NewPtrTy = NewTy->getPointerTo(AS);
468 Value *NewPtr = nullptr;
469 if (!(match(Ptr, m_BitCast(m_Value(NewPtr))) &&
470 NewPtr->getType() == NewPtrTy))
471 NewPtr = Builder.CreateBitCast(Ptr, NewPtrTy);
472
473 LoadInst *NewLoad = Builder.CreateAlignedLoad(
474 NewTy, NewPtr, LI.getAlign(), LI.isVolatile(), LI.getName() + Suffix);
475 NewLoad->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
476 copyMetadataForLoad(*NewLoad, LI);
477 return NewLoad;
478 }
479
480 /// Combine a store to a new type.
481 ///
482 /// Returns the newly created store instruction.
483 static StoreInst *combineStoreToNewValue(InstCombinerImpl &IC, StoreInst &SI,
484 Value *V) {
485 assert((!SI.isAtomic() || isSupportedAtomicType(V->getType())) &&
486 "can't fold an atomic store of requested type");
487
488 Value *Ptr = SI.getPointerOperand();
489 unsigned AS = SI.getPointerAddressSpace();
490 SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
491 SI.getAllMetadata(MD);
492
493 StoreInst *NewStore = IC.Builder.CreateAlignedStore(
494 V, IC.Builder.CreateBitCast(Ptr, V->getType()->getPointerTo(AS)),
495 SI.getAlign(), SI.isVolatile());
496 NewStore->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
497 for (const auto &MDPair : MD) {
498 unsigned ID = MDPair.first;
499 MDNode *N = MDPair.second;
500 // Note, essentially every kind of metadata should be preserved here! This
501 // routine is supposed to clone a store instruction changing *only its
502 // type*. The only metadata it makes sense to drop is metadata which is
503 // invalidated when the pointer type changes. This should essentially
504 // never be the case in LLVM, but we explicitly switch over only known
505 // metadata to be conservatively correct. If you are adding metadata to
506 // LLVM which pertains to stores, you almost certainly want to add it
507 // here.
508 switch (ID) {
509 case LLVMContext::MD_dbg:
510 case LLVMContext::MD_tbaa:
511 case LLVMContext::MD_prof:
512 case LLVMContext::MD_fpmath:
513 case LLVMContext::MD_tbaa_struct:
514 case LLVMContext::MD_alias_scope:
515 case LLVMContext::MD_noalias:
516 case LLVMContext::MD_nontemporal:
517 case LLVMContext::MD_mem_parallel_loop_access:
518 case LLVMContext::MD_access_group:
519 // All of these directly apply.
520 NewStore->setMetadata(ID, N);
521 break;
522 case LLVMContext::MD_invariant_load:
523 case LLVMContext::MD_nonnull:
524 case LLVMContext::MD_noundef:
525 case LLVMContext::MD_range:
526 case LLVMContext::MD_align:
527 case LLVMContext::MD_dereferenceable:
528 case LLVMContext::MD_dereferenceable_or_null:
529 // These don't apply for stores.
530 break;
531 }
532 }
533
534 return NewStore;
535 }
536
537 /// Returns true if the instruction represents a minmax pattern like:
538 /// select ((cmp load V1, load V2), V1, V2).
539 static bool isMinMaxWithLoads(Value *V, Type *&LoadTy) {
540 assert(V->getType()->isPointerTy() && "Expected pointer type.");
541 // Ignore possible ty* to ixx* bitcast.
542 V = InstCombiner::peekThroughBitcast(V);
543 // Check that select is select ((cmp load V1, load V2), V1, V2) - minmax
544 // pattern.
545 CmpInst::Predicate Pred;
546 Instruction *L1;
547 Instruction *L2;
548 Value *LHS;
549 Value *RHS;
550 if (!match(V, m_Select(m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2)),
551 m_Value(LHS), m_Value(RHS))))
552 return false;
553 LoadTy = L1->getType();
554 return (match(L1, m_Load(m_Specific(LHS))) &&
555 match(L2, m_Load(m_Specific(RHS)))) ||
556 (match(L1, m_Load(m_Specific(RHS))) &&
557 match(L2, m_Load(m_Specific(LHS))));
558 }
559
560 /// Combine loads to match the type of their uses' value after looking
561 /// through intervening bitcasts.
562 ///
563 /// The core idea here is that if the result of a load is used in an operation,
564 /// we should load the type most conducive to that operation. For example, when
565 /// loading an integer and immediately bitcasting it to a float, we should
566 /// instead directly load a float.
567 ///
568 /// However, this routine must never change the width of a load or the number of
569 /// loads as that would introduce a semantic change. This combine is expected to
570 /// be a semantic no-op which just allows loads to more closely model the types
571 /// of their consuming operations.
572 ///
573 /// Currently, we also refuse to change the precise type used for an atomic load
574 /// or a volatile load. This is debatable, and might be reasonable to change
575 /// later. However, it is risky in case some backend or other part of LLVM is
576 /// relying on the exact type loaded to select appropriate atomic operations.
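///
/// A minimal illustration (hypothetical IR) of the single-use cast case handled
/// below:
///   %x = load i32, i32* %p
///   %f = bitcast i32 %x to float
/// becomes a direct "load float" through a bitcast of %p, with %f's uses
/// rewritten to the new load.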
577 static Instruction *combineLoadToOperationType(InstCombinerImpl &IC,
578 LoadInst &LI) {
579 // FIXME: We could probably with some care handle both volatile and ordered
580 // atomic loads here but it isn't clear that this is important.
581 if (!LI.isUnordered())
582 return nullptr;
583
584 if (LI.use_empty())
585 return nullptr;
586
587 // swifterror values can't be bitcasted.
588 if (LI.getPointerOperand()->isSwiftError())
589 return nullptr;
590
591 const DataLayout &DL = IC.getDataLayout();
592
593 // Fold away bit casts of the loaded value by loading the desired type.
594 // Note that we should not do this for pointer<->integer casts,
595 // because that would result in type punning.
596 if (LI.hasOneUse()) {
597 // Don't transform when the type is x86_amx; this keeps the pass that lowers
598 // the x86_amx type happy.
599 if (auto *BC = dyn_cast<BitCastInst>(LI.user_back())) {
600 assert(!LI.getType()->isX86_AMXTy() &&
601 "load from x86_amx* should not happen!");
602 if (BC->getType()->isX86_AMXTy())
603 return nullptr;
604 }
605
606 if (auto* CI = dyn_cast<CastInst>(LI.user_back()))
607 if (CI->isNoopCast(DL) && LI.getType()->isPtrOrPtrVectorTy() ==
608 CI->getDestTy()->isPtrOrPtrVectorTy())
609 if (!LI.isAtomic() || isSupportedAtomicType(CI->getDestTy())) {
610 LoadInst *NewLoad = IC.combineLoadToNewType(LI, CI->getDestTy());
611 CI->replaceAllUsesWith(NewLoad);
612 IC.eraseInstFromFunction(*CI);
613 return &LI;
614 }
615 }
616
617 // FIXME: We should also canonicalize loads of vectors when their elements are
618 // cast to other types.
619 return nullptr;
620 }
621
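/// Unpack a load of a small aggregate into per-element loads. Illustrative IR
/// (hypothetical) for a load of {i32, i64}:
///   %agg = load {i32, i64}, {i32, i64}* %p
/// becomes roughly
///   %p.elt = getelementptr inbounds {i32, i64}, {i32, i64}* %p, i32 0, i32 0
///   %agg.unpack = load i32, i32* %p.elt
///   %1 = insertvalue {i32, i64} undef, i32 %agg.unpack, 0
///   ... and likewise for element 1 ...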
622 static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) {
623 // FIXME: We could probably with some care handle both volatile and atomic
624 // stores here but it isn't clear that this is important.
625 if (!LI.isSimple())
626 return nullptr;
627
628 Type *T = LI.getType();
629 if (!T->isAggregateType())
630 return nullptr;
631
632 StringRef Name = LI.getName();
633
634 if (auto *ST = dyn_cast<StructType>(T)) {
635 // If the struct only has one element, we unpack.
636 auto NumElements = ST->getNumElements();
637 if (NumElements == 1) {
638 LoadInst *NewLoad = IC.combineLoadToNewType(LI, ST->getTypeAtIndex(0U),
639 ".unpack");
640 NewLoad->setAAMetadata(LI.getAAMetadata());
641 return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue(
642 UndefValue::get(T), NewLoad, 0, Name));
643 }
644
645 // We don't want to break loads with padding here as we'd lose
646 // the knowledge that padding exists for the rest of the pipeline.
647 const DataLayout &DL = IC.getDataLayout();
648 auto *SL = DL.getStructLayout(ST);
649 if (SL->hasPadding())
650 return nullptr;
651
652 const auto Align = LI.getAlign();
653 auto *Addr = LI.getPointerOperand();
654 auto *IdxType = Type::getInt32Ty(T->getContext());
655 auto *Zero = ConstantInt::get(IdxType, 0);
656
657 Value *V = UndefValue::get(T);
658 for (unsigned i = 0; i < NumElements; i++) {
659 Value *Indices[2] = {
660 Zero,
661 ConstantInt::get(IdxType, i),
662 };
663 auto *Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices),
664 Name + ".elt");
665 auto *L = IC.Builder.CreateAlignedLoad(
666 ST->getElementType(i), Ptr,
667 commonAlignment(Align, SL->getElementOffset(i)), Name + ".unpack");
668 // Propagate AA metadata. It'll still be valid on the narrowed load.
669 L->setAAMetadata(LI.getAAMetadata());
670 V = IC.Builder.CreateInsertValue(V, L, i);
671 }
672
673 V->setName(Name);
674 return IC.replaceInstUsesWith(LI, V);
675 }
676
677 if (auto *AT = dyn_cast<ArrayType>(T)) {
678 auto *ET = AT->getElementType();
679 auto NumElements = AT->getNumElements();
680 if (NumElements == 1) {
681 LoadInst *NewLoad = IC.combineLoadToNewType(LI, ET, ".unpack");
682 NewLoad->setAAMetadata(LI.getAAMetadata());
683 return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue(
684 UndefValue::get(T), NewLoad, 0, Name));
685 }
686
687 // Bail out if the array is too large. Ideally we would like to optimize
688 // arrays of arbitrary size but this has a terrible impact on compile time.
689 // The threshold here is chosen arbitrarily, maybe needs a little bit of
690 // tuning.
691 if (NumElements > IC.MaxArraySizeForCombine)
692 return nullptr;
693
694 const DataLayout &DL = IC.getDataLayout();
695 auto EltSize = DL.getTypeAllocSize(ET);
696 const auto Align = LI.getAlign();
697
698 auto *Addr = LI.getPointerOperand();
699 auto *IdxType = Type::getInt64Ty(T->getContext());
700 auto *Zero = ConstantInt::get(IdxType, 0);
701
702 Value *V = UndefValue::get(T);
703 uint64_t Offset = 0;
704 for (uint64_t i = 0; i < NumElements; i++) {
705 Value *Indices[2] = {
706 Zero,
707 ConstantInt::get(IdxType, i),
708 };
709 auto *Ptr = IC.Builder.CreateInBoundsGEP(AT, Addr, makeArrayRef(Indices),
710 Name + ".elt");
711 auto *L = IC.Builder.CreateAlignedLoad(AT->getElementType(), Ptr,
712 commonAlignment(Align, Offset),
713 Name + ".unpack");
714 L->setAAMetadata(LI.getAAMetadata());
715 V = IC.Builder.CreateInsertValue(V, L, i);
716 Offset += EltSize;
717 }
718
719 V->setName(Name);
720 return IC.replaceInstUsesWith(LI, V);
721 }
722
723 return nullptr;
724 }
725
726 // If we can determine that all possible objects pointed to by the provided
727 // pointer value are, not only dereferenceable, but also definitively less than
728 // or equal to the provided maximum size, then return true (constant global
729 // values and allocas fall into this category). Otherwise, return false.
730 //
731 // FIXME: This should probably live in ValueTracking (or similar).
732 static bool isObjectSizeLessThanOrEq(Value *V, uint64_t MaxSize,
733 const DataLayout &DL) {
734 SmallPtrSet<Value *, 4> Visited;
735 SmallVector<Value *, 4> Worklist(1, V);
736
737 do {
738 Value *P = Worklist.pop_back_val();
739 P = P->stripPointerCasts();
740
741 if (!Visited.insert(P).second)
742 continue;
743
744 if (SelectInst *SI = dyn_cast<SelectInst>(P)) {
745 Worklist.push_back(SI->getTrueValue());
746 Worklist.push_back(SI->getFalseValue());
747 continue;
748 }
749
750 if (PHINode *PN = dyn_cast<PHINode>(P)) {
751 append_range(Worklist, PN->incoming_values());
752 continue;
753 }
754
755 if (GlobalAlias *GA = dyn_cast<GlobalAlias>(P)) {
756 if (GA->isInterposable())
757 return false;
758 Worklist.push_back(GA->getAliasee());
759 continue;
760 }
761
762 // If we know how big this object is, and it is less than MaxSize, continue
763 // searching. Otherwise, return false.
764 if (AllocaInst *AI = dyn_cast<AllocaInst>(P)) {
765 if (!AI->getAllocatedType()->isSized())
766 return false;
767
768 ConstantInt *CS = dyn_cast<ConstantInt>(AI->getArraySize());
769 if (!CS)
770 return false;
771
772 uint64_t TypeSize = DL.getTypeAllocSize(AI->getAllocatedType());
773 // Make sure that, even if the multiplication below would wrap as an
774 // uint64_t, we still do the right thing.
775 if ((CS->getValue().zext(128) * APInt(128, TypeSize)).ugt(MaxSize))
776 return false;
777 continue;
778 }
779
780 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) {
781 if (!GV->hasDefinitiveInitializer() || !GV->isConstant())
782 return false;
783
784 uint64_t InitSize = DL.getTypeAllocSize(GV->getValueType());
785 if (InitSize > MaxSize)
786 return false;
787 continue;
788 }
789
790 return false;
791 } while (!Worklist.empty());
792
793 return true;
794 }
795
796 // If we're indexing into an object of a known size, and the outer index is
797 // not a constant, but having any value but zero would lead to undefined
798 // behavior, replace it with zero.
799 //
800 // For example, if we have:
801 // @f.a = private unnamed_addr constant [1 x i32] [i32 12], align 4
802 // ...
803 // %arrayidx = getelementptr inbounds [1 x i32]* @f.a, i64 0, i64 %x
804 // ... = load i32* %arrayidx, align 4
805 // Then we know that we can replace %x in the GEP with i64 0.
806 //
807 // FIXME: We could fold any GEP index to zero that would cause UB if it were
808 // not zero. Currently, we only handle the first such index. We could also
809 // search through non-zero constant indices if we kept track of the
810 // offsets those indices implied.
811 static bool canReplaceGEPIdxWithZero(InstCombinerImpl &IC,
812 GetElementPtrInst *GEPI, Instruction *MemI,
813 unsigned &Idx) {
814 if (GEPI->getNumOperands() < 2)
815 return false;
816
817 // Find the first non-zero index of a GEP. If all indices are zero, return
818 // one past the last index.
819 auto FirstNZIdx = [](const GetElementPtrInst *GEPI) {
820 unsigned I = 1;
821 for (unsigned IE = GEPI->getNumOperands(); I != IE; ++I) {
822 Value *V = GEPI->getOperand(I);
823 if (const ConstantInt *CI = dyn_cast<ConstantInt>(V))
824 if (CI->isZero())
825 continue;
826
827 break;
828 }
829
830 return I;
831 };
832
833 // Skip through initial 'zero' indices, and find the corresponding pointer
834 // type. See if the next index is not a constant.
835 Idx = FirstNZIdx(GEPI);
836 if (Idx == GEPI->getNumOperands())
837 return false;
838 if (isa<Constant>(GEPI->getOperand(Idx)))
839 return false;
840
841 SmallVector<Value *, 4> Ops(GEPI->idx_begin(), GEPI->idx_begin() + Idx);
842 Type *SourceElementType = GEPI->getSourceElementType();
843 // Size information about scalable vectors is not available, so we cannot
844 // deduce whether indexing at n is undefined behaviour or not. Bail out.
845 if (isa<ScalableVectorType>(SourceElementType))
846 return false;
847
848 Type *AllocTy = GetElementPtrInst::getIndexedType(SourceElementType, Ops);
849 if (!AllocTy || !AllocTy->isSized())
850 return false;
851 const DataLayout &DL = IC.getDataLayout();
852 uint64_t TyAllocSize = DL.getTypeAllocSize(AllocTy).getFixedSize();
853
854 // If there are more indices after the one we might replace with a zero, make
855 // sure they're all non-negative. If any of them are negative, the overall
856 // address being computed might be before the base address determined by the
857 // first non-zero index.
858 auto IsAllNonNegative = [&]() {
859 for (unsigned i = Idx+1, e = GEPI->getNumOperands(); i != e; ++i) {
860 KnownBits Known = IC.computeKnownBits(GEPI->getOperand(i), 0, MemI);
861 if (Known.isNonNegative())
862 continue;
863 return false;
864 }
865
866 return true;
867 };
868
869 // FIXME: If the GEP is not inbounds, and there are extra indices after the
870 // one we'll replace, those could cause the address computation to wrap
871 // (rendering the IsAllNonNegative() check below insufficient). We can do
872 // better, ignoring zero indices (and other indices we can prove small
873 // enough not to wrap).
874 if (Idx+1 != GEPI->getNumOperands() && !GEPI->isInBounds())
875 return false;
876
877 // Note that isObjectSizeLessThanOrEq will return true only if the pointer is
878 // also known to be dereferenceable.
879 return isObjectSizeLessThanOrEq(GEPI->getOperand(0), TyAllocSize, DL) &&
880 IsAllNonNegative();
881 }
882
883 // If we're indexing into an object with a variable index for the memory
884 // access, but the object has only one element, we can assume that the index
885 // will always be zero. If we replace the GEP, return it.
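// For example (illustrative): a load or store through
//   getelementptr inbounds [1 x i32], [1 x i32]* @f.a, i64 0, i64 %x
// gets a cloned GEP with %x replaced by i64 0.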
886 template <typename T>
887 static Instruction *replaceGEPIdxWithZero(InstCombinerImpl &IC, Value *Ptr,
888 T &MemI) {
889 if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr)) {
890 unsigned Idx;
891 if (canReplaceGEPIdxWithZero(IC, GEPI, &MemI, Idx)) {
892 Instruction *NewGEPI = GEPI->clone();
893 NewGEPI->setOperand(Idx,
894 ConstantInt::get(GEPI->getOperand(Idx)->getType(), 0));
895 NewGEPI->insertBefore(GEPI);
896 MemI.setOperand(MemI.getPointerOperandIndex(), NewGEPI);
897 return NewGEPI;
898 }
899 }
900
901 return nullptr;
902 }
903
904 static bool canSimplifyNullStoreOrGEP(StoreInst &SI) {
905 if (NullPointerIsDefined(SI.getFunction(), SI.getPointerAddressSpace()))
906 return false;
907
908 auto *Ptr = SI.getPointerOperand();
909 if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr))
910 Ptr = GEPI->getOperand(0);
911 return (isa<ConstantPointerNull>(Ptr) &&
912 !NullPointerIsDefined(SI.getFunction(), SI.getPointerAddressSpace()));
913 }
914
915 static bool canSimplifyNullLoadOrGEP(LoadInst &LI, Value *Op) {
916 if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) {
917 const Value *GEPI0 = GEPI->getOperand(0);
918 if (isa<ConstantPointerNull>(GEPI0) &&
919 !NullPointerIsDefined(LI.getFunction(), GEPI->getPointerAddressSpace()))
920 return true;
921 }
922 if (isa<UndefValue>(Op) ||
923 (isa<ConstantPointerNull>(Op) &&
924 !NullPointerIsDefined(LI.getFunction(), LI.getPointerAddressSpace())))
925 return true;
926 return false;
927 }
928
929 Instruction *InstCombinerImpl::visitLoadInst(LoadInst &LI) {
930 Value *Op = LI.getOperand(0);
931
932 // Try to canonicalize the loaded type.
933 if (Instruction *Res = combineLoadToOperationType(*this, LI))
934 return Res;
935
936 // Attempt to improve the alignment.
937 Align KnownAlign = getOrEnforceKnownAlignment(
938 Op, DL.getPrefTypeAlign(LI.getType()), DL, &LI, &AC, &DT);
939 if (KnownAlign > LI.getAlign())
940 LI.setAlignment(KnownAlign);
941
942 // Replace GEP indices if possible.
943 if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Op, LI)) {
944 Worklist.push(NewGEPI);
945 return &LI;
946 }
947
948 if (Instruction *Res = unpackLoadToAggregate(*this, LI))
949 return Res;
950
951 // Do really simple store-to-load forwarding and load CSE, to catch cases
952 // where there are several consecutive memory accesses to the same location,
953 // separated by a few arithmetic operations.
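// For instance (illustrative IR):
//   store i32 %x, i32* %p
//   ...arithmetic with no intervening memory access...
//   %y = load i32, i32* %p          ; %y can be replaced by %x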
954 bool IsLoadCSE = false;
955 if (Value *AvailableVal = FindAvailableLoadedValue(&LI, *AA, &IsLoadCSE)) {
956 if (IsLoadCSE)
957 combineMetadataForCSE(cast<LoadInst>(AvailableVal), &LI, false);
958
959 return replaceInstUsesWith(
960 LI, Builder.CreateBitOrPointerCast(AvailableVal, LI.getType(),
961 LI.getName() + ".cast"));
962 }
963
964 // None of the following transforms are legal for volatile/ordered atomic
965 // loads. Most of them do apply for unordered atomics.
966 if (!LI.isUnordered()) return nullptr;
967
968 // load(gep null, ...) -> unreachable
969 // load null/undef -> unreachable
970 // TODO: Consider a target hook for valid address spaces for this xforms.
971 if (canSimplifyNullLoadOrGEP(LI, Op)) {
972 // Insert a new store to null instruction before the load to indicate
973 // that this code is not reachable. We do this instead of inserting
974 // an unreachable instruction directly because we cannot modify the
975 // CFG.
976 StoreInst *SI = new StoreInst(PoisonValue::get(LI.getType()),
977 Constant::getNullValue(Op->getType()), &LI);
978 SI->setDebugLoc(LI.getDebugLoc());
979 return replaceInstUsesWith(LI, PoisonValue::get(LI.getType()));
980 }
981
982 if (Op->hasOneUse()) {
983 // Change select and PHI nodes to select values instead of addresses: this
984 // helps alias analysis out a lot, allows many other simplifications, and
985 // exposes redundancy in the code.
986 //
987 // Note that we cannot do the transformation unless we know that the
988 // introduced loads cannot trap! Something like this is valid as long as
989 // the condition is always false: load (select bool %C, int* null, int* %G),
990 // but it would not be valid if we transformed it to load from null
991 // unconditionally.
992 //
993 if (SelectInst *SI = dyn_cast<SelectInst>(Op)) {
994 // load (select (Cond, &V1, &V2)) --> select(Cond, load &V1, load &V2).
995 Align Alignment = LI.getAlign();
996 if (isSafeToLoadUnconditionally(SI->getOperand(1), LI.getType(),
997 Alignment, DL, SI) &&
998 isSafeToLoadUnconditionally(SI->getOperand(2), LI.getType(),
999 Alignment, DL, SI)) {
1000 LoadInst *V1 =
1001 Builder.CreateLoad(LI.getType(), SI->getOperand(1),
1002 SI->getOperand(1)->getName() + ".val");
1003 LoadInst *V2 =
1004 Builder.CreateLoad(LI.getType(), SI->getOperand(2),
1005 SI->getOperand(2)->getName() + ".val");
1006 assert(LI.isUnordered() && "implied by above");
1007 V1->setAlignment(Alignment);
1008 V1->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
1009 V2->setAlignment(Alignment);
1010 V2->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
1011 return SelectInst::Create(SI->getCondition(), V1, V2);
1012 }
1013
1014 // load (select (cond, null, P)) -> load P
1015 if (isa<ConstantPointerNull>(SI->getOperand(1)) &&
1016 !NullPointerIsDefined(SI->getFunction(),
1017 LI.getPointerAddressSpace()))
1018 return replaceOperand(LI, 0, SI->getOperand(2));
1019
1020 // load (select (cond, P, null)) -> load P
1021 if (isa<ConstantPointerNull>(SI->getOperand(2)) &&
1022 !NullPointerIsDefined(SI->getFunction(),
1023 LI.getPointerAddressSpace()))
1024 return replaceOperand(LI, 0, SI->getOperand(1));
1025 }
1026 }
1027 return nullptr;
1028 }
1029
1030 /// Look for extractelement/insertvalue sequence that acts like a bitcast.
1031 ///
1032 /// \returns underlying value that was "cast", or nullptr otherwise.
1033 ///
1034 /// For example, if we have:
1035 ///
1036 /// %E0 = extractelement <2 x double> %U, i32 0
1037 /// %V0 = insertvalue [2 x double] undef, double %E0, 0
1038 /// %E1 = extractelement <2 x double> %U, i32 1
1039 /// %V1 = insertvalue [2 x double] %V0, double %E1, 1
1040 ///
1041 /// and the layout of a <2 x double> is isomorphic to a [2 x double],
1042 /// then %V1 can be safely approximated by a conceptual "bitcast" of %U.
1043 /// Note that %U may contain non-undef values where %V1 has undef.
1044 static Value *likeBitCastFromVector(InstCombinerImpl &IC, Value *V) {
1045 Value *U = nullptr;
1046 while (auto *IV = dyn_cast<InsertValueInst>(V)) {
1047 auto *E = dyn_cast<ExtractElementInst>(IV->getInsertedValueOperand());
1048 if (!E)
1049 return nullptr;
1050 auto *W = E->getVectorOperand();
1051 if (!U)
1052 U = W;
1053 else if (U != W)
1054 return nullptr;
1055 auto *CI = dyn_cast<ConstantInt>(E->getIndexOperand());
1056 if (!CI || IV->getNumIndices() != 1 || CI->getZExtValue() != *IV->idx_begin())
1057 return nullptr;
1058 V = IV->getAggregateOperand();
1059 }
1060 if (!match(V, m_Undef()) || !U)
1061 return nullptr;
1062
1063 auto *UT = cast<VectorType>(U->getType());
1064 auto *VT = V->getType();
1065 // Check that types UT and VT are bitwise isomorphic.
1066 const auto &DL = IC.getDataLayout();
1067 if (DL.getTypeStoreSizeInBits(UT) != DL.getTypeStoreSizeInBits(VT)) {
1068 return nullptr;
1069 }
1070 if (auto *AT = dyn_cast<ArrayType>(VT)) {
1071 if (AT->getNumElements() != cast<FixedVectorType>(UT)->getNumElements())
1072 return nullptr;
1073 } else {
1074 auto *ST = cast<StructType>(VT);
1075 if (ST->getNumElements() != cast<FixedVectorType>(UT)->getNumElements())
1076 return nullptr;
1077 for (const auto *EltT : ST->elements()) {
1078 if (EltT != UT->getElementType())
1079 return nullptr;
1080 }
1081 }
1082 return U;
1083 }
1084
1085 /// Combine stores to match the type of value being stored.
1086 ///
1087 /// The core idea here is that the memory does not have any intrinsic type and
1088 /// where we can, we should match the type of a store to the type of value being
1089 /// stored.
1090 ///
1091 /// However, this routine must never change the width of a store or the number of
1092 /// stores as that would introduce a semantic change. This combine is expected to
1093 /// be a semantic no-op which just allows stores to more closely model the types
1094 /// of their incoming values.
1095 ///
1096 /// Currently, we also refuse to change the precise type used for an atomic or
1097 /// volatile store. This is debatable, and might be reasonable to change later.
1098 /// However, it is risky in case some backend or other part of LLVM is relying
1099 /// on the exact type stored to select appropriate atomic operations.
1100 ///
1101 /// \returns true if the store was successfully combined away. This indicates
1102 /// the caller must erase the store instruction. We have to let the caller erase
1103 /// the store instruction as otherwise there is no way to signal whether it was
1104 /// combined or not: IC.EraseInstFromFunction returns a null pointer.
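///
/// For example (illustrative, hypothetical IR):
///   %f = bitcast i32 %x to float
///   store float %f, float* %p
/// becomes
///   %p.cast = bitcast float* %p to i32*
///   store i32 %x, i32* %p.cast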
1105 static bool combineStoreToValueType(InstCombinerImpl &IC, StoreInst &SI) {
1106 // FIXME: We could probably with some care handle both volatile and ordered
1107 // atomic stores here but it isn't clear that this is important.
1108 if (!SI.isUnordered())
1109 return false;
1110
1111 // swifterror values can't be bitcasted.
1112 if (SI.getPointerOperand()->isSwiftError())
1113 return false;
1114
1115 Value *V = SI.getValueOperand();
1116
1117 // Fold away bit casts of the stored value by storing the original type.
1118 if (auto *BC = dyn_cast<BitCastInst>(V)) {
1119 assert(!BC->getType()->isX86_AMXTy() &&
1120 "store to x86_amx* should not happen!");
1121 V = BC->getOperand(0);
1122 // Don't transform when the type is x86_amx; this keeps the pass that lowers
1123 // the x86_amx type happy.
1124 if (V->getType()->isX86_AMXTy())
1125 return false;
1126 if (!SI.isAtomic() || isSupportedAtomicType(V->getType())) {
1127 combineStoreToNewValue(IC, SI, V);
1128 return true;
1129 }
1130 }
1131
1132 if (Value *U = likeBitCastFromVector(IC, V))
1133 if (!SI.isAtomic() || isSupportedAtomicType(U->getType())) {
1134 combineStoreToNewValue(IC, SI, U);
1135 return true;
1136 }
1137
1138 // FIXME: We should also canonicalize stores of vectors when their elements
1139 // are cast to other types.
1140 return false;
1141 }
1142
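/// Unpack a store of a small aggregate into per-element stores. Illustrative IR
/// (hypothetical) for a store of {i32, i64}:
///   store {i32, i64} %agg, {i32, i64}* %p
/// becomes roughly
///   %agg.elt = extractvalue {i32, i64} %agg, 0
///   %p.repack = getelementptr inbounds {i32, i64}, {i32, i64}* %p, i32 0, i32 0
///   store i32 %agg.elt, i32* %p.repack
///   ... and likewise for element 1 ...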
1143 static bool unpackStoreToAggregate(InstCombinerImpl &IC, StoreInst &SI) {
1144 // FIXME: We could probably with some care handle both volatile and atomic
1145 // stores here but it isn't clear that this is important.
1146 if (!SI.isSimple())
1147 return false;
1148
1149 Value *V = SI.getValueOperand();
1150 Type *T = V->getType();
1151
1152 if (!T->isAggregateType())
1153 return false;
1154
1155 if (auto *ST = dyn_cast<StructType>(T)) {
1156 // If the struct only has one element, we unpack.
1157 unsigned Count = ST->getNumElements();
1158 if (Count == 1) {
1159 V = IC.Builder.CreateExtractValue(V, 0);
1160 combineStoreToNewValue(IC, SI, V);
1161 return true;
1162 }
1163
1164 // We don't want to break stores with padding here as we'd lose
1165 // the knowledge that padding exists for the rest of the pipeline.
1166 const DataLayout &DL = IC.getDataLayout();
1167 auto *SL = DL.getStructLayout(ST);
1168 if (SL->hasPadding())
1169 return false;
1170
1171 const auto Align = SI.getAlign();
1172
1173 SmallString<16> EltName = V->getName();
1174 EltName += ".elt";
1175 auto *Addr = SI.getPointerOperand();
1176 SmallString<16> AddrName = Addr->getName();
1177 AddrName += ".repack";
1178
1179 auto *IdxType = Type::getInt32Ty(ST->getContext());
1180 auto *Zero = ConstantInt::get(IdxType, 0);
1181 for (unsigned i = 0; i < Count; i++) {
1182 Value *Indices[2] = {
1183 Zero,
1184 ConstantInt::get(IdxType, i),
1185 };
1186 auto *Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices),
1187 AddrName);
1188 auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
1189 auto EltAlign = commonAlignment(Align, SL->getElementOffset(i));
1190 llvm::Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
1191 NS->setAAMetadata(SI.getAAMetadata());
1192 }
1193
1194 return true;
1195 }
1196
1197 if (auto *AT = dyn_cast<ArrayType>(T)) {
1198 // If the array only has one element, we unpack.
1199 auto NumElements = AT->getNumElements();
1200 if (NumElements == 1) {
1201 V = IC.Builder.CreateExtractValue(V, 0);
1202 combineStoreToNewValue(IC, SI, V);
1203 return true;
1204 }
1205
1206 // Bail out if the array is too large. Ideally we would like to optimize
1207 // arrays of arbitrary size but this has a terrible impact on compile time.
1208 // The threshold here is chosen arbitrarily, maybe needs a little bit of
1209 // tuning.
1210 if (NumElements > IC.MaxArraySizeForCombine)
1211 return false;
1212
1213 const DataLayout &DL = IC.getDataLayout();
1214 auto EltSize = DL.getTypeAllocSize(AT->getElementType());
1215 const auto Align = SI.getAlign();
1216
1217 SmallString<16> EltName = V->getName();
1218 EltName += ".elt";
1219 auto *Addr = SI.getPointerOperand();
1220 SmallString<16> AddrName = Addr->getName();
1221 AddrName += ".repack";
1222
1223 auto *IdxType = Type::getInt64Ty(T->getContext());
1224 auto *Zero = ConstantInt::get(IdxType, 0);
1225
1226 uint64_t Offset = 0;
1227 for (uint64_t i = 0; i < NumElements; i++) {
1228 Value *Indices[2] = {
1229 Zero,
1230 ConstantInt::get(IdxType, i),
1231 };
1232 auto *Ptr = IC.Builder.CreateInBoundsGEP(AT, Addr, makeArrayRef(Indices),
1233 AddrName);
1234 auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
1235 auto EltAlign = commonAlignment(Align, Offset);
1236 Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
1237 NS->setAAMetadata(SI.getAAMetadata());
1238 Offset += EltSize;
1239 }
1240
1241 return true;
1242 }
1243
1244 return false;
1245 }
1246
1247 /// equivalentAddressValues - Test if A and B will obviously have the same
1248 /// value. This includes recognizing that %t0 and %t1 will have the same
1249 /// value in code like this:
1250 /// %t0 = getelementptr \@a, 0, 3
1251 /// store i32 0, i32* %t0
1252 /// %t1 = getelementptr \@a, 0, 3
1253 /// %t2 = load i32* %t1
1254 ///
1255 static bool equivalentAddressValues(Value *A, Value *B) {
1256 // Test if the values are trivially equivalent.
1257 if (A == B) return true;
1258
1259 // Test if the values come from identical arithmetic instructions.
1260 // This uses isIdenticalToWhenDefined instead of isIdenticalTo because
1261 // it's only used to compare two uses within the same basic block, which
1262 // means that they'll always either have the same value or one of them
1263 // will have an undefined value.
1264 if (isa<BinaryOperator>(A) ||
1265 isa<CastInst>(A) ||
1266 isa<PHINode>(A) ||
1267 isa<GetElementPtrInst>(A))
1268 if (Instruction *BI = dyn_cast<Instruction>(B))
1269 if (cast<Instruction>(A)->isIdenticalToWhenDefined(BI))
1270 return true;
1271
1272 // Otherwise they may not be equivalent.
1273 return false;
1274 }
1275
1276 /// Converts store (bitcast (load (bitcast (select ...)))) to
1277 /// store (load (select ...)), where select is minmax:
1278 /// select ((cmp load V1, load V2), V1, V2).
1279 static bool removeBitcastsFromLoadStoreOnMinMax(InstCombinerImpl &IC,
1280 StoreInst &SI) {
1281 // bitcast?
1282 if (!match(SI.getPointerOperand(), m_BitCast(m_Value())))
1283 return false;
1284 // load? integer?
1285 Value *LoadAddr;
1286 if (!match(SI.getValueOperand(), m_Load(m_BitCast(m_Value(LoadAddr)))))
1287 return false;
1288 auto *LI = cast<LoadInst>(SI.getValueOperand());
1289 if (!LI->getType()->isIntegerTy())
1290 return false;
1291 Type *CmpLoadTy;
1292 if (!isMinMaxWithLoads(LoadAddr, CmpLoadTy))
1293 return false;
1294
1295 // Make sure the type would actually change.
1296 // This condition can be hit with chains of bitcasts.
1297 if (LI->getType() == CmpLoadTy)
1298 return false;
1299
1300 // Make sure we're not changing the size of the load/store.
1301 const auto &DL = IC.getDataLayout();
1302 if (DL.getTypeStoreSizeInBits(LI->getType()) !=
1303 DL.getTypeStoreSizeInBits(CmpLoadTy))
1304 return false;
1305
1306 if (!all_of(LI->users(), [LI, LoadAddr](User *U) {
1307 auto *SI = dyn_cast<StoreInst>(U);
1308 return SI && SI->getPointerOperand() != LI &&
1309 InstCombiner::peekThroughBitcast(SI->getPointerOperand()) !=
1310 LoadAddr &&
1311 !SI->getPointerOperand()->isSwiftError();
1312 }))
1313 return false;
1314
1315 IC.Builder.SetInsertPoint(LI);
1316 LoadInst *NewLI = IC.combineLoadToNewType(*LI, CmpLoadTy);
1317 // Replace all the stores with stores of the newly loaded value.
1318 for (auto *UI : LI->users()) {
1319 auto *USI = cast<StoreInst>(UI);
1320 IC.Builder.SetInsertPoint(USI);
1321 combineStoreToNewValue(IC, *USI, NewLI);
1322 }
1323 IC.replaceInstUsesWith(*LI, PoisonValue::get(LI->getType()));
1324 IC.eraseInstFromFunction(*LI);
1325 return true;
1326 }
1327
1328 Instruction *InstCombinerImpl::visitStoreInst(StoreInst &SI) {
1329 Value *Val = SI.getOperand(0);
1330 Value *Ptr = SI.getOperand(1);
1331
1332 // Try to canonicalize the stored type.
1333 if (combineStoreToValueType(*this, SI))
1334 return eraseInstFromFunction(SI);
1335
1336 // Attempt to improve the alignment.
1337 const Align KnownAlign = getOrEnforceKnownAlignment(
1338 Ptr, DL.getPrefTypeAlign(Val->getType()), DL, &SI, &AC, &DT);
1339 if (KnownAlign > SI.getAlign())
1340 SI.setAlignment(KnownAlign);
1341
1342 // Try to canonicalize the stored type.
1343 if (unpackStoreToAggregate(*this, SI))
1344 return eraseInstFromFunction(SI);
1345
1346 if (removeBitcastsFromLoadStoreOnMinMax(*this, SI))
1347 return eraseInstFromFunction(SI);
1348
1349 // Replace GEP indices if possible.
1350 if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Ptr, SI)) {
1351 Worklist.push(NewGEPI);
1352 return &SI;
1353 }
1354
1355 // Don't hack volatile/ordered stores.
1356 // FIXME: Some bits are legal for ordered atomic stores; needs refactoring.
1357 if (!SI.isUnordered()) return nullptr;
1358
1359 // If the pointer operand is an alloca with a single use, delete the store,
1360 // making the alloca dead.
1361 if (Ptr->hasOneUse()) {
1362 if (isa<AllocaInst>(Ptr))
1363 return eraseInstFromFunction(SI);
1364 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1365 if (isa<AllocaInst>(GEP->getOperand(0))) {
1366 if (GEP->getOperand(0)->hasOneUse())
1367 return eraseInstFromFunction(SI);
1368 }
1369 }
1370 }
1371
1372 // If we have a store to a location which is known constant, we can conclude
1373 // that the store must be storing the constant value (else the memory
1374 // wouldn't be constant), and this must be a noop.
1375 if (AA->pointsToConstantMemory(Ptr))
1376 return eraseInstFromFunction(SI);
1377
1378 // Do really simple DSE, to catch cases where there are several consecutive
1379 // stores to the same location, separated by a few arithmetic operations. This
1380 // situation often occurs with bitfield accesses.
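// For instance (illustrative IR):
//   store i32 %a, i32* %p
//   %b = or i32 %a, 255              ; no intervening memory access
//   store i32 %b, i32* %p            ; the first store is dead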
1381 BasicBlock::iterator BBI(SI);
1382 for (unsigned ScanInsts = 6; BBI != SI.getParent()->begin() && ScanInsts;
1383 --ScanInsts) {
1384 --BBI;
1385 // Don't count debug info directives, lest they affect codegen, and skip
1386 // pointer-to-pointer bitcasts, which are NOPs.
1387 if (BBI->isDebugOrPseudoInst() ||
1388 (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) {
1389 ScanInsts++;
1390 continue;
1391 }
1392
1393 if (StoreInst *PrevSI = dyn_cast<StoreInst>(BBI)) {
1394 // Prev store isn't volatile, and stores to the same location?
1395 if (PrevSI->isUnordered() &&
1396 equivalentAddressValues(PrevSI->getOperand(1), SI.getOperand(1)) &&
1397 PrevSI->getValueOperand()->getType() ==
1398 SI.getValueOperand()->getType()) {
1399 ++NumDeadStore;
1400 // Manually add back the original store to the worklist now, so it will
1401 // be processed after the operands of the removed store, as this may
1402 // expose additional DSE opportunities.
1403 Worklist.push(&SI);
1404 eraseInstFromFunction(*PrevSI);
1405 return nullptr;
1406 }
1407 break;
1408 }
1409
1410 // If this is a load, we have to stop. However, if the loaded value is the
1411 // value we're storing and it was loaded from the same pointer we're storing
1412 // to, then *this* store is dead (X = load P; store X -> P).
1413 if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) {
1414 if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr)) {
1415 assert(SI.isUnordered() && "can't eliminate ordering operation");
1416 return eraseInstFromFunction(SI);
1417 }
1418
1419 // Otherwise, this is a load from some other location. Stores before it
1420 // may not be dead.
1421 break;
1422 }
1423
1424 // Don't skip over loads, throws or things that can modify memory.
1425 if (BBI->mayWriteToMemory() || BBI->mayReadFromMemory() || BBI->mayThrow())
1426 break;
1427 }
1428
1429 // store X, null -> turns into 'unreachable' in SimplifyCFG
1430 // store X, GEP(null, Y) -> turns into 'unreachable' in SimplifyCFG
1431 if (canSimplifyNullStoreOrGEP(SI)) {
1432 if (!isa<PoisonValue>(Val))
1433 return replaceOperand(SI, 0, PoisonValue::get(Val->getType()));
1434 return nullptr; // Do not modify these!
1435 }
1436
1437 // store undef, Ptr -> noop
1438 // FIXME: This is technically incorrect because it might overwrite a poison
1439 // value. Change to PoisonValue once #52930 is resolved.
1440 if (isa<UndefValue>(Val))
1441 return eraseInstFromFunction(SI);
1442
1443 return nullptr;
1444 }
1445
1446 /// Try to transform:
1447 /// if () { *P = v1; } else { *P = v2 }
1448 /// or:
1449 /// *P = v1; if () { *P = v2; }
1450 /// into a phi node with a store in the successor.
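///
/// Illustrative IR (hypothetical) for the if/then/else form:
///   then:                              else:
///     store i32 1, i32* %P               store i32 2, i32* %P
/// becomes, in the common successor:
///   %storemerge = phi i32 [ 1, %then ], [ 2, %else ]
///   store i32 %storemerge, i32* %P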
1451 bool InstCombinerImpl::mergeStoreIntoSuccessor(StoreInst &SI) {
1452 if (!SI.isUnordered())
1453 return false; // This code has not been audited for volatile/ordered case.
1454
1455 // Check if the successor block has exactly 2 incoming edges.
1456 BasicBlock *StoreBB = SI.getParent();
1457 BasicBlock *DestBB = StoreBB->getTerminator()->getSuccessor(0);
1458 if (!DestBB->hasNPredecessors(2))
1459 return false;
1460
1461 // Capture the other block (the block that doesn't contain our store).
1462 pred_iterator PredIter = pred_begin(DestBB);
1463 if (*PredIter == StoreBB)
1464 ++PredIter;
1465 BasicBlock *OtherBB = *PredIter;
1466
1467 // Bail out if all of the relevant blocks aren't distinct. This can happen,
1468 // for example, if SI is in an infinite loop.
1469 if (StoreBB == DestBB || OtherBB == DestBB)
1470 return false;
1471
1472 // Verify that the other block ends in a branch and is not otherwise empty.
1473 BasicBlock::iterator BBI(OtherBB->getTerminator());
1474 BranchInst *OtherBr = dyn_cast<BranchInst>(BBI);
1475 if (!OtherBr || BBI == OtherBB->begin())
1476 return false;
1477
1478 // If the other block ends in an unconditional branch, check for the 'if then
1479 // else' case. There is an instruction before the branch.
1480 StoreInst *OtherStore = nullptr;
1481 if (OtherBr->isUnconditional()) {
1482 --BBI;
1483 // Skip over debugging info and pseudo probes.
1484 while (BBI->isDebugOrPseudoInst() ||
1485 (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) {
1486 if (BBI==OtherBB->begin())
1487 return false;
1488 --BBI;
1489 }
1490 // If this isn't a store, isn't a store to the same location, or is not the
1491 // right kind of store, bail out.
1492 OtherStore = dyn_cast<StoreInst>(BBI);
1493 if (!OtherStore || OtherStore->getOperand(1) != SI.getOperand(1) ||
1494 !SI.isSameOperationAs(OtherStore))
1495 return false;
1496 } else {
1497 // Otherwise, the other block ended with a conditional branch. If one of the
1498 // destinations is StoreBB, then we have the if/then case.
1499 if (OtherBr->getSuccessor(0) != StoreBB &&
1500 OtherBr->getSuccessor(1) != StoreBB)
1501 return false;
1502
1503 // Okay, we know that OtherBr now goes to Dest and StoreBB, so this is an
1504 // if/then triangle. See if there is a store to the same ptr as SI that
1505 // lives in OtherBB.
1506 for (;; --BBI) {
1507 // Check to see if we find the matching store.
1508 if ((OtherStore = dyn_cast<StoreInst>(BBI))) {
1509 if (OtherStore->getOperand(1) != SI.getOperand(1) ||
1510 !SI.isSameOperationAs(OtherStore))
1511 return false;
1512 break;
1513 }
1514 // If we find something that may be using or overwriting the stored
1515 // value, or if we run out of instructions, we can't do the transform.
1516 if (BBI->mayReadFromMemory() || BBI->mayThrow() ||
1517 BBI->mayWriteToMemory() || BBI == OtherBB->begin())
1518 return false;
1519 }
1520
1521 // In order to eliminate the store in OtherBr, we have to make sure nothing
1522 // reads or overwrites the stored value in StoreBB.
1523 for (BasicBlock::iterator I = StoreBB->begin(); &*I != &SI; ++I) {
1524 // FIXME: This should really be AA driven.
1525 if (I->mayReadFromMemory() || I->mayThrow() || I->mayWriteToMemory())
1526 return false;
1527 }
1528 }
1529
1530 // Insert a PHI node now if we need it.
1531 Value *MergedVal = OtherStore->getOperand(0);
1532 // The debug locations of the original instructions might differ. Merge them.
1533 DebugLoc MergedLoc = DILocation::getMergedLocation(SI.getDebugLoc(),
1534 OtherStore->getDebugLoc());
1535 if (MergedVal != SI.getOperand(0)) {
1536 PHINode *PN = PHINode::Create(MergedVal->getType(), 2, "storemerge");
1537 PN->addIncoming(SI.getOperand(0), SI.getParent());
1538 PN->addIncoming(OtherStore->getOperand(0), OtherBB);
1539 MergedVal = InsertNewInstBefore(PN, DestBB->front());
1540 PN->setDebugLoc(MergedLoc);
1541 }
1542
1543 // Advance to a place where it is safe to insert the new store and insert it.
1544 BBI = DestBB->getFirstInsertionPt();
1545 StoreInst *NewSI =
1546 new StoreInst(MergedVal, SI.getOperand(1), SI.isVolatile(), SI.getAlign(),
1547 SI.getOrdering(), SI.getSyncScopeID());
1548 InsertNewInstBefore(NewSI, *BBI);
1549 NewSI->setDebugLoc(MergedLoc);
1550
1551 // If the two stores had AA tags, merge them.
1552 AAMDNodes AATags = SI.getAAMetadata();
1553 if (AATags)
1554 NewSI->setAAMetadata(AATags.merge(OtherStore->getAAMetadata()));
1555
1556 // Nuke the old stores.
1557 eraseInstFromFunction(SI);
1558 eraseInstFromFunction(*OtherStore);
1559 return true;
1560 }
1561