1 //===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 /// This file implements SLP analysis based on VPlan. The analysis is based on 10 /// the ideas described in 11 /// 12 /// Look-ahead SLP: auto-vectorization in the presence of commutative 13 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha, 14 /// Luís F. W. Góes 15 /// 16 //===----------------------------------------------------------------------===// 17 18 #include "VPlan.h" 19 #include "llvm/ADT/DepthFirstIterator.h" 20 #include "llvm/ADT/PostOrderIterator.h" 21 #include "llvm/ADT/SmallVector.h" 22 #include "llvm/ADT/Twine.h" 23 #include "llvm/Analysis/LoopInfo.h" 24 #include "llvm/Analysis/VectorUtils.h" 25 #include "llvm/IR/BasicBlock.h" 26 #include "llvm/IR/CFG.h" 27 #include "llvm/IR/Dominators.h" 28 #include "llvm/IR/InstrTypes.h" 29 #include "llvm/IR/Instruction.h" 30 #include "llvm/IR/Instructions.h" 31 #include "llvm/IR/Type.h" 32 #include "llvm/IR/Value.h" 33 #include "llvm/Support/Casting.h" 34 #include "llvm/Support/Debug.h" 35 #include "llvm/Support/ErrorHandling.h" 36 #include "llvm/Support/GraphWriter.h" 37 #include "llvm/Support/raw_ostream.h" 38 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 39 #include <cassert> 40 #include <iterator> 41 #include <string> 42 #include <vector> 43 44 using namespace llvm; 45 46 #define DEBUG_TYPE "vplan-slp" 47 48 // Number of levels to look ahead when re-ordering multi node operands. 49 static unsigned LookaheadMaxDepth = 5; 50 51 VPInstruction *VPlanSlp::markFailed() { 52 // FIXME: Currently this is used to signal we hit instructions we cannot 53 // trivially SLP'ize. 54 CompletelySLP = false; 55 return nullptr; 56 } 57 58 void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) { 59 if (all_of(Operands, [](VPValue *V) { 60 return cast<VPInstruction>(V)->getUnderlyingInstr(); 61 })) { 62 unsigned BundleSize = 0; 63 for (VPValue *V : Operands) { 64 Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType(); 65 assert(!T->isVectorTy() && "Only scalar types supported for now"); 66 BundleSize += T->getScalarSizeInBits(); 67 } 68 WidestBundleBits = std::max(WidestBundleBits, BundleSize); 69 } 70 71 auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New); 72 assert(Res.second && 73 "Already created a combined instruction for the operand bundle"); 74 (void)Res; 75 } 76 77 bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const { 78 // Currently we only support VPInstructions. 79 if (!all_of(Operands, [](VPValue *Op) { 80 return Op && isa<VPInstruction>(Op) && 81 cast<VPInstruction>(Op)->getUnderlyingInstr(); 82 })) { 83 LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n"); 84 return false; 85 } 86 87 // Check if opcodes and type width agree for all instructions in the bundle. 88 // FIXME: Differing widths/opcodes can be handled by inserting additional 89 // instructions. 90 // FIXME: Deal with non-primitive types. 91 const Instruction *OriginalInstr = 92 cast<VPInstruction>(Operands[0])->getUnderlyingInstr(); 93 unsigned Opcode = OriginalInstr->getOpcode(); 94 unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits(); 95 if (!all_of(Operands, [Opcode, Width](VPValue *Op) { 96 const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr(); 97 return I->getOpcode() == Opcode && 98 I->getType()->getPrimitiveSizeInBits() == Width; 99 })) { 100 LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n"); 101 return false; 102 } 103 104 // For now, all operands must be defined in the same BB. 105 if (any_of(Operands, [this](VPValue *Op) { 106 return cast<VPInstruction>(Op)->getParent() != &this->BB; 107 })) { 108 LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n"); 109 return false; 110 } 111 112 if (any_of(Operands, 113 [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) { 114 LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n"); 115 return false; 116 } 117 118 // For loads, check that there are no instructions writing to memory in 119 // between them. 120 // TODO: we only have to forbid instructions writing to memory that could 121 // interfere with any of the loads in the bundle 122 if (Opcode == Instruction::Load) { 123 unsigned LoadsSeen = 0; 124 VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent(); 125 for (auto &I : *Parent) { 126 auto *VPI = cast<VPInstruction>(&I); 127 if (VPI->getOpcode() == Instruction::Load && 128 std::find(Operands.begin(), Operands.end(), VPI) != Operands.end()) 129 LoadsSeen++; 130 131 if (LoadsSeen == Operands.size()) 132 break; 133 if (LoadsSeen > 0 && VPI->mayWriteToMemory()) { 134 LLVM_DEBUG( 135 dbgs() << "VPSLP: instruction modifying memory between loads\n"); 136 return false; 137 } 138 } 139 140 if (!all_of(Operands, [](VPValue *Op) { 141 return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr()) 142 ->isSimple(); 143 })) { 144 LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n"); 145 return false; 146 } 147 } 148 149 if (Opcode == Instruction::Store) 150 if (!all_of(Operands, [](VPValue *Op) { 151 return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr()) 152 ->isSimple(); 153 })) { 154 LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n"); 155 return false; 156 } 157 158 return true; 159 } 160 161 static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values, 162 unsigned OperandIndex) { 163 SmallVector<VPValue *, 4> Operands; 164 for (VPValue *V : Values) { 165 auto *U = cast<VPUser>(V); 166 Operands.push_back(U->getOperand(OperandIndex)); 167 } 168 return Operands; 169 } 170 171 static bool areCommutative(ArrayRef<VPValue *> Values) { 172 return Instruction::isCommutative( 173 cast<VPInstruction>(Values[0])->getOpcode()); 174 } 175 176 static SmallVector<SmallVector<VPValue *, 4>, 4> 177 getOperands(ArrayRef<VPValue *> Values) { 178 SmallVector<SmallVector<VPValue *, 4>, 4> Result; 179 auto *VPI = cast<VPInstruction>(Values[0]); 180 181 switch (VPI->getOpcode()) { 182 case Instruction::Load: 183 llvm_unreachable("Loads terminate a tree, no need to get operands"); 184 case Instruction::Store: 185 Result.push_back(getOperands(Values, 0)); 186 break; 187 default: 188 for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I) 189 Result.push_back(getOperands(Values, I)); 190 break; 191 } 192 193 return Result; 194 } 195 196 /// Returns the opcode of Values or ~0 if they do not all agree. 197 static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) { 198 unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode(); 199 if (any_of(Values, [Opcode](VPValue *V) { 200 return cast<VPInstruction>(V)->getOpcode() != Opcode; 201 })) 202 return None; 203 return {Opcode}; 204 } 205 206 /// Returns true if A and B access sequential memory if they are loads or 207 /// stores or if they have identical opcodes otherwise. 208 static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B, 209 VPInterleavedAccessInfo &IAI) { 210 if (A->getOpcode() != B->getOpcode()) 211 return false; 212 213 if (A->getOpcode() != Instruction::Load && 214 A->getOpcode() != Instruction::Store) 215 return true; 216 auto *GA = IAI.getInterleaveGroup(A); 217 auto *GB = IAI.getInterleaveGroup(B); 218 219 return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B); 220 } 221 222 /// Implements getLAScore from Listing 7 in the paper. 223 /// Traverses and compares operands of V1 and V2 to MaxLevel. 224 static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel, 225 VPInterleavedAccessInfo &IAI) { 226 if (!isa<VPInstruction>(V1) || !isa<VPInstruction>(V2)) 227 return 0; 228 229 if (MaxLevel == 0) 230 return (unsigned)areConsecutiveOrMatch(cast<VPInstruction>(V1), 231 cast<VPInstruction>(V2), IAI); 232 233 unsigned Score = 0; 234 for (unsigned I = 0, EV1 = cast<VPUser>(V1)->getNumOperands(); I < EV1; ++I) 235 for (unsigned J = 0, EV2 = cast<VPUser>(V2)->getNumOperands(); J < EV2; ++J) 236 Score += getLAScore(cast<VPUser>(V1)->getOperand(I), 237 cast<VPUser>(V2)->getOperand(J), MaxLevel - 1, IAI); 238 return Score; 239 } 240 241 std::pair<VPlanSlp::OpMode, VPValue *> 242 VPlanSlp::getBest(OpMode Mode, VPValue *Last, 243 SmallPtrSetImpl<VPValue *> &Candidates, 244 VPInterleavedAccessInfo &IAI) { 245 assert((Mode == OpMode::Load || Mode == OpMode::Opcode) && 246 "Currently we only handle load and commutative opcodes"); 247 LLVM_DEBUG(dbgs() << " getBest\n"); 248 249 SmallVector<VPValue *, 4> BestCandidates; 250 LLVM_DEBUG(dbgs() << " Candidates for " 251 << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " "); 252 for (auto *Candidate : Candidates) { 253 auto *LastI = cast<VPInstruction>(Last); 254 auto *CandidateI = cast<VPInstruction>(Candidate); 255 if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) { 256 LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr() 257 << " "); 258 BestCandidates.push_back(Candidate); 259 } 260 } 261 LLVM_DEBUG(dbgs() << "\n"); 262 263 if (BestCandidates.empty()) 264 return {OpMode::Failed, nullptr}; 265 266 if (BestCandidates.size() == 1) 267 return {Mode, BestCandidates[0]}; 268 269 VPValue *Best = nullptr; 270 unsigned BestScore = 0; 271 for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) { 272 unsigned PrevScore = ~0u; 273 bool AllSame = true; 274 275 // FIXME: Avoid visiting the same operands multiple times. 276 for (auto *Candidate : BestCandidates) { 277 unsigned Score = getLAScore(Last, Candidate, Depth, IAI); 278 if (PrevScore == ~0u) 279 PrevScore = Score; 280 if (PrevScore != Score) 281 AllSame = false; 282 PrevScore = Score; 283 284 if (Score > BestScore) { 285 BestScore = Score; 286 Best = Candidate; 287 } 288 } 289 if (!AllSame) 290 break; 291 } 292 LLVM_DEBUG(dbgs() << "Found best " 293 << *cast<VPInstruction>(Best)->getUnderlyingInstr() 294 << "\n"); 295 Candidates.erase(Best); 296 297 return {Mode, Best}; 298 } 299 300 SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() { 301 SmallVector<MultiNodeOpTy, 4> FinalOrder; 302 SmallVector<OpMode, 4> Mode; 303 FinalOrder.reserve(MultiNodeOps.size()); 304 Mode.reserve(MultiNodeOps.size()); 305 306 LLVM_DEBUG(dbgs() << "Reordering multinode\n"); 307 308 for (auto &Operands : MultiNodeOps) { 309 FinalOrder.push_back({Operands.first, {Operands.second[0]}}); 310 if (cast<VPInstruction>(Operands.second[0])->getOpcode() == 311 Instruction::Load) 312 Mode.push_back(OpMode::Load); 313 else 314 Mode.push_back(OpMode::Opcode); 315 } 316 317 for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) { 318 LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane << "\n"); 319 SmallPtrSet<VPValue *, 4> Candidates; 320 LLVM_DEBUG(dbgs() << " Candidates "); 321 for (auto Ops : MultiNodeOps) { 322 LLVM_DEBUG( 323 dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr() 324 << " "); 325 Candidates.insert(Ops.second[Lane]); 326 } 327 LLVM_DEBUG(dbgs() << "\n"); 328 329 for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) { 330 LLVM_DEBUG(dbgs() << " Checking " << Op << "\n"); 331 if (Mode[Op] == OpMode::Failed) 332 continue; 333 334 VPValue *Last = FinalOrder[Op].second[Lane - 1]; 335 std::pair<OpMode, VPValue *> Res = 336 getBest(Mode[Op], Last, Candidates, IAI); 337 if (Res.second) 338 FinalOrder[Op].second.push_back(Res.second); 339 else 340 // TODO: handle this case 341 FinalOrder[Op].second.push_back(markFailed()); 342 } 343 } 344 345 return FinalOrder; 346 } 347 348 void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) { 349 dbgs() << " Ops: "; 350 for (auto Op : Values) 351 if (auto *Instr = cast_or_null<VPInstruction>(Op)->getUnderlyingInstr()) 352 dbgs() << *Instr << " | "; 353 else 354 dbgs() << " nullptr | "; 355 dbgs() << "\n"; 356 } 357 358 VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) { 359 assert(!Values.empty() && "Need some operands!"); 360 361 // If we already visited this instruction bundle, re-use the existing node 362 auto I = BundleToCombined.find(to_vector<4>(Values)); 363 if (I != BundleToCombined.end()) { 364 #ifndef NDEBUG 365 // Check that the resulting graph is a tree. If we re-use a node, this means 366 // its values have multiple users. We only allow this, if all users of each 367 // value are the same instruction. 368 for (auto *V : Values) { 369 auto UI = V->user_begin(); 370 auto *FirstUser = *UI++; 371 while (UI != V->user_end()) { 372 assert(*UI == FirstUser && "Currently we only support SLP trees."); 373 UI++; 374 } 375 } 376 #endif 377 return I->second; 378 } 379 380 // Dump inputs 381 LLVM_DEBUG({ 382 dbgs() << "buildGraph: "; 383 dumpBundle(Values); 384 }); 385 386 if (!areVectorizable(Values)) 387 return markFailed(); 388 389 assert(getOpcode(Values) && "Opcodes for all values must match"); 390 unsigned ValuesOpcode = getOpcode(Values).getValue(); 391 392 SmallVector<VPValue *, 4> CombinedOperands; 393 if (areCommutative(Values)) { 394 bool MultiNodeRoot = !MultiNodeActive; 395 MultiNodeActive = true; 396 for (auto &Operands : getOperands(Values)) { 397 LLVM_DEBUG({ 398 dbgs() << " Visiting Commutative"; 399 dumpBundle(Operands); 400 }); 401 402 auto OperandsOpcode = getOpcode(Operands); 403 if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) { 404 LLVM_DEBUG(dbgs() << " Same opcode, continue building\n"); 405 CombinedOperands.push_back(buildGraph(Operands)); 406 } else { 407 LLVM_DEBUG(dbgs() << " Adding multinode Ops\n"); 408 // Create dummy VPInstruction, which will we replace later by the 409 // re-ordered operand. 410 VPInstruction *Op = new VPInstruction(0, {}); 411 CombinedOperands.push_back(Op); 412 MultiNodeOps.emplace_back(Op, Operands); 413 } 414 } 415 416 if (MultiNodeRoot) { 417 LLVM_DEBUG(dbgs() << "Reorder \n"); 418 MultiNodeActive = false; 419 420 auto FinalOrder = reorderMultiNodeOps(); 421 422 MultiNodeOps.clear(); 423 for (auto &Ops : FinalOrder) { 424 VPInstruction *NewOp = buildGraph(Ops.second); 425 Ops.first->replaceAllUsesWith(NewOp); 426 for (unsigned i = 0; i < CombinedOperands.size(); i++) 427 if (CombinedOperands[i] == Ops.first) 428 CombinedOperands[i] = NewOp; 429 delete Ops.first; 430 Ops.first = NewOp; 431 } 432 LLVM_DEBUG(dbgs() << "Found final order\n"); 433 } 434 } else { 435 LLVM_DEBUG(dbgs() << " NonCommuntative\n"); 436 if (ValuesOpcode == Instruction::Load) 437 for (VPValue *V : Values) 438 CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0)); 439 else 440 for (auto &Operands : getOperands(Values)) 441 CombinedOperands.push_back(buildGraph(Operands)); 442 } 443 444 unsigned Opcode; 445 switch (ValuesOpcode) { 446 case Instruction::Load: 447 Opcode = VPInstruction::SLPLoad; 448 break; 449 case Instruction::Store: 450 Opcode = VPInstruction::SLPStore; 451 break; 452 default: 453 Opcode = ValuesOpcode; 454 break; 455 } 456 457 if (!CompletelySLP) 458 return markFailed(); 459 460 assert(CombinedOperands.size() > 0 && "Need more some operands"); 461 auto *VPI = new VPInstruction(Opcode, CombinedOperands); 462 VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr()); 463 464 LLVM_DEBUG(dbgs() << "Create VPInstruction "; VPI->print(dbgs()); 465 cast<VPInstruction>(Values[0])->print(dbgs()); dbgs() << "\n"); 466 addCombined(Values, VPI); 467 return VPI; 468 } 469