//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that ARM uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "ARMISelLowering.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMCallingConv.h"
#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMPerfectShuffle.h"
#include "ARMRegisterInfo.h"
#include "ARMSelectionDAGInfo.h"
#include "ARMSubtarget.h"
#include "ARMTargetTransformInfo.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "Utils/ARMBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSchedule.h"
"llvm/MC/MCSchedule.h" 92 #include "llvm/Support/AtomicOrdering.h" 93 #include "llvm/Support/BranchProbability.h" 94 #include "llvm/Support/Casting.h" 95 #include "llvm/Support/CodeGen.h" 96 #include "llvm/Support/CommandLine.h" 97 #include "llvm/Support/Compiler.h" 98 #include "llvm/Support/Debug.h" 99 #include "llvm/Support/ErrorHandling.h" 100 #include "llvm/Support/KnownBits.h" 101 #include "llvm/Support/MachineValueType.h" 102 #include "llvm/Support/MathExtras.h" 103 #include "llvm/Support/raw_ostream.h" 104 #include "llvm/Target/TargetMachine.h" 105 #include "llvm/Target/TargetOptions.h" 106 #include <algorithm> 107 #include <cassert> 108 #include <cstdint> 109 #include <cstdlib> 110 #include <iterator> 111 #include <limits> 112 #include <string> 113 #include <tuple> 114 #include <utility> 115 #include <vector> 116 117 using namespace llvm; 118 using namespace llvm::PatternMatch; 119 120 #define DEBUG_TYPE "arm-isel" 121 122 STATISTIC(NumTailCalls, "Number of tail calls"); 123 STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt"); 124 STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments"); 125 STATISTIC(NumConstpoolPromoted, 126 "Number of constants with their storage promoted into constant pools"); 127 128 static cl::opt<bool> 129 ARMInterworking("arm-interworking", cl::Hidden, 130 cl::desc("Enable / disable ARM interworking (for debugging only)"), 131 cl::init(true)); 132 133 static cl::opt<bool> EnableConstpoolPromotion( 134 "arm-promote-constant", cl::Hidden, 135 cl::desc("Enable / disable promotion of unnamed_addr constants into " 136 "constant pools"), 137 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed 138 static cl::opt<unsigned> ConstpoolPromotionMaxSize( 139 "arm-promote-constant-max-size", cl::Hidden, 140 cl::desc("Maximum size of constant to promote into a constant pool"), 141 cl::init(64)); 142 static cl::opt<unsigned> ConstpoolPromotionMaxTotal( 143 "arm-promote-constant-max-total", cl::Hidden, 144 cl::desc("Maximum size of ALL constants to promote into a constant pool"), 145 cl::init(128)); 146 147 cl::opt<unsigned> 148 MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, 149 cl::desc("Maximum interleave factor for MVE VLDn to generate."), 150 cl::init(2)); 151 152 // The APCS parameter registers. 
static const MCPhysReg GPRArgRegs[] = {
  ARM::R0, ARM::R1, ARM::R2, ARM::R3
};

void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
  if (VT != PromotedLdStVT) {
    setOperationAction(ISD::LOAD, VT, Promote);
    AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);

    setOperationAction(ISD::STORE, VT, Promote);
    AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
  }

  MVT ElemTy = VT.getVectorElementType();
  if (ElemTy != MVT::f64)
    setOperationAction(ISD::SETCC, VT, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
  if (ElemTy == MVT::i32) {
    setOperationAction(ISD::SINT_TO_FP, VT, Custom);
    setOperationAction(ISD::UINT_TO_FP, VT, Custom);
    setOperationAction(ISD::FP_TO_SINT, VT, Custom);
    setOperationAction(ISD::FP_TO_UINT, VT, Custom);
  } else {
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
  }
  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
  setOperationAction(ISD::SELECT, VT, Expand);
  setOperationAction(ISD::SELECT_CC, VT, Expand);
  setOperationAction(ISD::VSELECT, VT, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
  if (VT.isInteger()) {
    setOperationAction(ISD::SHL, VT, Custom);
    setOperationAction(ISD::SRA, VT, Custom);
    setOperationAction(ISD::SRL, VT, Custom);
  }

  // Neon does not support vector divide/remainder operations.
  setOperationAction(ISD::SDIV, VT, Expand);
  setOperationAction(ISD::UDIV, VT, Expand);
  setOperationAction(ISD::FDIV, VT, Expand);
  setOperationAction(ISD::SREM, VT, Expand);
  setOperationAction(ISD::UREM, VT, Expand);
  setOperationAction(ISD::FREM, VT, Expand);
  setOperationAction(ISD::SDIVREM, VT, Expand);
  setOperationAction(ISD::UDIVREM, VT, Expand);

  if (!VT.isFloatingPoint() &&
      VT != MVT::v2i64 && VT != MVT::v1i64)
    for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
      setOperationAction(Opcode, VT, Legal);
  if (!VT.isFloatingPoint())
    for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
      setOperationAction(Opcode, VT, Legal);
}

void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPRRegClass);
  addTypeForNEON(VT, MVT::f64);
}

void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPairRegClass);
  addTypeForNEON(VT, MVT::v2f64);
}

void ARMTargetLowering::setAllExpand(MVT VT) {
  for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
    setOperationAction(Opc, VT, Expand);

  // We support these really simple operations even on types where all
  // the actual arithmetic has to be broken down into simpler
  // operations or turned into library calls.
  setOperationAction(ISD::BITCAST, VT, Legal);
  setOperationAction(ISD::LOAD, VT, Legal);
  setOperationAction(ISD::STORE, VT, Legal);
  setOperationAction(ISD::UNDEF, VT, Legal);
}

void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
                                       LegalizeAction Action) {
  setLoadExtAction(ISD::EXTLOAD, From, To, Action);
  setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
  setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
}

void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
  const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };

  for (auto VT : IntTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::SHL, VT, Custom);
    setOperationAction(ISD::SRA, VT, Custom);
    setOperationAction(ISD::SRL, VT, Custom);
    setOperationAction(ISD::SMIN, VT, Legal);
    setOperationAction(ISD::SMAX, VT, Legal);
    setOperationAction(ISD::UMIN, VT, Legal);
    setOperationAction(ISD::UMAX, VT, Legal);
    setOperationAction(ISD::ABS, VT, Legal);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::MLOAD, VT, Custom);
    setOperationAction(ISD::MSTORE, VT, Legal);
    setOperationAction(ISD::CTLZ, VT, Legal);
    setOperationAction(ISD::CTTZ, VT, Custom);
    setOperationAction(ISD::BITREVERSE, VT, Legal);
    setOperationAction(ISD::BSWAP, VT, Legal);
    setOperationAction(ISD::SADDSAT, VT, Legal);
    setOperationAction(ISD::UADDSAT, VT, Legal);
    setOperationAction(ISD::SSUBSAT, VT, Legal);
    setOperationAction(ISD::USUBSAT, VT, Legal);
    setOperationAction(ISD::ABDS, VT, Legal);
    setOperationAction(ISD::ABDU, VT, Legal);

    // No native support for these.
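    // Expand scalarizes the vector operation; the resulting scalar divides
    // then use hardware SDIV/UDIV or the usual helper libcalls, depending on
    // the subtarget.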
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::SELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);

    // Vector reductions
    setOperationAction(ISD::VECREDUCE_ADD, VT, Legal);
    setOperationAction(ISD::VECREDUCE_SMAX, VT, Legal);
    setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal);
    setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal);
    setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal);
    setOperationAction(ISD::VECREDUCE_MUL, VT, Custom);
    setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
    setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
    setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);

    if (!HasMVEFP) {
      setOperationAction(ISD::SINT_TO_FP, VT, Expand);
      setOperationAction(ISD::UINT_TO_FP, VT, Expand);
      setOperationAction(ISD::FP_TO_SINT, VT, Expand);
      setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    }

    // Pre and Post inc are supported on loads and stores
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
      setIndexedMaskedLoadAction(im, VT, Legal);
      setIndexedMaskedStoreAction(im, VT, Legal);
    }
  }

  const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
  for (auto VT : FloatTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    if (!HasMVEFP)
      setAllExpand(VT);

    // These are legal or custom whether we have MVE.fp or not
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getVectorElementType(), Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::MLOAD, VT, Custom);
    setOperationAction(ISD::MSTORE, VT, Legal);
    setOperationAction(ISD::SELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);

    // Pre and Post inc are supported on loads and stores
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
      setIndexedMaskedLoadAction(im, VT, Legal);
      setIndexedMaskedStoreAction(im, VT, Legal);
    }

    if (HasMVEFP) {
      setOperationAction(ISD::FMINNUM, VT, Legal);
      setOperationAction(ISD::FMAXNUM, VT, Legal);
      setOperationAction(ISD::FROUND, VT, Legal);
      setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
      setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
      setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);

      // No native support for these.
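      // MVE has no vector FP divide, sqrt or transcendental instructions, so
      // these are broken down and handled element by element.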
      setOperationAction(ISD::FDIV, VT, Expand);
      setOperationAction(ISD::FREM, VT, Expand);
      setOperationAction(ISD::FSQRT, VT, Expand);
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FPOW, VT, Expand);
      setOperationAction(ISD::FLOG, VT, Expand);
      setOperationAction(ISD::FLOG2, VT, Expand);
      setOperationAction(ISD::FLOG10, VT, Expand);
      setOperationAction(ISD::FEXP, VT, Expand);
      setOperationAction(ISD::FEXP2, VT, Expand);
      setOperationAction(ISD::FNEARBYINT, VT, Expand);
    }
  }

  // Custom Expand smaller than legal vector reductions to prevent false zero
  // items being added.
  setOperationAction(ISD::VECREDUCE_FADD, MVT::v4f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMUL, MVT::v4f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMIN, MVT::v4f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMAX, MVT::v4f16, Custom);
  setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMUL, MVT::v2f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom);

  // We 'support' these types up to bitcast/load/store level, regardless of
  // MVE integer-only / float support. Only doing FP data processing on the FP
  // vector types is inhibited at integer-only level.
  const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
  for (auto VT : LongTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    setAllExpand(VT);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  }
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);

  // We can do bitwise operations on v2i64 vectors
  setOperationAction(ISD::AND, MVT::v2i64, Legal);
  setOperationAction(ISD::OR, MVT::v2i64, Legal);
  setOperationAction(ISD::XOR, MVT::v2i64, Legal);

  // It is legal to extload from v4i8 to v4i16 or v4i32.
  addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
  addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
  addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);

  // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i16, Legal);

  // Some truncating stores are legal too.
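  // (The MVE narrowing stores, e.g. VSTRB.16/VSTRB.32 and VSTRH.32, cover
  // these directly.)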
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
  setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);

  // Pre and Post inc on these are legal, given the correct extends
  for (unsigned im = (unsigned)ISD::PRE_INC;
       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
    for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
      setIndexedMaskedLoadAction(im, VT, Legal);
      setIndexedMaskedStoreAction(im, VT, Legal);
    }
  }

  // Predicate types
  const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1};
  for (auto VT : pTypes) {
    addRegisterClass(VT, &ARM::VCCRRegClass);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
    setOperationAction(ISD::LOAD, VT, Custom);
    setOperationAction(ISD::STORE, VT, Custom);
    setOperationAction(ISD::TRUNCATE, VT, Custom);
    setOperationAction(ISD::VSELECT, VT, Expand);
    setOperationAction(ISD::SELECT, VT, Expand);
  }
  setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
  setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
  setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
  setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
  setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
  setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
  setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
}

ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
                                     const ARMSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  RegInfo = Subtarget->getRegisterInfo();
  Itins = Subtarget->getInstrItineraryData();

  setBooleanContents(ZeroOrOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() &&
      !Subtarget->isTargetWatchOS()) {
    bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard;
    for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID)
      setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID),
                            IsHFTarget ? CallingConv::ARM_AAPCS_VFP
                                       : CallingConv::ARM_AAPCS);
  }

  if (Subtarget->isTargetMachO()) {
    // Uses VFP for Thumb libfuncs if available.
    if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
        Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
      static const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const ISD::CondCode Cond;
      } LibraryCalls[] = {
        // Single-precision floating-point arithmetic.
        { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
        { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
        { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
        { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },

        // Double-precision floating-point arithmetic.
        { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
        { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
        { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
        { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },

        // Single-precision comparisons.
        { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE },
        { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE },
        { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE },
        { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE },
        { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE },
        { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE },
        { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE },

        // Double-precision comparisons.
        { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE },
        { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE },
        { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE },
        { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE },
        { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE },
        { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE },
        { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE },

        // Floating-point to integer conversions.
        // i64 conversions are done via library routines even when generating VFP
        // instructions, so use the same ones.
        { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID },
        { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
        { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID },
        { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },

        // Conversions between floating types.
        { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID },
        { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID },

        // Integer to floating-point conversions.
        // i64 conversions are done via library routines even when generating VFP
        // instructions, so use the same ones.
        // FIXME: There appears to be some naming inconsistency in ARM libgcc:
        // e.g., __floatunsidf vs. __floatunssidfvfp.
        { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID },
        { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
        { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID },
        { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
      };

      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        if (LC.Cond != ISD::SETCC_INVALID)
          setCmpLibcallCC(LC.Op, LC.Cond);
      }
    }
  }

  // These libcalls are not available in 32-bit.
  setLibcallName(RTLIB::SHL_I128, nullptr);
  setLibcallName(RTLIB::SRL_I128, nullptr);
  setLibcallName(RTLIB::SRA_I128, nullptr);
  setLibcallName(RTLIB::MUL_I128, nullptr);

  // RTLIB
  if (Subtarget->isAAPCS_ABI() &&
      (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
       Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
      const ISD::CondCode Cond;
    } LibraryCalls[] = {
      // Double-precision floating-point arithmetic helper functions
      // RTABI chapter 4.1.2, Table 2
      { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Double-precision floating-point comparison helper functions
      // RTABI chapter 4.1.2, Table 3
      { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
      { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },

      // Single-precision floating-point arithmetic helper functions
      // RTABI chapter 4.1.2, Table 4
      { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Single-precision floating-point comparison helper functions
      // RTABI chapter 4.1.2, Table 5
      { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
      { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },

      // Floating-point to integer conversions.
      // RTABI chapter 4.1.2, Table 6
      { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Conversions between floating types.
      // RTABI chapter 4.1.2, Table 7
      { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Integer to floating-point conversions.
      // RTABI chapter 4.1.2, Table 8
      { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Long long helper functions
      // RTABI chapter 4.2, Table 9
      { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Integer division functions
      // RTABI chapter 4.3.1
      { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
      if (LC.Cond != ISD::SETCC_INVALID)
        setCmpLibcallCC(LC.Op, LC.Cond);
    }

    // EABI dependent RTLIB
    if (TM.Options.EABIVersion == EABI::EABI4 ||
        TM.Options.EABIVersion == EABI::EABI5) {
      static const struct {
        const RTLIB::Libcall Op;
        const char *const Name;
        const CallingConv::ID CC;
        const ISD::CondCode Cond;
      } MemOpsLibraryCalls[] = {
        // Memory operations
        // RTABI chapter 4.3.4
        { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      };

      for (const auto &LC : MemOpsLibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
        if (LC.Cond != ISD::SETCC_INVALID)
          setCmpLibcallCC(LC.Op, LC.Cond);
      }
    }
  }

  if (Subtarget->isTargetWindows()) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
    } LibraryCalls[] = {
      { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
    }
  }

  // Use divmod compiler-rt calls for iOS 5.0 and later.
  if (Subtarget->isTargetMachO() &&
      !(Subtarget->isTargetIOS() &&
        Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
    setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
    setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
  }

  // The half <-> float conversion functions are always soft-float on
  // non-watchos platforms, but are needed for some targets which use a
  // hard-float calling convention by default.
  if (!Subtarget->isTargetWatchABI()) {
    if (Subtarget->isAAPCS_ABI()) {
      setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
      setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
      setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
    } else {
      setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
      setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
      setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
    }
  }

  // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
  // a __gnu_ prefix (which is the default).
  if (Subtarget->isTargetAEABI()) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
    } LibraryCalls[] = {
      { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS },
      { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS },
      { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
    }
  }

  if (Subtarget->isThumb1Only())
    addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
  else
    addRegisterClass(MVT::i32, &ARM::GPRRegClass);

  if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
      Subtarget->hasFPRegs()) {
    addRegisterClass(MVT::f32, &ARM::SPRRegClass);
    addRegisterClass(MVT::f64, &ARM::DPRRegClass);
    if (!Subtarget->hasVFP2Base())
      setAllExpand(MVT::f32);
    if (!Subtarget->hasFP64())
      setAllExpand(MVT::f64);
  }

  if (Subtarget->hasFullFP16()) {
    addRegisterClass(MVT::f16, &ARM::HPRRegClass);
    setOperationAction(ISD::BITCAST, MVT::i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::f16, Custom);

    setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
  }

  if (Subtarget->hasBF16()) {
    addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
    setAllExpand(MVT::bf16);
    if (!Subtarget->hasFullFP16())
      setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
  }

  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
      setTruncStoreAction(VT, InnerVT, Expand);
      addAllExtLoads(VT, InnerVT, Expand);
    }

    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);

    setOperationAction(ISD::BSWAP, VT, Expand);
  }

  setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
  setOperationAction(ISD::ConstantFP, MVT::f64, Custom);

  setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
  setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);

  if (Subtarget->hasMVEIntegerOps())
    addMVEVectorTypes(Subtarget->hasMVEFloatOps());

  // Combine low-overhead loop intrinsics so that we can lower i1 types.
  if (Subtarget->hasLOB()) {
    setTargetDAGCombine(ISD::BRCOND);
    setTargetDAGCombine(ISD::BR_CC);
  }

  if (Subtarget->hasNEON()) {
    addDRTypeForNEON(MVT::v2f32);
    addDRTypeForNEON(MVT::v8i8);
    addDRTypeForNEON(MVT::v4i16);
    addDRTypeForNEON(MVT::v2i32);
    addDRTypeForNEON(MVT::v1i64);

    addQRTypeForNEON(MVT::v4f32);
    addQRTypeForNEON(MVT::v2f64);
    addQRTypeForNEON(MVT::v16i8);
    addQRTypeForNEON(MVT::v8i16);
    addQRTypeForNEON(MVT::v4i32);
    addQRTypeForNEON(MVT::v2i64);

    if (Subtarget->hasFullFP16()) {
      addQRTypeForNEON(MVT::v8f16);
      addDRTypeForNEON(MVT::v4f16);
    }

    if (Subtarget->hasBF16()) {
      addQRTypeForNEON(MVT::v8bf16);
      addDRTypeForNEON(MVT::v4bf16);
    }
  }

  if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
    // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
    // none of Neon, MVE or VFP supports any arithmetic operations on it.
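    // Expand everything so each v2f64 operation is scalarized into f64 ops.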
    setOperationAction(ISD::FADD, MVT::v2f64, Expand);
    setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
    setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
    // FIXME: Code duplication: FDIV and FREM are expanded always, see
    // ARMTargetLowering::addTypeForNEON method for details.
    setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
    setOperationAction(ISD::FREM, MVT::v2f64, Expand);
    // FIXME: Create unittest.
    // In other words, find a way when "copysign" appears in DAG with vector
    // operands.
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
    // FIXME: Code duplication: SETCC has custom operation action, see
    // ARMTargetLowering::addTypeForNEON method for details.
    setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
    // FIXME: Create unittest for FNEG and for FABS.
    setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
    setOperationAction(ISD::FABS, MVT::v2f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
    setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
    setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
    setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
    setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
    setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
    // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
    setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
    setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
    setOperationAction(ISD::FMA, MVT::v2f64, Expand);
  }

  if (Subtarget->hasNEON()) {
    // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
    // supported for v4f32.
    setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
    setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
    setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
    setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
    setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);

    // Mark v2f32 intrinsics.
    setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
    setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
    setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
    setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
    setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
    setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
    setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
    setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);

    // Neon does not support some operations on v1i64 and v2i64 types.
    setOperationAction(ISD::MUL, MVT::v1i64, Expand);
    // Custom handling for some quad-vector types to detect VMULL.
    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    // Custom handling for some vector types to avoid expensive expansions
    setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
    setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
    setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
    setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
    // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
    // a destination type that is wider than the source, nor does it have a
    // FP_TO_[SU]INT instruction with a narrower destination than source.
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);

    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);

    // NEON does not have single instruction CTPOP for vectors with element
    // types wider than 8-bits. However, custom lowering can leverage the
    // v8i8/v16i8 vcnt instruction.
    setOperationAction(ISD::CTPOP, MVT::v2i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i16, Custom);
    setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
    setOperationAction(ISD::CTPOP, MVT::v1i64, Custom);
    setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);

    setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);

    // NEON does not have single instruction CTTZ for vectors.
    setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
    setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
    setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);

    setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);

    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);

    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);

    for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
      setOperationAction(ISD::MULHS, VT, Expand);
      setOperationAction(ISD::MULHU, VT, Expand);
    }

    // NEON only has FMA instructions as of VFP4.
    if (!Subtarget->hasVFP4Base()) {
      setOperationAction(ISD::FMA, MVT::v2f32, Expand);
      setOperationAction(ISD::FMA, MVT::v4f32, Expand);
    }

    setTargetDAGCombine(ISD::SHL);
    setTargetDAGCombine(ISD::SRL);
    setTargetDAGCombine(ISD::SRA);
    setTargetDAGCombine(ISD::FP_TO_SINT);
    setTargetDAGCombine(ISD::FP_TO_UINT);
    setTargetDAGCombine(ISD::FDIV);
    setTargetDAGCombine(ISD::LOAD);

    // It is legal to extload from v4i8 to v4i16 or v4i32.
    for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
                   MVT::v2i32}) {
      for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
        setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
        setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
        setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
      }
    }
  }

  if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
    setTargetDAGCombine(ISD::BUILD_VECTOR);
    setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
    setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
    setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
    setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
    setTargetDAGCombine(ISD::STORE);
    setTargetDAGCombine(ISD::SIGN_EXTEND);
    setTargetDAGCombine(ISD::ZERO_EXTEND);
    setTargetDAGCombine(ISD::ANY_EXTEND);
    setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
    setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
    setTargetDAGCombine(ISD::INTRINSIC_VOID);
    setTargetDAGCombine(ISD::VECREDUCE_ADD);
    setTargetDAGCombine(ISD::ADD);
    setTargetDAGCombine(ISD::BITCAST);
  }
  if (Subtarget->hasMVEIntegerOps()) {
    setTargetDAGCombine(ISD::SMIN);
    setTargetDAGCombine(ISD::UMIN);
    setTargetDAGCombine(ISD::SMAX);
    setTargetDAGCombine(ISD::UMAX);
    setTargetDAGCombine(ISD::FP_EXTEND);
    setTargetDAGCombine(ISD::SELECT);
    setTargetDAGCombine(ISD::SELECT_CC);
  }

  if (!Subtarget->hasFP64()) {
    // When targeting a floating-point unit with only single-precision
    // operations, f64 is legal for the few double-precision instructions which
    // are present. However, no double-precision operations other than moves,
    // loads and stores are provided by the hardware.
    setOperationAction(ISD::FADD, MVT::f64, Expand);
    setOperationAction(ISD::FSUB, MVT::f64, Expand);
    setOperationAction(ISD::FMUL, MVT::f64, Expand);
    setOperationAction(ISD::FMA, MVT::f64, Expand);
    setOperationAction(ISD::FDIV, MVT::f64, Expand);
    setOperationAction(ISD::FREM, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FGETSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FNEG, MVT::f64, Expand);
    setOperationAction(ISD::FABS, MVT::f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FPOW, MVT::f64, Expand);
    setOperationAction(ISD::FLOG, MVT::f64, Expand);
    setOperationAction(ISD::FLOG2, MVT::f64, Expand);
    setOperationAction(ISD::FLOG10, MVT::f64, Expand);
    setOperationAction(ISD::FEXP, MVT::f64, Expand);
    setOperationAction(ISD::FEXP2, MVT::f64, Expand);
    setOperationAction(ISD::FCEIL, MVT::f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f64, Expand);
    setOperationAction(ISD::FRINT, MVT::f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::f64, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::f64, Custom);
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::f64, Custom);
    setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
  }

  if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
    setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
    if (Subtarget->hasFullFP16()) {
      setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
      setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
    }
  }

  if (!Subtarget->hasFP16()) {
    setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
  }

  computeRegisterProperties(Subtarget->getRegisterInfo());

  // ARM does not have floating-point extending loads.
  for (MVT VT : MVT::fp_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
  }

  // ... or truncating stores
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);

  // ARM does not have i1 sign extending load.
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // ARM supports all 4 flavors of integer indexed load / store.
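  // That is, pre/post-increment and pre/post-decrement addressing with
  // writeback. Thumb-1 is far more limited and is handled in the else branch.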
  if (!Subtarget->isThumb1Only()) {
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, MVT::i1, Legal);
      setIndexedLoadAction(im, MVT::i8, Legal);
      setIndexedLoadAction(im, MVT::i16, Legal);
      setIndexedLoadAction(im, MVT::i32, Legal);
      setIndexedStoreAction(im, MVT::i1, Legal);
      setIndexedStoreAction(im, MVT::i8, Legal);
      setIndexedStoreAction(im, MVT::i16, Legal);
      setIndexedStoreAction(im, MVT::i32, Legal);
    }
  } else {
    // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
    setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal);
    setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal);
  }

  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);

  setOperationAction(ISD::ADDCARRY, MVT::i32, Custom);
  setOperationAction(ISD::SUBCARRY, MVT::i32, Custom);
  if (Subtarget->hasDSP()) {
    setOperationAction(ISD::SADDSAT, MVT::i8, Custom);
    setOperationAction(ISD::SSUBSAT, MVT::i8, Custom);
    setOperationAction(ISD::SADDSAT, MVT::i16, Custom);
    setOperationAction(ISD::SSUBSAT, MVT::i16, Custom);
    setOperationAction(ISD::UADDSAT, MVT::i8, Custom);
    setOperationAction(ISD::USUBSAT, MVT::i8, Custom);
    setOperationAction(ISD::UADDSAT, MVT::i16, Custom);
    setOperationAction(ISD::USUBSAT, MVT::i16, Custom);
  }
  if (Subtarget->hasBaseDSP()) {
    setOperationAction(ISD::SADDSAT, MVT::i32, Legal);
    setOperationAction(ISD::SSUBSAT, MVT::i32, Legal);
  }

  // i64 operation support.
  setOperationAction(ISD::MUL, MVT::i64, Expand);
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  if (Subtarget->isThumb1Only()) {
    setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
    setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
  }
  if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
      || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
    setOperationAction(ISD::MULHS, MVT::i32, Expand);

  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL, MVT::i64, Custom);
  setOperationAction(ISD::SRA, MVT::i64, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
  setOperationAction(ISD::LOAD, MVT::i64, Custom);
  setOperationAction(ISD::STORE, MVT::i64, Custom);

  // MVE lowers 64 bit shifts to lsll and lsrl
  // assuming that ISD::SRL and SRA of i64 are already marked custom
  if (Subtarget->hasMVEIntegerOps())
    setOperationAction(ISD::SHL, MVT::i64, Custom);

  // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
  if (Subtarget->isThumb1Only()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
    setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
    setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
  }

  if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
    setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);

  // ARM does not have ROTL.
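  // Rotate-left is typically rewritten in terms of ROTR (the ROR instruction)
  // with an adjusted shift amount once it has been expanded.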
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
  }
  setOperationAction(ISD::CTTZ, MVT::i32, Custom);
  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
    setOperationAction(ISD::CTLZ, MVT::i32, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall);
  }

  // @llvm.readcyclecounter requires the Performance Monitors extension.
  // Default to the 0 expansion on unsupported platforms.
  // FIXME: Technically there are older ARM CPUs that have
  // implementation-specific ways of obtaining this information.
  if (Subtarget->hasPerfMon())
    setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  // Only ARMv6 has BSWAP.
  if (!Subtarget->hasV6Ops())
    setOperationAction(ISD::BSWAP, MVT::i32, Expand);

  bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
                                        : Subtarget->hasDivideInARMMode();
  if (!hasDivide) {
    // These are expanded into libcalls if the cpu doesn't have HW divider.
    setOperationAction(ISD::SDIV, MVT::i32, LibCall);
    setOperationAction(ISD::UDIV, MVT::i32, LibCall);
  }

  if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) {
    setOperationAction(ISD::SDIV, MVT::i32, Custom);
    setOperationAction(ISD::UDIV, MVT::i32, Custom);

    setOperationAction(ISD::SDIV, MVT::i64, Custom);
    setOperationAction(ISD::UDIV, MVT::i64, Custom);
  }

  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);

  // Register based DivRem for AEABI (RTABI 4.2)
  if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
      Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
      Subtarget->isTargetWindows()) {
    setOperationAction(ISD::SREM, MVT::i64, Custom);
    setOperationAction(ISD::UREM, MVT::i64, Custom);
    HasStandaloneRem = false;

    if (Subtarget->isTargetWindows()) {
      const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const CallingConv::ID CC;
      } LibraryCalls[] = {
        { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS },

        { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS },
      };

      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
      }
    } else {
      const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const CallingConv::ID CC;
      } LibraryCalls[] = {
        { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS },

        { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
"__aeabi_uidivmod", CallingConv::ARM_AAPCS }, 1244 { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, 1245 { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS }, 1246 }; 1247 1248 for (const auto &LC : LibraryCalls) { 1249 setLibcallName(LC.Op, LC.Name); 1250 setLibcallCallingConv(LC.Op, LC.CC); 1251 } 1252 } 1253 1254 setOperationAction(ISD::SDIVREM, MVT::i32, Custom); 1255 setOperationAction(ISD::UDIVREM, MVT::i32, Custom); 1256 setOperationAction(ISD::SDIVREM, MVT::i64, Custom); 1257 setOperationAction(ISD::UDIVREM, MVT::i64, Custom); 1258 } else { 1259 setOperationAction(ISD::SDIVREM, MVT::i32, Expand); 1260 setOperationAction(ISD::UDIVREM, MVT::i32, Expand); 1261 } 1262 1263 if (Subtarget->getTargetTriple().isOSMSVCRT()) { 1264 // MSVCRT doesn't have powi; fall back to pow 1265 setLibcallName(RTLIB::POWI_F32, nullptr); 1266 setLibcallName(RTLIB::POWI_F64, nullptr); 1267 } 1268 1269 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); 1270 setOperationAction(ISD::ConstantPool, MVT::i32, Custom); 1271 setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); 1272 setOperationAction(ISD::BlockAddress, MVT::i32, Custom); 1273 1274 setOperationAction(ISD::TRAP, MVT::Other, Legal); 1275 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); 1276 1277 // Use the default implementation. 1278 setOperationAction(ISD::VASTART, MVT::Other, Custom); 1279 setOperationAction(ISD::VAARG, MVT::Other, Expand); 1280 setOperationAction(ISD::VACOPY, MVT::Other, Expand); 1281 setOperationAction(ISD::VAEND, MVT::Other, Expand); 1282 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 1283 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 1284 1285 if (Subtarget->isTargetWindows()) 1286 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); 1287 else 1288 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); 1289 1290 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use 1291 // the default expansion. 1292 InsertFencesForAtomic = false; 1293 if (Subtarget->hasAnyDataBarrier() && 1294 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) { 1295 // ATOMIC_FENCE needs custom lowering; the others should have been expanded 1296 // to ldrex/strex loops already. 1297 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); 1298 if (!Subtarget->isThumb() || !Subtarget->isMClass()) 1299 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); 1300 1301 // On v8, we have particularly efficient implementations of atomic fences 1302 // if they can be combined with nearby atomic loads and stores. 1303 if (!Subtarget->hasAcquireRelease() || 1304 getTargetMachine().getOptLevel() == 0) { 1305 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc. 1306 InsertFencesForAtomic = true; 1307 } 1308 } else { 1309 // If there's anything we can use as a barrier, go through custom lowering 1310 // for ATOMIC_FENCE. 1311 // If target has DMB in thumb, Fences can be inserted. 1312 if (Subtarget->hasDataBarrier()) 1313 InsertFencesForAtomic = true; 1314 1315 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, 1316 Subtarget->hasAnyDataBarrier() ? Custom : Expand); 1317 1318 // Set them all for expansion, which will force libcalls. 
1319 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand); 1320 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand); 1321 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand); 1322 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand); 1323 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand); 1324 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand); 1325 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand); 1326 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand); 1327 setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand); 1328 setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand); 1329 setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand); 1330 setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand); 1331 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the 1332 // Unordered/Monotonic case. 1333 if (!InsertFencesForAtomic) { 1334 setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom); 1335 setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom); 1336 } 1337 } 1338 1339 setOperationAction(ISD::PREFETCH, MVT::Other, Custom); 1340 1341 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes. 1342 if (!Subtarget->hasV6Ops()) { 1343 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); 1344 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); 1345 } 1346 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); 1347 1348 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() && 1349 !Subtarget->isThumb1Only()) { 1350 // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR 1351 // iff target supports vfp2. 1352 setOperationAction(ISD::BITCAST, MVT::i64, Custom); 1353 setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); 1354 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom); 1355 } 1356 1357 // We want to custom lower some of our intrinsics. 
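// (Chiefly the SjLj exception-handling intrinsics set up just below, plus
// assorted NEON/MVE intrinsics that the INTRINSIC_WO_CHAIN lowering maps
// onto ARMISD nodes.)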
1358 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 1359 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); 1360 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); 1361 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); 1362 if (Subtarget->useSjLjEH()) 1363 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); 1364 1365 setOperationAction(ISD::SETCC, MVT::i32, Expand); 1366 setOperationAction(ISD::SETCC, MVT::f32, Expand); 1367 setOperationAction(ISD::SETCC, MVT::f64, Expand); 1368 setOperationAction(ISD::SELECT, MVT::i32, Custom); 1369 setOperationAction(ISD::SELECT, MVT::f32, Custom); 1370 setOperationAction(ISD::SELECT, MVT::f64, Custom); 1371 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); 1372 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); 1373 setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); 1374 if (Subtarget->hasFullFP16()) { 1375 setOperationAction(ISD::SETCC, MVT::f16, Expand); 1376 setOperationAction(ISD::SELECT, MVT::f16, Custom); 1377 setOperationAction(ISD::SELECT_CC, MVT::f16, Custom); 1378 } 1379 1380 setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom); 1381 1382 setOperationAction(ISD::BRCOND, MVT::Other, Custom); 1383 setOperationAction(ISD::BR_CC, MVT::i32, Custom); 1384 if (Subtarget->hasFullFP16()) 1385 setOperationAction(ISD::BR_CC, MVT::f16, Custom); 1386 setOperationAction(ISD::BR_CC, MVT::f32, Custom); 1387 setOperationAction(ISD::BR_CC, MVT::f64, Custom); 1388 setOperationAction(ISD::BR_JT, MVT::Other, Custom); 1389 1390 // We don't support sin/cos/fmod/copysign/pow 1391 setOperationAction(ISD::FSIN, MVT::f64, Expand); 1392 setOperationAction(ISD::FSIN, MVT::f32, Expand); 1393 setOperationAction(ISD::FCOS, MVT::f32, Expand); 1394 setOperationAction(ISD::FCOS, MVT::f64, Expand); 1395 setOperationAction(ISD::FSINCOS, MVT::f64, Expand); 1396 setOperationAction(ISD::FSINCOS, MVT::f32, Expand); 1397 setOperationAction(ISD::FREM, MVT::f64, Expand); 1398 setOperationAction(ISD::FREM, MVT::f32, Expand); 1399 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() && 1400 !Subtarget->isThumb1Only()) { 1401 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 1402 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 1403 } 1404 setOperationAction(ISD::FPOW, MVT::f64, Expand); 1405 setOperationAction(ISD::FPOW, MVT::f32, Expand); 1406 1407 if (!Subtarget->hasVFP4Base()) { 1408 setOperationAction(ISD::FMA, MVT::f64, Expand); 1409 setOperationAction(ISD::FMA, MVT::f32, Expand); 1410 } 1411 1412 // Various VFP goodness 1413 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) { 1414 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded. 1415 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) { 1416 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); 1417 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); 1418 } 1419 1420 // fp16 is a special v7 extension that adds f16 <-> f32 conversions. 1421 if (!Subtarget->hasFP16()) { 1422 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); 1423 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); 1424 } 1425 1426 // Strict floating-point comparisons need custom lowering. 
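// Roughly: a quiet STRICT_FSETCC becomes a vcmp and the signalling
// STRICT_FSETCCS a vcmpe, with the resulting flags moved to APSR via FMSTAT,
// mirroring the ordinary SETCC path.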
1427 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom); 1428 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom); 1429 setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom); 1430 setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom); 1431 setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom); 1432 setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); 1433 } 1434 1435 // Use __sincos_stret if available. 1436 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && 1437 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { 1438 setOperationAction(ISD::FSINCOS, MVT::f64, Custom); 1439 setOperationAction(ISD::FSINCOS, MVT::f32, Custom); 1440 } 1441 1442 // FP-ARMv8 implements a lot of rounding-like FP operations. 1443 if (Subtarget->hasFPARMv8Base()) { 1444 setOperationAction(ISD::FFLOOR, MVT::f32, Legal); 1445 setOperationAction(ISD::FCEIL, MVT::f32, Legal); 1446 setOperationAction(ISD::FROUND, MVT::f32, Legal); 1447 setOperationAction(ISD::FTRUNC, MVT::f32, Legal); 1448 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); 1449 setOperationAction(ISD::FRINT, MVT::f32, Legal); 1450 setOperationAction(ISD::FMINNUM, MVT::f32, Legal); 1451 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); 1452 if (Subtarget->hasNEON()) { 1453 setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal); 1454 setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal); 1455 setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); 1456 setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); 1457 } 1458 1459 if (Subtarget->hasFP64()) { 1460 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 1461 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 1462 setOperationAction(ISD::FROUND, MVT::f64, Legal); 1463 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 1464 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); 1465 setOperationAction(ISD::FRINT, MVT::f64, Legal); 1466 setOperationAction(ISD::FMINNUM, MVT::f64, Legal); 1467 setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); 1468 } 1469 } 1470 1471 // FP16 often need to be promoted to call lib functions 1472 if (Subtarget->hasFullFP16()) { 1473 setOperationAction(ISD::FREM, MVT::f16, Promote); 1474 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); 1475 setOperationAction(ISD::FSIN, MVT::f16, Promote); 1476 setOperationAction(ISD::FCOS, MVT::f16, Promote); 1477 setOperationAction(ISD::FSINCOS, MVT::f16, Promote); 1478 setOperationAction(ISD::FPOWI, MVT::f16, Promote); 1479 setOperationAction(ISD::FPOW, MVT::f16, Promote); 1480 setOperationAction(ISD::FEXP, MVT::f16, Promote); 1481 setOperationAction(ISD::FEXP2, MVT::f16, Promote); 1482 setOperationAction(ISD::FLOG, MVT::f16, Promote); 1483 setOperationAction(ISD::FLOG10, MVT::f16, Promote); 1484 setOperationAction(ISD::FLOG2, MVT::f16, Promote); 1485 1486 setOperationAction(ISD::FROUND, MVT::f16, Legal); 1487 } 1488 1489 if (Subtarget->hasNEON()) { 1490 // vmin and vmax aren't available in a scalar form, so we can use 1491 // a NEON instruction with an undef lane instead. This has a performance 1492 // penalty on some cores, so we don't do this unless we have been 1493 // asked to by the core tuning model. 
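// For example, a scalar f32 fminimum can be selected as a d-register
// "vmin.f32 dD, dN, dM" where only lane 0 carries meaningful data and the
// other lane is simply left undef.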
1494 if (Subtarget->useNEONForSinglePrecisionFP()) {
1495 setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
1496 setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
1497 setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
1498 setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
1499 }
1500 setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal);
1501 setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal);
1502 setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
1503 setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal);
1504
1505 if (Subtarget->hasFullFP16()) {
1506 setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal);
1507 setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal);
1508 setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal);
1509 setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal);
1510
1511 setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal);
1512 setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal);
1513 setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal);
1514 setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal);
1515 }
1516 }
1517
1518 // We have target-specific dag combine patterns for the following nodes:
1519 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine
1520 setTargetDAGCombine(ISD::ADD);
1521 setTargetDAGCombine(ISD::SUB);
1522 setTargetDAGCombine(ISD::MUL);
1523 setTargetDAGCombine(ISD::AND);
1524 setTargetDAGCombine(ISD::OR);
1525 setTargetDAGCombine(ISD::XOR);
1526
1527 if (Subtarget->hasMVEIntegerOps())
1528 setTargetDAGCombine(ISD::VSELECT);
1529
1530 if (Subtarget->hasV6Ops())
1531 setTargetDAGCombine(ISD::SRL);
1532 if (Subtarget->isThumb1Only())
1533 setTargetDAGCombine(ISD::SHL);
1534
1535 setStackPointerRegisterToSaveRestore(ARM::SP);
1536
1537 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1538 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
1539 setSchedulingPreference(Sched::RegPressure);
1540 else
1541 setSchedulingPreference(Sched::Hybrid);
1542
1543 //// temporary - rewrite interface to use type
1544 MaxStoresPerMemset = 8;
1545 MaxStoresPerMemsetOptSize = 4;
1546 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1547 MaxStoresPerMemcpyOptSize = 2;
1548 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1549 MaxStoresPerMemmoveOptSize = 2;
1550
1551 // On ARM arguments smaller than 4 bytes are extended, so all arguments
1552 // are at least 4 bytes aligned.
1553 setMinStackArgumentAlignment(Align(4));
1554
1555 // Prefer likely predicted branches to selects on out-of-order cores.
1556 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1557
1558 setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment()));
1559
1560 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
1561
1562 if (Subtarget->isThumb() || Subtarget->isThumb2())
1563 setTargetDAGCombine(ISD::ABS);
1564 }
1565
1566 bool ARMTargetLowering::useSoftFloat() const {
1567 return Subtarget->useSoftFloat();
1568 }
1569
1570 // FIXME: It might make sense to define the representative register class as the
1571 // nearest super-register that has a non-null superset. For example, DPR_VFP2 is
1572 // a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
1573 // SPR's representative would be DPR_VFP2. This should work well if register
1574 // pressure tracking were modified such that a register use would increment the
1575 // pressure of the register class's representative and all of its super
1576 // classes' representatives transitively.
We have not implemented this because 1577 // of the difficulty prior to coalescing of modeling operand register classes 1578 // due to the common occurrence of cross class copies and subregister insertions 1579 // and extractions. 1580 std::pair<const TargetRegisterClass *, uint8_t> 1581 ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, 1582 MVT VT) const { 1583 const TargetRegisterClass *RRC = nullptr; 1584 uint8_t Cost = 1; 1585 switch (VT.SimpleTy) { 1586 default: 1587 return TargetLowering::findRepresentativeClass(TRI, VT); 1588 // Use DPR as representative register class for all floating point 1589 // and vector types. Since there are 32 SPR registers and 32 DPR registers so 1590 // the cost is 1 for both f32 and f64. 1591 case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16: 1592 case MVT::v2i32: case MVT::v1i64: case MVT::v2f32: 1593 RRC = &ARM::DPRRegClass; 1594 // When NEON is used for SP, only half of the register file is available 1595 // because operations that define both SP and DP results will be constrained 1596 // to the VFP2 class (D0-D15). We currently model this constraint prior to 1597 // coalescing by double-counting the SP regs. See the FIXME above. 1598 if (Subtarget->useNEONForSinglePrecisionFP()) 1599 Cost = 2; 1600 break; 1601 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: 1602 case MVT::v4f32: case MVT::v2f64: 1603 RRC = &ARM::DPRRegClass; 1604 Cost = 2; 1605 break; 1606 case MVT::v4i64: 1607 RRC = &ARM::DPRRegClass; 1608 Cost = 4; 1609 break; 1610 case MVT::v8i64: 1611 RRC = &ARM::DPRRegClass; 1612 Cost = 8; 1613 break; 1614 } 1615 return std::make_pair(RRC, Cost); 1616 } 1617 1618 const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { 1619 #define MAKE_CASE(V) \ 1620 case V: \ 1621 return #V; 1622 switch ((ARMISD::NodeType)Opcode) { 1623 case ARMISD::FIRST_NUMBER: 1624 break; 1625 MAKE_CASE(ARMISD::Wrapper) 1626 MAKE_CASE(ARMISD::WrapperPIC) 1627 MAKE_CASE(ARMISD::WrapperJT) 1628 MAKE_CASE(ARMISD::COPY_STRUCT_BYVAL) 1629 MAKE_CASE(ARMISD::CALL) 1630 MAKE_CASE(ARMISD::CALL_PRED) 1631 MAKE_CASE(ARMISD::CALL_NOLINK) 1632 MAKE_CASE(ARMISD::tSECALL) 1633 MAKE_CASE(ARMISD::BRCOND) 1634 MAKE_CASE(ARMISD::BR_JT) 1635 MAKE_CASE(ARMISD::BR2_JT) 1636 MAKE_CASE(ARMISD::RET_FLAG) 1637 MAKE_CASE(ARMISD::SERET_FLAG) 1638 MAKE_CASE(ARMISD::INTRET_FLAG) 1639 MAKE_CASE(ARMISD::PIC_ADD) 1640 MAKE_CASE(ARMISD::CMP) 1641 MAKE_CASE(ARMISD::CMN) 1642 MAKE_CASE(ARMISD::CMPZ) 1643 MAKE_CASE(ARMISD::CMPFP) 1644 MAKE_CASE(ARMISD::CMPFPE) 1645 MAKE_CASE(ARMISD::CMPFPw0) 1646 MAKE_CASE(ARMISD::CMPFPEw0) 1647 MAKE_CASE(ARMISD::BCC_i64) 1648 MAKE_CASE(ARMISD::FMSTAT) 1649 MAKE_CASE(ARMISD::CMOV) 1650 MAKE_CASE(ARMISD::SUBS) 1651 MAKE_CASE(ARMISD::SSAT) 1652 MAKE_CASE(ARMISD::USAT) 1653 MAKE_CASE(ARMISD::ASRL) 1654 MAKE_CASE(ARMISD::LSRL) 1655 MAKE_CASE(ARMISD::LSLL) 1656 MAKE_CASE(ARMISD::SRL_FLAG) 1657 MAKE_CASE(ARMISD::SRA_FLAG) 1658 MAKE_CASE(ARMISD::RRX) 1659 MAKE_CASE(ARMISD::ADDC) 1660 MAKE_CASE(ARMISD::ADDE) 1661 MAKE_CASE(ARMISD::SUBC) 1662 MAKE_CASE(ARMISD::SUBE) 1663 MAKE_CASE(ARMISD::LSLS) 1664 MAKE_CASE(ARMISD::VMOVRRD) 1665 MAKE_CASE(ARMISD::VMOVDRR) 1666 MAKE_CASE(ARMISD::VMOVhr) 1667 MAKE_CASE(ARMISD::VMOVrh) 1668 MAKE_CASE(ARMISD::VMOVSR) 1669 MAKE_CASE(ARMISD::EH_SJLJ_SETJMP) 1670 MAKE_CASE(ARMISD::EH_SJLJ_LONGJMP) 1671 MAKE_CASE(ARMISD::EH_SJLJ_SETUP_DISPATCH) 1672 MAKE_CASE(ARMISD::TC_RETURN) 1673 MAKE_CASE(ARMISD::THREAD_POINTER) 1674 MAKE_CASE(ARMISD::DYN_ALLOC) 1675 
MAKE_CASE(ARMISD::MEMBARRIER_MCR) 1676 MAKE_CASE(ARMISD::PRELOAD) 1677 MAKE_CASE(ARMISD::LDRD) 1678 MAKE_CASE(ARMISD::STRD) 1679 MAKE_CASE(ARMISD::WIN__CHKSTK) 1680 MAKE_CASE(ARMISD::WIN__DBZCHK) 1681 MAKE_CASE(ARMISD::PREDICATE_CAST) 1682 MAKE_CASE(ARMISD::VECTOR_REG_CAST) 1683 MAKE_CASE(ARMISD::MVESEXT) 1684 MAKE_CASE(ARMISD::MVEZEXT) 1685 MAKE_CASE(ARMISD::MVETRUNC) 1686 MAKE_CASE(ARMISD::VCMP) 1687 MAKE_CASE(ARMISD::VCMPZ) 1688 MAKE_CASE(ARMISD::VTST) 1689 MAKE_CASE(ARMISD::VSHLs) 1690 MAKE_CASE(ARMISD::VSHLu) 1691 MAKE_CASE(ARMISD::VSHLIMM) 1692 MAKE_CASE(ARMISD::VSHRsIMM) 1693 MAKE_CASE(ARMISD::VSHRuIMM) 1694 MAKE_CASE(ARMISD::VRSHRsIMM) 1695 MAKE_CASE(ARMISD::VRSHRuIMM) 1696 MAKE_CASE(ARMISD::VRSHRNIMM) 1697 MAKE_CASE(ARMISD::VQSHLsIMM) 1698 MAKE_CASE(ARMISD::VQSHLuIMM) 1699 MAKE_CASE(ARMISD::VQSHLsuIMM) 1700 MAKE_CASE(ARMISD::VQSHRNsIMM) 1701 MAKE_CASE(ARMISD::VQSHRNuIMM) 1702 MAKE_CASE(ARMISD::VQSHRNsuIMM) 1703 MAKE_CASE(ARMISD::VQRSHRNsIMM) 1704 MAKE_CASE(ARMISD::VQRSHRNuIMM) 1705 MAKE_CASE(ARMISD::VQRSHRNsuIMM) 1706 MAKE_CASE(ARMISD::VSLIIMM) 1707 MAKE_CASE(ARMISD::VSRIIMM) 1708 MAKE_CASE(ARMISD::VGETLANEu) 1709 MAKE_CASE(ARMISD::VGETLANEs) 1710 MAKE_CASE(ARMISD::VMOVIMM) 1711 MAKE_CASE(ARMISD::VMVNIMM) 1712 MAKE_CASE(ARMISD::VMOVFPIMM) 1713 MAKE_CASE(ARMISD::VDUP) 1714 MAKE_CASE(ARMISD::VDUPLANE) 1715 MAKE_CASE(ARMISD::VEXT) 1716 MAKE_CASE(ARMISD::VREV64) 1717 MAKE_CASE(ARMISD::VREV32) 1718 MAKE_CASE(ARMISD::VREV16) 1719 MAKE_CASE(ARMISD::VZIP) 1720 MAKE_CASE(ARMISD::VUZP) 1721 MAKE_CASE(ARMISD::VTRN) 1722 MAKE_CASE(ARMISD::VTBL1) 1723 MAKE_CASE(ARMISD::VTBL2) 1724 MAKE_CASE(ARMISD::VMOVN) 1725 MAKE_CASE(ARMISD::VQMOVNs) 1726 MAKE_CASE(ARMISD::VQMOVNu) 1727 MAKE_CASE(ARMISD::VCVTN) 1728 MAKE_CASE(ARMISD::VCVTL) 1729 MAKE_CASE(ARMISD::VIDUP) 1730 MAKE_CASE(ARMISD::VMULLs) 1731 MAKE_CASE(ARMISD::VMULLu) 1732 MAKE_CASE(ARMISD::VQDMULH) 1733 MAKE_CASE(ARMISD::VADDVs) 1734 MAKE_CASE(ARMISD::VADDVu) 1735 MAKE_CASE(ARMISD::VADDVps) 1736 MAKE_CASE(ARMISD::VADDVpu) 1737 MAKE_CASE(ARMISD::VADDLVs) 1738 MAKE_CASE(ARMISD::VADDLVu) 1739 MAKE_CASE(ARMISD::VADDLVAs) 1740 MAKE_CASE(ARMISD::VADDLVAu) 1741 MAKE_CASE(ARMISD::VADDLVps) 1742 MAKE_CASE(ARMISD::VADDLVpu) 1743 MAKE_CASE(ARMISD::VADDLVAps) 1744 MAKE_CASE(ARMISD::VADDLVApu) 1745 MAKE_CASE(ARMISD::VMLAVs) 1746 MAKE_CASE(ARMISD::VMLAVu) 1747 MAKE_CASE(ARMISD::VMLAVps) 1748 MAKE_CASE(ARMISD::VMLAVpu) 1749 MAKE_CASE(ARMISD::VMLALVs) 1750 MAKE_CASE(ARMISD::VMLALVu) 1751 MAKE_CASE(ARMISD::VMLALVps) 1752 MAKE_CASE(ARMISD::VMLALVpu) 1753 MAKE_CASE(ARMISD::VMLALVAs) 1754 MAKE_CASE(ARMISD::VMLALVAu) 1755 MAKE_CASE(ARMISD::VMLALVAps) 1756 MAKE_CASE(ARMISD::VMLALVApu) 1757 MAKE_CASE(ARMISD::VMINVu) 1758 MAKE_CASE(ARMISD::VMINVs) 1759 MAKE_CASE(ARMISD::VMAXVu) 1760 MAKE_CASE(ARMISD::VMAXVs) 1761 MAKE_CASE(ARMISD::UMAAL) 1762 MAKE_CASE(ARMISD::UMLAL) 1763 MAKE_CASE(ARMISD::SMLAL) 1764 MAKE_CASE(ARMISD::SMLALBB) 1765 MAKE_CASE(ARMISD::SMLALBT) 1766 MAKE_CASE(ARMISD::SMLALTB) 1767 MAKE_CASE(ARMISD::SMLALTT) 1768 MAKE_CASE(ARMISD::SMULWB) 1769 MAKE_CASE(ARMISD::SMULWT) 1770 MAKE_CASE(ARMISD::SMLALD) 1771 MAKE_CASE(ARMISD::SMLALDX) 1772 MAKE_CASE(ARMISD::SMLSLD) 1773 MAKE_CASE(ARMISD::SMLSLDX) 1774 MAKE_CASE(ARMISD::SMMLAR) 1775 MAKE_CASE(ARMISD::SMMLSR) 1776 MAKE_CASE(ARMISD::QADD16b) 1777 MAKE_CASE(ARMISD::QSUB16b) 1778 MAKE_CASE(ARMISD::QADD8b) 1779 MAKE_CASE(ARMISD::QSUB8b) 1780 MAKE_CASE(ARMISD::UQADD16b) 1781 MAKE_CASE(ARMISD::UQSUB16b) 1782 MAKE_CASE(ARMISD::UQADD8b) 1783 MAKE_CASE(ARMISD::UQSUB8b) 1784 MAKE_CASE(ARMISD::BUILD_VECTOR) 1785 
MAKE_CASE(ARMISD::BFI) 1786 MAKE_CASE(ARMISD::VORRIMM) 1787 MAKE_CASE(ARMISD::VBICIMM) 1788 MAKE_CASE(ARMISD::VBSP) 1789 MAKE_CASE(ARMISD::MEMCPY) 1790 MAKE_CASE(ARMISD::VLD1DUP) 1791 MAKE_CASE(ARMISD::VLD2DUP) 1792 MAKE_CASE(ARMISD::VLD3DUP) 1793 MAKE_CASE(ARMISD::VLD4DUP) 1794 MAKE_CASE(ARMISD::VLD1_UPD) 1795 MAKE_CASE(ARMISD::VLD2_UPD) 1796 MAKE_CASE(ARMISD::VLD3_UPD) 1797 MAKE_CASE(ARMISD::VLD4_UPD) 1798 MAKE_CASE(ARMISD::VLD1x2_UPD) 1799 MAKE_CASE(ARMISD::VLD1x3_UPD) 1800 MAKE_CASE(ARMISD::VLD1x4_UPD) 1801 MAKE_CASE(ARMISD::VLD2LN_UPD) 1802 MAKE_CASE(ARMISD::VLD3LN_UPD) 1803 MAKE_CASE(ARMISD::VLD4LN_UPD) 1804 MAKE_CASE(ARMISD::VLD1DUP_UPD) 1805 MAKE_CASE(ARMISD::VLD2DUP_UPD) 1806 MAKE_CASE(ARMISD::VLD3DUP_UPD) 1807 MAKE_CASE(ARMISD::VLD4DUP_UPD) 1808 MAKE_CASE(ARMISD::VST1_UPD) 1809 MAKE_CASE(ARMISD::VST2_UPD) 1810 MAKE_CASE(ARMISD::VST3_UPD) 1811 MAKE_CASE(ARMISD::VST4_UPD) 1812 MAKE_CASE(ARMISD::VST1x2_UPD) 1813 MAKE_CASE(ARMISD::VST1x3_UPD) 1814 MAKE_CASE(ARMISD::VST1x4_UPD) 1815 MAKE_CASE(ARMISD::VST2LN_UPD) 1816 MAKE_CASE(ARMISD::VST3LN_UPD) 1817 MAKE_CASE(ARMISD::VST4LN_UPD) 1818 MAKE_CASE(ARMISD::WLS) 1819 MAKE_CASE(ARMISD::WLSSETUP) 1820 MAKE_CASE(ARMISD::LE) 1821 MAKE_CASE(ARMISD::LOOP_DEC) 1822 MAKE_CASE(ARMISD::CSINV) 1823 MAKE_CASE(ARMISD::CSNEG) 1824 MAKE_CASE(ARMISD::CSINC) 1825 MAKE_CASE(ARMISD::MEMCPYLOOP) 1826 MAKE_CASE(ARMISD::MEMSETLOOP) 1827 #undef MAKE_CASE 1828 } 1829 return nullptr; 1830 } 1831 1832 EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, 1833 EVT VT) const { 1834 if (!VT.isVector()) 1835 return getPointerTy(DL); 1836 1837 // MVE has a predicate register. 1838 if ((Subtarget->hasMVEIntegerOps() && 1839 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8)) || 1840 (Subtarget->hasMVEFloatOps() && (VT == MVT::v4f32 || VT == MVT::v8f16))) 1841 return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount()); 1842 return VT.changeVectorElementTypeToInteger(); 1843 } 1844 1845 /// getRegClassFor - Return the register class that should be used for the 1846 /// specified value type. 1847 const TargetRegisterClass * 1848 ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { 1849 (void)isDivergent; 1850 // Map v4i64 to QQ registers but do not make the type legal. Similarly map 1851 // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to 1852 // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive 1853 // MVE Q registers. 1854 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) { 1855 if (VT == MVT::v4i64) 1856 return &ARM::QQPRRegClass; 1857 if (VT == MVT::v8i64) 1858 return &ARM::QQQQPRRegClass; 1859 } 1860 return TargetLowering::getRegClassFor(VT); 1861 } 1862 1863 // memcpy, and other memory intrinsics, typically tries to use LDM/STM if the 1864 // source/dest is aligned and the copy size is large enough. We therefore want 1865 // to align such objects passed to memory intrinsics. 1866 bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, 1867 unsigned &PrefAlign) const { 1868 if (!isa<MemIntrinsic>(CI)) 1869 return false; 1870 MinSize = 8; 1871 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1 1872 // cycle faster than 4-byte aligned LDM. 1873 PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4); 1874 return true; 1875 } 1876 1877 // Create a fast isel object. 
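// FastISel is mainly used for -O0 style codegen; anything it cannot handle
// falls back to the regular SelectionDAG path.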
1878 FastISel *
1879 ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1880 const TargetLibraryInfo *libInfo) const {
1881 return ARM::createFastISel(funcInfo, libInfo);
1882 }
1883
1884 Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
1885 unsigned NumVals = N->getNumValues();
1886 if (!NumVals)
1887 return Sched::RegPressure;
1888
1889 for (unsigned i = 0; i != NumVals; ++i) {
1890 EVT VT = N->getValueType(i);
1891 if (VT == MVT::Glue || VT == MVT::Other)
1892 continue;
1893 if (VT.isFloatingPoint() || VT.isVector())
1894 return Sched::ILP;
1895 }
1896
1897 if (!N->isMachineOpcode())
1898 return Sched::RegPressure;
1899
1900 // Loads are scheduled for latency even if the instruction itinerary
1901 // is not available.
1902 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1903 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1904
1905 if (MCID.getNumDefs() == 0)
1906 return Sched::RegPressure;
1907 if (!Itins->isEmpty() &&
1908 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2)
1909 return Sched::ILP;
1910
1911 return Sched::RegPressure;
1912 }
1913
1914 //===----------------------------------------------------------------------===//
1915 // Lowering Code
1916 //===----------------------------------------------------------------------===//
1917
1918 static bool isSRL16(const SDValue &Op) {
1919 if (Op.getOpcode() != ISD::SRL)
1920 return false;
1921 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1922 return Const->getZExtValue() == 16;
1923 return false;
1924 }
1925
1926 static bool isSRA16(const SDValue &Op) {
1927 if (Op.getOpcode() != ISD::SRA)
1928 return false;
1929 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1930 return Const->getZExtValue() == 16;
1931 return false;
1932 }
1933
1934 static bool isSHL16(const SDValue &Op) {
1935 if (Op.getOpcode() != ISD::SHL)
1936 return false;
1937 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1938 return Const->getZExtValue() == 16;
1939 return false;
1940 }
1941
1942 // Check for a signed 16-bit value. We special case SRA because it makes it
1943 // simpler when also looking for SRAs that aren't sign extending a
1944 // smaller value. Without the check, we'd need to take extra care with
1945 // checking order for some operations.
1946 static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
1947 if (isSRA16(Op))
1948 return isSHL16(Op.getOperand(0));
1949 return DAG.ComputeNumSignBits(Op) == 17;
1950 }
1951
1952 /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
1953 static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
1954 switch (CC) {
1955 default: llvm_unreachable("Unknown condition code!");
1956 case ISD::SETNE: return ARMCC::NE;
1957 case ISD::SETEQ: return ARMCC::EQ;
1958 case ISD::SETGT: return ARMCC::GT;
1959 case ISD::SETGE: return ARMCC::GE;
1960 case ISD::SETLT: return ARMCC::LT;
1961 case ISD::SETLE: return ARMCC::LE;
1962 case ISD::SETUGT: return ARMCC::HI;
1963 case ISD::SETUGE: return ARMCC::HS;
1964 case ISD::SETULT: return ARMCC::LO;
1965 case ISD::SETULE: return ARMCC::LS;
1966 }
1967 }
1968
1969 /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
1970 static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, 1971 ARMCC::CondCodes &CondCode2) { 1972 CondCode2 = ARMCC::AL; 1973 switch (CC) { 1974 default: llvm_unreachable("Unknown FP condition!"); 1975 case ISD::SETEQ: 1976 case ISD::SETOEQ: CondCode = ARMCC::EQ; break; 1977 case ISD::SETGT: 1978 case ISD::SETOGT: CondCode = ARMCC::GT; break; 1979 case ISD::SETGE: 1980 case ISD::SETOGE: CondCode = ARMCC::GE; break; 1981 case ISD::SETOLT: CondCode = ARMCC::MI; break; 1982 case ISD::SETOLE: CondCode = ARMCC::LS; break; 1983 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break; 1984 case ISD::SETO: CondCode = ARMCC::VC; break; 1985 case ISD::SETUO: CondCode = ARMCC::VS; break; 1986 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break; 1987 case ISD::SETUGT: CondCode = ARMCC::HI; break; 1988 case ISD::SETUGE: CondCode = ARMCC::PL; break; 1989 case ISD::SETLT: 1990 case ISD::SETULT: CondCode = ARMCC::LT; break; 1991 case ISD::SETLE: 1992 case ISD::SETULE: CondCode = ARMCC::LE; break; 1993 case ISD::SETNE: 1994 case ISD::SETUNE: CondCode = ARMCC::NE; break; 1995 } 1996 } 1997 1998 //===----------------------------------------------------------------------===// 1999 // Calling Convention Implementation 2000 //===----------------------------------------------------------------------===// 2001 2002 /// getEffectiveCallingConv - Get the effective calling convention, taking into 2003 /// account presence of floating point hardware and calling convention 2004 /// limitations, such as support for variadic functions. 2005 CallingConv::ID 2006 ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, 2007 bool isVarArg) const { 2008 switch (CC) { 2009 default: 2010 report_fatal_error("Unsupported calling convention"); 2011 case CallingConv::ARM_AAPCS: 2012 case CallingConv::ARM_APCS: 2013 case CallingConv::GHC: 2014 case CallingConv::CFGuard_Check: 2015 return CC; 2016 case CallingConv::PreserveMost: 2017 return CallingConv::PreserveMost; 2018 case CallingConv::ARM_AAPCS_VFP: 2019 case CallingConv::Swift: 2020 case CallingConv::SwiftTail: 2021 return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP; 2022 case CallingConv::C: 2023 case CallingConv::Tail: 2024 if (!Subtarget->isAAPCS_ABI()) 2025 return CallingConv::ARM_APCS; 2026 else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && 2027 getTargetMachine().Options.FloatABIType == FloatABI::Hard && 2028 !isVarArg) 2029 return CallingConv::ARM_AAPCS_VFP; 2030 else 2031 return CallingConv::ARM_AAPCS; 2032 case CallingConv::Fast: 2033 case CallingConv::CXX_FAST_TLS: 2034 if (!Subtarget->isAAPCS_ABI()) { 2035 if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg) 2036 return CallingConv::Fast; 2037 return CallingConv::ARM_APCS; 2038 } else if (Subtarget->hasVFP2Base() && 2039 !Subtarget->isThumb1Only() && !isVarArg) 2040 return CallingConv::ARM_AAPCS_VFP; 2041 else 2042 return CallingConv::ARM_AAPCS; 2043 } 2044 } 2045 2046 CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC, 2047 bool isVarArg) const { 2048 return CCAssignFnForNode(CC, false, isVarArg); 2049 } 2050 2051 CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC, 2052 bool isVarArg) const { 2053 return CCAssignFnForNode(CC, true, isVarArg); 2054 } 2055 2056 /// CCAssignFnForNode - Selects the correct CCAssignFn for the given 2057 /// CallingConvention. 
2058 CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, 2059 bool Return, 2060 bool isVarArg) const { 2061 switch (getEffectiveCallingConv(CC, isVarArg)) { 2062 default: 2063 report_fatal_error("Unsupported calling convention"); 2064 case CallingConv::ARM_APCS: 2065 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS); 2066 case CallingConv::ARM_AAPCS: 2067 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); 2068 case CallingConv::ARM_AAPCS_VFP: 2069 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP); 2070 case CallingConv::Fast: 2071 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS); 2072 case CallingConv::GHC: 2073 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC); 2074 case CallingConv::PreserveMost: 2075 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); 2076 case CallingConv::CFGuard_Check: 2077 return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check); 2078 } 2079 } 2080 2081 SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG, 2082 MVT LocVT, MVT ValVT, SDValue Val) const { 2083 Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()), 2084 Val); 2085 if (Subtarget->hasFullFP16()) { 2086 Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val); 2087 } else { 2088 Val = DAG.getNode(ISD::TRUNCATE, dl, 2089 MVT::getIntegerVT(ValVT.getSizeInBits()), Val); 2090 Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val); 2091 } 2092 return Val; 2093 } 2094 2095 SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG, 2096 MVT LocVT, MVT ValVT, 2097 SDValue Val) const { 2098 if (Subtarget->hasFullFP16()) { 2099 Val = DAG.getNode(ARMISD::VMOVrh, dl, 2100 MVT::getIntegerVT(LocVT.getSizeInBits()), Val); 2101 } else { 2102 Val = DAG.getNode(ISD::BITCAST, dl, 2103 MVT::getIntegerVT(ValVT.getSizeInBits()), Val); 2104 Val = DAG.getNode(ISD::ZERO_EXTEND, dl, 2105 MVT::getIntegerVT(LocVT.getSizeInBits()), Val); 2106 } 2107 return DAG.getNode(ISD::BITCAST, dl, LocVT, Val); 2108 } 2109 2110 /// LowerCallResult - Lower the result values of a call into the 2111 /// appropriate copies out of appropriate physical registers. 2112 SDValue ARMTargetLowering::LowerCallResult( 2113 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, 2114 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 2115 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, 2116 SDValue ThisVal) const { 2117 // Assign locations to each value returned by this call. 2118 SmallVector<CCValAssign, 16> RVLocs; 2119 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 2120 *DAG.getContext()); 2121 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg)); 2122 2123 // Copy all of the result registers out of their specified physreg. 2124 for (unsigned i = 0; i != RVLocs.size(); ++i) { 2125 CCValAssign VA = RVLocs[i]; 2126 2127 // Pass 'this' value directly from the argument to return value, to avoid 2128 // reg unit interference 2129 if (i == 0 && isThisReturn) { 2130 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 && 2131 "unexpected return calling convention register assignment"); 2132 InVals.push_back(ThisVal); 2133 continue; 2134 } 2135 2136 SDValue Val; 2137 if (VA.needsCustom() && 2138 (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) { 2139 // Handle f64 or half of a v2f64. 
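// The value arrives as two consecutive i32 locations (e.g. r0/r1 under the
// soft-float ABI); the halves are glued back together with VMOVDRR, and for
// v2f64 the same sequence is repeated for the second double.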
2140 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, 2141 InFlag); 2142 Chain = Lo.getValue(1); 2143 InFlag = Lo.getValue(2); 2144 VA = RVLocs[++i]; // skip ahead to next loc 2145 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, 2146 InFlag); 2147 Chain = Hi.getValue(1); 2148 InFlag = Hi.getValue(2); 2149 if (!Subtarget->isLittle()) 2150 std::swap (Lo, Hi); 2151 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 2152 2153 if (VA.getLocVT() == MVT::v2f64) { 2154 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); 2155 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, 2156 DAG.getConstant(0, dl, MVT::i32)); 2157 2158 VA = RVLocs[++i]; // skip ahead to next loc 2159 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); 2160 Chain = Lo.getValue(1); 2161 InFlag = Lo.getValue(2); 2162 VA = RVLocs[++i]; // skip ahead to next loc 2163 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); 2164 Chain = Hi.getValue(1); 2165 InFlag = Hi.getValue(2); 2166 if (!Subtarget->isLittle()) 2167 std::swap (Lo, Hi); 2168 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 2169 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, 2170 DAG.getConstant(1, dl, MVT::i32)); 2171 } 2172 } else { 2173 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), 2174 InFlag); 2175 Chain = Val.getValue(1); 2176 InFlag = Val.getValue(2); 2177 } 2178 2179 switch (VA.getLocInfo()) { 2180 default: llvm_unreachable("Unknown loc info!"); 2181 case CCValAssign::Full: break; 2182 case CCValAssign::BCvt: 2183 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val); 2184 break; 2185 } 2186 2187 // f16 arguments have their size extended to 4 bytes and passed as if they 2188 // had been copied to the LSBs of a 32-bit register. 
2189 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI) 2190 if (VA.needsCustom() && 2191 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) 2192 Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val); 2193 2194 InVals.push_back(Val); 2195 } 2196 2197 return Chain; 2198 } 2199 2200 std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg( 2201 const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr, 2202 bool IsTailCall, int SPDiff) const { 2203 SDValue DstAddr; 2204 MachinePointerInfo DstInfo; 2205 int32_t Offset = VA.getLocMemOffset(); 2206 MachineFunction &MF = DAG.getMachineFunction(); 2207 2208 if (IsTailCall) { 2209 Offset += SPDiff; 2210 auto PtrVT = getPointerTy(DAG.getDataLayout()); 2211 int Size = VA.getLocVT().getFixedSizeInBits() / 8; 2212 int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true); 2213 DstAddr = DAG.getFrameIndex(FI, PtrVT); 2214 DstInfo = 2215 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); 2216 } else { 2217 SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl); 2218 DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), 2219 StackPtr, PtrOff); 2220 DstInfo = 2221 MachinePointerInfo::getStack(DAG.getMachineFunction(), Offset); 2222 } 2223 2224 return std::make_pair(DstAddr, DstInfo); 2225 } 2226 2227 void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, 2228 SDValue Chain, SDValue &Arg, 2229 RegsToPassVector &RegsToPass, 2230 CCValAssign &VA, CCValAssign &NextVA, 2231 SDValue &StackPtr, 2232 SmallVectorImpl<SDValue> &MemOpChains, 2233 bool IsTailCall, 2234 int SPDiff) const { 2235 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, 2236 DAG.getVTList(MVT::i32, MVT::i32), Arg); 2237 unsigned id = Subtarget->isLittle() ? 0 : 1; 2238 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id))); 2239 2240 if (NextVA.isRegLoc()) 2241 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id))); 2242 else { 2243 assert(NextVA.isMemLoc()); 2244 if (!StackPtr.getNode()) 2245 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, 2246 getPointerTy(DAG.getDataLayout())); 2247 2248 SDValue DstAddr; 2249 MachinePointerInfo DstInfo; 2250 std::tie(DstAddr, DstInfo) = 2251 computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff); 2252 MemOpChains.push_back( 2253 DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo)); 2254 } 2255 } 2256 2257 static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) { 2258 return (CC == CallingConv::Fast && GuaranteeTailCalls) || 2259 CC == CallingConv::Tail || CC == CallingConv::SwiftTail; 2260 } 2261 2262 /// LowerCall - Lowering a call into a callseq_start <- 2263 /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter 2264 /// nodes. 
2265 SDValue
2266 ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2267 SmallVectorImpl<SDValue> &InVals) const {
2268 SelectionDAG &DAG = CLI.DAG;
2269 SDLoc &dl = CLI.DL;
2270 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2271 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2272 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2273 SDValue Chain = CLI.Chain;
2274 SDValue Callee = CLI.Callee;
2275 bool &isTailCall = CLI.IsTailCall;
2276 CallingConv::ID CallConv = CLI.CallConv;
2277 bool doesNotRet = CLI.DoesNotReturn;
2278 bool isVarArg = CLI.IsVarArg;
2279
2280 MachineFunction &MF = DAG.getMachineFunction();
2281 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2282 MachineFunction::CallSiteInfo CSInfo;
2283 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2284 bool isThisReturn = false;
2285 bool isCmseNSCall = false;
2286 bool isSibCall = false;
2287 bool PreferIndirect = false;
2288
2289 // Determine whether this is a non-secure function call.
2290 if (CLI.CB && CLI.CB->getAttributes().hasFnAttribute("cmse_nonsecure_call"))
2291 isCmseNSCall = true;
2292
2293 // Disable tail calls if they're not supported.
2294 if (!Subtarget->supportsTailCall())
2295 isTailCall = false;
2296
2297 // For both the non-secure calls and the returns from a CMSE entry function,
2298 // the function needs to do some extra work after the call, or before the
2299 // return, respectively, thus it cannot end with a tail call.
2300 if (isCmseNSCall || AFI->isCmseNSEntryFunction())
2301 isTailCall = false;
2302
2303 if (isa<GlobalAddressSDNode>(Callee)) {
2304 // If we're optimizing for minimum size and the function is called three or
2305 // more times in this block, we can improve codesize by calling indirectly
2306 // as BLXr has a 16-bit encoding.
2307 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2308 if (CLI.CB) {
2309 auto *BB = CLI.CB->getParent();
2310 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
2311 count_if(GV->users(), [&BB](const User *U) {
2312 return isa<Instruction>(U) &&
2313 cast<Instruction>(U)->getParent() == BB;
2314 }) > 2;
2315 }
2316 }
2317 if (isTailCall) {
2318 // Check if it's really possible to do a tail call.
2319 isTailCall = IsEligibleForTailCallOptimization(
2320 Callee, CallConv, isVarArg, isStructRet,
2321 MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG,
2322 PreferIndirect);
2323
2324 if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
2325 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
2326 isSibCall = true;
2327
2328 // We don't support GuaranteedTailCallOpt for ARM, only automatically
2329 // detected sibcalls.
2330 if (isTailCall)
2331 ++NumTailCalls;
2332 }
2333
2334 if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
2335 report_fatal_error("failed to perform tail call elimination on a call "
2336 "site marked musttail");
2337 // Analyze operands of the call, assigning locations to each operand.
2338 SmallVector<CCValAssign, 16> ArgLocs;
2339 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2340 *DAG.getContext());
2341 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
2342
2343 // Get a count of how many bytes are to be pushed on the stack.
2344 unsigned NumBytes = CCInfo.getNextStackOffset();
2345
2346 // SPDiff is the byte offset of the call's argument area from the callee's.
2347 // Stores to callee stack arguments will be placed in FixedStackSlots offset 2348 // by this amount for a tail call. In a sibling call it must be 0 because the 2349 // caller will deallocate the entire stack and the callee still expects its 2350 // arguments to begin at SP+0. Completely unused for non-tail calls. 2351 int SPDiff = 0; 2352 2353 if (isTailCall && !isSibCall) { 2354 auto FuncInfo = MF.getInfo<ARMFunctionInfo>(); 2355 unsigned NumReusableBytes = FuncInfo->getArgumentStackSize(); 2356 2357 // Since callee will pop argument stack as a tail call, we must keep the 2358 // popped size 16-byte aligned. 2359 Align StackAlign = DAG.getDataLayout().getStackAlignment(); 2360 NumBytes = alignTo(NumBytes, StackAlign); 2361 2362 // SPDiff will be negative if this tail call requires more space than we 2363 // would automatically have in our incoming argument space. Positive if we 2364 // can actually shrink the stack. 2365 SPDiff = NumReusableBytes - NumBytes; 2366 2367 // If this call requires more stack than we have available from 2368 // LowerFormalArguments, tell FrameLowering to reserve space for it. 2369 if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff) 2370 AFI->setArgRegsSaveSize(-SPDiff); 2371 } 2372 2373 if (isSibCall) { 2374 // For sibling tail calls, memory operands are available in our caller's stack. 2375 NumBytes = 0; 2376 } else { 2377 // Adjust the stack pointer for the new arguments... 2378 // These operations are automatically eliminated by the prolog/epilog pass 2379 Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl); 2380 } 2381 2382 SDValue StackPtr = 2383 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); 2384 2385 RegsToPassVector RegsToPass; 2386 SmallVector<SDValue, 8> MemOpChains; 2387 2388 // During a tail call, stores to the argument area must happen after all of 2389 // the function's incoming arguments have been loaded because they may alias. 2390 // This is done by folding in a TokenFactor from LowerFormalArguments, but 2391 // there's no point in doing so repeatedly so this tracks whether that's 2392 // happened yet. 2393 bool AfterFormalArgLoads = false; 2394 2395 // Walk the register/memloc assignments, inserting copies/loads. In the case 2396 // of tail call optimization, arguments are handled later. 2397 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); 2398 i != e; 2399 ++i, ++realArgIdx) { 2400 CCValAssign &VA = ArgLocs[i]; 2401 SDValue Arg = OutVals[realArgIdx]; 2402 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 2403 bool isByVal = Flags.isByVal(); 2404 2405 // Promote the value if needed. 2406 switch (VA.getLocInfo()) { 2407 default: llvm_unreachable("Unknown loc info!"); 2408 case CCValAssign::Full: break; 2409 case CCValAssign::SExt: 2410 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); 2411 break; 2412 case CCValAssign::ZExt: 2413 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); 2414 break; 2415 case CCValAssign::AExt: 2416 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); 2417 break; 2418 case CCValAssign::BCvt: 2419 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 2420 break; 2421 } 2422 2423 if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) { 2424 Chain = DAG.getStackArgumentTokenFactor(Chain); 2425 AfterFormalArgLoads = true; 2426 } 2427 2428 // f16 arguments have their size extended to 4 bytes and passed as if they 2429 // had been copied to the LSBs of a 32-bit register. 
2430 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI) 2431 if (VA.needsCustom() && 2432 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) { 2433 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg); 2434 } else { 2435 // f16 arguments could have been extended prior to argument lowering. 2436 // Mask them arguments if this is a CMSE nonsecure call. 2437 auto ArgVT = Outs[realArgIdx].ArgVT; 2438 if (isCmseNSCall && (ArgVT == MVT::f16)) { 2439 auto LocBits = VA.getLocVT().getSizeInBits(); 2440 auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits()); 2441 SDValue Mask = 2442 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits)); 2443 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg); 2444 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask); 2445 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 2446 } 2447 } 2448 2449 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces 2450 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) { 2451 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2452 DAG.getConstant(0, dl, MVT::i32)); 2453 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2454 DAG.getConstant(1, dl, MVT::i32)); 2455 2456 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i], 2457 StackPtr, MemOpChains, isTailCall, SPDiff); 2458 2459 VA = ArgLocs[++i]; // skip ahead to next loc 2460 if (VA.isRegLoc()) { 2461 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i], 2462 StackPtr, MemOpChains, isTailCall, SPDiff); 2463 } else { 2464 assert(VA.isMemLoc()); 2465 SDValue DstAddr; 2466 MachinePointerInfo DstInfo; 2467 std::tie(DstAddr, DstInfo) = 2468 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff); 2469 MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo)); 2470 } 2471 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) { 2472 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], 2473 StackPtr, MemOpChains, isTailCall, SPDiff); 2474 } else if (VA.isRegLoc()) { 2475 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() && 2476 Outs[0].VT == MVT::i32) { 2477 assert(VA.getLocVT() == MVT::i32 && 2478 "unexpected calling convention register assignment"); 2479 assert(!Ins.empty() && Ins[0].VT == MVT::i32 && 2480 "unexpected use of 'returned'"); 2481 isThisReturn = true; 2482 } 2483 const TargetOptions &Options = DAG.getTarget().Options; 2484 if (Options.EmitCallSiteInfo) 2485 CSInfo.emplace_back(VA.getLocReg(), i); 2486 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 2487 } else if (isByVal) { 2488 assert(VA.isMemLoc()); 2489 unsigned offset = 0; 2490 2491 // True if this byval aggregate will be split between registers 2492 // and memory. 
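// For instance, under AAPCS a 12-byte byval argument whose first register is
// r2 would occupy r2-r3 and have its remaining 4 bytes passed on the stack.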
2493 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
2494 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
2495
2496 if (CurByValIdx < ByValArgsCount) {
2497
2498 unsigned RegBegin, RegEnd;
2499 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
2500
2501 EVT PtrVT =
2502 DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
2503 unsigned int i, j;
2504 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
2505 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
2506 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
2507 SDValue Load =
2508 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
2509 DAG.InferPtrAlign(AddArg));
2510 MemOpChains.push_back(Load.getValue(1));
2511 RegsToPass.push_back(std::make_pair(j, Load));
2512 }
2513
2514 // If the parameter size exceeds the register area, the "offset" value
2515 // helps us calculate the stack slot for the remaining part properly.
2516 offset = RegEnd - RegBegin;
2517
2518 CCInfo.nextInRegsParam();
2519 }
2520
2521 if (Flags.getByValSize() > 4*offset) {
2522 auto PtrVT = getPointerTy(DAG.getDataLayout());
2523 SDValue Dst;
2524 MachinePointerInfo DstInfo;
2525 std::tie(Dst, DstInfo) =
2526 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2527 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
2528 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
2529 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
2530 MVT::i32);
2531 SDValue AlignNode =
2532 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2533
2534 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2535 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
2536 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
2537 Ops));
2538 }
2539 } else {
2540 assert(VA.isMemLoc());
2541 SDValue DstAddr;
2542 MachinePointerInfo DstInfo;
2543 std::tie(DstAddr, DstInfo) =
2544 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2545
2546 SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);
2547 MemOpChains.push_back(Store);
2548 }
2549 }
2550
2551 if (!MemOpChains.empty())
2552 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2553
2554 // Build a sequence of copy-to-reg nodes chained together with token chain
2555 // and flag operands which copy the outgoing args into the appropriate regs.
2556 SDValue InFlag;
2557 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2558 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2559 RegsToPass[i].second, InFlag);
2560 InFlag = Chain.getValue(1);
2561 }
2562
2563 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2564 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2565 // node so that legalize doesn't hack it.
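// (Target* nodes are opaque to the generic legalizer and DAG combiner, so
// whatever address sequence we build here is left alone.)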
2566 bool isDirect = false; 2567 2568 const TargetMachine &TM = getTargetMachine(); 2569 const Module *Mod = MF.getFunction().getParent(); 2570 const GlobalValue *GV = nullptr; 2571 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) 2572 GV = G->getGlobal(); 2573 bool isStub = 2574 !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO(); 2575 2576 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); 2577 bool isLocalARMFunc = false; 2578 auto PtrVt = getPointerTy(DAG.getDataLayout()); 2579 2580 if (Subtarget->genLongCalls()) { 2581 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) && 2582 "long-calls codegen is not position independent!"); 2583 // Handle a global address or an external symbol. If it's not one of 2584 // those, the target's already in a register, so we don't need to do 2585 // anything extra. 2586 if (isa<GlobalAddressSDNode>(Callee)) { 2587 // Create a constant pool entry for the callee address 2588 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2589 ARMConstantPoolValue *CPV = 2590 ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0); 2591 2592 // Get the address of the callee into a register 2593 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4)); 2594 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2595 Callee = DAG.getLoad( 2596 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2597 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2598 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) { 2599 const char *Sym = S->getSymbol(); 2600 2601 // Create a constant pool entry for the callee address 2602 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2603 ARMConstantPoolValue *CPV = 2604 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, 2605 ARMPCLabelIndex, 0); 2606 // Get the address of the callee into a register 2607 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4)); 2608 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2609 Callee = DAG.getLoad( 2610 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2611 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2612 } 2613 } else if (isa<GlobalAddressSDNode>(Callee)) { 2614 if (!PreferIndirect) { 2615 isDirect = true; 2616 bool isDef = GV->isStrongDefinitionForLinker(); 2617 2618 // ARM call to a local ARM function is predicable. 2619 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking); 2620 // tBX takes a register source operand. 
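// Pre-v5T Thumb has no BLX, so for MachO stubs the callee address is loaded
// from its non-lazy pointer and the call ends up going through a register.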
2621 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { 2622 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?"); 2623 Callee = DAG.getNode( 2624 ARMISD::WrapperPIC, dl, PtrVt, 2625 DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); 2626 Callee = DAG.getLoad( 2627 PtrVt, dl, DAG.getEntryNode(), Callee, 2628 MachinePointerInfo::getGOT(DAG.getMachineFunction()), MaybeAlign(), 2629 MachineMemOperand::MODereferenceable | 2630 MachineMemOperand::MOInvariant); 2631 } else if (Subtarget->isTargetCOFF()) { 2632 assert(Subtarget->isTargetWindows() && 2633 "Windows is the only supported COFF target"); 2634 unsigned TargetFlags = ARMII::MO_NO_FLAG; 2635 if (GV->hasDLLImportStorageClass()) 2636 TargetFlags = ARMII::MO_DLLIMPORT; 2637 else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) 2638 TargetFlags = ARMII::MO_COFFSTUB; 2639 Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0, 2640 TargetFlags); 2641 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB)) 2642 Callee = 2643 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), 2644 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee), 2645 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 2646 } else { 2647 Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0); 2648 } 2649 } 2650 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2651 isDirect = true; 2652 // tBX takes a register source operand. 2653 const char *Sym = S->getSymbol(); 2654 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { 2655 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2656 ARMConstantPoolValue *CPV = 2657 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, 2658 ARMPCLabelIndex, 4); 2659 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4)); 2660 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2661 Callee = DAG.getLoad( 2662 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2663 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2664 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 2665 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel); 2666 } else { 2667 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0); 2668 } 2669 } 2670 2671 if (isCmseNSCall) { 2672 assert(!isARMFunc && !isDirect && 2673 "Cannot handle call to ARM function or direct call"); 2674 if (NumBytes > 0) { 2675 DiagnosticInfoUnsupported Diag(DAG.getMachineFunction().getFunction(), 2676 "call to non-secure function would " 2677 "require passing arguments on stack", 2678 dl.getDebugLoc()); 2679 DAG.getContext()->diagnose(Diag); 2680 } 2681 if (isStructRet) { 2682 DiagnosticInfoUnsupported Diag( 2683 DAG.getMachineFunction().getFunction(), 2684 "call to non-secure function would return value through pointer", 2685 dl.getDebugLoc()); 2686 DAG.getContext()->diagnose(Diag); 2687 } 2688 } 2689 2690 // FIXME: handle tail calls differently. 
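// Pick the call opcode: tSECALL for CMSE non-secure calls, CALL_NOLINK when
// the return address must be set up manually (no usable BLX, or the
// "mov lr, pc; b" form below), CALL_PRED for predicable calls to local ARM
// functions, and plain CALL otherwise.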
2691 unsigned CallOpc; 2692 if (Subtarget->isThumb()) { 2693 if (isCmseNSCall) 2694 CallOpc = ARMISD::tSECALL; 2695 else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) 2696 CallOpc = ARMISD::CALL_NOLINK; 2697 else 2698 CallOpc = ARMISD::CALL; 2699 } else { 2700 if (!isDirect && !Subtarget->hasV5TOps()) 2701 CallOpc = ARMISD::CALL_NOLINK; 2702 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() && 2703 // Emit regular call when code size is the priority 2704 !Subtarget->hasMinSize()) 2705 // "mov lr, pc; b _foo" to avoid confusing the RSP 2706 CallOpc = ARMISD::CALL_NOLINK; 2707 else 2708 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL; 2709 } 2710 2711 // We don't usually want to end the call-sequence here because we would tidy 2712 // the frame up *after* the call, however in the ABI-changing tail-call case 2713 // we've carefully laid out the parameters so that when sp is reset they'll be 2714 // in the correct location. 2715 if (isTailCall && !isSibCall) { 2716 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), 2717 DAG.getIntPtrConstant(0, dl, true), InFlag, dl); 2718 InFlag = Chain.getValue(1); 2719 } 2720 2721 std::vector<SDValue> Ops; 2722 Ops.push_back(Chain); 2723 Ops.push_back(Callee); 2724 2725 if (isTailCall) { 2726 Ops.push_back(DAG.getTargetConstant(SPDiff, dl, MVT::i32)); 2727 } 2728 2729 // Add argument registers to the end of the list so that they are known live 2730 // into the call. 2731 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2732 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2733 RegsToPass[i].second.getValueType())); 2734 2735 // Add a register mask operand representing the call-preserved registers. 2736 if (!isTailCall) { 2737 const uint32_t *Mask; 2738 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); 2739 if (isThisReturn) { 2740 // For 'this' returns, use the R0-preserving mask if applicable 2741 Mask = ARI->getThisReturnPreservedMask(MF, CallConv); 2742 if (!Mask) { 2743 // Set isThisReturn to false if the calling convention is not one that 2744 // allows 'returned' to be modeled in this way, so LowerCallResult does 2745 // not try to pass 'this' straight through 2746 isThisReturn = false; 2747 Mask = ARI->getCallPreservedMask(MF, CallConv); 2748 } 2749 } else 2750 Mask = ARI->getCallPreservedMask(MF, CallConv); 2751 2752 assert(Mask && "Missing call preserved mask for calling convention"); 2753 Ops.push_back(DAG.getRegisterMask(Mask)); 2754 } 2755 2756 if (InFlag.getNode()) 2757 Ops.push_back(InFlag); 2758 2759 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 2760 if (isTailCall) { 2761 MF.getFrameInfo().setHasTailCall(); 2762 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops); 2763 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); 2764 return Ret; 2765 } 2766 2767 // Returns a chain and a flag for retval copy to use. 2768 Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); 2769 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); 2770 InFlag = Chain.getValue(1); 2771 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); 2772 2773 // If we're guaranteeing tail-calls will be honoured, the callee must 2774 // pop its own argument stack on return. But this call is *not* a tail call so 2775 // we need to undo that after it returns to restore the status-quo. 2776 bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt; 2777 uint64_t CalleePopBytes = 2778 canGuaranteeTCO(CallConv, TailCallOpt) ? 
                           alignTo(NumBytes, 16) : -1ULL;

  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
                             DAG.getIntPtrConstant(CalleePopBytes, dl, true),
                             InFlag, dl);
  if (!Ins.empty())
    InFlag = Chain.getValue(1);

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
                         InVals, isThisReturn,
                         isThisReturn ? OutVals[0] : SDValue());
}

/// HandleByVal - Every parameter *after* a byval parameter is passed
/// on the stack. Remember the next parameter register to allocate,
/// and then confiscate the rest of the parameter registers to ensure
/// this.
void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
                                    Align Alignment) const {
  // Byval (as with any stack) slots are always at least 4 byte aligned.
  Alignment = std::max(Alignment, Align(4));

  unsigned Reg = State->AllocateReg(GPRArgRegs);
  if (!Reg)
    return;

  unsigned AlignInRegs = Alignment.value() / 4;
  unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
  for (unsigned i = 0; i < Waste; ++i)
    Reg = State->AllocateReg(GPRArgRegs);

  if (!Reg)
    return;

  unsigned Excess = 4 * (ARM::R4 - Reg);

  // Special case when NSAA != SP and the parameter size is greater than the
  // size of all remaining GPR regs. In that case we can't split the
  // parameter; we must pass it entirely on the stack. We also must set NCRN
  // to R4, so all remaining registers are wasted.
  const unsigned NSAAOffset = State->getNextStackOffset();
  if (NSAAOffset != 0 && Size > Excess) {
    while (State->AllocateReg(GPRArgRegs))
      ;
    return;
  }

  // The first register for the byval parameter is the first register that
  // wasn't allocated before this method call, so it would be "reg".
  // If the parameter is small enough to be saved in the range [reg, r4), then
  // the end (first after last) register would be reg + param-size-in-regs;
  // otherwise the parameter would be split between registers and stack, and
  // the end register would be r4 in this case.
  unsigned ByValRegBegin = Reg;
  unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
  State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
  // Note, the first register is already allocated at the beginning of the
  // function; allocate the remaining registers we need.
  for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
    State->AllocateReg(GPRArgRegs);
  // A byval parameter that is split between registers and memory needs its
  // size truncated here.
  // In the case where the entire structure fits in registers, we set the
  // size in memory to zero.
  Size = std::max<int>(Size - Excess, 0);
}

/// MatchingStackOffset - Return true if the given stack call argument is
/// already available in the same position (relatively) of the caller's
/// incoming argument stack.
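/// This is used when checking sibling-call eligibility: if an outgoing stack
/// argument already sits in the caller's matching fixed stack slot, the tail
/// call does not need to store it again.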
2850 static 2851 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2852 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI, 2853 const TargetInstrInfo *TII) { 2854 unsigned Bytes = Arg.getValueSizeInBits() / 8; 2855 int FI = std::numeric_limits<int>::max(); 2856 if (Arg.getOpcode() == ISD::CopyFromReg) { 2857 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2858 if (!Register::isVirtualRegister(VR)) 2859 return false; 2860 MachineInstr *Def = MRI->getVRegDef(VR); 2861 if (!Def) 2862 return false; 2863 if (!Flags.isByVal()) { 2864 if (!TII->isLoadFromStackSlot(*Def, FI)) 2865 return false; 2866 } else { 2867 return false; 2868 } 2869 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2870 if (Flags.isByVal()) 2871 // ByVal argument is passed in as a pointer but it's now being 2872 // dereferenced. e.g. 2873 // define @foo(%struct.X* %A) { 2874 // tail call @bar(%struct.X* byval %A) 2875 // } 2876 return false; 2877 SDValue Ptr = Ld->getBasePtr(); 2878 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2879 if (!FINode) 2880 return false; 2881 FI = FINode->getIndex(); 2882 } else 2883 return false; 2884 2885 assert(FI != std::numeric_limits<int>::max()); 2886 if (!MFI.isFixedObjectIndex(FI)) 2887 return false; 2888 return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI); 2889 } 2890 2891 /// IsEligibleForTailCallOptimization - Check whether the call is eligible 2892 /// for tail call optimization. Targets which want to do tail call 2893 /// optimization should implement this function. 2894 bool ARMTargetLowering::IsEligibleForTailCallOptimization( 2895 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, 2896 bool isCalleeStructRet, bool isCallerStructRet, 2897 const SmallVectorImpl<ISD::OutputArg> &Outs, 2898 const SmallVectorImpl<SDValue> &OutVals, 2899 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG, 2900 const bool isIndirect) const { 2901 MachineFunction &MF = DAG.getMachineFunction(); 2902 const Function &CallerF = MF.getFunction(); 2903 CallingConv::ID CallerCC = CallerF.getCallingConv(); 2904 2905 assert(Subtarget->supportsTailCall()); 2906 2907 // Indirect tail calls cannot be optimized for Thumb1 if the args 2908 // to the call take up r0-r3. The reason is that there are no legal registers 2909 // left to hold the pointer to the function to be called. 2910 if (Subtarget->isThumb1Only() && Outs.size() >= 4 && 2911 (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect)) 2912 return false; 2913 2914 // Look for obvious safe cases to perform tail call optimization that do not 2915 // require ABI changes. This is what gcc calls sibcall. 2916 2917 // Exception-handling functions need a special set of instructions to indicate 2918 // a return to the hardware. Tail-calling another function would probably 2919 // break this. 2920 if (CallerF.hasFnAttribute("interrupt")) 2921 return false; 2922 2923 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt)) 2924 return CalleeCC == CallerCC; 2925 2926 // Also avoid sibcall optimization if either caller or callee uses struct 2927 // return semantics. 
2928 if (isCalleeStructRet || isCallerStructRet) 2929 return false; 2930 2931 // Externally-defined functions with weak linkage should not be 2932 // tail-called on ARM when the OS does not support dynamic 2933 // pre-emption of symbols, as the AAELF spec requires normal calls 2934 // to undefined weak functions to be replaced with a NOP or jump to the 2935 // next instruction. The behaviour of branch instructions in this 2936 // situation (as used for tail calls) is implementation-defined, so we 2937 // cannot rely on the linker replacing the tail call with a return. 2938 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2939 const GlobalValue *GV = G->getGlobal(); 2940 const Triple &TT = getTargetMachine().getTargetTriple(); 2941 if (GV->hasExternalWeakLinkage() && 2942 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) 2943 return false; 2944 } 2945 2946 // Check that the call results are passed in the same way. 2947 LLVMContext &C = *DAG.getContext(); 2948 if (!CCState::resultsCompatible( 2949 getEffectiveCallingConv(CalleeCC, isVarArg), 2950 getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins, 2951 CCAssignFnForReturn(CalleeCC, isVarArg), 2952 CCAssignFnForReturn(CallerCC, CallerF.isVarArg()))) 2953 return false; 2954 // The callee has to preserve all registers the caller needs to preserve. 2955 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); 2956 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); 2957 if (CalleeCC != CallerCC) { 2958 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); 2959 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) 2960 return false; 2961 } 2962 2963 // If Caller's vararg or byval argument has been split between registers and 2964 // stack, do not perform tail call, since part of the argument is in caller's 2965 // local frame. 2966 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>(); 2967 if (AFI_Caller->getArgRegsSaveSize()) 2968 return false; 2969 2970 // If the callee takes no arguments then go on to check the results of the 2971 // call. 2972 if (!Outs.empty()) { 2973 // Check if stack adjustment is needed. For now, do not do this if any 2974 // argument is passed on the stack. 2975 SmallVector<CCValAssign, 16> ArgLocs; 2976 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); 2977 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); 2978 if (CCInfo.getNextStackOffset()) { 2979 // Check if the arguments are already laid out in the right way as 2980 // the caller's fixed stack objects. 2981 MachineFrameInfo &MFI = MF.getFrameInfo(); 2982 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2983 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 2984 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); 2985 i != e; 2986 ++i, ++realArgIdx) { 2987 CCValAssign &VA = ArgLocs[i]; 2988 EVT RegVT = VA.getLocVT(); 2989 SDValue Arg = OutVals[realArgIdx]; 2990 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 2991 if (VA.getLocInfo() == CCValAssign::Indirect) 2992 return false; 2993 if (VA.needsCustom() && (RegVT == MVT::f64 || RegVT == MVT::v2f64)) { 2994 // f64 and vector types are split into multiple registers or 2995 // register/stack-slot combinations. The types will not match 2996 // the registers; give up on memory f64 refs until we figure 2997 // out what to do about this. 
2998 if (!VA.isRegLoc()) 2999 return false; 3000 if (!ArgLocs[++i].isRegLoc()) 3001 return false; 3002 if (RegVT == MVT::v2f64) { 3003 if (!ArgLocs[++i].isRegLoc()) 3004 return false; 3005 if (!ArgLocs[++i].isRegLoc()) 3006 return false; 3007 } 3008 } else if (!VA.isRegLoc()) { 3009 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 3010 MFI, MRI, TII)) 3011 return false; 3012 } 3013 } 3014 } 3015 3016 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3017 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) 3018 return false; 3019 } 3020 3021 return true; 3022 } 3023 3024 bool 3025 ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv, 3026 MachineFunction &MF, bool isVarArg, 3027 const SmallVectorImpl<ISD::OutputArg> &Outs, 3028 LLVMContext &Context) const { 3029 SmallVector<CCValAssign, 16> RVLocs; 3030 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); 3031 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); 3032 } 3033 3034 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps, 3035 const SDLoc &DL, SelectionDAG &DAG) { 3036 const MachineFunction &MF = DAG.getMachineFunction(); 3037 const Function &F = MF.getFunction(); 3038 3039 StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString(); 3040 3041 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset 3042 // version of the "preferred return address". These offsets affect the return 3043 // instruction if this is a return from PL1 without hypervisor extensions. 3044 // IRQ/FIQ: +4 "subs pc, lr, #4" 3045 // SWI: 0 "subs pc, lr, #0" 3046 // ABORT: +4 "subs pc, lr, #4" 3047 // UNDEF: +4/+2 "subs pc, lr, #0" 3048 // UNDEF varies depending on where the exception came from ARM or Thumb 3049 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0. 3050 3051 int64_t LROffset; 3052 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" || 3053 IntKind == "ABORT") 3054 LROffset = 4; 3055 else if (IntKind == "SWI" || IntKind == "UNDEF") 3056 LROffset = 0; 3057 else 3058 report_fatal_error("Unsupported interrupt attribute. If present, value " 3059 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF"); 3060 3061 RetOps.insert(RetOps.begin() + 1, 3062 DAG.getConstant(LROffset, DL, MVT::i32, false)); 3063 3064 return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps); 3065 } 3066 3067 SDValue 3068 ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 3069 bool isVarArg, 3070 const SmallVectorImpl<ISD::OutputArg> &Outs, 3071 const SmallVectorImpl<SDValue> &OutVals, 3072 const SDLoc &dl, SelectionDAG &DAG) const { 3073 // CCValAssign - represent the assignment of the return value to a location. 3074 SmallVector<CCValAssign, 16> RVLocs; 3075 3076 // CCState - Info about the registers and stack slots. 3077 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 3078 *DAG.getContext()); 3079 3080 // Analyze outgoing return values. 3081 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); 3082 3083 SDValue Flag; 3084 SmallVector<SDValue, 4> RetOps; 3085 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 3086 bool isLittleEndian = Subtarget->isLittle(); 3087 3088 MachineFunction &MF = DAG.getMachineFunction(); 3089 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3090 AFI->setReturnRegsCount(RVLocs.size()); 3091 3092 // Report error if cmse entry function returns structure through first ptr arg. 
3093 if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) { 3094 // Note: using an empty SDLoc(), as the first line of the function is a 3095 // better place to report than the last line. 3096 DiagnosticInfoUnsupported Diag( 3097 DAG.getMachineFunction().getFunction(), 3098 "secure entry function would return value through pointer", 3099 SDLoc().getDebugLoc()); 3100 DAG.getContext()->diagnose(Diag); 3101 } 3102 3103 // Copy the result values into the output registers. 3104 for (unsigned i = 0, realRVLocIdx = 0; 3105 i != RVLocs.size(); 3106 ++i, ++realRVLocIdx) { 3107 CCValAssign &VA = RVLocs[i]; 3108 assert(VA.isRegLoc() && "Can only return in registers!"); 3109 3110 SDValue Arg = OutVals[realRVLocIdx]; 3111 bool ReturnF16 = false; 3112 3113 if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) { 3114 // Half-precision return values can be returned like this: 3115 // 3116 // t11 f16 = fadd ... 3117 // t12: i16 = bitcast t11 3118 // t13: i32 = zero_extend t12 3119 // t14: f32 = bitcast t13 <~~~~~~~ Arg 3120 // 3121 // to avoid code generation for bitcasts, we simply set Arg to the node 3122 // that produces the f16 value, t11 in this case. 3123 // 3124 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) { 3125 SDValue ZE = Arg.getOperand(0); 3126 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) { 3127 SDValue BC = ZE.getOperand(0); 3128 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) { 3129 Arg = BC.getOperand(0); 3130 ReturnF16 = true; 3131 } 3132 } 3133 } 3134 } 3135 3136 switch (VA.getLocInfo()) { 3137 default: llvm_unreachable("Unknown loc info!"); 3138 case CCValAssign::Full: break; 3139 case CCValAssign::BCvt: 3140 if (!ReturnF16) 3141 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 3142 break; 3143 } 3144 3145 // Mask f16 arguments if this is a CMSE nonsecure entry. 3146 auto RetVT = Outs[realRVLocIdx].ArgVT; 3147 if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) { 3148 if (VA.needsCustom() && VA.getValVT() == MVT::f16) { 3149 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg); 3150 } else { 3151 auto LocBits = VA.getLocVT().getSizeInBits(); 3152 auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits()); 3153 SDValue Mask = 3154 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits)); 3155 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg); 3156 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask); 3157 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 3158 } 3159 } 3160 3161 if (VA.needsCustom() && 3162 (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) { 3163 if (VA.getLocVT() == MVT::v2f64) { 3164 // Extract the first half and return it in two registers. 3165 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 3166 DAG.getConstant(0, dl, MVT::i32)); 3167 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl, 3168 DAG.getVTList(MVT::i32, MVT::i32), Half); 3169 3170 Chain = 3171 DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 3172 HalfGPRs.getValue(isLittleEndian ? 0 : 1), Flag); 3173 Flag = Chain.getValue(1); 3174 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 3175 VA = RVLocs[++i]; // skip ahead to next loc 3176 Chain = 3177 DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 3178 HalfGPRs.getValue(isLittleEndian ? 
1 : 0), Flag); 3179 Flag = Chain.getValue(1); 3180 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 3181 VA = RVLocs[++i]; // skip ahead to next loc 3182 3183 // Extract the 2nd half and fall through to handle it as an f64 value. 3184 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 3185 DAG.getConstant(1, dl, MVT::i32)); 3186 } 3187 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is 3188 // available. 3189 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, 3190 DAG.getVTList(MVT::i32, MVT::i32), Arg); 3191 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 3192 fmrrd.getValue(isLittleEndian ? 0 : 1), Flag); 3193 Flag = Chain.getValue(1); 3194 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 3195 VA = RVLocs[++i]; // skip ahead to next loc 3196 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 3197 fmrrd.getValue(isLittleEndian ? 1 : 0), Flag); 3198 } else 3199 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); 3200 3201 // Guarantee that all emitted copies are 3202 // stuck together, avoiding something bad. 3203 Flag = Chain.getValue(1); 3204 RetOps.push_back(DAG.getRegister( 3205 VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT())); 3206 } 3207 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); 3208 const MCPhysReg *I = 3209 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); 3210 if (I) { 3211 for (; *I; ++I) { 3212 if (ARM::GPRRegClass.contains(*I)) 3213 RetOps.push_back(DAG.getRegister(*I, MVT::i32)); 3214 else if (ARM::DPRRegClass.contains(*I)) 3215 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); 3216 else 3217 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 3218 } 3219 } 3220 3221 // Update chain and glue. 3222 RetOps[0] = Chain; 3223 if (Flag.getNode()) 3224 RetOps.push_back(Flag); 3225 3226 // CPUs which aren't M-class use a special sequence to return from 3227 // exceptions (roughly, any instruction setting pc and cpsr simultaneously, 3228 // though we use "subs pc, lr, #N"). 3229 // 3230 // M-class CPUs actually use a normal return sequence with a special 3231 // (hardware-provided) value in LR, so the normal code path works. 3232 if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") && 3233 !Subtarget->isMClass()) { 3234 if (Subtarget->isThumb1Only()) 3235 report_fatal_error("interrupt attribute is not supported in Thumb1"); 3236 return LowerInterruptReturn(RetOps, dl, DAG); 3237 } 3238 3239 ARMISD::NodeType RetNode = AFI->isCmseNSEntryFunction() ? ARMISD::SERET_FLAG : 3240 ARMISD::RET_FLAG; 3241 return DAG.getNode(RetNode, dl, MVT::Other, RetOps); 3242 } 3243 3244 bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { 3245 if (N->getNumValues() != 1) 3246 return false; 3247 if (!N->hasNUsesOfValue(1, 0)) 3248 return false; 3249 3250 SDValue TCChain = Chain; 3251 SDNode *Copy = *N->use_begin(); 3252 if (Copy->getOpcode() == ISD::CopyToReg) { 3253 // If the copy has a glue operand, we conservatively assume it isn't safe to 3254 // perform a tail call. 3255 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 3256 return false; 3257 TCChain = Copy->getOperand(0); 3258 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) { 3259 SDNode *VMov = Copy; 3260 // f64 returned in a pair of GPRs. 
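    // The VMOVRRD value feeds two CopyToReg nodes, one per GPR half. Walk
    // both uses below to find the chain entering the first copy; that is the
    // chain a tail call would have to take over.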
3261 SmallPtrSet<SDNode*, 2> Copies; 3262 for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); 3263 UI != UE; ++UI) { 3264 if (UI->getOpcode() != ISD::CopyToReg) 3265 return false; 3266 Copies.insert(*UI); 3267 } 3268 if (Copies.size() > 2) 3269 return false; 3270 3271 for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); 3272 UI != UE; ++UI) { 3273 SDValue UseChain = UI->getOperand(0); 3274 if (Copies.count(UseChain.getNode())) 3275 // Second CopyToReg 3276 Copy = *UI; 3277 else { 3278 // We are at the top of this chain. 3279 // If the copy has a glue operand, we conservatively assume it 3280 // isn't safe to perform a tail call. 3281 if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue) 3282 return false; 3283 // First CopyToReg 3284 TCChain = UseChain; 3285 } 3286 } 3287 } else if (Copy->getOpcode() == ISD::BITCAST) { 3288 // f32 returned in a single GPR. 3289 if (!Copy->hasOneUse()) 3290 return false; 3291 Copy = *Copy->use_begin(); 3292 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0)) 3293 return false; 3294 // If the copy has a glue operand, we conservatively assume it isn't safe to 3295 // perform a tail call. 3296 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 3297 return false; 3298 TCChain = Copy->getOperand(0); 3299 } else { 3300 return false; 3301 } 3302 3303 bool HasRet = false; 3304 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); 3305 UI != UE; ++UI) { 3306 if (UI->getOpcode() != ARMISD::RET_FLAG && 3307 UI->getOpcode() != ARMISD::INTRET_FLAG) 3308 return false; 3309 HasRet = true; 3310 } 3311 3312 if (!HasRet) 3313 return false; 3314 3315 Chain = TCChain; 3316 return true; 3317 } 3318 3319 bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { 3320 if (!Subtarget->supportsTailCall()) 3321 return false; 3322 3323 if (!CI->isTailCall()) 3324 return false; 3325 3326 return true; 3327 } 3328 3329 // Trying to write a 64 bit value so need to split into two 32 bit values first, 3330 // and pass the lower and high parts through. 3331 static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) { 3332 SDLoc DL(Op); 3333 SDValue WriteValue = Op->getOperand(2); 3334 3335 // This function is only supposed to be called for i64 type argument. 3336 assert(WriteValue.getValueType() == MVT::i64 3337 && "LowerWRITE_REGISTER called for non-i64 type argument."); 3338 3339 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue, 3340 DAG.getConstant(0, DL, MVT::i32)); 3341 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue, 3342 DAG.getConstant(1, DL, MVT::i32)); 3343 SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi }; 3344 return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops); 3345 } 3346 3347 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 3348 // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is 3349 // one of the above mentioned nodes. It has to be wrapped because otherwise 3350 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 3351 // be used to form addressing mode. These wrapped nodes will be selected 3352 // into MOVi. 
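// For example, a (ARMISD::Wrapper (TargetGlobalAddress @g)) node can be
// matched to a movw/movt pair or a literal-pool load, whereas Select() would
// hand the bare TargetGlobalAddress back unchanged.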
3353 SDValue ARMTargetLowering::LowerConstantPool(SDValue Op, 3354 SelectionDAG &DAG) const { 3355 EVT PtrVT = Op.getValueType(); 3356 // FIXME there is no actual debug info here 3357 SDLoc dl(Op); 3358 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 3359 SDValue Res; 3360 3361 // When generating execute-only code Constant Pools must be promoted to the 3362 // global data section. It's a bit ugly that we can't share them across basic 3363 // blocks, but this way we guarantee that execute-only behaves correct with 3364 // position-independent addressing modes. 3365 if (Subtarget->genExecuteOnly()) { 3366 auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>(); 3367 auto T = const_cast<Type*>(CP->getType()); 3368 auto C = const_cast<Constant*>(CP->getConstVal()); 3369 auto M = const_cast<Module*>(DAG.getMachineFunction(). 3370 getFunction().getParent()); 3371 auto GV = new GlobalVariable( 3372 *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C, 3373 Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" + 3374 Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" + 3375 Twine(AFI->createPICLabelUId()) 3376 ); 3377 SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV), 3378 dl, PtrVT); 3379 return LowerGlobalAddress(GA, DAG); 3380 } 3381 3382 if (CP->isMachineConstantPoolEntry()) 3383 Res = 3384 DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CP->getAlign()); 3385 else 3386 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlign()); 3387 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res); 3388 } 3389 3390 unsigned ARMTargetLowering::getJumpTableEncoding() const { 3391 return MachineJumpTableInfo::EK_Inline; 3392 } 3393 3394 SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, 3395 SelectionDAG &DAG) const { 3396 MachineFunction &MF = DAG.getMachineFunction(); 3397 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3398 unsigned ARMPCLabelIndex = 0; 3399 SDLoc DL(Op); 3400 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3401 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 3402 SDValue CPAddr; 3403 bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI(); 3404 if (!IsPositionIndependent) { 3405 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4)); 3406 } else { 3407 unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; 3408 ARMPCLabelIndex = AFI->createPICLabelUId(); 3409 ARMConstantPoolValue *CPV = 3410 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex, 3411 ARMCP::CPBlockAddress, PCAdj); 3412 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); 3413 } 3414 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr); 3415 SDValue Result = DAG.getLoad( 3416 PtrVT, DL, DAG.getEntryNode(), CPAddr, 3417 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3418 if (!IsPositionIndependent) 3419 return Result; 3420 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32); 3421 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel); 3422 } 3423 3424 /// Convert a TLS address reference into the correct sequence of loads 3425 /// and calls to compute the variable's address for Darwin, and return an 3426 /// SDValue containing the final node. 3427 3428 /// Darwin only has one TLS scheme which must be capable of dealing with the 3429 /// fully general situation, in the worst case. This means: 3430 /// + "extern __thread" declaration. 3431 /// + Defined in a possibly unknown dynamic library. 
///
/// The general system is that each __thread variable has a [3 x i32]
/// descriptor which contains information used by the runtime to calculate the
/// address. The only part of this the compiler needs to know about is the
/// first word, which contains a function pointer that must be called with the
/// address of the entire descriptor in "r0".
///
/// Since this descriptor may be in a different unit, in general access must
/// proceed along the usual ARM rules. A common sequence to produce is:
///
///     movw rT1, :lower16:_var$non_lazy_ptr
///     movt rT1, :upper16:_var$non_lazy_ptr
///     ldr r0, [rT1]
///     ldr rT2, [r0]
///     blx rT2
///     [...address now in r0...]
SDValue
ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin() &&
         "This function expects a Darwin target");
  SDLoc DL(Op);

  // The first step is to get the address of the actual global symbol. This is
  // where the TLS descriptor lives.
  SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);

  // The first entry in the descriptor is a function pointer that we must call
  // to obtain the address of the variable.
  SDValue Chain = DAG.getEntryNode();
  SDValue FuncTLVGet = DAG.getLoad(
      MVT::i32, DL, Chain, DescAddr,
      MachinePointerInfo::getGOT(DAG.getMachineFunction()), Align(4),
      MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant);
  Chain = FuncTLVGet.getValue(1);

  MachineFunction &F = DAG.getMachineFunction();
  MachineFrameInfo &MFI = F.getFrameInfo();
  MFI.setAdjustsStack(true);

  // TLS calls preserve all registers except those that absolutely must be
  // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not
  // be silly).
  auto TRI =
      getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo();
  auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
  const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());

  // Finally, we can make the call. This is just a degenerate version of a
  // normal ARM call node: r0 takes the address of the descriptor, and
  // returns the address of the variable in this thread.
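  // The copies below are glued to the ARMISD::CALL node so that r0 (descriptor
  // address on entry, variable address on return) and the register mask stay
  // attached to the call.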
3484 Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue()); 3485 Chain = 3486 DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), 3487 Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32), 3488 DAG.getRegisterMask(Mask), Chain.getValue(1)); 3489 return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1)); 3490 } 3491 3492 SDValue 3493 ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op, 3494 SelectionDAG &DAG) const { 3495 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering"); 3496 3497 SDValue Chain = DAG.getEntryNode(); 3498 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3499 SDLoc DL(Op); 3500 3501 // Load the current TEB (thread environment block) 3502 SDValue Ops[] = {Chain, 3503 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32), 3504 DAG.getTargetConstant(15, DL, MVT::i32), 3505 DAG.getTargetConstant(0, DL, MVT::i32), 3506 DAG.getTargetConstant(13, DL, MVT::i32), 3507 DAG.getTargetConstant(0, DL, MVT::i32), 3508 DAG.getTargetConstant(2, DL, MVT::i32)}; 3509 SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, 3510 DAG.getVTList(MVT::i32, MVT::Other), Ops); 3511 3512 SDValue TEB = CurrentTEB.getValue(0); 3513 Chain = CurrentTEB.getValue(1); 3514 3515 // Load the ThreadLocalStoragePointer from the TEB 3516 // A pointer to the TLS array is located at offset 0x2c from the TEB. 3517 SDValue TLSArray = 3518 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL)); 3519 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo()); 3520 3521 // The pointer to the thread's TLS data area is at the TLS Index scaled by 4 3522 // offset into the TLSArray. 3523 3524 // Load the TLS index from the C runtime 3525 SDValue TLSIndex = 3526 DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG); 3527 TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex); 3528 TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo()); 3529 3530 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex, 3531 DAG.getConstant(2, DL, MVT::i32)); 3532 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain, 3533 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot), 3534 MachinePointerInfo()); 3535 3536 // Get the offset of the start of the .tls section (section base) 3537 const auto *GA = cast<GlobalAddressSDNode>(Op); 3538 auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL); 3539 SDValue Offset = DAG.getLoad( 3540 PtrVT, DL, Chain, 3541 DAG.getNode(ARMISD::Wrapper, DL, MVT::i32, 3542 DAG.getTargetConstantPool(CPV, PtrVT, Align(4))), 3543 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3544 3545 return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset); 3546 } 3547 3548 // Lower ISD::GlobalTLSAddress using the "general dynamic" model 3549 SDValue 3550 ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, 3551 SelectionDAG &DAG) const { 3552 SDLoc dl(GA); 3553 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3554 unsigned char PCAdj = Subtarget->isThumb() ? 
4 : 8; 3555 MachineFunction &MF = DAG.getMachineFunction(); 3556 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3557 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 3558 ARMConstantPoolValue *CPV = 3559 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, 3560 ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); 3561 SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); 3562 Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); 3563 Argument = DAG.getLoad( 3564 PtrVT, dl, DAG.getEntryNode(), Argument, 3565 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3566 SDValue Chain = Argument.getValue(1); 3567 3568 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 3569 Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel); 3570 3571 // call __tls_get_addr. 3572 ArgListTy Args; 3573 ArgListEntry Entry; 3574 Entry.Node = Argument; 3575 Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext()); 3576 Args.push_back(Entry); 3577 3578 // FIXME: is there useful debug info available here? 3579 TargetLowering::CallLoweringInfo CLI(DAG); 3580 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee( 3581 CallingConv::C, Type::getInt32Ty(*DAG.getContext()), 3582 DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args)); 3583 3584 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 3585 return CallResult.first; 3586 } 3587 3588 // Lower ISD::GlobalTLSAddress using the "initial exec" or 3589 // "local exec" model. 3590 SDValue 3591 ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, 3592 SelectionDAG &DAG, 3593 TLSModel::Model model) const { 3594 const GlobalValue *GV = GA->getGlobal(); 3595 SDLoc dl(GA); 3596 SDValue Offset; 3597 SDValue Chain = DAG.getEntryNode(); 3598 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3599 // Get the Thread Pointer 3600 SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); 3601 3602 if (model == TLSModel::InitialExec) { 3603 MachineFunction &MF = DAG.getMachineFunction(); 3604 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3605 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 3606 // Initial exec model. 3607 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; 3608 ARMConstantPoolValue *CPV = 3609 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, 3610 ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, 3611 true); 3612 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); 3613 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); 3614 Offset = DAG.getLoad( 3615 PtrVT, dl, Chain, Offset, 3616 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3617 Chain = Offset.getValue(1); 3618 3619 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 3620 Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel); 3621 3622 Offset = DAG.getLoad( 3623 PtrVT, dl, Chain, Offset, 3624 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3625 } else { 3626 // local exec model 3627 assert(model == TLSModel::LocalExec); 3628 ARMConstantPoolValue *CPV = 3629 ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF); 3630 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); 3631 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); 3632 Offset = DAG.getLoad( 3633 PtrVT, dl, Chain, Offset, 3634 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3635 } 3636 3637 // The address of the thread local variable is the add of the thread 3638 // pointer with the offset of the variable. 
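  // (Initial-exec fetched that offset through the GOT above; local-exec read
  // it directly from a TPOFF constant-pool entry.)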
3639 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 3640 } 3641 3642 SDValue 3643 ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 3644 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 3645 if (DAG.getTarget().useEmulatedTLS()) 3646 return LowerToTLSEmulatedModel(GA, DAG); 3647 3648 if (Subtarget->isTargetDarwin()) 3649 return LowerGlobalTLSAddressDarwin(Op, DAG); 3650 3651 if (Subtarget->isTargetWindows()) 3652 return LowerGlobalTLSAddressWindows(Op, DAG); 3653 3654 // TODO: implement the "local dynamic" model 3655 assert(Subtarget->isTargetELF() && "Only ELF implemented here"); 3656 TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal()); 3657 3658 switch (model) { 3659 case TLSModel::GeneralDynamic: 3660 case TLSModel::LocalDynamic: 3661 return LowerToTLSGeneralDynamicModel(GA, DAG); 3662 case TLSModel::InitialExec: 3663 case TLSModel::LocalExec: 3664 return LowerToTLSExecModels(GA, DAG, model); 3665 } 3666 llvm_unreachable("bogus TLS model"); 3667 } 3668 3669 /// Return true if all users of V are within function F, looking through 3670 /// ConstantExprs. 3671 static bool allUsersAreInFunction(const Value *V, const Function *F) { 3672 SmallVector<const User*,4> Worklist(V->users()); 3673 while (!Worklist.empty()) { 3674 auto *U = Worklist.pop_back_val(); 3675 if (isa<ConstantExpr>(U)) { 3676 append_range(Worklist, U->users()); 3677 continue; 3678 } 3679 3680 auto *I = dyn_cast<Instruction>(U); 3681 if (!I || I->getParent()->getParent() != F) 3682 return false; 3683 } 3684 return true; 3685 } 3686 3687 static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, 3688 const GlobalValue *GV, SelectionDAG &DAG, 3689 EVT PtrVT, const SDLoc &dl) { 3690 // If we're creating a pool entry for a constant global with unnamed address, 3691 // and the global is small enough, we can emit it inline into the constant pool 3692 // to save ourselves an indirection. 3693 // 3694 // This is a win if the constant is only used in one function (so it doesn't 3695 // need to be duplicated) or duplicating the constant wouldn't increase code 3696 // size (implying the constant is no larger than 4 bytes). 3697 const Function &F = DAG.getMachineFunction().getFunction(); 3698 3699 // We rely on this decision to inline being idemopotent and unrelated to the 3700 // use-site. We know that if we inline a variable at one use site, we'll 3701 // inline it elsewhere too (and reuse the constant pool entry). Fast-isel 3702 // doesn't know about this optimization, so bail out if it's enabled else 3703 // we could decide to inline here (and thus never emit the GV) but require 3704 // the GV from fast-isel generated code. 3705 if (!EnableConstpoolPromotion || 3706 DAG.getMachineFunction().getTarget().Options.EnableFastISel) 3707 return SDValue(); 3708 3709 auto *GVar = dyn_cast<GlobalVariable>(GV); 3710 if (!GVar || !GVar->hasInitializer() || 3711 !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() || 3712 !GVar->hasLocalLinkage()) 3713 return SDValue(); 3714 3715 // If we inline a value that contains relocations, we move the relocations 3716 // from .data to .text. This is not allowed in position-independent code. 3717 auto *Init = GVar->getInitializer(); 3718 if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) && 3719 Init->needsDynamicRelocation()) 3720 return SDValue(); 3721 3722 // The constant islands pass can only really deal with alignment requests 3723 // <= 4 bytes and cannot pad constants itself. 
Therefore we cannot promote
  // any type that requires alignment greater than 4 bytes. We also can only
  // promote constants that are multiples of 4 bytes in size or are paddable
  // to a multiple of 4. Currently, for simplicity, we only try to pad
  // constants that are strings.
  auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
  unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
  Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar);
  unsigned RequiredPadding = 4 - (Size % 4);
  bool PaddingPossible =
      RequiredPadding == 4 || (CDAInit && CDAInit->isString());
  if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize ||
      Size == 0)
    return SDValue();

  unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // We can't bloat the constant pool too much, else the ConstantIslands pass
  // may fail to converge. If we haven't promoted this global yet (it may have
  // multiple uses), and promoting it would increase the constant pool size (Sz
  // > 4), ensure we have space to do so up to MaxTotal.
  if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
    if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
        ConstpoolPromotionMaxTotal)
      return SDValue();

  // This is only valid if all users are in a single function; we can't clone
  // the constant in general. The LLVM IR unnamed_addr allows merging
  // constants, but not cloning them.
  //
  // We could potentially allow cloning if we could prove all uses of the
  // constant in the current function don't care about the address, like
  // printf format strings. But that isn't implemented for now.
  if (!allUsersAreInFunction(GVar, &F))
    return SDValue();

  // We're going to inline this global. Pad it out if needed.
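  // For example, a 6-byte string initializer has RequiredPadding == 2 and is
  // re-emitted below as an 8-byte array with two trailing zero bytes.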
3762 if (RequiredPadding != 4) { 3763 StringRef S = CDAInit->getAsString(); 3764 3765 SmallVector<uint8_t,16> V(S.size()); 3766 std::copy(S.bytes_begin(), S.bytes_end(), V.begin()); 3767 while (RequiredPadding--) 3768 V.push_back(0); 3769 Init = ConstantDataArray::get(*DAG.getContext(), V); 3770 } 3771 3772 auto CPVal = ARMConstantPoolConstant::Create(GVar, Init); 3773 SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4)); 3774 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) { 3775 AFI->markGlobalAsPromotedToConstantPool(GVar); 3776 AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() + 3777 PaddedSize - 4); 3778 } 3779 ++NumConstpoolPromoted; 3780 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3781 } 3782 3783 bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const { 3784 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 3785 if (!(GV = GA->getBaseObject())) 3786 return false; 3787 if (const auto *V = dyn_cast<GlobalVariable>(GV)) 3788 return V->isConstant(); 3789 return isa<Function>(GV); 3790 } 3791 3792 SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op, 3793 SelectionDAG &DAG) const { 3794 switch (Subtarget->getTargetTriple().getObjectFormat()) { 3795 default: llvm_unreachable("unknown object format"); 3796 case Triple::COFF: 3797 return LowerGlobalAddressWindows(Op, DAG); 3798 case Triple::ELF: 3799 return LowerGlobalAddressELF(Op, DAG); 3800 case Triple::MachO: 3801 return LowerGlobalAddressDarwin(Op, DAG); 3802 } 3803 } 3804 3805 SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, 3806 SelectionDAG &DAG) const { 3807 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3808 SDLoc dl(Op); 3809 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3810 const TargetMachine &TM = getTargetMachine(); 3811 bool IsRO = isReadOnly(GV); 3812 3813 // promoteToConstantPool only if not generating XO text section 3814 if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV) && !Subtarget->genExecuteOnly()) 3815 if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl)) 3816 return V; 3817 3818 if (isPositionIndependent()) { 3819 bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV); 3820 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 3821 UseGOT_PREL ? ARMII::MO_GOT : 0); 3822 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G); 3823 if (UseGOT_PREL) 3824 Result = 3825 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, 3826 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3827 return Result; 3828 } else if (Subtarget->isROPI() && IsRO) { 3829 // PC-relative. 3830 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT); 3831 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G); 3832 return Result; 3833 } else if (Subtarget->isRWPI() && !IsRO) { 3834 // SB-relative. 
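    // In RWPI code r9 serves as the static base (SB): the address is formed
    // as SB plus an SB-relative offset, materialized below either with
    // movw/movt (MO_SBREL) or via a literal-pool entry.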
3835 SDValue RelAddr; 3836 if (Subtarget->useMovt()) { 3837 ++NumMovwMovt; 3838 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL); 3839 RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G); 3840 } else { // use literal pool for address constant 3841 ARMConstantPoolValue *CPV = 3842 ARMConstantPoolConstant::Create(GV, ARMCP::SBREL); 3843 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); 3844 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3845 RelAddr = DAG.getLoad( 3846 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3847 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3848 } 3849 SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT); 3850 SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr); 3851 return Result; 3852 } 3853 3854 // If we have T2 ops, we can materialize the address directly via movt/movw 3855 // pair. This is always cheaper. 3856 if (Subtarget->useMovt()) { 3857 ++NumMovwMovt; 3858 // FIXME: Once remat is capable of dealing with instructions with register 3859 // operands, expand this into two nodes. 3860 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, 3861 DAG.getTargetGlobalAddress(GV, dl, PtrVT)); 3862 } else { 3863 SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4)); 3864 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3865 return DAG.getLoad( 3866 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3867 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3868 } 3869 } 3870 3871 SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, 3872 SelectionDAG &DAG) const { 3873 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 3874 "ROPI/RWPI not currently supported for Darwin"); 3875 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3876 SDLoc dl(Op); 3877 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3878 3879 if (Subtarget->useMovt()) 3880 ++NumMovwMovt; 3881 3882 // FIXME: Once remat is capable of dealing with instructions with register 3883 // operands, expand this into multiple nodes 3884 unsigned Wrapper = 3885 isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper; 3886 3887 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY); 3888 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G); 3889 3890 if (Subtarget->isGVIndirectSymbol(GV)) 3891 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, 3892 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3893 return Result; 3894 } 3895 3896 SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, 3897 SelectionDAG &DAG) const { 3898 assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported"); 3899 assert(Subtarget->useMovt() && 3900 "Windows on ARM expects to use movw/movt"); 3901 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 3902 "ROPI/RWPI not currently supported for Windows"); 3903 3904 const TargetMachine &TM = getTargetMachine(); 3905 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3906 ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG; 3907 if (GV->hasDLLImportStorageClass()) 3908 TargetFlags = ARMII::MO_DLLIMPORT; 3909 else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) 3910 TargetFlags = ARMII::MO_COFFSTUB; 3911 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3912 SDValue Result; 3913 SDLoc DL(Op); 3914 3915 ++NumMovwMovt; 3916 3917 // FIXME: Once remat is capable of dealing with instructions with register 3918 // operands, expand this into two nodes. 
3919 Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, 3920 DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0, 3921 TargetFlags)); 3922 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB)) 3923 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, 3924 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3925 return Result; 3926 } 3927 3928 SDValue 3929 ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { 3930 SDLoc dl(Op); 3931 SDValue Val = DAG.getConstant(0, dl, MVT::i32); 3932 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, 3933 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), 3934 Op.getOperand(1), Val); 3935 } 3936 3937 SDValue 3938 ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { 3939 SDLoc dl(Op); 3940 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0), 3941 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32)); 3942 } 3943 3944 SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, 3945 SelectionDAG &DAG) const { 3946 SDLoc dl(Op); 3947 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other, 3948 Op.getOperand(0)); 3949 } 3950 3951 SDValue ARMTargetLowering::LowerINTRINSIC_VOID( 3952 SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const { 3953 unsigned IntNo = 3954 cast<ConstantSDNode>( 3955 Op.getOperand(Op.getOperand(0).getValueType() == MVT::Other)) 3956 ->getZExtValue(); 3957 switch (IntNo) { 3958 default: 3959 return SDValue(); // Don't custom lower most intrinsics. 3960 case Intrinsic::arm_gnu_eabi_mcount: { 3961 MachineFunction &MF = DAG.getMachineFunction(); 3962 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3963 SDLoc dl(Op); 3964 SDValue Chain = Op.getOperand(0); 3965 // call "\01__gnu_mcount_nc" 3966 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); 3967 const uint32_t *Mask = 3968 ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C); 3969 assert(Mask && "Missing call preserved mask for calling convention"); 3970 // Mark LR an implicit live-in. 3971 unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); 3972 SDValue ReturnAddress = 3973 DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT); 3974 constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue}; 3975 SDValue Callee = 3976 DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0); 3977 SDValue RegisterMask = DAG.getRegisterMask(Mask); 3978 if (Subtarget->isThumb()) 3979 return SDValue( 3980 DAG.getMachineNode( 3981 ARM::tBL_PUSHLR, dl, ResultTys, 3982 {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT), 3983 DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}), 3984 0); 3985 return SDValue( 3986 DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys, 3987 {ReturnAddress, Callee, RegisterMask, Chain}), 3988 0); 3989 } 3990 } 3991 } 3992 3993 SDValue 3994 ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, 3995 const ARMSubtarget *Subtarget) const { 3996 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 3997 SDLoc dl(Op); 3998 switch (IntNo) { 3999 default: return SDValue(); // Don't custom lower most intrinsics. 
4000 case Intrinsic::thread_pointer: { 4001 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 4002 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); 4003 } 4004 case Intrinsic::arm_cls: { 4005 const SDValue &Operand = Op.getOperand(1); 4006 const EVT VTy = Op.getValueType(); 4007 SDValue SRA = 4008 DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy)); 4009 SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand); 4010 SDValue SHL = 4011 DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy)); 4012 SDValue OR = 4013 DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy)); 4014 SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR); 4015 return Result; 4016 } 4017 case Intrinsic::arm_cls64: { 4018 // cls(x) = if cls(hi(x)) != 31 then cls(hi(x)) 4019 // else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x))) 4020 const SDValue &Operand = Op.getOperand(1); 4021 const EVT VTy = Op.getValueType(); 4022 4023 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand, 4024 DAG.getConstant(1, dl, VTy)); 4025 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand, 4026 DAG.getConstant(0, dl, VTy)); 4027 SDValue Constant0 = DAG.getConstant(0, dl, VTy); 4028 SDValue Constant1 = DAG.getConstant(1, dl, VTy); 4029 SDValue Constant31 = DAG.getConstant(31, dl, VTy); 4030 SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31); 4031 SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi); 4032 SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1); 4033 SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1); 4034 SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi); 4035 SDValue CheckLo = 4036 DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ); 4037 SDValue HiIsZero = 4038 DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ); 4039 SDValue AdjustedLo = 4040 DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy)); 4041 SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo); 4042 SDValue Result = 4043 DAG.getSelect(dl, VTy, CheckLo, 4044 DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi); 4045 return Result; 4046 } 4047 case Intrinsic::eh_sjlj_lsda: { 4048 MachineFunction &MF = DAG.getMachineFunction(); 4049 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 4050 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 4051 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 4052 SDValue CPAddr; 4053 bool IsPositionIndependent = isPositionIndependent(); 4054 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0; 4055 ARMConstantPoolValue *CPV = 4056 ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex, 4057 ARMCP::CPLSDA, PCAdj); 4058 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); 4059 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 4060 SDValue Result = DAG.getLoad( 4061 PtrVT, dl, DAG.getEntryNode(), CPAddr, 4062 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 4063 4064 if (IsPositionIndependent) { 4065 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 4066 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); 4067 } 4068 return Result; 4069 } 4070 case Intrinsic::arm_neon_vabs: 4071 return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(), 4072 Op.getOperand(1)); 4073 case Intrinsic::arm_neon_vmulls: 4074 case Intrinsic::arm_neon_vmullu: { 4075 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls) 4076 ? 
ARMISD::VMULLs : ARMISD::VMULLu; 4077 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 4078 Op.getOperand(1), Op.getOperand(2)); 4079 } 4080 case Intrinsic::arm_neon_vminnm: 4081 case Intrinsic::arm_neon_vmaxnm: { 4082 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm) 4083 ? ISD::FMINNUM : ISD::FMAXNUM; 4084 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 4085 Op.getOperand(1), Op.getOperand(2)); 4086 } 4087 case Intrinsic::arm_neon_vminu: 4088 case Intrinsic::arm_neon_vmaxu: { 4089 if (Op.getValueType().isFloatingPoint()) 4090 return SDValue(); 4091 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu) 4092 ? ISD::UMIN : ISD::UMAX; 4093 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 4094 Op.getOperand(1), Op.getOperand(2)); 4095 } 4096 case Intrinsic::arm_neon_vmins: 4097 case Intrinsic::arm_neon_vmaxs: { 4098 // v{min,max}s is overloaded between signed integers and floats. 4099 if (!Op.getValueType().isFloatingPoint()) { 4100 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) 4101 ? ISD::SMIN : ISD::SMAX; 4102 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 4103 Op.getOperand(1), Op.getOperand(2)); 4104 } 4105 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) 4106 ? ISD::FMINIMUM : ISD::FMAXIMUM; 4107 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 4108 Op.getOperand(1), Op.getOperand(2)); 4109 } 4110 case Intrinsic::arm_neon_vtbl1: 4111 return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(), 4112 Op.getOperand(1), Op.getOperand(2)); 4113 case Intrinsic::arm_neon_vtbl2: 4114 return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(), 4115 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 4116 case Intrinsic::arm_mve_pred_i2v: 4117 case Intrinsic::arm_mve_pred_v2i: 4118 return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(), 4119 Op.getOperand(1)); 4120 case Intrinsic::arm_mve_vreinterpretq: 4121 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(), 4122 Op.getOperand(1)); 4123 case Intrinsic::arm_mve_lsll: 4124 return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(), 4125 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 4126 case Intrinsic::arm_mve_asrl: 4127 return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(), 4128 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 4129 } 4130 } 4131 4132 static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, 4133 const ARMSubtarget *Subtarget) { 4134 SDLoc dl(Op); 4135 ConstantSDNode *SSIDNode = cast<ConstantSDNode>(Op.getOperand(2)); 4136 auto SSID = static_cast<SyncScope::ID>(SSIDNode->getZExtValue()); 4137 if (SSID == SyncScope::SingleThread) 4138 return Op; 4139 4140 if (!Subtarget->hasDataBarrier()) { 4141 // Some ARMv6 cpus can support data barriers with an mcr instruction. 4142 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get 4143 // here. 4144 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() && 4145 "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!"); 4146 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0), 4147 DAG.getConstant(0, dl, MVT::i32)); 4148 } 4149 4150 ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1)); 4151 AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue()); 4152 ARM_MB::MemBOpt Domain = ARM_MB::ISH; 4153 if (Subtarget->isMClass()) { 4154 // Only a full system barrier exists in the M-class architectures. 
4155 Domain = ARM_MB::SY; 4156 } else if (Subtarget->preferISHSTBarriers() && 4157 Ord == AtomicOrdering::Release) { 4158 // Swift happens to implement ISHST barriers in a way that's compatible with 4159 // Release semantics but weaker than ISH so we'd be fools not to use 4160 // it. Beware: other processors probably don't! 4161 Domain = ARM_MB::ISHST; 4162 } 4163 4164 return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0), 4165 DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32), 4166 DAG.getConstant(Domain, dl, MVT::i32)); 4167 } 4168 4169 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG, 4170 const ARMSubtarget *Subtarget) { 4171 // ARM pre v5TE and Thumb1 does not have preload instructions. 4172 if (!(Subtarget->isThumb2() || 4173 (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps()))) 4174 // Just preserve the chain. 4175 return Op.getOperand(0); 4176 4177 SDLoc dl(Op); 4178 unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1; 4179 if (!isRead && 4180 (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension())) 4181 // ARMv7 with MP extension has PLDW. 4182 return Op.getOperand(0); 4183 4184 unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 4185 if (Subtarget->isThumb()) { 4186 // Invert the bits. 4187 isRead = ~isRead & 1; 4188 isData = ~isData & 1; 4189 } 4190 4191 return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0), 4192 Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32), 4193 DAG.getConstant(isData, dl, MVT::i32)); 4194 } 4195 4196 static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { 4197 MachineFunction &MF = DAG.getMachineFunction(); 4198 ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>(); 4199 4200 // vastart just stores the address of the VarArgsFrameIndex slot into the 4201 // memory location argument. 4202 SDLoc dl(Op); 4203 EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); 4204 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); 4205 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 4206 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), 4207 MachinePointerInfo(SV)); 4208 } 4209 4210 SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, 4211 CCValAssign &NextVA, 4212 SDValue &Root, 4213 SelectionDAG &DAG, 4214 const SDLoc &dl) const { 4215 MachineFunction &MF = DAG.getMachineFunction(); 4216 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 4217 4218 const TargetRegisterClass *RC; 4219 if (AFI->isThumb1OnlyFunction()) 4220 RC = &ARM::tGPRRegClass; 4221 else 4222 RC = &ARM::GPRRegClass; 4223 4224 // Transform the arguments stored in physical registers into virtual ones. 4225 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 4226 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); 4227 4228 SDValue ArgValue2; 4229 if (NextVA.isMemLoc()) { 4230 MachineFrameInfo &MFI = MF.getFrameInfo(); 4231 int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true); 4232 4233 // Create load node to retrieve arguments from the stack. 
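// Hypothetical example: under an ABI that splits an f64 between the last
// GPR and the stack (e.g. r3 plus one stack word), VA names the register
// half and NextVA names the in-memory half; the two i32 halves are glued
// back together with VMOVDRR below.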
4234 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); 4235 ArgValue2 = DAG.getLoad( 4236 MVT::i32, dl, Root, FIN, 4237 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); 4238 } else { 4239 Reg = MF.addLiveIn(NextVA.getLocReg(), RC); 4240 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); 4241 } 4242 if (!Subtarget->isLittle()) 4243 std::swap (ArgValue, ArgValue2); 4244 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2); 4245 } 4246 4247 // The remaining GPRs hold either the beginning of variable-argument 4248 // data, or the beginning of an aggregate passed by value (usually 4249 // byval). Either way, we allocate stack slots adjacent to the data 4250 // provided by our caller, and store the unallocated registers there. 4251 // If this is a variadic function, the va_list pointer will begin with 4252 // these values; otherwise, this reassembles a (byval) structure that 4253 // was split between registers and memory. 4254 // Return: the frame index the registers were stored into. 4255 int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, 4256 const SDLoc &dl, SDValue &Chain, 4257 const Value *OrigArg, 4258 unsigned InRegsParamRecordIdx, 4259 int ArgOffset, unsigned ArgSize) const { 4260 // Currently, two use-cases are possible: 4261 // Case #1. Non-var-args function, and we meet the first byval parameter. 4262 // Set up the first unallocated register as the first byval register; 4263 // eat all remaining registers 4264 // (these two actions are performed by the HandleByVal method). 4265 // Then, here, we initialize the stack frame with 4266 // "store-reg" instructions. 4267 // Case #2. Var-args function that doesn't contain byval parameters. 4268 // The same: eat all remaining unallocated registers, 4269 // initialize the stack frame. 4270 4271 MachineFunction &MF = DAG.getMachineFunction(); 4272 MachineFrameInfo &MFI = MF.getFrameInfo(); 4273 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 4274 unsigned RBegin, REnd; 4275 if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) { 4276 CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd); 4277 } else { 4278 unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs); 4279 RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx]; 4280 REnd = ARM::R4; 4281 } 4282 4283 if (REnd != RBegin) 4284 ArgOffset = -4 * (ARM::R4 - RBegin); 4285 4286 auto PtrVT = getPointerTy(DAG.getDataLayout()); 4287 int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false); 4288 SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT); 4289 4290 SmallVector<SDValue, 4> MemOps; 4291 const TargetRegisterClass *RC = 4292 AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass; 4293 4294 for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) { 4295 unsigned VReg = MF.addLiveIn(Reg, RC); 4296 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); 4297 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, 4298 MachinePointerInfo(OrigArg, 4 * i)); 4299 MemOps.push_back(Store); 4300 FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT)); 4301 } 4302 4303 if (!MemOps.empty()) 4304 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); 4305 return FrameIndex; 4306 } 4307 4308 // Set up the stack frame that the va_list pointer will start from.
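// For instance, in a variadic function whose fixed arguments only use r0-r1,
// this presumably stores r2 and r3 into an 8-byte save area so that va_arg
// can walk every argument with plain stack loads.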
4309 void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, 4310 const SDLoc &dl, SDValue &Chain, 4311 unsigned ArgOffset, 4312 unsigned TotalArgRegsSaveSize, 4313 bool ForceMutable) const { 4314 MachineFunction &MF = DAG.getMachineFunction(); 4315 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 4316 4317 // Try to store any remaining integer argument regs 4318 // to their spots on the stack so that they may be loaded by dereferencing 4319 // the result of va_next. 4320 // If there is no regs to be stored, just point address after last 4321 // argument passed via stack. 4322 int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr, 4323 CCInfo.getInRegsParamsCount(), 4324 CCInfo.getNextStackOffset(), 4325 std::max(4U, TotalArgRegsSaveSize)); 4326 AFI->setVarArgsFrameIndex(FrameIndex); 4327 } 4328 4329 bool ARMTargetLowering::splitValueIntoRegisterParts( 4330 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, 4331 unsigned NumParts, MVT PartVT, Optional<CallingConv::ID> CC) const { 4332 bool IsABIRegCopy = CC.hasValue(); 4333 EVT ValueVT = Val.getValueType(); 4334 if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) && 4335 PartVT == MVT::f32) { 4336 unsigned ValueBits = ValueVT.getSizeInBits(); 4337 unsigned PartBits = PartVT.getSizeInBits(); 4338 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val); 4339 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val); 4340 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val); 4341 Parts[0] = Val; 4342 return true; 4343 } 4344 return false; 4345 } 4346 4347 SDValue ARMTargetLowering::joinRegisterPartsIntoValue( 4348 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, 4349 MVT PartVT, EVT ValueVT, Optional<CallingConv::ID> CC) const { 4350 bool IsABIRegCopy = CC.hasValue(); 4351 if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) && 4352 PartVT == MVT::f32) { 4353 unsigned ValueBits = ValueVT.getSizeInBits(); 4354 unsigned PartBits = PartVT.getSizeInBits(); 4355 SDValue Val = Parts[0]; 4356 4357 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val); 4358 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val); 4359 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); 4360 return Val; 4361 } 4362 return SDValue(); 4363 } 4364 4365 SDValue ARMTargetLowering::LowerFormalArguments( 4366 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 4367 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 4368 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 4369 MachineFunction &MF = DAG.getMachineFunction(); 4370 MachineFrameInfo &MFI = MF.getFrameInfo(); 4371 4372 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 4373 4374 // Assign locations to all of the incoming arguments. 4375 SmallVector<CCValAssign, 16> ArgLocs; 4376 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 4377 *DAG.getContext()); 4378 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg)); 4379 4380 SmallVector<SDValue, 16> ArgValues; 4381 SDValue ArgValue; 4382 Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin(); 4383 unsigned CurArgIdx = 0; 4384 4385 // Initially ArgRegsSaveSize is zero. 4386 // Then we increase this value each time we meet byval parameter. 4387 // We also increase this value in case of varargs function. 
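// (ArgRegsSaveSize tracks how many bytes of r0-r3 are spilled below the CFA
// for byval/varargs handling; it is recomputed from scratch for every
// function.)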
4388 AFI->setArgRegsSaveSize(0); 4389 4390 // Calculate the amount of stack space that we need to allocate to store 4391 // byval and variadic arguments that are passed in registers. 4392 // We need to know this before we allocate the first byval or variadic 4393 // argument, as they will be allocated a stack slot below the CFA (Canonical 4394 // Frame Address, the stack pointer at entry to the function). 4395 unsigned ArgRegBegin = ARM::R4; 4396 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 4397 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount()) 4398 break; 4399 4400 CCValAssign &VA = ArgLocs[i]; 4401 unsigned Index = VA.getValNo(); 4402 ISD::ArgFlagsTy Flags = Ins[Index].Flags; 4403 if (!Flags.isByVal()) 4404 continue; 4405 4406 assert(VA.isMemLoc() && "unexpected byval pointer in reg"); 4407 unsigned RBegin, REnd; 4408 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd); 4409 ArgRegBegin = std::min(ArgRegBegin, RBegin); 4410 4411 CCInfo.nextInRegsParam(); 4412 } 4413 CCInfo.rewindByValRegsInfo(); 4414 4415 int lastInsIndex = -1; 4416 if (isVarArg && MFI.hasVAStart()) { 4417 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs); 4418 if (RegIdx != array_lengthof(GPRArgRegs)) 4419 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]); 4420 } 4421 4422 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin); 4423 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize); 4424 auto PtrVT = getPointerTy(DAG.getDataLayout()); 4425 4426 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 4427 CCValAssign &VA = ArgLocs[i]; 4428 if (Ins[VA.getValNo()].isOrigArg()) { 4429 std::advance(CurOrigArg, 4430 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx); 4431 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex(); 4432 } 4433 // Arguments stored in registers. 4434 if (VA.isRegLoc()) { 4435 EVT RegVT = VA.getLocVT(); 4436 4437 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) { 4438 // f64 and vector types are split up into multiple registers or 4439 // combinations of registers and stack slots. 4440 SDValue ArgValue1 = 4441 GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); 4442 VA = ArgLocs[++i]; // skip ahead to next loc 4443 SDValue ArgValue2; 4444 if (VA.isMemLoc()) { 4445 int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true); 4446 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 4447 ArgValue2 = DAG.getLoad( 4448 MVT::f64, dl, Chain, FIN, 4449 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); 4450 } else { 4451 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); 4452 } 4453 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); 4454 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue, 4455 ArgValue1, DAG.getIntPtrConstant(0, dl)); 4456 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue, 4457 ArgValue2, DAG.getIntPtrConstant(1, dl)); 4458 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) { 4459 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); 4460 } else { 4461 const TargetRegisterClass *RC; 4462 4463 if (RegVT == MVT::f16 || RegVT == MVT::bf16) 4464 RC = &ARM::HPRRegClass; 4465 else if (RegVT == MVT::f32) 4466 RC = &ARM::SPRRegClass; 4467 else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 || 4468 RegVT == MVT::v4bf16) 4469 RC = &ARM::DPRRegClass; 4470 else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 || 4471 RegVT == MVT::v8bf16) 4472 RC = &ARM::QPRRegClass; 4473 else if (RegVT == MVT::i32) 4474 RC = AFI->isThumb1OnlyFunction() ? 
&ARM::tGPRRegClass 4475 : &ARM::GPRRegClass; 4476 else 4477 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); 4478 4479 // Transform the arguments in physical registers into virtual ones. 4480 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 4481 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 4482 4483 // If this value is passed in r0 and has the returned attribute (e.g. 4484 // C++ 'structors), record this fact for later use. 4485 if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) { 4486 AFI->setPreservesR0(); 4487 } 4488 } 4489 4490 // If this is an 8 or 16-bit value, it is really passed promoted 4491 // to 32 bits. Insert an assert[sz]ext to capture this, then 4492 // truncate to the right size. 4493 switch (VA.getLocInfo()) { 4494 default: llvm_unreachable("Unknown loc info!"); 4495 case CCValAssign::Full: break; 4496 case CCValAssign::BCvt: 4497 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 4498 break; 4499 case CCValAssign::SExt: 4500 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 4501 DAG.getValueType(VA.getValVT())); 4502 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 4503 break; 4504 case CCValAssign::ZExt: 4505 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 4506 DAG.getValueType(VA.getValVT())); 4507 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 4508 break; 4509 } 4510 4511 // f16 arguments have their size extended to 4 bytes and passed as if they 4512 // had been copied to the LSBs of a 32-bit register. 4513 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI) 4514 if (VA.needsCustom() && 4515 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) 4516 ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue); 4517 4518 InVals.push_back(ArgValue); 4519 } else { // VA.isRegLoc() 4520 // sanity check 4521 assert(VA.isMemLoc()); 4522 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered"); 4523 4524 int index = VA.getValNo(); 4525 4526 // Some Ins[] entries become multiple ArgLoc[] entries. 4527 // Process them only once. 4528 if (index != lastInsIndex) 4529 { 4530 ISD::ArgFlagsTy Flags = Ins[index].Flags; 4531 // FIXME: For now, all byval parameter objects are marked mutable. 4532 // This can be changed with more analysis. 4533 // In case of tail call optimization mark all arguments mutable. 4534 // Since they could be overwritten by lowering of arguments in case of 4535 // a tail call. 4536 if (Flags.isByVal()) { 4537 assert(Ins[index].isOrigArg() && 4538 "Byval arguments cannot be implicit"); 4539 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed(); 4540 4541 int FrameIndex = StoreByValRegs( 4542 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex, 4543 VA.getLocMemOffset(), Flags.getByValSize()); 4544 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT)); 4545 CCInfo.nextInRegsParam(); 4546 } else { 4547 unsigned FIOffset = VA.getLocMemOffset(); 4548 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8, 4549 FIOffset, true); 4550 4551 // Create load nodes to retrieve arguments from the stack. 
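// For example, the fifth i32 argument of an AAPCS call does not fit in
// r0-r3, so it is reloaded here from the fixed stack slot at its
// caller-assigned offset.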
4552 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 4553 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, 4554 MachinePointerInfo::getFixedStack( 4555 DAG.getMachineFunction(), FI))); 4556 } 4557 lastInsIndex = index; 4558 } 4559 } 4560 } 4561 4562 // varargs 4563 if (isVarArg && MFI.hasVAStart()) { 4564 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getNextStackOffset(), 4565 TotalArgRegsSaveSize); 4566 if (AFI->isCmseNSEntryFunction()) { 4567 DiagnosticInfoUnsupported Diag( 4568 DAG.getMachineFunction().getFunction(), 4569 "secure entry function must not be variadic", dl.getDebugLoc()); 4570 DAG.getContext()->diagnose(Diag); 4571 } 4572 } 4573 4574 unsigned StackArgSize = CCInfo.getNextStackOffset(); 4575 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; 4576 if (canGuaranteeTCO(CallConv, TailCallOpt)) { 4577 // The only way to guarantee a tail call is if the callee restores its 4578 // argument area, but it must also keep the stack aligned when doing so. 4579 const DataLayout &DL = DAG.getDataLayout(); 4580 StackArgSize = alignTo(StackArgSize, DL.getStackAlignment()); 4581 4582 AFI->setArgumentStackToRestore(StackArgSize); 4583 } 4584 AFI->setArgumentStackSize(StackArgSize); 4585 4586 if (CCInfo.getNextStackOffset() > 0 && AFI->isCmseNSEntryFunction()) { 4587 DiagnosticInfoUnsupported Diag( 4588 DAG.getMachineFunction().getFunction(), 4589 "secure entry function requires arguments on stack", dl.getDebugLoc()); 4590 DAG.getContext()->diagnose(Diag); 4591 } 4592 4593 return Chain; 4594 } 4595 4596 /// isFloatingPointZero - Return true if this is +0.0. 4597 static bool isFloatingPointZero(SDValue Op) { 4598 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) 4599 return CFP->getValueAPF().isPosZero(); 4600 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) { 4601 // Maybe this has already been legalized into the constant pool? 4602 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) { 4603 SDValue WrapperOp = Op.getOperand(1).getOperand(0); 4604 if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp)) 4605 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal())) 4606 return CFP->getValueAPF().isPosZero(); 4607 } 4608 } else if (Op->getOpcode() == ISD::BITCAST && 4609 Op->getValueType(0) == MVT::f64) { 4610 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64) 4611 // created by LowerConstantFP(). 4612 SDValue BitcastOp = Op->getOperand(0); 4613 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM && 4614 isNullConstant(BitcastOp->getOperand(0))) 4615 return true; 4616 } 4617 return false; 4618 } 4619 4620 /// Returns appropriate ARM CMP (cmp) and corresponding condition code for 4621 /// the given operands. 4622 SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, 4623 SDValue &ARMcc, SelectionDAG &DAG, 4624 const SDLoc &dl) const { 4625 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { 4626 unsigned C = RHSC->getZExtValue(); 4627 if (!isLegalICmpImmediate((int32_t)C)) { 4628 // Constant does not fit, try adjusting it by one. 4629 switch (CC) { 4630 default: break; 4631 case ISD::SETLT: 4632 case ISD::SETGE: 4633 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) { 4634 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; 4635 RHS = DAG.getConstant(C - 1, dl, MVT::i32); 4636 } 4637 break; 4638 case ISD::SETULT: 4639 case ISD::SETUGE: 4640 if (C != 0 && isLegalICmpImmediate(C-1)) { 4641 CC = (CC == ISD::SETULT) ? 
ISD::SETULE : ISD::SETUGT; 4642 RHS = DAG.getConstant(C - 1, dl, MVT::i32); 4643 } 4644 break; 4645 case ISD::SETLE: 4646 case ISD::SETGT: 4647 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) { 4648 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; 4649 RHS = DAG.getConstant(C + 1, dl, MVT::i32); 4650 } 4651 break; 4652 case ISD::SETULE: 4653 case ISD::SETUGT: 4654 if (C != 0xffffffff && isLegalICmpImmediate(C+1)) { 4655 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; 4656 RHS = DAG.getConstant(C + 1, dl, MVT::i32); 4657 } 4658 break; 4659 } 4660 } 4661 } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) && 4662 (ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) { 4663 // In ARM and Thumb-2, the compare instructions can shift their second 4664 // operand. 4665 CC = ISD::getSetCCSwappedOperands(CC); 4666 std::swap(LHS, RHS); 4667 } 4668 4669 // Thumb1 has very limited immediate modes, so turning an "and" into a 4670 // shift can save multiple instructions. 4671 // 4672 // If we have (x & C1), and C1 is an appropriate mask, we can transform it 4673 // into "((x << n) >> n)". But that isn't necessarily profitable on its 4674 // own. If it's the operand to an unsigned comparison with an immediate, 4675 // we can eliminate one of the shifts: we transform 4676 // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)". 4677 // 4678 // We avoid transforming cases which aren't profitable due to encoding 4679 // details: 4680 // 4681 // 1. C2 fits into the immediate field of a cmp, and the transformed version 4682 // would not; in that case, we're essentially trading one immediate load for 4683 // another. 4684 // 2. C1 is 255 or 65535, so we can use uxtb or uxth. 4685 // 3. C2 is zero; we have other code for this special case. 4686 // 4687 // FIXME: Figure out profitability for Thumb2; we usually can't save an 4688 // instruction, since the AND is always one instruction anyway, but we could 4689 // use narrow instructions in some cases. 4690 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND && 4691 LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) && 4692 LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) && 4693 !isSignedIntSetCC(CC)) { 4694 unsigned Mask = cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue(); 4695 auto *RHSC = cast<ConstantSDNode>(RHS.getNode()); 4696 uint64_t RHSV = RHSC->getZExtValue(); 4697 if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) { 4698 unsigned ShiftBits = countLeadingZeros(Mask); 4699 if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) { 4700 SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32); 4701 LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt); 4702 RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32); 4703 } 4704 } 4705 } 4706 4707 // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a 4708 // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same 4709 // way a cmp would. 4710 // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and 4711 // some tweaks to the heuristics for the previous and->shift transform. 4712 // FIXME: Optimize cases where the LHS isn't a shift. 
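// For example, with c == 2 this rewrites "(x << 2) > 0x80000000u" into
// "lsls Rd, x, #3": the carry flag receives bit 31 of (x << 2) and Z is set
// only if the remaining shifted bits are zero, so the HI condition (C set,
// Z clear) reproduces the original unsigned comparison.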
4713 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL && 4714 isa<ConstantSDNode>(RHS) && 4715 cast<ConstantSDNode>(RHS)->getZExtValue() == 0x80000000U && 4716 CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) && 4717 cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() < 31) { 4718 unsigned ShiftAmt = 4719 cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() + 1; 4720 SDValue Shift = DAG.getNode(ARMISD::LSLS, dl, 4721 DAG.getVTList(MVT::i32, MVT::i32), 4722 LHS.getOperand(0), 4723 DAG.getConstant(ShiftAmt, dl, MVT::i32)); 4724 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR, 4725 Shift.getValue(1), SDValue()); 4726 ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32); 4727 return Chain.getValue(1); 4728 } 4729 4730 ARMCC::CondCodes CondCode = IntCCToARMCC(CC); 4731 4732 // If the RHS is a constant zero then the V (overflow) flag will never be 4733 // set. This can allow us to simplify GE to PL or LT to MI, which can be 4734 // simpler for other passes (like the peephole optimiser) to deal with. 4735 if (isNullConstant(RHS)) { 4736 switch (CondCode) { 4737 default: break; 4738 case ARMCC::GE: 4739 CondCode = ARMCC::PL; 4740 break; 4741 case ARMCC::LT: 4742 CondCode = ARMCC::MI; 4743 break; 4744 } 4745 } 4746 4747 ARMISD::NodeType CompareType; 4748 switch (CondCode) { 4749 default: 4750 CompareType = ARMISD::CMP; 4751 break; 4752 case ARMCC::EQ: 4753 case ARMCC::NE: 4754 // Uses only Z Flag 4755 CompareType = ARMISD::CMPZ; 4756 break; 4757 } 4758 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 4759 return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS); 4760 } 4761 4762 /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands. 4763 SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, 4764 SelectionDAG &DAG, const SDLoc &dl, 4765 bool Signaling) const { 4766 assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64); 4767 SDValue Cmp; 4768 if (!isFloatingPointZero(RHS)) 4769 Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP, 4770 dl, MVT::Glue, LHS, RHS); 4771 else 4772 Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0, 4773 dl, MVT::Glue, LHS); 4774 return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp); 4775 } 4776 4777 /// duplicateCmp - Glue values can have only one use, so this function 4778 /// duplicates a comparison node. 4779 SDValue 4780 ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const { 4781 unsigned Opc = Cmp.getOpcode(); 4782 SDLoc DL(Cmp); 4783 if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ) 4784 return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); 4785 4786 assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation"); 4787 Cmp = Cmp.getOperand(0); 4788 Opc = Cmp.getOpcode(); 4789 if (Opc == ARMISD::CMPFP) 4790 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); 4791 else { 4792 assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT"); 4793 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0)); 4794 } 4795 return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp); 4796 } 4797 4798 // This function returns three things: the arithmetic computation itself 4799 // (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The 4800 // comparison and the condition code define the case in which the arithmetic 4801 // computation *does not* overflow. 
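// For example, lowering ISD::SADDO below produces Value = (ISD::ADD LHS, RHS),
// OverflowCmp = (ARMISD::CMP Value, LHS) and ARMcc = VC, the condition that
// holds when the signed addition did not overflow.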
4802 std::pair<SDValue, SDValue> 4803 ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG, 4804 SDValue &ARMcc) const { 4805 assert(Op.getValueType() == MVT::i32 && "Unsupported value type"); 4806 4807 SDValue Value, OverflowCmp; 4808 SDValue LHS = Op.getOperand(0); 4809 SDValue RHS = Op.getOperand(1); 4810 SDLoc dl(Op); 4811 4812 // FIXME: We are currently always generating CMPs because we don't support 4813 // generating CMN through the backend. This is not as good as the natural 4814 // CMP case because it causes a register dependency and cannot be folded 4815 // later. 4816 4817 switch (Op.getOpcode()) { 4818 default: 4819 llvm_unreachable("Unknown overflow instruction!"); 4820 case ISD::SADDO: 4821 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); 4822 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS); 4823 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); 4824 break; 4825 case ISD::UADDO: 4826 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); 4827 // We use ADDC here to correspond to its use in LowerUnsignedALUO. 4828 // We do not use it in the USUBO case as Value may not be used. 4829 Value = DAG.getNode(ARMISD::ADDC, dl, 4830 DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS) 4831 .getValue(0); 4832 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); 4833 break; 4834 case ISD::SSUBO: 4835 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); 4836 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); 4837 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); 4838 break; 4839 case ISD::USUBO: 4840 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); 4841 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); 4842 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); 4843 break; 4844 case ISD::UMULO: 4845 // We generate a UMUL_LOHI and then check if the high word is 0. 4846 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32); 4847 Value = DAG.getNode(ISD::UMUL_LOHI, dl, 4848 DAG.getVTList(Op.getValueType(), Op.getValueType()), 4849 LHS, RHS); 4850 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1), 4851 DAG.getConstant(0, dl, MVT::i32)); 4852 Value = Value.getValue(0); // We only want the low 32 bits for the result. 4853 break; 4854 case ISD::SMULO: 4855 // We generate a SMUL_LOHI and then check if all the bits of the high word 4856 // are the same as the sign bit of the low word. 4857 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32); 4858 Value = DAG.getNode(ISD::SMUL_LOHI, dl, 4859 DAG.getVTList(Op.getValueType(), Op.getValueType()), 4860 LHS, RHS); 4861 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1), 4862 DAG.getNode(ISD::SRA, dl, Op.getValueType(), 4863 Value.getValue(0), 4864 DAG.getConstant(31, dl, MVT::i32))); 4865 Value = Value.getValue(0); // We only want the low 32 bits for the result. 4866 break; 4867 } // switch (...) 4868 4869 return std::make_pair(Value, OverflowCmp); 4870 } 4871 4872 SDValue 4873 ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const { 4874 // Let legalize expand this if it isn't a legal type yet. 4875 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 4876 return SDValue(); 4877 4878 SDValue Value, OverflowCmp; 4879 SDValue ARMcc; 4880 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc); 4881 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4882 SDLoc dl(Op); 4883 // We use 0 and 1 as false and true values. 
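// Note the operand order of the CMOV below: ARMcc names the no-overflow
// condition, so the node ends up producing FVal (0) when it holds and
// TVal (1) when the operation overflowed.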
4884 SDValue TVal = DAG.getConstant(1, dl, MVT::i32); 4885 SDValue FVal = DAG.getConstant(0, dl, MVT::i32); 4886 EVT VT = Op.getValueType(); 4887 4888 SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal, 4889 ARMcc, CCR, OverflowCmp); 4890 4891 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 4892 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); 4893 } 4894 4895 static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry, 4896 SelectionDAG &DAG) { 4897 SDLoc DL(BoolCarry); 4898 EVT CarryVT = BoolCarry.getValueType(); 4899 4900 // This converts the boolean value carry into the carry flag by doing 4901 // ARMISD::SUBC Carry, 1 4902 SDValue Carry = DAG.getNode(ARMISD::SUBC, DL, 4903 DAG.getVTList(CarryVT, MVT::i32), 4904 BoolCarry, DAG.getConstant(1, DL, CarryVT)); 4905 return Carry.getValue(1); 4906 } 4907 4908 static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT, 4909 SelectionDAG &DAG) { 4910 SDLoc DL(Flags); 4911 4912 // Now convert the carry flag into a boolean carry. We do this 4913 // using ARMISD:ADDE 0, 0, Carry 4914 return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32), 4915 DAG.getConstant(0, DL, MVT::i32), 4916 DAG.getConstant(0, DL, MVT::i32), Flags); 4917 } 4918 4919 SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op, 4920 SelectionDAG &DAG) const { 4921 // Let legalize expand this if it isn't a legal type yet. 4922 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 4923 return SDValue(); 4924 4925 SDValue LHS = Op.getOperand(0); 4926 SDValue RHS = Op.getOperand(1); 4927 SDLoc dl(Op); 4928 4929 EVT VT = Op.getValueType(); 4930 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 4931 SDValue Value; 4932 SDValue Overflow; 4933 switch (Op.getOpcode()) { 4934 default: 4935 llvm_unreachable("Unknown overflow instruction!"); 4936 case ISD::UADDO: 4937 Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS); 4938 // Convert the carry flag into a boolean value. 4939 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG); 4940 break; 4941 case ISD::USUBO: { 4942 Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS); 4943 // Convert the carry flag into a boolean value. 4944 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG); 4945 // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow 4946 // value. So compute 1 - C. 
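// e.g. 3 - 5 borrows, so SUBC's carry-out is 0 and 1 - C yields an overflow
// flag of 1; 5 - 3 leaves the carry set and the overflow flag at 0.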
4947 Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32, 4948 DAG.getConstant(1, dl, MVT::i32), Overflow); 4949 break; 4950 } 4951 } 4952 4953 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); 4954 } 4955 4956 static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG, 4957 const ARMSubtarget *Subtarget) { 4958 EVT VT = Op.getValueType(); 4959 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP()) 4960 return SDValue(); 4961 if (!VT.isSimple()) 4962 return SDValue(); 4963 4964 unsigned NewOpcode; 4965 switch (VT.getSimpleVT().SimpleTy) { 4966 default: 4967 return SDValue(); 4968 case MVT::i8: 4969 switch (Op->getOpcode()) { 4970 case ISD::UADDSAT: 4971 NewOpcode = ARMISD::UQADD8b; 4972 break; 4973 case ISD::SADDSAT: 4974 NewOpcode = ARMISD::QADD8b; 4975 break; 4976 case ISD::USUBSAT: 4977 NewOpcode = ARMISD::UQSUB8b; 4978 break; 4979 case ISD::SSUBSAT: 4980 NewOpcode = ARMISD::QSUB8b; 4981 break; 4982 } 4983 break; 4984 case MVT::i16: 4985 switch (Op->getOpcode()) { 4986 case ISD::UADDSAT: 4987 NewOpcode = ARMISD::UQADD16b; 4988 break; 4989 case ISD::SADDSAT: 4990 NewOpcode = ARMISD::QADD16b; 4991 break; 4992 case ISD::USUBSAT: 4993 NewOpcode = ARMISD::UQSUB16b; 4994 break; 4995 case ISD::SSUBSAT: 4996 NewOpcode = ARMISD::QSUB16b; 4997 break; 4998 } 4999 break; 5000 } 5001 5002 SDLoc dl(Op); 5003 SDValue Add = 5004 DAG.getNode(NewOpcode, dl, MVT::i32, 5005 DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32), 5006 DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32)); 5007 return DAG.getNode(ISD::TRUNCATE, dl, VT, Add); 5008 } 5009 5010 SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 5011 SDValue Cond = Op.getOperand(0); 5012 SDValue SelectTrue = Op.getOperand(1); 5013 SDValue SelectFalse = Op.getOperand(2); 5014 SDLoc dl(Op); 5015 unsigned Opc = Cond.getOpcode(); 5016 5017 if (Cond.getResNo() == 1 && 5018 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 5019 Opc == ISD::USUBO)) { 5020 if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0))) 5021 return SDValue(); 5022 5023 SDValue Value, OverflowCmp; 5024 SDValue ARMcc; 5025 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); 5026 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5027 EVT VT = Op.getValueType(); 5028 5029 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR, 5030 OverflowCmp, DAG); 5031 } 5032 5033 // Convert: 5034 // 5035 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond) 5036 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond) 5037 // 5038 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) { 5039 const ConstantSDNode *CMOVTrue = 5040 dyn_cast<ConstantSDNode>(Cond.getOperand(0)); 5041 const ConstantSDNode *CMOVFalse = 5042 dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 5043 5044 if (CMOVTrue && CMOVFalse) { 5045 unsigned CMOVTrueVal = CMOVTrue->getZExtValue(); 5046 unsigned CMOVFalseVal = CMOVFalse->getZExtValue(); 5047 5048 SDValue True; 5049 SDValue False; 5050 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) { 5051 True = SelectTrue; 5052 False = SelectFalse; 5053 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) { 5054 True = SelectFalse; 5055 False = SelectTrue; 5056 } 5057 5058 if (True.getNode() && False.getNode()) { 5059 EVT VT = Op.getValueType(); 5060 SDValue ARMcc = Cond.getOperand(2); 5061 SDValue CCR = Cond.getOperand(3); 5062 SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG); 5063 assert(True.getValueType() == VT); 5064 return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG); 5065 } 5066 } 5067 } 5068 5069 // 
ARM's BooleanContents value is UndefinedBooleanContent. Mask out the 5070 // undefined bits before doing a full-word comparison with zero. 5071 Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond, 5072 DAG.getConstant(1, dl, Cond.getValueType())); 5073 5074 return DAG.getSelectCC(dl, Cond, 5075 DAG.getConstant(0, dl, Cond.getValueType()), 5076 SelectTrue, SelectFalse, ISD::SETNE); 5077 } 5078 5079 static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, 5080 bool &swpCmpOps, bool &swpVselOps) { 5081 // Start by selecting the GE condition code for opcodes that return true for 5082 // 'equality' 5083 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE || 5084 CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE) 5085 CondCode = ARMCC::GE; 5086 5087 // and GT for opcodes that return false for 'equality'. 5088 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT || 5089 CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT) 5090 CondCode = ARMCC::GT; 5091 5092 // Since we are constrained to GE/GT, if the opcode contains 'less', we need 5093 // to swap the compare operands. 5094 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT || 5095 CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT) 5096 swpCmpOps = true; 5097 5098 // Both GT and GE are ordered comparisons, and return false for 'unordered'. 5099 // If we have an unordered opcode, we need to swap the operands to the VSEL 5100 // instruction (effectively negating the condition). 5101 // 5102 // This also has the effect of swapping which one of 'less' or 'greater' 5103 // returns true, so we also swap the compare operands. It also switches 5104 // whether we return true for 'equality', so we compensate by picking the 5105 // opposite condition code to our original choice. 5106 if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE || 5107 CC == ISD::SETUGT) { 5108 swpCmpOps = !swpCmpOps; 5109 swpVselOps = !swpVselOps; 5110 CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT; 5111 } 5112 5113 // 'ordered' is 'anything but unordered', so use the VS condition code and 5114 // swap the VSEL operands. 5115 if (CC == ISD::SETO) { 5116 CondCode = ARMCC::VS; 5117 swpVselOps = true; 5118 } 5119 5120 // 'unordered or not equal' is 'anything but equal', so use the EQ condition 5121 // code and swap the VSEL operands. Also do this if we don't care about the 5122 // unordered case. 
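// Worked example: for SETULT the code above first picks GT, the 'less' rule
// requests a compare-operand swap, and the unordered rule then toggles both
// swaps and relaxes GT to GE, leaving an unswapped compare, swapped VSEL
// operands and the GE condition.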
5123 if (CC == ISD::SETUNE || CC == ISD::SETNE) { 5124 CondCode = ARMCC::EQ; 5125 swpVselOps = true; 5126 } 5127 } 5128 5129 SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, 5130 SDValue TrueVal, SDValue ARMcc, SDValue CCR, 5131 SDValue Cmp, SelectionDAG &DAG) const { 5132 if (!Subtarget->hasFP64() && VT == MVT::f64) { 5133 FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl, 5134 DAG.getVTList(MVT::i32, MVT::i32), FalseVal); 5135 TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl, 5136 DAG.getVTList(MVT::i32, MVT::i32), TrueVal); 5137 5138 SDValue TrueLow = TrueVal.getValue(0); 5139 SDValue TrueHigh = TrueVal.getValue(1); 5140 SDValue FalseLow = FalseVal.getValue(0); 5141 SDValue FalseHigh = FalseVal.getValue(1); 5142 5143 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow, 5144 ARMcc, CCR, Cmp); 5145 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh, 5146 ARMcc, CCR, duplicateCmp(Cmp, DAG)); 5147 5148 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High); 5149 } else { 5150 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR, 5151 Cmp); 5152 } 5153 } 5154 5155 static bool isGTorGE(ISD::CondCode CC) { 5156 return CC == ISD::SETGT || CC == ISD::SETGE; 5157 } 5158 5159 static bool isLTorLE(ISD::CondCode CC) { 5160 return CC == ISD::SETLT || CC == ISD::SETLE; 5161 } 5162 5163 // See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating. 5164 // All of these conditions (and their <= and >= counterparts) will do: 5165 // x < k ? k : x 5166 // x > k ? x : k 5167 // k < x ? x : k 5168 // k > x ? k : x 5169 static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, 5170 const SDValue TrueVal, const SDValue FalseVal, 5171 const ISD::CondCode CC, const SDValue K) { 5172 return (isGTorGE(CC) && 5173 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) || 5174 (isLTorLE(CC) && 5175 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))); 5176 } 5177 5178 // Check if two chained conditionals could be converted into SSAT or USAT. 5179 // 5180 // SSAT can replace a set of two conditional selectors that bound a number to an 5181 // interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples: 5182 // 5183 // x < -k ? -k : (x > k ? k : x) 5184 // x < -k ? -k : (x < k ? x : k) 5185 // x > -k ? (x > k ? k : x) : -k 5186 // x < k ? (x < -k ? -k : x) : k 5187 // etc. 5188 // 5189 // LLVM canonicalizes these to either a min(max()) or a max(min()) 5190 // pattern. This function tries to match one of these and will return a SSAT 5191 // node if successful. 5192 // 5193 // USAT works similarily to SSAT but bounds on the interval [0, k] where k + 1 5194 // is a power of 2. 5195 static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG) { 5196 EVT VT = Op.getValueType(); 5197 SDValue V1 = Op.getOperand(0); 5198 SDValue K1 = Op.getOperand(1); 5199 SDValue TrueVal1 = Op.getOperand(2); 5200 SDValue FalseVal1 = Op.getOperand(3); 5201 ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 5202 5203 const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? 
FalseVal1 : TrueVal1; 5204 if (Op2.getOpcode() != ISD::SELECT_CC) 5205 return SDValue(); 5206 5207 SDValue V2 = Op2.getOperand(0); 5208 SDValue K2 = Op2.getOperand(1); 5209 SDValue TrueVal2 = Op2.getOperand(2); 5210 SDValue FalseVal2 = Op2.getOperand(3); 5211 ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get(); 5212 5213 SDValue V1Tmp = V1; 5214 SDValue V2Tmp = V2; 5215 5216 // Check that the registers and the constants match a max(min()) or min(max()) 5217 // pattern 5218 if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 || 5219 K2 != FalseVal2 || 5220 !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2)))) 5221 return SDValue(); 5222 5223 // Check that the constant in the lower-bound check is 5224 // the opposite of the constant in the upper-bound check 5225 // in 1's complement. 5226 if (!isa<ConstantSDNode>(K1) || !isa<ConstantSDNode>(K2)) 5227 return SDValue(); 5228 5229 int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue(); 5230 int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue(); 5231 int64_t PosVal = std::max(Val1, Val2); 5232 int64_t NegVal = std::min(Val1, Val2); 5233 5234 if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) || 5235 !isPowerOf2_64(PosVal + 1)) 5236 return SDValue(); 5237 5238 // Handle the difference between USAT (unsigned) and SSAT (signed) 5239 // saturation 5240 // At this point, PosVal is guaranteed to be positive 5241 uint64_t K = PosVal; 5242 SDLoc dl(Op); 5243 if (Val1 == ~Val2) 5244 return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp, 5245 DAG.getConstant(countTrailingOnes(K), dl, VT)); 5246 if (NegVal == 0) 5247 return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp, 5248 DAG.getConstant(countTrailingOnes(K), dl, VT)); 5249 5250 return SDValue(); 5251 } 5252 5253 // Check if a condition of the type x < k ? k : x can be converted into a 5254 // bit operation instead of conditional moves. 5255 // Currently this is allowed given: 5256 // - The conditions and values match up 5257 // - k is 0 or -1 (all ones) 5258 // This function will not check the last condition, thats up to the caller 5259 // It returns true if the transformation can be made, and in such case 5260 // returns x in V, and k in SatK. 5261 static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V, 5262 SDValue &SatK) 5263 { 5264 SDValue LHS = Op.getOperand(0); 5265 SDValue RHS = Op.getOperand(1); 5266 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 5267 SDValue TrueVal = Op.getOperand(2); 5268 SDValue FalseVal = Op.getOperand(3); 5269 5270 SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS) 5271 ? &RHS 5272 : nullptr; 5273 5274 // No constant operation in comparison, early out 5275 if (!K) 5276 return false; 5277 5278 SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal; 5279 V = (KTmp == TrueVal) ? FalseVal : TrueVal; 5280 SDValue VTmp = (K && *K == LHS) ? 
RHS : LHS; 5281 5282 // If the constant on left and right side, or variable on left and right, 5283 // does not match, early out 5284 if (*K != KTmp || V != VTmp) 5285 return false; 5286 5287 if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) { 5288 SatK = *K; 5289 return true; 5290 } 5291 5292 return false; 5293 } 5294 5295 bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const { 5296 if (VT == MVT::f32) 5297 return !Subtarget->hasVFP2Base(); 5298 if (VT == MVT::f64) 5299 return !Subtarget->hasFP64(); 5300 if (VT == MVT::f16) 5301 return !Subtarget->hasFullFP16(); 5302 return false; 5303 } 5304 5305 SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { 5306 EVT VT = Op.getValueType(); 5307 SDLoc dl(Op); 5308 5309 // Try to convert two saturating conditional selects into a single SSAT 5310 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) 5311 if (SDValue SatValue = LowerSaturatingConditional(Op, DAG)) 5312 return SatValue; 5313 5314 // Try to convert expressions of the form x < k ? k : x (and similar forms) 5315 // into more efficient bit operations, which is possible when k is 0 or -1 5316 // On ARM and Thumb-2 which have flexible operand 2 this will result in 5317 // single instructions. On Thumb the shift and the bit operation will be two 5318 // instructions. 5319 // Only allow this transformation on full-width (32-bit) operations 5320 SDValue LowerSatConstant; 5321 SDValue SatValue; 5322 if (VT == MVT::i32 && 5323 isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) { 5324 SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue, 5325 DAG.getConstant(31, dl, VT)); 5326 if (isNullConstant(LowerSatConstant)) { 5327 SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV, 5328 DAG.getAllOnesConstant(dl, VT)); 5329 return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV); 5330 } else if (isAllOnesConstant(LowerSatConstant)) 5331 return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV); 5332 } 5333 5334 SDValue LHS = Op.getOperand(0); 5335 SDValue RHS = Op.getOperand(1); 5336 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 5337 SDValue TrueVal = Op.getOperand(2); 5338 SDValue FalseVal = Op.getOperand(3); 5339 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal); 5340 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal); 5341 5342 if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal && 5343 LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) { 5344 unsigned TVal = CTVal->getZExtValue(); 5345 unsigned FVal = CFVal->getZExtValue(); 5346 unsigned Opcode = 0; 5347 5348 if (TVal == ~FVal) { 5349 Opcode = ARMISD::CSINV; 5350 } else if (TVal == ~FVal + 1) { 5351 Opcode = ARMISD::CSNEG; 5352 } else if (TVal + 1 == FVal) { 5353 Opcode = ARMISD::CSINC; 5354 } else if (TVal == FVal + 1) { 5355 Opcode = ARMISD::CSINC; 5356 std::swap(TrueVal, FalseVal); 5357 std::swap(TVal, FVal); 5358 CC = ISD::getSetCCInverse(CC, LHS.getValueType()); 5359 } 5360 5361 if (Opcode) { 5362 // If one of the constants is cheaper than another, materialise the 5363 // cheaper one and let the csel generate the other. 5364 if (Opcode != ARMISD::CSINC && 5365 HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) { 5366 std::swap(TrueVal, FalseVal); 5367 std::swap(TVal, FVal); 5368 CC = ISD::getSetCCInverse(CC, LHS.getValueType()); 5369 } 5370 5371 // Attempt to use ZR checking TVal is 0, possibly inverting the condition 5372 // to get there. 
CSINC is not invertible like the other two (~(~a) == a, 5373 // -(-a) == a, but (a+1)+1 != a). 5374 if (FVal == 0 && Opcode != ARMISD::CSINC) { 5375 std::swap(TrueVal, FalseVal); 5376 std::swap(TVal, FVal); 5377 CC = ISD::getSetCCInverse(CC, LHS.getValueType()); 5378 } 5379 5380 // Drops F's value because we can get it by inverting/negating TVal. 5381 FalseVal = TrueVal; 5382 5383 SDValue ARMcc; 5384 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 5385 EVT VT = TrueVal.getValueType(); 5386 return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp); 5387 } 5388 } 5389 5390 if (isUnsupportedFloatingType(LHS.getValueType())) { 5391 DAG.getTargetLoweringInfo().softenSetCCOperands( 5392 DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS); 5393 5394 // If softenSetCCOperands only returned one value, we should compare it to 5395 // zero. 5396 if (!RHS.getNode()) { 5397 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 5398 CC = ISD::SETNE; 5399 } 5400 } 5401 5402 if (LHS.getValueType() == MVT::i32) { 5403 // Try to generate VSEL on ARMv8. 5404 // The VSEL instruction can't use all the usual ARM condition 5405 // codes: it only has two bits to select the condition code, so it's 5406 // constrained to use only GE, GT, VS and EQ. 5407 // 5408 // To implement all the various ISD::SETXXX opcodes, we sometimes need to 5409 // swap the operands of the previous compare instruction (effectively 5410 // inverting the compare condition, swapping 'less' and 'greater') and 5411 // sometimes need to swap the operands to the VSEL (which inverts the 5412 // condition in the sense of firing whenever the previous condition didn't) 5413 if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 || 5414 TrueVal.getValueType() == MVT::f32 || 5415 TrueVal.getValueType() == MVT::f64)) { 5416 ARMCC::CondCodes CondCode = IntCCToARMCC(CC); 5417 if (CondCode == ARMCC::LT || CondCode == ARMCC::LE || 5418 CondCode == ARMCC::VC || CondCode == ARMCC::NE) { 5419 CC = ISD::getSetCCInverse(CC, LHS.getValueType()); 5420 std::swap(TrueVal, FalseVal); 5421 } 5422 } 5423 5424 SDValue ARMcc; 5425 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5426 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 5427 // Choose GE over PL, which vsel does not support 5428 if (cast<ConstantSDNode>(ARMcc)->getZExtValue() == ARMCC::PL) 5429 ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32); 5430 return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); 5431 } 5432 5433 ARMCC::CondCodes CondCode, CondCode2; 5434 FPCCToARMCC(CC, CondCode, CondCode2); 5435 5436 // Normalize the fp compare. If RHS is zero we prefer to keep it there so we 5437 // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we 5438 // must use VSEL (limited condition codes), due to not having conditional f16 5439 // moves.
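// (CMPFPw0 is the compare-against-zero form, e.g. "vcmp.f32 Sx, #0.0", which
// avoids materialising a floating-point zero.)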
5440 if (Subtarget->hasFPARMv8Base() && 5441 !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) && 5442 (TrueVal.getValueType() == MVT::f16 || 5443 TrueVal.getValueType() == MVT::f32 || 5444 TrueVal.getValueType() == MVT::f64)) { 5445 bool swpCmpOps = false; 5446 bool swpVselOps = false; 5447 checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps); 5448 5449 if (CondCode == ARMCC::GT || CondCode == ARMCC::GE || 5450 CondCode == ARMCC::VS || CondCode == ARMCC::EQ) { 5451 if (swpCmpOps) 5452 std::swap(LHS, RHS); 5453 if (swpVselOps) 5454 std::swap(TrueVal, FalseVal); 5455 } 5456 } 5457 5458 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 5459 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); 5460 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5461 SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); 5462 if (CondCode2 != ARMCC::AL) { 5463 SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32); 5464 // FIXME: Needs another CMP because flag can have but one use. 5465 SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl); 5466 Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG); 5467 } 5468 return Result; 5469 } 5470 5471 /// canChangeToInt - Given the fp compare operand, return true if it is suitable 5472 /// to morph to an integer compare sequence. 5473 static bool canChangeToInt(SDValue Op, bool &SeenZero, 5474 const ARMSubtarget *Subtarget) { 5475 SDNode *N = Op.getNode(); 5476 if (!N->hasOneUse()) 5477 // Otherwise it requires moving the value from fp to integer registers. 5478 return false; 5479 if (!N->getNumValues()) 5480 return false; 5481 EVT VT = Op.getValueType(); 5482 if (VT != MVT::f32 && !Subtarget->isFPBrccSlow()) 5483 // f32 case is generally profitable. f64 case only makes sense when vcmpe + 5484 // vmrs are very slow, e.g. cortex-a8. 5485 return false; 5486 5487 if (isFloatingPointZero(Op)) { 5488 SeenZero = true; 5489 return true; 5490 } 5491 return ISD::isNormalLoad(N); 5492 } 5493 5494 static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) { 5495 if (isFloatingPointZero(Op)) 5496 return DAG.getConstant(0, SDLoc(Op), MVT::i32); 5497 5498 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) 5499 return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(), 5500 Ld->getPointerInfo(), Ld->getAlignment(), 5501 Ld->getMemOperand()->getFlags()); 5502 5503 llvm_unreachable("Unknown VFP cmp argument!"); 5504 } 5505 5506 static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, 5507 SDValue &RetVal1, SDValue &RetVal2) { 5508 SDLoc dl(Op); 5509 5510 if (isFloatingPointZero(Op)) { 5511 RetVal1 = DAG.getConstant(0, dl, MVT::i32); 5512 RetVal2 = DAG.getConstant(0, dl, MVT::i32); 5513 return; 5514 } 5515 5516 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) { 5517 SDValue Ptr = Ld->getBasePtr(); 5518 RetVal1 = 5519 DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), 5520 Ld->getAlignment(), Ld->getMemOperand()->getFlags()); 5521 5522 EVT PtrType = Ptr.getValueType(); 5523 unsigned NewAlign = MinAlign(Ld->getAlignment(), 4); 5524 SDValue NewPtr = DAG.getNode(ISD::ADD, dl, 5525 PtrType, Ptr, DAG.getConstant(4, dl, PtrType)); 5526 RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr, 5527 Ld->getPointerInfo().getWithOffset(4), NewAlign, 5528 Ld->getMemOperand()->getFlags()); 5529 return; 5530 } 5531 5532 llvm_unreachable("Unknown VFP cmp argument!"); 5533 } 5534 5535 /// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some 5536 /// f32 and even f64 comparisons to integer ones. 
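/// For example, "br (setoeq f32 %x, 0.0)" can presumably be turned into an
/// integer compare of the masked bit pattern, since clearing the sign bit
/// makes +0.0 and -0.0 compare equal.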
5537 SDValue 5538 ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { 5539 SDValue Chain = Op.getOperand(0); 5540 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 5541 SDValue LHS = Op.getOperand(2); 5542 SDValue RHS = Op.getOperand(3); 5543 SDValue Dest = Op.getOperand(4); 5544 SDLoc dl(Op); 5545 5546 bool LHSSeenZero = false; 5547 bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget); 5548 bool RHSSeenZero = false; 5549 bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget); 5550 if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) { 5551 // If unsafe fp math optimization is enabled and there are no other uses of 5552 // the CMP operands, and the condition code is EQ or NE, we can optimize it 5553 // to an integer comparison. 5554 if (CC == ISD::SETOEQ) 5555 CC = ISD::SETEQ; 5556 else if (CC == ISD::SETUNE) 5557 CC = ISD::SETNE; 5558 5559 SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32); 5560 SDValue ARMcc; 5561 if (LHS.getValueType() == MVT::f32) { 5562 LHS = DAG.getNode(ISD::AND, dl, MVT::i32, 5563 bitcastf32Toi32(LHS, DAG), Mask); 5564 RHS = DAG.getNode(ISD::AND, dl, MVT::i32, 5565 bitcastf32Toi32(RHS, DAG), Mask); 5566 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 5567 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5568 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, 5569 Chain, Dest, ARMcc, CCR, Cmp); 5570 } 5571 5572 SDValue LHS1, LHS2; 5573 SDValue RHS1, RHS2; 5574 expandf64Toi32(LHS, DAG, LHS1, LHS2); 5575 expandf64Toi32(RHS, DAG, RHS1, RHS2); 5576 LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask); 5577 RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask); 5578 ARMCC::CondCodes CondCode = IntCCToARMCC(CC); 5579 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 5580 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); 5581 SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest }; 5582 return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops); 5583 } 5584 5585 return SDValue(); 5586 } 5587 5588 SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 5589 SDValue Chain = Op.getOperand(0); 5590 SDValue Cond = Op.getOperand(1); 5591 SDValue Dest = Op.getOperand(2); 5592 SDLoc dl(Op); 5593 5594 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch 5595 // instruction. 5596 unsigned Opc = Cond.getOpcode(); 5597 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) && 5598 !Subtarget->isThumb1Only(); 5599 if (Cond.getResNo() == 1 && 5600 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 5601 Opc == ISD::USUBO || OptimizeMul)) { 5602 // Only lower legal XALUO ops. 5603 if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0))) 5604 return SDValue(); 5605 5606 // The actual operation with overflow check. 5607 SDValue Value, OverflowCmp; 5608 SDValue ARMcc; 5609 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); 5610 5611 // Reverse the condition code. 
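// (getARMXALUOOp reports the condition under which no overflow occurs, but
// this branch must be taken when overflow does occur, hence the inversion.)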
5612 ARMCC::CondCodes CondCode = 5613 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue(); 5614 CondCode = ARMCC::getOppositeCondition(CondCode); 5615 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32); 5616 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5617 5618 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, 5619 OverflowCmp); 5620 } 5621 5622 return SDValue(); 5623 } 5624 5625 SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { 5626 SDValue Chain = Op.getOperand(0); 5627 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 5628 SDValue LHS = Op.getOperand(2); 5629 SDValue RHS = Op.getOperand(3); 5630 SDValue Dest = Op.getOperand(4); 5631 SDLoc dl(Op); 5632 5633 if (isUnsupportedFloatingType(LHS.getValueType())) { 5634 DAG.getTargetLoweringInfo().softenSetCCOperands( 5635 DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS); 5636 5637 // If softenSetCCOperands only returned one value, we should compare it to 5638 // zero. 5639 if (!RHS.getNode()) { 5640 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 5641 CC = ISD::SETNE; 5642 } 5643 } 5644 5645 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch 5646 // instruction. 5647 unsigned Opc = LHS.getOpcode(); 5648 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) && 5649 !Subtarget->isThumb1Only(); 5650 if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) && 5651 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 5652 Opc == ISD::USUBO || OptimizeMul) && 5653 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 5654 // Only lower legal XALUO ops. 5655 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) 5656 return SDValue(); 5657 5658 // The actual operation with overflow check. 5659 SDValue Value, OverflowCmp; 5660 SDValue ARMcc; 5661 std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc); 5662 5663 if ((CC == ISD::SETNE) != isOneConstant(RHS)) { 5664 // Reverse the condition code. 
5665 ARMCC::CondCodes CondCode = 5666 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue(); 5667 CondCode = ARMCC::getOppositeCondition(CondCode); 5668 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32); 5669 } 5670 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5671 5672 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, 5673 OverflowCmp); 5674 } 5675 5676 if (LHS.getValueType() == MVT::i32) { 5677 SDValue ARMcc; 5678 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 5679 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5680 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, 5681 Chain, Dest, ARMcc, CCR, Cmp); 5682 } 5683 5684 if (getTargetMachine().Options.UnsafeFPMath && 5685 (CC == ISD::SETEQ || CC == ISD::SETOEQ || 5686 CC == ISD::SETNE || CC == ISD::SETUNE)) { 5687 if (SDValue Result = OptimizeVFPBrcond(Op, DAG)) 5688 return Result; 5689 } 5690 5691 ARMCC::CondCodes CondCode, CondCode2; 5692 FPCCToARMCC(CC, CondCode, CondCode2); 5693 5694 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 5695 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); 5696 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5697 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); 5698 SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp }; 5699 SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); 5700 if (CondCode2 != ARMCC::AL) { 5701 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32); 5702 SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) }; 5703 Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); 5704 } 5705 return Res; 5706 } 5707 5708 SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { 5709 SDValue Chain = Op.getOperand(0); 5710 SDValue Table = Op.getOperand(1); 5711 SDValue Index = Op.getOperand(2); 5712 SDLoc dl(Op); 5713 5714 EVT PTy = getPointerTy(DAG.getDataLayout()); 5715 JumpTableSDNode *JT = cast<JumpTableSDNode>(Table); 5716 SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy); 5717 Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI); 5718 Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy)); 5719 SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index); 5720 if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) { 5721 // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table 5722 // which does another jump to the destination. This also makes it easier 5723 // to translate it to TBB / TBH later (Thumb2 only). 5724 // FIXME: This might not work if the function is extremely large. 
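// (Note: TBB/TBH entries hold an unsigned byte/halfword offset that is doubled and added to the PC, so the reach of each table entry is limited; that limited range is the concern behind the FIXME above.)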
5725 return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain, 5726 Addr, Op.getOperand(2), JTI); 5727 } 5728 if (isPositionIndependent() || Subtarget->isROPI()) { 5729 Addr = 5730 DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, 5731 MachinePointerInfo::getJumpTable(DAG.getMachineFunction())); 5732 Chain = Addr.getValue(1); 5733 Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr); 5734 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); 5735 } else { 5736 Addr = 5737 DAG.getLoad(PTy, dl, Chain, Addr, 5738 MachinePointerInfo::getJumpTable(DAG.getMachineFunction())); 5739 Chain = Addr.getValue(1); 5740 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); 5741 } 5742 } 5743 5744 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { 5745 EVT VT = Op.getValueType(); 5746 SDLoc dl(Op); 5747 5748 if (Op.getValueType().getVectorElementType() == MVT::i32) { 5749 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32) 5750 return Op; 5751 return DAG.UnrollVectorOp(Op.getNode()); 5752 } 5753 5754 const bool HasFullFP16 = 5755 static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16(); 5756 5757 EVT NewTy; 5758 const EVT OpTy = Op.getOperand(0).getValueType(); 5759 if (OpTy == MVT::v4f32) 5760 NewTy = MVT::v4i32; 5761 else if (OpTy == MVT::v4f16 && HasFullFP16) 5762 NewTy = MVT::v4i16; 5763 else if (OpTy == MVT::v8f16 && HasFullFP16) 5764 NewTy = MVT::v8i16; 5765 else 5766 llvm_unreachable("Invalid type for custom lowering!"); 5767 5768 if (VT != MVT::v4i16 && VT != MVT::v8i16) 5769 return DAG.UnrollVectorOp(Op.getNode()); 5770 5771 Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0)); 5772 return DAG.getNode(ISD::TRUNCATE, dl, VT, Op); 5773 } 5774 5775 SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { 5776 EVT VT = Op.getValueType(); 5777 if (VT.isVector()) 5778 return LowerVectorFP_TO_INT(Op, DAG); 5779 5780 bool IsStrict = Op->isStrictFPOpcode(); 5781 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); 5782 5783 if (isUnsupportedFloatingType(SrcVal.getValueType())) { 5784 RTLIB::Libcall LC; 5785 if (Op.getOpcode() == ISD::FP_TO_SINT || 5786 Op.getOpcode() == ISD::STRICT_FP_TO_SINT) 5787 LC = RTLIB::getFPTOSINT(SrcVal.getValueType(), 5788 Op.getValueType()); 5789 else 5790 LC = RTLIB::getFPTOUINT(SrcVal.getValueType(), 5791 Op.getValueType()); 5792 SDLoc Loc(Op); 5793 MakeLibCallOptions CallOptions; 5794 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); 5795 SDValue Result; 5796 std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal, 5797 CallOptions, Loc, Chain); 5798 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result; 5799 } 5800 5801 // FIXME: Remove this when we have strict fp instruction selection patterns 5802 if (IsStrict) { 5803 SDLoc Loc(Op); 5804 SDValue Result = 5805 DAG.getNode(Op.getOpcode() == ISD::STRICT_FP_TO_SINT ? 
ISD::FP_TO_SINT 5806 : ISD::FP_TO_UINT, 5807 Loc, Op.getValueType(), SrcVal); 5808 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc); 5809 } 5810 5811 return Op; 5812 } 5813 5814 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 5815 EVT VT = Op.getValueType(); 5816 SDLoc dl(Op); 5817 5818 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) { 5819 if (VT.getVectorElementType() == MVT::f32) 5820 return Op; 5821 return DAG.UnrollVectorOp(Op.getNode()); 5822 } 5823 5824 assert((Op.getOperand(0).getValueType() == MVT::v4i16 || 5825 Op.getOperand(0).getValueType() == MVT::v8i16) && 5826 "Invalid type for custom lowering!"); 5827 5828 const bool HasFullFP16 = 5829 static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16(); 5830 5831 EVT DestVecType; 5832 if (VT == MVT::v4f32) 5833 DestVecType = MVT::v4i32; 5834 else if (VT == MVT::v4f16 && HasFullFP16) 5835 DestVecType = MVT::v4i16; 5836 else if (VT == MVT::v8f16 && HasFullFP16) 5837 DestVecType = MVT::v8i16; 5838 else 5839 return DAG.UnrollVectorOp(Op.getNode()); 5840 5841 unsigned CastOpc; 5842 unsigned Opc; 5843 switch (Op.getOpcode()) { 5844 default: llvm_unreachable("Invalid opcode!"); 5845 case ISD::SINT_TO_FP: 5846 CastOpc = ISD::SIGN_EXTEND; 5847 Opc = ISD::SINT_TO_FP; 5848 break; 5849 case ISD::UINT_TO_FP: 5850 CastOpc = ISD::ZERO_EXTEND; 5851 Opc = ISD::UINT_TO_FP; 5852 break; 5853 } 5854 5855 Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0)); 5856 return DAG.getNode(Opc, dl, VT, Op); 5857 } 5858 5859 SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { 5860 EVT VT = Op.getValueType(); 5861 if (VT.isVector()) 5862 return LowerVectorINT_TO_FP(Op, DAG); 5863 if (isUnsupportedFloatingType(VT)) { 5864 RTLIB::Libcall LC; 5865 if (Op.getOpcode() == ISD::SINT_TO_FP) 5866 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), 5867 Op.getValueType()); 5868 else 5869 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), 5870 Op.getValueType()); 5871 MakeLibCallOptions CallOptions; 5872 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), 5873 CallOptions, SDLoc(Op)).first; 5874 } 5875 5876 return Op; 5877 } 5878 5879 SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 5880 // Implement fcopysign with a fabs and a conditional fneg. 5881 SDValue Tmp0 = Op.getOperand(0); 5882 SDValue Tmp1 = Op.getOperand(1); 5883 SDLoc dl(Op); 5884 EVT VT = Op.getValueType(); 5885 EVT SrcVT = Tmp1.getValueType(); 5886 bool InGPR = Tmp0.getOpcode() == ISD::BITCAST || 5887 Tmp0.getOpcode() == ARMISD::VMOVDRR; 5888 bool UseNEON = !InGPR && Subtarget->hasNEON(); 5889 5890 if (UseNEON) { 5891 // Use VBSL to copy the sign bit. 5892 unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80); 5893 SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32, 5894 DAG.getTargetConstant(EncodedVal, dl, MVT::i32)); 5895 EVT OpVT = (VT == MVT::f32) ? 
MVT::v2i32 : MVT::v1i64; 5896 if (VT == MVT::f64) 5897 Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT, 5898 DAG.getNode(ISD::BITCAST, dl, OpVT, Mask), 5899 DAG.getConstant(32, dl, MVT::i32)); 5900 else /*if (VT == MVT::f32)*/ 5901 Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0); 5902 if (SrcVT == MVT::f32) { 5903 Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1); 5904 if (VT == MVT::f64) 5905 Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT, 5906 DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1), 5907 DAG.getConstant(32, dl, MVT::i32)); 5908 } else if (VT == MVT::f32) 5909 Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64, 5910 DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1), 5911 DAG.getConstant(32, dl, MVT::i32)); 5912 Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0); 5913 Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1); 5914 5915 SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), 5916 dl, MVT::i32); 5917 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes); 5918 SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask, 5919 DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes)); 5920 5921 SDValue Res = DAG.getNode(ISD::OR, dl, OpVT, 5922 DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask), 5923 DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot)); 5924 if (VT == MVT::f32) { 5925 Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res); 5926 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, 5927 DAG.getConstant(0, dl, MVT::i32)); 5928 } else { 5929 Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res); 5930 } 5931 5932 return Res; 5933 } 5934 5935 // Bitcast operand 1 to i32. 5936 if (SrcVT == MVT::f64) 5937 Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), 5938 Tmp1).getValue(1); 5939 Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1); 5940 5941 // Or in the signbit with integer operations. 5942 SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32); 5943 SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32); 5944 Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1); 5945 if (VT == MVT::f32) { 5946 Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32, 5947 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2); 5948 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, 5949 DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1)); 5950 } 5951 5952 // f64: Or the high part with signbit and then combine two parts. 5953 Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), 5954 Tmp0); 5955 SDValue Lo = Tmp0.getValue(0); 5956 SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2); 5957 Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1); 5958 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 5959 } 5960 5961 SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ 5962 MachineFunction &MF = DAG.getMachineFunction(); 5963 MachineFrameInfo &MFI = MF.getFrameInfo(); 5964 MFI.setReturnAddressIsTaken(true); 5965 5966 if (verifyReturnAddressArgumentIsConstant(Op, DAG)) 5967 return SDValue(); 5968 5969 EVT VT = Op.getValueType(); 5970 SDLoc dl(Op); 5971 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 5972 if (Depth) { 5973 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 5974 SDValue Offset = DAG.getConstant(4, dl, MVT::i32); 5975 return DAG.getLoad(VT, dl, DAG.getEntryNode(), 5976 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset), 5977 MachinePointerInfo()); 5978 } 5979 5980 // Return LR, which contains the return address. Mark it an implicit live-in. 
5981 unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); 5982 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); 5983 } 5984 5985 SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 5986 const ARMBaseRegisterInfo &ARI = 5987 *static_cast<const ARMBaseRegisterInfo*>(RegInfo); 5988 MachineFunction &MF = DAG.getMachineFunction(); 5989 MachineFrameInfo &MFI = MF.getFrameInfo(); 5990 MFI.setFrameAddressIsTaken(true); 5991 5992 EVT VT = Op.getValueType(); 5993 SDLoc dl(Op); // FIXME probably not meaningful 5994 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 5995 Register FrameReg = ARI.getFrameRegister(MF); 5996 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 5997 while (Depth--) 5998 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 5999 MachinePointerInfo()); 6000 return FrameAddr; 6001 } 6002 6003 // FIXME? Maybe this could be a TableGen attribute on some registers and 6004 // this table could be generated automatically from RegInfo. 6005 Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT, 6006 const MachineFunction &MF) const { 6007 Register Reg = StringSwitch<unsigned>(RegName) 6008 .Case("sp", ARM::SP) 6009 .Default(0); 6010 if (Reg) 6011 return Reg; 6012 report_fatal_error(Twine("Invalid register name \"" 6013 + StringRef(RegName) + "\".")); 6014 } 6015 6016 // Result is 64 bit value so split into two 32 bit values and return as a 6017 // pair of values. 6018 static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results, 6019 SelectionDAG &DAG) { 6020 SDLoc DL(N); 6021 6022 // This function is only supposed to be called for i64 type destination. 6023 assert(N->getValueType(0) == MVT::i64 6024 && "ExpandREAD_REGISTER called for non-i64 type result."); 6025 6026 SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL, 6027 DAG.getVTList(MVT::i32, MVT::i32, MVT::Other), 6028 N->getOperand(0), 6029 N->getOperand(1)); 6030 6031 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0), 6032 Read.getValue(1))); 6033 Results.push_back(Read.getOperand(0)); 6034 } 6035 6036 /// \p BC is a bitcast that is about to be turned into a VMOVDRR. 6037 /// When \p DstVT, the destination type of \p BC, is on the vector 6038 /// register bank and the source of bitcast, \p Op, operates on the same bank, 6039 /// it might be possible to combine them, such that everything stays on the 6040 /// vector register bank. 6041 /// \p return The node that would replace \p BT, if the combine 6042 /// is possible. 6043 static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, 6044 SelectionDAG &DAG) { 6045 SDValue Op = BC->getOperand(0); 6046 EVT DstVT = BC->getValueType(0); 6047 6048 // The only vector instruction that can produce a scalar (remember, 6049 // since the bitcast was about to be turned into VMOVDRR, the source 6050 // type is i64) from a vector is EXTRACT_VECTOR_ELT. 6051 // Moreover, we can do this combine only if there is one use. 6052 // Finally, if the destination type is not a vector, there is not 6053 // much point on forcing everything on the vector bank. 6054 if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 6055 !Op.hasOneUse()) 6056 return SDValue(); 6057 6058 // If the index is not constant, we will introduce an additional 6059 // multiply that will stick. 6060 // Give up in that case. 
6061 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 6062 if (!Index) 6063 return SDValue(); 6064 unsigned DstNumElt = DstVT.getVectorNumElements(); 6065 6066 // Compute the new index. 6067 const APInt &APIntIndex = Index->getAPIntValue(); 6068 APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt); 6069 NewIndex *= APIntIndex; 6070 // Check if the new constant index fits into i32. 6071 if (NewIndex.getBitWidth() > 32) 6072 return SDValue(); 6073 6074 // vMTy bitcast(i64 extractelt vNi64 src, i32 index) -> 6075 // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M) 6076 SDLoc dl(Op); 6077 SDValue ExtractSrc = Op.getOperand(0); 6078 EVT VecVT = EVT::getVectorVT( 6079 *DAG.getContext(), DstVT.getScalarType(), 6080 ExtractSrc.getValueType().getVectorNumElements() * DstNumElt); 6081 SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc); 6082 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast, 6083 DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32)); 6084 } 6085 6086 /// ExpandBITCAST - If the target supports VFP, this function is called to 6087 /// expand a bit convert where either the source or destination type is i64 to 6088 /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 6089 /// operand type is illegal (e.g., v2f32 for a target that doesn't support 6090 /// vectors), since the legalizer won't know what to do with that. 6091 SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG, 6092 const ARMSubtarget *Subtarget) const { 6093 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 6094 SDLoc dl(N); 6095 SDValue Op = N->getOperand(0); 6096 6097 // This function is only supposed to be called for i16 and i64 types, either 6098 // as the source or destination of the bit convert. 6099 EVT SrcVT = Op.getValueType(); 6100 EVT DstVT = N->getValueType(0); 6101 6102 if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) && 6103 (DstVT == MVT::f16 || DstVT == MVT::bf16)) 6104 return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(), 6105 DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op)); 6106 6107 if ((DstVT == MVT::i16 || DstVT == MVT::i32) && 6108 (SrcVT == MVT::f16 || SrcVT == MVT::bf16)) 6109 return DAG.getNode( 6110 ISD::TRUNCATE, SDLoc(N), DstVT, 6111 MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op)); 6112 6113 if (!(SrcVT == MVT::i64 || DstVT == MVT::i64)) 6114 return SDValue(); 6115 6116 // Turn i64->f64 into VMOVDRR. 6117 if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) { 6118 // Do not force values to GPRs (this is what VMOVDRR does for the inputs) 6119 // if we can combine the bitcast with its source. 6120 if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG)) 6121 return Val; 6122 6123 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, 6124 DAG.getConstant(0, dl, MVT::i32)); 6125 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, 6126 DAG.getConstant(1, dl, MVT::i32)); 6127 return DAG.getNode(ISD::BITCAST, dl, DstVT, 6128 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi)); 6129 } 6130 6131 // Turn f64->i64 into VMOVRRD. 
6132 if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) { 6133 SDValue Cvt; 6134 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() && 6135 SrcVT.getVectorNumElements() > 1) 6136 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, 6137 DAG.getVTList(MVT::i32, MVT::i32), 6138 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op)); 6139 else 6140 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, 6141 DAG.getVTList(MVT::i32, MVT::i32), Op); 6142 // Merge the pieces into a single i64 value. 6143 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1)); 6144 } 6145 6146 return SDValue(); 6147 } 6148 6149 /// getZeroVector - Returns a vector of specified type with all zero elements. 6150 /// Zero vectors are used to represent vector negation and in those cases 6151 /// will be implemented with the NEON VNEG instruction. However, VNEG does 6152 /// not support i64 elements, so sometimes the zero vectors will need to be 6153 /// explicitly constructed. Regardless, use a canonical VMOV to create the 6154 /// zero vector. 6155 static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { 6156 assert(VT.isVector() && "Expected a vector type"); 6157 // The canonical modified immediate encoding of a zero vector is....0! 6158 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32); 6159 EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; 6160 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal); 6161 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 6162 } 6163 6164 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two 6165 /// i32 values and take a 2 x i32 value to shift plus a shift amount. 6166 SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, 6167 SelectionDAG &DAG) const { 6168 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 6169 EVT VT = Op.getValueType(); 6170 unsigned VTBits = VT.getSizeInBits(); 6171 SDLoc dl(Op); 6172 SDValue ShOpLo = Op.getOperand(0); 6173 SDValue ShOpHi = Op.getOperand(1); 6174 SDValue ShAmt = Op.getOperand(2); 6175 SDValue ARMcc; 6176 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 6177 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; 6178 6179 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 6180 6181 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 6182 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); 6183 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); 6184 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 6185 DAG.getConstant(VTBits, dl, MVT::i32)); 6186 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); 6187 SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 6188 SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); 6189 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 6190 ISD::SETGE, ARMcc, DAG, dl); 6191 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift, 6192 ARMcc, CCR, CmpLo); 6193 6194 SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 6195 SDValue HiBigShift = Opc == ISD::SRA 6196 ? 
DAG.getNode(Opc, dl, VT, ShOpHi, 6197 DAG.getConstant(VTBits - 1, dl, VT)) 6198 : DAG.getConstant(0, dl, VT); 6199 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 6200 ISD::SETGE, ARMcc, DAG, dl); 6201 SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, 6202 ARMcc, CCR, CmpHi); 6203 6204 SDValue Ops[2] = { Lo, Hi }; 6205 return DAG.getMergeValues(Ops, dl); 6206 } 6207 6208 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two 6209 /// i32 values and take a 2 x i32 value to shift plus a shift amount. 6210 SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, 6211 SelectionDAG &DAG) const { 6212 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 6213 EVT VT = Op.getValueType(); 6214 unsigned VTBits = VT.getSizeInBits(); 6215 SDLoc dl(Op); 6216 SDValue ShOpLo = Op.getOperand(0); 6217 SDValue ShOpHi = Op.getOperand(1); 6218 SDValue ShAmt = Op.getOperand(2); 6219 SDValue ARMcc; 6220 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 6221 6222 assert(Op.getOpcode() == ISD::SHL_PARTS); 6223 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 6224 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); 6225 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); 6226 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); 6227 SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 6228 6229 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 6230 DAG.getConstant(VTBits, dl, MVT::i32)); 6231 SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); 6232 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 6233 ISD::SETGE, ARMcc, DAG, dl); 6234 SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, 6235 ARMcc, CCR, CmpHi); 6236 6237 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 6238 ISD::SETGE, ARMcc, DAG, dl); 6239 SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 6240 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, 6241 DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo); 6242 6243 SDValue Ops[2] = { Lo, Hi }; 6244 return DAG.getMergeValues(Ops, dl); 6245 } 6246 6247 SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, 6248 SelectionDAG &DAG) const { 6249 // The rounding mode is in bits 23:22 of the FPSCR. 6250 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 6251 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) 6252 // so that the shift + and get folded into a bitfield extract. 6253 SDLoc dl(Op); 6254 SDValue Chain = Op.getOperand(0); 6255 SDValue Ops[] = {Chain, 6256 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)}; 6257 6258 SDValue FPSCR = 6259 DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops); 6260 Chain = FPSCR.getValue(1); 6261 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, 6262 DAG.getConstant(1U << 22, dl, MVT::i32)); 6263 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, 6264 DAG.getConstant(22, dl, MVT::i32)); 6265 SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, 6266 DAG.getConstant(3, dl, MVT::i32)); 6267 return DAG.getMergeValues({And, Chain}, dl); 6268 } 6269 6270 SDValue ARMTargetLowering::LowerSET_ROUNDING(SDValue Op, 6271 SelectionDAG &DAG) const { 6272 SDLoc DL(Op); 6273 SDValue Chain = Op->getOperand(0); 6274 SDValue RMValue = Op->getOperand(1); 6275 6276 // The rounding mode is in bits 23:22 of the FPSCR. 
6277 // The llvm.set.rounding argument value to ARM rounding mode value mapping 6278 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is 6279 // ((arg - 1) & 3) << 22). 6280 // 6281 // It is expected that the argument of llvm.set.rounding is within the 6282 // segment [0, 3], so NearestTiesToAway (4) is not handled here. It is 6283 // responsibility of the code generated llvm.set.rounding to ensure this 6284 // condition. 6285 6286 // Calculate new value of FPSCR[23:22]. 6287 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue, 6288 DAG.getConstant(1, DL, MVT::i32)); 6289 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue, 6290 DAG.getConstant(0x3, DL, MVT::i32)); 6291 RMValue = DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue, 6292 DAG.getConstant(ARM::RoundingBitsPos, DL, MVT::i32)); 6293 6294 // Get current value of FPSCR. 6295 SDValue Ops[] = {Chain, 6296 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)}; 6297 SDValue FPSCR = 6298 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops); 6299 Chain = FPSCR.getValue(1); 6300 FPSCR = FPSCR.getValue(0); 6301 6302 // Put new rounding mode into FPSCR[23:22]. 6303 const unsigned RMMask = ~(ARM::Rounding::rmMask << ARM::RoundingBitsPos); 6304 FPSCR = DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR, 6305 DAG.getConstant(RMMask, DL, MVT::i32)); 6306 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCR, RMValue); 6307 SDValue Ops2[] = { 6308 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR}; 6309 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2); 6310 } 6311 6312 static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, 6313 const ARMSubtarget *ST) { 6314 SDLoc dl(N); 6315 EVT VT = N->getValueType(0); 6316 if (VT.isVector() && ST->hasNEON()) { 6317 6318 // Compute the least significant set bit: LSB = X & -X 6319 SDValue X = N->getOperand(0); 6320 SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X); 6321 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX); 6322 6323 EVT ElemTy = VT.getVectorElementType(); 6324 6325 if (ElemTy == MVT::i8) { 6326 // Compute with: cttz(x) = ctpop(lsb - 1) 6327 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 6328 DAG.getTargetConstant(1, dl, ElemTy)); 6329 SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); 6330 return DAG.getNode(ISD::CTPOP, dl, VT, Bits); 6331 } 6332 6333 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) && 6334 (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) { 6335 // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0 6336 unsigned NumBits = ElemTy.getSizeInBits(); 6337 SDValue WidthMinus1 = 6338 DAG.getNode(ARMISD::VMOVIMM, dl, VT, 6339 DAG.getTargetConstant(NumBits - 1, dl, ElemTy)); 6340 SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB); 6341 return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ); 6342 } 6343 6344 // Compute with: cttz(x) = ctpop(lsb - 1) 6345 6346 // Compute LSB - 1. 6347 SDValue Bits; 6348 if (ElemTy == MVT::i64) { 6349 // Load constant 0xffff'ffff'ffff'ffff to register. 
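// 0x1eff is assumed here to decode as Op=1, Cmode=1110, Imm=0xff (see the 64-bit case of isVMOVModifiedImm below), where each set Imm bit becomes an all-ones byte; adding this all-ones splat is equivalent to subtracting 1 from LSB.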
6350 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 6351 DAG.getTargetConstant(0x1eff, dl, MVT::i32)); 6352 Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF); 6353 } else { 6354 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 6355 DAG.getTargetConstant(1, dl, ElemTy)); 6356 Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); 6357 } 6358 return DAG.getNode(ISD::CTPOP, dl, VT, Bits); 6359 } 6360 6361 if (!ST->hasV6T2Ops()) 6362 return SDValue(); 6363 6364 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0)); 6365 return DAG.getNode(ISD::CTLZ, dl, VT, rbit); 6366 } 6367 6368 static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG, 6369 const ARMSubtarget *ST) { 6370 EVT VT = N->getValueType(0); 6371 SDLoc DL(N); 6372 6373 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON."); 6374 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 || 6375 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) && 6376 "Unexpected type for custom ctpop lowering"); 6377 6378 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 6379 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; 6380 SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0)); 6381 Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res); 6382 6383 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds. 6384 unsigned EltSize = 8; 6385 unsigned NumElts = VT.is64BitVector() ? 8 : 16; 6386 while (EltSize != VT.getScalarSizeInBits()) { 6387 SmallVector<SDValue, 8> Ops; 6388 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL, 6389 TLI.getPointerTy(DAG.getDataLayout()))); 6390 Ops.push_back(Res); 6391 6392 EltSize *= 2; 6393 NumElts /= 2; 6394 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts); 6395 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops); 6396 } 6397 6398 return Res; 6399 } 6400 6401 /// getVShiftImm - Check if this is a valid build_vector for the immediate 6402 /// operand of a vector shift operation, where all the elements of the 6403 /// build_vector must have the same constant integer value. 6404 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { 6405 // Ignore bit_converts. 6406 while (Op.getOpcode() == ISD::BITCAST) 6407 Op = Op.getOperand(0); 6408 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); 6409 APInt SplatBits, SplatUndef; 6410 unsigned SplatBitSize; 6411 bool HasAnyUndefs; 6412 if (!BVN || 6413 !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs, 6414 ElementBits) || 6415 SplatBitSize > ElementBits) 6416 return false; 6417 Cnt = SplatBits.getSExtValue(); 6418 return true; 6419 } 6420 6421 /// isVShiftLImm - Check if this is a valid build_vector for the immediate 6422 /// operand of a vector shift left operation. That value must be in the range: 6423 /// 0 <= Value < ElementBits for a left shift; or 6424 /// 0 <= Value <= ElementBits for a long left shift. 6425 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { 6426 assert(VT.isVector() && "vector shift count is not a vector type"); 6427 int64_t ElementBits = VT.getScalarSizeInBits(); 6428 if (!getVShiftImm(Op, ElementBits, Cnt)) 6429 return false; 6430 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); 6431 } 6432 6433 /// isVShiftRImm - Check if this is a valid build_vector for the immediate 6434 /// operand of a vector shift right operation. For a shift opcode, the value 6435 /// is positive, but for an intrinsic the value count must be negative.
The 6436 /// absolute value must be in the range: 6437 /// 1 <= |Value| <= ElementBits for a right shift; or 6438 /// 1 <= |Value| <= ElementBits/2 for a narrow right shift. 6439 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, 6440 int64_t &Cnt) { 6441 assert(VT.isVector() && "vector shift count is not a vector type"); 6442 int64_t ElementBits = VT.getScalarSizeInBits(); 6443 if (!getVShiftImm(Op, ElementBits, Cnt)) 6444 return false; 6445 if (!isIntrinsic) 6446 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); 6447 if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) { 6448 Cnt = -Cnt; 6449 return true; 6450 } 6451 return false; 6452 } 6453 6454 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, 6455 const ARMSubtarget *ST) { 6456 EVT VT = N->getValueType(0); 6457 SDLoc dl(N); 6458 int64_t Cnt; 6459 6460 if (!VT.isVector()) 6461 return SDValue(); 6462 6463 // We essentially have two forms here. Shift by an immediate and shift by a 6464 // vector register (there are also shift by a gpr, but that is just handled 6465 // with a tablegen pattern). We cannot easily match shift by an immediate in 6466 // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM. 6467 // For shifting by a vector, we don't have VSHR, only VSHL (which can be 6468 // signed or unsigned, and a negative shift indicates a shift right). 6469 if (N->getOpcode() == ISD::SHL) { 6470 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) 6471 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0), 6472 DAG.getConstant(Cnt, dl, MVT::i32)); 6473 return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0), 6474 N->getOperand(1)); 6475 } 6476 6477 assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) && 6478 "unexpected vector shift opcode"); 6479 6480 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { 6481 unsigned VShiftOpc = 6482 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM); 6483 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), 6484 DAG.getConstant(Cnt, dl, MVT::i32)); 6485 } 6486 6487 // Other right shifts we don't have operations for (we use a shift left by a 6488 // negative number). 6489 EVT ShiftVT = N->getOperand(1).getValueType(); 6490 SDValue NegatedCount = DAG.getNode( 6491 ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1)); 6492 unsigned VShiftOpc = 6493 (N->getOpcode() == ISD::SRA ? 
ARMISD::VSHLs : ARMISD::VSHLu); 6494 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount); 6495 } 6496 6497 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, 6498 const ARMSubtarget *ST) { 6499 EVT VT = N->getValueType(0); 6500 SDLoc dl(N); 6501 6502 // We can get here for a node like i32 = ISD::SHL i32, i64 6503 if (VT != MVT::i64) 6504 return SDValue(); 6505 6506 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA || 6507 N->getOpcode() == ISD::SHL) && 6508 "Unknown shift to lower!"); 6509 6510 unsigned ShOpc = N->getOpcode(); 6511 if (ST->hasMVEIntegerOps()) { 6512 SDValue ShAmt = N->getOperand(1); 6513 unsigned ShPartsOpc = ARMISD::LSLL; 6514 ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt); 6515 6516 // If the shift amount is greater than 32 or has a greater bitwidth than 64 6517 // then do the default optimisation 6518 if (ShAmt->getValueType(0).getSizeInBits() > 64 || 6519 (Con && (Con->getZExtValue() == 0 || Con->getZExtValue() >= 32))) 6520 return SDValue(); 6521 6522 // Extract the lower 32 bits of the shift amount if it's not an i32 6523 if (ShAmt->getValueType(0) != MVT::i32) 6524 ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32); 6525 6526 if (ShOpc == ISD::SRL) { 6527 if (!Con) 6528 // There is no t2LSRLr instruction so negate and perform an lsll if the 6529 // shift amount is in a register, emulating a right shift. 6530 ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 6531 DAG.getConstant(0, dl, MVT::i32), ShAmt); 6532 else 6533 // Else generate an lsrl on the immediate shift amount 6534 ShPartsOpc = ARMISD::LSRL; 6535 } else if (ShOpc == ISD::SRA) 6536 ShPartsOpc = ARMISD::ASRL; 6537 6538 // Lower 32 bits of the destination/source 6539 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6540 DAG.getConstant(0, dl, MVT::i32)); 6541 // Upper 32 bits of the destination/source 6542 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6543 DAG.getConstant(1, dl, MVT::i32)); 6544 6545 // Generate the shift operation as computed above 6546 Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi, 6547 ShAmt); 6548 // The upper 32 bits come from the second return value of lsll 6549 Hi = SDValue(Lo.getNode(), 1); 6550 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 6551 } 6552 6553 // We only lower SRA, SRL of 1 here, all others use generic lowering. 6554 if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL) 6555 return SDValue(); 6556 6557 // If we are in thumb mode, we don't have RRX. 6558 if (ST->isThumb1Only()) 6559 return SDValue(); 6560 6561 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr. 6562 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6563 DAG.getConstant(0, dl, MVT::i32)); 6564 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6565 DAG.getConstant(1, dl, MVT::i32)); 6566 6567 // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and 6568 // captures the result into a carry flag. 6569 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG; 6570 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi); 6571 6572 // The low part is an ARMISD::RRX operand, which shifts the carry in. 6573 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1)); 6574 6575 // Merge the pieces into a single i64 value. 
6576 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 6577 } 6578 6579 static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, 6580 const ARMSubtarget *ST) { 6581 bool Invert = false; 6582 bool Swap = false; 6583 unsigned Opc = ARMCC::AL; 6584 6585 SDValue Op0 = Op.getOperand(0); 6586 SDValue Op1 = Op.getOperand(1); 6587 SDValue CC = Op.getOperand(2); 6588 EVT VT = Op.getValueType(); 6589 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 6590 SDLoc dl(Op); 6591 6592 EVT CmpVT; 6593 if (ST->hasNEON()) 6594 CmpVT = Op0.getValueType().changeVectorElementTypeToInteger(); 6595 else { 6596 assert(ST->hasMVEIntegerOps() && 6597 "No hardware support for integer vector comparison!"); 6598 6599 if (Op.getValueType().getVectorElementType() != MVT::i1) 6600 return SDValue(); 6601 6602 // Make sure we expand floating point setcc to scalar if we do not have 6603 // mve.fp, so that we can handle them from there. 6604 if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps()) 6605 return SDValue(); 6606 6607 CmpVT = VT; 6608 } 6609 6610 if (Op0.getValueType().getVectorElementType() == MVT::i64 && 6611 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) { 6612 // Special-case integer 64-bit equality comparisons. They aren't legal, 6613 // but they can be lowered with a few vector instructions. 6614 unsigned CmpElements = CmpVT.getVectorNumElements() * 2; 6615 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements); 6616 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0); 6617 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1); 6618 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1, 6619 DAG.getCondCode(ISD::SETEQ)); 6620 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp); 6621 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed); 6622 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged); 6623 if (SetCCOpcode == ISD::SETNE) 6624 Merged = DAG.getNOT(dl, Merged, CmpVT); 6625 Merged = DAG.getSExtOrTrunc(Merged, dl, VT); 6626 return Merged; 6627 } 6628 6629 if (CmpVT.getVectorElementType() == MVT::i64) 6630 // 64-bit comparisons are not legal in general. 6631 return SDValue(); 6632 6633 if (Op1.getValueType().isFloatingPoint()) { 6634 switch (SetCCOpcode) { 6635 default: llvm_unreachable("Illegal FP comparison"); 6636 case ISD::SETUNE: 6637 case ISD::SETNE: 6638 if (ST->hasMVEFloatOps()) { 6639 Opc = ARMCC::NE; break; 6640 } else { 6641 Invert = true; LLVM_FALLTHROUGH; 6642 } 6643 case ISD::SETOEQ: 6644 case ISD::SETEQ: Opc = ARMCC::EQ; break; 6645 case ISD::SETOLT: 6646 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; 6647 case ISD::SETOGT: 6648 case ISD::SETGT: Opc = ARMCC::GT; break; 6649 case ISD::SETOLE: 6650 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; 6651 case ISD::SETOGE: 6652 case ISD::SETGE: Opc = ARMCC::GE; break; 6653 case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH; 6654 case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break; 6655 case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH; 6656 case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break; 6657 case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH; 6658 case ISD::SETONE: { 6659 // Expand this to (OLT | OGT). 
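// That is, ordered-and-unequal: (Op0 < Op1) || (Op0 > Op1), built from two GT compares with the operands swapped for one of them and ORed together. SETUEQ falls through to here and inverts the final result.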
6660 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, 6661 DAG.getConstant(ARMCC::GT, dl, MVT::i32)); 6662 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, 6663 DAG.getConstant(ARMCC::GT, dl, MVT::i32)); 6664 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1); 6665 if (Invert) 6666 Result = DAG.getNOT(dl, Result, VT); 6667 return Result; 6668 } 6669 case ISD::SETUO: Invert = true; LLVM_FALLTHROUGH; 6670 case ISD::SETO: { 6671 // Expand this to (OLT | OGE). 6672 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, 6673 DAG.getConstant(ARMCC::GT, dl, MVT::i32)); 6674 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, 6675 DAG.getConstant(ARMCC::GE, dl, MVT::i32)); 6676 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1); 6677 if (Invert) 6678 Result = DAG.getNOT(dl, Result, VT); 6679 return Result; 6680 } 6681 } 6682 } else { 6683 // Integer comparisons. 6684 switch (SetCCOpcode) { 6685 default: llvm_unreachable("Illegal integer comparison"); 6686 case ISD::SETNE: 6687 if (ST->hasMVEIntegerOps()) { 6688 Opc = ARMCC::NE; break; 6689 } else { 6690 Invert = true; LLVM_FALLTHROUGH; 6691 } 6692 case ISD::SETEQ: Opc = ARMCC::EQ; break; 6693 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; 6694 case ISD::SETGT: Opc = ARMCC::GT; break; 6695 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; 6696 case ISD::SETGE: Opc = ARMCC::GE; break; 6697 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; 6698 case ISD::SETUGT: Opc = ARMCC::HI; break; 6699 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH; 6700 case ISD::SETUGE: Opc = ARMCC::HS; break; 6701 } 6702 6703 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero). 6704 if (ST->hasNEON() && Opc == ARMCC::EQ) { 6705 SDValue AndOp; 6706 if (ISD::isBuildVectorAllZeros(Op1.getNode())) 6707 AndOp = Op0; 6708 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) 6709 AndOp = Op1; 6710 6711 // Ignore bitconvert. 6712 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST) 6713 AndOp = AndOp.getOperand(0); 6714 6715 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) { 6716 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0)); 6717 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1)); 6718 SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1); 6719 if (!Invert) 6720 Result = DAG.getNOT(dl, Result, VT); 6721 return Result; 6722 } 6723 } 6724 } 6725 6726 if (Swap) 6727 std::swap(Op0, Op1); 6728 6729 // If one of the operands is a constant vector zero, attempt to fold the 6730 // comparison to a specialized compare-against-zero form. 
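// (handled below with VCMPZ). If the zero is the left-hand operand, flip GE<->LE and GT<->LT so the remaining operand can still be compared against zero.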
6731 SDValue SingleOp; 6732 if (ISD::isBuildVectorAllZeros(Op1.getNode())) 6733 SingleOp = Op0; 6734 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) { 6735 if (Opc == ARMCC::GE) 6736 Opc = ARMCC::LE; 6737 else if (Opc == ARMCC::GT) 6738 Opc = ARMCC::LT; 6739 SingleOp = Op1; 6740 } 6741 6742 SDValue Result; 6743 if (SingleOp.getNode()) { 6744 Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, SingleOp, 6745 DAG.getConstant(Opc, dl, MVT::i32)); 6746 } else { 6747 Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, 6748 DAG.getConstant(Opc, dl, MVT::i32)); 6749 } 6750 6751 Result = DAG.getSExtOrTrunc(Result, dl, VT); 6752 6753 if (Invert) 6754 Result = DAG.getNOT(dl, Result, VT); 6755 6756 return Result; 6757 } 6758 6759 static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) { 6760 SDValue LHS = Op.getOperand(0); 6761 SDValue RHS = Op.getOperand(1); 6762 SDValue Carry = Op.getOperand(2); 6763 SDValue Cond = Op.getOperand(3); 6764 SDLoc DL(Op); 6765 6766 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only."); 6767 6768 // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we 6769 // have to invert the carry first. 6770 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, 6771 DAG.getConstant(1, DL, MVT::i32), Carry); 6772 // This converts the boolean value carry into the carry flag. 6773 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); 6774 6775 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 6776 SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry); 6777 6778 SDValue FVal = DAG.getConstant(0, DL, MVT::i32); 6779 SDValue TVal = DAG.getConstant(1, DL, MVT::i32); 6780 SDValue ARMcc = DAG.getConstant( 6781 IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32); 6782 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 6783 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR, 6784 Cmp.getValue(1), SDValue()); 6785 return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc, 6786 CCR, Chain.getValue(1)); 6787 } 6788 6789 /// isVMOVModifiedImm - Check if the specified splat value corresponds to a 6790 /// valid vector constant for a NEON or MVE instruction with a "modified 6791 /// immediate" operand (e.g., VMOV). If so, return the encoded value. 6792 static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, 6793 unsigned SplatBitSize, SelectionDAG &DAG, 6794 const SDLoc &dl, EVT &VT, EVT VectorVT, 6795 VMOVModImmType type) { 6796 unsigned OpCmode, Imm; 6797 bool is128Bits = VectorVT.is128BitVector(); 6798 6799 // SplatBitSize is set to the smallest size that splats the vector, so a 6800 // zero vector will always have SplatBitSize == 8. However, NEON modified 6801 // immediate instructions others than VMOV do not support the 8-bit encoding 6802 // of a zero vector, and the default encoding of zero is supposed to be the 6803 // 32-bit version. 6804 if (SplatBits == 0) 6805 SplatBitSize = 32; 6806 6807 switch (SplatBitSize) { 6808 case 8: 6809 if (type != VMOVModImm) 6810 return SDValue(); 6811 // Any 1-byte value is OK. Op=0, Cmode=1110. 6812 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); 6813 OpCmode = 0xe; 6814 Imm = SplatBits; 6815 VT = is128Bits ? MVT::v16i8 : MVT::v8i8; 6816 break; 6817 6818 case 16: 6819 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero. 6820 VT = is128Bits ? MVT::v8i16 : MVT::v4i16; 6821 if ((SplatBits & ~0xff) == 0) { 6822 // Value = 0x00nn: Op=x, Cmode=100x. 
6823 OpCmode = 0x8; 6824 Imm = SplatBits; 6825 break; 6826 } 6827 if ((SplatBits & ~0xff00) == 0) { 6828 // Value = 0xnn00: Op=x, Cmode=101x. 6829 OpCmode = 0xa; 6830 Imm = SplatBits >> 8; 6831 break; 6832 } 6833 return SDValue(); 6834 6835 case 32: 6836 // NEON's 32-bit VMOV supports splat values where: 6837 // * only one byte is nonzero, or 6838 // * the least significant byte is 0xff and the second byte is nonzero, or 6839 // * the least significant 2 bytes are 0xff and the third is nonzero. 6840 VT = is128Bits ? MVT::v4i32 : MVT::v2i32; 6841 if ((SplatBits & ~0xff) == 0) { 6842 // Value = 0x000000nn: Op=x, Cmode=000x. 6843 OpCmode = 0; 6844 Imm = SplatBits; 6845 break; 6846 } 6847 if ((SplatBits & ~0xff00) == 0) { 6848 // Value = 0x0000nn00: Op=x, Cmode=001x. 6849 OpCmode = 0x2; 6850 Imm = SplatBits >> 8; 6851 break; 6852 } 6853 if ((SplatBits & ~0xff0000) == 0) { 6854 // Value = 0x00nn0000: Op=x, Cmode=010x. 6855 OpCmode = 0x4; 6856 Imm = SplatBits >> 16; 6857 break; 6858 } 6859 if ((SplatBits & ~0xff000000) == 0) { 6860 // Value = 0xnn000000: Op=x, Cmode=011x. 6861 OpCmode = 0x6; 6862 Imm = SplatBits >> 24; 6863 break; 6864 } 6865 6866 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC 6867 if (type == OtherModImm) return SDValue(); 6868 6869 if ((SplatBits & ~0xffff) == 0 && 6870 ((SplatBits | SplatUndef) & 0xff) == 0xff) { 6871 // Value = 0x0000nnff: Op=x, Cmode=1100. 6872 OpCmode = 0xc; 6873 Imm = SplatBits >> 8; 6874 break; 6875 } 6876 6877 // cmode == 0b1101 is not supported for MVE VMVN 6878 if (type == MVEVMVNModImm) 6879 return SDValue(); 6880 6881 if ((SplatBits & ~0xffffff) == 0 && 6882 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { 6883 // Value = 0x00nnffff: Op=x, Cmode=1101. 6884 OpCmode = 0xd; 6885 Imm = SplatBits >> 16; 6886 break; 6887 } 6888 6889 // Note: there are a few 32-bit splat values (specifically: 00ffff00, 6890 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not 6891 // VMOV.I32. A (very) minor optimization would be to replicate the value 6892 // and fall through here to test for a valid 64-bit splat. But, then the 6893 // caller would also need to check and handle the change in size. 6894 return SDValue(); 6895 6896 case 64: { 6897 if (type != VMOVModImm) 6898 return SDValue(); 6899 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff. 6900 uint64_t BitMask = 0xff; 6901 unsigned ImmMask = 1; 6902 Imm = 0; 6903 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { 6904 if (((SplatBits | SplatUndef) & BitMask) == BitMask) { 6905 Imm |= ImmMask; 6906 } else if ((SplatBits & BitMask) != 0) { 6907 return SDValue(); 6908 } 6909 BitMask <<= 8; 6910 ImmMask <<= 1; 6911 } 6912 6913 if (DAG.getDataLayout().isBigEndian()) { 6914 // Reverse the order of elements within the vector. 6915 unsigned BytesPerElem = VectorVT.getScalarSizeInBits() / 8; 6916 unsigned Mask = (1 << BytesPerElem) - 1; 6917 unsigned NumElems = 8 / BytesPerElem; 6918 unsigned NewImm = 0; 6919 for (unsigned ElemNum = 0; ElemNum < NumElems; ++ElemNum) { 6920 unsigned Elem = ((Imm >> ElemNum * BytesPerElem) & Mask); 6921 NewImm |= Elem << (NumElems - ElemNum - 1) * BytesPerElem; 6922 } 6923 Imm = NewImm; 6924 } 6925 6926 // Op=1, Cmode=1110. 6927 OpCmode = 0x1e; 6928 VT = is128Bits ? 
MVT::v2i64 : MVT::v1i64; 6929 break; 6930 } 6931 6932 default: 6933 llvm_unreachable("unexpected size for isVMOVModifiedImm"); 6934 } 6935 6936 unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm); 6937 return DAG.getTargetConstant(EncodedVal, dl, MVT::i32); 6938 } 6939 6940 SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, 6941 const ARMSubtarget *ST) const { 6942 EVT VT = Op.getValueType(); 6943 bool IsDouble = (VT == MVT::f64); 6944 ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op); 6945 const APFloat &FPVal = CFP->getValueAPF(); 6946 6947 // Prevent floating-point constants from using literal loads 6948 // when execute-only is enabled. 6949 if (ST->genExecuteOnly()) { 6950 // If we can represent the constant as an immediate, don't lower it 6951 if (isFPImmLegal(FPVal, VT)) 6952 return Op; 6953 // Otherwise, construct as integer, and move to float register 6954 APInt INTVal = FPVal.bitcastToAPInt(); 6955 SDLoc DL(CFP); 6956 switch (VT.getSimpleVT().SimpleTy) { 6957 default: 6958 llvm_unreachable("Unknown floating point type!"); 6959 break; 6960 case MVT::f64: { 6961 SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32); 6962 SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32); 6963 return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi); 6964 } 6965 case MVT::f32: 6966 return DAG.getNode(ARMISD::VMOVSR, DL, VT, 6967 DAG.getConstant(INTVal, DL, MVT::i32)); 6968 } 6969 } 6970 6971 if (!ST->hasVFP3Base()) 6972 return SDValue(); 6973 6974 // Use the default (constant pool) lowering for double constants when we have 6975 // an SP-only FPU 6976 if (IsDouble && !Subtarget->hasFP64()) 6977 return SDValue(); 6978 6979 // Try splatting with a VMOV.f32... 6980 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal); 6981 6982 if (ImmVal != -1) { 6983 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) { 6984 // We have code in place to select a valid ConstantFP already, no need to 6985 // do any mangling. 6986 return Op; 6987 } 6988 6989 // It's a float and we are trying to use NEON operations where 6990 // possible. Lower it to a splat followed by an extract. 6991 SDLoc DL(Op); 6992 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32); 6993 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32, 6994 NewVal); 6995 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant, 6996 DAG.getConstant(0, DL, MVT::i32)); 6997 } 6998 6999 // The rest of our options are NEON only, make sure that's allowed before 7000 // proceeding.. 7001 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP())) 7002 return SDValue(); 7003 7004 EVT VMovVT; 7005 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue(); 7006 7007 // It wouldn't really be worth bothering for doubles except for one very 7008 // important value, which does happen to match: 0.0. So make sure we don't do 7009 // anything stupid. 7010 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32)) 7011 return SDValue(); 7012 7013 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too). 7014 SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), 7015 VMovVT, VT, VMOVModImm); 7016 if (NewVal != SDValue()) { 7017 SDLoc DL(Op); 7018 SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT, 7019 NewVal); 7020 if (IsDouble) 7021 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); 7022 7023 // It's a float: cast and extract a vector element. 
7024 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, 7025 VecConstant); 7026 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, 7027 DAG.getConstant(0, DL, MVT::i32)); 7028 } 7029 7030 // Finally, try a VMVN.i32 7031 NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, 7032 VT, VMVNModImm); 7033 if (NewVal != SDValue()) { 7034 SDLoc DL(Op); 7035 SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal); 7036 7037 if (IsDouble) 7038 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); 7039 7040 // It's a float: cast and extract a vector element. 7041 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, 7042 VecConstant); 7043 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, 7044 DAG.getConstant(0, DL, MVT::i32)); 7045 } 7046 7047 return SDValue(); 7048 } 7049 7050 // check if an VEXT instruction can handle the shuffle mask when the 7051 // vector sources of the shuffle are the same. 7052 static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) { 7053 unsigned NumElts = VT.getVectorNumElements(); 7054 7055 // Assume that the first shuffle index is not UNDEF. Fail if it is. 7056 if (M[0] < 0) 7057 return false; 7058 7059 Imm = M[0]; 7060 7061 // If this is a VEXT shuffle, the immediate value is the index of the first 7062 // element. The other shuffle indices must be the successive elements after 7063 // the first one. 7064 unsigned ExpectedElt = Imm; 7065 for (unsigned i = 1; i < NumElts; ++i) { 7066 // Increment the expected index. If it wraps around, just follow it 7067 // back to index zero and keep going. 7068 ++ExpectedElt; 7069 if (ExpectedElt == NumElts) 7070 ExpectedElt = 0; 7071 7072 if (M[i] < 0) continue; // ignore UNDEF indices 7073 if (ExpectedElt != static_cast<unsigned>(M[i])) 7074 return false; 7075 } 7076 7077 return true; 7078 } 7079 7080 static bool isVEXTMask(ArrayRef<int> M, EVT VT, 7081 bool &ReverseVEXT, unsigned &Imm) { 7082 unsigned NumElts = VT.getVectorNumElements(); 7083 ReverseVEXT = false; 7084 7085 // Assume that the first shuffle index is not UNDEF. Fail if it is. 7086 if (M[0] < 0) 7087 return false; 7088 7089 Imm = M[0]; 7090 7091 // If this is a VEXT shuffle, the immediate value is the index of the first 7092 // element. The other shuffle indices must be the successive elements after 7093 // the first one. 7094 unsigned ExpectedElt = Imm; 7095 for (unsigned i = 1; i < NumElts; ++i) { 7096 // Increment the expected index. If it wraps around, it may still be 7097 // a VEXT but the source vectors must be swapped. 7098 ExpectedElt += 1; 7099 if (ExpectedElt == NumElts * 2) { 7100 ExpectedElt = 0; 7101 ReverseVEXT = true; 7102 } 7103 7104 if (M[i] < 0) continue; // ignore UNDEF indices 7105 if (ExpectedElt != static_cast<unsigned>(M[i])) 7106 return false; 7107 } 7108 7109 // Adjust the index value if the source operands will be swapped. 7110 if (ReverseVEXT) 7111 Imm -= NumElts; 7112 7113 return true; 7114 } 7115 7116 static bool isVTBLMask(ArrayRef<int> M, EVT VT) { 7117 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of 7118 // range, then 0 is placed into the resulting vector. So pretty much any mask 7119 // of 8 elements can work here. 7120 return VT == MVT::v8i8 && M.size() == 8; 7121 } 7122 7123 static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask, 7124 unsigned Index) { 7125 if (Mask.size() == Elements * 2) 7126 return Index / Elements; 7127 return Mask[Index] == 0 ? 
0 : 1; 7128 } 7129 7130 // Checks whether the shuffle mask represents a vector transpose (VTRN) by 7131 // checking that pairs of elements in the shuffle mask represent the same index 7132 // in each vector, incrementing the expected index by 2 at each step. 7133 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6] 7134 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g} 7135 // v2={e,f,g,h} 7136 // WhichResult gives the offset for each element in the mask based on which 7137 // of the two results it belongs to. 7138 // 7139 // The transpose can be represented either as: 7140 // result1 = shufflevector v1, v2, result1_shuffle_mask 7141 // result2 = shufflevector v1, v2, result2_shuffle_mask 7142 // where v1/v2 and the shuffle masks have the same number of elements 7143 // (here WhichResult (see below) indicates which result is being checked) 7144 // 7145 // or as: 7146 // results = shufflevector v1, v2, shuffle_mask 7147 // where both results are returned in one vector and the shuffle mask has twice 7148 // as many elements as v1/v2 (here WhichResult will always be 0 if true) here we 7149 // want to check the low half and high half of the shuffle mask as if it were 7150 // the other case 7151 static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 7152 unsigned EltSz = VT.getScalarSizeInBits(); 7153 if (EltSz == 64) 7154 return false; 7155 7156 unsigned NumElts = VT.getVectorNumElements(); 7157 if (M.size() != NumElts && M.size() != NumElts*2) 7158 return false; 7159 7160 // If the mask is twice as long as the input vector then we need to check the 7161 // upper and lower parts of the mask with a matching value for WhichResult 7162 // FIXME: A mask with only even values will be rejected in case the first 7163 // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only 7164 // M[0] is used to determine WhichResult 7165 for (unsigned i = 0; i < M.size(); i += NumElts) { 7166 WhichResult = SelectPairHalf(NumElts, M, i); 7167 for (unsigned j = 0; j < NumElts; j += 2) { 7168 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || 7169 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult)) 7170 return false; 7171 } 7172 } 7173 7174 if (M.size() == NumElts*2) 7175 WhichResult = 0; 7176 7177 return true; 7178 } 7179 7180 /// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of 7181 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 7182 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. 7183 static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 7184 unsigned EltSz = VT.getScalarSizeInBits(); 7185 if (EltSz == 64) 7186 return false; 7187 7188 unsigned NumElts = VT.getVectorNumElements(); 7189 if (M.size() != NumElts && M.size() != NumElts*2) 7190 return false; 7191 7192 for (unsigned i = 0; i < M.size(); i += NumElts) { 7193 WhichResult = SelectPairHalf(NumElts, M, i); 7194 for (unsigned j = 0; j < NumElts; j += 2) { 7195 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || 7196 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult)) 7197 return false; 7198 } 7199 } 7200 7201 if (M.size() == NumElts*2) 7202 WhichResult = 0; 7203 7204 return true; 7205 } 7206 7207 // Checks whether the shuffle mask represents a vector unzip (VUZP) by checking 7208 // that the mask elements are either all even and in steps of size 2 or all odd 7209 // and in steps of size 2. 7210 // e.g. 
For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6] 7211 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g} 7212 // v2={e,f,g,h} 7213 // Requires similar checks to that of isVTRNMask with 7214 // respect to how results are returned. 7215 static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 7216 unsigned EltSz = VT.getScalarSizeInBits(); 7217 if (EltSz == 64) 7218 return false; 7219 7220 unsigned NumElts = VT.getVectorNumElements(); 7221 if (M.size() != NumElts && M.size() != NumElts*2) 7222 return false; 7223 7224 for (unsigned i = 0; i < M.size(); i += NumElts) { 7225 WhichResult = SelectPairHalf(NumElts, M, i); 7226 for (unsigned j = 0; j < NumElts; ++j) { 7227 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult) 7228 return false; 7229 } 7230 } 7231 7232 if (M.size() == NumElts*2) 7233 WhichResult = 0; 7234 7235 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 7236 if (VT.is64BitVector() && EltSz == 32) 7237 return false; 7238 7239 return true; 7240 } 7241 7242 /// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of 7243 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 7244 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>. 7245 static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 7246 unsigned EltSz = VT.getScalarSizeInBits(); 7247 if (EltSz == 64) 7248 return false; 7249 7250 unsigned NumElts = VT.getVectorNumElements(); 7251 if (M.size() != NumElts && M.size() != NumElts*2) 7252 return false; 7253 7254 unsigned Half = NumElts / 2; 7255 for (unsigned i = 0; i < M.size(); i += NumElts) { 7256 WhichResult = SelectPairHalf(NumElts, M, i); 7257 for (unsigned j = 0; j < NumElts; j += Half) { 7258 unsigned Idx = WhichResult; 7259 for (unsigned k = 0; k < Half; ++k) { 7260 int MIdx = M[i + j + k]; 7261 if (MIdx >= 0 && (unsigned) MIdx != Idx) 7262 return false; 7263 Idx += 2; 7264 } 7265 } 7266 } 7267 7268 if (M.size() == NumElts*2) 7269 WhichResult = 0; 7270 7271 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 7272 if (VT.is64BitVector() && EltSz == 32) 7273 return false; 7274 7275 return true; 7276 } 7277 7278 // Checks whether the shuffle mask represents a vector zip (VZIP) by checking 7279 // that pairs of elements of the shufflemask represent the same index in each 7280 // vector incrementing sequentially through the vectors. 7281 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5] 7282 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f} 7283 // v2={e,f,g,h} 7284 // Requires similar checks to that of isVTRNMask with respect to how results 7285 // are returned. 7286 static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 7287 unsigned EltSz = VT.getScalarSizeInBits(); 7288 if (EltSz == 64) 7289 return false; 7290 7291 unsigned NumElts = VT.getVectorNumElements(); 7292 if (M.size() != NumElts && M.size() != NumElts*2) 7293 return false; 7294 7295 for (unsigned i = 0; i < M.size(); i += NumElts) { 7296 WhichResult = SelectPairHalf(NumElts, M, i); 7297 unsigned Idx = WhichResult * NumElts / 2; 7298 for (unsigned j = 0; j < NumElts; j += 2) { 7299 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || 7300 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts)) 7301 return false; 7302 Idx += 1; 7303 } 7304 } 7305 7306 if (M.size() == NumElts*2) 7307 WhichResult = 0; 7308 7309 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7310 if (VT.is64BitVector() && EltSz == 32) 7311 return false; 7312 7313 return true; 7314 } 7315 7316 /// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of 7317 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 7318 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. 7319 static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 7320 unsigned EltSz = VT.getScalarSizeInBits(); 7321 if (EltSz == 64) 7322 return false; 7323 7324 unsigned NumElts = VT.getVectorNumElements(); 7325 if (M.size() != NumElts && M.size() != NumElts*2) 7326 return false; 7327 7328 for (unsigned i = 0; i < M.size(); i += NumElts) { 7329 WhichResult = SelectPairHalf(NumElts, M, i); 7330 unsigned Idx = WhichResult * NumElts / 2; 7331 for (unsigned j = 0; j < NumElts; j += 2) { 7332 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || 7333 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx)) 7334 return false; 7335 Idx += 1; 7336 } 7337 } 7338 7339 if (M.size() == NumElts*2) 7340 WhichResult = 0; 7341 7342 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 7343 if (VT.is64BitVector() && EltSz == 32) 7344 return false; 7345 7346 return true; 7347 } 7348 7349 /// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN), 7350 /// and return the corresponding ARMISD opcode if it is, or 0 if it isn't. 7351 static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT, 7352 unsigned &WhichResult, 7353 bool &isV_UNDEF) { 7354 isV_UNDEF = false; 7355 if (isVTRNMask(ShuffleMask, VT, WhichResult)) 7356 return ARMISD::VTRN; 7357 if (isVUZPMask(ShuffleMask, VT, WhichResult)) 7358 return ARMISD::VUZP; 7359 if (isVZIPMask(ShuffleMask, VT, WhichResult)) 7360 return ARMISD::VZIP; 7361 7362 isV_UNDEF = true; 7363 if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) 7364 return ARMISD::VTRN; 7365 if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) 7366 return ARMISD::VUZP; 7367 if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) 7368 return ARMISD::VZIP; 7369 7370 return 0; 7371 } 7372 7373 /// \return true if this is a reverse operation on an vector. 7374 static bool isReverseMask(ArrayRef<int> M, EVT VT) { 7375 unsigned NumElts = VT.getVectorNumElements(); 7376 // Make sure the mask has the right size. 7377 if (NumElts != M.size()) 7378 return false; 7379 7380 // Look for <15, ..., 3, -1, 1, 0>. 7381 for (unsigned i = 0; i != NumElts; ++i) 7382 if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i)) 7383 return false; 7384 7385 return true; 7386 } 7387 7388 static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) { 7389 unsigned NumElts = VT.getVectorNumElements(); 7390 // Make sure the mask has the right size. 7391 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8)) 7392 return false; 7393 7394 // If Top 7395 // Look for <0, N, 2, N+2, 4, N+4, ..>. 7396 // This inserts Input2 into Input1 7397 // else if not Top 7398 // Look for <0, N+1, 2, N+3, 4, N+5, ..> 7399 // This inserts Input1 into Input2 7400 unsigned Offset = Top ? 0 : 1; 7401 unsigned N = SingleSource ? 
0 : NumElts; 7402 for (unsigned i = 0; i < NumElts; i += 2) { 7403 if (M[i] >= 0 && M[i] != (int)i) 7404 return false; 7405 if (M[i + 1] >= 0 && M[i + 1] != (int)(N + i + Offset)) 7406 return false; 7407 } 7408 7409 return true; 7410 } 7411 7412 static bool isVMOVNTruncMask(ArrayRef<int> M, EVT ToVT, bool rev) { 7413 unsigned NumElts = ToVT.getVectorNumElements(); 7414 if (NumElts != M.size()) 7415 return false; 7416 7417 // Test if the Trunc can be converted to a VMOVN with this shuffle. We are 7418 // looking for patterns of: 7419 // !rev: 0 N/2 1 N/2+1 2 N/2+2 ... 7420 // rev: N/2 0 N/2+1 1 N/2+2 2 ... 7421 7422 unsigned Off0 = rev ? NumElts / 2 : 0; 7423 unsigned Off1 = rev ? 0 : NumElts / 2; 7424 for (unsigned i = 0; i < NumElts; i += 2) { 7425 if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2)) 7426 return false; 7427 if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2)) 7428 return false; 7429 } 7430 7431 return true; 7432 } 7433 7434 // Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted 7435 // from a pair of inputs. For example: 7436 // BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0), 7437 // FP_ROUND(EXTRACT_ELT(Y, 0), 7438 // FP_ROUND(EXTRACT_ELT(X, 1), 7439 // FP_ROUND(EXTRACT_ELT(Y, 1), ...) 7440 static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG, 7441 const ARMSubtarget *ST) { 7442 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 7443 if (!ST->hasMVEFloatOps()) 7444 return SDValue(); 7445 7446 SDLoc dl(BV); 7447 EVT VT = BV.getValueType(); 7448 if (VT != MVT::v8f16) 7449 return SDValue(); 7450 7451 // We are looking for a buildvector of fptrunc elements, where all the 7452 // elements are alternately extracted from two sources. Check the first two 7453 // items are valid enough and extract some info from them (they are checked 7454 // properly in the loop below). 7455 if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND || 7456 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT || 7457 BV.getOperand(0).getOperand(0).getConstantOperandVal(1) != 0) 7458 return SDValue(); 7459 if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND || 7460 BV.getOperand(1).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT || 7461 BV.getOperand(1).getOperand(0).getConstantOperandVal(1) != 0) 7462 return SDValue(); 7463 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0); 7464 SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0); 7465 if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32) 7466 return SDValue(); 7467 7468 // Check all the values in the BuildVector line up with our expectations. 7469 for (unsigned i = 1; i < 4; i++) { 7470 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) { 7471 return Trunc.getOpcode() == ISD::FP_ROUND && 7472 Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && 7473 Trunc.getOperand(0).getOperand(0) == Op && 7474 Trunc.getOperand(0).getConstantOperandVal(1) == Idx; 7475 }; 7476 if (!Check(BV.getOperand(i * 2 + 0), Op0, i)) 7477 return SDValue(); 7478 if (!Check(BV.getOperand(i * 2 + 1), Op1, i)) 7479 return SDValue(); 7480 } 7481 7482 SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0, 7483 DAG.getConstant(0, dl, MVT::i32)); 7484 return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1, 7485 DAG.getConstant(1, dl, MVT::i32)); 7486 } 7487 7488 // Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted 7489 // from a single input on alternating lanes.
For example: 7490 // BUILDVECTOR(FP_EXTEND(EXTRACT_ELT(X, 0), 7491 // FP_EXTEND(EXTRACT_ELT(X, 2), 7492 // FP_EXTEND(EXTRACT_ELT(X, 4), ...) 7493 static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG, 7494 const ARMSubtarget *ST) { 7495 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 7496 if (!ST->hasMVEFloatOps()) 7497 return SDValue(); 7498 7499 SDLoc dl(BV); 7500 EVT VT = BV.getValueType(); 7501 if (VT != MVT::v4f32) 7502 return SDValue(); 7503 7504 // We are looking for a buildvector of fpext elements, where all the 7505 // elements are alternating lanes from a single source. For example <0,2,4,6> 7506 // or <1,3,5,7>. Check the first two items are valid enough and extract some 7507 // info from them (they are checked properly in the loop below). 7508 if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND || 7509 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT) 7510 return SDValue(); 7511 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0); 7512 int Offset = BV.getOperand(0).getOperand(0).getConstantOperandVal(1); 7513 if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1)) 7514 return SDValue(); 7515 7516 // Check all the values in the BuildVector line up with our expectations. 7517 for (unsigned i = 1; i < 4; i++) { 7518 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) { 7519 return Trunc.getOpcode() == ISD::FP_EXTEND && 7520 Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && 7521 Trunc.getOperand(0).getOperand(0) == Op && 7522 Trunc.getOperand(0).getConstantOperandVal(1) == Idx; 7523 }; 7524 if (!Check(BV.getOperand(i), Op0, 2 * i + Offset)) 7525 return SDValue(); 7526 } 7527 7528 return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0, 7529 DAG.getConstant(Offset, dl, MVT::i32)); 7530 } 7531 7532 // If N is an integer constant that can be moved into a register in one 7533 // instruction, return an SDValue of such a constant (will become a MOV 7534 // instruction). Otherwise return null.
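// For example, 200 qualifies on every subtarget (it fits in 8 bits); outside Thumb1 a value such as 0x00FF0000 qualifies because it is a valid ARM modified immediate (an 8-bit value rotated right by an even amount); and the bitwise complement of any such value is also accepted, since it can be materialized with a single MVN.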
7535 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, 7536 const ARMSubtarget *ST, const SDLoc &dl) { 7537 uint64_t Val; 7538 if (!isa<ConstantSDNode>(N)) 7539 return SDValue(); 7540 Val = cast<ConstantSDNode>(N)->getZExtValue(); 7541 7542 if (ST->isThumb1Only()) { 7543 if (Val <= 255 || ~Val <= 255) 7544 return DAG.getConstant(Val, dl, MVT::i32); 7545 } else { 7546 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1) 7547 return DAG.getConstant(Val, dl, MVT::i32); 7548 } 7549 return SDValue(); 7550 } 7551 7552 static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, 7553 const ARMSubtarget *ST) { 7554 SDLoc dl(Op); 7555 EVT VT = Op.getValueType(); 7556 7557 assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!"); 7558 7559 unsigned NumElts = VT.getVectorNumElements(); 7560 unsigned BoolMask; 7561 unsigned BitsPerBool; 7562 if (NumElts == 4) { 7563 BitsPerBool = 4; 7564 BoolMask = 0xf; 7565 } else if (NumElts == 8) { 7566 BitsPerBool = 2; 7567 BoolMask = 0x3; 7568 } else if (NumElts == 16) { 7569 BitsPerBool = 1; 7570 BoolMask = 0x1; 7571 } else 7572 return SDValue(); 7573 7574 // If this is a single value copied into all lanes (a splat), we can just sign 7575 // extend that single value 7576 SDValue FirstOp = Op.getOperand(0); 7577 if (!isa<ConstantSDNode>(FirstOp) && 7578 std::all_of(std::next(Op->op_begin()), Op->op_end(), 7579 [&FirstOp](SDUse &U) { 7580 return U.get().isUndef() || U.get() == FirstOp; 7581 })) { 7582 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp, 7583 DAG.getValueType(MVT::i1)); 7584 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext); 7585 } 7586 7587 // First create base with bits set where known 7588 unsigned Bits32 = 0; 7589 for (unsigned i = 0; i < NumElts; ++i) { 7590 SDValue V = Op.getOperand(i); 7591 if (!isa<ConstantSDNode>(V) && !V.isUndef()) 7592 continue; 7593 bool BitSet = V.isUndef() ? 
false : cast<ConstantSDNode>(V)->getZExtValue(); 7594 if (BitSet) 7595 Bits32 |= BoolMask << (i * BitsPerBool); 7596 } 7597 7598 // Add in unknown nodes 7599 SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, 7600 DAG.getConstant(Bits32, dl, MVT::i32)); 7601 for (unsigned i = 0; i < NumElts; ++i) { 7602 SDValue V = Op.getOperand(i); 7603 if (isa<ConstantSDNode>(V) || V.isUndef()) 7604 continue; 7605 Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V, 7606 DAG.getConstant(i, dl, MVT::i32)); 7607 } 7608 7609 return Base; 7610 } 7611 7612 static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG, 7613 const ARMSubtarget *ST) { 7614 if (!ST->hasMVEIntegerOps()) 7615 return SDValue(); 7616 7617 // We are looking for a buildvector where each element is Op[0] + i*N 7618 EVT VT = Op.getValueType(); 7619 SDValue Op0 = Op.getOperand(0); 7620 unsigned NumElts = VT.getVectorNumElements(); 7621 7622 // Get the increment value from operand 1 7623 SDValue Op1 = Op.getOperand(1); 7624 if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(0) != Op0 || 7625 !isa<ConstantSDNode>(Op1.getOperand(1))) 7626 return SDValue(); 7627 unsigned N = Op1.getConstantOperandVal(1); 7628 if (N != 1 && N != 2 && N != 4 && N != 8) 7629 return SDValue(); 7630 7631 // Check that each other operand matches 7632 for (unsigned I = 2; I < NumElts; I++) { 7633 SDValue OpI = Op.getOperand(I); 7634 if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(0) != Op0 || 7635 !isa<ConstantSDNode>(OpI.getOperand(1)) || 7636 OpI.getConstantOperandVal(1) != I * N) 7637 return SDValue(); 7638 } 7639 7640 SDLoc DL(Op); 7641 return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0, 7642 DAG.getConstant(N, DL, MVT::i32)); 7643 } 7644 7645 // If this is a case we can't handle, return null and let the default 7646 // expansion code take care of it. 7647 SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, 7648 const ARMSubtarget *ST) const { 7649 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); 7650 SDLoc dl(Op); 7651 EVT VT = Op.getValueType(); 7652 7653 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) 7654 return LowerBUILD_VECTOR_i1(Op, DAG, ST); 7655 7656 if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST)) 7657 return R; 7658 7659 APInt SplatBits, SplatUndef; 7660 unsigned SplatBitSize; 7661 bool HasAnyUndefs; 7662 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 7663 if (SplatUndef.isAllOnesValue()) 7664 return DAG.getUNDEF(VT); 7665 7666 if ((ST->hasNEON() && SplatBitSize <= 64) || 7667 (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) { 7668 // Check if an immediate VMOV works. 7669 EVT VmovVT; 7670 SDValue Val = 7671 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), 7672 SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm); 7673 7674 if (Val.getNode()) { 7675 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); 7676 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 7677 } 7678 7679 // Try an immediate VMVN. 7680 uint64_t NegatedImm = (~SplatBits).getZExtValue(); 7681 Val = isVMOVModifiedImm( 7682 NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT, 7683 VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm); 7684 if (Val.getNode()) { 7685 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val); 7686 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 7687 } 7688 7689 // Use vmov.f32 to materialize other v2f32 and v4f32 splats. 
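// For example, splats of values like 1.0f or -0.5f have a valid 8-bit VFP immediate encoding and can be produced with a single vmov.f32 of the whole vector (illustrative values; the encodable set is exactly what ARM_AM::getFP32Imm accepts below).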
7690 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) { 7691 int ImmVal = ARM_AM::getFP32Imm(SplatBits); 7692 if (ImmVal != -1) { 7693 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32); 7694 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val); 7695 } 7696 } 7697 7698 // If we are under MVE, generate a VDUP(constant), bitcast to the original 7699 // type. 7700 if (ST->hasMVEIntegerOps() && 7701 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32)) { 7702 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32 7703 : SplatBitSize == 16 ? MVT::v8i16 7704 : MVT::v16i8; 7705 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32); 7706 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const); 7707 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup); 7708 } 7709 } 7710 } 7711 7712 // Scan through the operands to see if only one value is used. 7713 // 7714 // As an optimisation, even if more than one value is used it may be more 7715 // profitable to splat with one value then change some lanes. 7716 // 7717 // Heuristically we decide to do this if the vector has a "dominant" value, 7718 // defined as splatted to more than half of the lanes. 7719 unsigned NumElts = VT.getVectorNumElements(); 7720 bool isOnlyLowElement = true; 7721 bool usesOnlyOneValue = true; 7722 bool hasDominantValue = false; 7723 bool isConstant = true; 7724 7725 // Map of the number of times a particular SDValue appears in the 7726 // element list. 7727 DenseMap<SDValue, unsigned> ValueCounts; 7728 SDValue Value; 7729 for (unsigned i = 0; i < NumElts; ++i) { 7730 SDValue V = Op.getOperand(i); 7731 if (V.isUndef()) 7732 continue; 7733 if (i > 0) 7734 isOnlyLowElement = false; 7735 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) 7736 isConstant = false; 7737 7738 ValueCounts.insert(std::make_pair(V, 0)); 7739 unsigned &Count = ValueCounts[V]; 7740 7741 // Is this value dominant? (takes up more than half of the lanes) 7742 if (++Count > (NumElts / 2)) { 7743 hasDominantValue = true; 7744 Value = V; 7745 } 7746 } 7747 if (ValueCounts.size() != 1) 7748 usesOnlyOneValue = false; 7749 if (!Value.getNode() && !ValueCounts.empty()) 7750 Value = ValueCounts.begin()->first; 7751 7752 if (ValueCounts.empty()) 7753 return DAG.getUNDEF(VT); 7754 7755 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR. 7756 // Keep going if we are hitting this case. 7757 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode())) 7758 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); 7759 7760 unsigned EltSize = VT.getScalarSizeInBits(); 7761 7762 // Use VDUP for non-constant splats. For f32 constant splats, reduce to 7763 // i32 and try again. 7764 if (hasDominantValue && EltSize <= 32) { 7765 if (!isConstant) { 7766 SDValue N; 7767 7768 // If we are VDUPing a value that comes directly from a vector, that will 7769 // cause an unnecessary move to and from a GPR, where instead we could 7770 // just use VDUPLANE. We can only do this if the lane being extracted 7771 // is at a constant index, as the VDUP from lane instructions only have 7772 // constant-index forms. 7773 ConstantSDNode *constIndex; 7774 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT && 7775 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) { 7776 // We need to create a new undef vector to use for the VDUPLANE if the 7777 // size of the vector from which we get the value is different than the 7778 // size of the vector that we need to create. 
We will insert the element 7779 // such that the register coalescer will remove unnecessary copies. 7780 if (VT != Value->getOperand(0).getValueType()) { 7781 unsigned index = constIndex->getAPIntValue().getLimitedValue() % 7782 VT.getVectorNumElements(); 7783 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, 7784 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT), 7785 Value, DAG.getConstant(index, dl, MVT::i32)), 7786 DAG.getConstant(index, dl, MVT::i32)); 7787 } else 7788 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, 7789 Value->getOperand(0), Value->getOperand(1)); 7790 } else 7791 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value); 7792 7793 if (!usesOnlyOneValue) { 7794 // The dominant value was splatted as 'N', but we now have to insert 7795 // all differing elements. 7796 for (unsigned I = 0; I < NumElts; ++I) { 7797 if (Op.getOperand(I) == Value) 7798 continue; 7799 SmallVector<SDValue, 3> Ops; 7800 Ops.push_back(N); 7801 Ops.push_back(Op.getOperand(I)); 7802 Ops.push_back(DAG.getConstant(I, dl, MVT::i32)); 7803 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops); 7804 } 7805 } 7806 return N; 7807 } 7808 if (VT.getVectorElementType().isFloatingPoint()) { 7809 SmallVector<SDValue, 8> Ops; 7810 MVT FVT = VT.getVectorElementType().getSimpleVT(); 7811 assert(FVT == MVT::f32 || FVT == MVT::f16); 7812 MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16; 7813 for (unsigned i = 0; i < NumElts; ++i) 7814 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT, 7815 Op.getOperand(i))); 7816 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts); 7817 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); 7818 Val = LowerBUILD_VECTOR(Val, DAG, ST); 7819 if (Val.getNode()) 7820 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 7821 } 7822 if (usesOnlyOneValue) { 7823 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl); 7824 if (isConstant && Val.getNode()) 7825 return DAG.getNode(ARMISD::VDUP, dl, VT, Val); 7826 } 7827 } 7828 7829 // If all elements are constants and the case above didn't get hit, fall back 7830 // to the default expansion, which will generate a load from the constant 7831 // pool. 7832 if (isConstant) 7833 return SDValue(); 7834 7835 // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and 7836 // vmovn). Empirical tests suggest this is rarely worth it for vectors of 7837 // length <= 2. 7838 if (NumElts >= 4) 7839 if (SDValue shuffle = ReconstructShuffle(Op, DAG)) 7840 return shuffle; 7841 7842 // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into 7843 // VCVT's 7844 if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget)) 7845 return VCVT; 7846 if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget)) 7847 return VCVT; 7848 7849 if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) { 7850 // If we haven't found an efficient lowering, try splitting a 128-bit vector 7851 // into two 64-bit vectors; we might discover a better way to lower it. 
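// Each 64-bit half is built separately and, if it is still a plain BUILD_VECTOR, lowered recursively; only when both halves produce a usable value are they glued back together with CONCAT_VECTORS.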
7852 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts); 7853 EVT ExtVT = VT.getVectorElementType(); 7854 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2); 7855 SDValue Lower = 7856 DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2)); 7857 if (Lower.getOpcode() == ISD::BUILD_VECTOR) 7858 Lower = LowerBUILD_VECTOR(Lower, DAG, ST); 7859 SDValue Upper = DAG.getBuildVector( 7860 HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2)); 7861 if (Upper.getOpcode() == ISD::BUILD_VECTOR) 7862 Upper = LowerBUILD_VECTOR(Upper, DAG, ST); 7863 if (Lower && Upper) 7864 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper); 7865 } 7866 7867 // Vectors with 32- or 64-bit elements can be built by directly assigning 7868 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands 7869 // will be legalized. 7870 if (EltSize >= 32) { 7871 // Do the expansion with floating-point types, since that is what the VFP 7872 // registers are defined to use, and since i64 is not legal. 7873 EVT EltVT = EVT::getFloatingPointVT(EltSize); 7874 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); 7875 SmallVector<SDValue, 8> Ops; 7876 for (unsigned i = 0; i < NumElts; ++i) 7877 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i))); 7878 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); 7879 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 7880 } 7881 7882 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we 7883 // know the default expansion would otherwise fall back on something even 7884 // worse. For a vector with one or two non-undef values, that's 7885 // scalar_to_vector for the elements followed by a shuffle (provided the 7886 // shuffle is valid for the target) and materialization element by element 7887 // on the stack followed by a load for everything else. 7888 if (!isConstant && !usesOnlyOneValue) { 7889 SDValue Vec = DAG.getUNDEF(VT); 7890 for (unsigned i = 0 ; i < NumElts; ++i) { 7891 SDValue V = Op.getOperand(i); 7892 if (V.isUndef()) 7893 continue; 7894 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32); 7895 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); 7896 } 7897 return Vec; 7898 } 7899 7900 return SDValue(); 7901 } 7902 7903 // Gather data to see if the operation can be modelled as a 7904 // shuffle in combination with VEXTs. 7905 SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, 7906 SelectionDAG &DAG) const { 7907 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 7908 SDLoc dl(Op); 7909 EVT VT = Op.getValueType(); 7910 unsigned NumElts = VT.getVectorNumElements(); 7911 7912 struct ShuffleSourceInfo { 7913 SDValue Vec; 7914 unsigned MinElt = std::numeric_limits<unsigned>::max(); 7915 unsigned MaxElt = 0; 7916 7917 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to 7918 // be compatible with the shuffle we intend to construct. As a result 7919 // ShuffleVec will be some sliding window into the original Vec. 7920 SDValue ShuffleVec; 7921 7922 // Code should guarantee that element i in Vec starts at element "WindowBase 7923 // + i * WindowScale in ShuffleVec". 7924 int WindowBase = 0; 7925 int WindowScale = 1; 7926 7927 ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {} 7928 7929 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } 7930 }; 7931 7932 // First gather all vectors used as an immediate source for this BUILD_VECTOR 7933 // node. 
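// Every non-undef operand must be an EXTRACT_VECTOR_ELT with a constant index; anything else cannot be described by a shuffle mask, so we bail out.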
7934 SmallVector<ShuffleSourceInfo, 2> Sources; 7935 for (unsigned i = 0; i < NumElts; ++i) { 7936 SDValue V = Op.getOperand(i); 7937 if (V.isUndef()) 7938 continue; 7939 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { 7940 // A shuffle can only come from building a vector from various 7941 // elements of other vectors. 7942 return SDValue(); 7943 } else if (!isa<ConstantSDNode>(V.getOperand(1))) { 7944 // Furthermore, shuffles require a constant mask, whereas extractelts 7945 // accept variable indices. 7946 return SDValue(); 7947 } 7948 7949 // Add this element source to the list if it's not already there. 7950 SDValue SourceVec = V.getOperand(0); 7951 auto Source = llvm::find(Sources, SourceVec); 7952 if (Source == Sources.end()) 7953 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); 7954 7955 // Update the minimum and maximum lane number seen. 7956 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); 7957 Source->MinElt = std::min(Source->MinElt, EltNo); 7958 Source->MaxElt = std::max(Source->MaxElt, EltNo); 7959 } 7960 7961 // Currently only do something sane when at most two source vectors 7962 // are involved. 7963 if (Sources.size() > 2) 7964 return SDValue(); 7965 7966 // Find out the smallest element size among result and two sources, and use 7967 // it as element size to build the shuffle_vector. 7968 EVT SmallestEltTy = VT.getVectorElementType(); 7969 for (auto &Source : Sources) { 7970 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); 7971 if (SrcEltTy.bitsLT(SmallestEltTy)) 7972 SmallestEltTy = SrcEltTy; 7973 } 7974 unsigned ResMultiplier = 7975 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits(); 7976 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); 7977 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); 7978 7979 // If the source vector is too wide or too narrow, we may nevertheless be able 7980 // to construct a compatible shuffle either by concatenating it with UNDEF or 7981 // extracting a suitable range of elements. 7982 for (auto &Src : Sources) { 7983 EVT SrcVT = Src.ShuffleVec.getValueType(); 7984 7985 uint64_t SrcVTSize = SrcVT.getFixedSizeInBits(); 7986 uint64_t VTSize = VT.getFixedSizeInBits(); 7987 if (SrcVTSize == VTSize) 7988 continue; 7989 7990 // This stage of the search produces a source with the same element type as 7991 // the original, but with a total width matching the BUILD_VECTOR output. 7992 EVT EltVT = SrcVT.getVectorElementType(); 7993 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits(); 7994 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); 7995 7996 if (SrcVTSize < VTSize) { 7997 if (2 * SrcVTSize != VTSize) 7998 return SDValue(); 7999 // We can pad out the smaller vector for free, so if it's part of a 8000 // shuffle... 
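// ...we simply concatenate it with UNDEF to double its width. The extra lanes are never referenced by the mask we build later, so the padding costs nothing.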
8001 Src.ShuffleVec = 8002 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, 8003 DAG.getUNDEF(Src.ShuffleVec.getValueType())); 8004 continue; 8005 } 8006 8007 if (SrcVTSize != 2 * VTSize) 8008 return SDValue(); 8009 8010 if (Src.MaxElt - Src.MinElt >= NumSrcElts) { 8011 // Span too large for a VEXT to cope 8012 return SDValue(); 8013 } 8014 8015 if (Src.MinElt >= NumSrcElts) { 8016 // The extraction can just take the second half 8017 Src.ShuffleVec = 8018 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 8019 DAG.getConstant(NumSrcElts, dl, MVT::i32)); 8020 Src.WindowBase = -NumSrcElts; 8021 } else if (Src.MaxElt < NumSrcElts) { 8022 // The extraction can just take the first half 8023 Src.ShuffleVec = 8024 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 8025 DAG.getConstant(0, dl, MVT::i32)); 8026 } else { 8027 // An actual VEXT is needed 8028 SDValue VEXTSrc1 = 8029 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 8030 DAG.getConstant(0, dl, MVT::i32)); 8031 SDValue VEXTSrc2 = 8032 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 8033 DAG.getConstant(NumSrcElts, dl, MVT::i32)); 8034 8035 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1, 8036 VEXTSrc2, 8037 DAG.getConstant(Src.MinElt, dl, MVT::i32)); 8038 Src.WindowBase = -Src.MinElt; 8039 } 8040 } 8041 8042 // Another possible incompatibility occurs from the vector element types. We 8043 // can fix this by bitcasting the source vectors to the same type we intend 8044 // for the shuffle. 8045 for (auto &Src : Sources) { 8046 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); 8047 if (SrcEltTy == SmallestEltTy) 8048 continue; 8049 assert(ShuffleVT.getVectorElementType() == SmallestEltTy); 8050 Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec); 8051 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); 8052 Src.WindowBase *= Src.WindowScale; 8053 } 8054 8055 // Final sanity check before we try to actually produce a shuffle. 8056 LLVM_DEBUG(for (auto Src 8057 : Sources) 8058 assert(Src.ShuffleVec.getValueType() == ShuffleVT);); 8059 8060 // The stars all align, our next step is to produce the mask for the shuffle. 8061 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); 8062 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits(); 8063 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { 8064 SDValue Entry = Op.getOperand(i); 8065 if (Entry.isUndef()) 8066 continue; 8067 8068 auto Src = llvm::find(Sources, Entry.getOperand(0)); 8069 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); 8070 8071 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit 8072 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this 8073 // segment. 8074 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); 8075 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(), 8076 VT.getScalarSizeInBits()); 8077 int LanesDefined = BitsDefined / BitsPerShuffleLane; 8078 8079 // This source is expected to fill ResMultiplier lanes of the final shuffle, 8080 // starting at the appropriate offset. 8081 int *LaneMask = &Mask[i * ResMultiplier]; 8082 8083 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; 8084 ExtractBase += NumElts * (Src - Sources.begin()); 8085 for (int j = 0; j < LanesDefined; ++j) 8086 LaneMask[j] = ExtractBase + j; 8087 } 8088 8089 8090 // We can't handle more than two sources. 
This should have already 8091 // been checked before this point. 8092 assert(Sources.size() <= 2 && "Too many sources!"); 8093 8094 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; 8095 for (unsigned i = 0; i < Sources.size(); ++i) 8096 ShuffleOps[i] = Sources[i].ShuffleVec; 8097 8098 SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0], 8099 ShuffleOps[1], Mask, DAG); 8100 if (!Shuffle) 8101 return SDValue(); 8102 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle); 8103 } 8104 8105 enum ShuffleOpCodes { 8106 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> 8107 OP_VREV, 8108 OP_VDUP0, 8109 OP_VDUP1, 8110 OP_VDUP2, 8111 OP_VDUP3, 8112 OP_VEXT1, 8113 OP_VEXT2, 8114 OP_VEXT3, 8115 OP_VUZPL, // VUZP, left result 8116 OP_VUZPR, // VUZP, right result 8117 OP_VZIPL, // VZIP, left result 8118 OP_VZIPR, // VZIP, right result 8119 OP_VTRNL, // VTRN, left result 8120 OP_VTRNR // VTRN, right result 8121 }; 8122 8123 static bool isLegalMVEShuffleOp(unsigned PFEntry) { 8124 unsigned OpNum = (PFEntry >> 26) & 0x0F; 8125 switch (OpNum) { 8126 case OP_COPY: 8127 case OP_VREV: 8128 case OP_VDUP0: 8129 case OP_VDUP1: 8130 case OP_VDUP2: 8131 case OP_VDUP3: 8132 return true; 8133 } 8134 return false; 8135 } 8136 8137 /// isShuffleMaskLegal - Targets can use this to indicate that they only 8138 /// support *some* VECTOR_SHUFFLE operations, those with specific masks. 8139 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 8140 /// are assumed to be legal. 8141 bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { 8142 if (VT.getVectorNumElements() == 4 && 8143 (VT.is128BitVector() || VT.is64BitVector())) { 8144 unsigned PFIndexes[4]; 8145 for (unsigned i = 0; i != 4; ++i) { 8146 if (M[i] < 0) 8147 PFIndexes[i] = 8; 8148 else 8149 PFIndexes[i] = M[i]; 8150 } 8151 8152 // Compute the index in the perfect shuffle table. 8153 unsigned PFTableIndex = 8154 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 8155 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 8156 unsigned Cost = (PFEntry >> 30); 8157 8158 if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry))) 8159 return true; 8160 } 8161 8162 bool ReverseVEXT, isV_UNDEF; 8163 unsigned Imm, WhichResult; 8164 8165 unsigned EltSize = VT.getScalarSizeInBits(); 8166 if (EltSize >= 32 || 8167 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 8168 ShuffleVectorInst::isIdentityMask(M) || 8169 isVREVMask(M, VT, 64) || 8170 isVREVMask(M, VT, 32) || 8171 isVREVMask(M, VT, 16)) 8172 return true; 8173 else if (Subtarget->hasNEON() && 8174 (isVEXTMask(M, VT, ReverseVEXT, Imm) || 8175 isVTBLMask(M, VT) || 8176 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF))) 8177 return true; 8178 else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && 8179 isReverseMask(M, VT)) 8180 return true; 8181 else if (Subtarget->hasMVEIntegerOps() && 8182 (isVMOVNMask(M, VT, true, false) || 8183 isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true))) 8184 return true; 8185 else 8186 return false; 8187 } 8188 8189 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit 8190 /// the specified operations to build the shuffle. 
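/// Each PFEntry packs the cost into bits [31:30], the opcode (one of the ShuffleOpCodes values above) into bits [29:26], and the left/right operand table indices into bits [25:13] and [12:0] respectively.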
8191 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, 8192 SDValue RHS, SelectionDAG &DAG, 8193 const SDLoc &dl) { 8194 unsigned OpNum = (PFEntry >> 26) & 0x0F; 8195 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); 8196 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); 8197 8198 if (OpNum == OP_COPY) { 8199 if (LHSID == (1*9+2)*9+3) return LHS; 8200 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!"); 8201 return RHS; 8202 } 8203 8204 SDValue OpLHS, OpRHS; 8205 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); 8206 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); 8207 EVT VT = OpLHS.getValueType(); 8208 8209 switch (OpNum) { 8210 default: llvm_unreachable("Unknown shuffle opcode!"); 8211 case OP_VREV: 8212 // VREV divides the vector in half and swaps within the half. 8213 if (VT.getVectorElementType() == MVT::i32 || 8214 VT.getVectorElementType() == MVT::f32) 8215 return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS); 8216 // vrev <4 x i16> -> VREV32 8217 if (VT.getVectorElementType() == MVT::i16 || 8218 VT.getVectorElementType() == MVT::f16) 8219 return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS); 8220 // vrev <4 x i8> -> VREV16 8221 assert(VT.getVectorElementType() == MVT::i8); 8222 return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS); 8223 case OP_VDUP0: 8224 case OP_VDUP1: 8225 case OP_VDUP2: 8226 case OP_VDUP3: 8227 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, 8228 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32)); 8229 case OP_VEXT1: 8230 case OP_VEXT2: 8231 case OP_VEXT3: 8232 return DAG.getNode(ARMISD::VEXT, dl, VT, 8233 OpLHS, OpRHS, 8234 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32)); 8235 case OP_VUZPL: 8236 case OP_VUZPR: 8237 return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), 8238 OpLHS, OpRHS).getValue(OpNum-OP_VUZPL); 8239 case OP_VZIPL: 8240 case OP_VZIPR: 8241 return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), 8242 OpLHS, OpRHS).getValue(OpNum-OP_VZIPL); 8243 case OP_VTRNL: 8244 case OP_VTRNR: 8245 return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), 8246 OpLHS, OpRHS).getValue(OpNum-OP_VTRNL); 8247 } 8248 } 8249 8250 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, 8251 ArrayRef<int> ShuffleMask, 8252 SelectionDAG &DAG) { 8253 // Check to see if we can use the VTBL instruction. 8254 SDValue V1 = Op.getOperand(0); 8255 SDValue V2 = Op.getOperand(1); 8256 SDLoc DL(Op); 8257 8258 SmallVector<SDValue, 8> VTBLMask; 8259 for (ArrayRef<int>::iterator 8260 I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I) 8261 VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32)); 8262 8263 if (V2.getNode()->isUndef()) 8264 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1, 8265 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); 8266 8267 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2, 8268 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); 8269 } 8270 8271 static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, 8272 SelectionDAG &DAG) { 8273 SDLoc DL(Op); 8274 SDValue OpLHS = Op.getOperand(0); 8275 EVT VT = OpLHS.getValueType(); 8276 8277 assert((VT == MVT::v8i16 || VT == MVT::v16i8) && 8278 "Expect an v8i16/v16i8 type"); 8279 OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS); 8280 // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now, 8281 // extract the first 8 bytes into the top double word and the last 8 bytes 8282 // into the bottom double word. The v8i16 case is similar. 8283 unsigned ExtractNum = (VT == MVT::v16i8) ? 
8 : 4; 8284 return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS, 8285 DAG.getConstant(ExtractNum, DL, MVT::i32)); 8286 } 8287 8288 static EVT getVectorTyFromPredicateVector(EVT VT) { 8289 switch (VT.getSimpleVT().SimpleTy) { 8290 case MVT::v4i1: 8291 return MVT::v4i32; 8292 case MVT::v8i1: 8293 return MVT::v8i16; 8294 case MVT::v16i1: 8295 return MVT::v16i8; 8296 default: 8297 llvm_unreachable("Unexpected vector predicate type"); 8298 } 8299 } 8300 8301 static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT, 8302 SelectionDAG &DAG) { 8303 // Converting from boolean predicates to integers involves creating a vector 8304 // of all ones or all zeroes and selecting the lanes based upon the real 8305 // predicate. 8306 SDValue AllOnes = 8307 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32); 8308 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes); 8309 8310 SDValue AllZeroes = 8311 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32); 8312 AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes); 8313 8314 // Get full vector type from predicate type 8315 EVT NewVT = getVectorTyFromPredicateVector(VT); 8316 8317 SDValue RecastV1; 8318 // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast 8319 // this to a v16i1. This cannot be done with an ordinary bitcast because the 8320 // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node, 8321 // since we know in hardware the sizes are really the same. 8322 if (VT != MVT::v16i1) 8323 RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred); 8324 else 8325 RecastV1 = Pred; 8326 8327 // Select either all ones or zeroes depending upon the real predicate bits. 8328 SDValue PredAsVector = 8329 DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes); 8330 8331 // Recast our new predicate-as-integer v16i8 vector into something 8332 // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate. 8333 return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector); 8334 } 8335 8336 static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, 8337 const ARMSubtarget *ST) { 8338 EVT VT = Op.getValueType(); 8339 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 8340 ArrayRef<int> ShuffleMask = SVN->getMask(); 8341 8342 assert(ST->hasMVEIntegerOps() && 8343 "No support for vector shuffle of boolean predicates"); 8344 8345 SDValue V1 = Op.getOperand(0); 8346 SDLoc dl(Op); 8347 if (isReverseMask(ShuffleMask, VT)) { 8348 SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1); 8349 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast); 8350 SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit, 8351 DAG.getConstant(16, dl, MVT::i32)); 8352 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl); 8353 } 8354 8355 // Until we can come up with optimised cases for every single vector 8356 // shuffle in existence we have chosen the least painful strategy. This is 8357 // to essentially promote the boolean predicate to a 8-bit integer, where 8358 // each predicate represents a byte. Then we fall back on a normal integer 8359 // vector shuffle and convert the result back into a predicate vector. In 8360 // many cases the generated code might be even better than scalar code 8361 // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit 8362 // fields in a register into 8 other arbitrary 2-bit fields! 
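// The sequence below therefore: widens the predicate to a byte-per-lane vector, shuffles it as ordinary integer data, and compares the result against zero to turn it back into a predicate.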
8363 SDValue PredAsVector = PromoteMVEPredVector(dl, V1, VT, DAG); 8364 EVT NewVT = PredAsVector.getValueType(); 8365 8366 // Do the shuffle! 8367 SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector, 8368 DAG.getUNDEF(NewVT), ShuffleMask); 8369 8370 // Now return the result of comparing the shuffled vector with zero, 8371 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. 8372 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled, 8373 DAG.getConstant(ARMCC::NE, dl, MVT::i32)); 8374 } 8375 8376 static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op, 8377 ArrayRef<int> ShuffleMask, 8378 SelectionDAG &DAG) { 8379 // Attempt to lower the vector shuffle using as many whole register movs as 8380 // possible. This is useful for types smaller than 32 bits, which would 8381 // often otherwise become a series of GPR movs. 8382 SDLoc dl(Op); 8383 EVT VT = Op.getValueType(); 8384 if (VT.getScalarSizeInBits() >= 32) 8385 return SDValue(); 8386 8387 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) && 8388 "Unexpected vector type"); 8389 int NumElts = VT.getVectorNumElements(); 8390 int QuarterSize = NumElts / 4; 8391 // The four final parts of the vector, as i32's 8392 SDValue Parts[4]; 8393 8394 // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc, (but not 8395 // <u,u,u,u>), returning the vmov lane index 8396 auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) { 8397 // Detect which mov lane this would be from the first non-undef element. 8398 int MovIdx = -1; 8399 for (int i = 0; i < Length; i++) { 8400 if (ShuffleMask[Start + i] >= 0) { 8401 if (ShuffleMask[Start + i] % Length != i) 8402 return -1; 8403 MovIdx = ShuffleMask[Start + i] / Length; 8404 break; 8405 } 8406 } 8407 // If all items are undef, leave this for other combines 8408 if (MovIdx == -1) 8409 return -1; 8410 // Check the remaining values are the correct part of the same mov 8411 for (int i = 1; i < Length; i++) { 8412 if (ShuffleMask[Start + i] >= 0 && 8413 (ShuffleMask[Start + i] / Length != MovIdx || 8414 ShuffleMask[Start + i] % Length != i)) 8415 return -1; 8416 } 8417 return MovIdx; 8418 }; 8419 8420 for (int Part = 0; Part < 4; ++Part) { 8421 // Does this part look like a mov 8422 int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize); 8423 if (Elt != -1) { 8424 SDValue Input = Op->getOperand(0); 8425 if (Elt >= 4) { 8426 Input = Op->getOperand(1); 8427 Elt -= 4; 8428 } 8429 SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input); 8430 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast, 8431 DAG.getConstant(Elt, dl, MVT::i32)); 8432 } 8433 } 8434 8435 // Nothing interesting found, just return 8436 if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3]) 8437 return SDValue(); 8438 8439 // The other parts need to be built with the old shuffle vector, cast to a 8440 // v4i32 and extract_vector_elts 8441 if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) { 8442 SmallVector<int, 16> NewShuffleMask; 8443 for (int Part = 0; Part < 4; ++Part) 8444 for (int i = 0; i < QuarterSize; i++) 8445 NewShuffleMask.push_back( 8446 Parts[Part] ?
-1 : ShuffleMask[Part * QuarterSize + i]); 8447 SDValue NewShuffle = DAG.getVectorShuffle( 8448 VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask); 8449 SDValue BitCast = DAG.getBitcast(MVT::v4f32, NewShuffle); 8450 8451 for (int Part = 0; Part < 4; ++Part) 8452 if (!Parts[Part]) 8453 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, 8454 BitCast, DAG.getConstant(Part, dl, MVT::i32)); 8455 } 8456 // Build a vector out of the various parts and bitcast it back to the original 8457 // type. 8458 SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR, dl, MVT::v4f32, Parts); 8459 return DAG.getBitcast(VT, NewVec); 8460 } 8461 8462 static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op, 8463 ArrayRef<int> ShuffleMask, 8464 SelectionDAG &DAG) { 8465 SDValue V1 = Op.getOperand(0); 8466 SDValue V2 = Op.getOperand(1); 8467 EVT VT = Op.getValueType(); 8468 unsigned NumElts = VT.getVectorNumElements(); 8469 8470 // A One-Off Identity mask is one that is mostly an identity mask from a 8471 // single source but contains a single element out-of-place, either from a 8472 // different vector or from another position in the same vector. As opposed to 8473 // lowering this via an ARMISD::BUILD_VECTOR we can generate an extract/insert 8474 // pair directly. 8475 auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset, 8476 int &OffElement) { 8477 OffElement = -1; 8478 int NonUndef = 0; 8479 for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) { 8480 if (Mask[i] == -1) 8481 continue; 8482 NonUndef++; 8483 if (Mask[i] != i + BaseOffset) { 8484 if (OffElement == -1) 8485 OffElement = i; 8486 else 8487 return false; 8488 } 8489 } 8490 return NonUndef > 2 && OffElement != -1; 8491 }; 8492 int OffElement; 8493 SDValue VInput; 8494 if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement)) 8495 VInput = V1; 8496 else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement)) 8497 VInput = V2; 8498 else 8499 return SDValue(); 8500 8501 SDLoc dl(Op); 8502 EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16 8503 ? MVT::i32 8504 : VT.getScalarType(); 8505 SDValue Elt = DAG.getNode( 8506 ISD::EXTRACT_VECTOR_ELT, dl, SVT, 8507 ShuffleMask[OffElement] < (int)NumElts ? V1 : V2, 8508 DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl)); 8509 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt, 8510 DAG.getVectorIdxConstant(OffElement % NumElts, dl)); 8511 } 8512 8513 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, 8514 const ARMSubtarget *ST) { 8515 SDValue V1 = Op.getOperand(0); 8516 SDValue V2 = Op.getOperand(1); 8517 SDLoc dl(Op); 8518 EVT VT = Op.getValueType(); 8519 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 8520 unsigned EltSize = VT.getScalarSizeInBits(); 8521 8522 if (ST->hasMVEIntegerOps() && EltSize == 1) 8523 return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST); 8524 8525 // Convert shuffles that are directly supported on NEON to target-specific 8526 // DAG nodes, instead of keeping them as shuffles and matching them again 8527 // during code selection. This is more efficient and avoids the possibility 8528 // of inconsistencies between legalization and selection. 8529 // FIXME: floating-point vectors should be canonicalized to integer vectors 8530 // of the same size so that they get CSEd properly.
8531 ArrayRef<int> ShuffleMask = SVN->getMask(); 8532 8533 if (EltSize <= 32) { 8534 if (SVN->isSplat()) { 8535 int Lane = SVN->getSplatIndex(); 8536 // If this is undef splat, generate it via "just" vdup, if possible. 8537 if (Lane == -1) Lane = 0; 8538 8539 // Test if V1 is a SCALAR_TO_VECTOR. 8540 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { 8541 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); 8542 } 8543 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR 8544 // (and probably will turn into a SCALAR_TO_VECTOR once legalization 8545 // reaches it). 8546 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR && 8547 !isa<ConstantSDNode>(V1.getOperand(0))) { 8548 bool IsScalarToVector = true; 8549 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i) 8550 if (!V1.getOperand(i).isUndef()) { 8551 IsScalarToVector = false; 8552 break; 8553 } 8554 if (IsScalarToVector) 8555 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); 8556 } 8557 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, 8558 DAG.getConstant(Lane, dl, MVT::i32)); 8559 } 8560 8561 bool ReverseVEXT = false; 8562 unsigned Imm = 0; 8563 if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { 8564 if (ReverseVEXT) 8565 std::swap(V1, V2); 8566 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, 8567 DAG.getConstant(Imm, dl, MVT::i32)); 8568 } 8569 8570 if (isVREVMask(ShuffleMask, VT, 64)) 8571 return DAG.getNode(ARMISD::VREV64, dl, VT, V1); 8572 if (isVREVMask(ShuffleMask, VT, 32)) 8573 return DAG.getNode(ARMISD::VREV32, dl, VT, V1); 8574 if (isVREVMask(ShuffleMask, VT, 16)) 8575 return DAG.getNode(ARMISD::VREV16, dl, VT, V1); 8576 8577 if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) { 8578 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1, 8579 DAG.getConstant(Imm, dl, MVT::i32)); 8580 } 8581 8582 // Check for Neon shuffles that modify both input vectors in place. 8583 // If both results are used, i.e., if there are two shuffles with the same 8584 // source operands and with masks corresponding to both results of one of 8585 // these operations, DAG memoization will ensure that a single node is 8586 // used for both shuffles. 8587 unsigned WhichResult = 0; 8588 bool isV_UNDEF = false; 8589 if (ST->hasNEON()) { 8590 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( 8591 ShuffleMask, VT, WhichResult, isV_UNDEF)) { 8592 if (isV_UNDEF) 8593 V2 = V1; 8594 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2) 8595 .getValue(WhichResult); 8596 } 8597 } 8598 if (ST->hasMVEIntegerOps()) { 8599 if (isVMOVNMask(ShuffleMask, VT, false, false)) 8600 return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1, 8601 DAG.getConstant(0, dl, MVT::i32)); 8602 if (isVMOVNMask(ShuffleMask, VT, true, false)) 8603 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2, 8604 DAG.getConstant(1, dl, MVT::i32)); 8605 if (isVMOVNMask(ShuffleMask, VT, true, true)) 8606 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V1, 8607 DAG.getConstant(1, dl, MVT::i32)); 8608 } 8609 8610 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize 8611 // shuffles that produce a result larger than their operands with: 8612 // shuffle(concat(v1, undef), concat(v2, undef)) 8613 // -> 8614 // shuffle(concat(v1, v2), undef) 8615 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine). 8616 // 8617 // This is useful in the general case, but there are special cases where 8618 // native shuffles produce larger results: the two-result ops. 
8619 // 8620 // Look through the concat when lowering them: 8621 // shuffle(concat(v1, v2), undef) 8622 // -> 8623 // concat(VZIP(v1, v2):0, :1) 8624 // 8625 if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) { 8626 SDValue SubV1 = V1->getOperand(0); 8627 SDValue SubV2 = V1->getOperand(1); 8628 EVT SubVT = SubV1.getValueType(); 8629 8630 // We expect these to have been canonicalized to -1. 8631 assert(llvm::all_of(ShuffleMask, [&](int i) { 8632 return i < (int)VT.getVectorNumElements(); 8633 }) && "Unexpected shuffle index into UNDEF operand!"); 8634 8635 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( 8636 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) { 8637 if (isV_UNDEF) 8638 SubV2 = SubV1; 8639 assert((WhichResult == 0) && 8640 "In-place shuffle of concat can only have one result!"); 8641 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT), 8642 SubV1, SubV2); 8643 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0), 8644 Res.getValue(1)); 8645 } 8646 } 8647 } 8648 8649 if (ST->hasMVEIntegerOps() && EltSize <= 32) 8650 if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG)) 8651 return V; 8652 8653 // If the shuffle is not directly supported and it has 4 elements, use 8654 // the PerfectShuffle-generated table to synthesize it from other shuffles. 8655 unsigned NumElts = VT.getVectorNumElements(); 8656 if (NumElts == 4) { 8657 unsigned PFIndexes[4]; 8658 for (unsigned i = 0; i != 4; ++i) { 8659 if (ShuffleMask[i] < 0) 8660 PFIndexes[i] = 8; 8661 else 8662 PFIndexes[i] = ShuffleMask[i]; 8663 } 8664 8665 // Compute the index in the perfect shuffle table. 8666 unsigned PFTableIndex = 8667 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 8668 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 8669 unsigned Cost = (PFEntry >> 30); 8670 8671 if (Cost <= 4) { 8672 if (ST->hasNEON()) 8673 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 8674 else if (isLegalMVEShuffleOp(PFEntry)) { 8675 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); 8676 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); 8677 unsigned PFEntryLHS = PerfectShuffleTable[LHSID]; 8678 unsigned PFEntryRHS = PerfectShuffleTable[RHSID]; 8679 if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS)) 8680 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 8681 } 8682 } 8683 } 8684 8685 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs. 8686 if (EltSize >= 32) { 8687 // Do the expansion with floating-point types, since that is what the VFP 8688 // registers are defined to use, and since i64 is not legal. 8689 EVT EltVT = EVT::getFloatingPointVT(EltSize); 8690 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); 8691 V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1); 8692 V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2); 8693 SmallVector<SDValue, 8> Ops; 8694 for (unsigned i = 0; i < NumElts; ++i) { 8695 if (ShuffleMask[i] < 0) 8696 Ops.push_back(DAG.getUNDEF(EltVT)); 8697 else 8698 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, 8699 ShuffleMask[i] < (int)NumElts ? 
V1 : V2, 8700 DAG.getConstant(ShuffleMask[i] & (NumElts-1), 8701 dl, MVT::i32))); 8702 } 8703 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); 8704 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 8705 } 8706 8707 if (ST->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT)) 8708 return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG); 8709 8710 if (ST->hasNEON() && VT == MVT::v8i8) 8711 if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG)) 8712 return NewOp; 8713 8714 if (ST->hasMVEIntegerOps()) 8715 if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG)) 8716 return NewOp; 8717 8718 return SDValue(); 8719 } 8720 8721 static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, 8722 const ARMSubtarget *ST) { 8723 EVT VecVT = Op.getOperand(0).getValueType(); 8724 SDLoc dl(Op); 8725 8726 assert(ST->hasMVEIntegerOps() && 8727 "LowerINSERT_VECTOR_ELT_i1 called without MVE!"); 8728 8729 SDValue Conv = 8730 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0)); 8731 unsigned Lane = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 8732 unsigned LaneWidth = 8733 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8; 8734 unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth; 8735 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, 8736 Op.getOperand(1), DAG.getValueType(MVT::i1)); 8737 SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext, 8738 DAG.getConstant(~Mask, dl, MVT::i32)); 8739 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI); 8740 } 8741 8742 SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, 8743 SelectionDAG &DAG) const { 8744 // INSERT_VECTOR_ELT is legal only for immediate indexes. 8745 SDValue Lane = Op.getOperand(2); 8746 if (!isa<ConstantSDNode>(Lane)) 8747 return SDValue(); 8748 8749 SDValue Elt = Op.getOperand(1); 8750 EVT EltVT = Elt.getValueType(); 8751 8752 if (Subtarget->hasMVEIntegerOps() && 8753 Op.getValueType().getScalarSizeInBits() == 1) 8754 return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget); 8755 8756 if (getTypeAction(*DAG.getContext(), EltVT) == 8757 TargetLowering::TypePromoteFloat) { 8758 // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32, 8759 // but the type system will try to do that if we don't intervene. 8760 // Reinterpret any such vector-element insertion as one with the 8761 // corresponding integer types. 
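    // For instance (an illustrative case): inserting an f16 element into a
    // v4f16 vector is rewritten below as an i16 insertion into a v4i16 vector,
    // with bitcasts on both the element and the vector, and the result is
    // bitcast back to v4f16.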

    SDLoc dl(Op);

    EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
    assert(getTypeAction(*DAG.getContext(), IEltVT) !=
           TargetLowering::TypePromoteFloat);

    SDValue VecIn = Op.getOperand(0);
    EVT VecVT = VecIn.getValueType();
    EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,
                                  VecVT.getVectorNumElements());

    SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);
    SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);
    SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT,
                                  IVecIn, IElt, Lane);
    return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);
  }

  return Op;
}

static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
                                           const ARMSubtarget *ST) {
  EVT VecVT = Op.getOperand(0).getValueType();
  SDLoc dl(Op);

  assert(ST->hasMVEIntegerOps() &&
         "LowerEXTRACT_VECTOR_ELT_i1 called without MVE!");

  SDValue Conv =
      DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
  unsigned Lane = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  unsigned LaneWidth =
      getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
  SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
                              DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
  return Shift;
}

static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG,
                                       const ARMSubtarget *ST) {
  // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
  SDValue Lane = Op.getOperand(1);
  if (!isa<ConstantSDNode>(Lane))
    return SDValue();

  SDValue Vec = Op.getOperand(0);
  EVT VT = Vec.getValueType();

  if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
    return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);

  if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
    SDLoc dl(Op);
    return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
  }

  return Op;
}

static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
                                      const ARMSubtarget *ST) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  EVT Op1VT = V1.getValueType();
  EVT Op2VT = V2.getValueType();
  unsigned NumElts = VT.getVectorNumElements();

  assert(Op1VT == Op2VT && "Operand types don't match!");
  assert(VT.getScalarSizeInBits() == 1 &&
         "Unexpected custom CONCAT_VECTORS lowering");
  assert(ST->hasMVEIntegerOps() &&
         "CONCAT_VECTORS lowering only supported for MVE");

  SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
  SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);

  // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
  // promoted to v8i16, etc.

  MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();

  // Extract the vector elements from Op1 and Op2 one by one and truncate them
  // to be the right size for the destination. For example, if Op1 is v4i1 then
  // the promoted vector is v4i32. The result of concatenation gives a v8i1,
  // which when promoted is v8i16. That means each i32 element from Op1 needs
  // truncating to i16 and inserting in the result.
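  // A rough illustration of the steps below (types assumed for concreteness):
  // concatenating two v4i1 operands first promotes each to v4i32, then copies
  // each i32 lane into a v8i16 ConVec (implicitly truncating to i16), and
  // finally compares ConVec against zero with VCMPZ/NE to produce the v8i1
  // result.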
8852 EVT ConcatVT = MVT::getVectorVT(ElType, NumElts); 8853 SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT); 8854 auto ExractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) { 8855 EVT NewVT = NewV.getValueType(); 8856 EVT ConcatVT = ConVec.getValueType(); 8857 for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) { 8858 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV, 8859 DAG.getIntPtrConstant(i, dl)); 8860 ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt, 8861 DAG.getConstant(j, dl, MVT::i32)); 8862 } 8863 return ConVec; 8864 }; 8865 unsigned j = 0; 8866 ConVec = ExractInto(NewV1, ConVec, j); 8867 ConVec = ExractInto(NewV2, ConVec, j); 8868 8869 // Now return the result of comparing the subvector with zero, 8870 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. 8871 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec, 8872 DAG.getConstant(ARMCC::NE, dl, MVT::i32)); 8873 } 8874 8875 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, 8876 const ARMSubtarget *ST) { 8877 EVT VT = Op->getValueType(0); 8878 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) 8879 return LowerCONCAT_VECTORS_i1(Op, DAG, ST); 8880 8881 // The only time a CONCAT_VECTORS operation can have legal types is when 8882 // two 64-bit vectors are concatenated to a 128-bit vector. 8883 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 && 8884 "unexpected CONCAT_VECTORS"); 8885 SDLoc dl(Op); 8886 SDValue Val = DAG.getUNDEF(MVT::v2f64); 8887 SDValue Op0 = Op.getOperand(0); 8888 SDValue Op1 = Op.getOperand(1); 8889 if (!Op0.isUndef()) 8890 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, 8891 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0), 8892 DAG.getIntPtrConstant(0, dl)); 8893 if (!Op1.isUndef()) 8894 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, 8895 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1), 8896 DAG.getIntPtrConstant(1, dl)); 8897 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val); 8898 } 8899 8900 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, 8901 const ARMSubtarget *ST) { 8902 SDValue V1 = Op.getOperand(0); 8903 SDValue V2 = Op.getOperand(1); 8904 SDLoc dl(Op); 8905 EVT VT = Op.getValueType(); 8906 EVT Op1VT = V1.getValueType(); 8907 unsigned NumElts = VT.getVectorNumElements(); 8908 unsigned Index = cast<ConstantSDNode>(V2)->getZExtValue(); 8909 8910 assert(VT.getScalarSizeInBits() == 1 && 8911 "Unexpected custom EXTRACT_SUBVECTOR lowering"); 8912 assert(ST->hasMVEIntegerOps() && 8913 "EXTRACT_SUBVECTOR lowering only supported for MVE"); 8914 8915 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG); 8916 8917 // We now have Op1 promoted to a vector of integers, where v8i1 gets 8918 // promoted to v8i16, etc. 8919 8920 MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); 8921 8922 EVT SubVT = MVT::getVectorVT(ElType, NumElts); 8923 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT); 8924 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) { 8925 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1, 8926 DAG.getIntPtrConstant(i, dl)); 8927 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt, 8928 DAG.getConstant(j, dl, MVT::i32)); 8929 } 8930 8931 // Now return the result of comparing the subvector with zero, 8932 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. 
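  // E.g. (illustrative): extracting the upper v4i1 half of a v8i1 pulls lanes
  // 4..7 out of the promoted v8i16 value, rebuilds them as a v4i32 subvector
  // above, and the VCMPZ/NE below turns that back into a genuine v4i1.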
  return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
                     DAG.getConstant(ARMCC::NE, dl, MVT::i32));
}

// Turn a truncate to a predicate (an i1 vector) into icmp(and(x, 1), 0).
static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG,
                               const ARMSubtarget *ST) {
  assert(ST->hasMVEIntegerOps() && "Expected MVE!");
  EVT VT = N->getValueType(0);
  assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) &&
         "Expected a vector i1 type!");
  SDValue Op = N->getOperand(0);
  EVT FromVT = Op.getValueType();
  SDLoc DL(N);

  SDValue And =
      DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT));
  return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT),
                     DAG.getCondCode(ISD::SETNE));
}

static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG,
                             const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasMVEIntegerOps())
    return SDValue();

  EVT ToVT = N->getValueType(0);
  if (ToVT.getScalarType() == MVT::i1)
    return LowerTruncatei1(N, DAG, Subtarget);

  // MVE does not have a single instruction to perform the truncation of a
  // v4i32 into the lower half of a v8i16, in the same way that a NEON vmovn
  // would. Most of the instructions in MVE follow the 'Beats' system, where
  // moving values from different lanes is usually something that the
  // instructions avoid.
  //
  // Instead it has top/bottom instructions such as VMOVLT/B and VMOVNT/B,
  // which take the top/bottom half of a larger lane and extend it (or do the
  // opposite, truncating into the top/bottom lane from a larger lane). Note
  // that because of the way we widen lanes, a v4i16 is really a v4i32 using
  // the bottom 16 bits from each vector lane. This works really well with T/B
  // instructions, but that doesn't extend to v8i32->v8i16 where the lanes need
  // to change order.
  //
  // But truncates and sext/zext are always going to be fairly common from
  // llvm. We have several options for how to deal with them:
  // - Wherever possible combine them into an instruction that makes them
  //   "free". This includes loads/stores, which can perform the trunc as part
  //   of the memory operation. Or certain shuffles that can be turned into
  //   VMOVN/VMOVL.
  // - Lane Interleaving to transform blocks surrounded by ext/trunc. So
  //   trunc(mul(sext(a), sext(b))) may become
  //   VMOVNT(VMUL(VMOVLB(a), VMOVLB(b)), VMUL(VMOVLT(a), VMOVLT(b))). (Which in
  //   this case can use VMULL). This is performed in the
  //   MVELaneInterleavingPass.
  // - Otherwise we have an option. By default we would expand the
  //   zext/sext/trunc into a series of lane extract/inserts going via GPR
  //   registers. One for each vector lane in the vector. This can obviously be
  //   very expensive.
  // - The other option is to use the fact that loads/stores can extend/truncate
  //   to turn a trunc into two truncating stack stores and a stack reload. This
  //   becomes 3 back-to-back memory operations, but at least that is less than
  //   all the insert/extracts.
  //
  // In order to do the last, we convert certain truncs into MVETRUNC, which
  // are either optimized where they can be, or eventually lowered into stack
  // stores/loads.
This prevents us from splitting a v8i16 trunc into two stores 9000 // two early, where other instructions would be better, and stops us from 9001 // having to reconstruct multiple buildvector shuffles into loads/stores. 9002 if (ToVT != MVT::v8i16 && ToVT != MVT::v16i8) 9003 return SDValue(); 9004 EVT FromVT = N->getOperand(0).getValueType(); 9005 if (FromVT != MVT::v8i32 && FromVT != MVT::v16i16) 9006 return SDValue(); 9007 9008 SDValue Lo, Hi; 9009 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); 9010 SDLoc DL(N); 9011 return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi); 9012 } 9013 9014 static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG, 9015 const ARMSubtarget *Subtarget) { 9016 if (!Subtarget->hasMVEIntegerOps()) 9017 return SDValue(); 9018 9019 // See LowerTruncate above for an explanation of MVEEXT/MVETRUNC. 9020 9021 EVT ToVT = N->getValueType(0); 9022 if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16) 9023 return SDValue(); 9024 SDValue Op = N->getOperand(0); 9025 EVT FromVT = Op.getValueType(); 9026 if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8) 9027 return SDValue(); 9028 9029 SDLoc DL(N); 9030 EVT ExtVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext()); 9031 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) 9032 ExtVT = MVT::v8i16; 9033 9034 unsigned Opcode = 9035 N->getOpcode() == ISD::SIGN_EXTEND ? ARMISD::MVESEXT : ARMISD::MVEZEXT; 9036 SDValue Ext = DAG.getNode(Opcode, DL, DAG.getVTList(ExtVT, ExtVT), Op); 9037 SDValue Ext1 = Ext.getValue(1); 9038 9039 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) { 9040 Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext); 9041 Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1); 9042 } 9043 9044 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Ext, Ext1); 9045 } 9046 9047 /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each 9048 /// element has been zero/sign-extended, depending on the isSigned parameter, 9049 /// from an integer type half its size. 9050 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, 9051 bool isSigned) { 9052 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32. 9053 EVT VT = N->getValueType(0); 9054 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) { 9055 SDNode *BVN = N->getOperand(0).getNode(); 9056 if (BVN->getValueType(0) != MVT::v4i32 || 9057 BVN->getOpcode() != ISD::BUILD_VECTOR) 9058 return false; 9059 unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 
1 : 0; 9060 unsigned HiElt = 1 - LoElt; 9061 ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt)); 9062 ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt)); 9063 ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2)); 9064 ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2)); 9065 if (!Lo0 || !Hi0 || !Lo1 || !Hi1) 9066 return false; 9067 if (isSigned) { 9068 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 && 9069 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32) 9070 return true; 9071 } else { 9072 if (Hi0->isNullValue() && Hi1->isNullValue()) 9073 return true; 9074 } 9075 return false; 9076 } 9077 9078 if (N->getOpcode() != ISD::BUILD_VECTOR) 9079 return false; 9080 9081 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 9082 SDNode *Elt = N->getOperand(i).getNode(); 9083 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { 9084 unsigned EltSize = VT.getScalarSizeInBits(); 9085 unsigned HalfSize = EltSize / 2; 9086 if (isSigned) { 9087 if (!isIntN(HalfSize, C->getSExtValue())) 9088 return false; 9089 } else { 9090 if (!isUIntN(HalfSize, C->getZExtValue())) 9091 return false; 9092 } 9093 continue; 9094 } 9095 return false; 9096 } 9097 9098 return true; 9099 } 9100 9101 /// isSignExtended - Check if a node is a vector value that is sign-extended 9102 /// or a constant BUILD_VECTOR with sign-extended elements. 9103 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { 9104 if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N)) 9105 return true; 9106 if (isExtendedBUILD_VECTOR(N, DAG, true)) 9107 return true; 9108 return false; 9109 } 9110 9111 /// isZeroExtended - Check if a node is a vector value that is zero-extended (or 9112 /// any-extended) or a constant BUILD_VECTOR with zero-extended elements. 9113 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { 9114 if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND || 9115 ISD::isZEXTLoad(N)) 9116 return true; 9117 if (isExtendedBUILD_VECTOR(N, DAG, false)) 9118 return true; 9119 return false; 9120 } 9121 9122 static EVT getExtensionTo64Bits(const EVT &OrigVT) { 9123 if (OrigVT.getSizeInBits() >= 64) 9124 return OrigVT; 9125 9126 assert(OrigVT.isSimple() && "Expecting a simple value type"); 9127 9128 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; 9129 switch (OrigSimpleTy) { 9130 default: llvm_unreachable("Unexpected Vector Type"); 9131 case MVT::v2i8: 9132 case MVT::v2i16: 9133 return MVT::v2i32; 9134 case MVT::v4i8: 9135 return MVT::v4i16; 9136 } 9137 } 9138 9139 /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total 9140 /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL. 9141 /// We insert the required extension here to get the vector to fill a D register. 9142 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, 9143 const EVT &OrigTy, 9144 const EVT &ExtTy, 9145 unsigned ExtOpcode) { 9146 // The vector originally had a size of OrigTy. It was then extended to ExtTy. 9147 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than 9148 // 64-bits we need to insert a new extension so that it will be 64-bits. 9149 assert(ExtTy.is128BitVector() && "Unexpected extension size"); 9150 if (OrigTy.getSizeInBits() >= 64) 9151 return N; 9152 9153 // Must extend size to at least 64 bits to be used as an operand for VMULL. 
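  // E.g. (illustrative): if the operand was originally v4i8 (32 bits) and was
  // extended to v4i32, it is re-extended here to v4i16 so that VMULL sees a
  // full 64-bit D-register operand.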
9154 EVT NewVT = getExtensionTo64Bits(OrigTy); 9155 9156 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); 9157 } 9158 9159 /// SkipLoadExtensionForVMULL - return a load of the original vector size that 9160 /// does not do any sign/zero extension. If the original vector is less 9161 /// than 64 bits, an appropriate extension will be added after the load to 9162 /// reach a total size of 64 bits. We have to add the extension separately 9163 /// because ARM does not have a sign/zero extending load for vectors. 9164 static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) { 9165 EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT()); 9166 9167 // The load already has the right type. 9168 if (ExtendedTy == LD->getMemoryVT()) 9169 return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(), 9170 LD->getBasePtr(), LD->getPointerInfo(), 9171 LD->getAlignment(), LD->getMemOperand()->getFlags()); 9172 9173 // We need to create a zextload/sextload. We cannot just create a load 9174 // followed by a zext/zext node because LowerMUL is also run during normal 9175 // operation legalization where we can't create illegal types. 9176 return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy, 9177 LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), 9178 LD->getMemoryVT(), LD->getAlignment(), 9179 LD->getMemOperand()->getFlags()); 9180 } 9181 9182 /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, 9183 /// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return 9184 /// the unextended value. The unextended vector should be 64 bits so that it can 9185 /// be used as an operand to a VMULL instruction. If the original vector size 9186 /// before extension is less than 64 bits we add a an extension to resize 9187 /// the vector to 64 bits. 9188 static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) { 9189 if (N->getOpcode() == ISD::SIGN_EXTEND || 9190 N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND) 9191 return AddRequiredExtensionForVMULL(N->getOperand(0), DAG, 9192 N->getOperand(0)->getValueType(0), 9193 N->getValueType(0), 9194 N->getOpcode()); 9195 9196 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 9197 assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) && 9198 "Expected extending load"); 9199 9200 SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG); 9201 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1)); 9202 unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 9203 SDValue extLoad = 9204 DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad); 9205 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad); 9206 9207 return newLoad; 9208 } 9209 9210 // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will 9211 // have been legalized as a BITCAST from v4i32. 9212 if (N->getOpcode() == ISD::BITCAST) { 9213 SDNode *BVN = N->getOperand(0).getNode(); 9214 assert(BVN->getOpcode() == ISD::BUILD_VECTOR && 9215 BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR"); 9216 unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0; 9217 return DAG.getBuildVector( 9218 MVT::v2i32, SDLoc(N), 9219 {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)}); 9220 } 9221 // Construct a new BUILD_VECTOR with elements truncated to half the size. 
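  // E.g. (illustrative): a constant v4i32 BUILD_VECTOR whose elements all fit
  // in 16 bits is rebuilt here as a v4i16 BUILD_VECTOR; the operands stay as
  // i32 constants because smaller integer types are not legal, and they are
  // implicitly truncated.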
9222 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); 9223 EVT VT = N->getValueType(0); 9224 unsigned EltSize = VT.getScalarSizeInBits() / 2; 9225 unsigned NumElts = VT.getVectorNumElements(); 9226 MVT TruncVT = MVT::getIntegerVT(EltSize); 9227 SmallVector<SDValue, 8> Ops; 9228 SDLoc dl(N); 9229 for (unsigned i = 0; i != NumElts; ++i) { 9230 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); 9231 const APInt &CInt = C->getAPIntValue(); 9232 // Element types smaller than 32 bits are not legal, so use i32 elements. 9233 // The values are implicitly truncated so sext vs. zext doesn't matter. 9234 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); 9235 } 9236 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops); 9237 } 9238 9239 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { 9240 unsigned Opcode = N->getOpcode(); 9241 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 9242 SDNode *N0 = N->getOperand(0).getNode(); 9243 SDNode *N1 = N->getOperand(1).getNode(); 9244 return N0->hasOneUse() && N1->hasOneUse() && 9245 isSignExtended(N0, DAG) && isSignExtended(N1, DAG); 9246 } 9247 return false; 9248 } 9249 9250 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { 9251 unsigned Opcode = N->getOpcode(); 9252 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 9253 SDNode *N0 = N->getOperand(0).getNode(); 9254 SDNode *N1 = N->getOperand(1).getNode(); 9255 return N0->hasOneUse() && N1->hasOneUse() && 9256 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); 9257 } 9258 return false; 9259 } 9260 9261 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { 9262 // Multiplications are only custom-lowered for 128-bit vectors so that 9263 // VMULL can be detected. Otherwise v2i64 multiplications are not legal. 9264 EVT VT = Op.getValueType(); 9265 assert(VT.is128BitVector() && VT.isInteger() && 9266 "unexpected type for custom-lowering ISD::MUL"); 9267 SDNode *N0 = Op.getOperand(0).getNode(); 9268 SDNode *N1 = Op.getOperand(1).getNode(); 9269 unsigned NewOpc = 0; 9270 bool isMLA = false; 9271 bool isN0SExt = isSignExtended(N0, DAG); 9272 bool isN1SExt = isSignExtended(N1, DAG); 9273 if (isN0SExt && isN1SExt) 9274 NewOpc = ARMISD::VMULLs; 9275 else { 9276 bool isN0ZExt = isZeroExtended(N0, DAG); 9277 bool isN1ZExt = isZeroExtended(N1, DAG); 9278 if (isN0ZExt && isN1ZExt) 9279 NewOpc = ARMISD::VMULLu; 9280 else if (isN1SExt || isN1ZExt) { 9281 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these 9282 // into (s/zext A * s/zext C) + (s/zext B * s/zext C) 9283 if (isN1SExt && isAddSubSExt(N0, DAG)) { 9284 NewOpc = ARMISD::VMULLs; 9285 isMLA = true; 9286 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { 9287 NewOpc = ARMISD::VMULLu; 9288 isMLA = true; 9289 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { 9290 std::swap(N0, N1); 9291 NewOpc = ARMISD::VMULLu; 9292 isMLA = true; 9293 } 9294 } 9295 9296 if (!NewOpc) { 9297 if (VT == MVT::v2i64) 9298 // Fall through to expand this. It is not legal. 9299 return SDValue(); 9300 else 9301 // Other vector multiplications are legal. 9302 return Op; 9303 } 9304 } 9305 9306 // Legalize to a VMULL instruction. 
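  // Illustrative example (types assumed): a v4i32 mul(sext(v4i16 a),
  // sext(v4i16 b)) becomes ARMISD::VMULLs on the two unextended v4i16
  // operands; the isMLA case below instead distributes the multiply over the
  // add/sub of extended values.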
9307 SDLoc DL(Op); 9308 SDValue Op0; 9309 SDValue Op1 = SkipExtensionForVMULL(N1, DAG); 9310 if (!isMLA) { 9311 Op0 = SkipExtensionForVMULL(N0, DAG); 9312 assert(Op0.getValueType().is64BitVector() && 9313 Op1.getValueType().is64BitVector() && 9314 "unexpected types for extended operands to VMULL"); 9315 return DAG.getNode(NewOpc, DL, VT, Op0, Op1); 9316 } 9317 9318 // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during 9319 // isel lowering to take advantage of no-stall back to back vmul + vmla. 9320 // vmull q0, d4, d6 9321 // vmlal q0, d5, d6 9322 // is faster than 9323 // vaddl q0, d4, d5 9324 // vmovl q1, d6 9325 // vmul q0, q0, q1 9326 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG); 9327 SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG); 9328 EVT Op1VT = Op1.getValueType(); 9329 return DAG.getNode(N0->getOpcode(), DL, VT, 9330 DAG.getNode(NewOpc, DL, VT, 9331 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), 9332 DAG.getNode(NewOpc, DL, VT, 9333 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); 9334 } 9335 9336 static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl, 9337 SelectionDAG &DAG) { 9338 // TODO: Should this propagate fast-math-flags? 9339 9340 // Convert to float 9341 // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo)); 9342 // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo)); 9343 X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X); 9344 Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y); 9345 X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X); 9346 Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y); 9347 // Get reciprocal estimate. 9348 // float4 recip = vrecpeq_f32(yf); 9349 Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 9350 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 9351 Y); 9352 // Because char has a smaller range than uchar, we can actually get away 9353 // without any newton steps. This requires that we use a weird bias 9354 // of 0xb000, however (again, this has been exhaustively tested). 9355 // float4 result = as_float4(as_int4(xf*recip) + 0xb000); 9356 X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y); 9357 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X); 9358 Y = DAG.getConstant(0xb000, dl, MVT::v4i32); 9359 X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y); 9360 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X); 9361 // Convert back to short. 9362 X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X); 9363 X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X); 9364 return X; 9365 } 9366 9367 static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl, 9368 SelectionDAG &DAG) { 9369 // TODO: Should this propagate fast-math-flags? 9370 9371 SDValue N2; 9372 // Convert to float. 9373 // float4 yf = vcvt_f32_s32(vmovl_s16(y)); 9374 // float4 xf = vcvt_f32_s32(vmovl_s16(x)); 9375 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0); 9376 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1); 9377 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); 9378 N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); 9379 9380 // Use reciprocal estimate and one refinement step. 
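  // (Illustrative note: VRECPS computes 2.0 - a*b, so the sequence below is
  // one Newton-Raphson step for 1/y: recip = vrecpe(y); recip *= 2 - y*recip.)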
9381 // float4 recip = vrecpeq_f32(yf); 9382 // recip *= vrecpsq_f32(yf, recip); 9383 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 9384 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 9385 N1); 9386 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 9387 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 9388 N1, N2); 9389 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 9390 // Because short has a smaller range than ushort, we can actually get away 9391 // with only a single newton step. This requires that we use a weird bias 9392 // of 89, however (again, this has been exhaustively tested). 9393 // float4 result = as_float4(as_int4(xf*recip) + 0x89); 9394 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); 9395 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); 9396 N1 = DAG.getConstant(0x89, dl, MVT::v4i32); 9397 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); 9398 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); 9399 // Convert back to integer and return. 9400 // return vmovn_s32(vcvt_s32_f32(result)); 9401 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); 9402 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); 9403 return N0; 9404 } 9405 9406 static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG, 9407 const ARMSubtarget *ST) { 9408 EVT VT = Op.getValueType(); 9409 assert((VT == MVT::v4i16 || VT == MVT::v8i8) && 9410 "unexpected type for custom-lowering ISD::SDIV"); 9411 9412 SDLoc dl(Op); 9413 SDValue N0 = Op.getOperand(0); 9414 SDValue N1 = Op.getOperand(1); 9415 SDValue N2, N3; 9416 9417 if (VT == MVT::v8i8) { 9418 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0); 9419 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1); 9420 9421 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 9422 DAG.getIntPtrConstant(4, dl)); 9423 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 9424 DAG.getIntPtrConstant(4, dl)); 9425 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 9426 DAG.getIntPtrConstant(0, dl)); 9427 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 9428 DAG.getIntPtrConstant(0, dl)); 9429 9430 N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16 9431 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16 9432 9433 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); 9434 N0 = LowerCONCAT_VECTORS(N0, DAG, ST); 9435 9436 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0); 9437 return N0; 9438 } 9439 return LowerSDIV_v4i16(N0, N1, dl, DAG); 9440 } 9441 9442 static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG, 9443 const ARMSubtarget *ST) { 9444 // TODO: Should this propagate fast-math-flags? 
9445 EVT VT = Op.getValueType(); 9446 assert((VT == MVT::v4i16 || VT == MVT::v8i8) && 9447 "unexpected type for custom-lowering ISD::UDIV"); 9448 9449 SDLoc dl(Op); 9450 SDValue N0 = Op.getOperand(0); 9451 SDValue N1 = Op.getOperand(1); 9452 SDValue N2, N3; 9453 9454 if (VT == MVT::v8i8) { 9455 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0); 9456 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1); 9457 9458 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 9459 DAG.getIntPtrConstant(4, dl)); 9460 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 9461 DAG.getIntPtrConstant(4, dl)); 9462 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 9463 DAG.getIntPtrConstant(0, dl)); 9464 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 9465 DAG.getIntPtrConstant(0, dl)); 9466 9467 N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16 9468 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16 9469 9470 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); 9471 N0 = LowerCONCAT_VECTORS(N0, DAG, ST); 9472 9473 N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8, 9474 DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl, 9475 MVT::i32), 9476 N0); 9477 return N0; 9478 } 9479 9480 // v4i16 sdiv ... Convert to float. 9481 // float4 yf = vcvt_f32_s32(vmovl_u16(y)); 9482 // float4 xf = vcvt_f32_s32(vmovl_u16(x)); 9483 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0); 9484 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1); 9485 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); 9486 SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); 9487 9488 // Use reciprocal estimate and two refinement steps. 9489 // float4 recip = vrecpeq_f32(yf); 9490 // recip *= vrecpsq_f32(yf, recip); 9491 // recip *= vrecpsq_f32(yf, recip); 9492 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 9493 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 9494 BN1); 9495 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 9496 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 9497 BN1, N2); 9498 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 9499 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 9500 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 9501 BN1, N2); 9502 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 9503 // Simply multiplying by the reciprocal estimate can leave us a few ulps 9504 // too low, so we add 2 ulps (exhaustive testing shows that this is enough, 9505 // and that it will never cause us to return an answer too large). 9506 // float4 result = as_float4(as_int4(xf*recip) + 2); 9507 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); 9508 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); 9509 N1 = DAG.getConstant(2, dl, MVT::v4i32); 9510 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); 9511 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); 9512 // Convert back to integer and return. 9513 // return vmovn_u32(vcvt_s32_f32(result)); 9514 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); 9515 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); 9516 return N0; 9517 } 9518 9519 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) { 9520 SDNode *N = Op.getNode(); 9521 EVT VT = N->getValueType(0); 9522 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 9523 9524 SDValue Carry = Op.getOperand(2); 9525 9526 SDLoc DL(Op); 9527 9528 SDValue Result; 9529 if (Op.getOpcode() == ISD::ADDCARRY) { 9530 // This converts the boolean value carry into the carry flag. 
9531 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); 9532 9533 // Do the addition proper using the carry flag we wanted. 9534 Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0), 9535 Op.getOperand(1), Carry); 9536 9537 // Now convert the carry flag into a boolean value. 9538 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG); 9539 } else { 9540 // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we 9541 // have to invert the carry first. 9542 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, 9543 DAG.getConstant(1, DL, MVT::i32), Carry); 9544 // This converts the boolean value carry into the carry flag. 9545 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); 9546 9547 // Do the subtraction proper using the carry flag we wanted. 9548 Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0), 9549 Op.getOperand(1), Carry); 9550 9551 // Now convert the carry flag into a boolean value. 9552 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG); 9553 // But the carry returned by ARMISD::SUBE is not a borrow as expected 9554 // by ISD::SUBCARRY, so compute 1 - C. 9555 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, 9556 DAG.getConstant(1, DL, MVT::i32), Carry); 9557 } 9558 9559 // Return both values. 9560 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry); 9561 } 9562 9563 SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { 9564 assert(Subtarget->isTargetDarwin()); 9565 9566 // For iOS, we want to call an alternative entry point: __sincos_stret, 9567 // return values are passed via sret. 9568 SDLoc dl(Op); 9569 SDValue Arg = Op.getOperand(0); 9570 EVT ArgVT = Arg.getValueType(); 9571 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 9572 auto PtrVT = getPointerTy(DAG.getDataLayout()); 9573 9574 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 9575 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 9576 9577 // Pair of floats / doubles used to pass the result. 9578 Type *RetTy = StructType::get(ArgTy, ArgTy); 9579 auto &DL = DAG.getDataLayout(); 9580 9581 ArgListTy Args; 9582 bool ShouldUseSRet = Subtarget->isAPCS_ABI(); 9583 SDValue SRet; 9584 if (ShouldUseSRet) { 9585 // Create stack object for sret. 9586 const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); 9587 const Align StackAlign = DL.getPrefTypeAlign(RetTy); 9588 int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); 9589 SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL)); 9590 9591 ArgListEntry Entry; 9592 Entry.Node = SRet; 9593 Entry.Ty = RetTy->getPointerTo(); 9594 Entry.IsSExt = false; 9595 Entry.IsZExt = false; 9596 Entry.IsSRet = true; 9597 Args.push_back(Entry); 9598 RetTy = Type::getVoidTy(*DAG.getContext()); 9599 } 9600 9601 ArgListEntry Entry; 9602 Entry.Node = Arg; 9603 Entry.Ty = ArgTy; 9604 Entry.IsSExt = false; 9605 Entry.IsZExt = false; 9606 Args.push_back(Entry); 9607 9608 RTLIB::Libcall LC = 9609 (ArgVT == MVT::f64) ? 
RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; 9610 const char *LibcallName = getLibcallName(LC); 9611 CallingConv::ID CC = getLibcallCallingConv(LC); 9612 SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL)); 9613 9614 TargetLowering::CallLoweringInfo CLI(DAG); 9615 CLI.setDebugLoc(dl) 9616 .setChain(DAG.getEntryNode()) 9617 .setCallee(CC, RetTy, Callee, std::move(Args)) 9618 .setDiscardResult(ShouldUseSRet); 9619 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 9620 9621 if (!ShouldUseSRet) 9622 return CallResult.first; 9623 9624 SDValue LoadSin = 9625 DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo()); 9626 9627 // Address of cos field. 9628 SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, 9629 DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); 9630 SDValue LoadCos = 9631 DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo()); 9632 9633 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); 9634 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, 9635 LoadSin.getValue(0), LoadCos.getValue(0)); 9636 } 9637 9638 SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, 9639 bool Signed, 9640 SDValue &Chain) const { 9641 EVT VT = Op.getValueType(); 9642 assert((VT == MVT::i32 || VT == MVT::i64) && 9643 "unexpected type for custom lowering DIV"); 9644 SDLoc dl(Op); 9645 9646 const auto &DL = DAG.getDataLayout(); 9647 const auto &TLI = DAG.getTargetLoweringInfo(); 9648 9649 const char *Name = nullptr; 9650 if (Signed) 9651 Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64"; 9652 else 9653 Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64"; 9654 9655 SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL)); 9656 9657 ARMTargetLowering::ArgListTy Args; 9658 9659 for (auto AI : {1, 0}) { 9660 ArgListEntry Arg; 9661 Arg.Node = Op.getOperand(AI); 9662 Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext()); 9663 Args.push_back(Arg); 9664 } 9665 9666 CallLoweringInfo CLI(DAG); 9667 CLI.setDebugLoc(dl) 9668 .setChain(Chain) 9669 .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()), 9670 ES, std::move(Args)); 9671 9672 return LowerCallTo(CLI).first; 9673 } 9674 9675 // This is a code size optimisation: return the original SDIV node to 9676 // DAGCombiner when we don't want to expand SDIV into a sequence of 9677 // instructions, and an empty node otherwise which will cause the 9678 // SDIV to be expanded in DAGCombine. 9679 SDValue 9680 ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, 9681 SelectionDAG &DAG, 9682 SmallVectorImpl<SDNode *> &Created) const { 9683 // TODO: Support SREM 9684 if (N->getOpcode() != ISD::SDIV) 9685 return SDValue(); 9686 9687 const auto &ST = static_cast<const ARMSubtarget&>(DAG.getSubtarget()); 9688 const bool MinSize = ST.hasMinSize(); 9689 const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode() 9690 : ST.hasDivideInARMMode(); 9691 9692 // Don't touch vector types; rewriting this may lead to scalarizing 9693 // the int divs. 9694 if (N->getOperand(0).getValueType().isVector()) 9695 return SDValue(); 9696 9697 // Bail if MinSize is not set, and also for both ARM and Thumb mode we need 9698 // hwdiv support for this to be really profitable. 9699 if (!(MinSize && HasDivide)) 9700 return SDValue(); 9701 9702 // ARM mode is a bit simpler than Thumb: we can handle large power 9703 // of 2 immediates with 1 mov instruction; no further checks required, 9704 // just return the sdiv node. 
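  // Illustrative example (divisor value assumed): with hwdiv and minsize,
  // "x sdiv 256" keeps the SDIV node in ARM mode since a single mov can
  // materialize 256, whereas in Thumb mode (checked below) such a divisor is
  // rejected and the generic shift-based expansion is used instead.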
9705 if (!ST.isThumb()) 9706 return SDValue(N, 0); 9707 9708 // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV, 9709 // and thus lose the code size benefits of a MOVS that requires only 2. 9710 // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here, 9711 // but as it's doing exactly this, it's not worth the trouble to get TTI. 9712 if (Divisor.sgt(128)) 9713 return SDValue(); 9714 9715 return SDValue(N, 0); 9716 } 9717 9718 SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, 9719 bool Signed) const { 9720 assert(Op.getValueType() == MVT::i32 && 9721 "unexpected type for custom lowering DIV"); 9722 SDLoc dl(Op); 9723 9724 SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other, 9725 DAG.getEntryNode(), Op.getOperand(1)); 9726 9727 return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); 9728 } 9729 9730 static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) { 9731 SDLoc DL(N); 9732 SDValue Op = N->getOperand(1); 9733 if (N->getValueType(0) == MVT::i32) 9734 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op); 9735 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op, 9736 DAG.getConstant(0, DL, MVT::i32)); 9737 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op, 9738 DAG.getConstant(1, DL, MVT::i32)); 9739 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, 9740 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi)); 9741 } 9742 9743 void ARMTargetLowering::ExpandDIV_Windows( 9744 SDValue Op, SelectionDAG &DAG, bool Signed, 9745 SmallVectorImpl<SDValue> &Results) const { 9746 const auto &DL = DAG.getDataLayout(); 9747 const auto &TLI = DAG.getTargetLoweringInfo(); 9748 9749 assert(Op.getValueType() == MVT::i64 && 9750 "unexpected type for custom lowering DIV"); 9751 SDLoc dl(Op); 9752 9753 SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode()); 9754 9755 SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); 9756 9757 SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result); 9758 SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result, 9759 DAG.getConstant(32, dl, TLI.getPointerTy(DL))); 9760 Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper); 9761 9762 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper)); 9763 } 9764 9765 static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) { 9766 LoadSDNode *LD = cast<LoadSDNode>(Op.getNode()); 9767 EVT MemVT = LD->getMemoryVT(); 9768 assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) && 9769 "Expected a predicate type!"); 9770 assert(MemVT == Op.getValueType()); 9771 assert(LD->getExtensionType() == ISD::NON_EXTLOAD && 9772 "Expected a non-extending load"); 9773 assert(LD->isUnindexed() && "Expected a unindexed load"); 9774 9775 // The basic MVE VLDR on a v4i1/v8i1 actually loads the entire 16bit 9776 // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We 9777 // need to make sure that 8/4 bits are actually loaded into the correct 9778 // place, which means loading the value and then shuffling the values into 9779 // the bottom bits of the predicate. 9780 // Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect 9781 // for BE). 9782 // Speaking of BE, apparently the rest of llvm will assume a reverse order to 9783 // a natural VMSR(load), so needs to be reversed. 
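  // Sketch of the lowering below (illustrative): perform a narrow scalar
  // extending load of MemVT.getSizeInBits() bits, bit-reverse and shift it for
  // big-endian targets, cast it to v16i1 with ARMISD::PREDICATE_CAST, and then
  // extract the leading v4i1/v8i1 subvector if a narrower predicate was
  // requested.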
9784 9785 SDLoc dl(Op); 9786 SDValue Load = DAG.getExtLoad( 9787 ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(), 9788 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()), 9789 LD->getMemOperand()); 9790 SDValue Val = Load; 9791 if (DAG.getDataLayout().isBigEndian()) 9792 Val = DAG.getNode(ISD::SRL, dl, MVT::i32, 9793 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load), 9794 DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32)); 9795 SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val); 9796 if (MemVT != MVT::v16i1) 9797 Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred, 9798 DAG.getConstant(0, dl, MVT::i32)); 9799 return DAG.getMergeValues({Pred, Load.getValue(1)}, dl); 9800 } 9801 9802 void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results, 9803 SelectionDAG &DAG) const { 9804 LoadSDNode *LD = cast<LoadSDNode>(N); 9805 EVT MemVT = LD->getMemoryVT(); 9806 assert(LD->isUnindexed() && "Loads should be unindexed at this point."); 9807 9808 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() && 9809 !Subtarget->isThumb1Only() && LD->isVolatile()) { 9810 SDLoc dl(N); 9811 SDValue Result = DAG.getMemIntrinsicNode( 9812 ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}), 9813 {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand()); 9814 SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1); 9815 SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0); 9816 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 9817 Results.append({Pair, Result.getValue(2)}); 9818 } 9819 } 9820 9821 static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) { 9822 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode()); 9823 EVT MemVT = ST->getMemoryVT(); 9824 assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) && 9825 "Expected a predicate type!"); 9826 assert(MemVT == ST->getValue().getValueType()); 9827 assert(!ST->isTruncatingStore() && "Expected a non-extending store"); 9828 assert(ST->isUnindexed() && "Expected a unindexed store"); 9829 9830 // Only store the v4i1 or v8i1 worth of bits, via a buildvector with top bits 9831 // unset and a scalar store. 9832 SDLoc dl(Op); 9833 SDValue Build = ST->getValue(); 9834 if (MemVT != MVT::v16i1) { 9835 SmallVector<SDValue, 16> Ops; 9836 for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) { 9837 unsigned Elt = DAG.getDataLayout().isBigEndian() 9838 ? 
MemVT.getVectorNumElements() - I - 1 9839 : I; 9840 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build, 9841 DAG.getConstant(Elt, dl, MVT::i32))); 9842 } 9843 for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++) 9844 Ops.push_back(DAG.getUNDEF(MVT::i32)); 9845 Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops); 9846 } 9847 SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build); 9848 if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian()) 9849 GRP = DAG.getNode(ISD::SRL, dl, MVT::i32, 9850 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, GRP), 9851 DAG.getConstant(16, dl, MVT::i32)); 9852 return DAG.getTruncStore( 9853 ST->getChain(), dl, GRP, ST->getBasePtr(), 9854 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()), 9855 ST->getMemOperand()); 9856 } 9857 9858 static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG, 9859 const ARMSubtarget *Subtarget) { 9860 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode()); 9861 EVT MemVT = ST->getMemoryVT(); 9862 assert(ST->isUnindexed() && "Stores should be unindexed at this point."); 9863 9864 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() && 9865 !Subtarget->isThumb1Only() && ST->isVolatile()) { 9866 SDNode *N = Op.getNode(); 9867 SDLoc dl(N); 9868 9869 SDValue Lo = DAG.getNode( 9870 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(), 9871 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl, 9872 MVT::i32)); 9873 SDValue Hi = DAG.getNode( 9874 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(), 9875 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl, 9876 MVT::i32)); 9877 9878 return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other), 9879 {ST->getChain(), Lo, Hi, ST->getBasePtr()}, 9880 MemVT, ST->getMemOperand()); 9881 } else if (Subtarget->hasMVEIntegerOps() && 9882 ((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || 9883 MemVT == MVT::v16i1))) { 9884 return LowerPredicateStore(Op, DAG); 9885 } 9886 9887 return SDValue(); 9888 } 9889 9890 static bool isZeroVector(SDValue N) { 9891 return (ISD::isBuildVectorAllZeros(N.getNode()) || 9892 (N->getOpcode() == ARMISD::VMOVIMM && 9893 isNullConstant(N->getOperand(0)))); 9894 } 9895 9896 static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) { 9897 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode()); 9898 MVT VT = Op.getSimpleValueType(); 9899 SDValue Mask = N->getMask(); 9900 SDValue PassThru = N->getPassThru(); 9901 SDLoc dl(Op); 9902 9903 if (isZeroVector(PassThru)) 9904 return Op; 9905 9906 // MVE Masked loads use zero as the passthru value. Here we convert undef to 9907 // zero too, and other values are lowered to a select. 
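  // Roughly (illustrative): masked_load(ptr, mask, passthru) becomes
  //   NewLoad = masked_load(ptr, mask, zero)
  //   result  = vselect(mask, NewLoad, passthru)
  // with the select omitted when the passthru is undef or already zero.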
9908 SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 9909 DAG.getTargetConstant(0, dl, MVT::i32)); 9910 SDValue NewLoad = DAG.getMaskedLoad( 9911 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec, 9912 N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(), 9913 N->getExtensionType(), N->isExpandingLoad()); 9914 SDValue Combo = NewLoad; 9915 bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST || 9916 PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) && 9917 isZeroVector(PassThru->getOperand(0)); 9918 if (!PassThru.isUndef() && !PassThruIsCastZero) 9919 Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru); 9920 return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl); 9921 } 9922 9923 static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG, 9924 const ARMSubtarget *ST) { 9925 if (!ST->hasMVEIntegerOps()) 9926 return SDValue(); 9927 9928 SDLoc dl(Op); 9929 unsigned BaseOpcode = 0; 9930 switch (Op->getOpcode()) { 9931 default: llvm_unreachable("Expected VECREDUCE opcode"); 9932 case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break; 9933 case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break; 9934 case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break; 9935 case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break; 9936 case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break; 9937 case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break; 9938 case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break; 9939 case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break; 9940 } 9941 9942 SDValue Op0 = Op->getOperand(0); 9943 EVT VT = Op0.getValueType(); 9944 EVT EltVT = VT.getVectorElementType(); 9945 unsigned NumElts = VT.getVectorNumElements(); 9946 unsigned NumActiveLanes = NumElts; 9947 9948 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 || 9949 NumActiveLanes == 2) && 9950 "Only expected a power 2 vector size"); 9951 9952 // Use Mul(X, Rev(X)) until 4 items remain. Going down to 4 vector elements 9953 // allows us to easily extract vector elements from the lanes. 9954 while (NumActiveLanes > 4) { 9955 unsigned RevOpcode = NumActiveLanes == 16 ? 
ARMISD::VREV16 : ARMISD::VREV32; 9956 SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0); 9957 Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev); 9958 NumActiveLanes /= 2; 9959 } 9960 9961 SDValue Res; 9962 if (NumActiveLanes == 4) { 9963 // The remaining 4 elements are summed sequentially 9964 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, 9965 DAG.getConstant(0 * NumElts / 4, dl, MVT::i32)); 9966 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, 9967 DAG.getConstant(1 * NumElts / 4, dl, MVT::i32)); 9968 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, 9969 DAG.getConstant(2 * NumElts / 4, dl, MVT::i32)); 9970 SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, 9971 DAG.getConstant(3 * NumElts / 4, dl, MVT::i32)); 9972 SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags()); 9973 SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags()); 9974 Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags()); 9975 } else { 9976 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, 9977 DAG.getConstant(0, dl, MVT::i32)); 9978 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, 9979 DAG.getConstant(1, dl, MVT::i32)); 9980 Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags()); 9981 } 9982 9983 // Result type may be wider than element type. 9984 if (EltVT != Op->getValueType(0)) 9985 Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res); 9986 return Res; 9987 } 9988 9989 static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG, 9990 const ARMSubtarget *ST) { 9991 if (!ST->hasMVEFloatOps()) 9992 return SDValue(); 9993 return LowerVecReduce(Op, DAG, ST); 9994 } 9995 9996 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { 9997 if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering())) 9998 // Acquire/Release load/store is not legal for targets without a dmb or 9999 // equivalent available. 10000 return SDValue(); 10001 10002 // Monotonic load/store is legal for all targets. 
10003 return Op; 10004 } 10005 10006 static void ReplaceREADCYCLECOUNTER(SDNode *N, 10007 SmallVectorImpl<SDValue> &Results, 10008 SelectionDAG &DAG, 10009 const ARMSubtarget *Subtarget) { 10010 SDLoc DL(N); 10011 // Under Power Management extensions, the cycle-count is: 10012 // mrc p15, #0, <Rt>, c9, c13, #0 10013 SDValue Ops[] = { N->getOperand(0), // Chain 10014 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32), 10015 DAG.getTargetConstant(15, DL, MVT::i32), 10016 DAG.getTargetConstant(0, DL, MVT::i32), 10017 DAG.getTargetConstant(9, DL, MVT::i32), 10018 DAG.getTargetConstant(13, DL, MVT::i32), 10019 DAG.getTargetConstant(0, DL, MVT::i32) 10020 }; 10021 10022 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, 10023 DAG.getVTList(MVT::i32, MVT::Other), Ops); 10024 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32, 10025 DAG.getConstant(0, DL, MVT::i32))); 10026 Results.push_back(Cycles32.getValue(1)); 10027 } 10028 10029 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) { 10030 SDLoc dl(V.getNode()); 10031 SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32); 10032 SDValue VHi = DAG.getAnyExtOrTrunc( 10033 DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)), 10034 dl, MVT::i32); 10035 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 10036 if (isBigEndian) 10037 std::swap (VLo, VHi); 10038 SDValue RegClass = 10039 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32); 10040 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32); 10041 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32); 10042 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 }; 10043 return SDValue( 10044 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0); 10045 } 10046 10047 static void ReplaceCMP_SWAP_64Results(SDNode *N, 10048 SmallVectorImpl<SDValue> & Results, 10049 SelectionDAG &DAG) { 10050 assert(N->getValueType(0) == MVT::i64 && 10051 "AtomicCmpSwap on types less than 64 should be legal"); 10052 SDValue Ops[] = {N->getOperand(1), 10053 createGPRPairNode(DAG, N->getOperand(2)), 10054 createGPRPairNode(DAG, N->getOperand(3)), 10055 N->getOperand(0)}; 10056 SDNode *CmpSwap = DAG.getMachineNode( 10057 ARM::CMP_SWAP_64, SDLoc(N), 10058 DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops); 10059 10060 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand(); 10061 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp}); 10062 10063 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 10064 10065 SDValue Lo = 10066 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0, 10067 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)); 10068 SDValue Hi = 10069 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1, 10070 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)); 10071 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi)); 10072 Results.push_back(SDValue(CmpSwap, 2)); 10073 } 10074 10075 SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const { 10076 SDLoc dl(Op); 10077 EVT VT = Op.getValueType(); 10078 SDValue Chain = Op.getOperand(0); 10079 SDValue LHS = Op.getOperand(1); 10080 SDValue RHS = Op.getOperand(2); 10081 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get(); 10082 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; 10083 10084 // If we don't have instructions of this float type then soften to a libcall 10085 // and use SETCC instead. 
10086 if (isUnsupportedFloatingType(LHS.getValueType())) { 10087 DAG.getTargetLoweringInfo().softenSetCCOperands( 10088 DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS, Chain, IsSignaling); 10089 if (!RHS.getNode()) { 10090 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 10091 CC = ISD::SETNE; 10092 } 10093 SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS, 10094 DAG.getCondCode(CC)); 10095 return DAG.getMergeValues({Result, Chain}, dl); 10096 } 10097 10098 ARMCC::CondCodes CondCode, CondCode2; 10099 FPCCToARMCC(CC, CondCode, CondCode2); 10100 10101 // FIXME: Chain is not handled correctly here. Currently the FPSCR is implicit 10102 // in CMPFP and CMPFPE, but instead it should be made explicit by these 10103 // instructions using a chain instead of glue. This would also fix the problem 10104 // here (and also in LowerSELECT_CC) where we generate two comparisons when 10105 // CondCode2 != AL. 10106 SDValue True = DAG.getConstant(1, dl, VT); 10107 SDValue False = DAG.getConstant(0, dl, VT); 10108 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 10109 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 10110 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling); 10111 SDValue Result = getCMOV(dl, VT, False, True, ARMcc, CCR, Cmp, DAG); 10112 if (CondCode2 != ARMCC::AL) { 10113 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32); 10114 Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling); 10115 Result = getCMOV(dl, VT, Result, True, ARMcc, CCR, Cmp, DAG); 10116 } 10117 return DAG.getMergeValues({Result, Chain}, dl); 10118 } 10119 10120 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 10121 LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump()); 10122 switch (Op.getOpcode()) { 10123 default: llvm_unreachable("Don't know how to custom lower this!"); 10124 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG); 10125 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 10126 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 10127 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 10128 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 10129 case ISD::SELECT: return LowerSELECT(Op, DAG); 10130 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 10131 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 10132 case ISD::BR_CC: return LowerBR_CC(Op, DAG); 10133 case ISD::BR_JT: return LowerBR_JT(Op, DAG); 10134 case ISD::VASTART: return LowerVASTART(Op, DAG); 10135 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget); 10136 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget); 10137 case ISD::SINT_TO_FP: 10138 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); 10139 case ISD::STRICT_FP_TO_SINT: 10140 case ISD::STRICT_FP_TO_UINT: 10141 case ISD::FP_TO_SINT: 10142 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); 10143 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 10144 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 10145 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 10146 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); 10147 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); 10148 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); 10149 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget); 10150 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, 10151 Subtarget); 10152 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget); 10153 case ISD::SHL: 10154 
case ISD::SRL: 10155 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget); 10156 case ISD::SREM: return LowerREM(Op.getNode(), DAG); 10157 case ISD::UREM: return LowerREM(Op.getNode(), DAG); 10158 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); 10159 case ISD::SRL_PARTS: 10160 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); 10161 case ISD::CTTZ: 10162 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget); 10163 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget); 10164 case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget); 10165 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); 10166 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); 10167 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); 10168 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget); 10169 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget); 10170 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 10171 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget); 10172 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget); 10173 case ISD::TRUNCATE: return LowerTruncate(Op.getNode(), DAG, Subtarget); 10174 case ISD::SIGN_EXTEND: 10175 case ISD::ZERO_EXTEND: return LowerVectorExtend(Op.getNode(), DAG, Subtarget); 10176 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 10177 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG); 10178 case ISD::MUL: return LowerMUL(Op, DAG); 10179 case ISD::SDIV: 10180 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) 10181 return LowerDIV_Windows(Op, DAG, /* Signed */ true); 10182 return LowerSDIV(Op, DAG, Subtarget); 10183 case ISD::UDIV: 10184 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) 10185 return LowerDIV_Windows(Op, DAG, /* Signed */ false); 10186 return LowerUDIV(Op, DAG, Subtarget); 10187 case ISD::ADDCARRY: 10188 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG); 10189 case ISD::SADDO: 10190 case ISD::SSUBO: 10191 return LowerSignedALUO(Op, DAG); 10192 case ISD::UADDO: 10193 case ISD::USUBO: 10194 return LowerUnsignedALUO(Op, DAG); 10195 case ISD::SADDSAT: 10196 case ISD::SSUBSAT: 10197 case ISD::UADDSAT: 10198 case ISD::USUBSAT: 10199 return LowerADDSUBSAT(Op, DAG, Subtarget); 10200 case ISD::LOAD: 10201 return LowerPredicateLoad(Op, DAG); 10202 case ISD::STORE: 10203 return LowerSTORE(Op, DAG, Subtarget); 10204 case ISD::MLOAD: 10205 return LowerMLOAD(Op, DAG); 10206 case ISD::VECREDUCE_MUL: 10207 case ISD::VECREDUCE_AND: 10208 case ISD::VECREDUCE_OR: 10209 case ISD::VECREDUCE_XOR: 10210 return LowerVecReduce(Op, DAG, Subtarget); 10211 case ISD::VECREDUCE_FADD: 10212 case ISD::VECREDUCE_FMUL: 10213 case ISD::VECREDUCE_FMIN: 10214 case ISD::VECREDUCE_FMAX: 10215 return LowerVecReduceF(Op, DAG, Subtarget); 10216 case ISD::ATOMIC_LOAD: 10217 case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); 10218 case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); 10219 case ISD::SDIVREM: 10220 case ISD::UDIVREM: return LowerDivRem(Op, DAG); 10221 case ISD::DYNAMIC_STACKALLOC: 10222 if (Subtarget->isTargetWindows()) 10223 return LowerDYNAMIC_STACKALLOC(Op, DAG); 10224 llvm_unreachable("Don't know how to custom lower this!"); 10225 case ISD::STRICT_FP_ROUND: 10226 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); 10227 case ISD::STRICT_FP_EXTEND: 10228 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); 10229 case ISD::STRICT_FSETCC: 
10230 case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG); 10231 case ARMISD::WIN__DBZCHK: return SDValue(); 10232 } 10233 } 10234 10235 static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results, 10236 SelectionDAG &DAG) { 10237 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 10238 unsigned Opc = 0; 10239 if (IntNo == Intrinsic::arm_smlald) 10240 Opc = ARMISD::SMLALD; 10241 else if (IntNo == Intrinsic::arm_smlaldx) 10242 Opc = ARMISD::SMLALDX; 10243 else if (IntNo == Intrinsic::arm_smlsld) 10244 Opc = ARMISD::SMLSLD; 10245 else if (IntNo == Intrinsic::arm_smlsldx) 10246 Opc = ARMISD::SMLSLDX; 10247 else 10248 return; 10249 10250 SDLoc dl(N); 10251 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 10252 N->getOperand(3), 10253 DAG.getConstant(0, dl, MVT::i32)); 10254 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 10255 N->getOperand(3), 10256 DAG.getConstant(1, dl, MVT::i32)); 10257 10258 SDValue LongMul = DAG.getNode(Opc, dl, 10259 DAG.getVTList(MVT::i32, MVT::i32), 10260 N->getOperand(1), N->getOperand(2), 10261 Lo, Hi); 10262 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, 10263 LongMul.getValue(0), LongMul.getValue(1))); 10264 } 10265 10266 /// ReplaceNodeResults - Replace the results of node with an illegal result 10267 /// type with new values built out of custom code. 10268 void ARMTargetLowering::ReplaceNodeResults(SDNode *N, 10269 SmallVectorImpl<SDValue> &Results, 10270 SelectionDAG &DAG) const { 10271 SDValue Res; 10272 switch (N->getOpcode()) { 10273 default: 10274 llvm_unreachable("Don't know how to custom expand this!"); 10275 case ISD::READ_REGISTER: 10276 ExpandREAD_REGISTER(N, Results, DAG); 10277 break; 10278 case ISD::BITCAST: 10279 Res = ExpandBITCAST(N, DAG, Subtarget); 10280 break; 10281 case ISD::SRL: 10282 case ISD::SRA: 10283 case ISD::SHL: 10284 Res = Expand64BitShift(N, DAG, Subtarget); 10285 break; 10286 case ISD::SREM: 10287 case ISD::UREM: 10288 Res = LowerREM(N, DAG); 10289 break; 10290 case ISD::SDIVREM: 10291 case ISD::UDIVREM: 10292 Res = LowerDivRem(SDValue(N, 0), DAG); 10293 assert(Res.getNumOperands() == 2 && "DivRem needs two values"); 10294 Results.push_back(Res.getValue(0)); 10295 Results.push_back(Res.getValue(1)); 10296 return; 10297 case ISD::SADDSAT: 10298 case ISD::SSUBSAT: 10299 case ISD::UADDSAT: 10300 case ISD::USUBSAT: 10301 Res = LowerADDSUBSAT(SDValue(N, 0), DAG, Subtarget); 10302 break; 10303 case ISD::READCYCLECOUNTER: 10304 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget); 10305 return; 10306 case ISD::UDIV: 10307 case ISD::SDIV: 10308 assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows"); 10309 return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV, 10310 Results); 10311 case ISD::ATOMIC_CMP_SWAP: 10312 ReplaceCMP_SWAP_64Results(N, Results, DAG); 10313 return; 10314 case ISD::INTRINSIC_WO_CHAIN: 10315 return ReplaceLongIntrinsic(N, Results, DAG); 10316 case ISD::ABS: 10317 lowerABS(N, Results, DAG); 10318 return ; 10319 case ISD::LOAD: 10320 LowerLOAD(N, Results, DAG); 10321 break; 10322 case ISD::TRUNCATE: 10323 Res = LowerTruncate(N, DAG, Subtarget); 10324 break; 10325 case ISD::SIGN_EXTEND: 10326 case ISD::ZERO_EXTEND: 10327 Res = LowerVectorExtend(N, DAG, Subtarget); 10328 break; 10329 } 10330 if (Res.getNode()) 10331 Results.push_back(Res); 10332 } 10333 10334 //===----------------------------------------------------------------------===// 10335 // ARM Scheduler Hooks 10336 
//===----------------------------------------------------------------------===// 10337 10338 /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and 10339 /// registers the function context. 10340 void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, 10341 MachineBasicBlock *MBB, 10342 MachineBasicBlock *DispatchBB, 10343 int FI) const { 10344 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 10345 "ROPI/RWPI not currently supported with SjLj"); 10346 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 10347 DebugLoc dl = MI.getDebugLoc(); 10348 MachineFunction *MF = MBB->getParent(); 10349 MachineRegisterInfo *MRI = &MF->getRegInfo(); 10350 MachineConstantPool *MCP = MF->getConstantPool(); 10351 ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>(); 10352 const Function &F = MF->getFunction(); 10353 10354 bool isThumb = Subtarget->isThumb(); 10355 bool isThumb2 = Subtarget->isThumb2(); 10356 10357 unsigned PCLabelId = AFI->createPICLabelUId(); 10358 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8; 10359 ARMConstantPoolValue *CPV = 10360 ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj); 10361 unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4)); 10362 10363 const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass 10364 : &ARM::GPRRegClass; 10365 10366 // Grab constant pool and fixed stack memory operands. 10367 MachineMemOperand *CPMMO = 10368 MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), 10369 MachineMemOperand::MOLoad, 4, Align(4)); 10370 10371 MachineMemOperand *FIMMOSt = 10372 MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI), 10373 MachineMemOperand::MOStore, 4, Align(4)); 10374 10375 // Load the address of the dispatch MBB into the jump buffer. 10376 if (isThumb2) { 10377 // Incoming value: jbuf 10378 // ldr.n r5, LCPI1_1 10379 // orr r5, r5, #1 10380 // add r5, pc 10381 // str r5, [$jbuf, #+4] ; &jbuf[1] 10382 Register NewVReg1 = MRI->createVirtualRegister(TRC); 10383 BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1) 10384 .addConstantPoolIndex(CPI) 10385 .addMemOperand(CPMMO) 10386 .add(predOps(ARMCC::AL)); 10387 // Set the low bit because of thumb mode. 10388 Register NewVReg2 = MRI->createVirtualRegister(TRC); 10389 BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2) 10390 .addReg(NewVReg1, RegState::Kill) 10391 .addImm(0x01) 10392 .add(predOps(ARMCC::AL)) 10393 .add(condCodeOp()); 10394 Register NewVReg3 = MRI->createVirtualRegister(TRC); 10395 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3) 10396 .addReg(NewVReg2, RegState::Kill) 10397 .addImm(PCLabelId); 10398 BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12)) 10399 .addReg(NewVReg3, RegState::Kill) 10400 .addFrameIndex(FI) 10401 .addImm(36) // &jbuf[1] :: pc 10402 .addMemOperand(FIMMOSt) 10403 .add(predOps(ARMCC::AL)); 10404 } else if (isThumb) { 10405 // Incoming value: jbuf 10406 // ldr.n r1, LCPI1_4 10407 // add r1, pc 10408 // mov r2, #1 10409 // orrs r1, r2 10410 // add r2, $jbuf, #+4 ; &jbuf[1] 10411 // str r1, [r2] 10412 Register NewVReg1 = MRI->createVirtualRegister(TRC); 10413 BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1) 10414 .addConstantPoolIndex(CPI) 10415 .addMemOperand(CPMMO) 10416 .add(predOps(ARMCC::AL)); 10417 Register NewVReg2 = MRI->createVirtualRegister(TRC); 10418 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2) 10419 .addReg(NewVReg1, RegState::Kill) 10420 .addImm(PCLabelId); 10421 // Set the low bit because of thumb mode. 
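// (Bit 0 of an indirect branch target selects the instruction set on
// interworking cores, so the address stored in the jump buffer must have it
// set for the longjmp to resume execution in Thumb state.)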
10422 Register NewVReg3 = MRI->createVirtualRegister(TRC); 10423 BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3) 10424 .addReg(ARM::CPSR, RegState::Define) 10425 .addImm(1) 10426 .add(predOps(ARMCC::AL)); 10427 Register NewVReg4 = MRI->createVirtualRegister(TRC); 10428 BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4) 10429 .addReg(ARM::CPSR, RegState::Define) 10430 .addReg(NewVReg2, RegState::Kill) 10431 .addReg(NewVReg3, RegState::Kill) 10432 .add(predOps(ARMCC::AL)); 10433 Register NewVReg5 = MRI->createVirtualRegister(TRC); 10434 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5) 10435 .addFrameIndex(FI) 10436 .addImm(36); // &jbuf[1] :: pc 10437 BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi)) 10438 .addReg(NewVReg4, RegState::Kill) 10439 .addReg(NewVReg5, RegState::Kill) 10440 .addImm(0) 10441 .addMemOperand(FIMMOSt) 10442 .add(predOps(ARMCC::AL)); 10443 } else { 10444 // Incoming value: jbuf 10445 // ldr r1, LCPI1_1 10446 // add r1, pc, r1 10447 // str r1, [$jbuf, #+4] ; &jbuf[1] 10448 Register NewVReg1 = MRI->createVirtualRegister(TRC); 10449 BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1) 10450 .addConstantPoolIndex(CPI) 10451 .addImm(0) 10452 .addMemOperand(CPMMO) 10453 .add(predOps(ARMCC::AL)); 10454 Register NewVReg2 = MRI->createVirtualRegister(TRC); 10455 BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2) 10456 .addReg(NewVReg1, RegState::Kill) 10457 .addImm(PCLabelId) 10458 .add(predOps(ARMCC::AL)); 10459 BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12)) 10460 .addReg(NewVReg2, RegState::Kill) 10461 .addFrameIndex(FI) 10462 .addImm(36) // &jbuf[1] :: pc 10463 .addMemOperand(FIMMOSt) 10464 .add(predOps(ARMCC::AL)); 10465 } 10466 } 10467 10468 void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, 10469 MachineBasicBlock *MBB) const { 10470 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 10471 DebugLoc dl = MI.getDebugLoc(); 10472 MachineFunction *MF = MBB->getParent(); 10473 MachineRegisterInfo *MRI = &MF->getRegInfo(); 10474 MachineFrameInfo &MFI = MF->getFrameInfo(); 10475 int FI = MFI.getFunctionContextIndex(); 10476 10477 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass 10478 : &ARM::GPRnopcRegClass; 10479 10480 // Get a mapping of the call site numbers to all of the landing pads they're 10481 // associated with. 10482 DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad; 10483 unsigned MaxCSNum = 0; 10484 for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; 10485 ++BB) { 10486 if (!BB->isEHPad()) continue; 10487 10488 // FIXME: We should assert that the EH_LABEL is the first MI in the landing 10489 // pad. 10490 for (MachineBasicBlock::iterator 10491 II = BB->begin(), IE = BB->end(); II != IE; ++II) { 10492 if (!II->isEHLabel()) continue; 10493 10494 MCSymbol *Sym = II->getOperand(0).getMCSymbol(); 10495 if (!MF->hasCallSiteLandingPad(Sym)) continue; 10496 10497 SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym); 10498 for (SmallVectorImpl<unsigned>::iterator 10499 CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end(); 10500 CSI != CSE; ++CSI) { 10501 CallSiteNumToLPad[*CSI].push_back(&*BB); 10502 MaxCSNum = std::max(MaxCSNum, *CSI); 10503 } 10504 break; 10505 } 10506 } 10507 10508 // Get an ordered list of the machine basic blocks for the jump table. 
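// (The list is indexed by call-site number, so a landing pad that serves
// several call sites can appear more than once; duplicates are filtered out
// later with SeenMBBs when the successor edges are added.)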
10509 std::vector<MachineBasicBlock*> LPadList; 10510 SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs; 10511 LPadList.reserve(CallSiteNumToLPad.size()); 10512 for (unsigned I = 1; I <= MaxCSNum; ++I) { 10513 SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I]; 10514 for (SmallVectorImpl<MachineBasicBlock*>::iterator 10515 II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) { 10516 LPadList.push_back(*II); 10517 InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end()); 10518 } 10519 } 10520 10521 assert(!LPadList.empty() && 10522 "No landing pad destinations for the dispatch jump table!"); 10523 10524 // Create the jump table and associated information. 10525 MachineJumpTableInfo *JTI = 10526 MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline); 10527 unsigned MJTI = JTI->createJumpTableIndex(LPadList); 10528 10529 // Create the MBBs for the dispatch code. 10530 10531 // Shove the dispatch's address into the return slot in the function context. 10532 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock(); 10533 DispatchBB->setIsEHPad(); 10534 10535 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 10536 unsigned trap_opcode; 10537 if (Subtarget->isThumb()) 10538 trap_opcode = ARM::tTRAP; 10539 else 10540 trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP; 10541 10542 BuildMI(TrapBB, dl, TII->get(trap_opcode)); 10543 DispatchBB->addSuccessor(TrapBB); 10544 10545 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock(); 10546 DispatchBB->addSuccessor(DispContBB); 10547 10548 // Insert the MBBs. 10549 MF->insert(MF->end(), DispatchBB); 10550 MF->insert(MF->end(), DispContBB); 10551 MF->insert(MF->end(), TrapBB); 10552 10553 // Insert code into the entry block that creates and registers the function 10554 // context. 10555 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI); 10556 10557 MachineMemOperand *FIMMOLd = MF->getMachineMemOperand( 10558 MachinePointerInfo::getFixedStack(*MF, FI), 10559 MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, Align(4)); 10560 10561 MachineInstrBuilder MIB; 10562 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup)); 10563 10564 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII); 10565 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo(); 10566 10567 // Add a register mask with no preserved registers. This results in all 10568 // registers being marked as clobbered. This can't work if the dispatch block 10569 // is in a Thumb1 function and is linked with ARM code which uses the FP 10570 // registers, as there is no way to preserve the FP registers in Thumb1 mode.
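// Put differently: the EH runtime can transfer control into the dispatch block
// from any of the invoke sites, so no register value can be assumed to survive
// into it; treating every register as clobbered is the conservative model.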
10571 MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF)); 10572 10573 bool IsPositionIndependent = isPositionIndependent(); 10574 unsigned NumLPads = LPadList.size(); 10575 if (Subtarget->isThumb2()) { 10576 Register NewVReg1 = MRI->createVirtualRegister(TRC); 10577 BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1) 10578 .addFrameIndex(FI) 10579 .addImm(4) 10580 .addMemOperand(FIMMOLd) 10581 .add(predOps(ARMCC::AL)); 10582 10583 if (NumLPads < 256) { 10584 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri)) 10585 .addReg(NewVReg1) 10586 .addImm(LPadList.size()) 10587 .add(predOps(ARMCC::AL)); 10588 } else { 10589 Register VReg1 = MRI->createVirtualRegister(TRC); 10590 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1) 10591 .addImm(NumLPads & 0xFFFF) 10592 .add(predOps(ARMCC::AL)); 10593 10594 unsigned VReg2 = VReg1; 10595 if ((NumLPads & 0xFFFF0000) != 0) { 10596 VReg2 = MRI->createVirtualRegister(TRC); 10597 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2) 10598 .addReg(VReg1) 10599 .addImm(NumLPads >> 16) 10600 .add(predOps(ARMCC::AL)); 10601 } 10602 10603 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr)) 10604 .addReg(NewVReg1) 10605 .addReg(VReg2) 10606 .add(predOps(ARMCC::AL)); 10607 } 10608 10609 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc)) 10610 .addMBB(TrapBB) 10611 .addImm(ARMCC::HI) 10612 .addReg(ARM::CPSR); 10613 10614 Register NewVReg3 = MRI->createVirtualRegister(TRC); 10615 BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3) 10616 .addJumpTableIndex(MJTI) 10617 .add(predOps(ARMCC::AL)); 10618 10619 Register NewVReg4 = MRI->createVirtualRegister(TRC); 10620 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4) 10621 .addReg(NewVReg3, RegState::Kill) 10622 .addReg(NewVReg1) 10623 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) 10624 .add(predOps(ARMCC::AL)) 10625 .add(condCodeOp()); 10626 10627 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT)) 10628 .addReg(NewVReg4, RegState::Kill) 10629 .addReg(NewVReg1) 10630 .addJumpTableIndex(MJTI); 10631 } else if (Subtarget->isThumb()) { 10632 Register NewVReg1 = MRI->createVirtualRegister(TRC); 10633 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1) 10634 .addFrameIndex(FI) 10635 .addImm(1) 10636 .addMemOperand(FIMMOLd) 10637 .add(predOps(ARMCC::AL)); 10638 10639 if (NumLPads < 256) { 10640 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8)) 10641 .addReg(NewVReg1) 10642 .addImm(NumLPads) 10643 .add(predOps(ARMCC::AL)); 10644 } else { 10645 MachineConstantPool *ConstantPool = MF->getConstantPool(); 10646 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 10647 const Constant *C = ConstantInt::get(Int32Ty, NumLPads); 10648 10649 // MachineConstantPool wants an explicit alignment. 
10650 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty); 10651 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment); 10652 10653 Register VReg1 = MRI->createVirtualRegister(TRC); 10654 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci)) 10655 .addReg(VReg1, RegState::Define) 10656 .addConstantPoolIndex(Idx) 10657 .add(predOps(ARMCC::AL)); 10658 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr)) 10659 .addReg(NewVReg1) 10660 .addReg(VReg1) 10661 .add(predOps(ARMCC::AL)); 10662 } 10663 10664 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc)) 10665 .addMBB(TrapBB) 10666 .addImm(ARMCC::HI) 10667 .addReg(ARM::CPSR); 10668 10669 Register NewVReg2 = MRI->createVirtualRegister(TRC); 10670 BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2) 10671 .addReg(ARM::CPSR, RegState::Define) 10672 .addReg(NewVReg1) 10673 .addImm(2) 10674 .add(predOps(ARMCC::AL)); 10675 10676 Register NewVReg3 = MRI->createVirtualRegister(TRC); 10677 BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3) 10678 .addJumpTableIndex(MJTI) 10679 .add(predOps(ARMCC::AL)); 10680 10681 Register NewVReg4 = MRI->createVirtualRegister(TRC); 10682 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4) 10683 .addReg(ARM::CPSR, RegState::Define) 10684 .addReg(NewVReg2, RegState::Kill) 10685 .addReg(NewVReg3) 10686 .add(predOps(ARMCC::AL)); 10687 10688 MachineMemOperand *JTMMOLd = 10689 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF), 10690 MachineMemOperand::MOLoad, 4, Align(4)); 10691 10692 Register NewVReg5 = MRI->createVirtualRegister(TRC); 10693 BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) 10694 .addReg(NewVReg4, RegState::Kill) 10695 .addImm(0) 10696 .addMemOperand(JTMMOLd) 10697 .add(predOps(ARMCC::AL)); 10698 10699 unsigned NewVReg6 = NewVReg5; 10700 if (IsPositionIndependent) { 10701 NewVReg6 = MRI->createVirtualRegister(TRC); 10702 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6) 10703 .addReg(ARM::CPSR, RegState::Define) 10704 .addReg(NewVReg5, RegState::Kill) 10705 .addReg(NewVReg3) 10706 .add(predOps(ARMCC::AL)); 10707 } 10708 10709 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr)) 10710 .addReg(NewVReg6, RegState::Kill) 10711 .addJumpTableIndex(MJTI); 10712 } else { 10713 Register NewVReg1 = MRI->createVirtualRegister(TRC); 10714 BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1) 10715 .addFrameIndex(FI) 10716 .addImm(4) 10717 .addMemOperand(FIMMOLd) 10718 .add(predOps(ARMCC::AL)); 10719 10720 if (NumLPads < 256) { 10721 BuildMI(DispatchBB, dl, TII->get(ARM::CMPri)) 10722 .addReg(NewVReg1) 10723 .addImm(NumLPads) 10724 .add(predOps(ARMCC::AL)); 10725 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) { 10726 Register VReg1 = MRI->createVirtualRegister(TRC); 10727 BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1) 10728 .addImm(NumLPads & 0xFFFF) 10729 .add(predOps(ARMCC::AL)); 10730 10731 unsigned VReg2 = VReg1; 10732 if ((NumLPads & 0xFFFF0000) != 0) { 10733 VReg2 = MRI->createVirtualRegister(TRC); 10734 BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2) 10735 .addReg(VReg1) 10736 .addImm(NumLPads >> 16) 10737 .add(predOps(ARMCC::AL)); 10738 } 10739 10740 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) 10741 .addReg(NewVReg1) 10742 .addReg(VReg2) 10743 .add(predOps(ARMCC::AL)); 10744 } else { 10745 MachineConstantPool *ConstantPool = MF->getConstantPool(); 10746 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 10747 const Constant *C = ConstantInt::get(Int32Ty, NumLPads); 10748 10749 // MachineConstantPool wants an explicit 
alignment. 10750 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty); 10751 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment); 10752 10753 Register VReg1 = MRI->createVirtualRegister(TRC); 10754 BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp)) 10755 .addReg(VReg1, RegState::Define) 10756 .addConstantPoolIndex(Idx) 10757 .addImm(0) 10758 .add(predOps(ARMCC::AL)); 10759 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) 10760 .addReg(NewVReg1) 10761 .addReg(VReg1, RegState::Kill) 10762 .add(predOps(ARMCC::AL)); 10763 } 10764 10765 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc)) 10766 .addMBB(TrapBB) 10767 .addImm(ARMCC::HI) 10768 .addReg(ARM::CPSR); 10769 10770 Register NewVReg3 = MRI->createVirtualRegister(TRC); 10771 BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3) 10772 .addReg(NewVReg1) 10773 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) 10774 .add(predOps(ARMCC::AL)) 10775 .add(condCodeOp()); 10776 Register NewVReg4 = MRI->createVirtualRegister(TRC); 10777 BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4) 10778 .addJumpTableIndex(MJTI) 10779 .add(predOps(ARMCC::AL)); 10780 10781 MachineMemOperand *JTMMOLd = 10782 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF), 10783 MachineMemOperand::MOLoad, 4, Align(4)); 10784 Register NewVReg5 = MRI->createVirtualRegister(TRC); 10785 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) 10786 .addReg(NewVReg3, RegState::Kill) 10787 .addReg(NewVReg4) 10788 .addImm(0) 10789 .addMemOperand(JTMMOLd) 10790 .add(predOps(ARMCC::AL)); 10791 10792 if (IsPositionIndependent) { 10793 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd)) 10794 .addReg(NewVReg5, RegState::Kill) 10795 .addReg(NewVReg4) 10796 .addJumpTableIndex(MJTI); 10797 } else { 10798 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr)) 10799 .addReg(NewVReg5, RegState::Kill) 10800 .addJumpTableIndex(MJTI); 10801 } 10802 } 10803 10804 // Add the jump table entries as successors to the MBB. 10805 SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs; 10806 for (std::vector<MachineBasicBlock*>::iterator 10807 I = LPadList.begin(), E = LPadList.end(); I != E; ++I) { 10808 MachineBasicBlock *CurMBB = *I; 10809 if (SeenMBBs.insert(CurMBB).second) 10810 DispContBB->addSuccessor(CurMBB); 10811 } 10812 10813 // N.B. the order the invoke BBs are processed in doesn't matter here. 10814 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF); 10815 SmallVector<MachineBasicBlock*, 64> MBBLPads; 10816 for (MachineBasicBlock *BB : InvokeBBs) { 10817 10818 // Remove the landing pad successor from the invoke block and replace it 10819 // with the new dispatch block. 10820 SmallVector<MachineBasicBlock*, 4> Successors(BB->successors()); 10821 while (!Successors.empty()) { 10822 MachineBasicBlock *SMBB = Successors.pop_back_val(); 10823 if (SMBB->isEHPad()) { 10824 BB->removeSuccessor(SMBB); 10825 MBBLPads.push_back(SMBB); 10826 } 10827 } 10828 10829 BB->addSuccessor(DispatchBB, BranchProbability::getZero()); 10830 BB->normalizeSuccProbs(); 10831 10832 // Find the invoke call and mark all of the callee-saved registers as 10833 // 'implicit defined' so that they're spilled. This prevents code from 10834 // moving instructions to before the EH block, where they will never be 10835 // executed. 
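// (Concretely, the loop below adds every callee-saved register that the call
// does not already define as a dead implicit-def operand on the call
// instruction itself.)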
10836 for (MachineBasicBlock::reverse_iterator 10837 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) { 10838 if (!II->isCall()) continue; 10839 10840 DenseMap<unsigned, bool> DefRegs; 10841 for (MachineInstr::mop_iterator 10842 OI = II->operands_begin(), OE = II->operands_end(); 10843 OI != OE; ++OI) { 10844 if (!OI->isReg()) continue; 10845 DefRegs[OI->getReg()] = true; 10846 } 10847 10848 MachineInstrBuilder MIB(*MF, &*II); 10849 10850 for (unsigned i = 0; SavedRegs[i] != 0; ++i) { 10851 unsigned Reg = SavedRegs[i]; 10852 if (Subtarget->isThumb2() && 10853 !ARM::tGPRRegClass.contains(Reg) && 10854 !ARM::hGPRRegClass.contains(Reg)) 10855 continue; 10856 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg)) 10857 continue; 10858 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg)) 10859 continue; 10860 if (!DefRegs[Reg]) 10861 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); 10862 } 10863 10864 break; 10865 } 10866 } 10867 10868 // Mark all former landing pads as non-landing pads. The dispatch is the only 10869 // landing pad now. 10870 for (SmallVectorImpl<MachineBasicBlock*>::iterator 10871 I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I) 10872 (*I)->setIsEHPad(false); 10873 10874 // The instruction is gone now. 10875 MI.eraseFromParent(); 10876 } 10877 10878 static 10879 MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) { 10880 for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), 10881 E = MBB->succ_end(); I != E; ++I) 10882 if (*I != Succ) 10883 return *I; 10884 llvm_unreachable("Expecting a BB with two successors!"); 10885 } 10886 10887 /// Return the load opcode for a given load size. If load size >= 8, 10888 /// neon opcode will be returned. 10889 static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) { 10890 if (LdSize >= 8) 10891 return LdSize == 16 ? ARM::VLD1q32wb_fixed 10892 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0; 10893 if (IsThumb1) 10894 return LdSize == 4 ? ARM::tLDRi 10895 : LdSize == 2 ? ARM::tLDRHi 10896 : LdSize == 1 ? ARM::tLDRBi : 0; 10897 if (IsThumb2) 10898 return LdSize == 4 ? ARM::t2LDR_POST 10899 : LdSize == 2 ? ARM::t2LDRH_POST 10900 : LdSize == 1 ? ARM::t2LDRB_POST : 0; 10901 return LdSize == 4 ? ARM::LDR_POST_IMM 10902 : LdSize == 2 ? ARM::LDRH_POST 10903 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0; 10904 } 10905 10906 /// Return the store opcode for a given store size. If store size >= 8, 10907 /// neon opcode will be returned. 10908 static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) { 10909 if (StSize >= 8) 10910 return StSize == 16 ? ARM::VST1q32wb_fixed 10911 : StSize == 8 ? ARM::VST1d32wb_fixed : 0; 10912 if (IsThumb1) 10913 return StSize == 4 ? ARM::tSTRi 10914 : StSize == 2 ? ARM::tSTRHi 10915 : StSize == 1 ? ARM::tSTRBi : 0; 10916 if (IsThumb2) 10917 return StSize == 4 ? ARM::t2STR_POST 10918 : StSize == 2 ? ARM::t2STRH_POST 10919 : StSize == 1 ? ARM::t2STRB_POST : 0; 10920 return StSize == 4 ? ARM::STR_POST_IMM 10921 : StSize == 2 ? ARM::STRH_POST 10922 : StSize == 1 ? ARM::STRB_POST_IMM : 0; 10923 } 10924 10925 /// Emit a post-increment load operation with given size. The instructions 10926 /// will be added to BB at Pos. 
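/// For example, LdSize == 4 in ARM mode selects LDR_POST_IMM, loading into
/// Data from [AddrIn] and writing the incremented address to AddrOut; Thumb1
/// has no post-indexed load, so a plain tLDRi followed by a tADDi8 of the
/// address is emitted instead.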
10927 static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, 10928 const TargetInstrInfo *TII, const DebugLoc &dl, 10929 unsigned LdSize, unsigned Data, unsigned AddrIn, 10930 unsigned AddrOut, bool IsThumb1, bool IsThumb2) { 10931 unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2); 10932 assert(LdOpc != 0 && "Should have a load opcode"); 10933 if (LdSize >= 8) { 10934 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 10935 .addReg(AddrOut, RegState::Define) 10936 .addReg(AddrIn) 10937 .addImm(0) 10938 .add(predOps(ARMCC::AL)); 10939 } else if (IsThumb1) { 10940 // load + update AddrIn 10941 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 10942 .addReg(AddrIn) 10943 .addImm(0) 10944 .add(predOps(ARMCC::AL)); 10945 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut) 10946 .add(t1CondCodeOp()) 10947 .addReg(AddrIn) 10948 .addImm(LdSize) 10949 .add(predOps(ARMCC::AL)); 10950 } else if (IsThumb2) { 10951 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 10952 .addReg(AddrOut, RegState::Define) 10953 .addReg(AddrIn) 10954 .addImm(LdSize) 10955 .add(predOps(ARMCC::AL)); 10956 } else { // arm 10957 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 10958 .addReg(AddrOut, RegState::Define) 10959 .addReg(AddrIn) 10960 .addReg(0) 10961 .addImm(LdSize) 10962 .add(predOps(ARMCC::AL)); 10963 } 10964 } 10965 10966 /// Emit a post-increment store operation with given size. The instructions 10967 /// will be added to BB at Pos. 10968 static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, 10969 const TargetInstrInfo *TII, const DebugLoc &dl, 10970 unsigned StSize, unsigned Data, unsigned AddrIn, 10971 unsigned AddrOut, bool IsThumb1, bool IsThumb2) { 10972 unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2); 10973 assert(StOpc != 0 && "Should have a store opcode"); 10974 if (StSize >= 8) { 10975 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 10976 .addReg(AddrIn) 10977 .addImm(0) 10978 .addReg(Data) 10979 .add(predOps(ARMCC::AL)); 10980 } else if (IsThumb1) { 10981 // store + update AddrIn 10982 BuildMI(*BB, Pos, dl, TII->get(StOpc)) 10983 .addReg(Data) 10984 .addReg(AddrIn) 10985 .addImm(0) 10986 .add(predOps(ARMCC::AL)); 10987 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut) 10988 .add(t1CondCodeOp()) 10989 .addReg(AddrIn) 10990 .addImm(StSize) 10991 .add(predOps(ARMCC::AL)); 10992 } else if (IsThumb2) { 10993 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 10994 .addReg(Data) 10995 .addReg(AddrIn) 10996 .addImm(StSize) 10997 .add(predOps(ARMCC::AL)); 10998 } else { // arm 10999 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 11000 .addReg(Data) 11001 .addReg(AddrIn) 11002 .addReg(0) 11003 .addImm(StSize) 11004 .add(predOps(ARMCC::AL)); 11005 } 11006 } 11007 11008 MachineBasicBlock * 11009 ARMTargetLowering::EmitStructByval(MachineInstr &MI, 11010 MachineBasicBlock *BB) const { 11011 // This pseudo instruction has 3 operands: dst, src, size 11012 // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold(). 11013 // Otherwise, we will generate unrolled scalar copies. 
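// For example: a 10-byte copy with 4-byte alignment and no NEON gives
// UnitSize = 4, LoopSize = 8 and BytesLeft = 2, i.e. two post-incremented word
// copies followed by two byte copies when below the threshold; a 32-byte,
// 16-byte-aligned copy on a NEON-capable target would instead use two 16-byte
// VLD1/VST1 pairs. (Illustrative numbers only.)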
11014 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 11015 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 11016 MachineFunction::iterator It = ++BB->getIterator(); 11017 11018 Register dest = MI.getOperand(0).getReg(); 11019 Register src = MI.getOperand(1).getReg(); 11020 unsigned SizeVal = MI.getOperand(2).getImm(); 11021 unsigned Alignment = MI.getOperand(3).getImm(); 11022 DebugLoc dl = MI.getDebugLoc(); 11023 11024 MachineFunction *MF = BB->getParent(); 11025 MachineRegisterInfo &MRI = MF->getRegInfo(); 11026 unsigned UnitSize = 0; 11027 const TargetRegisterClass *TRC = nullptr; 11028 const TargetRegisterClass *VecTRC = nullptr; 11029 11030 bool IsThumb1 = Subtarget->isThumb1Only(); 11031 bool IsThumb2 = Subtarget->isThumb2(); 11032 bool IsThumb = Subtarget->isThumb(); 11033 11034 if (Alignment & 1) { 11035 UnitSize = 1; 11036 } else if (Alignment & 2) { 11037 UnitSize = 2; 11038 } else { 11039 // Check whether we can use NEON instructions. 11040 if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) && 11041 Subtarget->hasNEON()) { 11042 if ((Alignment % 16 == 0) && SizeVal >= 16) 11043 UnitSize = 16; 11044 else if ((Alignment % 8 == 0) && SizeVal >= 8) 11045 UnitSize = 8; 11046 } 11047 // Can't use NEON instructions. 11048 if (UnitSize == 0) 11049 UnitSize = 4; 11050 } 11051 11052 // Select the correct opcode and register class for unit size load/store 11053 bool IsNeon = UnitSize >= 8; 11054 TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass; 11055 if (IsNeon) 11056 VecTRC = UnitSize == 16 ? &ARM::DPairRegClass 11057 : UnitSize == 8 ? &ARM::DPRRegClass 11058 : nullptr; 11059 11060 unsigned BytesLeft = SizeVal % UnitSize; 11061 unsigned LoopSize = SizeVal - BytesLeft; 11062 11063 if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) { 11064 // Use LDR and STR to copy. 11065 // [scratch, srcOut] = LDR_POST(srcIn, UnitSize) 11066 // [destOut] = STR_POST(scratch, destIn, UnitSize) 11067 unsigned srcIn = src; 11068 unsigned destIn = dest; 11069 for (unsigned i = 0; i < LoopSize; i+=UnitSize) { 11070 Register srcOut = MRI.createVirtualRegister(TRC); 11071 Register destOut = MRI.createVirtualRegister(TRC); 11072 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); 11073 emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut, 11074 IsThumb1, IsThumb2); 11075 emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut, 11076 IsThumb1, IsThumb2); 11077 srcIn = srcOut; 11078 destIn = destOut; 11079 } 11080 11081 // Handle the leftover bytes with LDRB and STRB. 11082 // [scratch, srcOut] = LDRB_POST(srcIn, 1) 11083 // [destOut] = STRB_POST(scratch, destIn, 1) 11084 for (unsigned i = 0; i < BytesLeft; i++) { 11085 Register srcOut = MRI.createVirtualRegister(TRC); 11086 Register destOut = MRI.createVirtualRegister(TRC); 11087 Register scratch = MRI.createVirtualRegister(TRC); 11088 emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut, 11089 IsThumb1, IsThumb2); 11090 emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut, 11091 IsThumb1, IsThumb2); 11092 srcIn = srcOut; 11093 destIn = destOut; 11094 } 11095 MI.eraseFromParent(); // The instruction is gone now. 11096 return BB; 11097 } 11098 11099 // Expand the pseudo op to a loop. 11100 // thisMBB: 11101 // ... 
11102 // movw varEnd, # --> with thumb2 11103 // movt varEnd, # 11104 // ldrcp varEnd, idx --> without thumb2 11105 // fallthrough --> loopMBB 11106 // loopMBB: 11107 // PHI varPhi, varEnd, varLoop 11108 // PHI srcPhi, src, srcLoop 11109 // PHI destPhi, dst, destLoop 11110 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) 11111 // [destLoop] = STR_POST(scratch, destPhi, UnitSize) 11112 // subs varLoop, varPhi, #UnitSize 11113 // bne loopMBB 11114 // fallthrough --> exitMBB 11115 // exitMBB: 11116 // epilogue to handle left-over bytes 11117 // [scratch, srcOut] = LDRB_POST(srcLoop, 1) 11118 // [destOut] = STRB_POST(scratch, destLoop, 1) 11119 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); 11120 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); 11121 MF->insert(It, loopMBB); 11122 MF->insert(It, exitMBB); 11123 11124 // Transfer the remainder of BB and its successor edges to exitMBB. 11125 exitMBB->splice(exitMBB->begin(), BB, 11126 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 11127 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 11128 11129 // Load an immediate to varEnd. 11130 Register varEnd = MRI.createVirtualRegister(TRC); 11131 if (Subtarget->useMovt()) { 11132 unsigned Vtmp = varEnd; 11133 if ((LoopSize & 0xFFFF0000) != 0) 11134 Vtmp = MRI.createVirtualRegister(TRC); 11135 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp) 11136 .addImm(LoopSize & 0xFFFF) 11137 .add(predOps(ARMCC::AL)); 11138 11139 if ((LoopSize & 0xFFFF0000) != 0) 11140 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd) 11141 .addReg(Vtmp) 11142 .addImm(LoopSize >> 16) 11143 .add(predOps(ARMCC::AL)); 11144 } else { 11145 MachineConstantPool *ConstantPool = MF->getConstantPool(); 11146 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 11147 const Constant *C = ConstantInt::get(Int32Ty, LoopSize); 11148 11149 // MachineConstantPool wants an explicit alignment. 
11150 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty); 11151 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment); 11152 MachineMemOperand *CPMMO = 11153 MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), 11154 MachineMemOperand::MOLoad, 4, Align(4)); 11155 11156 if (IsThumb) 11157 BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)) 11158 .addReg(varEnd, RegState::Define) 11159 .addConstantPoolIndex(Idx) 11160 .add(predOps(ARMCC::AL)) 11161 .addMemOperand(CPMMO); 11162 else 11163 BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)) 11164 .addReg(varEnd, RegState::Define) 11165 .addConstantPoolIndex(Idx) 11166 .addImm(0) 11167 .add(predOps(ARMCC::AL)) 11168 .addMemOperand(CPMMO); 11169 } 11170 BB->addSuccessor(loopMBB); 11171 11172 // Generate the loop body: 11173 // varPhi = PHI(varLoop, varEnd) 11174 // srcPhi = PHI(srcLoop, src) 11175 // destPhi = PHI(destLoop, dst) 11176 MachineBasicBlock *entryBB = BB; 11177 BB = loopMBB; 11178 Register varLoop = MRI.createVirtualRegister(TRC); 11179 Register varPhi = MRI.createVirtualRegister(TRC); 11180 Register srcLoop = MRI.createVirtualRegister(TRC); 11181 Register srcPhi = MRI.createVirtualRegister(TRC); 11182 Register destLoop = MRI.createVirtualRegister(TRC); 11183 Register destPhi = MRI.createVirtualRegister(TRC); 11184 11185 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi) 11186 .addReg(varLoop).addMBB(loopMBB) 11187 .addReg(varEnd).addMBB(entryBB); 11188 BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi) 11189 .addReg(srcLoop).addMBB(loopMBB) 11190 .addReg(src).addMBB(entryBB); 11191 BuildMI(BB, dl, TII->get(ARM::PHI), destPhi) 11192 .addReg(destLoop).addMBB(loopMBB) 11193 .addReg(dest).addMBB(entryBB); 11194 11195 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) 11196 // [destLoop] = STR_POST(scratch, destPhi, UnitSize) 11197 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); 11198 emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop, 11199 IsThumb1, IsThumb2); 11200 emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop, 11201 IsThumb1, IsThumb2); 11202 11203 // Decrement loop variable by UnitSize. 11204 if (IsThumb1) { 11205 BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop) 11206 .add(t1CondCodeOp()) 11207 .addReg(varPhi) 11208 .addImm(UnitSize) 11209 .add(predOps(ARMCC::AL)); 11210 } else { 11211 MachineInstrBuilder MIB = 11212 BuildMI(*BB, BB->end(), dl, 11213 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop); 11214 MIB.addReg(varPhi) 11215 .addImm(UnitSize) 11216 .add(predOps(ARMCC::AL)) 11217 .add(condCodeOp()); 11218 MIB->getOperand(5).setReg(ARM::CPSR); 11219 MIB->getOperand(5).setIsDef(true); 11220 } 11221 BuildMI(*BB, BB->end(), dl, 11222 TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc)) 11223 .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR); 11224 11225 // loopMBB can loop back to loopMBB or fall through to exitMBB. 11226 BB->addSuccessor(loopMBB); 11227 BB->addSuccessor(exitMBB); 11228 11229 // Add epilogue to handle BytesLeft.
11230 BB = exitMBB; 11231 auto StartOfExit = exitMBB->begin(); 11232 11233 // [scratch, srcOut] = LDRB_POST(srcLoop, 1) 11234 // [destOut] = STRB_POST(scratch, destLoop, 1) 11235 unsigned srcIn = srcLoop; 11236 unsigned destIn = destLoop; 11237 for (unsigned i = 0; i < BytesLeft; i++) { 11238 Register srcOut = MRI.createVirtualRegister(TRC); 11239 Register destOut = MRI.createVirtualRegister(TRC); 11240 Register scratch = MRI.createVirtualRegister(TRC); 11241 emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut, 11242 IsThumb1, IsThumb2); 11243 emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut, 11244 IsThumb1, IsThumb2); 11245 srcIn = srcOut; 11246 destIn = destOut; 11247 } 11248 11249 MI.eraseFromParent(); // The instruction is gone now. 11250 return BB; 11251 } 11252 11253 MachineBasicBlock * 11254 ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI, 11255 MachineBasicBlock *MBB) const { 11256 const TargetMachine &TM = getTargetMachine(); 11257 const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); 11258 DebugLoc DL = MI.getDebugLoc(); 11259 11260 assert(Subtarget->isTargetWindows() && 11261 "__chkstk is only supported on Windows"); 11262 assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode"); 11263 11264 // __chkstk takes the number of words to allocate on the stack in R4, and 11265 // returns the stack adjustment in number of bytes in R4. This will not 11266 // clobber any other registers (other than the obvious lr). 11267 // 11268 // Although, technically, IP should be considered a register which may be 11269 // clobbered, the call itself will not touch it. Windows on ARM is a pure 11270 // thumb-2 environment, so there is no interworking required. As a result, we 11271 // do not expect a veneer to be emitted by the linker, clobbering IP. 11272 // 11273 // Each module receives its own copy of __chkstk, so no import thunk is 11274 // required, again, ensuring that IP is not clobbered. 11275 // 11276 // Finally, although some linkers may theoretically provide a trampoline for 11277 // out of range calls (which is quite common due to a 32M range limitation of 11278 // branches for Thumb), we can generate the long-call version via 11279 // -mcmodel=large, alleviating the need for the trampoline which may clobber 11280 // IP.
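// A rough sketch of the expansion built below (illustrative only):
//   bl __chkstk          ; r4 in: words to allocate, r4 out: bytes
//   sub.w sp, sp, r4
// For CodeModel::Large the address of __chkstk is first materialized into a
// scratch register (movw/movt via t2MOVi32imm) and the call is made with blx.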
11281 11282 switch (TM.getCodeModel()) { 11283 case CodeModel::Tiny: 11284 llvm_unreachable("Tiny code model not available on ARM."); 11285 case CodeModel::Small: 11286 case CodeModel::Medium: 11287 case CodeModel::Kernel: 11288 BuildMI(*MBB, MI, DL, TII.get(ARM::tBL)) 11289 .add(predOps(ARMCC::AL)) 11290 .addExternalSymbol("__chkstk") 11291 .addReg(ARM::R4, RegState::Implicit | RegState::Kill) 11292 .addReg(ARM::R4, RegState::Implicit | RegState::Define) 11293 .addReg(ARM::R12, 11294 RegState::Implicit | RegState::Define | RegState::Dead) 11295 .addReg(ARM::CPSR, 11296 RegState::Implicit | RegState::Define | RegState::Dead); 11297 break; 11298 case CodeModel::Large: { 11299 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 11300 Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11301 11302 BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg) 11303 .addExternalSymbol("__chkstk"); 11304 BuildMI(*MBB, MI, DL, TII.get(gettBLXrOpcode(*MBB->getParent()))) 11305 .add(predOps(ARMCC::AL)) 11306 .addReg(Reg, RegState::Kill) 11307 .addReg(ARM::R4, RegState::Implicit | RegState::Kill) 11308 .addReg(ARM::R4, RegState::Implicit | RegState::Define) 11309 .addReg(ARM::R12, 11310 RegState::Implicit | RegState::Define | RegState::Dead) 11311 .addReg(ARM::CPSR, 11312 RegState::Implicit | RegState::Define | RegState::Dead); 11313 break; 11314 } 11315 } 11316 11317 BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP) 11318 .addReg(ARM::SP, RegState::Kill) 11319 .addReg(ARM::R4, RegState::Kill) 11320 .setMIFlags(MachineInstr::FrameSetup) 11321 .add(predOps(ARMCC::AL)) 11322 .add(condCodeOp()); 11323 11324 MI.eraseFromParent(); 11325 return MBB; 11326 } 11327 11328 MachineBasicBlock * 11329 ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI, 11330 MachineBasicBlock *MBB) const { 11331 DebugLoc DL = MI.getDebugLoc(); 11332 MachineFunction *MF = MBB->getParent(); 11333 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 11334 11335 MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock(); 11336 MF->insert(++MBB->getIterator(), ContBB); 11337 ContBB->splice(ContBB->begin(), MBB, 11338 std::next(MachineBasicBlock::iterator(MI)), MBB->end()); 11339 ContBB->transferSuccessorsAndUpdatePHIs(MBB); 11340 MBB->addSuccessor(ContBB); 11341 11342 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 11343 BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0)); 11344 MF->push_back(TrapBB); 11345 MBB->addSuccessor(TrapBB); 11346 11347 BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8)) 11348 .addReg(MI.getOperand(0).getReg()) 11349 .addImm(0) 11350 .add(predOps(ARMCC::AL)); 11351 BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc)) 11352 .addMBB(TrapBB) 11353 .addImm(ARMCC::EQ) 11354 .addReg(ARM::CPSR); 11355 11356 MI.eraseFromParent(); 11357 return ContBB; 11358 } 11359 11360 // The CPSR operand of SelectItr might be missing a kill marker 11361 // because there were multiple uses of CPSR, and ISel didn't know 11362 // which to mark. Figure out whether SelectItr should have had a 11363 // kill marker, and set it if it should. Returns the correct kill 11364 // marker value. 11365 static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr, 11366 MachineBasicBlock* BB, 11367 const TargetRegisterInfo* TRI) { 11368 // Scan forward through BB for a use/def of CPSR. 
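// A use seen before any def means CPSR is still live past SelectItr, so no
// kill flag may be added; hitting a def (or falling off the end with CPSR not
// live-in to any successor) means the value dies here and the flag is safe.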
11369 MachineBasicBlock::iterator miI(std::next(SelectItr)); 11370 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { 11371 const MachineInstr& mi = *miI; 11372 if (mi.readsRegister(ARM::CPSR)) 11373 return false; 11374 if (mi.definesRegister(ARM::CPSR)) 11375 break; // Should have kill-flag - update below. 11376 } 11377 11378 // If we hit the end of the block, check whether CPSR is live into a 11379 // successor. 11380 if (miI == BB->end()) { 11381 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), 11382 sEnd = BB->succ_end(); 11383 sItr != sEnd; ++sItr) { 11384 MachineBasicBlock* succ = *sItr; 11385 if (succ->isLiveIn(ARM::CPSR)) 11386 return false; 11387 } 11388 } 11389 11390 // We found a def, or hit the end of the basic block and CPSR wasn't live 11391 // out. SelectMI should have a kill flag on CPSR. 11392 SelectItr->addRegisterKilled(ARM::CPSR, TRI); 11393 return true; 11394 } 11395 11396 /// Adds logic in loop entry MBB to calculate loop iteration count and adds 11397 /// t2WhileLoopSetup and t2WhileLoopStart to generate WLS loop 11398 static Register genTPEntry(MachineBasicBlock *TpEntry, 11399 MachineBasicBlock *TpLoopBody, 11400 MachineBasicBlock *TpExit, Register OpSizeReg, 11401 const TargetInstrInfo *TII, DebugLoc Dl, 11402 MachineRegisterInfo &MRI) { 11403 // Calculates loop iteration count = ceil(n/16) = (n + 15) >> 4. 11404 Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11405 BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg) 11406 .addUse(OpSizeReg) 11407 .addImm(15) 11408 .add(predOps(ARMCC::AL)) 11409 .addReg(0); 11410 11411 Register LsrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11412 BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg) 11413 .addUse(AddDestReg, RegState::Kill) 11414 .addImm(4) 11415 .add(predOps(ARMCC::AL)) 11416 .addReg(0); 11417 11418 Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass); 11419 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg) 11420 .addUse(LsrDestReg, RegState::Kill); 11421 11422 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart)) 11423 .addUse(TotalIterationsReg) 11424 .addMBB(TpExit); 11425 11426 BuildMI(TpEntry, Dl, TII->get(ARM::t2B)) 11427 .addMBB(TpLoopBody) 11428 .add(predOps(ARMCC::AL)); 11429 11430 return TotalIterationsReg; 11431 } 11432 11433 /// Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and 11434 /// t2DoLoopEnd. These are used by later passes to generate tail predicated 11435 /// loops. 11436 static void genTPLoopBody(MachineBasicBlock *TpLoopBody, 11437 MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit, 11438 const TargetInstrInfo *TII, DebugLoc Dl, 11439 MachineRegisterInfo &MRI, Register OpSrcReg, 11440 Register OpDestReg, Register ElementCountReg, 11441 Register TotalIterationsReg, bool IsMemcpy) { 11442 // First insert 4 PHI nodes for: Current pointer to Src (if memcpy), Dest 11443 // array, loop iteration counter, predication counter. 
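// The body built below corresponds roughly to the following MIR (illustrative;
// virtual register names are invented):
//   %src    = PHI %OpSrcReg, %TpEntry, %srcNext, %TpLoopBody     ; memcpy only
//   %dst    = PHI %OpDestReg, %TpEntry, %dstNext, %TpLoopBody
//   %cnt    = PHI %TotalIterationsReg, %TpEntry, %cntNext, %TpLoopBody
//   %elems  = PHI %ElementCountReg, %TpEntry, %elemsNext, %TpLoopBody
//   %vpr    = MVE_VCTP8 %elems
//   %elemsNext = t2SUBri %elems, 16
//   %srcNext, %val = MVE_VLDRBU8_post %src, 16  (predicated on %vpr) ; memcpy only
//   %dstNext = MVE_VSTRBU8_post %val, %dst, 16  (predicated on %vpr)
//   %cntNext = t2LoopDec %cnt, 1
//   t2LoopEnd %cntNext, %TpLoopBody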
11444 11445 Register SrcPhiReg, CurrSrcReg; 11446 if (IsMemcpy) { 11447 // Current position in the src array 11448 SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11449 CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11450 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg) 11451 .addUse(OpSrcReg) 11452 .addMBB(TpEntry) 11453 .addUse(CurrSrcReg) 11454 .addMBB(TpLoopBody); 11455 } 11456 11457 // Current position in the dest array 11458 Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11459 Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11460 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg) 11461 .addUse(OpDestReg) 11462 .addMBB(TpEntry) 11463 .addUse(CurrDestReg) 11464 .addMBB(TpLoopBody); 11465 11466 // Current loop counter 11467 Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass); 11468 Register RemainingLoopIterationsReg = 11469 MRI.createVirtualRegister(&ARM::GPRlrRegClass); 11470 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg) 11471 .addUse(TotalIterationsReg) 11472 .addMBB(TpEntry) 11473 .addUse(RemainingLoopIterationsReg) 11474 .addMBB(TpLoopBody); 11475 11476 // Predication counter 11477 Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11478 Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11479 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg) 11480 .addUse(ElementCountReg) 11481 .addMBB(TpEntry) 11482 .addUse(RemainingElementsReg) 11483 .addMBB(TpLoopBody); 11484 11485 // Pass predication counter to VCTP 11486 Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass); 11487 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg) 11488 .addUse(PredCounterPhiReg) 11489 .addImm(ARMVCC::None) 11490 .addReg(0); 11491 11492 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg) 11493 .addUse(PredCounterPhiReg) 11494 .addImm(16) 11495 .add(predOps(ARMCC::AL)) 11496 .addReg(0); 11497 11498 // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR 11499 Register SrcValueReg; 11500 if (IsMemcpy) { 11501 SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass); 11502 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post)) 11503 .addDef(CurrSrcReg) 11504 .addDef(SrcValueReg) 11505 .addReg(SrcPhiReg) 11506 .addImm(16) 11507 .addImm(ARMVCC::Then) 11508 .addUse(VccrReg); 11509 } else 11510 SrcValueReg = OpSrcReg; 11511 11512 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post)) 11513 .addDef(CurrDestReg) 11514 .addUse(SrcValueReg) 11515 .addReg(DestPhiReg) 11516 .addImm(16) 11517 .addImm(ARMVCC::Then) 11518 .addUse(VccrReg); 11519 11520 // Add the pseudoInstrs for decrementing the loop counter and marking the 11521 // end:t2DoLoopDec and t2DoLoopEnd 11522 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg) 11523 .addUse(LoopCounterPhiReg) 11524 .addImm(1); 11525 11526 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd)) 11527 .addUse(RemainingLoopIterationsReg) 11528 .addMBB(TpLoopBody); 11529 11530 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B)) 11531 .addMBB(TpExit) 11532 .add(predOps(ARMCC::AL)); 11533 } 11534 11535 MachineBasicBlock * 11536 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, 11537 MachineBasicBlock *BB) const { 11538 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 11539 DebugLoc dl = MI.getDebugLoc(); 11540 bool isThumb2 = Subtarget->isThumb2(); 11541 switch (MI.getOpcode()) { 11542 default: { 11543 
MI.print(errs());
11544 llvm_unreachable("Unexpected instr type to insert");
11545 }
11546
11547 // Thumb1 post-indexed loads are really just single-register LDMs.
11548 case ARM::tLDR_postidx: {
11549 MachineOperand Def(MI.getOperand(1));
11550 BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
11551 .add(Def) // Rn_wb
11552 .add(MI.getOperand(2)) // Rn
11553 .add(MI.getOperand(3)) // PredImm
11554 .add(MI.getOperand(4)) // PredReg
11555 .add(MI.getOperand(0)) // Rt
11556 .cloneMemRefs(MI);
11557 MI.eraseFromParent();
11558 return BB;
11559 }
11560
11561 case ARM::MVE_MEMCPYLOOPINST:
11562 case ARM::MVE_MEMSETLOOPINST: {
11563
11564 // Transformation below expands MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST Pseudo
11565 // into a Tail Predicated (TP) Loop. It adds the instructions to calculate
11566 // the iteration count = ceil(size_in_bytes/16) in the TP entry block and
11567 // adds the relevant instructions in the TP loop Body for generation of a
11568 // WLSTP loop.
11569
11570 // Below is the relevant portion of the CFG after the transformation.
11571 // The Machine Basic Blocks are shown along with branch conditions (in
11572 // brackets). Note that TP entry/exit MBBs depict the entry/exit of this
11573 // portion of the CFG and may not necessarily be the entry/exit of the
11574 // function.
11575
11576 // (Relevant) CFG after transformation:
11577 // TP entry MBB
11578 // |
11579 // |-----------------|
11580 // (n <= 0) (n > 0)
11581 // | |
11582 // | TP loop Body MBB<--|
11583 // | | |
11584 // \ |___________|
11585 // \ /
11586 // TP exit MBB
11587
11588 MachineFunction *MF = BB->getParent();
11589 MachineFunctionProperties &Properties = MF->getProperties();
11590 MachineRegisterInfo &MRI = MF->getRegInfo();
11591
11592 Register OpDestReg = MI.getOperand(0).getReg();
11593 Register OpSrcReg = MI.getOperand(1).getReg();
11594 Register OpSizeReg = MI.getOperand(2).getReg();
11595
11596 // Allocate the required MBBs and add to parent function.
11597 MachineBasicBlock *TpEntry = BB;
11598 MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();
11599 MachineBasicBlock *TpExit;
11600
11601 MF->push_back(TpLoopBody);
11602
11603 // If any instructions are present in the current block after
11604 // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and
11605 // move the instructions into the newly created exit block. If there are no
11606 // instructions, add an explicit branch to the FallThrough block and then
11607 // split.
11608 //
11609 // The split is required for two reasons:
11610 // 1) A terminator (t2WhileLoopStart) will be placed at that site.
11611 // 2) Since a TPLoopBody will be added later, any phis in successive blocks
11612 // need to be updated. splitAt() already handles this.
TpExit = BB->splitAt(MI, false);
11614 if (TpExit == BB) {
11615 assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the "
11616 "block containing memcpy/memset Pseudo");
11617 TpExit = BB->getFallThrough();
11618 BuildMI(BB, dl, TII->get(ARM::t2B))
11619 .addMBB(TpExit)
11620 .add(predOps(ARMCC::AL));
11621 TpExit = BB->splitAt(MI, false);
11622 }
11623
11624 // Add logic for iteration count
11625 Register TotalIterationsReg =
11626 genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);
11627
11628 // Add the vectorized (and predicated) load/store instructions
11629 bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST;
11630 genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
11631 OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy);
11632
11633 // Required to avoid conflict with the MachineVerifier during testing.
11634 Properties.reset(MachineFunctionProperties::Property::NoPHIs);
11635
11636 // Connect the blocks
11637 TpEntry->addSuccessor(TpLoopBody);
11638 TpLoopBody->addSuccessor(TpLoopBody);
11639 TpLoopBody->addSuccessor(TpExit);
11640
11641 // Reorder for a more natural layout
11642 TpLoopBody->moveAfter(TpEntry);
11643 TpExit->moveAfter(TpLoopBody);
11644
11645 // Finally, remove the memcpy pseudo instruction
11646 MI.eraseFromParent();
11647
11648 // Return the exit block as it may contain other instructions requiring a
11649 // custom inserter
11650 return TpExit;
11651 }
11652
11653 // The Thumb2 pre-indexed stores have the same MI operands; they just
11654 // define them differently in the .td files from the isel patterns, so
11655 // they need pseudos.
11656 case ARM::t2STR_preidx:
11657 MI.setDesc(TII->get(ARM::t2STR_PRE));
11658 return BB;
11659 case ARM::t2STRB_preidx:
11660 MI.setDesc(TII->get(ARM::t2STRB_PRE));
11661 return BB;
11662 case ARM::t2STRH_preidx:
11663 MI.setDesc(TII->get(ARM::t2STRH_PRE));
11664 return BB;
11665
11666 case ARM::STRi_preidx:
11667 case ARM::STRBi_preidx: {
11668 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
11669 : ARM::STRB_PRE_IMM;
11670 // Decode the offset.
11671 unsigned Offset = MI.getOperand(4).getImm();
11672 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
11673 Offset = ARM_AM::getAM2Offset(Offset);
11674 if (isSub)
11675 Offset = -Offset;
11676
11677 MachineMemOperand *MMO = *MI.memoperands_begin();
11678 BuildMI(*BB, MI, dl, TII->get(NewOpc))
11679 .add(MI.getOperand(0)) // Rn_wb
11680 .add(MI.getOperand(1)) // Rt
11681 .add(MI.getOperand(2)) // Rn
11682 .addImm(Offset) // offset (skip GPR==zero_reg)
11683 .add(MI.getOperand(5)) // pred
11684 .add(MI.getOperand(6))
11685 .addMemOperand(MMO);
11686 MI.eraseFromParent();
11687 return BB;
11688 }
11689 case ARM::STRr_preidx:
11690 case ARM::STRBr_preidx:
11691 case ARM::STRH_preidx: {
11692 unsigned NewOpc;
11693 switch (MI.getOpcode()) {
11694 default: llvm_unreachable("unexpected opcode!");
11695 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
11696 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
11697 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
11698 }
11699 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
11700 for (unsigned i = 0; i < MI.getNumOperands(); ++i)
11701 MIB.add(MI.getOperand(i));
11702 MI.eraseFromParent();
11703 return BB;
11704 }
11705
11706 case ARM::tMOVCCr_pseudo: {
11707 // To "insert" a SELECT_CC instruction, we actually have to insert the
11708 // diamond control-flow pattern.
The incoming instruction knows the 11709 // destination vreg to set, the condition code register to branch on, the 11710 // true/false values to select between, and a branch opcode to use. 11711 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 11712 MachineFunction::iterator It = ++BB->getIterator(); 11713 11714 // thisMBB: 11715 // ... 11716 // TrueVal = ... 11717 // cmpTY ccX, r1, r2 11718 // bCC copy1MBB 11719 // fallthrough --> copy0MBB 11720 MachineBasicBlock *thisMBB = BB; 11721 MachineFunction *F = BB->getParent(); 11722 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 11723 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 11724 F->insert(It, copy0MBB); 11725 F->insert(It, sinkMBB); 11726 11727 // Check whether CPSR is live past the tMOVCCr_pseudo. 11728 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); 11729 if (!MI.killsRegister(ARM::CPSR) && 11730 !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) { 11731 copy0MBB->addLiveIn(ARM::CPSR); 11732 sinkMBB->addLiveIn(ARM::CPSR); 11733 } 11734 11735 // Transfer the remainder of BB and its successor edges to sinkMBB. 11736 sinkMBB->splice(sinkMBB->begin(), BB, 11737 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 11738 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 11739 11740 BB->addSuccessor(copy0MBB); 11741 BB->addSuccessor(sinkMBB); 11742 11743 BuildMI(BB, dl, TII->get(ARM::tBcc)) 11744 .addMBB(sinkMBB) 11745 .addImm(MI.getOperand(3).getImm()) 11746 .addReg(MI.getOperand(4).getReg()); 11747 11748 // copy0MBB: 11749 // %FalseValue = ... 11750 // # fallthrough to sinkMBB 11751 BB = copy0MBB; 11752 11753 // Update machine-CFG edges 11754 BB->addSuccessor(sinkMBB); 11755 11756 // sinkMBB: 11757 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 11758 // ... 11759 BB = sinkMBB; 11760 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg()) 11761 .addReg(MI.getOperand(1).getReg()) 11762 .addMBB(copy0MBB) 11763 .addReg(MI.getOperand(2).getReg()) 11764 .addMBB(thisMBB); 11765 11766 MI.eraseFromParent(); // The pseudo instruction is gone now. 11767 return BB; 11768 } 11769 11770 case ARM::BCCi64: 11771 case ARM::BCCZi64: { 11772 // If there is an unconditional branch to the other successor, remove it. 11773 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end()); 11774 11775 // Compare both parts that make up the double comparison separately for 11776 // equality. 11777 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64; 11778 11779 Register LHS1 = MI.getOperand(1).getReg(); 11780 Register LHS2 = MI.getOperand(2).getReg(); 11781 if (RHSisZero) { 11782 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 11783 .addReg(LHS1) 11784 .addImm(0) 11785 .add(predOps(ARMCC::AL)); 11786 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 11787 .addReg(LHS2).addImm(0) 11788 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 11789 } else { 11790 Register RHS1 = MI.getOperand(3).getReg(); 11791 Register RHS2 = MI.getOperand(4).getReg(); 11792 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 11793 .addReg(LHS1) 11794 .addReg(RHS1) 11795 .add(predOps(ARMCC::AL)); 11796 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 11797 .addReg(LHS2).addReg(RHS2) 11798 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 11799 } 11800 11801 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 
3 : 5).getMBB(); 11802 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB); 11803 if (MI.getOperand(0).getImm() == ARMCC::NE) 11804 std::swap(destMBB, exitMBB); 11805 11806 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) 11807 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR); 11808 if (isThumb2) 11809 BuildMI(BB, dl, TII->get(ARM::t2B)) 11810 .addMBB(exitMBB) 11811 .add(predOps(ARMCC::AL)); 11812 else 11813 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB); 11814 11815 MI.eraseFromParent(); // The pseudo instruction is gone now. 11816 return BB; 11817 } 11818 11819 case ARM::Int_eh_sjlj_setjmp: 11820 case ARM::Int_eh_sjlj_setjmp_nofp: 11821 case ARM::tInt_eh_sjlj_setjmp: 11822 case ARM::t2Int_eh_sjlj_setjmp: 11823 case ARM::t2Int_eh_sjlj_setjmp_nofp: 11824 return BB; 11825 11826 case ARM::Int_eh_sjlj_setup_dispatch: 11827 EmitSjLjDispatchBlock(MI, BB); 11828 return BB; 11829 11830 case ARM::ABS: 11831 case ARM::t2ABS: { 11832 // To insert an ABS instruction, we have to insert the 11833 // diamond control-flow pattern. The incoming instruction knows the 11834 // source vreg to test against 0, the destination vreg to set, 11835 // the condition code register to branch on, the 11836 // true/false values to select between, and a branch opcode to use. 11837 // It transforms 11838 // V1 = ABS V0 11839 // into 11840 // V2 = MOVS V0 11841 // BCC (branch to SinkBB if V0 >= 0) 11842 // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0) 11843 // SinkBB: V1 = PHI(V2, V3) 11844 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 11845 MachineFunction::iterator BBI = ++BB->getIterator(); 11846 MachineFunction *Fn = BB->getParent(); 11847 MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB); 11848 MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB); 11849 Fn->insert(BBI, RSBBB); 11850 Fn->insert(BBI, SinkBB); 11851 11852 Register ABSSrcReg = MI.getOperand(1).getReg(); 11853 Register ABSDstReg = MI.getOperand(0).getReg(); 11854 bool ABSSrcKIll = MI.getOperand(1).isKill(); 11855 bool isThumb2 = Subtarget->isThumb2(); 11856 MachineRegisterInfo &MRI = Fn->getRegInfo(); 11857 // In Thumb mode S must not be specified if source register is the SP or 11858 // PC and if destination register is the SP, so restrict register class 11859 Register NewRsbDstReg = MRI.createVirtualRegister( 11860 isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass); 11861 11862 // Transfer the remainder of BB and its successor edges to sinkMBB. 11863 SinkBB->splice(SinkBB->begin(), BB, 11864 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 11865 SinkBB->transferSuccessorsAndUpdatePHIs(BB); 11866 11867 BB->addSuccessor(RSBBB); 11868 BB->addSuccessor(SinkBB); 11869 11870 // fall through to SinkMBB 11871 RSBBB->addSuccessor(SinkBB); 11872 11873 // insert a cmp at the end of BB 11874 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 11875 .addReg(ABSSrcReg) 11876 .addImm(0) 11877 .add(predOps(ARMCC::AL)); 11878 11879 // insert a bcc with opposite CC to ARMCC::MI at the end of BB 11880 BuildMI(BB, dl, 11881 TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB) 11882 .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR); 11883 11884 // insert rsbri in RSBBB 11885 // Note: BCC and rsbri will be converted into predicated rsbmi 11886 // by if-conversion pass 11887 BuildMI(*RSBBB, RSBBB->begin(), dl, 11888 TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg) 11889 .addReg(ABSSrcReg, ABSSrcKIll ? 
RegState::Kill : 0) 11890 .addImm(0) 11891 .add(predOps(ARMCC::AL)) 11892 .add(condCodeOp()); 11893 11894 // insert PHI in SinkBB, 11895 // reuse ABSDstReg to not change uses of ABS instruction 11896 BuildMI(*SinkBB, SinkBB->begin(), dl, 11897 TII->get(ARM::PHI), ABSDstReg) 11898 .addReg(NewRsbDstReg).addMBB(RSBBB) 11899 .addReg(ABSSrcReg).addMBB(BB); 11900 11901 // remove ABS instruction 11902 MI.eraseFromParent(); 11903 11904 // return last added BB 11905 return SinkBB; 11906 } 11907 case ARM::COPY_STRUCT_BYVAL_I32: 11908 ++NumLoopByVals; 11909 return EmitStructByval(MI, BB); 11910 case ARM::WIN__CHKSTK: 11911 return EmitLowered__chkstk(MI, BB); 11912 case ARM::WIN__DBZCHK: 11913 return EmitLowered__dbzchk(MI, BB); 11914 } 11915 } 11916 11917 /// Attaches vregs to MEMCPY that it will use as scratch registers 11918 /// when it is expanded into LDM/STM. This is done as a post-isel lowering 11919 /// instead of as a custom inserter because we need the use list from the SDNode. 11920 static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, 11921 MachineInstr &MI, const SDNode *Node) { 11922 bool isThumb1 = Subtarget->isThumb1Only(); 11923 11924 DebugLoc DL = MI.getDebugLoc(); 11925 MachineFunction *MF = MI.getParent()->getParent(); 11926 MachineRegisterInfo &MRI = MF->getRegInfo(); 11927 MachineInstrBuilder MIB(*MF, MI); 11928 11929 // If the new dst/src is unused mark it as dead. 11930 if (!Node->hasAnyUseOfValue(0)) { 11931 MI.getOperand(0).setIsDead(true); 11932 } 11933 if (!Node->hasAnyUseOfValue(1)) { 11934 MI.getOperand(1).setIsDead(true); 11935 } 11936 11937 // The MEMCPY both defines and kills the scratch registers. 11938 for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) { 11939 Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass 11940 : &ARM::GPRRegClass); 11941 MIB.addReg(TmpReg, RegState::Define|RegState::Dead); 11942 } 11943 } 11944 11945 void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, 11946 SDNode *Node) const { 11947 if (MI.getOpcode() == ARM::MEMCPY) { 11948 attachMEMCPYScratchRegs(Subtarget, MI, Node); 11949 return; 11950 } 11951 11952 const MCInstrDesc *MCID = &MI.getDesc(); 11953 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB, 11954 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional 11955 // operand is still set to noreg. If needed, set the optional operand's 11956 // register to CPSR, and remove the redundant implicit def. 11957 // 11958 // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR). 11959 11960 // Rename pseudo opcodes. 
11961 unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode()); 11962 unsigned ccOutIdx; 11963 if (NewOpc) { 11964 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo(); 11965 MCID = &TII->get(NewOpc); 11966 11967 assert(MCID->getNumOperands() == 11968 MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize() 11969 && "converted opcode should be the same except for cc_out" 11970 " (and, on Thumb1, pred)"); 11971 11972 MI.setDesc(*MCID); 11973 11974 // Add the optional cc_out operand 11975 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true)); 11976 11977 // On Thumb1, move all input operands to the end, then add the predicate 11978 if (Subtarget->isThumb1Only()) { 11979 for (unsigned c = MCID->getNumOperands() - 4; c--;) { 11980 MI.addOperand(MI.getOperand(1)); 11981 MI.RemoveOperand(1); 11982 } 11983 11984 // Restore the ties 11985 for (unsigned i = MI.getNumOperands(); i--;) { 11986 const MachineOperand& op = MI.getOperand(i); 11987 if (op.isReg() && op.isUse()) { 11988 int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO); 11989 if (DefIdx != -1) 11990 MI.tieOperands(DefIdx, i); 11991 } 11992 } 11993 11994 MI.addOperand(MachineOperand::CreateImm(ARMCC::AL)); 11995 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false)); 11996 ccOutIdx = 1; 11997 } else 11998 ccOutIdx = MCID->getNumOperands() - 1; 11999 } else 12000 ccOutIdx = MCID->getNumOperands() - 1; 12001 12002 // Any ARM instruction that sets the 's' bit should specify an optional 12003 // "cc_out" operand in the last operand position. 12004 if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) { 12005 assert(!NewOpc && "Optional cc_out operand required"); 12006 return; 12007 } 12008 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it 12009 // since we already have an optional CPSR def. 12010 bool definesCPSR = false; 12011 bool deadCPSR = false; 12012 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e; 12013 ++i) { 12014 const MachineOperand &MO = MI.getOperand(i); 12015 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) { 12016 definesCPSR = true; 12017 if (MO.isDead()) 12018 deadCPSR = true; 12019 MI.RemoveOperand(i); 12020 break; 12021 } 12022 } 12023 if (!definesCPSR) { 12024 assert(!NewOpc && "Optional cc_out operand required"); 12025 return; 12026 } 12027 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag"); 12028 if (deadCPSR) { 12029 assert(!MI.getOperand(ccOutIdx).getReg() && 12030 "expect uninitialized optional cc_out operand"); 12031 // Thumb1 instructions must have the S bit even if the CPSR is dead. 12032 if (!Subtarget->isThumb1Only()) 12033 return; 12034 } 12035 12036 // If this instruction was defined with an optional CPSR def and its dag node 12037 // had a live implicit CPSR def, then activate the optional CPSR def. 12038 MachineOperand &MO = MI.getOperand(ccOutIdx); 12039 MO.setReg(ARM::CPSR); 12040 MO.setIsDef(true); 12041 } 12042 12043 //===----------------------------------------------------------------------===// 12044 // ARM Optimization Hooks 12045 //===----------------------------------------------------------------------===// 12046 12047 // Helper function that checks if N is a null or all ones constant. 12048 static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) { 12049 return AllOnes ? isAllOnesConstant(N) : isNullConstant(N); 12050 } 12051 12052 // Return true if N is conditionally 0 or all ones. 
12053 // Detects these expressions where cc is an i1 value: 12054 // 12055 // (select cc 0, y) [AllOnes=0] 12056 // (select cc y, 0) [AllOnes=0] 12057 // (zext cc) [AllOnes=0] 12058 // (sext cc) [AllOnes=0/1] 12059 // (select cc -1, y) [AllOnes=1] 12060 // (select cc y, -1) [AllOnes=1] 12061 // 12062 // Invert is set when N is the null/all ones constant when CC is false. 12063 // OtherOp is set to the alternative value of N. 12064 static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, 12065 SDValue &CC, bool &Invert, 12066 SDValue &OtherOp, 12067 SelectionDAG &DAG) { 12068 switch (N->getOpcode()) { 12069 default: return false; 12070 case ISD::SELECT: { 12071 CC = N->getOperand(0); 12072 SDValue N1 = N->getOperand(1); 12073 SDValue N2 = N->getOperand(2); 12074 if (isZeroOrAllOnes(N1, AllOnes)) { 12075 Invert = false; 12076 OtherOp = N2; 12077 return true; 12078 } 12079 if (isZeroOrAllOnes(N2, AllOnes)) { 12080 Invert = true; 12081 OtherOp = N1; 12082 return true; 12083 } 12084 return false; 12085 } 12086 case ISD::ZERO_EXTEND: 12087 // (zext cc) can never be the all ones value. 12088 if (AllOnes) 12089 return false; 12090 LLVM_FALLTHROUGH; 12091 case ISD::SIGN_EXTEND: { 12092 SDLoc dl(N); 12093 EVT VT = N->getValueType(0); 12094 CC = N->getOperand(0); 12095 if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC) 12096 return false; 12097 Invert = !AllOnes; 12098 if (AllOnes) 12099 // When looking for an AllOnes constant, N is an sext, and the 'other' 12100 // value is 0. 12101 OtherOp = DAG.getConstant(0, dl, VT); 12102 else if (N->getOpcode() == ISD::ZERO_EXTEND) 12103 // When looking for a 0 constant, N can be zext or sext. 12104 OtherOp = DAG.getConstant(1, dl, VT); 12105 else 12106 OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl, 12107 VT); 12108 return true; 12109 } 12110 } 12111 } 12112 12113 // Combine a constant select operand into its use: 12114 // 12115 // (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) 12116 // (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) 12117 // (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1] 12118 // (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) 12119 // (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) 12120 // 12121 // The transform is rejected if the select doesn't have a constant operand that 12122 // is null, or all ones when AllOnes is set. 12123 // 12124 // Also recognize sext/zext from i1: 12125 // 12126 // (add (zext cc), x) -> (select cc (add x, 1), x) 12127 // (add (sext cc), x) -> (select cc (add x, -1), x) 12128 // 12129 // These transformations eventually create predicated instructions. 12130 // 12131 // @param N The node to transform. 12132 // @param Slct The N operand that is a select. 12133 // @param OtherOp The other N operand (x above). 12134 // @param DCI Context. 12135 // @param AllOnes Require the select constant to be all ones instead of null. 12136 // @returns The new node, or SDValue() on failure. 12137 static 12138 SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, 12139 TargetLowering::DAGCombinerInfo &DCI, 12140 bool AllOnes = false) { 12141 SelectionDAG &DAG = DCI.DAG; 12142 EVT VT = N->getValueType(0); 12143 SDValue NonConstantVal; 12144 SDValue CCOp; 12145 bool SwapSelectOps; 12146 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps, 12147 NonConstantVal, DAG)) 12148 return SDValue(); 12149 12150 // Slct is now know to be the desired identity constant when CC is true. 
12151 SDValue TrueVal = OtherOp; 12152 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT, 12153 OtherOp, NonConstantVal); 12154 // Unless SwapSelectOps says CC should be false. 12155 if (SwapSelectOps) 12156 std::swap(TrueVal, FalseVal); 12157 12158 return DAG.getNode(ISD::SELECT, SDLoc(N), VT, 12159 CCOp, TrueVal, FalseVal); 12160 } 12161 12162 // Attempt combineSelectAndUse on each operand of a commutative operator N. 12163 static 12164 SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, 12165 TargetLowering::DAGCombinerInfo &DCI) { 12166 SDValue N0 = N->getOperand(0); 12167 SDValue N1 = N->getOperand(1); 12168 if (N0.getNode()->hasOneUse()) 12169 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes)) 12170 return Result; 12171 if (N1.getNode()->hasOneUse()) 12172 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes)) 12173 return Result; 12174 return SDValue(); 12175 } 12176 12177 static bool IsVUZPShuffleNode(SDNode *N) { 12178 // VUZP shuffle node. 12179 if (N->getOpcode() == ARMISD::VUZP) 12180 return true; 12181 12182 // "VUZP" on i32 is an alias for VTRN. 12183 if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32) 12184 return true; 12185 12186 return false; 12187 } 12188 12189 static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1, 12190 TargetLowering::DAGCombinerInfo &DCI, 12191 const ARMSubtarget *Subtarget) { 12192 // Look for ADD(VUZP.0, VUZP.1). 12193 if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() || 12194 N0 == N1) 12195 return SDValue(); 12196 12197 // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD. 12198 if (!N->getValueType(0).is64BitVector()) 12199 return SDValue(); 12200 12201 // Generate vpadd. 12202 SelectionDAG &DAG = DCI.DAG; 12203 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12204 SDLoc dl(N); 12205 SDNode *Unzip = N0.getNode(); 12206 EVT VT = N->getValueType(0); 12207 12208 SmallVector<SDValue, 8> Ops; 12209 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl, 12210 TLI.getPointerTy(DAG.getDataLayout()))); 12211 Ops.push_back(Unzip->getOperand(0)); 12212 Ops.push_back(Unzip->getOperand(1)); 12213 12214 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops); 12215 } 12216 12217 static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1, 12218 TargetLowering::DAGCombinerInfo &DCI, 12219 const ARMSubtarget *Subtarget) { 12220 // Check for two extended operands. 12221 if (!(N0.getOpcode() == ISD::SIGN_EXTEND && 12222 N1.getOpcode() == ISD::SIGN_EXTEND) && 12223 !(N0.getOpcode() == ISD::ZERO_EXTEND && 12224 N1.getOpcode() == ISD::ZERO_EXTEND)) 12225 return SDValue(); 12226 12227 SDValue N00 = N0.getOperand(0); 12228 SDValue N10 = N1.getOperand(0); 12229 12230 // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1)) 12231 if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() || 12232 N00 == N10) 12233 return SDValue(); 12234 12235 // We only recognize Q register paddl here; this can't be reached until 12236 // after type legalization. 12237 if (!N00.getValueType().is64BitVector() || 12238 !N0.getValueType().is128BitVector()) 12239 return SDValue(); 12240 12241 // Generate vpaddl. 12242 SelectionDAG &DAG = DCI.DAG; 12243 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12244 SDLoc dl(N); 12245 EVT VT = N->getValueType(0); 12246 12247 SmallVector<SDValue, 8> Ops; 12248 // Form vpaddl.sN or vpaddl.uN depending on the kind of extension. 
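// In other words (roughly): add(sext(VUZP(a,b).0), sext(VUZP(a,b).1)) becomes
// vpaddl.s on concat(a,b), since the unzipped even and odd lanes are exactly
// the pairs that vpaddl adds (and likewise for the zero-extended case).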
12249 unsigned Opcode;
12250 if (N0.getOpcode() == ISD::SIGN_EXTEND)
12251 Opcode = Intrinsic::arm_neon_vpaddls;
12252 else
12253 Opcode = Intrinsic::arm_neon_vpaddlu;
12254 Ops.push_back(DAG.getConstant(Opcode, dl,
12255 TLI.getPointerTy(DAG.getDataLayout())));
12256 EVT ElemTy = N00.getValueType().getVectorElementType();
12257 unsigned NumElts = VT.getVectorNumElements();
12258 EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
12259 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
12260 N00.getOperand(0), N00.getOperand(1));
12261 Ops.push_back(Concat);
12262
12263 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12264 }
12265
12266 // FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
12267 // an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
12268 // much easier to match.
12269 static SDValue
12270 AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1,
12271 TargetLowering::DAGCombinerInfo &DCI,
12272 const ARMSubtarget *Subtarget) {
12273 // Only perform the optimization after legalize and if NEON is available. We
12274 // also expect both operands to be BUILD_VECTORs.
12275 if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
12276 || N0.getOpcode() != ISD::BUILD_VECTOR
12277 || N1.getOpcode() != ISD::BUILD_VECTOR)
12278 return SDValue();
12279
12280 // Check output type since VPADDL operand elements can only be 8, 16, or 32.
12281 EVT VT = N->getValueType(0);
12282 if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
12283 return SDValue();
12284
12285 // Check that the vector operands are of the right form.
12286 // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
12287 // operands, where N is the size of the formed vector.
12288 // Each EXTRACT_VECTOR should have the same input vector and odd or even
12289 // index such that we have a pairwise add pattern.
12290
12291 // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
12292 if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12293 return SDValue();
12294 SDValue Vec = N0->getOperand(0)->getOperand(0);
12295 SDNode *V = Vec.getNode();
12296 unsigned nextIndex = 0;
12297
12298 // For each of the ADD's operands that are BUILD_VECTORs,
12299 // check to see if each of their operands is an EXTRACT_VECTOR with
12300 // the same vector and appropriate index.
12301 for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
12302 if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
12303 && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
12304
12305 SDValue ExtVec0 = N0->getOperand(i);
12306 SDValue ExtVec1 = N1->getOperand(i);
12307
12308 // First operand is the vector, verify it's the same.
12309 if (V != ExtVec0->getOperand(0).getNode() ||
12310 V != ExtVec1->getOperand(0).getNode())
12311 return SDValue();
12312
12313 // Second is the constant, verify it's correct.
12314 ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
12315 ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
12316
12317 // For the constant, we want to see all the even or all the odd.
12318 if (!C0 || !C1 || C0->getZExtValue() != nextIndex
12319 || C1->getZExtValue() != nextIndex+1)
12320 return SDValue();
12321
12322 // Increment index.
12323 nextIndex+=2;
12324 } else
12325 return SDValue();
12326 }
12327
12328 // Don't generate vpaddl+vmovn; we'll match it to vpadd later.
Also make sure 12329 // we're using the entire input vector, otherwise there's a size/legality 12330 // mismatch somewhere. 12331 if (nextIndex != Vec.getValueType().getVectorNumElements() || 12332 Vec.getValueType().getVectorElementType() == VT.getVectorElementType()) 12333 return SDValue(); 12334 12335 // Create VPADDL node. 12336 SelectionDAG &DAG = DCI.DAG; 12337 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12338 12339 SDLoc dl(N); 12340 12341 // Build operand list. 12342 SmallVector<SDValue, 8> Ops; 12343 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl, 12344 TLI.getPointerTy(DAG.getDataLayout()))); 12345 12346 // Input is the vector. 12347 Ops.push_back(Vec); 12348 12349 // Get widened type and narrowed type. 12350 MVT widenType; 12351 unsigned numElem = VT.getVectorNumElements(); 12352 12353 EVT inputLaneType = Vec.getValueType().getVectorElementType(); 12354 switch (inputLaneType.getSimpleVT().SimpleTy) { 12355 case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break; 12356 case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break; 12357 case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break; 12358 default: 12359 llvm_unreachable("Invalid vector element type for padd optimization."); 12360 } 12361 12362 SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops); 12363 unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE; 12364 return DAG.getNode(ExtOp, dl, VT, tmp); 12365 } 12366 12367 static SDValue findMUL_LOHI(SDValue V) { 12368 if (V->getOpcode() == ISD::UMUL_LOHI || 12369 V->getOpcode() == ISD::SMUL_LOHI) 12370 return V; 12371 return SDValue(); 12372 } 12373 12374 static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, 12375 TargetLowering::DAGCombinerInfo &DCI, 12376 const ARMSubtarget *Subtarget) { 12377 if (!Subtarget->hasBaseDSP()) 12378 return SDValue(); 12379 12380 // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and 12381 // accumulates the product into a 64-bit value. 
The 16-bit values will 12382 // be sign extended somehow or SRA'd into 32-bit values 12383 // (addc (adde (mul 16bit, 16bit), lo), hi) 12384 SDValue Mul = AddcNode->getOperand(0); 12385 SDValue Lo = AddcNode->getOperand(1); 12386 if (Mul.getOpcode() != ISD::MUL) { 12387 Lo = AddcNode->getOperand(0); 12388 Mul = AddcNode->getOperand(1); 12389 if (Mul.getOpcode() != ISD::MUL) 12390 return SDValue(); 12391 } 12392 12393 SDValue SRA = AddeNode->getOperand(0); 12394 SDValue Hi = AddeNode->getOperand(1); 12395 if (SRA.getOpcode() != ISD::SRA) { 12396 SRA = AddeNode->getOperand(1); 12397 Hi = AddeNode->getOperand(0); 12398 if (SRA.getOpcode() != ISD::SRA) 12399 return SDValue(); 12400 } 12401 if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) { 12402 if (Const->getZExtValue() != 31) 12403 return SDValue(); 12404 } else 12405 return SDValue(); 12406 12407 if (SRA.getOperand(0) != Mul) 12408 return SDValue(); 12409 12410 SelectionDAG &DAG = DCI.DAG; 12411 SDLoc dl(AddcNode); 12412 unsigned Opcode = 0; 12413 SDValue Op0; 12414 SDValue Op1; 12415 12416 if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) { 12417 Opcode = ARMISD::SMLALBB; 12418 Op0 = Mul.getOperand(0); 12419 Op1 = Mul.getOperand(1); 12420 } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) { 12421 Opcode = ARMISD::SMLALBT; 12422 Op0 = Mul.getOperand(0); 12423 Op1 = Mul.getOperand(1).getOperand(0); 12424 } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) { 12425 Opcode = ARMISD::SMLALTB; 12426 Op0 = Mul.getOperand(0).getOperand(0); 12427 Op1 = Mul.getOperand(1); 12428 } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) { 12429 Opcode = ARMISD::SMLALTT; 12430 Op0 = Mul->getOperand(0).getOperand(0); 12431 Op1 = Mul->getOperand(1).getOperand(0); 12432 } 12433 12434 if (!Op0 || !Op1) 12435 return SDValue(); 12436 12437 SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32), 12438 Op0, Op1, Lo, Hi); 12439 // Replace the ADDs' nodes uses by the MLA node's values. 12440 SDValue HiMLALResult(SMLAL.getNode(), 1); 12441 SDValue LoMLALResult(SMLAL.getNode(), 0); 12442 12443 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult); 12444 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult); 12445 12446 // Return original node to notify the driver to stop replacing. 12447 SDValue resNode(AddcNode, 0); 12448 return resNode; 12449 } 12450 12451 static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode, 12452 TargetLowering::DAGCombinerInfo &DCI, 12453 const ARMSubtarget *Subtarget) { 12454 // Look for multiply add opportunities. 12455 // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where 12456 // each add nodes consumes a value from ISD::UMUL_LOHI and there is 12457 // a glue link from the first add to the second add. 12458 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by 12459 // a S/UMLAL instruction. 12460 // UMUL_LOHI 12461 // / :lo \ :hi 12462 // V \ [no multiline comment] 12463 // loAdd -> ADDC | 12464 // \ :carry / 12465 // V V 12466 // ADDE <- hiAdd 12467 // 12468 // In the special case where only the higher part of a signed result is used 12469 // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts 12470 // a constant with the exact value of 0x80000000, we recognize we are dealing 12471 // with a "rounded multiply and add" (or subtract) and transform it into 12472 // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively. 
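// For example (roughly), the rounded-multiply case looks like:
//   SMUL_LOHI(a, b)      -> {lo, hi}
//   ADDC(lo, 0x80000000) -> {_, carry}   (or SUBC for the subtracting form)
//   ADDE(hi, c, carry)   -> result       (the ADDE's carry-out is unused)
// and is turned into SMMLAR(a, b, c) (or SMMLSR).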
12473
12474 assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
12475 AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
12476 "Expect an ADDE or SUBE");
12477
12478 assert(AddeSubeNode->getNumOperands() == 3 &&
12479 AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
12480 "ADDE node has the wrong inputs");
12481
12482 // Check that we are chained to the right ADDC or SUBC node.
12483 SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
12484 if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12485 AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
12486 (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
12487 AddcSubcNode->getOpcode() != ARMISD::SUBC))
12488 return SDValue();
12489
12490 SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
12491 SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
12492
12493 // Check if the two operands are from the same mul_lohi node.
12494 if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
12495 return SDValue();
12496
12497 assert(AddcSubcNode->getNumValues() == 2 &&
12498 AddcSubcNode->getValueType(0) == MVT::i32 &&
12499 "Expect ADDC with two result values. First: i32");
12500
12501 // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
12502 // may be an SMLAL which multiplies two 16-bit values.
12503 if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12504 AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
12505 AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
12506 AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
12507 AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
12508 return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);
12509
12510 // Check for the triangle shape.
12511 SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
12512 SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
12513
12514 // Make sure that the ADDE/SUBE operands are not coming from the same node.
12515 if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
12516 return SDValue();
12517
12518 // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
12519 bool IsLeftOperandMUL = false;
12520 SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
12521 if (MULOp == SDValue())
12522 MULOp = findMUL_LOHI(AddeSubeOp1);
12523 else
12524 IsLeftOperandMUL = true;
12525 if (MULOp == SDValue())
12526 return SDValue();
12527
12528 // Figure out the right opcode.
12529 unsigned Opc = MULOp->getOpcode();
12530 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
12531
12532 // Figure out the high and low input values to the MLAL node.
12533 SDValue *HiAddSub = nullptr;
12534 SDValue *LoMul = nullptr;
12535 SDValue *LowAddSub = nullptr;
12536
12537 // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
12538 if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
12539 return SDValue();
12540
12541 if (IsLeftOperandMUL)
12542 HiAddSub = &AddeSubeOp1;
12543 else
12544 HiAddSub = &AddeSubeOp0;
12545
12546 // Ensure that LoMul and LowAddSub are taken from the correct ISD::SMUL_LOHI node
12547 // whose low result is fed to the ADDC/SUBC we are checking.
12548
12549 if (AddcSubcOp0 == MULOp.getValue(0)) {
12550 LoMul = &AddcSubcOp0;
12551 LowAddSub = &AddcSubcOp1;
12552 }
12553 if (AddcSubcOp1 == MULOp.getValue(0)) {
12554 LoMul = &AddcSubcOp1;
12555 LowAddSub = &AddcSubcOp0;
12556 }
12557
12558 if (!LoMul)
12559 return SDValue();
12560
12561 // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC,
12562 // the replacement below will create a cycle.
12563 if (AddcSubcNode == HiAddSub->getNode() ||
12564 AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
12565 return SDValue();
12566
12567 // Create the merged node.
12568 SelectionDAG &DAG = DCI.DAG;
12569
12570 // Start building operand list.
12571 SmallVector<SDValue, 8> Ops;
12572 Ops.push_back(LoMul->getOperand(0));
12573 Ops.push_back(LoMul->getOperand(1));
12574
12575 // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
12576 // the case, we must be doing signed multiplication and only use the higher
12577 // part of the result of the MLAL; furthermore, the LowAddSub must be a constant
12578 // addition or subtraction with the value 0x80000000.
12579 if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
12580 FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
12581 LowAddSub->getNode()->getOpcode() == ISD::Constant &&
12582 static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
12583 0x80000000) {
12584 Ops.push_back(*HiAddSub);
12585 if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
12586 FinalOpc = ARMISD::SMMLSR;
12587 } else {
12588 FinalOpc = ARMISD::SMMLAR;
12589 }
12590 SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
12591 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
12592
12593 return SDValue(AddeSubeNode, 0);
12594 } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
12595 // SMMLS is generated during instruction selection and the rest of this
12596 // function cannot handle the case where AddcSubcNode is a SUBC.
12597 return SDValue();
12598
12599 // Finish building the operand list for {U/S}MLAL
12600 Ops.push_back(*LowAddSub);
12601 Ops.push_back(*HiAddSub);
12602
12603 SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
12604 DAG.getVTList(MVT::i32, MVT::i32), Ops);
12605
12606 // Replace the ADD nodes' uses by the MLAL node's values.
12607 SDValue HiMLALResult(MLALNode.getNode(), 1);
12608 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
12609
12610 SDValue LoMLALResult(MLALNode.getNode(), 0);
12611 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
12612
12613 // Return original node to notify the driver to stop replacing.
12614 return SDValue(AddeSubeNode, 0);
12615 }
12616
12617 static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
12618 TargetLowering::DAGCombinerInfo &DCI,
12619 const ARMSubtarget *Subtarget) {
12620 // UMAAL is similar to UMLAL except that it adds two unsigned values.
12621 // While trying to combine for the other MLAL nodes, first search for the
12622 // chance to use UMAAL. Check if Addc uses a node which has already
12623 // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
12624 // as the addend, and it's handled in PerformUMLALCombine.
12625
12626 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
12627 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
12628
12629 // Check that we have a glued ADDC node.
12630 SDNode* AddcNode = AddeNode->getOperand(2).getNode();
12631 if (AddcNode->getOpcode() != ARMISD::ADDC)
12632 return SDValue();
12633
12634 // Find the converted UMAAL or quit if it doesn't exist.
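// Roughly, the shape being matched here is:
//   UMLAL(a, b, lo, 0)       -> {umlal_lo, umlal_hi}
//   ADDC(umlal_lo, hi)       -> {res_lo, carry}   (operands may be in either order)
//   ADDE(umlal_hi, 0, carry) -> res_hi            (operands may be in either order)
// which is folded into UMAAL(a, b, lo, hi).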
12635 SDNode *UmlalNode = nullptr; 12636 SDValue AddHi; 12637 if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) { 12638 UmlalNode = AddcNode->getOperand(0).getNode(); 12639 AddHi = AddcNode->getOperand(1); 12640 } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) { 12641 UmlalNode = AddcNode->getOperand(1).getNode(); 12642 AddHi = AddcNode->getOperand(0); 12643 } else { 12644 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget); 12645 } 12646 12647 // The ADDC should be glued to an ADDE node, which uses the same UMLAL as 12648 // the ADDC as well as Zero. 12649 if (!isNullConstant(UmlalNode->getOperand(3))) 12650 return SDValue(); 12651 12652 if ((isNullConstant(AddeNode->getOperand(0)) && 12653 AddeNode->getOperand(1).getNode() == UmlalNode) || 12654 (AddeNode->getOperand(0).getNode() == UmlalNode && 12655 isNullConstant(AddeNode->getOperand(1)))) { 12656 SelectionDAG &DAG = DCI.DAG; 12657 SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1), 12658 UmlalNode->getOperand(2), AddHi }; 12659 SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode), 12660 DAG.getVTList(MVT::i32, MVT::i32), Ops); 12661 12662 // Replace the ADDs' nodes uses by the UMAAL node's values. 12663 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1)); 12664 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0)); 12665 12666 // Return original node to notify the driver to stop replacing. 12667 return SDValue(AddeNode, 0); 12668 } 12669 return SDValue(); 12670 } 12671 12672 static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG, 12673 const ARMSubtarget *Subtarget) { 12674 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP()) 12675 return SDValue(); 12676 12677 // Check that we have a pair of ADDC and ADDE as operands. 12678 // Both addends of the ADDE must be zero. 12679 SDNode* AddcNode = N->getOperand(2).getNode(); 12680 SDNode* AddeNode = N->getOperand(3).getNode(); 12681 if ((AddcNode->getOpcode() == ARMISD::ADDC) && 12682 (AddeNode->getOpcode() == ARMISD::ADDE) && 12683 isNullConstant(AddeNode->getOperand(0)) && 12684 isNullConstant(AddeNode->getOperand(1)) && 12685 (AddeNode->getOperand(2).getNode() == AddcNode)) 12686 return DAG.getNode(ARMISD::UMAAL, SDLoc(N), 12687 DAG.getVTList(MVT::i32, MVT::i32), 12688 {N->getOperand(0), N->getOperand(1), 12689 AddcNode->getOperand(0), AddcNode->getOperand(1)}); 12690 else 12691 return SDValue(); 12692 } 12693 12694 static SDValue PerformAddcSubcCombine(SDNode *N, 12695 TargetLowering::DAGCombinerInfo &DCI, 12696 const ARMSubtarget *Subtarget) { 12697 SelectionDAG &DAG(DCI.DAG); 12698 12699 if (N->getOpcode() == ARMISD::SUBC) { 12700 // (SUBC (ADDE 0, 0, C), 1) -> C 12701 SDValue LHS = N->getOperand(0); 12702 SDValue RHS = N->getOperand(1); 12703 if (LHS->getOpcode() == ARMISD::ADDE && 12704 isNullConstant(LHS->getOperand(0)) && 12705 isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) { 12706 return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2)); 12707 } 12708 } 12709 12710 if (Subtarget->isThumb1Only()) { 12711 SDValue RHS = N->getOperand(1); 12712 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) { 12713 int32_t imm = C->getSExtValue(); 12714 if (imm < 0 && imm > std::numeric_limits<int>::min()) { 12715 SDLoc DL(N); 12716 RHS = DAG.getConstant(-imm, DL, MVT::i32); 12717 unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? 
ARMISD::SUBC 12718 : ARMISD::ADDC; 12719 return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS); 12720 } 12721 } 12722 } 12723 12724 return SDValue(); 12725 } 12726 12727 static SDValue PerformAddeSubeCombine(SDNode *N, 12728 TargetLowering::DAGCombinerInfo &DCI, 12729 const ARMSubtarget *Subtarget) { 12730 if (Subtarget->isThumb1Only()) { 12731 SelectionDAG &DAG = DCI.DAG; 12732 SDValue RHS = N->getOperand(1); 12733 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) { 12734 int64_t imm = C->getSExtValue(); 12735 if (imm < 0) { 12736 SDLoc DL(N); 12737 12738 // The with-carry-in form matches bitwise not instead of the negation. 12739 // Effectively, the inverse interpretation of the carry flag already 12740 // accounts for part of the negation. 12741 RHS = DAG.getConstant(~imm, DL, MVT::i32); 12742 12743 unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE 12744 : ARMISD::ADDE; 12745 return DAG.getNode(Opcode, DL, N->getVTList(), 12746 N->getOperand(0), RHS, N->getOperand(2)); 12747 } 12748 } 12749 } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) { 12750 return AddCombineTo64bitMLAL(N, DCI, Subtarget); 12751 } 12752 return SDValue(); 12753 } 12754 12755 static SDValue PerformSELECTCombine(SDNode *N, 12756 TargetLowering::DAGCombinerInfo &DCI, 12757 const ARMSubtarget *Subtarget) { 12758 if (!Subtarget->hasMVEIntegerOps()) 12759 return SDValue(); 12760 12761 SDLoc dl(N); 12762 SDValue SetCC; 12763 SDValue LHS; 12764 SDValue RHS; 12765 ISD::CondCode CC; 12766 SDValue TrueVal; 12767 SDValue FalseVal; 12768 12769 if (N->getOpcode() == ISD::SELECT && 12770 N->getOperand(0)->getOpcode() == ISD::SETCC) { 12771 SetCC = N->getOperand(0); 12772 LHS = SetCC->getOperand(0); 12773 RHS = SetCC->getOperand(1); 12774 CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get(); 12775 TrueVal = N->getOperand(1); 12776 FalseVal = N->getOperand(2); 12777 } else if (N->getOpcode() == ISD::SELECT_CC) { 12778 LHS = N->getOperand(0); 12779 RHS = N->getOperand(1); 12780 CC = cast<CondCodeSDNode>(N->getOperand(4))->get(); 12781 TrueVal = N->getOperand(2); 12782 FalseVal = N->getOperand(3); 12783 } else { 12784 return SDValue(); 12785 } 12786 12787 unsigned int Opcode = 0; 12788 if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN || 12789 FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) && 12790 (CC == ISD::SETULT || CC == ISD::SETUGT)) { 12791 Opcode = ARMISD::VMINVu; 12792 if (CC == ISD::SETUGT) 12793 std::swap(TrueVal, FalseVal); 12794 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN || 12795 FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) && 12796 (CC == ISD::SETLT || CC == ISD::SETGT)) { 12797 Opcode = ARMISD::VMINVs; 12798 if (CC == ISD::SETGT) 12799 std::swap(TrueVal, FalseVal); 12800 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX || 12801 FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) && 12802 (CC == ISD::SETUGT || CC == ISD::SETULT)) { 12803 Opcode = ARMISD::VMAXVu; 12804 if (CC == ISD::SETULT) 12805 std::swap(TrueVal, FalseVal); 12806 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX || 12807 FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) && 12808 (CC == ISD::SETGT || CC == ISD::SETLT)) { 12809 Opcode = ARMISD::VMAXVs; 12810 if (CC == ISD::SETLT) 12811 std::swap(TrueVal, FalseVal); 12812 } else 12813 return SDValue(); 12814 12815 // Normalise to the right hand side being the vector reduction 12816 switch (TrueVal->getOpcode()) { 12817 case ISD::VECREDUCE_UMIN: 12818 case ISD::VECREDUCE_SMIN: 12819 case ISD::VECREDUCE_UMAX: 12820 case ISD::VECREDUCE_SMAX: 
12821 std::swap(LHS, RHS);
12822 std::swap(TrueVal, FalseVal);
12823 break;
12824 }
12825
12826 EVT VectorType = FalseVal->getOperand(0).getValueType();
12827
12828 if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 &&
12829 VectorType != MVT::v4i32)
12830 return SDValue();
12831
12832 EVT VectorScalarType = VectorType.getVectorElementType();
12833
12834 // The values being selected must also be the ones being compared
12835 if (TrueVal != LHS || FalseVal != RHS)
12836 return SDValue();
12837
12838 EVT LeftType = LHS->getValueType(0);
12839 EVT RightType = RHS->getValueType(0);
12840
12841 // The types must match the reduced type too
12842 if (LeftType != VectorScalarType || RightType != VectorScalarType)
12843 return SDValue();
12844
12845 // Legalise the scalar to an i32
12846 if (VectorScalarType != MVT::i32)
12847 LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
12848
12849 // Generate the reduction as an i32 for legalisation purposes
12850 auto Reduction =
12851 DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0));
12852
12853 // The result isn't actually an i32 so truncate it back to its original type
12854 if (VectorScalarType != MVT::i32)
12855 Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction);
12856
12857 return Reduction;
12858 }
12859
12860 // A special combine for the vqdmulh family of instructions. This is one of the
12861 // potential set of patterns that could match this instruction. The base pattern
12862 // you would expect is min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))).
12863 // This matches the different form min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))),
12864 // which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15)) as
12865 // the max is unnecessary.
12866 static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
12867 EVT VT = N->getValueType(0);
12868 SDValue Shft;
12869 ConstantSDNode *Clamp;
12870
12871 if (N->getOpcode() == ISD::SMIN) {
12872 Shft = N->getOperand(0);
12873 Clamp = isConstOrConstSplat(N->getOperand(1));
12874 } else if (N->getOpcode() == ISD::VSELECT) {
12875 // Detect an SMIN, which for an i64 node will be a vselect/setcc, not a smin.
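// i.e. treat vselect(setcc(a, b, setlt), a, b) as the equivalent of smin(a, b).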
12876 SDValue Cmp = N->getOperand(0); 12877 if (Cmp.getOpcode() != ISD::SETCC || 12878 cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT || 12879 Cmp.getOperand(0) != N->getOperand(1) || 12880 Cmp.getOperand(1) != N->getOperand(2)) 12881 return SDValue(); 12882 Shft = N->getOperand(1); 12883 Clamp = isConstOrConstSplat(N->getOperand(2)); 12884 } else 12885 return SDValue(); 12886 12887 if (!Clamp) 12888 return SDValue(); 12889 12890 MVT ScalarType; 12891 int ShftAmt = 0; 12892 switch (Clamp->getSExtValue()) { 12893 case (1 << 7) - 1: 12894 ScalarType = MVT::i8; 12895 ShftAmt = 7; 12896 break; 12897 case (1 << 15) - 1: 12898 ScalarType = MVT::i16; 12899 ShftAmt = 15; 12900 break; 12901 case (1ULL << 31) - 1: 12902 ScalarType = MVT::i32; 12903 ShftAmt = 31; 12904 break; 12905 default: 12906 return SDValue(); 12907 } 12908 12909 if (Shft.getOpcode() != ISD::SRA) 12910 return SDValue(); 12911 ConstantSDNode *N1 = isConstOrConstSplat(Shft.getOperand(1)); 12912 if (!N1 || N1->getSExtValue() != ShftAmt) 12913 return SDValue(); 12914 12915 SDValue Mul = Shft.getOperand(0); 12916 if (Mul.getOpcode() != ISD::MUL) 12917 return SDValue(); 12918 12919 SDValue Ext0 = Mul.getOperand(0); 12920 SDValue Ext1 = Mul.getOperand(1); 12921 if (Ext0.getOpcode() != ISD::SIGN_EXTEND || 12922 Ext1.getOpcode() != ISD::SIGN_EXTEND) 12923 return SDValue(); 12924 EVT VecVT = Ext0.getOperand(0).getValueType(); 12925 if (!VecVT.isPow2VectorType() || VecVT.getVectorNumElements() == 1) 12926 return SDValue(); 12927 if (Ext1.getOperand(0).getValueType() != VecVT || 12928 VecVT.getScalarType() != ScalarType || 12929 VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2) 12930 return SDValue(); 12931 12932 SDLoc DL(Mul); 12933 unsigned LegalLanes = 128 / (ShftAmt + 1); 12934 EVT LegalVecVT = MVT::getVectorVT(ScalarType, LegalLanes); 12935 // For types smaller than legal vectors extend to be legal and only use needed 12936 // lanes. 12937 if (VecVT.getSizeInBits() < 128) { 12938 EVT ExtVecVT = 12939 MVT::getVectorVT(MVT::getIntegerVT(128 / VecVT.getVectorNumElements()), 12940 VecVT.getVectorNumElements()); 12941 SDValue Inp0 = 12942 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext0.getOperand(0)); 12943 SDValue Inp1 = 12944 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext1.getOperand(0)); 12945 Inp0 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp0); 12946 Inp1 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp1); 12947 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1); 12948 SDValue Trunc = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, ExtVecVT, VQDMULH); 12949 Trunc = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Trunc); 12950 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Trunc); 12951 } 12952 12953 // For larger types, split into legal sized chunks. 
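// e.g. a v16i16 input is handled as two v8i16 VQDMULH operations on its low
// and high halves, which are concatenated and sign-extended back to the result type.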
12954 assert(VecVT.getSizeInBits() % 128 == 0 && "Expected a power2 type"); 12955 unsigned NumParts = VecVT.getSizeInBits() / 128; 12956 SmallVector<SDValue> Parts; 12957 for (unsigned I = 0; I < NumParts; ++I) { 12958 SDValue Inp0 = 12959 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext0.getOperand(0), 12960 DAG.getVectorIdxConstant(I * LegalLanes, DL)); 12961 SDValue Inp1 = 12962 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext1.getOperand(0), 12963 DAG.getVectorIdxConstant(I * LegalLanes, DL)); 12964 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1); 12965 Parts.push_back(VQDMULH); 12966 } 12967 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, 12968 DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts)); 12969 } 12970 12971 static SDValue PerformVSELECTCombine(SDNode *N, 12972 TargetLowering::DAGCombinerInfo &DCI, 12973 const ARMSubtarget *Subtarget) { 12974 if (!Subtarget->hasMVEIntegerOps()) 12975 return SDValue(); 12976 12977 if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG)) 12978 return V; 12979 12980 // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs). 12981 // 12982 // We need to re-implement this optimization here as the implementation in the 12983 // Target-Independent DAGCombiner does not handle the kind of constant we make 12984 // (it calls isConstOrConstSplat with AllowTruncation set to false - and for 12985 // good reason, allowing truncation there would break other targets). 12986 // 12987 // Currently, this is only done for MVE, as it's the only target that benefits 12988 // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL). 12989 if (N->getOperand(0).getOpcode() != ISD::XOR) 12990 return SDValue(); 12991 SDValue XOR = N->getOperand(0); 12992 12993 // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s. 12994 // It is important to check with truncation allowed as the BUILD_VECTORs we 12995 // generate in those situations will truncate their operands. 12996 ConstantSDNode *Const = 12997 isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false, 12998 /*AllowTruncation*/ true); 12999 if (!Const || !Const->isOne()) 13000 return SDValue(); 13001 13002 // Rewrite into vselect(cond, rhs, lhs). 13003 SDValue Cond = XOR->getOperand(0); 13004 SDValue LHS = N->getOperand(1); 13005 SDValue RHS = N->getOperand(2); 13006 EVT Type = N->getValueType(0); 13007 return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS); 13008 } 13009 13010 static SDValue PerformABSCombine(SDNode *N, 13011 TargetLowering::DAGCombinerInfo &DCI, 13012 const ARMSubtarget *Subtarget) { 13013 SDValue res; 13014 SelectionDAG &DAG = DCI.DAG; 13015 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13016 13017 if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0))) 13018 return SDValue(); 13019 13020 if (!TLI.expandABS(N, res, DAG)) 13021 return SDValue(); 13022 13023 return res; 13024 } 13025 13026 /// PerformADDECombine - Target-specific dag combine transform from 13027 /// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or 13028 /// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL 13029 static SDValue PerformADDECombine(SDNode *N, 13030 TargetLowering::DAGCombinerInfo &DCI, 13031 const ARMSubtarget *Subtarget) { 13032 // Only ARM and Thumb2 support UMLAL/SMLAL. 13033 if (Subtarget->isThumb1Only()) 13034 return PerformAddeSubeCombine(N, DCI, Subtarget); 13035 13036 // Only perform the checks after legalize when the pattern is available. 
13037 if (DCI.isBeforeLegalize()) return SDValue(); 13038 13039 return AddCombineTo64bitUMAAL(N, DCI, Subtarget); 13040 } 13041 13042 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with 13043 /// operands N0 and N1. This is a helper for PerformADDCombine that is 13044 /// called with the default operands, and if that fails, with commuted 13045 /// operands. 13046 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, 13047 TargetLowering::DAGCombinerInfo &DCI, 13048 const ARMSubtarget *Subtarget){ 13049 // Attempt to create vpadd for this add. 13050 if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget)) 13051 return Result; 13052 13053 // Attempt to create vpaddl for this add. 13054 if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget)) 13055 return Result; 13056 if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI, 13057 Subtarget)) 13058 return Result; 13059 13060 // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) 13061 if (N0.getNode()->hasOneUse()) 13062 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI)) 13063 return Result; 13064 return SDValue(); 13065 } 13066 13067 static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG, 13068 const ARMSubtarget *Subtarget) { 13069 if (!Subtarget->hasMVEIntegerOps() || N->getValueType(0) != MVT::i64) 13070 return SDValue(); 13071 13072 SDValue N0 = N->getOperand(0); 13073 SDValue N1 = N->getOperand(1); 13074 13075 // We are looking for an i64 add of a VADDLVx. Due to these being i64's, this 13076 // will look like: 13077 // t1: i32,i32 = ARMISD::VADDLVs x 13078 // t2: i64 = build_pair t1, t1:1 13079 // t3: i64 = add t2, y 13080 // Otherwise we try to push the add up above VADDLVAx, to potentially allow 13081 // the add to be simplified separately. 13082 // We also need to check for sext / zext and commutative adds. 13083 auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA, 13084 SDValue NB) { 13085 if (NB->getOpcode() != ISD::BUILD_PAIR) 13086 return SDValue(); 13087 SDValue VecRed = NB->getOperand(0); 13088 if ((VecRed->getOpcode() != Opcode && VecRed->getOpcode() != OpcodeA) || 13089 VecRed.getResNo() != 0 || 13090 NB->getOperand(1) != SDValue(VecRed.getNode(), 1)) 13091 return SDValue(); 13092 13093 SDLoc dl(N); 13094 if (VecRed->getOpcode() == OpcodeA) { 13095 // add(NA, VADDLVA(Inp, Y)) -> VADDLVA(add(NA, Inp), Y) 13096 SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, 13097 VecRed.getOperand(0), VecRed.getOperand(1)); 13098 NA = DAG.getNode(ISD::ADD, dl, MVT::i64, Inp, NA); 13099 } 13100 13101 SmallVector<SDValue, 4> Ops; 13102 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA, 13103 DAG.getConstant(0, dl, MVT::i32))); 13104 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA, 13105 DAG.getConstant(1, dl, MVT::i32))); 13106 unsigned S = VecRed->getOpcode() == OpcodeA ?
2 : 0; 13107 for (unsigned I = S, E = VecRed.getNumOperands(); I < E; I++) 13108 Ops.push_back(VecRed->getOperand(I)); 13109 SDValue Red = 13110 DAG.getNode(OpcodeA, dl, DAG.getVTList({MVT::i32, MVT::i32}), Ops); 13111 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red, 13112 SDValue(Red.getNode(), 1)); 13113 }; 13114 13115 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1)) 13116 return M; 13117 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1)) 13118 return M; 13119 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0)) 13120 return M; 13121 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0)) 13122 return M; 13123 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1)) 13124 return M; 13125 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1)) 13126 return M; 13127 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0)) 13128 return M; 13129 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0)) 13130 return M; 13131 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1)) 13132 return M; 13133 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1)) 13134 return M; 13135 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0)) 13136 return M; 13137 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0)) 13138 return M; 13139 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1)) 13140 return M; 13141 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1)) 13142 return M; 13143 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0)) 13144 return M; 13145 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0)) 13146 return M; 13147 return SDValue(); 13148 } 13149 13150 bool 13151 ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N, 13152 CombineLevel Level) const { 13153 if (Level == BeforeLegalizeTypes) 13154 return true; 13155 13156 if (N->getOpcode() != ISD::SHL) 13157 return true; 13158 13159 if (Subtarget->isThumb1Only()) { 13160 // Avoid making expensive immediates by commuting shifts. (This logic 13161 // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted 13162 // for free.) 13163 if (N->getOpcode() != ISD::SHL) 13164 return true; 13165 SDValue N1 = N->getOperand(0); 13166 if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND && 13167 N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR) 13168 return true; 13169 if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) { 13170 if (Const->getAPIntValue().ult(256)) 13171 return false; 13172 if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) && 13173 Const->getAPIntValue().sgt(-256)) 13174 return false; 13175 } 13176 return true; 13177 } 13178 13179 // Turn off commute-with-shift transform after legalization, so it doesn't 13180 // conflict with PerformSHLSimplify. (We could try to detect when 13181 // PerformSHLSimplify would trigger more precisely, but it isn't 13182 // really necessary.) 
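// For example, (shl (add x, C1), C2) is left alone here; PerformSHLSimplify decides whether unfolding the shifted constant back out is worthwhile.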
13183 return false; 13184 } 13185 13186 bool ARMTargetLowering::shouldFoldConstantShiftPairToMask( 13187 const SDNode *N, CombineLevel Level) const { 13188 if (!Subtarget->isThumb1Only()) 13189 return true; 13190 13191 if (Level == BeforeLegalizeTypes) 13192 return true; 13193 13194 return false; 13195 } 13196 13197 bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const { 13198 if (!Subtarget->hasNEON()) { 13199 if (Subtarget->isThumb1Only()) 13200 return VT.getScalarSizeInBits() <= 32; 13201 return true; 13202 } 13203 return VT.isScalarInteger(); 13204 } 13205 13206 static SDValue PerformSHLSimplify(SDNode *N, 13207 TargetLowering::DAGCombinerInfo &DCI, 13208 const ARMSubtarget *ST) { 13209 // Allow the generic combiner to identify potential bswaps. 13210 if (DCI.isBeforeLegalize()) 13211 return SDValue(); 13212 13213 // DAG combiner will fold: 13214 // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2) 13215 // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2 13216 // Other code patterns that can be also be modified have the following form: 13217 // b + ((a << 1) | 510) 13218 // b + ((a << 1) & 510) 13219 // b + ((a << 1) ^ 510) 13220 // b + ((a << 1) + 510) 13221 13222 // Many instructions can perform the shift for free, but it requires both 13223 // the operands to be registers. If c1 << c2 is too large, a mov immediate 13224 // instruction will needed. So, unfold back to the original pattern if: 13225 // - if c1 and c2 are small enough that they don't require mov imms. 13226 // - the user(s) of the node can perform an shl 13227 13228 // No shifted operands for 16-bit instructions. 13229 if (ST->isThumb() && ST->isThumb1Only()) 13230 return SDValue(); 13231 13232 // Check that all the users could perform the shl themselves. 13233 for (auto U : N->uses()) { 13234 switch(U->getOpcode()) { 13235 default: 13236 return SDValue(); 13237 case ISD::SUB: 13238 case ISD::ADD: 13239 case ISD::AND: 13240 case ISD::OR: 13241 case ISD::XOR: 13242 case ISD::SETCC: 13243 case ARMISD::CMP: 13244 // Check that the user isn't already using a constant because there 13245 // aren't any instructions that support an immediate operand and a 13246 // shifted operand. 13247 if (isa<ConstantSDNode>(U->getOperand(0)) || 13248 isa<ConstantSDNode>(U->getOperand(1))) 13249 return SDValue(); 13250 13251 // Check that it's not already using a shift. 13252 if (U->getOperand(0).getOpcode() == ISD::SHL || 13253 U->getOperand(1).getOpcode() == ISD::SHL) 13254 return SDValue(); 13255 break; 13256 } 13257 } 13258 13259 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR && 13260 N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND) 13261 return SDValue(); 13262 13263 if (N->getOperand(0).getOpcode() != ISD::SHL) 13264 return SDValue(); 13265 13266 SDValue SHL = N->getOperand(0); 13267 13268 auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1)); 13269 auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1)); 13270 if (!C1ShlC2 || !C2) 13271 return SDValue(); 13272 13273 APInt C2Int = C2->getAPIntValue(); 13274 APInt C1Int = C1ShlC2->getAPIntValue(); 13275 13276 // Check that performing a lshr will not lose any information. 13277 APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(), 13278 C2Int.getBitWidth() - C2->getZExtValue()); 13279 if ((C1Int & Mask) != C1Int) 13280 return SDValue(); 13281 13282 // Shift the first constant. 13283 C1Int.lshrInPlace(C2Int); 13284 13285 // The immediates are encoded as an 8-bit value that can be rotated. 
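// For example, 0x101 spans 9 significant bits, so it cannot be encoded as a modified immediate and would need a separate mov; treat such values as large.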
13286 auto LargeImm = [](const APInt &Imm) { 13287 unsigned Zeros = Imm.countLeadingZeros() + Imm.countTrailingZeros(); 13288 return Imm.getBitWidth() - Zeros > 8; 13289 }; 13290 13291 if (LargeImm(C1Int) || LargeImm(C2Int)) 13292 return SDValue(); 13293 13294 SelectionDAG &DAG = DCI.DAG; 13295 SDLoc dl(N); 13296 SDValue X = SHL.getOperand(0); 13297 SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X, 13298 DAG.getConstant(C1Int, dl, MVT::i32)); 13299 // Shift left to compensate for the lshr of C1Int. 13300 SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1)); 13301 13302 LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump(); 13303 SHL.dump(); N->dump()); 13304 LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump()); 13305 return Res; 13306 } 13307 13308 13309 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. 13310 /// 13311 static SDValue PerformADDCombine(SDNode *N, 13312 TargetLowering::DAGCombinerInfo &DCI, 13313 const ARMSubtarget *Subtarget) { 13314 SDValue N0 = N->getOperand(0); 13315 SDValue N1 = N->getOperand(1); 13316 13317 // Only works one way, because it needs an immediate operand. 13318 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 13319 return Result; 13320 13321 if (SDValue Result = PerformADDVecReduce(N, DCI.DAG, Subtarget)) 13322 return Result; 13323 13324 // First try with the default operand order. 13325 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget)) 13326 return Result; 13327 13328 // If that didn't work, try again with the operands commuted. 13329 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget); 13330 } 13331 13332 // Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC) 13333 // providing -X is as cheap as X (currently, just a constant). 13334 static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG) { 13335 if (N->getValueType(0) != MVT::i32 || !isNullConstant(N->getOperand(0))) 13336 return SDValue(); 13337 SDValue CSINC = N->getOperand(1); 13338 if (CSINC.getOpcode() != ARMISD::CSINC || !CSINC.hasOneUse()) 13339 return SDValue(); 13340 13341 ConstantSDNode *X = dyn_cast<ConstantSDNode>(CSINC.getOperand(0)); 13342 if (!X) 13343 return SDValue(); 13344 13345 return DAG.getNode(ARMISD::CSINV, SDLoc(N), MVT::i32, 13346 DAG.getNode(ISD::SUB, SDLoc(N), MVT::i32, N->getOperand(0), 13347 CSINC.getOperand(0)), 13348 CSINC.getOperand(1), CSINC.getOperand(2), 13349 CSINC.getOperand(3)); 13350 } 13351 13352 /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB. 13353 /// 13354 static SDValue PerformSUBCombine(SDNode *N, 13355 TargetLowering::DAGCombinerInfo &DCI, 13356 const ARMSubtarget *Subtarget) { 13357 SDValue N0 = N->getOperand(0); 13358 SDValue N1 = N->getOperand(1); 13359 13360 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) 13361 if (N1.getNode()->hasOneUse()) 13362 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI)) 13363 return Result; 13364 13365 if (SDValue R = PerformSubCSINCCombine(N, DCI.DAG)) 13366 return R; 13367 13368 if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector()) 13369 return SDValue(); 13370 13371 // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x)) 13372 // so that we can readily pattern match more mve instructions which can use 13373 // a scalar operand. 
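// The negation is then done on the scalar in a GPR, and the resulting VDUP can feed MVE instructions that take a register operand directly.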
13374 SDValue VDup = N->getOperand(1); 13375 if (VDup->getOpcode() != ARMISD::VDUP) 13376 return SDValue(); 13377 13378 SDValue VMov = N->getOperand(0); 13379 if (VMov->getOpcode() == ISD::BITCAST) 13380 VMov = VMov->getOperand(0); 13381 13382 if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov)) 13383 return SDValue(); 13384 13385 SDLoc dl(N); 13386 SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32, 13387 DCI.DAG.getConstant(0, dl, MVT::i32), 13388 VDup->getOperand(0)); 13389 return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate); 13390 } 13391 13392 /// PerformVMULCombine 13393 /// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the 13394 /// special multiplier accumulator forwarding. 13395 /// vmul d3, d0, d2 13396 /// vmla d3, d1, d2 13397 /// is faster than 13398 /// vadd d3, d0, d1 13399 /// vmul d3, d3, d2 13400 // However, for (A + B) * (A + B), 13401 // vadd d2, d0, d1 13402 // vmul d3, d0, d2 13403 // vmla d3, d1, d2 13404 // is slower than 13405 // vadd d2, d0, d1 13406 // vmul d3, d2, d2 13407 static SDValue PerformVMULCombine(SDNode *N, 13408 TargetLowering::DAGCombinerInfo &DCI, 13409 const ARMSubtarget *Subtarget) { 13410 if (!Subtarget->hasVMLxForwarding()) 13411 return SDValue(); 13412 13413 SelectionDAG &DAG = DCI.DAG; 13414 SDValue N0 = N->getOperand(0); 13415 SDValue N1 = N->getOperand(1); 13416 unsigned Opcode = N0.getOpcode(); 13417 if (Opcode != ISD::ADD && Opcode != ISD::SUB && 13418 Opcode != ISD::FADD && Opcode != ISD::FSUB) { 13419 Opcode = N1.getOpcode(); 13420 if (Opcode != ISD::ADD && Opcode != ISD::SUB && 13421 Opcode != ISD::FADD && Opcode != ISD::FSUB) 13422 return SDValue(); 13423 std::swap(N0, N1); 13424 } 13425 13426 if (N0 == N1) 13427 return SDValue(); 13428 13429 EVT VT = N->getValueType(0); 13430 SDLoc DL(N); 13431 SDValue N00 = N0->getOperand(0); 13432 SDValue N01 = N0->getOperand(1); 13433 return DAG.getNode(Opcode, DL, VT, 13434 DAG.getNode(ISD::MUL, DL, VT, N00, N1), 13435 DAG.getNode(ISD::MUL, DL, VT, N01, N1)); 13436 } 13437 13438 static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG, 13439 const ARMSubtarget *Subtarget) { 13440 EVT VT = N->getValueType(0); 13441 if (VT != MVT::v2i64) 13442 return SDValue(); 13443 13444 SDValue N0 = N->getOperand(0); 13445 SDValue N1 = N->getOperand(1); 13446 13447 auto IsSignExt = [&](SDValue Op) { 13448 if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG) 13449 return SDValue(); 13450 EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT(); 13451 if (VT.getScalarSizeInBits() == 32) 13452 return Op->getOperand(0); 13453 return SDValue(); 13454 }; 13455 auto IsZeroExt = [&](SDValue Op) { 13456 // Zero extends are a little more awkward. At the point we are matching 13457 // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask. 13458 // That might be before of after a bitcast depending on how the and is 13459 // placed. Because this has to look through bitcasts, it is currently only 13460 // supported on LE. 
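// The expected form is e.g. (and X, (bitcast (build_vector -1, 0, -1, 0))), which keeps only the low 32 bits of each v2i64 lane.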
13461 if (!Subtarget->isLittle()) 13462 return SDValue(); 13463 13464 SDValue And = Op; 13465 if (And->getOpcode() == ISD::BITCAST) 13466 And = And->getOperand(0); 13467 if (And->getOpcode() != ISD::AND) 13468 return SDValue(); 13469 SDValue Mask = And->getOperand(1); 13470 if (Mask->getOpcode() == ISD::BITCAST) 13471 Mask = Mask->getOperand(0); 13472 13473 if (Mask->getOpcode() != ISD::BUILD_VECTOR || 13474 Mask.getValueType() != MVT::v4i32) 13475 return SDValue(); 13476 if (isAllOnesConstant(Mask->getOperand(0)) && 13477 isNullConstant(Mask->getOperand(1)) && 13478 isAllOnesConstant(Mask->getOperand(2)) && 13479 isNullConstant(Mask->getOperand(3))) 13480 return And->getOperand(0); 13481 return SDValue(); 13482 }; 13483 13484 SDLoc dl(N); 13485 if (SDValue Op0 = IsSignExt(N0)) { 13486 if (SDValue Op1 = IsSignExt(N1)) { 13487 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0); 13488 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1); 13489 return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a); 13490 } 13491 } 13492 if (SDValue Op0 = IsZeroExt(N0)) { 13493 if (SDValue Op1 = IsZeroExt(N1)) { 13494 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0); 13495 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1); 13496 return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a); 13497 } 13498 } 13499 13500 return SDValue(); 13501 } 13502 13503 static SDValue PerformMULCombine(SDNode *N, 13504 TargetLowering::DAGCombinerInfo &DCI, 13505 const ARMSubtarget *Subtarget) { 13506 SelectionDAG &DAG = DCI.DAG; 13507 13508 EVT VT = N->getValueType(0); 13509 if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64) 13510 return PerformMVEVMULLCombine(N, DAG, Subtarget); 13511 13512 if (Subtarget->isThumb1Only()) 13513 return SDValue(); 13514 13515 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 13516 return SDValue(); 13517 13518 if (VT.is64BitVector() || VT.is128BitVector()) 13519 return PerformVMULCombine(N, DCI, Subtarget); 13520 if (VT != MVT::i32) 13521 return SDValue(); 13522 13523 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 13524 if (!C) 13525 return SDValue(); 13526 13527 int64_t MulAmt = C->getSExtValue(); 13528 unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt); 13529 13530 ShiftAmt = ShiftAmt & (32 - 1); 13531 SDValue V = N->getOperand(0); 13532 SDLoc DL(N); 13533 13534 SDValue Res; 13535 MulAmt >>= ShiftAmt; 13536 13537 if (MulAmt >= 0) { 13538 if (isPowerOf2_32(MulAmt - 1)) { 13539 // (mul x, 2^N + 1) => (add (shl x, N), x) 13540 Res = DAG.getNode(ISD::ADD, DL, VT, 13541 V, 13542 DAG.getNode(ISD::SHL, DL, VT, 13543 V, 13544 DAG.getConstant(Log2_32(MulAmt - 1), DL, 13545 MVT::i32))); 13546 } else if (isPowerOf2_32(MulAmt + 1)) { 13547 // (mul x, 2^N - 1) => (sub (shl x, N), x) 13548 Res = DAG.getNode(ISD::SUB, DL, VT, 13549 DAG.getNode(ISD::SHL, DL, VT, 13550 V, 13551 DAG.getConstant(Log2_32(MulAmt + 1), DL, 13552 MVT::i32)), 13553 V); 13554 } else 13555 return SDValue(); 13556 } else { 13557 uint64_t MulAmtAbs = -MulAmt; 13558 if (isPowerOf2_32(MulAmtAbs + 1)) { 13559 // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) 13560 Res = DAG.getNode(ISD::SUB, DL, VT, 13561 V, 13562 DAG.getNode(ISD::SHL, DL, VT, 13563 V, 13564 DAG.getConstant(Log2_32(MulAmtAbs + 1), DL, 13565 MVT::i32))); 13566 } else if (isPowerOf2_32(MulAmtAbs - 1)) { 13567 // (mul x, -(2^N + 1)) => - (add (shl x, N), x) 13568 Res = DAG.getNode(ISD::ADD, DL, VT, 13569 V, 13570 DAG.getNode(ISD::SHL, DL, VT, 13571 V, 
13572 DAG.getConstant(Log2_32(MulAmtAbs - 1), DL, 13573 MVT::i32))); 13574 Res = DAG.getNode(ISD::SUB, DL, VT, 13575 DAG.getConstant(0, DL, MVT::i32), Res); 13576 } else 13577 return SDValue(); 13578 } 13579 13580 if (ShiftAmt != 0) 13581 Res = DAG.getNode(ISD::SHL, DL, VT, 13582 Res, DAG.getConstant(ShiftAmt, DL, MVT::i32)); 13583 13584 // Do not add new nodes to DAG combiner worklist. 13585 DCI.CombineTo(N, Res, false); 13586 return SDValue(); 13587 } 13588 13589 static SDValue CombineANDShift(SDNode *N, 13590 TargetLowering::DAGCombinerInfo &DCI, 13591 const ARMSubtarget *Subtarget) { 13592 // Allow DAGCombine to pattern-match before we touch the canonical form. 13593 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 13594 return SDValue(); 13595 13596 if (N->getValueType(0) != MVT::i32) 13597 return SDValue(); 13598 13599 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 13600 if (!N1C) 13601 return SDValue(); 13602 13603 uint32_t C1 = (uint32_t)N1C->getZExtValue(); 13604 // Don't transform uxtb/uxth. 13605 if (C1 == 255 || C1 == 65535) 13606 return SDValue(); 13607 13608 SDNode *N0 = N->getOperand(0).getNode(); 13609 if (!N0->hasOneUse()) 13610 return SDValue(); 13611 13612 if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL) 13613 return SDValue(); 13614 13615 bool LeftShift = N0->getOpcode() == ISD::SHL; 13616 13617 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1)); 13618 if (!N01C) 13619 return SDValue(); 13620 13621 uint32_t C2 = (uint32_t)N01C->getZExtValue(); 13622 if (!C2 || C2 >= 32) 13623 return SDValue(); 13624 13625 // Clear irrelevant bits in the mask. 13626 if (LeftShift) 13627 C1 &= (-1U << C2); 13628 else 13629 C1 &= (-1U >> C2); 13630 13631 SelectionDAG &DAG = DCI.DAG; 13632 SDLoc DL(N); 13633 13634 // We have a pattern of the form "(and (shl x, c2) c1)" or 13635 // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to 13636 // transform to a pair of shifts, to save materializing c1. 13637 13638 // First pattern: right shift, then mask off leading bits. 13639 // FIXME: Use demanded bits? 13640 if (!LeftShift && isMask_32(C1)) { 13641 uint32_t C3 = countLeadingZeros(C1); 13642 if (C2 < C3) { 13643 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 13644 DAG.getConstant(C3 - C2, DL, MVT::i32)); 13645 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL, 13646 DAG.getConstant(C3, DL, MVT::i32)); 13647 } 13648 } 13649 13650 // First pattern, reversed: left shift, then mask off trailing bits. 13651 if (LeftShift && isMask_32(~C1)) { 13652 uint32_t C3 = countTrailingZeros(C1); 13653 if (C2 < C3) { 13654 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0), 13655 DAG.getConstant(C3 - C2, DL, MVT::i32)); 13656 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL, 13657 DAG.getConstant(C3, DL, MVT::i32)); 13658 } 13659 } 13660 13661 // Second pattern: left shift, then mask off leading bits. 13662 // FIXME: Use demanded bits? 13663 if (LeftShift && isShiftedMask_32(C1)) { 13664 uint32_t Trailing = countTrailingZeros(C1); 13665 uint32_t C3 = countLeadingZeros(C1); 13666 if (Trailing == C2 && C2 + C3 < 32) { 13667 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 13668 DAG.getConstant(C2 + C3, DL, MVT::i32)); 13669 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL, 13670 DAG.getConstant(C3, DL, MVT::i32)); 13671 } 13672 } 13673 13674 // Second pattern, reversed: right shift, then mask off trailing bits. 13675 // FIXME: Handle other patterns of known/demanded bits. 
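// e.g. (and (srl x, 8), 0x00ffff00) -> (shl (srl x, 16), 8)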
13676 if (!LeftShift && isShiftedMask_32(C1)) { 13677 uint32_t Leading = countLeadingZeros(C1); 13678 uint32_t C3 = countTrailingZeros(C1); 13679 if (Leading == C2 && C2 + C3 < 32) { 13680 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0), 13681 DAG.getConstant(C2 + C3, DL, MVT::i32)); 13682 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL, 13683 DAG.getConstant(C3, DL, MVT::i32)); 13684 } 13685 } 13686 13687 // FIXME: Transform "(and (shl x, c2) c1)" -> 13688 // "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate than 13689 // c1. 13690 return SDValue(); 13691 } 13692 13693 static SDValue PerformANDCombine(SDNode *N, 13694 TargetLowering::DAGCombinerInfo &DCI, 13695 const ARMSubtarget *Subtarget) { 13696 // Attempt to use immediate-form VBIC 13697 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); 13698 SDLoc dl(N); 13699 EVT VT = N->getValueType(0); 13700 SelectionDAG &DAG = DCI.DAG; 13701 13702 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v4i1 || 13703 VT == MVT::v8i1 || VT == MVT::v16i1) 13704 return SDValue(); 13705 13706 APInt SplatBits, SplatUndef; 13707 unsigned SplatBitSize; 13708 bool HasAnyUndefs; 13709 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) && 13710 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 13711 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 || 13712 SplatBitSize == 64) { 13713 EVT VbicVT; 13714 SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(), 13715 SplatUndef.getZExtValue(), SplatBitSize, 13716 DAG, dl, VbicVT, VT, OtherModImm); 13717 if (Val.getNode()) { 13718 SDValue Input = 13719 DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0)); 13720 SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val); 13721 return DAG.getNode(ISD::BITCAST, dl, VT, Vbic); 13722 } 13723 } 13724 } 13725 13726 if (!Subtarget->isThumb1Only()) { 13727 // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) 13728 if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI)) 13729 return Result; 13730 13731 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 13732 return Result; 13733 } 13734 13735 if (Subtarget->isThumb1Only()) 13736 if (SDValue Result = CombineANDShift(N, DCI, Subtarget)) 13737 return Result; 13738 13739 return SDValue(); 13740 } 13741 13742 // Try combining OR nodes to SMULWB, SMULWT. 13743 static SDValue PerformORCombineToSMULWBT(SDNode *OR, 13744 TargetLowering::DAGCombinerInfo &DCI, 13745 const ARMSubtarget *Subtarget) { 13746 if (!Subtarget->hasV6Ops() || 13747 (Subtarget->isThumb() && 13748 (!Subtarget->hasThumb2() || !Subtarget->hasDSP()))) 13749 return SDValue(); 13750 13751 SDValue SRL = OR->getOperand(0); 13752 SDValue SHL = OR->getOperand(1); 13753 13754 if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) { 13755 SRL = OR->getOperand(1); 13756 SHL = OR->getOperand(0); 13757 } 13758 if (!isSRL16(SRL) || !isSHL16(SHL)) 13759 return SDValue(); 13760 13761 // The first operands to the shifts need to be the two results from the 13762 // same smul_lohi node. 
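// Together, (or (srl lo, 16), (shl hi, 16)) selects the middle 32 bits of the 64-bit product, which is what SMULW[B|T] computes for a 32x16 multiply.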
13763 if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) || 13764 SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI) 13765 return SDValue(); 13766 13767 SDNode *SMULLOHI = SRL.getOperand(0).getNode(); 13768 if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) || 13769 SHL.getOperand(0) != SDValue(SMULLOHI, 1)) 13770 return SDValue(); 13771 13772 // Now we have: 13773 // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)) 13774 // For SMULW[B|T] smul_lohi will take a 32-bit and a 16-bit argument. 13775 // For SMULWB the 16-bit value will be sign extended somehow. 13776 // For SMULWT only the SRA is required. 13777 // Check both sides of SMUL_LOHI 13778 SDValue OpS16 = SMULLOHI->getOperand(0); 13779 SDValue OpS32 = SMULLOHI->getOperand(1); 13780 13781 SelectionDAG &DAG = DCI.DAG; 13782 if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) { 13783 OpS16 = OpS32; 13784 OpS32 = SMULLOHI->getOperand(0); 13785 } 13786 13787 SDLoc dl(OR); 13788 unsigned Opcode = 0; 13789 if (isS16(OpS16, DAG)) 13790 Opcode = ARMISD::SMULWB; 13791 else if (isSRA16(OpS16)) { 13792 Opcode = ARMISD::SMULWT; 13793 OpS16 = OpS16->getOperand(0); 13794 } 13795 else 13796 return SDValue(); 13797 13798 SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16); 13799 DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res); 13800 return SDValue(OR, 0); 13801 } 13802 13803 static SDValue PerformORCombineToBFI(SDNode *N, 13804 TargetLowering::DAGCombinerInfo &DCI, 13805 const ARMSubtarget *Subtarget) { 13806 // BFI is only available on V6T2+ 13807 if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops()) 13808 return SDValue(); 13809 13810 EVT VT = N->getValueType(0); 13811 SDValue N0 = N->getOperand(0); 13812 SDValue N1 = N->getOperand(1); 13813 SelectionDAG &DAG = DCI.DAG; 13814 SDLoc DL(N); 13815 // 1) or (and A, mask), val => ARMbfi A, val, mask 13816 // iff (val & mask) == val 13817 // 13818 // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask 13819 // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2) 13820 // && mask == ~mask2 13821 // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2) 13822 // && ~mask == mask2 13823 // (i.e., copy a bitfield value into another bitfield of the same width) 13824 13825 if (VT != MVT::i32) 13826 return SDValue(); 13827 13828 SDValue N00 = N0.getOperand(0); 13829 13830 // The value and the mask need to be constants so we can verify this is 13831 // actually a bitfield set. If the mask is 0xffff, we can do better 13832 // via a movt instruction, so don't use BFI in that case. 13833 SDValue MaskOp = N0.getOperand(1); 13834 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp); 13835 if (!MaskC) 13836 return SDValue(); 13837 unsigned Mask = MaskC->getZExtValue(); 13838 if (Mask == 0xffff) 13839 return SDValue(); 13840 SDValue Res; 13841 // Case (1): or (and A, mask), val => ARMbfi A, val, mask 13842 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 13843 if (N1C) { 13844 unsigned Val = N1C->getZExtValue(); 13845 if ((Val & ~Mask) != Val) 13846 return SDValue(); 13847 13848 if (ARM::isBitFieldInvertedMask(Mask)) { 13849 Val >>= countTrailingZeros(~Mask); 13850 13851 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, 13852 DAG.getConstant(Val, DL, MVT::i32), 13853 DAG.getConstant(Mask, DL, MVT::i32)); 13854 13855 DCI.CombineTo(N, Res, false); 13856 // Return value from the original node to inform the combiner that N is 13857 // now dead.
13858 return SDValue(N, 0); 13859 } 13860 } else if (N1.getOpcode() == ISD::AND) { 13861 // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask 13862 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); 13863 if (!N11C) 13864 return SDValue(); 13865 unsigned Mask2 = N11C->getZExtValue(); 13866 13867 // Mask and ~Mask2 (or the reverse) must be equivalent for the BFI pattern 13868 // to match as-is. 13869 if (ARM::isBitFieldInvertedMask(Mask) && 13870 (Mask == ~Mask2)) { 13871 // The pack halfword instruction works better for masks that fit it, 13872 // so use that when it's available. 13873 if (Subtarget->hasDSP() && 13874 (Mask == 0xffff || Mask == 0xffff0000)) 13875 return SDValue(); 13876 // 2a 13877 unsigned amt = countTrailingZeros(Mask2); 13878 Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0), 13879 DAG.getConstant(amt, DL, MVT::i32)); 13880 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res, 13881 DAG.getConstant(Mask, DL, MVT::i32)); 13882 DCI.CombineTo(N, Res, false); 13883 // Return value from the original node to inform the combiner that N is 13884 // now dead. 13885 return SDValue(N, 0); 13886 } else if (ARM::isBitFieldInvertedMask(~Mask) && 13887 (~Mask == Mask2)) { 13888 // The pack halfword instruction works better for masks that fit it, 13889 // so use that when it's available. 13890 if (Subtarget->hasDSP() && 13891 (Mask2 == 0xffff || Mask2 == 0xffff0000)) 13892 return SDValue(); 13893 // 2b 13894 unsigned lsb = countTrailingZeros(Mask); 13895 Res = DAG.getNode(ISD::SRL, DL, VT, N00, 13896 DAG.getConstant(lsb, DL, MVT::i32)); 13897 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res, 13898 DAG.getConstant(Mask2, DL, MVT::i32)); 13899 DCI.CombineTo(N, Res, false); 13900 // Return value from the original node to inform the combiner that N is 13901 // now dead. 13902 return SDValue(N, 0); 13903 } 13904 } 13905 13906 if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) && 13907 N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) && 13908 ARM::isBitFieldInvertedMask(~Mask)) { 13909 // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask 13910 // where lsb(mask) == #shamt and masked bits of B are known zero. 13911 SDValue ShAmt = N00.getOperand(1); 13912 unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue(); 13913 unsigned LSB = countTrailingZeros(Mask); 13914 if (ShAmtC != LSB) 13915 return SDValue(); 13916 13917 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0), 13918 DAG.getConstant(~Mask, DL, MVT::i32)); 13919 13920 DCI.CombineTo(N, Res, false); 13921 // Return value from the original node to inform the combiner that N is 13922 // now dead.
13923 return SDValue(N, 0); 13924 } 13925 13926 return SDValue(); 13927 } 13928 13929 static bool isValidMVECond(unsigned CC, bool IsFloat) { 13930 switch (CC) { 13931 case ARMCC::EQ: 13932 case ARMCC::NE: 13933 case ARMCC::LE: 13934 case ARMCC::GT: 13935 case ARMCC::GE: 13936 case ARMCC::LT: 13937 return true; 13938 case ARMCC::HS: 13939 case ARMCC::HI: 13940 return !IsFloat; 13941 default: 13942 return false; 13943 }; 13944 } 13945 13946 static ARMCC::CondCodes getVCMPCondCode(SDValue N) { 13947 if (N->getOpcode() == ARMISD::VCMP) 13948 return (ARMCC::CondCodes)N->getConstantOperandVal(2); 13949 else if (N->getOpcode() == ARMISD::VCMPZ) 13950 return (ARMCC::CondCodes)N->getConstantOperandVal(1); 13951 else 13952 llvm_unreachable("Not a VCMP/VCMPZ!"); 13953 } 13954 13955 static bool CanInvertMVEVCMP(SDValue N) { 13956 ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N)); 13957 return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint()); 13958 } 13959 13960 static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG, 13961 const ARMSubtarget *Subtarget) { 13962 // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain 13963 // together with predicates 13964 EVT VT = N->getValueType(0); 13965 SDLoc DL(N); 13966 SDValue N0 = N->getOperand(0); 13967 SDValue N1 = N->getOperand(1); 13968 13969 auto IsFreelyInvertable = [&](SDValue V) { 13970 if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ) 13971 return CanInvertMVEVCMP(V); 13972 return false; 13973 }; 13974 13975 // At least one operand must be freely invertable. 13976 if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1))) 13977 return SDValue(); 13978 13979 SDValue NewN0 = DAG.getLogicalNOT(DL, N0, VT); 13980 SDValue NewN1 = DAG.getLogicalNOT(DL, N1, VT); 13981 SDValue And = DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1); 13982 return DAG.getLogicalNOT(DL, And, VT); 13983 } 13984 13985 /// PerformORCombine - Target-specific dag combine xforms for ISD::OR 13986 static SDValue PerformORCombine(SDNode *N, 13987 TargetLowering::DAGCombinerInfo &DCI, 13988 const ARMSubtarget *Subtarget) { 13989 // Attempt to use immediate-form VORR 13990 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); 13991 SDLoc dl(N); 13992 EVT VT = N->getValueType(0); 13993 SelectionDAG &DAG = DCI.DAG; 13994 13995 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 13996 return SDValue(); 13997 13998 if (Subtarget->hasMVEIntegerOps() && 13999 (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)) 14000 return PerformORCombine_i1(N, DAG, Subtarget); 14001 14002 APInt SplatBits, SplatUndef; 14003 unsigned SplatBitSize; 14004 bool HasAnyUndefs; 14005 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) && 14006 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 14007 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 || 14008 SplatBitSize == 64) { 14009 EVT VorrVT; 14010 SDValue Val = 14011 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), 14012 SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm); 14013 if (Val.getNode()) { 14014 SDValue Input = 14015 DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0)); 14016 SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val); 14017 return DAG.getNode(ISD::BITCAST, dl, VT, Vorr); 14018 } 14019 } 14020 } 14021 14022 if (!Subtarget->isThumb1Only()) { 14023 // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) 14024 if (SDValue Result = 
combineSelectAndUseCommutative(N, false, DCI)) 14025 return Result; 14026 if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget)) 14027 return Result; 14028 } 14029 14030 SDValue N0 = N->getOperand(0); 14031 SDValue N1 = N->getOperand(1); 14032 14033 // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant. 14034 if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() && 14035 DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 14036 14037 // The code below optimizes (or (and X, Y), Z). 14038 // The AND operand needs to have a single user to make these optimizations 14039 // profitable. 14040 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse()) 14041 return SDValue(); 14042 14043 APInt SplatUndef; 14044 unsigned SplatBitSize; 14045 bool HasAnyUndefs; 14046 14047 APInt SplatBits0, SplatBits1; 14048 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1)); 14049 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1)); 14050 // Ensure that the second operand of both ands are constants 14051 if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize, 14052 HasAnyUndefs) && !HasAnyUndefs) { 14053 if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize, 14054 HasAnyUndefs) && !HasAnyUndefs) { 14055 // Ensure that the bit width of the constants are the same and that 14056 // the splat arguments are logical inverses as per the pattern we 14057 // are trying to simplify. 14058 if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() && 14059 SplatBits0 == ~SplatBits1) { 14060 // Canonicalize the vector type to make instruction selection 14061 // simpler. 14062 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; 14063 SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT, 14064 N0->getOperand(1), 14065 N0->getOperand(0), 14066 N1->getOperand(0)); 14067 return DAG.getNode(ISD::BITCAST, dl, VT, Result); 14068 } 14069 } 14070 } 14071 } 14072 14073 // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when 14074 // reasonable. 14075 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) { 14076 if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget)) 14077 return Res; 14078 } 14079 14080 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 14081 return Result; 14082 14083 return SDValue(); 14084 } 14085 14086 static SDValue PerformXORCombine(SDNode *N, 14087 TargetLowering::DAGCombinerInfo &DCI, 14088 const ARMSubtarget *Subtarget) { 14089 EVT VT = N->getValueType(0); 14090 SelectionDAG &DAG = DCI.DAG; 14091 14092 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 14093 return SDValue(); 14094 14095 if (!Subtarget->isThumb1Only()) { 14096 // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) 14097 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI)) 14098 return Result; 14099 14100 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 14101 return Result; 14102 } 14103 14104 if (Subtarget->hasMVEIntegerOps()) { 14105 // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition. 
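// e.g. (xor (vcmp a, b, eq), splat(1)) -> (vcmp a, b, ne), avoiding a separate VPNOT.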
14106 SDValue N0 = N->getOperand(0); 14107 SDValue N1 = N->getOperand(1); 14108 const TargetLowering *TLI = Subtarget->getTargetLowering(); 14109 if (TLI->isConstTrueVal(N1.getNode()) && 14110 (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) { 14111 if (CanInvertMVEVCMP(N0)) { 14112 SDLoc DL(N0); 14113 ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N0)); 14114 14115 SmallVector<SDValue, 4> Ops; 14116 Ops.push_back(N0->getOperand(0)); 14117 if (N0->getOpcode() == ARMISD::VCMP) 14118 Ops.push_back(N0->getOperand(1)); 14119 Ops.push_back(DAG.getConstant(CC, DL, MVT::i32)); 14120 return DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops); 14121 } 14122 } 14123 } 14124 14125 return SDValue(); 14126 } 14127 14128 // ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it, 14129 // and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and 14130 // their position in "to" (Rd). 14131 static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) { 14132 assert(N->getOpcode() == ARMISD::BFI); 14133 14134 SDValue From = N->getOperand(1); 14135 ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue(); 14136 FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation()); 14137 14138 // If the Base came from a SHR #C, we can deduce that it is really testing bit 14139 // #C in the base of the SHR. 14140 if (From->getOpcode() == ISD::SRL && 14141 isa<ConstantSDNode>(From->getOperand(1))) { 14142 APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue(); 14143 assert(Shift.getLimitedValue() < 32 && "Shift too large!"); 14144 FromMask <<= Shift.getLimitedValue(31); 14145 From = From->getOperand(0); 14146 } 14147 14148 return From; 14149 } 14150 14151 // If A and B contain one contiguous set of bits, does A | B == A . B? 14152 // 14153 // Neither A nor B must be zero. 14154 static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) { 14155 unsigned LastActiveBitInA = A.countTrailingZeros(); 14156 unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1; 14157 return LastActiveBitInA - 1 == FirstActiveBitInB; 14158 } 14159 14160 static SDValue FindBFIToCombineWith(SDNode *N) { 14161 // We have a BFI in N. Find a BFI it can combine with, if one exists. 14162 APInt ToMask, FromMask; 14163 SDValue From = ParseBFI(N, ToMask, FromMask); 14164 SDValue To = N->getOperand(0); 14165 14166 SDValue V = To; 14167 if (V.getOpcode() != ARMISD::BFI) 14168 return SDValue(); 14169 14170 APInt NewToMask, NewFromMask; 14171 SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask); 14172 if (NewFrom != From) 14173 return SDValue(); 14174 14175 // Do the written bits conflict with any we've seen so far? 14176 if ((NewToMask & ToMask).getBoolValue()) 14177 // Conflicting bits. 14178 return SDValue(); 14179 14180 // Are the new bits contiguous when combined with the old bits? 
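// e.g. ToMask 0x0000ff00 and NewToMask 0x000000ff form one contiguous run; the check below is performed in both orders.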
14181 if (BitsProperlyConcatenate(ToMask, NewToMask) && 14182 BitsProperlyConcatenate(FromMask, NewFromMask)) 14183 return V; 14184 if (BitsProperlyConcatenate(NewToMask, ToMask) && 14185 BitsProperlyConcatenate(NewFromMask, FromMask)) 14186 return V; 14187 14188 return SDValue(); 14189 } 14190 14191 static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) { 14192 SDValue N0 = N->getOperand(0); 14193 SDValue N1 = N->getOperand(1); 14194 14195 if (N1.getOpcode() == ISD::AND) { 14196 // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff 14197 // the bits being cleared by the AND are not demanded by the BFI. 14198 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); 14199 if (!N11C) 14200 return SDValue(); 14201 unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 14202 unsigned LSB = countTrailingZeros(~InvMask); 14203 unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB; 14204 assert(Width < 14205 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) && 14206 "undefined behavior"); 14207 unsigned Mask = (1u << Width) - 1; 14208 unsigned Mask2 = N11C->getZExtValue(); 14209 if ((Mask & (~Mask2)) == 0) 14210 return DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0), 14211 N->getOperand(0), N1.getOperand(0), N->getOperand(2)); 14212 return SDValue(); 14213 } 14214 14215 // Look for another BFI to combine with. 14216 if (SDValue CombineBFI = FindBFIToCombineWith(N)) { 14217 // We've found a BFI. 14218 APInt ToMask1, FromMask1; 14219 SDValue From1 = ParseBFI(N, ToMask1, FromMask1); 14220 14221 APInt ToMask2, FromMask2; 14222 SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2); 14223 assert(From1 == From2); 14224 (void)From2; 14225 14226 // Create a new BFI, combining the two together. 14227 APInt NewFromMask = FromMask1 | FromMask2; 14228 APInt NewToMask = ToMask1 | ToMask2; 14229 14230 EVT VT = N->getValueType(0); 14231 SDLoc dl(N); 14232 14233 if (NewFromMask[0] == 0) 14234 From1 = DAG.getNode( 14235 ISD::SRL, dl, VT, From1, 14236 DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT)); 14237 return DAG.getNode(ARMISD::BFI, dl, VT, CombineBFI.getOperand(0), From1, 14238 DAG.getConstant(~NewToMask, dl, VT)); 14239 } 14240 14241 // Reassociate BFI(BFI (A, B, M1), C, M2) to BFI(BFI (A, C, M2), B, M1) so 14242 // that lower bit insertions are performed first, providing that M1 and M2 14243 // do no overlap. This can allow multiple BFI instructions to be combined 14244 // together by the other folds above. 14245 if (N->getOperand(0).getOpcode() == ARMISD::BFI) { 14246 APInt ToMask1 = ~N->getConstantOperandAPInt(2); 14247 APInt ToMask2 = ~N0.getConstantOperandAPInt(2); 14248 14249 if (!N0.hasOneUse() || (ToMask1 & ToMask2) != 0 || 14250 ToMask1.countLeadingZeros() < ToMask2.countLeadingZeros()) 14251 return SDValue(); 14252 14253 EVT VT = N->getValueType(0); 14254 SDLoc dl(N); 14255 SDValue BFI1 = DAG.getNode(ARMISD::BFI, dl, VT, N0.getOperand(0), 14256 N->getOperand(1), N->getOperand(2)); 14257 return DAG.getNode(ARMISD::BFI, dl, VT, BFI1, N0.getOperand(1), 14258 N0.getOperand(2)); 14259 } 14260 14261 return SDValue(); 14262 } 14263 14264 /// PerformVMOVRRDCombine - Target-specific dag combine xforms for 14265 /// ARMISD::VMOVRRD. 
14266 static SDValue PerformVMOVRRDCombine(SDNode *N, 14267 TargetLowering::DAGCombinerInfo &DCI, 14268 const ARMSubtarget *Subtarget) { 14269 // vmovrrd(vmovdrr x, y) -> x,y 14270 SDValue InDouble = N->getOperand(0); 14271 if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64()) 14272 return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1)); 14273 14274 // vmovrrd(load f64) -> (load i32), (load i32) 14275 SDNode *InNode = InDouble.getNode(); 14276 if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() && 14277 InNode->getValueType(0) == MVT::f64 && 14278 InNode->getOperand(1).getOpcode() == ISD::FrameIndex && 14279 !cast<LoadSDNode>(InNode)->isVolatile()) { 14280 // TODO: Should this be done for non-FrameIndex operands? 14281 LoadSDNode *LD = cast<LoadSDNode>(InNode); 14282 14283 SelectionDAG &DAG = DCI.DAG; 14284 SDLoc DL(LD); 14285 SDValue BasePtr = LD->getBasePtr(); 14286 SDValue NewLD1 = 14287 DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(), 14288 LD->getAlignment(), LD->getMemOperand()->getFlags()); 14289 14290 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, 14291 DAG.getConstant(4, DL, MVT::i32)); 14292 14293 SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr, 14294 LD->getPointerInfo().getWithOffset(4), 14295 std::min(4U, LD->getAlignment()), 14296 LD->getMemOperand()->getFlags()); 14297 14298 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); 14299 if (DCI.DAG.getDataLayout().isBigEndian()) 14300 std::swap (NewLD1, NewLD2); 14301 SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2); 14302 return Result; 14303 } 14304 14305 // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d 14306 // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b 14307 if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT && 14308 isa<ConstantSDNode>(InDouble.getOperand(1))) { 14309 SDValue BV = InDouble.getOperand(0); 14310 // Look up through any nop bitcasts and vector_reg_casts. bitcasts may 14311 // change lane order under big endian. 14312 bool BVSwap = BV.getOpcode() == ISD::BITCAST; 14313 while ( 14314 (BV.getOpcode() == ISD::BITCAST || 14315 BV.getOpcode() == ARMISD::VECTOR_REG_CAST) && 14316 (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) { 14317 BVSwap = BV.getOpcode() == ISD::BITCAST; 14318 BV = BV.getOperand(0); 14319 } 14320 if (BV.getValueType() != MVT::v4i32) 14321 return SDValue(); 14322 14323 // Handle buildvectors, pulling out the correct lane depending on 14324 // endianness. 14325 unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 2 : 0; 14326 if (BV.getOpcode() == ISD::BUILD_VECTOR) { 14327 SDValue Op0 = BV.getOperand(Offset); 14328 SDValue Op1 = BV.getOperand(Offset + 1); 14329 if (!Subtarget->isLittle() && BVSwap) 14330 std::swap(Op0, Op1); 14331 14332 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N)); 14333 } 14334 14335 // A chain of insert_vectors, grabbing the correct value of the chain of 14336 // inserts. 
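// e.g. for Offset 0, a chain like insert(insert(undef, a, 0), b, 1) yields the pair {a, b}.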
14337 SDValue Op0, Op1; 14338 while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) { 14339 if (isa<ConstantSDNode>(BV.getOperand(2))) { 14340 if (BV.getConstantOperandVal(2) == Offset) 14341 Op0 = BV.getOperand(1); 14342 if (BV.getConstantOperandVal(2) == Offset + 1) 14343 Op1 = BV.getOperand(1); 14344 } 14345 BV = BV.getOperand(0); 14346 } 14347 if (!Subtarget->isLittle() && BVSwap) 14348 std::swap(Op0, Op1); 14349 if (Op0 && Op1) 14350 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N)); 14351 } 14352 14353 return SDValue(); 14354 } 14355 14356 /// PerformVMOVDRRCombine - Target-specific dag combine xforms for 14357 /// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands. 14358 static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { 14359 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X) 14360 SDValue Op0 = N->getOperand(0); 14361 SDValue Op1 = N->getOperand(1); 14362 if (Op0.getOpcode() == ISD::BITCAST) 14363 Op0 = Op0.getOperand(0); 14364 if (Op1.getOpcode() == ISD::BITCAST) 14365 Op1 = Op1.getOperand(0); 14366 if (Op0.getOpcode() == ARMISD::VMOVRRD && 14367 Op0.getNode() == Op1.getNode() && 14368 Op0.getResNo() == 0 && Op1.getResNo() == 1) 14369 return DAG.getNode(ISD::BITCAST, SDLoc(N), 14370 N->getValueType(0), Op0.getOperand(0)); 14371 return SDValue(); 14372 } 14373 14374 static SDValue PerformVMOVhrCombine(SDNode *N, 14375 TargetLowering::DAGCombinerInfo &DCI) { 14376 SDValue Op0 = N->getOperand(0); 14377 14378 // VMOVhr (VMOVrh (X)) -> X 14379 if (Op0->getOpcode() == ARMISD::VMOVrh) 14380 return Op0->getOperand(0); 14381 14382 // FullFP16: half values are passed in S-registers, and we don't 14383 // need any of the bitcast and moves: 14384 // 14385 // t2: f32,ch = CopyFromReg t0, Register:f32 %0 14386 // t5: i32 = bitcast t2 14387 // t18: f16 = ARMISD::VMOVhr t5 14388 if (Op0->getOpcode() == ISD::BITCAST) { 14389 SDValue Copy = Op0->getOperand(0); 14390 if (Copy.getValueType() == MVT::f32 && 14391 Copy->getOpcode() == ISD::CopyFromReg) { 14392 SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1)}; 14393 SDValue NewCopy = 14394 DCI.DAG.getNode(ISD::CopyFromReg, SDLoc(N), N->getValueType(0), Ops); 14395 return NewCopy; 14396 } 14397 } 14398 14399 // fold (VMOVhr (load x)) -> (load (f16*)x) 14400 if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) { 14401 if (LN0->hasOneUse() && LN0->isUnindexed() && 14402 LN0->getMemoryVT() == MVT::i16) { 14403 SDValue Load = 14404 DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(), 14405 LN0->getBasePtr(), LN0->getMemOperand()); 14406 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0)); 14407 DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1)); 14408 return Load; 14409 } 14410 } 14411 14412 // Only the bottom 16 bits of the source register are used. 
14413 APInt DemandedMask = APInt::getLowBitsSet(32, 16); 14414 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); 14415 if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI)) 14416 return SDValue(N, 0); 14417 14418 return SDValue(); 14419 } 14420 14421 static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG) { 14422 SDValue N0 = N->getOperand(0); 14423 EVT VT = N->getValueType(0); 14424 14425 // fold (VMOVrh (fpconst x)) -> const x 14426 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0)) { 14427 APFloat V = C->getValueAPF(); 14428 return DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT); 14429 } 14430 14431 // fold (VMOVrh (load x)) -> (zextload (i16*)x) 14432 if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) { 14433 LoadSDNode *LN0 = cast<LoadSDNode>(N0); 14434 14435 SDValue Load = 14436 DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(), 14437 LN0->getBasePtr(), MVT::i16, LN0->getMemOperand()); 14438 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0)); 14439 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1)); 14440 return Load; 14441 } 14442 14443 // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n) 14444 if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT && 14445 isa<ConstantSDNode>(N0->getOperand(1))) 14446 return DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0), 14447 N0->getOperand(1)); 14448 14449 return SDValue(); 14450 } 14451 14452 /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node 14453 /// are normal, non-volatile loads. If so, it is profitable to bitcast an 14454 /// i64 vector to have f64 elements, since the value can then be loaded 14455 /// directly into a VFP register. 14456 static bool hasNormalLoadOperand(SDNode *N) { 14457 unsigned NumElts = N->getValueType(0).getVectorNumElements(); 14458 for (unsigned i = 0; i < NumElts; ++i) { 14459 SDNode *Elt = N->getOperand(i).getNode(); 14460 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile()) 14461 return true; 14462 } 14463 return false; 14464 } 14465 14466 /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for 14467 /// ISD::BUILD_VECTOR. 14468 static SDValue PerformBUILD_VECTORCombine(SDNode *N, 14469 TargetLowering::DAGCombinerInfo &DCI, 14470 const ARMSubtarget *Subtarget) { 14471 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X): 14472 // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value 14473 // into a pair of GPRs, which is fine when the value is used as a scalar, 14474 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD. 14475 SelectionDAG &DAG = DCI.DAG; 14476 if (N->getNumOperands() == 2) 14477 if (SDValue RV = PerformVMOVDRRCombine(N, DAG)) 14478 return RV; 14479 14480 // Load i64 elements as f64 values so that type legalization does not split 14481 // them up into i32 values. 14482 EVT VT = N->getValueType(0); 14483 if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N)) 14484 return SDValue(); 14485 SDLoc dl(N); 14486 SmallVector<SDValue, 8> Ops; 14487 unsigned NumElts = VT.getVectorNumElements(); 14488 for (unsigned i = 0; i < NumElts; ++i) { 14489 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i)); 14490 Ops.push_back(V); 14491 // Make the DAGCombiner fold the bitcast. 
14492 DCI.AddToWorklist(V.getNode()); 14493 } 14494 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts); 14495 SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops); 14496 return DAG.getNode(ISD::BITCAST, dl, VT, BV); 14497 } 14498 14499 /// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR. 14500 static SDValue 14501 PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 14502 // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR. 14503 // At that time, we may have inserted bitcasts from integer to float. 14504 // If these bitcasts have survived DAGCombine, change the lowering of this 14505 // BUILD_VECTOR in something more vector friendly, i.e., that does not 14506 // force to use floating point types. 14507 14508 // Make sure we can change the type of the vector. 14509 // This is possible iff: 14510 // 1. The vector is only used in a bitcast to a integer type. I.e., 14511 // 1.1. Vector is used only once. 14512 // 1.2. Use is a bit convert to an integer type. 14513 // 2. The size of its operands are 32-bits (64-bits are not legal). 14514 EVT VT = N->getValueType(0); 14515 EVT EltVT = VT.getVectorElementType(); 14516 14517 // Check 1.1. and 2. 14518 if (EltVT.getSizeInBits() != 32 || !N->hasOneUse()) 14519 return SDValue(); 14520 14521 // By construction, the input type must be float. 14522 assert(EltVT == MVT::f32 && "Unexpected type!"); 14523 14524 // Check 1.2. 14525 SDNode *Use = *N->use_begin(); 14526 if (Use->getOpcode() != ISD::BITCAST || 14527 Use->getValueType(0).isFloatingPoint()) 14528 return SDValue(); 14529 14530 // Check profitability. 14531 // Model is, if more than half of the relevant operands are bitcast from 14532 // i32, turn the build_vector into a sequence of insert_vector_elt. 14533 // Relevant operands are everything that is not statically 14534 // (i.e., at compile time) bitcasted. 14535 unsigned NumOfBitCastedElts = 0; 14536 unsigned NumElts = VT.getVectorNumElements(); 14537 unsigned NumOfRelevantElts = NumElts; 14538 for (unsigned Idx = 0; Idx < NumElts; ++Idx) { 14539 SDValue Elt = N->getOperand(Idx); 14540 if (Elt->getOpcode() == ISD::BITCAST) { 14541 // Assume only bit cast to i32 will go away. 14542 if (Elt->getOperand(0).getValueType() == MVT::i32) 14543 ++NumOfBitCastedElts; 14544 } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt)) 14545 // Constants are statically casted, thus do not count them as 14546 // relevant operands. 14547 --NumOfRelevantElts; 14548 } 14549 14550 // Check if more than half of the elements require a non-free bitcast. 14551 if (NumOfBitCastedElts <= NumOfRelevantElts / 2) 14552 return SDValue(); 14553 14554 SelectionDAG &DAG = DCI.DAG; 14555 // Create the new vector type. 14556 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); 14557 // Check if the type is legal. 14558 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 14559 if (!TLI.isTypeLegal(VecVT)) 14560 return SDValue(); 14561 14562 // Combine: 14563 // ARMISD::BUILD_VECTOR E1, E2, ..., EN. 14564 // => BITCAST INSERT_VECTOR_ELT 14565 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1), 14566 // (BITCAST EN), N. 14567 SDValue Vec = DAG.getUNDEF(VecVT); 14568 SDLoc dl(N); 14569 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) { 14570 SDValue V = N->getOperand(Idx); 14571 if (V.isUndef()) 14572 continue; 14573 if (V.getOpcode() == ISD::BITCAST && 14574 V->getOperand(0).getValueType() == MVT::i32) 14575 // Fold obvious case. 
14576 V = V.getOperand(0); 14577 else { 14578 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V); 14579 // Make the DAGCombiner fold the bitcasts. 14580 DCI.AddToWorklist(V.getNode()); 14581 } 14582 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32); 14583 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx); 14584 } 14585 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec); 14586 // Make the DAGCombiner fold the bitcasts. 14587 DCI.AddToWorklist(Vec.getNode()); 14588 return Vec; 14589 } 14590 14591 static SDValue 14592 PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 14593 EVT VT = N->getValueType(0); 14594 SDValue Op = N->getOperand(0); 14595 SDLoc dl(N); 14596 14597 // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x) 14598 if (Op->getOpcode() == ARMISD::PREDICATE_CAST) { 14599 // If the valuetypes are the same, we can remove the cast entirely. 14600 if (Op->getOperand(0).getValueType() == VT) 14601 return Op->getOperand(0); 14602 return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0)); 14603 } 14604 14605 // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce 14606 // more VPNOT which might get folded as else predicates. 14607 if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) { 14608 SDValue X = 14609 DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0)); 14610 SDValue C = DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, 14611 DCI.DAG.getConstant(65535, dl, MVT::i32)); 14612 return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C); 14613 } 14614 14615 // Only the bottom 16 bits of the source register are used. 14616 if (Op.getValueType() == MVT::i32) { 14617 APInt DemandedMask = APInt::getLowBitsSet(32, 16); 14618 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); 14619 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI)) 14620 return SDValue(N, 0); 14621 } 14622 return SDValue(); 14623 } 14624 14625 static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG, 14626 const ARMSubtarget *ST) { 14627 EVT VT = N->getValueType(0); 14628 SDValue Op = N->getOperand(0); 14629 SDLoc dl(N); 14630 14631 // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST 14632 if (ST->isLittle()) 14633 return DAG.getNode(ISD::BITCAST, dl, VT, Op); 14634 14635 // VECTOR_REG_CAST undef -> undef 14636 if (Op.isUndef()) 14637 return DAG.getUNDEF(VT); 14638 14639 // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x) 14640 if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) { 14641 // If the valuetypes are the same, we can remove the cast entirely. 
14642 if (Op->getOperand(0).getValueType() == VT) 14643 return Op->getOperand(0); 14644 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0)); 14645 } 14646 14647 return SDValue(); 14648 } 14649 14650 static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG, 14651 const ARMSubtarget *Subtarget) { 14652 if (!Subtarget->hasMVEIntegerOps()) 14653 return SDValue(); 14654 14655 EVT VT = N->getValueType(0); 14656 SDValue Op0 = N->getOperand(0); 14657 SDValue Op1 = N->getOperand(1); 14658 ARMCC::CondCodes Cond = 14659 (ARMCC::CondCodes)cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 14660 SDLoc dl(N); 14661 14662 // vcmp X, 0, cc -> vcmpz X, cc 14663 if (isZeroVector(Op1)) 14664 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, N->getOperand(2)); 14665 14666 unsigned SwappedCond = getSwappedCondition(Cond); 14667 if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) { 14668 // vcmp 0, X, cc -> vcmpz X, reversed(cc) 14669 if (isZeroVector(Op0)) 14670 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1, 14671 DAG.getConstant(SwappedCond, dl, MVT::i32)); 14672 // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc) 14673 if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP) 14674 return DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0, 14675 DAG.getConstant(SwappedCond, dl, MVT::i32)); 14676 } 14677 14678 return SDValue(); 14679 } 14680 14681 /// PerformInsertEltCombine - Target-specific dag combine xforms for 14682 /// ISD::INSERT_VECTOR_ELT. 14683 static SDValue PerformInsertEltCombine(SDNode *N, 14684 TargetLowering::DAGCombinerInfo &DCI) { 14685 // Bitcast an i64 load inserted into a vector to f64. 14686 // Otherwise, the i64 value will be legalized to a pair of i32 values. 14687 EVT VT = N->getValueType(0); 14688 SDNode *Elt = N->getOperand(1).getNode(); 14689 if (VT.getVectorElementType() != MVT::i64 || 14690 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile()) 14691 return SDValue(); 14692 14693 SelectionDAG &DAG = DCI.DAG; 14694 SDLoc dl(N); 14695 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, 14696 VT.getVectorNumElements()); 14697 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0)); 14698 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1)); 14699 // Make the DAGCombiner fold the bitcasts. 14700 DCI.AddToWorklist(Vec.getNode()); 14701 DCI.AddToWorklist(V.getNode()); 14702 SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT, 14703 Vec, V, N->getOperand(2)); 14704 return DAG.getNode(ISD::BITCAST, dl, VT, InsElt); 14705 } 14706 14707 // Convert a pair of extracts from the same base vector to a VMOVRRD. Either 14708 // directly or bitcast to an integer if the original is a float vector. 
14709 // extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2) 14710 // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2) 14711 static SDValue 14712 PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 14713 EVT VT = N->getValueType(0); 14714 SDLoc dl(N); 14715 14716 if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32 || 14717 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(MVT::f64)) 14718 return SDValue(); 14719 14720 SDValue Ext = SDValue(N, 0); 14721 if (Ext.getOpcode() == ISD::BITCAST && 14722 Ext.getOperand(0).getValueType() == MVT::f32) 14723 Ext = Ext.getOperand(0); 14724 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 14725 !isa<ConstantSDNode>(Ext.getOperand(1)) || 14726 Ext.getConstantOperandVal(1) % 2 != 0) 14727 return SDValue(); 14728 if (Ext->use_size() == 1 && 14729 (Ext->use_begin()->getOpcode() == ISD::SINT_TO_FP || 14730 Ext->use_begin()->getOpcode() == ISD::UINT_TO_FP)) 14731 return SDValue(); 14732 14733 SDValue Op0 = Ext.getOperand(0); 14734 EVT VecVT = Op0.getValueType(); 14735 unsigned Lane = Ext.getConstantOperandVal(1); 14736 if (VecVT.getVectorNumElements() != 4) 14737 return SDValue(); 14738 14739 // Find another extract, of Lane + 1 14740 auto OtherIt = find_if(Op0->uses(), [&](SDNode *V) { 14741 return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT && 14742 isa<ConstantSDNode>(V->getOperand(1)) && 14743 V->getConstantOperandVal(1) == Lane + 1; 14744 }); 14745 if (OtherIt == Op0->uses().end()) 14746 return SDValue(); 14747 14748 // For float extracts, we need to be converting to a i32 for both vector 14749 // lanes. 14750 SDValue OtherExt(*OtherIt, 0); 14751 if (OtherExt.getValueType() != MVT::i32) { 14752 if (OtherExt->use_size() != 1 || 14753 OtherExt->use_begin()->getOpcode() != ISD::BITCAST || 14754 OtherExt->use_begin()->getValueType(0) != MVT::i32) 14755 return SDValue(); 14756 OtherExt = SDValue(*OtherExt->use_begin(), 0); 14757 } 14758 14759 // Convert the type to a f64 and extract with a VMOVRRD. 
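// For example, with a v4i32 Op0 and Lane == 2, the two extracts become
// VMOVRRD(extract (v2f64 (VECTOR_REG_CAST Op0)), 1), whose two i32 results
// replace the lane 2 and lane 3 extracts respectively.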
14760 SDValue F64 = DCI.DAG.getNode( 14761 ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 14762 DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0), 14763 DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32)); 14764 SDValue VMOVRRD = 14765 DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64); 14766 14767 DCI.CombineTo(OtherExt.getNode(), SDValue(VMOVRRD.getNode(), 1)); 14768 return VMOVRRD; 14769 } 14770 14771 static SDValue PerformExtractEltCombine(SDNode *N, 14772 TargetLowering::DAGCombinerInfo &DCI, 14773 const ARMSubtarget *ST) { 14774 SDValue Op0 = N->getOperand(0); 14775 EVT VT = N->getValueType(0); 14776 SDLoc dl(N); 14777 14778 // extract (vdup x) -> x 14779 if (Op0->getOpcode() == ARMISD::VDUP) { 14780 SDValue X = Op0->getOperand(0); 14781 if (VT == MVT::f16 && X.getValueType() == MVT::i32) 14782 return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X); 14783 if (VT == MVT::i32 && X.getValueType() == MVT::f16) 14784 return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X); 14785 if (VT == MVT::f32 && X.getValueType() == MVT::i32) 14786 return DCI.DAG.getNode(ISD::BITCAST, dl, VT, X); 14787 14788 while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST) 14789 X = X->getOperand(0); 14790 if (X.getValueType() == VT) 14791 return X; 14792 } 14793 14794 // extract ARM_BUILD_VECTOR -> x 14795 if (Op0->getOpcode() == ARMISD::BUILD_VECTOR && 14796 isa<ConstantSDNode>(N->getOperand(1)) && 14797 N->getConstantOperandVal(1) < Op0.getNumOperands()) { 14798 return Op0.getOperand(N->getConstantOperandVal(1)); 14799 } 14800 14801 // extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b 14802 if (Op0.getValueType() == MVT::v4i32 && 14803 isa<ConstantSDNode>(N->getOperand(1)) && 14804 Op0.getOpcode() == ISD::BITCAST && 14805 Op0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR && 14806 Op0.getOperand(0).getValueType() == MVT::v2f64) { 14807 SDValue BV = Op0.getOperand(0); 14808 unsigned Offset = N->getConstantOperandVal(1); 14809 SDValue MOV = BV.getOperand(Offset < 2 ? 0 : 1); 14810 if (MOV.getOpcode() == ARMISD::VMOVDRR) 14811 return MOV.getOperand(ST->isLittle() ? Offset % 2 : 1 - Offset % 2); 14812 } 14813 14814 // extract x, n; extract x, n+1 -> VMOVRRD x 14815 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI)) 14816 return R; 14817 14818 // extract (MVETrunc(x)) -> extract x 14819 if (Op0->getOpcode() == ARMISD::MVETRUNC) { 14820 unsigned Idx = N->getConstantOperandVal(1); 14821 unsigned Vec = 14822 Idx / Op0->getOperand(0).getValueType().getVectorNumElements(); 14823 unsigned SubIdx = 14824 Idx % Op0->getOperand(0).getValueType().getVectorNumElements(); 14825 return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Op0.getOperand(Vec), 14826 DCI.DAG.getConstant(SubIdx, dl, MVT::i32)); 14827 } 14828 14829 return SDValue(); 14830 } 14831 14832 static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG) { 14833 SDValue Op = N->getOperand(0); 14834 EVT VT = N->getValueType(0); 14835 14836 // sext_inreg(VGETLANEu) -> VGETLANEs 14837 if (Op.getOpcode() == ARMISD::VGETLANEu && 14838 cast<VTSDNode>(N->getOperand(1))->getVT() == 14839 Op.getOperand(0).getValueType().getScalarType()) 14840 return DAG.getNode(ARMISD::VGETLANEs, SDLoc(N), VT, Op.getOperand(0), 14841 Op.getOperand(1)); 14842 14843 return SDValue(); 14844 } 14845 14846 // When lowering complex nodes that we recognize, like VQDMULH and MULH, we 14847 // can end up with shuffle(binop(shuffle, shuffle)), that can be simplified to 14848 // binop as the shuffles cancel out. 
14849 static SDValue FlattenVectorShuffle(ShuffleVectorSDNode *N, SelectionDAG &DAG) { 14850 EVT VT = N->getValueType(0); 14851 if (!N->getOperand(1).isUndef() || N->getOperand(0).getValueType() != VT) 14852 return SDValue(); 14853 SDValue Op = N->getOperand(0); 14854 14855 // Looking for binary operators that will have been folded from 14856 // truncates/extends. 14857 switch (Op.getOpcode()) { 14858 case ARMISD::VQDMULH: 14859 case ISD::MULHS: 14860 case ISD::MULHU: 14861 case ISD::ABDS: 14862 case ISD::ABDU: 14863 break; 14864 default: 14865 return SDValue(); 14866 } 14867 14868 ShuffleVectorSDNode *Op0 = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0)); 14869 ShuffleVectorSDNode *Op1 = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1)); 14870 if (!Op0 || !Op1 || !Op0->getOperand(1).isUndef() || 14871 !Op1->getOperand(1).isUndef() || Op0->getMask() != Op1->getMask() || 14872 Op0->getOperand(0).getValueType() != VT) 14873 return SDValue(); 14874 14875 // Check the mask turns into an identity shuffle. 14876 ArrayRef<int> NMask = N->getMask(); 14877 ArrayRef<int> OpMask = Op0->getMask(); 14878 for (int i = 0, e = NMask.size(); i != e; i++) { 14879 if (NMask[i] > 0 && OpMask[NMask[i]] > 0 && OpMask[NMask[i]] != i) 14880 return SDValue(); 14881 } 14882 14883 return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), 14884 Op0->getOperand(0), Op1->getOperand(0)); 14885 } 14886 14887 // shuffle(MVETrunc(x, y)) -> VMOVN(x, y) 14888 static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N, 14889 SelectionDAG &DAG) { 14890 SDValue Trunc = N->getOperand(0); 14891 EVT VT = Trunc.getValueType(); 14892 if (Trunc.getOpcode() != ARMISD::MVETRUNC || !N->getOperand(1).isUndef()) 14893 return SDValue(); 14894 14895 SDLoc DL(Trunc); 14896 if (isVMOVNTruncMask(N->getMask(), VT, 0)) 14897 return DAG.getNode( 14898 ARMISD::VMOVN, DL, VT, 14899 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)), 14900 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)), 14901 DAG.getConstant(1, DL, MVT::i32)); 14902 else if (isVMOVNTruncMask(N->getMask(), VT, 1)) 14903 return DAG.getNode( 14904 ARMISD::VMOVN, DL, VT, 14905 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)), 14906 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)), 14907 DAG.getConstant(1, DL, MVT::i32)); 14908 return SDValue(); 14909 } 14910 14911 /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for 14912 /// ISD::VECTOR_SHUFFLE. 14913 static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { 14914 if (SDValue R = FlattenVectorShuffle(cast<ShuffleVectorSDNode>(N), DAG)) 14915 return R; 14916 if (SDValue R = PerformShuffleVMOVNCombine(cast<ShuffleVectorSDNode>(N), DAG)) 14917 return R; 14918 14919 // The LLVM shufflevector instruction does not require the shuffle mask 14920 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does 14921 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the 14922 // operands do not match the mask length, they are extended by concatenating 14923 // them with undef vectors. That is probably the right thing for other 14924 // targets, but for NEON it is better to concatenate two double-register 14925 // size vector operands into a single quad-register size vector. 
Do that 14926 // transformation here: 14927 // shuffle(concat(v1, undef), concat(v2, undef)) -> 14928 // shuffle(concat(v1, v2), undef) 14929 SDValue Op0 = N->getOperand(0); 14930 SDValue Op1 = N->getOperand(1); 14931 if (Op0.getOpcode() != ISD::CONCAT_VECTORS || 14932 Op1.getOpcode() != ISD::CONCAT_VECTORS || 14933 Op0.getNumOperands() != 2 || 14934 Op1.getNumOperands() != 2) 14935 return SDValue(); 14936 SDValue Concat0Op1 = Op0.getOperand(1); 14937 SDValue Concat1Op1 = Op1.getOperand(1); 14938 if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef()) 14939 return SDValue(); 14940 // Skip the transformation if any of the types are illegal. 14941 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 14942 EVT VT = N->getValueType(0); 14943 if (!TLI.isTypeLegal(VT) || 14944 !TLI.isTypeLegal(Concat0Op1.getValueType()) || 14945 !TLI.isTypeLegal(Concat1Op1.getValueType())) 14946 return SDValue(); 14947 14948 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, 14949 Op0.getOperand(0), Op1.getOperand(0)); 14950 // Translate the shuffle mask. 14951 SmallVector<int, 16> NewMask; 14952 unsigned NumElts = VT.getVectorNumElements(); 14953 unsigned HalfElts = NumElts/2; 14954 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); 14955 for (unsigned n = 0; n < NumElts; ++n) { 14956 int MaskElt = SVN->getMaskElt(n); 14957 int NewElt = -1; 14958 if (MaskElt < (int)HalfElts) 14959 NewElt = MaskElt; 14960 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts)) 14961 NewElt = HalfElts + MaskElt - NumElts; 14962 NewMask.push_back(NewElt); 14963 } 14964 return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat, 14965 DAG.getUNDEF(VT), NewMask); 14966 } 14967 14968 /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, 14969 /// NEON load/store intrinsics, and generic vector load/stores, to merge 14970 /// base address updates. 14971 /// For generic load/stores, the memory type is assumed to be a vector. 14972 /// The caller is assumed to have checked legality. 14973 static SDValue CombineBaseUpdate(SDNode *N, 14974 TargetLowering::DAGCombinerInfo &DCI) { 14975 SelectionDAG &DAG = DCI.DAG; 14976 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || 14977 N->getOpcode() == ISD::INTRINSIC_W_CHAIN); 14978 const bool isStore = N->getOpcode() == ISD::STORE; 14979 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1); 14980 SDValue Addr = N->getOperand(AddrOpIdx); 14981 MemSDNode *MemN = cast<MemSDNode>(N); 14982 SDLoc dl(N); 14983 14984 // Search for a use of the address operand that is an increment. 14985 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), 14986 UE = Addr.getNode()->use_end(); UI != UE; ++UI) { 14987 SDNode *User = *UI; 14988 if (User->getOpcode() != ISD::ADD || 14989 UI.getUse().getResNo() != Addr.getResNo()) 14990 continue; 14991 14992 // Check that the add is independent of the load/store. Otherwise, folding 14993 // it would create a cycle. We can avoid searching through Addr as it's a 14994 // predecessor to both. 14995 SmallPtrSet<const SDNode *, 32> Visited; 14996 SmallVector<const SDNode *, 16> Worklist; 14997 Visited.insert(Addr.getNode()); 14998 Worklist.push_back(N); 14999 Worklist.push_back(User); 15000 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) || 15001 SDNode::hasPredecessorHelper(User, Visited, Worklist)) 15002 continue; 15003 15004 // Find the new opcode for the updating load/store. 
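// The updating (_UPD) variants produce the written-back base address as an
// extra result, so the separate pointer add can be folded into, e.g.,
// vld1.32 {d16}, [r0]!.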
15005 bool isLoadOp = true; 15006 bool isLaneOp = false; 15007 // Workaround for vst1x and vld1x intrinsics which do not have alignment 15008 // as an operand. 15009 bool hasAlignment = true; 15010 unsigned NewOpc = 0; 15011 unsigned NumVecs = 0; 15012 if (isIntrinsic) { 15013 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 15014 switch (IntNo) { 15015 default: llvm_unreachable("unexpected intrinsic for Neon base update"); 15016 case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD; 15017 NumVecs = 1; break; 15018 case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD; 15019 NumVecs = 2; break; 15020 case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD; 15021 NumVecs = 3; break; 15022 case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD; 15023 NumVecs = 4; break; 15024 case Intrinsic::arm_neon_vld1x2: NewOpc = ARMISD::VLD1x2_UPD; 15025 NumVecs = 2; hasAlignment = false; break; 15026 case Intrinsic::arm_neon_vld1x3: NewOpc = ARMISD::VLD1x3_UPD; 15027 NumVecs = 3; hasAlignment = false; break; 15028 case Intrinsic::arm_neon_vld1x4: NewOpc = ARMISD::VLD1x4_UPD; 15029 NumVecs = 4; hasAlignment = false; break; 15030 case Intrinsic::arm_neon_vld2dup: NewOpc = ARMISD::VLD2DUP_UPD; 15031 NumVecs = 2; break; 15032 case Intrinsic::arm_neon_vld3dup: NewOpc = ARMISD::VLD3DUP_UPD; 15033 NumVecs = 3; break; 15034 case Intrinsic::arm_neon_vld4dup: NewOpc = ARMISD::VLD4DUP_UPD; 15035 NumVecs = 4; break; 15036 case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD; 15037 NumVecs = 2; isLaneOp = true; break; 15038 case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD; 15039 NumVecs = 3; isLaneOp = true; break; 15040 case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD; 15041 NumVecs = 4; isLaneOp = true; break; 15042 case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD; 15043 NumVecs = 1; isLoadOp = false; break; 15044 case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD; 15045 NumVecs = 2; isLoadOp = false; break; 15046 case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD; 15047 NumVecs = 3; isLoadOp = false; break; 15048 case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD; 15049 NumVecs = 4; isLoadOp = false; break; 15050 case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD; 15051 NumVecs = 2; isLoadOp = false; isLaneOp = true; break; 15052 case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD; 15053 NumVecs = 3; isLoadOp = false; isLaneOp = true; break; 15054 case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD; 15055 NumVecs = 4; isLoadOp = false; isLaneOp = true; break; 15056 case Intrinsic::arm_neon_vst1x2: NewOpc = ARMISD::VST1x2_UPD; 15057 NumVecs = 2; isLoadOp = false; hasAlignment = false; break; 15058 case Intrinsic::arm_neon_vst1x3: NewOpc = ARMISD::VST1x3_UPD; 15059 NumVecs = 3; isLoadOp = false; hasAlignment = false; break; 15060 case Intrinsic::arm_neon_vst1x4: NewOpc = ARMISD::VST1x4_UPD; 15061 NumVecs = 4; isLoadOp = false; hasAlignment = false; break; 15062 } 15063 } else { 15064 isLaneOp = true; 15065 switch (N->getOpcode()) { 15066 default: llvm_unreachable("unexpected opcode for Neon base update"); 15067 case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break; 15068 case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break; 15069 case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break; 15070 case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break; 15071 case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD; 15072 NumVecs = 1; 
isLaneOp = false; break; 15073 case ISD::STORE: NewOpc = ARMISD::VST1_UPD; 15074 NumVecs = 1; isLaneOp = false; isLoadOp = false; break; 15075 } 15076 } 15077 15078 // Find the size of memory referenced by the load/store. 15079 EVT VecTy; 15080 if (isLoadOp) { 15081 VecTy = N->getValueType(0); 15082 } else if (isIntrinsic) { 15083 VecTy = N->getOperand(AddrOpIdx+1).getValueType(); 15084 } else { 15085 assert(isStore && "Node has to be a load, a store, or an intrinsic!"); 15086 VecTy = N->getOperand(1).getValueType(); 15087 } 15088 15089 bool isVLDDUPOp = 15090 NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD || 15091 NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD; 15092 15093 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; 15094 if (isLaneOp || isVLDDUPOp) 15095 NumBytes /= VecTy.getVectorNumElements(); 15096 15097 // If the increment is a constant, it must match the memory ref size. 15098 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); 15099 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode()); 15100 if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) { 15101 // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two 15102 // separate instructions that make it harder to use a non-constant update. 15103 continue; 15104 } 15105 15106 // OK, we found an ADD we can fold into the base update. 15107 // Now, create a _UPD node, taking care of not breaking alignment. 15108 15109 EVT AlignedVecTy = VecTy; 15110 unsigned Alignment = MemN->getAlignment(); 15111 15112 // If this is a less-than-standard-aligned load/store, change the type to 15113 // match the standard alignment. 15114 // The alignment is overlooked when selecting _UPD variants; and it's 15115 // easier to introduce bitcasts here than fix that. 15116 // There are 3 ways to get to this base-update combine: 15117 // - intrinsics: they are assumed to be properly aligned (to the standard 15118 // alignment of the memory type), so we don't need to do anything. 15119 // - ARMISD::VLDx nodes: they are only generated from the aforementioned 15120 // intrinsics, so, likewise, there's nothing to do. 15121 // - generic load/store instructions: the alignment is specified as an 15122 // explicit operand, rather than implicitly as the standard alignment 15123 // of the memory type (like the intrisics). We need to change the 15124 // memory type to match the explicit alignment. That way, we don't 15125 // generate non-standard-aligned ARMISD::VLDx nodes. 15126 if (isa<LSBaseSDNode>(N)) { 15127 if (Alignment == 0) 15128 Alignment = 1; 15129 if (Alignment < VecTy.getScalarSizeInBits() / 8) { 15130 MVT EltTy = MVT::getIntegerVT(Alignment * 8); 15131 assert(NumVecs == 1 && "Unexpected multi-element generic load/store."); 15132 assert(!isLaneOp && "Unexpected generic load/store lane."); 15133 unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8); 15134 AlignedVecTy = MVT::getVectorVT(EltTy, NumElts); 15135 } 15136 // Don't set an explicit alignment on regular load/stores that we want 15137 // to transform to VLD/VST 1_UPD nodes. 15138 // This matches the behavior of regular load/stores, which only get an 15139 // explicit alignment if the MMO alignment is larger than the standard 15140 // alignment of the memory type. 15141 // Intrinsics, however, always get an explicit alignment, set to the 15142 // alignment of the MMO. 15143 Alignment = 1; 15144 } 15145 15146 // Create the new updating load/store node. 
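// The new node returns the loaded vectors (for loads), the incremented base
// pointer as an i32, and the output chain, in that order.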
15147 // First, create an SDVTList for the new updating node's results. 15148 EVT Tys[6]; 15149 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0); 15150 unsigned n; 15151 for (n = 0; n < NumResultVecs; ++n) 15152 Tys[n] = AlignedVecTy; 15153 Tys[n++] = MVT::i32; 15154 Tys[n] = MVT::Other; 15155 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2)); 15156 15157 // Then, gather the new node's operands. 15158 SmallVector<SDValue, 8> Ops; 15159 Ops.push_back(N->getOperand(0)); // incoming chain 15160 Ops.push_back(N->getOperand(AddrOpIdx)); 15161 Ops.push_back(Inc); 15162 15163 if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) { 15164 // Try to match the intrinsic's signature 15165 Ops.push_back(StN->getValue()); 15166 } else { 15167 // Loads (and of course intrinsics) match the intrinsics' signature, 15168 // so just add all but the alignment operand. 15169 unsigned LastOperand = 15170 hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands(); 15171 for (unsigned i = AddrOpIdx + 1; i < LastOperand; ++i) 15172 Ops.push_back(N->getOperand(i)); 15173 } 15174 15175 // For all node types, the alignment operand is always the last one. 15176 Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32)); 15177 15178 // If this is a non-standard-aligned STORE, the penultimate operand is the 15179 // stored value. Bitcast it to the aligned type. 15180 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) { 15181 SDValue &StVal = Ops[Ops.size()-2]; 15182 StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal); 15183 } 15184 15185 EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy; 15186 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT, 15187 MemN->getMemOperand()); 15188 15189 // Update the uses. 15190 SmallVector<SDValue, 5> NewResults; 15191 for (unsigned i = 0; i < NumResultVecs; ++i) 15192 NewResults.push_back(SDValue(UpdN.getNode(), i)); 15193 15194 // If this is an non-standard-aligned LOAD, the first result is the loaded 15195 // value. Bitcast it to the expected result type. 15196 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) { 15197 SDValue &LdVal = NewResults[0]; 15198 LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal); 15199 } 15200 15201 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain 15202 DCI.CombineTo(N, NewResults); 15203 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); 15204 15205 break; 15206 } 15207 return SDValue(); 15208 } 15209 15210 static SDValue PerformVLDCombine(SDNode *N, 15211 TargetLowering::DAGCombinerInfo &DCI) { 15212 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 15213 return SDValue(); 15214 15215 return CombineBaseUpdate(N, DCI); 15216 } 15217 15218 static SDValue PerformMVEVLDCombine(SDNode *N, 15219 TargetLowering::DAGCombinerInfo &DCI) { 15220 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 15221 return SDValue(); 15222 15223 SelectionDAG &DAG = DCI.DAG; 15224 SDValue Addr = N->getOperand(2); 15225 MemSDNode *MemN = cast<MemSDNode>(N); 15226 SDLoc dl(N); 15227 15228 // For the stores, where there are multiple intrinsics we only actually want 15229 // to post-inc the last of the them. 
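// For example, an MVE vst4q is emitted as four arm_mve_vst4q calls whose last
// operand selects the stage (0..3); only the stage 3 call is turned into the
// updating (VST4_UPD) form.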
15230 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 15231 if (IntNo == Intrinsic::arm_mve_vst2q && 15232 cast<ConstantSDNode>(N->getOperand(5))->getZExtValue() != 1) 15233 return SDValue(); 15234 if (IntNo == Intrinsic::arm_mve_vst4q && 15235 cast<ConstantSDNode>(N->getOperand(7))->getZExtValue() != 3) 15236 return SDValue(); 15237 15238 // Search for a use of the address operand that is an increment. 15239 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), 15240 UE = Addr.getNode()->use_end(); 15241 UI != UE; ++UI) { 15242 SDNode *User = *UI; 15243 if (User->getOpcode() != ISD::ADD || 15244 UI.getUse().getResNo() != Addr.getResNo()) 15245 continue; 15246 15247 // Check that the add is independent of the load/store. Otherwise, folding 15248 // it would create a cycle. We can avoid searching through Addr as it's a 15249 // predecessor to both. 15250 SmallPtrSet<const SDNode *, 32> Visited; 15251 SmallVector<const SDNode *, 16> Worklist; 15252 Visited.insert(Addr.getNode()); 15253 Worklist.push_back(N); 15254 Worklist.push_back(User); 15255 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) || 15256 SDNode::hasPredecessorHelper(User, Visited, Worklist)) 15257 continue; 15258 15259 // Find the new opcode for the updating load/store. 15260 bool isLoadOp = true; 15261 unsigned NewOpc = 0; 15262 unsigned NumVecs = 0; 15263 switch (IntNo) { 15264 default: 15265 llvm_unreachable("unexpected intrinsic for MVE VLDn combine"); 15266 case Intrinsic::arm_mve_vld2q: 15267 NewOpc = ARMISD::VLD2_UPD; 15268 NumVecs = 2; 15269 break; 15270 case Intrinsic::arm_mve_vld4q: 15271 NewOpc = ARMISD::VLD4_UPD; 15272 NumVecs = 4; 15273 break; 15274 case Intrinsic::arm_mve_vst2q: 15275 NewOpc = ARMISD::VST2_UPD; 15276 NumVecs = 2; 15277 isLoadOp = false; 15278 break; 15279 case Intrinsic::arm_mve_vst4q: 15280 NewOpc = ARMISD::VST4_UPD; 15281 NumVecs = 4; 15282 isLoadOp = false; 15283 break; 15284 } 15285 15286 // Find the size of memory referenced by the load/store. 15287 EVT VecTy; 15288 if (isLoadOp) { 15289 VecTy = N->getValueType(0); 15290 } else { 15291 VecTy = N->getOperand(3).getValueType(); 15292 } 15293 15294 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; 15295 15296 // If the increment is a constant, it must match the memory ref size. 15297 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); 15298 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode()); 15299 if (!CInc || CInc->getZExtValue() != NumBytes) 15300 continue; 15301 15302 // Create the new updating load/store node. 15303 // First, create an SDVTList for the new updating node's results. 15304 EVT Tys[6]; 15305 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0); 15306 unsigned n; 15307 for (n = 0; n < NumResultVecs; ++n) 15308 Tys[n] = VecTy; 15309 Tys[n++] = MVT::i32; 15310 Tys[n] = MVT::Other; 15311 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2)); 15312 15313 // Then, gather the new node's operands. 15314 SmallVector<SDValue, 8> Ops; 15315 Ops.push_back(N->getOperand(0)); // incoming chain 15316 Ops.push_back(N->getOperand(2)); // ptr 15317 Ops.push_back(Inc); 15318 15319 for (unsigned i = 3; i < N->getNumOperands(); ++i) 15320 Ops.push_back(N->getOperand(i)); 15321 15322 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy, 15323 MemN->getMemOperand()); 15324 15325 // Update the uses. 
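// Replace the original load/store results with the new node's values, and
// rewire the ADD user to the written-back pointer (result NumResultVecs).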
15326 SmallVector<SDValue, 5> NewResults; 15327 for (unsigned i = 0; i < NumResultVecs; ++i) 15328 NewResults.push_back(SDValue(UpdN.getNode(), i)); 15329 15330 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain 15331 DCI.CombineTo(N, NewResults); 15332 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); 15333 15334 break; 15335 } 15336 15337 return SDValue(); 15338 } 15339 15340 /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a 15341 /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic 15342 /// are also VDUPLANEs. If so, combine them to a vldN-dup operation and 15343 /// return true. 15344 static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 15345 SelectionDAG &DAG = DCI.DAG; 15346 EVT VT = N->getValueType(0); 15347 // vldN-dup instructions only support 64-bit vectors for N > 1. 15348 if (!VT.is64BitVector()) 15349 return false; 15350 15351 // Check if the VDUPLANE operand is a vldN-dup intrinsic. 15352 SDNode *VLD = N->getOperand(0).getNode(); 15353 if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN) 15354 return false; 15355 unsigned NumVecs = 0; 15356 unsigned NewOpc = 0; 15357 unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue(); 15358 if (IntNo == Intrinsic::arm_neon_vld2lane) { 15359 NumVecs = 2; 15360 NewOpc = ARMISD::VLD2DUP; 15361 } else if (IntNo == Intrinsic::arm_neon_vld3lane) { 15362 NumVecs = 3; 15363 NewOpc = ARMISD::VLD3DUP; 15364 } else if (IntNo == Intrinsic::arm_neon_vld4lane) { 15365 NumVecs = 4; 15366 NewOpc = ARMISD::VLD4DUP; 15367 } else { 15368 return false; 15369 } 15370 15371 // First check that all the vldN-lane uses are VDUPLANEs and that the lane 15372 // numbers match the load. 15373 unsigned VLDLaneNo = 15374 cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue(); 15375 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); 15376 UI != UE; ++UI) { 15377 // Ignore uses of the chain result. 15378 if (UI.getUse().getResNo() == NumVecs) 15379 continue; 15380 SDNode *User = *UI; 15381 if (User->getOpcode() != ARMISD::VDUPLANE || 15382 VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue()) 15383 return false; 15384 } 15385 15386 // Create the vldN-dup node. 15387 EVT Tys[5]; 15388 unsigned n; 15389 for (n = 0; n < NumVecs; ++n) 15390 Tys[n] = VT; 15391 Tys[n] = MVT::Other; 15392 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1)); 15393 SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) }; 15394 MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD); 15395 SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, 15396 Ops, VLDMemInt->getMemoryVT(), 15397 VLDMemInt->getMemOperand()); 15398 15399 // Update the uses. 15400 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); 15401 UI != UE; ++UI) { 15402 unsigned ResNo = UI.getUse().getResNo(); 15403 // Ignore uses of the chain result. 15404 if (ResNo == NumVecs) 15405 continue; 15406 SDNode *User = *UI; 15407 DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo)); 15408 } 15409 15410 // Now the vldN-lane intrinsic is dead except for its chain result. 15411 // Update uses of the chain. 
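// CombineTo with the full result list (all vectors plus the chain) keeps any
// remaining chain users pointing at the new vldN-dup node.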
15412 std::vector<SDValue> VLDDupResults; 15413 for (unsigned n = 0; n < NumVecs; ++n) 15414 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n)); 15415 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs)); 15416 DCI.CombineTo(VLD, VLDDupResults); 15417 15418 return true; 15419 } 15420 15421 /// PerformVDUPLANECombine - Target-specific dag combine xforms for 15422 /// ARMISD::VDUPLANE. 15423 static SDValue PerformVDUPLANECombine(SDNode *N, 15424 TargetLowering::DAGCombinerInfo &DCI, 15425 const ARMSubtarget *Subtarget) { 15426 SDValue Op = N->getOperand(0); 15427 EVT VT = N->getValueType(0); 15428 15429 // On MVE, we just convert the VDUPLANE to a VDUP with an extract. 15430 if (Subtarget->hasMVEIntegerOps()) { 15431 EVT ExtractVT = VT.getVectorElementType(); 15432 // We need to ensure we are creating a legal type. 15433 if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT)) 15434 ExtractVT = MVT::i32; 15435 SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT, 15436 N->getOperand(0), N->getOperand(1)); 15437 return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract); 15438 } 15439 15440 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses 15441 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation. 15442 if (CombineVLDDUP(N, DCI)) 15443 return SDValue(N, 0); 15444 15445 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is 15446 // redundant. Ignore bit_converts for now; element sizes are checked below. 15447 while (Op.getOpcode() == ISD::BITCAST) 15448 Op = Op.getOperand(0); 15449 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM) 15450 return SDValue(); 15451 15452 // Make sure the VMOV element size is not bigger than the VDUPLANE elements. 15453 unsigned EltSize = Op.getScalarValueSizeInBits(); 15454 // The canonical VMOV for a zero vector uses a 32-bit element size. 15455 unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 15456 unsigned EltBits; 15457 if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0) 15458 EltSize = 8; 15459 if (EltSize > VT.getScalarSizeInBits()) 15460 return SDValue(); 15461 15462 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); 15463 } 15464 15465 /// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP. 15466 static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG, 15467 const ARMSubtarget *Subtarget) { 15468 SDValue Op = N->getOperand(0); 15469 SDLoc dl(N); 15470 15471 if (Subtarget->hasMVEIntegerOps()) { 15472 // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will 15473 // need to come from a GPR. 15474 if (Op.getValueType() == MVT::f32) 15475 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), 15476 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op)); 15477 else if (Op.getValueType() == MVT::f16) 15478 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), 15479 DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op)); 15480 } 15481 15482 if (!Subtarget->hasNEON()) 15483 return SDValue(); 15484 15485 // Match VDUP(LOAD) -> VLD1DUP. 15486 // We match this pattern here rather than waiting for isel because the 15487 // transform is only legal for unindexed loads. 
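// For example, a v4i16 VDUP of a scalar i16 load becomes a VLD1DUP node,
// which is later selected to something like vld1.16 {d16[]}, [r0].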
15488 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode()); 15489 if (LD && Op.hasOneUse() && LD->isUnindexed() && 15490 LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) { 15491 SDValue Ops[] = {LD->getOperand(0), LD->getOperand(1), 15492 DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32)}; 15493 SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other); 15494 SDValue VLDDup = 15495 DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, Ops, 15496 LD->getMemoryVT(), LD->getMemOperand()); 15497 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1)); 15498 return VLDDup; 15499 } 15500 15501 return SDValue(); 15502 } 15503 15504 static SDValue PerformLOADCombine(SDNode *N, 15505 TargetLowering::DAGCombinerInfo &DCI) { 15506 EVT VT = N->getValueType(0); 15507 15508 // If this is a legal vector load, try to combine it into a VLD1_UPD. 15509 if (ISD::isNormalLoad(N) && VT.isVector() && 15510 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) 15511 return CombineBaseUpdate(N, DCI); 15512 15513 return SDValue(); 15514 } 15515 15516 // Optimize trunc store (of multiple scalars) to shuffle and store. First, 15517 // pack all of the elements in one place. Next, store to memory in fewer 15518 // chunks. 15519 static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, 15520 SelectionDAG &DAG) { 15521 SDValue StVal = St->getValue(); 15522 EVT VT = StVal.getValueType(); 15523 if (!St->isTruncatingStore() || !VT.isVector()) 15524 return SDValue(); 15525 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 15526 EVT StVT = St->getMemoryVT(); 15527 unsigned NumElems = VT.getVectorNumElements(); 15528 assert(StVT != VT && "Cannot truncate to the same type"); 15529 unsigned FromEltSz = VT.getScalarSizeInBits(); 15530 unsigned ToEltSz = StVT.getScalarSizeInBits(); 15531 15532 // From, To sizes and ElemCount must be pow of two 15533 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) 15534 return SDValue(); 15535 15536 // We are going to use the original vector elt for storing. 15537 // Accumulated smaller vector elements must be a multiple of the store size. 15538 if (0 != (NumElems * FromEltSz) % ToEltSz) 15539 return SDValue(); 15540 15541 unsigned SizeRatio = FromEltSz / ToEltSz; 15542 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits()); 15543 15544 // Create a type on which we perform the shuffle. 15545 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), 15546 NumElems * SizeRatio); 15547 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); 15548 15549 SDLoc DL(St); 15550 SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); 15551 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 15552 for (unsigned i = 0; i < NumElems; ++i) 15553 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1 15554 : i * SizeRatio; 15555 15556 // Can't shuffle using an illegal type. 15557 if (!TLI.isTypeLegal(WideVecVT)) 15558 return SDValue(); 15559 15560 SDValue Shuff = DAG.getVectorShuffle( 15561 WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec); 15562 // At this point all of the data is stored at the bottom of the 15563 // register. We now need to save it to mem. 15564 15565 // Find the largest store unit 15566 MVT StoreType = MVT::i8; 15567 for (MVT Tp : MVT::integer_valuetypes()) { 15568 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz) 15569 StoreType = Tp; 15570 } 15571 // Didn't find a legal store type. 
15572 if (!TLI.isTypeLegal(StoreType)) 15573 return SDValue(); 15574 15575 // Bitcast the original vector into a vector of store-size units 15576 EVT StoreVecVT = 15577 EVT::getVectorVT(*DAG.getContext(), StoreType, 15578 VT.getSizeInBits() / EVT(StoreType).getSizeInBits()); 15579 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); 15580 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); 15581 SmallVector<SDValue, 8> Chains; 15582 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL, 15583 TLI.getPointerTy(DAG.getDataLayout())); 15584 SDValue BasePtr = St->getBasePtr(); 15585 15586 // Perform one or more big stores into memory. 15587 unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits(); 15588 for (unsigned I = 0; I < E; I++) { 15589 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType, 15590 ShuffWide, DAG.getIntPtrConstant(I, DL)); 15591 SDValue Ch = 15592 DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(), 15593 St->getAlignment(), St->getMemOperand()->getFlags()); 15594 BasePtr = 15595 DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment); 15596 Chains.push_back(Ch); 15597 } 15598 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 15599 } 15600 15601 // Try taking a single vector store from an fpround (which would otherwise turn 15602 // into an expensive buildvector) and splitting it into a series of narrowing 15603 // stores. 15604 static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, 15605 SelectionDAG &DAG) { 15606 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed()) 15607 return SDValue(); 15608 SDValue Trunc = St->getValue(); 15609 if (Trunc->getOpcode() != ISD::FP_ROUND) 15610 return SDValue(); 15611 EVT FromVT = Trunc->getOperand(0).getValueType(); 15612 EVT ToVT = Trunc.getValueType(); 15613 if (!ToVT.isVector()) 15614 return SDValue(); 15615 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements()); 15616 EVT ToEltVT = ToVT.getVectorElementType(); 15617 EVT FromEltVT = FromVT.getVectorElementType(); 15618 15619 if (FromEltVT != MVT::f32 || ToEltVT != MVT::f16) 15620 return SDValue(); 15621 15622 unsigned NumElements = 4; 15623 if (FromVT.getVectorNumElements() % NumElements != 0) 15624 return SDValue(); 15625 15626 // Test if the Trunc will be convertable to a VMOVN with a shuffle, and if so 15627 // use the VMOVN over splitting the store. We are looking for patterns of: 15628 // !rev: 0 N 1 N+1 2 N+2 ... 15629 // rev: N 0 N+1 1 N+2 2 ... 15630 // The shuffle may either be a single source (in which case N = NumElts/2) or 15631 // two inputs extended with concat to the same size (in which case N = 15632 // NumElts). 15633 auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) { 15634 ArrayRef<int> M = SVN->getMask(); 15635 unsigned NumElts = ToVT.getVectorNumElements(); 15636 if (SVN->getOperand(1).isUndef()) 15637 NumElts /= 2; 15638 15639 unsigned Off0 = Rev ? NumElts : 0; 15640 unsigned Off1 = Rev ? 
0 : NumElts; 15641 15642 for (unsigned I = 0; I < NumElts; I += 2) { 15643 if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2)) 15644 return false; 15645 if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2)) 15646 return false; 15647 } 15648 15649 return true; 15650 }; 15651 15652 if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0))) 15653 if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true)) 15654 return SDValue(); 15655 15656 LLVMContext &C = *DAG.getContext(); 15657 SDLoc DL(St); 15658 // Details about the old store 15659 SDValue Ch = St->getChain(); 15660 SDValue BasePtr = St->getBasePtr(); 15661 Align Alignment = St->getOriginalAlign(); 15662 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags(); 15663 AAMDNodes AAInfo = St->getAAInfo(); 15664 15665 // We split the store into slices of NumElements. fp16 trunc stores are vcvt 15666 // and then stored as truncating integer stores. 15667 EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements); 15668 EVT NewToVT = EVT::getVectorVT( 15669 C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements); 15670 15671 SmallVector<SDValue, 4> Stores; 15672 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) { 15673 unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8; 15674 SDValue NewPtr = 15675 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset)); 15676 15677 SDValue Extract = 15678 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0), 15679 DAG.getConstant(i * NumElements, DL, MVT::i32)); 15680 15681 SDValue FPTrunc = 15682 DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16), 15683 Extract, DAG.getConstant(0, DL, MVT::i32)); 15684 Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc); 15685 15686 SDValue Store = DAG.getTruncStore( 15687 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset), 15688 NewToVT, Alignment.value(), MMOFlags, AAInfo); 15689 Stores.push_back(Store); 15690 } 15691 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); 15692 } 15693 15694 // Try taking a single vector store from an MVETRUNC (which would otherwise turn 15695 // into an expensive buildvector) and splitting it into a series of narrowing 15696 // stores. 
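// For example, a v16i16 store of MVETRUNC(x : v8i32, y : v8i32) becomes two
// truncating v8i16 stores of x and y at offsets 0 and 16 bytes from the base.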
15697 static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St, 15698 SelectionDAG &DAG) { 15699 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed()) 15700 return SDValue(); 15701 SDValue Trunc = St->getValue(); 15702 if (Trunc->getOpcode() != ARMISD::MVETRUNC) 15703 return SDValue(); 15704 EVT FromVT = Trunc->getOperand(0).getValueType(); 15705 EVT ToVT = Trunc.getValueType(); 15706 15707 LLVMContext &C = *DAG.getContext(); 15708 SDLoc DL(St); 15709 // Details about the old store 15710 SDValue Ch = St->getChain(); 15711 SDValue BasePtr = St->getBasePtr(); 15712 Align Alignment = St->getOriginalAlign(); 15713 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags(); 15714 AAMDNodes AAInfo = St->getAAInfo(); 15715 15716 EVT NewToVT = EVT::getVectorVT(C, ToVT.getVectorElementType(), 15717 FromVT.getVectorNumElements()); 15718 15719 SmallVector<SDValue, 4> Stores; 15720 for (unsigned i = 0; i < Trunc.getNumOperands(); i++) { 15721 unsigned NewOffset = 15722 i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8; 15723 SDValue NewPtr = 15724 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset)); 15725 15726 SDValue Extract = Trunc.getOperand(i); 15727 SDValue Store = DAG.getTruncStore( 15728 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset), 15729 NewToVT, Alignment.value(), MMOFlags, AAInfo); 15730 Stores.push_back(Store); 15731 } 15732 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); 15733 } 15734 15735 // Given a floating point store from an extracted vector, with an integer 15736 // VGETLANE that already exists, store the existing VGETLANEu directly. This can 15737 // help reduce fp register pressure, doesn't require the fp extract and allows 15738 // use of more integer post-inc stores not available with vstr. 15739 static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG) { 15740 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed()) 15741 return SDValue(); 15742 SDValue Extract = St->getValue(); 15743 EVT VT = Extract.getValueType(); 15744 // For now only uses f16. This may be useful for f32 too, but that will 15745 // be bitcast(extract), not the VGETLANEu we currently check here. 15746 if (VT != MVT::f16 || Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 15747 return SDValue(); 15748 15749 SDNode *GetLane = 15750 DAG.getNodeIfExists(ARMISD::VGETLANEu, DAG.getVTList(MVT::i32), 15751 {Extract.getOperand(0), Extract.getOperand(1)}); 15752 if (!GetLane) 15753 return SDValue(); 15754 15755 LLVMContext &C = *DAG.getContext(); 15756 SDLoc DL(St); 15757 // Create a new integer store to replace the existing floating point version. 15758 SDValue Ch = St->getChain(); 15759 SDValue BasePtr = St->getBasePtr(); 15760 Align Alignment = St->getOriginalAlign(); 15761 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags(); 15762 AAMDNodes AAInfo = St->getAAInfo(); 15763 EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits()); 15764 SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr, 15765 St->getPointerInfo(), NewToVT, 15766 Alignment.value(), MMOFlags, AAInfo); 15767 15768 return Store; 15769 } 15770 15771 /// PerformSTORECombine - Target-specific dag combine xforms for 15772 /// ISD::STORE. 
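/// This includes splitting VMOVDRR stores into a pair of i32 stores, the MVE
/// narrowing-store combines above, and merging base updates into VST1_UPD.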
15773 static SDValue PerformSTORECombine(SDNode *N, 15774 TargetLowering::DAGCombinerInfo &DCI, 15775 const ARMSubtarget *Subtarget) { 15776 StoreSDNode *St = cast<StoreSDNode>(N); 15777 if (St->isVolatile()) 15778 return SDValue(); 15779 SDValue StVal = St->getValue(); 15780 EVT VT = StVal.getValueType(); 15781 15782 if (Subtarget->hasNEON()) 15783 if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG)) 15784 return Store; 15785 15786 if (Subtarget->hasMVEIntegerOps()) { 15787 if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG)) 15788 return NewToken; 15789 if (SDValue NewChain = PerformExtractFpToIntStores(St, DCI.DAG)) 15790 return NewChain; 15791 if (SDValue NewToken = 15792 PerformSplittingMVETruncToNarrowingStores(St, DCI.DAG)) 15793 return NewToken; 15794 } 15795 15796 if (!ISD::isNormalStore(St)) 15797 return SDValue(); 15798 15799 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and 15800 // ARM stores of arguments in the same cache line. 15801 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && 15802 StVal.getNode()->hasOneUse()) { 15803 SelectionDAG &DAG = DCI.DAG; 15804 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 15805 SDLoc DL(St); 15806 SDValue BasePtr = St->getBasePtr(); 15807 SDValue NewST1 = DAG.getStore( 15808 St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0), 15809 BasePtr, St->getPointerInfo(), St->getOriginalAlign(), 15810 St->getMemOperand()->getFlags()); 15811 15812 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, 15813 DAG.getConstant(4, DL, MVT::i32)); 15814 return DAG.getStore(NewST1.getValue(0), DL, 15815 StVal.getNode()->getOperand(isBigEndian ? 0 : 1), 15816 OffsetPtr, St->getPointerInfo().getWithOffset(4), 15817 St->getOriginalAlign(), 15818 St->getMemOperand()->getFlags()); 15819 } 15820 15821 if (StVal.getValueType() == MVT::i64 && 15822 StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 15823 15824 // Bitcast an i64 store extracted from a vector to f64. 15825 // Otherwise, the i64 value will be legalized to a pair of i32 values. 15826 SelectionDAG &DAG = DCI.DAG; 15827 SDLoc dl(StVal); 15828 SDValue IntVec = StVal.getOperand(0); 15829 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, 15830 IntVec.getValueType().getVectorNumElements()); 15831 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec); 15832 SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 15833 Vec, StVal.getOperand(1)); 15834 dl = SDLoc(N); 15835 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt); 15836 // Make the DAGCombiner fold the bitcasts. 15837 DCI.AddToWorklist(Vec.getNode()); 15838 DCI.AddToWorklist(ExtElt.getNode()); 15839 DCI.AddToWorklist(V.getNode()); 15840 return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(), 15841 St->getPointerInfo(), St->getAlignment(), 15842 St->getMemOperand()->getFlags(), St->getAAInfo()); 15843 } 15844 15845 // If this is a legal vector store, try to combine it into a VST1_UPD. 15846 if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() && 15847 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) 15848 return CombineBaseUpdate(N, DCI); 15849 15850 return SDValue(); 15851 } 15852 15853 /// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD) 15854 /// can replace combinations of VMUL and VCVT (floating-point to integer) 15855 /// when the VMUL has a constant operand that is a power of 2. 
15856 /// 15857 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>): 15858 /// vmul.f32 d16, d17, d16 15859 /// vcvt.s32.f32 d16, d16 15860 /// becomes: 15861 /// vcvt.s32.f32 d16, d16, #3 15862 static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, 15863 const ARMSubtarget *Subtarget) { 15864 if (!Subtarget->hasNEON()) 15865 return SDValue(); 15866 15867 SDValue Op = N->getOperand(0); 15868 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() || 15869 Op.getOpcode() != ISD::FMUL) 15870 return SDValue(); 15871 15872 SDValue ConstVec = Op->getOperand(1); 15873 if (!isa<BuildVectorSDNode>(ConstVec)) 15874 return SDValue(); 15875 15876 MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); 15877 uint32_t FloatBits = FloatTy.getSizeInBits(); 15878 MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); 15879 uint32_t IntBits = IntTy.getSizeInBits(); 15880 unsigned NumLanes = Op.getValueType().getVectorNumElements(); 15881 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) { 15882 // These instructions only exist converting from f32 to i32. We can handle 15883 // smaller integers by generating an extra truncate, but larger ones would 15884 // be lossy. We also can't handle anything other than 2 or 4 lanes, since 15885 // these instructions only support v2i32/v4i32 types. 15886 return SDValue(); 15887 } 15888 15889 BitVector UndefElements; 15890 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); 15891 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33); 15892 if (C == -1 || C == 0 || C > 32) 15893 return SDValue(); 15894 15895 SDLoc dl(N); 15896 bool isSigned = N->getOpcode() == ISD::FP_TO_SINT; 15897 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs : 15898 Intrinsic::arm_neon_vcvtfp2fxu; 15899 SDValue FixConv = DAG.getNode( 15900 ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, 15901 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0), 15902 DAG.getConstant(C, dl, MVT::i32)); 15903 15904 if (IntBits < FloatBits) 15905 FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv); 15906 15907 return FixConv; 15908 } 15909 15910 /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD) 15911 /// can replace combinations of VCVT (integer to floating-point) and VDIV 15912 /// when the VDIV has a constant operand that is a power of 2.
15913 /// 15914 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>): 15915 /// vcvt.f32.s32 d16, d16 15916 /// vdiv.f32 d16, d17, d16 15917 /// becomes: 15918 /// vcvt.f32.s32 d16, d16, #3 15919 static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG, 15920 const ARMSubtarget *Subtarget) { 15921 if (!Subtarget->hasNEON()) 15922 return SDValue(); 15923 15924 SDValue Op = N->getOperand(0); 15925 unsigned OpOpcode = Op.getNode()->getOpcode(); 15926 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() || 15927 (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP)) 15928 return SDValue(); 15929 15930 SDValue ConstVec = N->getOperand(1); 15931 if (!isa<BuildVectorSDNode>(ConstVec)) 15932 return SDValue(); 15933 15934 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); 15935 uint32_t FloatBits = FloatTy.getSizeInBits(); 15936 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); 15937 uint32_t IntBits = IntTy.getSizeInBits(); 15938 unsigned NumLanes = Op.getValueType().getVectorNumElements(); 15939 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) { 15940 // These instructions only exist converting from i32 to f32. We can handle 15941 // smaller integers by generating an extra extend, but larger ones would 15942 // be lossy. We also can't handle anything other than 2 or 4 lanes, since 15943 // these instructions only support v2i32/v4i32 types. 15944 return SDValue(); 15945 } 15946 15947 BitVector UndefElements; 15948 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); 15949 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33); 15950 if (C == -1 || C == 0 || C > 32) 15951 return SDValue(); 15952 15953 SDLoc dl(N); 15954 bool isSigned = OpOpcode == ISD::SINT_TO_FP; 15955 SDValue ConvInput = Op.getOperand(0); 15956 if (IntBits < FloatBits) 15957 ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, 15958 dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, 15959 ConvInput); 15960 15961 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp : 15962 Intrinsic::arm_neon_vcvtfxu2fp; 15963 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, 15964 Op.getValueType(), 15965 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), 15966 ConvInput, DAG.getConstant(C, dl, MVT::i32)); 15967 } 15968 15969 static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG, 15970 const ARMSubtarget *ST) { 15971 if (!ST->hasMVEIntegerOps()) 15972 return SDValue(); 15973 15974 assert(N->getOpcode() == ISD::VECREDUCE_ADD); 15975 EVT ResVT = N->getValueType(0); 15976 SDValue N0 = N->getOperand(0); 15977 SDLoc dl(N); 15978 15979 // We are looking for something that will have illegal types if left alone, 15980 // but that we can convert to a single instruction under MVE. For example 15981 // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A 15982 // or 15983 // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B 15984 15985 // The legal cases are: 15986 // VADDV u/s 8/16/32 15987 // VMLAV u/s 8/16/32 15988 // VADDLV u/s 32 15989 // VMLALV u/s 16/32 15990 15991 // If the input vector is smaller than legal (v4i8/v4i16 for example) we can 15992 // extend it and use v4i32 instead.
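// The helpers below match these patterns: ExtTypeMatches checks that the
// pre-extension type has the right number of lanes and is no wider than one
// of the listed legal input types, and ExtendIfNeeded widens sub-128-bit
// inputs to a full 128-bit vector first.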
15993 auto ExtTypeMatches = [](SDValue A, ArrayRef<MVT> ExtTypes) { 15994 EVT AVT = A.getValueType(); 15995 return any_of(ExtTypes, [&](MVT Ty) { 15996 return AVT.getVectorNumElements() == Ty.getVectorNumElements() && 15997 AVT.bitsLE(Ty); 15998 }); 15999 }; 16000 auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) { 16001 EVT AVT = A.getValueType(); 16002 if (!AVT.is128BitVector()) 16003 A = DAG.getNode(ExtendCode, dl, 16004 AVT.changeVectorElementType(MVT::getIntegerVT( 16005 128 / AVT.getVectorMinNumElements())), 16006 A); 16007 return A; 16008 }; 16009 auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) { 16010 if (ResVT != RetTy || N0->getOpcode() != ExtendCode) 16011 return SDValue(); 16012 SDValue A = N0->getOperand(0); 16013 if (ExtTypeMatches(A, ExtTypes)) 16014 return ExtendIfNeeded(A, ExtendCode); 16015 return SDValue(); 16016 }; 16017 auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode, 16018 ArrayRef<MVT> ExtTypes, SDValue &Mask) { 16019 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT || 16020 !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode())) 16021 return SDValue(); 16022 Mask = N0->getOperand(0); 16023 SDValue Ext = N0->getOperand(1); 16024 if (Ext->getOpcode() != ExtendCode) 16025 return SDValue(); 16026 SDValue A = Ext->getOperand(0); 16027 if (ExtTypeMatches(A, ExtTypes)) 16028 return ExtendIfNeeded(A, ExtendCode); 16029 return SDValue(); 16030 }; 16031 auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes, 16032 SDValue &A, SDValue &B) { 16033 // For a vmla we are trying to match a larger pattern: 16034 // ExtA = sext/zext A 16035 // ExtB = sext/zext B 16036 // Mul = mul ExtA, ExtB 16037 // vecreduce.add Mul 16038 // There might also be an extra extend between the mul and the addreduce, so 16039 // long as the bitwidth is high enough to make them equivalent (for example 16040 // original v8i16 might be mul at v8i32 and the reduce happens at v8i64).
16041 if (ResVT != RetTy) 16042 return false; 16043 SDValue Mul = N0; 16044 if (Mul->getOpcode() == ExtendCode && 16045 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >= 16046 ResVT.getScalarSizeInBits()) 16047 Mul = Mul->getOperand(0); 16048 if (Mul->getOpcode() != ISD::MUL) 16049 return false; 16050 SDValue ExtA = Mul->getOperand(0); 16051 SDValue ExtB = Mul->getOperand(1); 16052 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode) 16053 return false; 16054 A = ExtA->getOperand(0); 16055 B = ExtB->getOperand(0); 16056 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) { 16057 A = ExtendIfNeeded(A, ExtendCode); 16058 B = ExtendIfNeeded(B, ExtendCode); 16059 return true; 16060 } 16061 return false; 16062 }; 16063 auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes, 16064 SDValue &A, SDValue &B, SDValue &Mask) { 16065 // Same as the pattern above with a select for the zero predicated lanes 16066 // ExtA = sext/zext A 16067 // ExtB = sext/zext B 16068 // Mul = mul ExtA, ExtB 16069 // N0 = select Mask, Mul, 0 16070 // vecreduce.add N0 16071 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT || 16072 !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode())) 16073 return false; 16074 Mask = N0->getOperand(0); 16075 SDValue Mul = N0->getOperand(1); 16076 if (Mul->getOpcode() == ExtendCode && 16077 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >= 16078 ResVT.getScalarSizeInBits()) 16079 Mul = Mul->getOperand(0); 16080 if (Mul->getOpcode() != ISD::MUL) 16081 return false; 16082 SDValue ExtA = Mul->getOperand(0); 16083 SDValue ExtB = Mul->getOperand(1); 16084 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode) 16085 return false; 16086 A = ExtA->getOperand(0); 16087 B = ExtB->getOperand(0); 16088 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) { 16089 A = ExtendIfNeeded(A, ExtendCode); 16090 B = ExtendIfNeeded(B, ExtendCode); 16091 return true; 16092 } 16093 return false; 16094 }; 16095 auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) { 16096 // Split illegal MVT::v16i8->i64 vector reductions into two legal v8i16->i64 16097 // reductions. The operands are extended with MVEEXT, but as they are 16098 // reductions the lane orders do not matter. MVEEXT may be combined with 16099 // loads to produce two extending loads, or else they will be expanded to 16100 // VREV/VMOVL. 16101 EVT VT = Ops[0].getValueType(); 16102 if (VT == MVT::v16i8) { 16103 assert((Opcode == ARMISD::VMLALVs || Opcode == ARMISD::VMLALVu) && 16104 "Unexpected illegal long reduction opcode"); 16105 bool IsUnsigned = Opcode == ARMISD::VMLALVu; 16106 16107 SDValue Ext0 = 16108 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl, 16109 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[0]); 16110 SDValue Ext1 = 16111 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl, 16112 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[1]); 16113 16114 SDValue MLA0 = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32), 16115 Ext0, Ext1); 16116 SDValue MLA1 = 16117 DAG.getNode(IsUnsigned ? 
ARMISD::VMLALVAu : ARMISD::VMLALVAs, dl, 16118 DAG.getVTList(MVT::i32, MVT::i32), MLA0, MLA0.getValue(1), 16119 Ext0.getValue(1), Ext1.getValue(1)); 16120 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, MLA1, MLA1.getValue(1)); 16121 } 16122 SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops); 16123 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node, 16124 SDValue(Node.getNode(), 1)); 16125 }; 16126 16127 if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8})) 16128 return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A); 16129 if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8})) 16130 return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A); 16131 if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32})) 16132 return Create64bitNode(ARMISD::VADDLVs, {A}); 16133 if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32})) 16134 return Create64bitNode(ARMISD::VADDLVu, {A}); 16135 if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8})) 16136 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, 16137 DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A)); 16138 if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8})) 16139 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, 16140 DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A)); 16141 16142 SDValue Mask; 16143 if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask)) 16144 return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask); 16145 if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask)) 16146 return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask); 16147 if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}, Mask)) 16148 return Create64bitNode(ARMISD::VADDLVps, {A, Mask}); 16149 if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}, Mask)) 16150 return Create64bitNode(ARMISD::VADDLVpu, {A, Mask}); 16151 if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask)) 16152 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, 16153 DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask)); 16154 if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask)) 16155 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, 16156 DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask)); 16157 16158 SDValue A, B; 16159 if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B)) 16160 return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B); 16161 if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B)) 16162 return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B); 16163 if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32}, 16164 A, B)) 16165 return Create64bitNode(ARMISD::VMLALVs, {A, B}); 16166 if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32}, 16167 A, B)) 16168 return Create64bitNode(ARMISD::VMLALVu, {A, B}); 16169 if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B)) 16170 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, 16171 DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B)); 16172 if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B)) 16173 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, 16174 DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B)); 16175 16176 if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B, 16177 Mask)) 16178 return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask); 16179 if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B, 16180 Mask)) 16181 return 
DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask); 16182 if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B, 16183 Mask)) 16184 return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask}); 16185 if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B, 16186 Mask)) 16187 return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask}); 16188 if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask)) 16189 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, 16190 DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask)); 16191 if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask)) 16192 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, 16193 DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask)); 16194 16195 // Some complications. We can get a case where the two inputs of the mul are 16196 // the same, then the output sext will have been helpfully converted to a 16197 // zext. Turn it back. 16198 SDValue Op = N0; 16199 if (Op->getOpcode() == ISD::VSELECT) 16200 Op = Op->getOperand(1); 16201 if (Op->getOpcode() == ISD::ZERO_EXTEND && 16202 Op->getOperand(0)->getOpcode() == ISD::MUL) { 16203 SDValue Mul = Op->getOperand(0); 16204 if (Mul->getOperand(0) == Mul->getOperand(1) && 16205 Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) { 16206 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul); 16207 if (Op != N0) 16208 Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0), 16209 N0->getOperand(0), Ext, N0->getOperand(2)); 16210 return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext); 16211 } 16212 } 16213 16214 return SDValue(); 16215 } 16216 16217 static SDValue PerformVMOVNCombine(SDNode *N, 16218 TargetLowering::DAGCombinerInfo &DCI) { 16219 SDValue Op0 = N->getOperand(0); 16220 SDValue Op1 = N->getOperand(1); 16221 unsigned IsTop = N->getConstantOperandVal(2); 16222 16223 // VMOVNT a undef -> a 16224 // VMOVNB a undef -> a 16225 // VMOVNB undef a -> a 16226 if (Op1->isUndef()) 16227 return Op0; 16228 if (Op0->isUndef() && !IsTop) 16229 return Op1; 16230 16231 // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b) 16232 // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b) 16233 if ((Op1->getOpcode() == ARMISD::VQMOVNs || 16234 Op1->getOpcode() == ARMISD::VQMOVNu) && 16235 Op1->getConstantOperandVal(2) == 0) 16236 return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0), 16237 Op0, Op1->getOperand(1), N->getOperand(2)); 16238 16239 // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from 16240 // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting 16241 // into the top or bottom lanes. 16242 unsigned NumElts = N->getValueType(0).getVectorNumElements(); 16243 APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1)); 16244 APInt Op0DemandedElts = 16245 IsTop ? 
Op1DemandedElts 16246 : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1)); 16247 16248 APInt KnownUndef, KnownZero; 16249 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); 16250 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, KnownUndef, 16251 KnownZero, DCI)) 16252 return SDValue(N, 0); 16253 if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, KnownUndef, 16254 KnownZero, DCI)) 16255 return SDValue(N, 0); 16256 16257 return SDValue(); 16258 } 16259 16260 static SDValue PerformVQMOVNCombine(SDNode *N, 16261 TargetLowering::DAGCombinerInfo &DCI) { 16262 SDValue Op0 = N->getOperand(0); 16263 unsigned IsTop = N->getConstantOperandVal(2); 16264 16265 unsigned NumElts = N->getValueType(0).getVectorNumElements(); 16266 APInt Op0DemandedElts = 16267 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1) 16268 : APInt::getHighBitsSet(2, 1)); 16269 16270 APInt KnownUndef, KnownZero; 16271 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); 16272 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, KnownUndef, 16273 KnownZero, DCI)) 16274 return SDValue(N, 0); 16275 return SDValue(); 16276 } 16277 16278 static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG) { 16279 SDLoc DL(N); 16280 SDValue Op0 = N->getOperand(0); 16281 SDValue Op1 = N->getOperand(1); 16282 16283 // Turn X << -C -> X >> C and vice versa. The negative shifts can come up from 16284 // uses of the intrinsics. 16285 if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) { 16286 int ShiftAmt = C->getSExtValue(); 16287 if (ShiftAmt == 0) { 16288 SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL); 16289 DAG.ReplaceAllUsesWith(N, Merge.getNode()); 16290 return SDValue(); 16291 } 16292 16293 if (ShiftAmt >= -32 && ShiftAmt < 0) { 16294 unsigned NewOpcode = 16295 N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL; 16296 SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1, 16297 DAG.getConstant(-ShiftAmt, DL, MVT::i32)); 16298 DAG.ReplaceAllUsesWith(N, NewShift.getNode()); 16299 return NewShift; 16300 } 16301 } 16302 16303 return SDValue(); 16304 } 16305 16306 /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics. 16307 SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N, 16308 DAGCombinerInfo &DCI) const { 16309 SelectionDAG &DAG = DCI.DAG; 16310 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 16311 switch (IntNo) { 16312 default: 16313 // Don't do anything for most intrinsics. 16314 break; 16315 16316 // Vector shifts: check for immediate versions and lower them. 16317 // Note: This is done during DAG combining instead of DAG legalizing because 16318 // the build_vectors for 64-bit vector element shift counts are generally 16319 // not legal, and it is hard to see their values after they get legalized to 16320 // loads from a constant pool.
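// A rough example (the constant is hypothetical): arm.neon.vshiftu with a
// splat shift amount of -8 encodes a right shift by 8 and is matched below as
// ARMISD::VSHRuIMM with count 8, while a positive in-range splat would become
// ARMISD::VSHLIMM instead.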
16321 case Intrinsic::arm_neon_vshifts: 16322 case Intrinsic::arm_neon_vshiftu: 16323 case Intrinsic::arm_neon_vrshifts: 16324 case Intrinsic::arm_neon_vrshiftu: 16325 case Intrinsic::arm_neon_vrshiftn: 16326 case Intrinsic::arm_neon_vqshifts: 16327 case Intrinsic::arm_neon_vqshiftu: 16328 case Intrinsic::arm_neon_vqshiftsu: 16329 case Intrinsic::arm_neon_vqshiftns: 16330 case Intrinsic::arm_neon_vqshiftnu: 16331 case Intrinsic::arm_neon_vqshiftnsu: 16332 case Intrinsic::arm_neon_vqrshiftns: 16333 case Intrinsic::arm_neon_vqrshiftnu: 16334 case Intrinsic::arm_neon_vqrshiftnsu: { 16335 EVT VT = N->getOperand(1).getValueType(); 16336 int64_t Cnt; 16337 unsigned VShiftOpc = 0; 16338 16339 switch (IntNo) { 16340 case Intrinsic::arm_neon_vshifts: 16341 case Intrinsic::arm_neon_vshiftu: 16342 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) { 16343 VShiftOpc = ARMISD::VSHLIMM; 16344 break; 16345 } 16346 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) { 16347 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM 16348 : ARMISD::VSHRuIMM); 16349 break; 16350 } 16351 return SDValue(); 16352 16353 case Intrinsic::arm_neon_vrshifts: 16354 case Intrinsic::arm_neon_vrshiftu: 16355 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) 16356 break; 16357 return SDValue(); 16358 16359 case Intrinsic::arm_neon_vqshifts: 16360 case Intrinsic::arm_neon_vqshiftu: 16361 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) 16362 break; 16363 return SDValue(); 16364 16365 case Intrinsic::arm_neon_vqshiftsu: 16366 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) 16367 break; 16368 llvm_unreachable("invalid shift count for vqshlu intrinsic"); 16369 16370 case Intrinsic::arm_neon_vrshiftn: 16371 case Intrinsic::arm_neon_vqshiftns: 16372 case Intrinsic::arm_neon_vqshiftnu: 16373 case Intrinsic::arm_neon_vqshiftnsu: 16374 case Intrinsic::arm_neon_vqrshiftns: 16375 case Intrinsic::arm_neon_vqrshiftnu: 16376 case Intrinsic::arm_neon_vqrshiftnsu: 16377 // Narrowing shifts require an immediate right shift. 16378 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt)) 16379 break; 16380 llvm_unreachable("invalid shift count for narrowing vector shift " 16381 "intrinsic"); 16382 16383 default: 16384 llvm_unreachable("unhandled vector shift"); 16385 } 16386 16387 switch (IntNo) { 16388 case Intrinsic::arm_neon_vshifts: 16389 case Intrinsic::arm_neon_vshiftu: 16390 // Opcode already set above. 
16391 break; 16392 case Intrinsic::arm_neon_vrshifts: 16393 VShiftOpc = ARMISD::VRSHRsIMM; 16394 break; 16395 case Intrinsic::arm_neon_vrshiftu: 16396 VShiftOpc = ARMISD::VRSHRuIMM; 16397 break; 16398 case Intrinsic::arm_neon_vrshiftn: 16399 VShiftOpc = ARMISD::VRSHRNIMM; 16400 break; 16401 case Intrinsic::arm_neon_vqshifts: 16402 VShiftOpc = ARMISD::VQSHLsIMM; 16403 break; 16404 case Intrinsic::arm_neon_vqshiftu: 16405 VShiftOpc = ARMISD::VQSHLuIMM; 16406 break; 16407 case Intrinsic::arm_neon_vqshiftsu: 16408 VShiftOpc = ARMISD::VQSHLsuIMM; 16409 break; 16410 case Intrinsic::arm_neon_vqshiftns: 16411 VShiftOpc = ARMISD::VQSHRNsIMM; 16412 break; 16413 case Intrinsic::arm_neon_vqshiftnu: 16414 VShiftOpc = ARMISD::VQSHRNuIMM; 16415 break; 16416 case Intrinsic::arm_neon_vqshiftnsu: 16417 VShiftOpc = ARMISD::VQSHRNsuIMM; 16418 break; 16419 case Intrinsic::arm_neon_vqrshiftns: 16420 VShiftOpc = ARMISD::VQRSHRNsIMM; 16421 break; 16422 case Intrinsic::arm_neon_vqrshiftnu: 16423 VShiftOpc = ARMISD::VQRSHRNuIMM; 16424 break; 16425 case Intrinsic::arm_neon_vqrshiftnsu: 16426 VShiftOpc = ARMISD::VQRSHRNsuIMM; 16427 break; 16428 } 16429 16430 SDLoc dl(N); 16431 return DAG.getNode(VShiftOpc, dl, N->getValueType(0), 16432 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32)); 16433 } 16434 16435 case Intrinsic::arm_neon_vshiftins: { 16436 EVT VT = N->getOperand(1).getValueType(); 16437 int64_t Cnt; 16438 unsigned VShiftOpc = 0; 16439 16440 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt)) 16441 VShiftOpc = ARMISD::VSLIIMM; 16442 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt)) 16443 VShiftOpc = ARMISD::VSRIIMM; 16444 else { 16445 llvm_unreachable("invalid shift count for vsli/vsri intrinsic"); 16446 } 16447 16448 SDLoc dl(N); 16449 return DAG.getNode(VShiftOpc, dl, N->getValueType(0), 16450 N->getOperand(1), N->getOperand(2), 16451 DAG.getConstant(Cnt, dl, MVT::i32)); 16452 } 16453 16454 case Intrinsic::arm_neon_vqrshifts: 16455 case Intrinsic::arm_neon_vqrshiftu: 16456 // No immediate versions of these to check for. 16457 break; 16458 16459 case Intrinsic::arm_mve_vqdmlah: 16460 case Intrinsic::arm_mve_vqdmlash: 16461 case Intrinsic::arm_mve_vqrdmlah: 16462 case Intrinsic::arm_mve_vqrdmlash: 16463 case Intrinsic::arm_mve_vmla_n_predicated: 16464 case Intrinsic::arm_mve_vmlas_n_predicated: 16465 case Intrinsic::arm_mve_vqdmlah_predicated: 16466 case Intrinsic::arm_mve_vqdmlash_predicated: 16467 case Intrinsic::arm_mve_vqrdmlah_predicated: 16468 case Intrinsic::arm_mve_vqrdmlash_predicated: { 16469 // These intrinsics all take an i32 scalar operand which is narrowed to the 16470 // size of a single lane of the vector type they return. So we don't need 16471 // any bits of that operand above that point, which allows us to eliminate 16472 // uxth/sxth. 16473 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits(); 16474 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth); 16475 if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI)) 16476 return SDValue(); 16477 break; 16478 } 16479 16480 case Intrinsic::arm_mve_minv: 16481 case Intrinsic::arm_mve_maxv: 16482 case Intrinsic::arm_mve_minav: 16483 case Intrinsic::arm_mve_maxav: 16484 case Intrinsic::arm_mve_minv_predicated: 16485 case Intrinsic::arm_mve_maxv_predicated: 16486 case Intrinsic::arm_mve_minav_predicated: 16487 case Intrinsic::arm_mve_maxav_predicated: { 16488 // These intrinsics all take an i32 scalar operand which is narrowed to the 16489 // size of a single lane of the vector type they take as the other input. 
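// For instance (illustrative only): with a v8i16 vector operand the lane size
// is 16, so DemandedMask below is 0x0000ffff and a uxth/sxth feeding the i32
// scalar operand can be removed by SimplifyDemandedBits.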
16490 unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits(); 16491 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth); 16492 if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI)) 16493 return SDValue(); 16494 break; 16495 } 16496 16497 case Intrinsic::arm_mve_addv: { 16498 // Turn this intrinsic straight into the appropriate ARMISD::VADDV node, 16499 // which allows PerformADDVecReduce to turn it into VADDLV when possible. 16500 bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 16501 unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs; 16502 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1)); 16503 } 16504 16505 case Intrinsic::arm_mve_addlv: 16506 case Intrinsic::arm_mve_addlv_predicated: { 16507 // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR 16508 // which recombines the two outputs into an i64 16509 bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 16510 unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ? 16511 (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) : 16512 (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps); 16513 16514 SmallVector<SDValue, 4> Ops; 16515 for (unsigned i = 1, e = N->getNumOperands(); i < e; i++) 16516 if (i != 2) // skip the unsigned flag 16517 Ops.push_back(N->getOperand(i)); 16518 16519 SDLoc dl(N); 16520 SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops); 16521 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0), 16522 val.getValue(1)); 16523 } 16524 } 16525 16526 return SDValue(); 16527 } 16528 16529 /// PerformShiftCombine - Checks for immediate versions of vector shifts and 16530 /// lowers them. As with the vector shift intrinsics, this is done during DAG 16531 /// combining instead of DAG legalizing because the build_vectors for 64-bit 16532 /// vector element shift counts are generally not legal, and it is hard to see 16533 /// their values after they get legalized to loads from a constant pool. 16534 static SDValue PerformShiftCombine(SDNode *N, 16535 TargetLowering::DAGCombinerInfo &DCI, 16536 const ARMSubtarget *ST) { 16537 SelectionDAG &DAG = DCI.DAG; 16538 EVT VT = N->getValueType(0); 16539 if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) { 16540 // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high 16541 // 16 bits of x are zero. This optimizes rev + lsr 16 to rev16. 16542 SDValue N1 = N->getOperand(1); 16543 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { 16544 SDValue N0 = N->getOperand(0); 16545 if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP && 16546 DAG.MaskedValueIsZero(N0.getOperand(0), 16547 APInt::getHighBitsSet(32, 16))) 16548 return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1); 16549 } 16550 } 16551 16552 if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 && 16553 N->getOperand(0)->getOpcode() == ISD::AND && 16554 N->getOperand(0)->hasOneUse()) { 16555 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 16556 return SDValue(); 16557 // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't 16558 // usually show up because instcombine prefers to canonicalize it to 16559 // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come 16560 // out of GEP lowering in some cases.
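// A worked example with hypothetical constants: for (shl (and x, 0x3ff), 2),
// AndMask is a 10-bit mask, so MaskedBits below is 22 and the node is
// rewritten to (srl (shl x, 22), 20), which produces the same value using two
// shifts.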
16561 SDValue N0 = N->getOperand(0); 16562 ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1)); 16563 if (!ShiftAmtNode) 16564 return SDValue(); 16565 uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue()); 16566 ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1)); 16567 if (!AndMaskNode) 16568 return SDValue(); 16569 uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue()); 16570 // Don't transform uxtb/uxth. 16571 if (AndMask == 255 || AndMask == 65535) 16572 return SDValue(); 16573 if (isMask_32(AndMask)) { 16574 uint32_t MaskedBits = countLeadingZeros(AndMask); 16575 if (MaskedBits > ShiftAmt) { 16576 SDLoc DL(N); 16577 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 16578 DAG.getConstant(MaskedBits, DL, MVT::i32)); 16579 return DAG.getNode( 16580 ISD::SRL, DL, MVT::i32, SHL, 16581 DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32)); 16582 } 16583 } 16584 } 16585 16586 // Nothing to be done for scalar shifts. 16587 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 16588 if (!VT.isVector() || !TLI.isTypeLegal(VT)) 16589 return SDValue(); 16590 if (ST->hasMVEIntegerOps() && VT == MVT::v2i64) 16591 return SDValue(); 16592 16593 int64_t Cnt; 16594 16595 switch (N->getOpcode()) { 16596 default: llvm_unreachable("unexpected shift opcode"); 16597 16598 case ISD::SHL: 16599 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) { 16600 SDLoc dl(N); 16601 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0), 16602 DAG.getConstant(Cnt, dl, MVT::i32)); 16603 } 16604 break; 16605 16606 case ISD::SRA: 16607 case ISD::SRL: 16608 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { 16609 unsigned VShiftOpc = 16610 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM); 16611 SDLoc dl(N); 16612 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), 16613 DAG.getConstant(Cnt, dl, MVT::i32)); 16614 } 16615 } 16616 return SDValue(); 16617 } 16618 16619 // Look for a sign/zero/fpextend extend of a larger than legal load. This can be 16620 // split into multiple extending loads, which are simpler to deal with than an 16621 // arbitrary extend. For fp extends we use an integer extending load and a VCVTL 16622 // to convert the type to an f32. 
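// For example (a sketch of the idea): a sext of a v8i8 load to v8i32 can be
// split into two v4i8->v4i32 sextloads at byte offsets 0 and 4 whose results
// are concatenated back together.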
16623 static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) { 16624 SDValue N0 = N->getOperand(0); 16625 if (N0.getOpcode() != ISD::LOAD) 16626 return SDValue(); 16627 LoadSDNode *LD = cast<LoadSDNode>(N0.getNode()); 16628 if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() || 16629 LD->getExtensionType() != ISD::NON_EXTLOAD) 16630 return SDValue(); 16631 EVT FromVT = LD->getValueType(0); 16632 EVT ToVT = N->getValueType(0); 16633 if (!ToVT.isVector()) 16634 return SDValue(); 16635 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements()); 16636 EVT ToEltVT = ToVT.getVectorElementType(); 16637 EVT FromEltVT = FromVT.getVectorElementType(); 16638 16639 unsigned NumElements = 0; 16640 if (ToEltVT == MVT::i32 && FromEltVT == MVT::i8) 16641 NumElements = 4; 16642 if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16) 16643 NumElements = 4; 16644 if (NumElements == 0 || 16645 (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) || 16646 FromVT.getVectorNumElements() % NumElements != 0 || 16647 !isPowerOf2_32(NumElements)) 16648 return SDValue(); 16649 16650 LLVMContext &C = *DAG.getContext(); 16651 SDLoc DL(LD); 16652 // Details about the old load 16653 SDValue Ch = LD->getChain(); 16654 SDValue BasePtr = LD->getBasePtr(); 16655 Align Alignment = LD->getOriginalAlign(); 16656 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags(); 16657 AAMDNodes AAInfo = LD->getAAInfo(); 16658 16659 ISD::LoadExtType NewExtType = 16660 N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD; 16661 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType()); 16662 EVT NewFromVT = EVT::getVectorVT( 16663 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements); 16664 EVT NewToVT = EVT::getVectorVT( 16665 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements); 16666 16667 SmallVector<SDValue, 4> Loads; 16668 SmallVector<SDValue, 4> Chains; 16669 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) { 16670 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8; 16671 SDValue NewPtr = 16672 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset)); 16673 16674 SDValue NewLoad = 16675 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset, 16676 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT, 16677 Alignment, MMOFlags, AAInfo); 16678 Loads.push_back(NewLoad); 16679 Chains.push_back(SDValue(NewLoad.getNode(), 1)); 16680 } 16681 16682 // Float truncs need to be extended with VCVTB's into their floating point types. 16683 if (FromEltVT == MVT::f16) { 16684 SmallVector<SDValue, 4> Extends; 16685 16686 for (unsigned i = 0; i < Loads.size(); i++) { 16687 SDValue LoadBC = 16688 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]); 16689 SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC, 16690 DAG.getConstant(0, DL, MVT::i32)); 16691 Extends.push_back(FPExt); 16692 } 16693 16694 Loads = Extends; 16695 } 16696 16697 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 16698 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain); 16699 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads); 16700 } 16701 16702 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, 16703 /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
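/// For example (illustrative): (i32 sign_extend (extract_vector_elt V:v8i16,
/// lane)) with a constant lane becomes an ARMISD::VGETLANEs node, so the
/// sign-extending lane move is selected directly rather than a separate
/// extract and extend.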
16704 static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, 16705 const ARMSubtarget *ST) { 16706 SDValue N0 = N->getOperand(0); 16707 16708 // Check for sign- and zero-extensions of vector extract operations of 8- and 16709 // 16-bit vector elements. NEON and MVE support these directly. They are 16710 // handled during DAG combining because type legalization will promote them 16711 // to 32-bit types and it is messy to recognize the operations after that. 16712 if ((ST->hasNEON() || ST->hasMVEIntegerOps()) && 16713 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 16714 SDValue Vec = N0.getOperand(0); 16715 SDValue Lane = N0.getOperand(1); 16716 EVT VT = N->getValueType(0); 16717 EVT EltVT = N0.getValueType(); 16718 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 16719 16720 if (VT == MVT::i32 && 16721 (EltVT == MVT::i8 || EltVT == MVT::i16) && 16722 TLI.isTypeLegal(Vec.getValueType()) && 16723 isa<ConstantSDNode>(Lane)) { 16724 16725 unsigned Opc = 0; 16726 switch (N->getOpcode()) { 16727 default: llvm_unreachable("unexpected opcode"); 16728 case ISD::SIGN_EXTEND: 16729 Opc = ARMISD::VGETLANEs; 16730 break; 16731 case ISD::ZERO_EXTEND: 16732 case ISD::ANY_EXTEND: 16733 Opc = ARMISD::VGETLANEu; 16734 break; 16735 } 16736 return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane); 16737 } 16738 } 16739 16740 if (ST->hasMVEIntegerOps()) 16741 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG)) 16742 return NewLoad; 16743 16744 return SDValue(); 16745 } 16746 16747 static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG, 16748 const ARMSubtarget *ST) { 16749 if (ST->hasMVEFloatOps()) 16750 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG)) 16751 return NewLoad; 16752 16753 return SDValue(); 16754 } 16755 16756 /// PerformMinMaxCombine - Target-specific DAG combining for creating truncating 16757 /// saturates. 16758 static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG, 16759 const ARMSubtarget *ST) { 16760 EVT VT = N->getValueType(0); 16761 SDValue N0 = N->getOperand(0); 16762 if (!ST->hasMVEIntegerOps()) 16763 return SDValue(); 16764 16765 if (SDValue V = PerformVQDMULHCombine(N, DAG)) 16766 return V; 16767 16768 if (VT != MVT::v4i32 && VT != MVT::v8i16) 16769 return SDValue(); 16770 16771 auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) { 16772 // Check one is a smin and the other is a smax 16773 if (Min->getOpcode() != ISD::SMIN) 16774 std::swap(Min, Max); 16775 if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX) 16776 return false; 16777 16778 APInt SaturateC; 16779 if (VT == MVT::v4i32) 16780 SaturateC = APInt(32, (1 << 15) - 1, true); 16781 else //if (VT == MVT::v8i16) 16782 SaturateC = APInt(16, (1 << 7) - 1, true); 16783 16784 APInt MinC, MaxC; 16785 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) || 16786 MinC != SaturateC) 16787 return false; 16788 if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) || 16789 MaxC != ~SaturateC) 16790 return false; 16791 return true; 16792 }; 16793 16794 if (IsSignedSaturate(N, N0.getNode())) { 16795 SDLoc DL(N); 16796 MVT ExtVT, HalfVT; 16797 if (VT == MVT::v4i32) { 16798 HalfVT = MVT::v8i16; 16799 ExtVT = MVT::v4i16; 16800 } else { // if (VT == MVT::v8i16) 16801 HalfVT = MVT::v16i8; 16802 ExtVT = MVT::v8i8; 16803 } 16804 16805 // Create a VQMOVNB with undef top lanes, then signed extended into the top 16806 // half. That extend will hopefully be removed if only the bottom bits are 16807 // demanded (though a truncating store, for example). 
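// For instance (a sketch, with VT == v4i32): the matched pattern is
// smin(smax(x, -32768), 32767) in either order, which becomes the VQMOVNs
// below followed by a sign_extend_inreg from v4i16.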
16808 SDValue VQMOVN = 16809 DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT), 16810 N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32)); 16811 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN); 16812 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast, 16813 DAG.getValueType(ExtVT)); 16814 } 16815 16816 auto IsUnsignedSaturate = [&](SDNode *Min) { 16817 // For unsigned, we just need to check for <= 0xffff 16818 if (Min->getOpcode() != ISD::UMIN) 16819 return false; 16820 16821 APInt SaturateC; 16822 if (VT == MVT::v4i32) 16823 SaturateC = APInt(32, (1 << 16) - 1, true); 16824 else //if (VT == MVT::v8i16) 16825 SaturateC = APInt(16, (1 << 8) - 1, true); 16826 16827 APInt MinC; 16828 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) || 16829 MinC != SaturateC) 16830 return false; 16831 return true; 16832 }; 16833 16834 if (IsUnsignedSaturate(N)) { 16835 SDLoc DL(N); 16836 MVT HalfVT; 16837 unsigned ExtConst; 16838 if (VT == MVT::v4i32) { 16839 HalfVT = MVT::v8i16; 16840 ExtConst = 0x0000FFFF; 16841 } else { //if (VT == MVT::v8i16) 16842 HalfVT = MVT::v16i8; 16843 ExtConst = 0x00FF; 16844 } 16845 16846 // Create a VQMOVNB with undef top lanes, then ZExt into the top half with 16847 // an AND. That extend will hopefully be removed if only the bottom bits are 16848 // demanded (though a truncating store, for example). 16849 SDValue VQMOVN = 16850 DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0, 16851 DAG.getConstant(0, DL, MVT::i32)); 16852 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN); 16853 return DAG.getNode(ISD::AND, DL, VT, Bitcast, 16854 DAG.getConstant(ExtConst, DL, VT)); 16855 } 16856 16857 return SDValue(); 16858 } 16859 16860 static const APInt *isPowerOf2Constant(SDValue V) { 16861 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 16862 if (!C) 16863 return nullptr; 16864 const APInt *CV = &C->getAPIntValue(); 16865 return CV->isPowerOf2() ? CV : nullptr; 16866 } 16867 16868 SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const { 16869 // If we have a CMOV, OR and AND combination such as: 16870 // if (x & CN) 16871 // y |= CM; 16872 // 16873 // And: 16874 // * CN is a single bit; 16875 // * All bits covered by CM are known zero in y 16876 // 16877 // Then we can convert this into a sequence of BFI instructions. This will 16878 // always be a win if CM is a single bit, will always be no worse than the 16879 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is 16880 // three bits (due to the extra IT instruction). 16881 16882 SDValue Op0 = CMOV->getOperand(0); 16883 SDValue Op1 = CMOV->getOperand(1); 16884 auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2)); 16885 auto CC = CCNode->getAPIntValue().getLimitedValue(); 16886 SDValue CmpZ = CMOV->getOperand(4); 16887 16888 // The compare must be against zero. 16889 if (!isNullConstant(CmpZ->getOperand(1))) 16890 return SDValue(); 16891 16892 assert(CmpZ->getOpcode() == ARMISD::CMPZ); 16893 SDValue And = CmpZ->getOperand(0); 16894 if (And->getOpcode() != ISD::AND) 16895 return SDValue(); 16896 const APInt *AndC = isPowerOf2Constant(And->getOperand(1)); 16897 if (!AndC) 16898 return SDValue(); 16899 SDValue X = And->getOperand(0); 16900 16901 if (CC == ARMCC::EQ) { 16902 // We're performing an "equal to zero" compare. Swap the operands so we 16903 // canonicalize on a "not equal to zero" compare. 
16904 std::swap(Op0, Op1); 16905 } else { 16906 assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?"); 16907 } 16908 16909 if (Op1->getOpcode() != ISD::OR) 16910 return SDValue(); 16911 16912 ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1)); 16913 if (!OrC) 16914 return SDValue(); 16915 SDValue Y = Op1->getOperand(0); 16916 16917 if (Op0 != Y) 16918 return SDValue(); 16919 16920 // Now, is it profitable to continue? 16921 APInt OrCI = OrC->getAPIntValue(); 16922 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2; 16923 if (OrCI.countPopulation() > Heuristic) 16924 return SDValue(); 16925 16926 // Lastly, can we determine that the bits defined by OrCI 16927 // are zero in Y? 16928 KnownBits Known = DAG.computeKnownBits(Y); 16929 if ((OrCI & Known.Zero) != OrCI) 16930 return SDValue(); 16931 16932 // OK, we can do the combine. 16933 SDValue V = Y; 16934 SDLoc dl(X); 16935 EVT VT = X.getValueType(); 16936 unsigned BitInX = AndC->logBase2(); 16937 16938 if (BitInX != 0) { 16939 // We must shift X first. 16940 X = DAG.getNode(ISD::SRL, dl, VT, X, 16941 DAG.getConstant(BitInX, dl, VT)); 16942 } 16943 16944 for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits(); 16945 BitInY < NumActiveBits; ++BitInY) { 16946 if (OrCI[BitInY] == 0) 16947 continue; 16948 APInt Mask(VT.getSizeInBits(), 0); 16949 Mask.setBit(BitInY); 16950 V = DAG.getNode(ARMISD::BFI, dl, VT, V, X, 16951 // Confusingly, the operand is an *inverted* mask. 16952 DAG.getConstant(~Mask, dl, VT)); 16953 } 16954 16955 return V; 16956 } 16957 16958 // Given N, the value controlling the conditional branch, search for the loop 16959 // intrinsic, returning it, along with how the value is used. We need to handle 16960 // patterns such as the following: 16961 // (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit) 16962 // (brcond (setcc (loop.decrement), 0, eq), exit) 16963 // (brcond (setcc (loop.decrement), 0, ne), header) 16964 static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm, 16965 bool &Negate) { 16966 switch (N->getOpcode()) { 16967 default: 16968 break; 16969 case ISD::XOR: { 16970 if (!isa<ConstantSDNode>(N.getOperand(1))) 16971 return SDValue(); 16972 if (!cast<ConstantSDNode>(N.getOperand(1))->isOne()) 16973 return SDValue(); 16974 Negate = !Negate; 16975 return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate); 16976 } 16977 case ISD::SETCC: { 16978 auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1)); 16979 if (!Const) 16980 return SDValue(); 16981 if (Const->isNullValue()) 16982 Imm = 0; 16983 else if (Const->isOne()) 16984 Imm = 1; 16985 else 16986 return SDValue(); 16987 CC = cast<CondCodeSDNode>(N.getOperand(2))->get(); 16988 return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate); 16989 } 16990 case ISD::INTRINSIC_W_CHAIN: { 16991 unsigned IntOp = cast<ConstantSDNode>(N.getOperand(1))->getZExtValue(); 16992 if (IntOp != Intrinsic::test_start_loop_iterations && 16993 IntOp != Intrinsic::loop_decrement_reg) 16994 return SDValue(); 16995 return N; 16996 } 16997 } 16998 return SDValue(); 16999 } 17000 17001 static SDValue PerformHWLoopCombine(SDNode *N, 17002 TargetLowering::DAGCombinerInfo &DCI, 17003 const ARMSubtarget *ST) { 17004 17005 // The hwloop intrinsics that we're interested are used for control-flow, 17006 // either for entering or exiting the loop: 17007 // - test.start.loop.iterations will test whether its operand is zero. If it 17008 // is zero, the proceeding branch should not enter the loop. 
17009 // - loop.decrement.reg also tests whether its operand is zero. If it is 17010 // zero, the proceeding branch should not branch back to the beginning of 17011 // the loop. 17012 // So here, we need to check how the brcond is using the result of each 17013 // of the intrinsics to ensure that we're branching to the right place at the 17014 // right time. 17015 17016 ISD::CondCode CC; 17017 SDValue Cond; 17018 int Imm = 1; 17019 bool Negate = false; 17020 SDValue Chain = N->getOperand(0); 17021 SDValue Dest; 17022 17023 if (N->getOpcode() == ISD::BRCOND) { 17024 CC = ISD::SETEQ; 17025 Cond = N->getOperand(1); 17026 Dest = N->getOperand(2); 17027 } else { 17028 assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!"); 17029 CC = cast<CondCodeSDNode>(N->getOperand(1))->get(); 17030 Cond = N->getOperand(2); 17031 Dest = N->getOperand(4); 17032 if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) { 17033 if (!Const->isOne() && !Const->isNullValue()) 17034 return SDValue(); 17035 Imm = Const->getZExtValue(); 17036 } else 17037 return SDValue(); 17038 } 17039 17040 SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate); 17041 if (!Int) 17042 return SDValue(); 17043 17044 if (Negate) 17045 CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32); 17046 17047 auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) { 17048 return (CC == ISD::SETEQ && Imm == 0) || 17049 (CC == ISD::SETNE && Imm == 1) || 17050 (CC == ISD::SETLT && Imm == 1) || 17051 (CC == ISD::SETULT && Imm == 1); 17052 }; 17053 17054 auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) { 17055 return (CC == ISD::SETEQ && Imm == 1) || 17056 (CC == ISD::SETNE && Imm == 0) || 17057 (CC == ISD::SETGT && Imm == 0) || 17058 (CC == ISD::SETUGT && Imm == 0) || 17059 (CC == ISD::SETGE && Imm == 1) || 17060 (CC == ISD::SETUGE && Imm == 1); 17061 }; 17062 17063 assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) && 17064 "unsupported condition"); 17065 17066 SDLoc dl(Int); 17067 SelectionDAG &DAG = DCI.DAG; 17068 SDValue Elements = Int.getOperand(2); 17069 unsigned IntOp = cast<ConstantSDNode>(Int->getOperand(1))->getZExtValue(); 17070 assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR) 17071 && "expected single br user"); 17072 SDNode *Br = *N->use_begin(); 17073 SDValue OtherTarget = Br->getOperand(1); 17074 17075 // Update the unconditional branch to branch to the given Dest. 17076 auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) { 17077 SDValue NewBrOps[] = { Br->getOperand(0), Dest }; 17078 SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps); 17079 DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr); 17080 }; 17081 17082 if (IntOp == Intrinsic::test_start_loop_iterations) { 17083 SDValue Res; 17084 SDValue Setup = DAG.getNode(ARMISD::WLSSETUP, dl, MVT::i32, Elements); 17085 // We expect this 'instruction' to branch when the counter is zero. 17086 if (IsTrueIfZero(CC, Imm)) { 17087 SDValue Ops[] = {Chain, Setup, Dest}; 17088 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); 17089 } else { 17090 // The logic is the reverse of what we need for WLS, so find the other 17091 // basic block target: the target of the proceeding br.
17092 UpdateUncondBr(Br, Dest, DAG); 17093 17094 SDValue Ops[] = {Chain, Setup, OtherTarget}; 17095 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); 17096 } 17097 // Update LR count to the new value 17098 DAG.ReplaceAllUsesOfValueWith(Int.getValue(0), Setup); 17099 // Update chain 17100 DAG.ReplaceAllUsesOfValueWith(Int.getValue(2), Int.getOperand(0)); 17101 return Res; 17102 } else { 17103 SDValue Size = DAG.getTargetConstant( 17104 cast<ConstantSDNode>(Int.getOperand(3))->getZExtValue(), dl, MVT::i32); 17105 SDValue Args[] = { Int.getOperand(0), Elements, Size, }; 17106 SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl, 17107 DAG.getVTList(MVT::i32, MVT::Other), Args); 17108 DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode()); 17109 17110 // We expect this instruction to branch when the count is not zero. 17111 SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget; 17112 17113 // Update the unconditional branch to target the loop preheader if we've 17114 // found the condition has been reversed. 17115 if (Target == OtherTarget) 17116 UpdateUncondBr(Br, Dest, DAG); 17117 17118 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 17119 SDValue(LoopDec.getNode(), 1), Chain); 17120 17121 SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target }; 17122 return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs); 17123 } 17124 return SDValue(); 17125 } 17126 17127 /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND. 17128 SDValue 17129 ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const { 17130 SDValue Cmp = N->getOperand(4); 17131 if (Cmp.getOpcode() != ARMISD::CMPZ) 17132 // Only looking at NE cases. 17133 return SDValue(); 17134 17135 EVT VT = N->getValueType(0); 17136 SDLoc dl(N); 17137 SDValue LHS = Cmp.getOperand(0); 17138 SDValue RHS = Cmp.getOperand(1); 17139 SDValue Chain = N->getOperand(0); 17140 SDValue BB = N->getOperand(1); 17141 SDValue ARMcc = N->getOperand(2); 17142 ARMCC::CondCodes CC = 17143 (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); 17144 17145 // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0)) 17146 // -> (brcond Chain BB CC CPSR Cmp) 17147 if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() && 17148 LHS->getOperand(0)->getOpcode() == ARMISD::CMOV && 17149 LHS->getOperand(0)->hasOneUse()) { 17150 auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0)); 17151 auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1)); 17152 auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1)); 17153 auto *RHSC = dyn_cast<ConstantSDNode>(RHS); 17154 if ((LHS00C && LHS00C->getZExtValue() == 0) && 17155 (LHS01C && LHS01C->getZExtValue() == 1) && 17156 (LHS1C && LHS1C->getZExtValue() == 1) && 17157 (RHSC && RHSC->getZExtValue() == 0)) { 17158 return DAG.getNode( 17159 ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2), 17160 LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4)); 17161 } 17162 } 17163 17164 return SDValue(); 17165 } 17166 17167 /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV. 17168 SDValue 17169 ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { 17170 SDValue Cmp = N->getOperand(4); 17171 if (Cmp.getOpcode() != ARMISD::CMPZ) 17172 // Only looking at EQ and NE cases. 
17173 return SDValue(); 17174 17175 EVT VT = N->getValueType(0); 17176 SDLoc dl(N); 17177 SDValue LHS = Cmp.getOperand(0); 17178 SDValue RHS = Cmp.getOperand(1); 17179 SDValue FalseVal = N->getOperand(0); 17180 SDValue TrueVal = N->getOperand(1); 17181 SDValue ARMcc = N->getOperand(2); 17182 ARMCC::CondCodes CC = 17183 (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); 17184 17185 // BFI is only available on V6T2+. 17186 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) { 17187 SDValue R = PerformCMOVToBFICombine(N, DAG); 17188 if (R) 17189 return R; 17190 } 17191 17192 // Simplify 17193 // mov r1, r0 17194 // cmp r1, x 17195 // mov r0, y 17196 // moveq r0, x 17197 // to 17198 // cmp r0, x 17199 // movne r0, y 17200 // 17201 // mov r1, r0 17202 // cmp r1, x 17203 // mov r0, x 17204 // movne r0, y 17205 // to 17206 // cmp r0, x 17207 // movne r0, y 17208 /// FIXME: Turn this into a target neutral optimization? 17209 SDValue Res; 17210 if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) { 17211 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc, 17212 N->getOperand(3), Cmp); 17213 } else if (CC == ARMCC::EQ && TrueVal == RHS) { 17214 SDValue ARMcc; 17215 SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl); 17216 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc, 17217 N->getOperand(3), NewCmp); 17218 } 17219 17220 // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0)) 17221 // -> (cmov F T CC CPSR Cmp) 17222 if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) { 17223 auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)); 17224 auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1)); 17225 auto *RHSC = dyn_cast<ConstantSDNode>(RHS); 17226 if ((LHS0C && LHS0C->getZExtValue() == 0) && 17227 (LHS1C && LHS1C->getZExtValue() == 1) && 17228 (RHSC && RHSC->getZExtValue() == 0)) { 17229 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, 17230 LHS->getOperand(2), LHS->getOperand(3), 17231 LHS->getOperand(4)); 17232 } 17233 } 17234 17235 if (!VT.isInteger()) 17236 return SDValue(); 17237 17238 // Materialize a boolean comparison for integers so we can avoid branching. 17239 if (isNullConstant(FalseVal)) { 17240 if (CC == ARMCC::EQ && isOneConstant(TrueVal)) { 17241 if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) { 17242 // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it 17243 // right 5 bits will make that 32 be 1, otherwise it will be 0. 17244 // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5 17245 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS); 17246 Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub), 17247 DAG.getConstant(5, dl, MVT::i32)); 17248 } else { 17249 // CMOV 0, 1, ==, (CMPZ x, y) -> 17250 // (ADDCARRY (SUB x, y), t:0, t:1) 17251 // where t = (SUBCARRY 0, (SUB x, y), 0) 17252 // 17253 // The SUBCARRY computes 0 - (x - y) and this will give a borrow when 17254 // x != y. In other words, a carry C == 1 when x == y, C == 0 17255 // otherwise. 17256 // The final ADDCARRY computes 17257 // x - y + (0 - (x - y)) + C == C 17258 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS); 17259 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 17260 SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub); 17261 // ISD::SUBCARRY returns a borrow but we want the carry here 17262 // actually. 
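// Sanity check of the algebra (illustrative): if x == y then Sub is 0, the
// USUBO above produces no borrow, Carry becomes 1 and the ADDCARRY below
// yields 0 + 0 + 1 == 1; if x != y a borrow is produced, Carry is 0 and the
// result is (x - y) + (0 - (x - y)) + 0 == 0.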
17263 SDValue Carry = 17264 DAG.getNode(ISD::SUB, dl, MVT::i32, 17265 DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1)); 17266 Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry); 17267 } 17268 } else if (CC == ARMCC::NE && !isNullConstant(RHS) && 17269 (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) { 17270 // This seems pointless but will allow us to combine it further below. 17271 // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1 17272 SDValue Sub = 17273 DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS); 17274 SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR, 17275 Sub.getValue(1), SDValue()); 17276 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc, 17277 N->getOperand(3), CPSRGlue.getValue(1)); 17278 FalseVal = Sub; 17279 } 17280 } else if (isNullConstant(TrueVal)) { 17281 if (CC == ARMCC::EQ && !isNullConstant(RHS) && 17282 (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) { 17283 // This seems pointless but will allow us to combine it further below 17284 // Note that we change == for != as this is the dual for the case above. 17285 // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1 17286 SDValue Sub = 17287 DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS); 17288 SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR, 17289 Sub.getValue(1), SDValue()); 17290 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal, 17291 DAG.getConstant(ARMCC::NE, dl, MVT::i32), 17292 N->getOperand(3), CPSRGlue.getValue(1)); 17293 FalseVal = Sub; 17294 } 17295 } 17296 17297 // On Thumb1, the DAG above may be further combined if z is a power of 2 17298 // (z == 2 ^ K). 17299 // CMOV (SUBS x, y), z, !=, (SUBS x, y):1 -> 17300 // t1 = (USUBO (SUB x, y), 1) 17301 // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1) 17302 // Result = if K != 0 then (SHL t2:0, K) else t2:0 17303 // 17304 // This also handles the special case of comparing against zero; it's 17305 // essentially, the same pattern, except there's no SUBS: 17306 // CMOV x, z, !=, (CMPZ x, 0) -> 17307 // t1 = (USUBO x, 1) 17308 // t2 = (SUBCARRY x, t1:0, t1:1) 17309 // Result = if K != 0 then (SHL t2:0, K) else t2:0 17310 const APInt *TrueConst; 17311 if (Subtarget->isThumb1Only() && CC == ARMCC::NE && 17312 ((FalseVal.getOpcode() == ARMISD::SUBS && 17313 FalseVal.getOperand(0) == LHS && FalseVal.getOperand(1) == RHS) || 17314 (FalseVal == LHS && isNullConstant(RHS))) && 17315 (TrueConst = isPowerOf2Constant(TrueVal))) { 17316 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 17317 unsigned ShiftAmount = TrueConst->logBase2(); 17318 if (ShiftAmount) 17319 TrueVal = DAG.getConstant(1, dl, VT); 17320 SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal); 17321 Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1)); 17322 17323 if (ShiftAmount) 17324 Res = DAG.getNode(ISD::SHL, dl, VT, Res, 17325 DAG.getConstant(ShiftAmount, dl, MVT::i32)); 17326 } 17327 17328 if (Res.getNode()) { 17329 KnownBits Known = DAG.computeKnownBits(SDValue(N,0)); 17330 // Capture demanded bits information that would be otherwise lost. 
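// For example (illustrative), when the combine above produced a 0/1 value,
// Known.Zero will be 0xfffffffe, so an AssertZext to i1 is attached and later
// combines can drop redundant zero-extensions of the result.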
17331 if (Known.Zero == 0xfffffffe) 17332 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 17333 DAG.getValueType(MVT::i1)); 17334 else if (Known.Zero == 0xffffff00) 17335 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 17336 DAG.getValueType(MVT::i8)); 17337 else if (Known.Zero == 0xffff0000) 17338 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 17339 DAG.getValueType(MVT::i16)); 17340 } 17341 17342 return Res; 17343 } 17344 17345 static SDValue PerformBITCASTCombine(SDNode *N, 17346 TargetLowering::DAGCombinerInfo &DCI, 17347 const ARMSubtarget *ST) { 17348 SelectionDAG &DAG = DCI.DAG; 17349 SDValue Src = N->getOperand(0); 17350 EVT DstVT = N->getValueType(0); 17351 17352 // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE. 17353 if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) { 17354 EVT SrcVT = Src.getValueType(); 17355 if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits()) 17356 return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0)); 17357 } 17358 17359 // We may have a bitcast of something that has already had this bitcast 17360 // combine performed on it, so skip past any VECTOR_REG_CASTs. 17361 while (Src.getOpcode() == ARMISD::VECTOR_REG_CAST) 17362 Src = Src.getOperand(0); 17363 17364 // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that 17365 // would be generated is at least the width of the element type. 17366 EVT SrcVT = Src.getValueType(); 17367 if ((Src.getOpcode() == ARMISD::VMOVIMM || 17368 Src.getOpcode() == ARMISD::VMVNIMM || 17369 Src.getOpcode() == ARMISD::VMOVFPIMM) && 17370 SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() && 17371 DAG.getDataLayout().isBigEndian()) 17372 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src); 17373 17374 // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x 17375 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI)) 17376 return R; 17377 17378 return SDValue(); 17379 } 17380 17381 // Some combines for the MVETrunc truncations legalizer helper. Also lowers the 17382 // node into stack operations after legalizeOps. 
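// For example (a sketch): an MVETRUNC(a:v4i32, b:v4i32) producing v8i16 that
// reaches this point unoptimised is lowered through the stack as
//   VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack
// as described in the body below.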
17383 SDValue ARMTargetLowering::PerformMVETruncCombine( 17384 SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const { 17385 SelectionDAG &DAG = DCI.DAG; 17386 EVT VT = N->getValueType(0); 17387 SDLoc DL(N); 17388 17389 // MVETrunc(Undef, Undef) -> Undef 17390 if (all_of(N->ops(), [](SDValue Op) { return Op.isUndef(); })) 17391 return DAG.getUNDEF(VT); 17392 17393 // MVETrunc(MVETrunc a b, MVETrunc c, d) -> MVETrunc 17394 if (N->getNumOperands() == 2 && 17395 N->getOperand(0).getOpcode() == ARMISD::MVETRUNC && 17396 N->getOperand(1).getOpcode() == ARMISD::MVETRUNC) 17397 return DAG.getNode(ARMISD::MVETRUNC, DL, VT, N->getOperand(0).getOperand(0), 17398 N->getOperand(0).getOperand(1), 17399 N->getOperand(1).getOperand(0), 17400 N->getOperand(1).getOperand(1)); 17401 17402 // MVETrunc(shuffle, shuffle) -> VMOVN 17403 if (N->getNumOperands() == 2 && 17404 N->getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE && 17405 N->getOperand(1).getOpcode() == ISD::VECTOR_SHUFFLE) { 17406 auto *S0 = cast<ShuffleVectorSDNode>(N->getOperand(0).getNode()); 17407 auto *S1 = cast<ShuffleVectorSDNode>(N->getOperand(1).getNode()); 17408 17409 if (S0->getOperand(0) == S1->getOperand(0) && 17410 S0->getOperand(1) == S1->getOperand(1)) { 17411 // Construct complete shuffle mask 17412 SmallVector<int, 8> Mask(S0->getMask().begin(), S0->getMask().end()); 17413 Mask.append(S1->getMask().begin(), S1->getMask().end()); 17414 17415 if (isVMOVNTruncMask(Mask, VT, 0)) 17416 return DAG.getNode( 17417 ARMISD::VMOVN, DL, VT, 17418 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)), 17419 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)), 17420 DAG.getConstant(1, DL, MVT::i32)); 17421 if (isVMOVNTruncMask(Mask, VT, 1)) 17422 return DAG.getNode( 17423 ARMISD::VMOVN, DL, VT, 17424 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)), 17425 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)), 17426 DAG.getConstant(1, DL, MVT::i32)); 17427 } 17428 } 17429 17430 // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the 17431 // truncate to a buildvector to allow the generic optimisations to kick in. 
17432 if (all_of(N->ops(), [](SDValue Op) { 17433 return Op.getOpcode() == ISD::BUILD_VECTOR || 17434 Op.getOpcode() == ISD::VECTOR_SHUFFLE || 17435 (Op.getOpcode() == ISD::BITCAST && 17436 Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR); 17437 })) { 17438 SmallVector<SDValue, 8> Extracts; 17439 for (unsigned Op = 0; Op < N->getNumOperands(); Op++) { 17440 SDValue O = N->getOperand(Op); 17441 for (unsigned i = 0; i < O.getValueType().getVectorNumElements(); i++) { 17442 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, O, 17443 DAG.getConstant(i, DL, MVT::i32)); 17444 Extracts.push_back(Ext); 17445 } 17446 } 17447 return DAG.getBuildVector(VT, DL, Extracts); 17448 } 17449 17450 // If we are late in the legalization process and nothing has optimised 17451 // the trunc to anything better, lower it to a stack store and reload, 17452 // performing the truncation whilst keeping the lanes in the correct order: 17453 // VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack; 17454 if (!DCI.isAfterLegalizeDAG()) 17455 return SDValue(); 17456 17457 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::Fixed(16), Align(4)); 17458 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); 17459 int NumIns = N->getNumOperands(); 17460 assert((NumIns == 2 || NumIns == 4) && 17461 "Expected 2 or 4 inputs to an MVETrunc"); 17462 EVT StoreVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); 17463 if (N->getNumOperands() == 4) 17464 StoreVT = StoreVT.getHalfNumVectorElementsVT(*DAG.getContext()); 17465 17466 SmallVector<SDValue> Chains; 17467 for (int I = 0; I < NumIns; I++) { 17468 SDValue Ptr = DAG.getNode( 17469 ISD::ADD, DL, StackPtr.getValueType(), StackPtr, 17470 DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType())); 17471 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack( 17472 DAG.getMachineFunction(), SPFI, I * 16 / NumIns); 17473 SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I), 17474 Ptr, MPI, StoreVT, Align(4)); 17475 Chains.push_back(Ch); 17476 } 17477 17478 SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 17479 MachinePointerInfo MPI = 17480 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0); 17481 return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4)); 17482 } 17483 17484 // Take a MVEEXT(load x) and split that into (extload x, extload x+8) 17485 static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N, 17486 SelectionDAG &DAG) { 17487 SDValue N0 = N->getOperand(0); 17488 LoadSDNode *LD = dyn_cast<LoadSDNode>(N0.getNode()); 17489 if (!LD || !LD->isSimple() || !N0.hasOneUse() || LD->isIndexed()) 17490 return SDValue(); 17491 17492 EVT FromVT = LD->getMemoryVT(); 17493 EVT ToVT = N->getValueType(0); 17494 if (!ToVT.isVector()) 17495 return SDValue(); 17496 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements() * 2); 17497 EVT ToEltVT = ToVT.getVectorElementType(); 17498 EVT FromEltVT = FromVT.getVectorElementType(); 17499 17500 unsigned NumElements = 0; 17501 if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8)) 17502 NumElements = 4; 17503 if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8) 17504 NumElements = 8; 17505 assert(NumElements != 0); 17506 17507 ISD::LoadExtType NewExtType = 17508 N->getOpcode() == ARMISD::MVESEXT ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD; 17509 if (LD->getExtensionType() != ISD::NON_EXTLOAD && 17510 LD->getExtensionType() != ISD::EXTLOAD && 17511 LD->getExtensionType() != NewExtType) 17512 return SDValue(); 17513 17514 LLVMContext &C = *DAG.getContext(); 17515 SDLoc DL(LD); 17516 // Details about the old load 17517 SDValue Ch = LD->getChain(); 17518 SDValue BasePtr = LD->getBasePtr(); 17519 Align Alignment = LD->getOriginalAlign(); 17520 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags(); 17521 AAMDNodes AAInfo = LD->getAAInfo(); 17522 17523 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType()); 17524 EVT NewFromVT = EVT::getVectorVT( 17525 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements); 17526 EVT NewToVT = EVT::getVectorVT( 17527 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements); 17528 17529 SmallVector<SDValue, 4> Loads; 17530 SmallVector<SDValue, 4> Chains; 17531 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) { 17532 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8; 17533 SDValue NewPtr = 17534 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset)); 17535 17536 SDValue NewLoad = 17537 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset, 17538 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT, 17539 Alignment, MMOFlags, AAInfo); 17540 Loads.push_back(NewLoad); 17541 Chains.push_back(SDValue(NewLoad.getNode(), 1)); 17542 } 17543 17544 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 17545 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain); 17546 return DAG.getMergeValues(Loads, DL); 17547 } 17548 17549 // Perform combines for MVEEXT. If it has not be optimized to anything better 17550 // before lowering, it gets converted to stack store and extloads performing the 17551 // extend whilst still keeping the same lane ordering. 17552 SDValue ARMTargetLowering::PerformMVEExtCombine( 17553 SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const { 17554 SelectionDAG &DAG = DCI.DAG; 17555 EVT VT = N->getValueType(0); 17556 SDLoc DL(N); 17557 assert(N->getNumValues() == 2 && "Expected MVEEXT with 2 elements"); 17558 assert((VT == MVT::v4i32 || VT == MVT::v8i16) && "Unexpected MVEEXT type"); 17559 17560 EVT ExtVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT( 17561 *DAG.getContext()); 17562 auto Extend = [&](SDValue V) { 17563 SDValue VVT = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, V); 17564 return N->getOpcode() == ARMISD::MVESEXT 17565 ? DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, VVT, 17566 DAG.getValueType(ExtVT)) 17567 : DAG.getZeroExtendInReg(VVT, DL, ExtVT); 17568 }; 17569 17570 // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP) 17571 if (N->getOperand(0).getOpcode() == ARMISD::VDUP) { 17572 SDValue Ext = Extend(N->getOperand(0)); 17573 return DAG.getMergeValues({Ext, Ext}, DL); 17574 } 17575 17576 // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG 17577 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0))) { 17578 ArrayRef<int> Mask = SVN->getMask(); 17579 assert(Mask.size() == 2 * VT.getVectorNumElements()); 17580 assert(Mask.size() == SVN->getValueType(0).getVectorNumElements()); 17581 unsigned Rev = VT == MVT::v4i32 ? 
ARMISD::VREV32 : ARMISD::VREV16; 17582 SDValue Op0 = SVN->getOperand(0); 17583 SDValue Op1 = SVN->getOperand(1); 17584 17585 auto CheckInregMask = [&](int Start, int Offset) { 17586 for (int Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx) 17587 if (Mask[Start + Idx] >= 0 && Mask[Start + Idx] != Idx * 2 + Offset) 17588 return false; 17589 return true; 17590 }; 17591 SDValue V0 = SDValue(N, 0); 17592 SDValue V1 = SDValue(N, 1); 17593 if (CheckInregMask(0, 0)) 17594 V0 = Extend(Op0); 17595 else if (CheckInregMask(0, 1)) 17596 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0)); 17597 else if (CheckInregMask(0, Mask.size())) 17598 V0 = Extend(Op1); 17599 else if (CheckInregMask(0, Mask.size() + 1)) 17600 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1)); 17601 17602 if (CheckInregMask(VT.getVectorNumElements(), Mask.size())) 17603 V1 = Extend(Op1); 17604 else if (CheckInregMask(VT.getVectorNumElements(), Mask.size() + 1)) 17605 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1)); 17606 else if (CheckInregMask(VT.getVectorNumElements(), 0)) 17607 V1 = Extend(Op0); 17608 else if (CheckInregMask(VT.getVectorNumElements(), 1)) 17609 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0)); 17610 17611 if (V0.getNode() != N || V1.getNode() != N) 17612 return DAG.getMergeValues({V0, V1}, DL); 17613 } 17614 17615 // MVEEXT(load) -> extload, extload 17616 if (N->getOperand(0)->getOpcode() == ISD::LOAD) 17617 if (SDValue L = PerformSplittingMVEEXTToWideningLoad(N, DAG)) 17618 return L; 17619 17620 if (!DCI.isAfterLegalizeDAG()) 17621 return SDValue(); 17622 17623 // Lower to a stack store and reload: 17624 // VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8; 17625 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::Fixed(16), Align(4)); 17626 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); 17627 int NumOuts = N->getNumValues(); 17628 assert((NumOuts == 2 || NumOuts == 4) && 17629 "Expected 2 or 4 outputs to an MVEEXT"); 17630 EVT LoadVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT( 17631 *DAG.getContext()); 17632 if (N->getNumOperands() == 4) 17633 LoadVT = LoadVT.getHalfNumVectorElementsVT(*DAG.getContext()); 17634 17635 MachinePointerInfo MPI = 17636 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0); 17637 SDValue Chain = DAG.getStore(DAG.getEntryNode(), DL, N->getOperand(0), 17638 StackPtr, MPI, Align(4)); 17639 17640 SmallVector<SDValue> Loads; 17641 for (int I = 0; I < NumOuts; I++) { 17642 SDValue Ptr = DAG.getNode( 17643 ISD::ADD, DL, StackPtr.getValueType(), StackPtr, 17644 DAG.getConstant(I * 16 / NumOuts, DL, StackPtr.getValueType())); 17645 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack( 17646 DAG.getMachineFunction(), SPFI, I * 16 / NumOuts); 17647 SDValue Load = DAG.getExtLoad( 17648 N->getOpcode() == ARMISD::MVESEXT ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD, DL, 17649 VT, Chain, Ptr, MPI, LoadVT, Align(4)); 17650 Loads.push_back(Load); 17651 } 17652 17653 return DAG.getMergeValues(Loads, DL); 17654 } 17655 17656 SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, 17657 DAGCombinerInfo &DCI) const { 17658 switch (N->getOpcode()) { 17659 default: break; 17660 case ISD::SELECT_CC: 17661 case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget); 17662 case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget); 17663 case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget); 17664 case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget); 17665 case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget); 17666 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget); 17667 case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget); 17668 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget); 17669 case ISD::OR: return PerformORCombine(N, DCI, Subtarget); 17670 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget); 17671 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget); 17672 case ISD::BRCOND: 17673 case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget); 17674 case ARMISD::ADDC: 17675 case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget); 17676 case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget); 17677 case ARMISD::BFI: return PerformBFICombine(N, DCI.DAG); 17678 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget); 17679 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG); 17680 case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI); 17681 case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI.DAG); 17682 case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget); 17683 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget); 17684 case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); 17685 case ISD::EXTRACT_VECTOR_ELT: 17686 return PerformExtractEltCombine(N, DCI, Subtarget); 17687 case ISD::SIGN_EXTEND_INREG: return PerformSignExtendInregCombine(N, DCI.DAG); 17688 case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); 17689 case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget); 17690 case ARMISD::VDUP: return PerformVDUPCombine(N, DCI.DAG, Subtarget); 17691 case ISD::FP_TO_SINT: 17692 case ISD::FP_TO_UINT: 17693 return PerformVCVTCombine(N, DCI.DAG, Subtarget); 17694 case ISD::FDIV: 17695 return PerformVDIVCombine(N, DCI.DAG, Subtarget); 17696 case ISD::INTRINSIC_WO_CHAIN: 17697 return PerformIntrinsicCombine(N, DCI); 17698 case ISD::SHL: 17699 case ISD::SRA: 17700 case ISD::SRL: 17701 return PerformShiftCombine(N, DCI, Subtarget); 17702 case ISD::SIGN_EXTEND: 17703 case ISD::ZERO_EXTEND: 17704 case ISD::ANY_EXTEND: 17705 return PerformExtendCombine(N, DCI.DAG, Subtarget); 17706 case ISD::FP_EXTEND: 17707 return PerformFPExtendCombine(N, DCI.DAG, Subtarget); 17708 case ISD::SMIN: 17709 case ISD::UMIN: 17710 case ISD::SMAX: 17711 case ISD::UMAX: 17712 return PerformMinMaxCombine(N, DCI.DAG, Subtarget); 17713 case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG); 17714 case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG); 17715 case ISD::LOAD: return PerformLOADCombine(N, DCI); 17716 case ARMISD::VLD1DUP: 17717 case ARMISD::VLD2DUP: 17718 case ARMISD::VLD3DUP: 17719 case ARMISD::VLD4DUP: 17720 return PerformVLDCombine(N, DCI); 17721 case ARMISD::BUILD_VECTOR: 17722 return PerformARMBUILD_VECTORCombine(N, 
DCI); 17723 case ISD::BITCAST: 17724 return PerformBITCASTCombine(N, DCI, Subtarget); 17725 case ARMISD::PREDICATE_CAST: 17726 return PerformPREDICATE_CASTCombine(N, DCI); 17727 case ARMISD::VECTOR_REG_CAST: 17728 return PerformVECTOR_REG_CASTCombine(N, DCI.DAG, Subtarget); 17729 case ARMISD::MVETRUNC: 17730 return PerformMVETruncCombine(N, DCI); 17731 case ARMISD::MVESEXT: 17732 case ARMISD::MVEZEXT: 17733 return PerformMVEExtCombine(N, DCI); 17734 case ARMISD::VCMP: 17735 return PerformVCMPCombine(N, DCI.DAG, Subtarget); 17736 case ISD::VECREDUCE_ADD: 17737 return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget); 17738 case ARMISD::VMOVN: 17739 return PerformVMOVNCombine(N, DCI); 17740 case ARMISD::VQMOVNs: 17741 case ARMISD::VQMOVNu: 17742 return PerformVQMOVNCombine(N, DCI); 17743 case ARMISD::ASRL: 17744 case ARMISD::LSRL: 17745 case ARMISD::LSLL: 17746 return PerformLongShiftCombine(N, DCI.DAG); 17747 case ARMISD::SMULWB: { 17748 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 17749 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); 17750 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) 17751 return SDValue(); 17752 break; 17753 } 17754 case ARMISD::SMULWT: { 17755 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 17756 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16); 17757 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) 17758 return SDValue(); 17759 break; 17760 } 17761 case ARMISD::SMLALBB: 17762 case ARMISD::QADD16b: 17763 case ARMISD::QSUB16b: 17764 case ARMISD::UQADD16b: 17765 case ARMISD::UQSUB16b: { 17766 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 17767 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); 17768 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || 17769 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) 17770 return SDValue(); 17771 break; 17772 } 17773 case ARMISD::SMLALBT: { 17774 unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits(); 17775 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16); 17776 unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits(); 17777 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16); 17778 if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) || 17779 (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI))) 17780 return SDValue(); 17781 break; 17782 } 17783 case ARMISD::SMLALTB: { 17784 unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits(); 17785 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16); 17786 unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits(); 17787 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16); 17788 if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) || 17789 (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI))) 17790 return SDValue(); 17791 break; 17792 } 17793 case ARMISD::SMLALTT: { 17794 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 17795 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16); 17796 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || 17797 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) 17798 return SDValue(); 17799 break; 17800 } 17801 case ARMISD::QADD8b: 17802 case ARMISD::QSUB8b: 17803 case ARMISD::UQADD8b: 17804 case ARMISD::UQSUB8b: { 17805 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 17806 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8); 17807 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || 17808 
(SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) 17809 return SDValue(); 17810 break; 17811 } 17812 case ISD::INTRINSIC_VOID: 17813 case ISD::INTRINSIC_W_CHAIN: 17814 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 17815 case Intrinsic::arm_neon_vld1: 17816 case Intrinsic::arm_neon_vld1x2: 17817 case Intrinsic::arm_neon_vld1x3: 17818 case Intrinsic::arm_neon_vld1x4: 17819 case Intrinsic::arm_neon_vld2: 17820 case Intrinsic::arm_neon_vld3: 17821 case Intrinsic::arm_neon_vld4: 17822 case Intrinsic::arm_neon_vld2lane: 17823 case Intrinsic::arm_neon_vld3lane: 17824 case Intrinsic::arm_neon_vld4lane: 17825 case Intrinsic::arm_neon_vld2dup: 17826 case Intrinsic::arm_neon_vld3dup: 17827 case Intrinsic::arm_neon_vld4dup: 17828 case Intrinsic::arm_neon_vst1: 17829 case Intrinsic::arm_neon_vst1x2: 17830 case Intrinsic::arm_neon_vst1x3: 17831 case Intrinsic::arm_neon_vst1x4: 17832 case Intrinsic::arm_neon_vst2: 17833 case Intrinsic::arm_neon_vst3: 17834 case Intrinsic::arm_neon_vst4: 17835 case Intrinsic::arm_neon_vst2lane: 17836 case Intrinsic::arm_neon_vst3lane: 17837 case Intrinsic::arm_neon_vst4lane: 17838 return PerformVLDCombine(N, DCI); 17839 case Intrinsic::arm_mve_vld2q: 17840 case Intrinsic::arm_mve_vld4q: 17841 case Intrinsic::arm_mve_vst2q: 17842 case Intrinsic::arm_mve_vst4q: 17843 return PerformMVEVLDCombine(N, DCI); 17844 default: break; 17845 } 17846 break; 17847 } 17848 return SDValue(); 17849 } 17850 17851 bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc, 17852 EVT VT) const { 17853 return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE); 17854 } 17855 17856 bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, 17857 Align Alignment, 17858 MachineMemOperand::Flags, 17859 bool *Fast) const { 17860 // Depends what it gets converted into if the type is weird. 17861 if (!VT.isSimple()) 17862 return false; 17863 17864 // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus 17865 bool AllowsUnaligned = Subtarget->allowsUnalignedMem(); 17866 auto Ty = VT.getSimpleVT().SimpleTy; 17867 17868 if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) { 17869 // Unaligned access can use (for example) LRDB, LRDH, LDR 17870 if (AllowsUnaligned) { 17871 if (Fast) 17872 *Fast = Subtarget->hasV7Ops(); 17873 return true; 17874 } 17875 } 17876 17877 if (Ty == MVT::f64 || Ty == MVT::v2f64) { 17878 // For any little-endian targets with neon, we can support unaligned ld/st 17879 // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8. 17880 // A big-endian target may also explicitly support unaligned accesses 17881 if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) { 17882 if (Fast) 17883 *Fast = true; 17884 return true; 17885 } 17886 } 17887 17888 if (!Subtarget->hasMVEIntegerOps()) 17889 return false; 17890 17891 // These are for predicates 17892 if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1)) { 17893 if (Fast) 17894 *Fast = true; 17895 return true; 17896 } 17897 17898 // These are for truncated stores/narrowing loads. 
They are fine so long as 17899 // the alignment is at least the size of the item being loaded 17900 if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) && 17901 Alignment >= VT.getScalarSizeInBits() / 8) { 17902 if (Fast) 17903 *Fast = true; 17904 return true; 17905 } 17906 17907 // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and 17908 // VSTRW.U32 all store the vector register in exactly the same format, and 17909 // differ only in the range of their immediate offset field and the required 17910 // alignment. So there is always a store that can be used, regardless of 17911 // actual type. 17912 // 17913 // For big endian, that is not the case. But can still emit a (VSTRB.U8; 17914 // VREV64.8) pair and get the same effect. This will likely be better than 17915 // aligning the vector through the stack. 17916 if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 || 17917 Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 || 17918 Ty == MVT::v2f64) { 17919 if (Fast) 17920 *Fast = true; 17921 return true; 17922 } 17923 17924 return false; 17925 } 17926 17927 17928 EVT ARMTargetLowering::getOptimalMemOpType( 17929 const MemOp &Op, const AttributeList &FuncAttributes) const { 17930 // See if we can use NEON instructions for this... 17931 if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() && 17932 !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) { 17933 bool Fast; 17934 if (Op.size() >= 16 && 17935 (Op.isAligned(Align(16)) || 17936 (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1), 17937 MachineMemOperand::MONone, &Fast) && 17938 Fast))) { 17939 return MVT::v2f64; 17940 } else if (Op.size() >= 8 && 17941 (Op.isAligned(Align(8)) || 17942 (allowsMisalignedMemoryAccesses( 17943 MVT::f64, 0, Align(1), MachineMemOperand::MONone, &Fast) && 17944 Fast))) { 17945 return MVT::f64; 17946 } 17947 } 17948 17949 // Let the target-independent logic figure it out. 17950 return MVT::Other; 17951 } 17952 17953 // 64-bit integers are split into their high and low parts and held in two 17954 // different registers, so the trunc is free since the low register can just 17955 // be used. 17956 bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const { 17957 if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy()) 17958 return false; 17959 unsigned SrcBits = SrcTy->getPrimitiveSizeInBits(); 17960 unsigned DestBits = DstTy->getPrimitiveSizeInBits(); 17961 return (SrcBits == 64 && DestBits == 32); 17962 } 17963 17964 bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const { 17965 if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() || 17966 !DstVT.isInteger()) 17967 return false; 17968 unsigned SrcBits = SrcVT.getSizeInBits(); 17969 unsigned DestBits = DstVT.getSizeInBits(); 17970 return (SrcBits == 64 && DestBits == 32); 17971 } 17972 17973 bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { 17974 if (Val.getOpcode() != ISD::LOAD) 17975 return false; 17976 17977 EVT VT1 = Val.getValueType(); 17978 if (!VT1.isSimple() || !VT1.isInteger() || 17979 !VT2.isSimple() || !VT2.isInteger()) 17980 return false; 17981 17982 switch (VT1.getSimpleVT().SimpleTy) { 17983 default: break; 17984 case MVT::i1: 17985 case MVT::i8: 17986 case MVT::i16: 17987 // 8-bit and 16-bit loads implicitly zero-extend to 32-bits. 
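    // (LDRB / LDRH zero-fill the upper bits of the destination register, so
    // the zero-extension comes for free.)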
17988 return true; 17989 } 17990 17991 return false; 17992 } 17993 17994 bool ARMTargetLowering::isFNegFree(EVT VT) const { 17995 if (!VT.isSimple()) 17996 return false; 17997 17998 // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that 17999 // negate values directly (fneg is free). So, we don't want to let the DAG 18000 // combiner rewrite fneg into xors and some other instructions. For f16 and 18001 // FullFP16 argument passing, some bitcast nodes may be introduced, 18002 // triggering this DAG combine rewrite, so we are avoiding that with this. 18003 switch (VT.getSimpleVT().SimpleTy) { 18004 default: break; 18005 case MVT::f16: 18006 return Subtarget->hasFullFP16(); 18007 } 18008 18009 return false; 18010 } 18011 18012 /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth 18013 /// of the vector elements. 18014 static bool areExtractExts(Value *Ext1, Value *Ext2) { 18015 auto areExtDoubled = [](Instruction *Ext) { 18016 return Ext->getType()->getScalarSizeInBits() == 18017 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits(); 18018 }; 18019 18020 if (!match(Ext1, m_ZExtOrSExt(m_Value())) || 18021 !match(Ext2, m_ZExtOrSExt(m_Value())) || 18022 !areExtDoubled(cast<Instruction>(Ext1)) || 18023 !areExtDoubled(cast<Instruction>(Ext2))) 18024 return false; 18025 18026 return true; 18027 } 18028 18029 /// Check if sinking \p I's operands to I's basic block is profitable, because 18030 /// the operands can be folded into a target instruction, e.g. 18031 /// sext/zext can be folded into vsubl. 18032 bool ARMTargetLowering::shouldSinkOperands(Instruction *I, 18033 SmallVectorImpl<Use *> &Ops) const { 18034 if (!I->getType()->isVectorTy()) 18035 return false; 18036 18037 if (Subtarget->hasNEON()) { 18038 switch (I->getOpcode()) { 18039 case Instruction::Sub: 18040 case Instruction::Add: { 18041 if (!areExtractExts(I->getOperand(0), I->getOperand(1))) 18042 return false; 18043 Ops.push_back(&I->getOperandUse(0)); 18044 Ops.push_back(&I->getOperandUse(1)); 18045 return true; 18046 } 18047 default: 18048 return false; 18049 } 18050 } 18051 18052 if (!Subtarget->hasMVEIntegerOps()) 18053 return false; 18054 18055 auto IsFMSMul = [&](Instruction *I) { 18056 if (!I->hasOneUse()) 18057 return false; 18058 auto *Sub = cast<Instruction>(*I->users().begin()); 18059 return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I; 18060 }; 18061 auto IsFMS = [&](Instruction *I) { 18062 if (match(I->getOperand(0), m_FNeg(m_Value())) || 18063 match(I->getOperand(1), m_FNeg(m_Value()))) 18064 return true; 18065 return false; 18066 }; 18067 18068 auto IsSinker = [&](Instruction *I, int Operand) { 18069 switch (I->getOpcode()) { 18070 case Instruction::Add: 18071 case Instruction::Mul: 18072 case Instruction::FAdd: 18073 case Instruction::ICmp: 18074 case Instruction::FCmp: 18075 return true; 18076 case Instruction::FMul: 18077 return !IsFMSMul(I); 18078 case Instruction::Sub: 18079 case Instruction::FSub: 18080 case Instruction::Shl: 18081 case Instruction::LShr: 18082 case Instruction::AShr: 18083 return Operand == 1; 18084 case Instruction::Call: 18085 if (auto *II = dyn_cast<IntrinsicInst>(I)) { 18086 switch (II->getIntrinsicID()) { 18087 case Intrinsic::fma: 18088 return !IsFMS(I); 18089 case Intrinsic::arm_mve_add_predicated: 18090 case Intrinsic::arm_mve_mul_predicated: 18091 case Intrinsic::arm_mve_qadd_predicated: 18092 case Intrinsic::arm_mve_hadd_predicated: 18093 case Intrinsic::arm_mve_vqdmull_predicated: 18094 case 
Intrinsic::arm_mve_qdmulh_predicated: 18095 case Intrinsic::arm_mve_qrdmulh_predicated: 18096 case Intrinsic::arm_mve_fma_predicated: 18097 return true; 18098 case Intrinsic::arm_mve_sub_predicated: 18099 case Intrinsic::arm_mve_qsub_predicated: 18100 case Intrinsic::arm_mve_hsub_predicated: 18101 return Operand == 1; 18102 default: 18103 return false; 18104 } 18105 } 18106 return false; 18107 default: 18108 return false; 18109 } 18110 }; 18111 18112 for (auto OpIdx : enumerate(I->operands())) { 18113 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get()); 18114 // Make sure we are not already sinking this operand 18115 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; })) 18116 continue; 18117 18118 Instruction *Shuffle = Op; 18119 if (Shuffle->getOpcode() == Instruction::BitCast) 18120 Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0)); 18121 // We are looking for a splat that can be sunk. 18122 if (!Shuffle || 18123 !match(Shuffle, m_Shuffle( 18124 m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()), 18125 m_Undef(), m_ZeroMask()))) 18126 continue; 18127 if (!IsSinker(I, OpIdx.index())) 18128 continue; 18129 18130 // All uses of the shuffle should be sunk to avoid duplicating it across gpr 18131 // and vector registers 18132 for (Use &U : Op->uses()) { 18133 Instruction *Insn = cast<Instruction>(U.getUser()); 18134 if (!IsSinker(Insn, U.getOperandNo())) 18135 return false; 18136 } 18137 18138 Ops.push_back(&Shuffle->getOperandUse(0)); 18139 if (Shuffle != Op) 18140 Ops.push_back(&Op->getOperandUse(0)); 18141 Ops.push_back(&OpIdx.value()); 18142 } 18143 return true; 18144 } 18145 18146 Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI) const { 18147 if (!Subtarget->hasMVEIntegerOps()) 18148 return nullptr; 18149 Type *SVIType = SVI->getType(); 18150 Type *ScalarType = SVIType->getScalarType(); 18151 18152 if (ScalarType->isFloatTy()) 18153 return Type::getInt32Ty(SVIType->getContext()); 18154 if (ScalarType->isHalfTy()) 18155 return Type::getInt16Ty(SVIType->getContext()); 18156 return nullptr; 18157 } 18158 18159 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { 18160 EVT VT = ExtVal.getValueType(); 18161 18162 if (!isTypeLegal(VT)) 18163 return false; 18164 18165 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) { 18166 if (Ld->isExpandingLoad()) 18167 return false; 18168 } 18169 18170 if (Subtarget->hasMVEIntegerOps()) 18171 return true; 18172 18173 // Don't create a loadext if we can fold the extension into a wide/long 18174 // instruction. 18175 // If there's more than one user instruction, the loadext is desirable no 18176 // matter what. There can be two uses by the same instruction. 18177 if (ExtVal->use_empty() || 18178 !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode())) 18179 return true; 18180 18181 SDNode *U = *ExtVal->use_begin(); 18182 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB || 18183 U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM)) 18184 return false; 18185 18186 return true; 18187 } 18188 18189 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { 18190 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 18191 return false; 18192 18193 if (!isTypeLegal(EVT::getEVT(Ty1))) 18194 return false; 18195 18196 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); 18197 18198 // Assuming the caller doesn't have a zeroext or signext return parameter, 18199 // truncation all the way down to i1 is valid. 
18200 return true; 18201 } 18202 18203 InstructionCost ARMTargetLowering::getScalingFactorCost(const DataLayout &DL, 18204 const AddrMode &AM, 18205 Type *Ty, 18206 unsigned AS) const { 18207 if (isLegalAddressingMode(DL, AM, Ty, AS)) { 18208 if (Subtarget->hasFPAO()) 18209 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster 18210 return 0; 18211 } 18212 return -1; 18213 } 18214 18215 /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster 18216 /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be 18217 /// expanded to FMAs when this method returns true, otherwise fmuladd is 18218 /// expanded to fmul + fadd. 18219 /// 18220 /// ARM supports both fused and unfused multiply-add operations; we already 18221 /// lower a pair of fmul and fadd to the latter so it's not clear that there 18222 /// would be a gain or that the gain would be worthwhile enough to risk 18223 /// correctness bugs. 18224 /// 18225 /// For MVE, we set this to true as it helps simplify the need for some 18226 /// patterns (and we don't have the non-fused floating point instruction). 18227 bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, 18228 EVT VT) const { 18229 if (!VT.isSimple()) 18230 return false; 18231 18232 switch (VT.getSimpleVT().SimpleTy) { 18233 case MVT::v4f32: 18234 case MVT::v8f16: 18235 return Subtarget->hasMVEFloatOps(); 18236 case MVT::f16: 18237 return Subtarget->useFPVFMx16(); 18238 case MVT::f32: 18239 return Subtarget->useFPVFMx(); 18240 case MVT::f64: 18241 return Subtarget->useFPVFMx64(); 18242 default: 18243 break; 18244 } 18245 18246 return false; 18247 } 18248 18249 static bool isLegalT1AddressImmediate(int64_t V, EVT VT) { 18250 if (V < 0) 18251 return false; 18252 18253 unsigned Scale = 1; 18254 switch (VT.getSimpleVT().SimpleTy) { 18255 case MVT::i1: 18256 case MVT::i8: 18257 // Scale == 1; 18258 break; 18259 case MVT::i16: 18260 // Scale == 2; 18261 Scale = 2; 18262 break; 18263 default: 18264 // On thumb1 we load most things (i32, i64, floats, etc) with a LDR 18265 // Scale == 4; 18266 Scale = 4; 18267 break; 18268 } 18269 18270 if ((V & (Scale - 1)) != 0) 18271 return false; 18272 return isUInt<5>(V / Scale); 18273 } 18274 18275 static bool isLegalT2AddressImmediate(int64_t V, EVT VT, 18276 const ARMSubtarget *Subtarget) { 18277 if (!VT.isInteger() && !VT.isFloatingPoint()) 18278 return false; 18279 if (VT.isVector() && Subtarget->hasNEON()) 18280 return false; 18281 if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() && 18282 !Subtarget->hasMVEFloatOps()) 18283 return false; 18284 18285 bool IsNeg = false; 18286 if (V < 0) { 18287 IsNeg = true; 18288 V = -V; 18289 } 18290 18291 unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U); 18292 18293 // MVE: size * imm7 18294 if (VT.isVector() && Subtarget->hasMVEIntegerOps()) { 18295 switch (VT.getSimpleVT().getVectorElementType().SimpleTy) { 18296 case MVT::i32: 18297 case MVT::f32: 18298 return isShiftedUInt<7,2>(V); 18299 case MVT::i16: 18300 case MVT::f16: 18301 return isShiftedUInt<7,1>(V); 18302 case MVT::i8: 18303 return isUInt<7>(V); 18304 default: 18305 return false; 18306 } 18307 } 18308 18309 // half VLDR: 2 * imm8 18310 if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16()) 18311 return isShiftedUInt<8, 1>(V); 18312 // VLDR and LDRD: 4 * imm8 18313 if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8) 18314 return isShiftedUInt<8, 2>(V); 18315 18316 if (NumBytes == 1 || NumBytes == 2 
|| NumBytes == 4) { 18317 // + imm12 or - imm8 18318 if (IsNeg) 18319 return isUInt<8>(V); 18320 return isUInt<12>(V); 18321 } 18322 18323 return false; 18324 } 18325 18326 /// isLegalAddressImmediate - Return true if the integer value can be used 18327 /// as the offset of the target addressing mode for load / store of the 18328 /// given type. 18329 static bool isLegalAddressImmediate(int64_t V, EVT VT, 18330 const ARMSubtarget *Subtarget) { 18331 if (V == 0) 18332 return true; 18333 18334 if (!VT.isSimple()) 18335 return false; 18336 18337 if (Subtarget->isThumb1Only()) 18338 return isLegalT1AddressImmediate(V, VT); 18339 else if (Subtarget->isThumb2()) 18340 return isLegalT2AddressImmediate(V, VT, Subtarget); 18341 18342 // ARM mode. 18343 if (V < 0) 18344 V = - V; 18345 switch (VT.getSimpleVT().SimpleTy) { 18346 default: return false; 18347 case MVT::i1: 18348 case MVT::i8: 18349 case MVT::i32: 18350 // +- imm12 18351 return isUInt<12>(V); 18352 case MVT::i16: 18353 // +- imm8 18354 return isUInt<8>(V); 18355 case MVT::f32: 18356 case MVT::f64: 18357 if (!Subtarget->hasVFP2Base()) // FIXME: NEON? 18358 return false; 18359 return isShiftedUInt<8, 2>(V); 18360 } 18361 } 18362 18363 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM, 18364 EVT VT) const { 18365 int Scale = AM.Scale; 18366 if (Scale < 0) 18367 return false; 18368 18369 switch (VT.getSimpleVT().SimpleTy) { 18370 default: return false; 18371 case MVT::i1: 18372 case MVT::i8: 18373 case MVT::i16: 18374 case MVT::i32: 18375 if (Scale == 1) 18376 return true; 18377 // r + r << imm 18378 Scale = Scale & ~1; 18379 return Scale == 2 || Scale == 4 || Scale == 8; 18380 case MVT::i64: 18381 // FIXME: What are we trying to model here? ldrd doesn't have an r + r 18382 // version in Thumb mode. 18383 // r + r 18384 if (Scale == 1) 18385 return true; 18386 // r * 2 (this can be lowered to r + r). 18387 if (!AM.HasBaseReg && Scale == 2) 18388 return true; 18389 return false; 18390 case MVT::isVoid: 18391 // Note, we allow "void" uses (basically, uses that aren't loads or 18392 // stores), because arm allows folding a scale into many arithmetic 18393 // operations. This should be made more precise and revisited later. 18394 18395 // Allow r << imm, but the imm has to be a multiple of two. 18396 if (Scale & 1) return false; 18397 return isPowerOf2_32(Scale); 18398 } 18399 } 18400 18401 bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM, 18402 EVT VT) const { 18403 const int Scale = AM.Scale; 18404 18405 // Negative scales are not supported in Thumb1. 18406 if (Scale < 0) 18407 return false; 18408 18409 // Thumb1 addressing modes do not support register scaling excepting the 18410 // following cases: 18411 // 1. Scale == 1 means no scaling. 18412 // 2. Scale == 2 this can be lowered to r + r if there is no base register. 18413 return (Scale == 1) || (!AM.HasBaseReg && Scale == 2); 18414 } 18415 18416 /// isLegalAddressingMode - Return true if the addressing mode represented 18417 /// by AM is legal for this target, for a load/store of the specified type. 18418 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL, 18419 const AddrMode &AM, Type *Ty, 18420 unsigned AS, Instruction *I) const { 18421 EVT VT = getValueType(DL, Ty, true); 18422 if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget)) 18423 return false; 18424 18425 // Can never fold addr of global into load/store. 
18426 if (AM.BaseGV) 18427 return false; 18428 18429 switch (AM.Scale) { 18430 case 0: // no scale reg, must be "r+i" or "r", or "i". 18431 break; 18432 default: 18433 // ARM doesn't support any R+R*scale+imm addr modes. 18434 if (AM.BaseOffs) 18435 return false; 18436 18437 if (!VT.isSimple()) 18438 return false; 18439 18440 if (Subtarget->isThumb1Only()) 18441 return isLegalT1ScaledAddressingMode(AM, VT); 18442 18443 if (Subtarget->isThumb2()) 18444 return isLegalT2ScaledAddressingMode(AM, VT); 18445 18446 int Scale = AM.Scale; 18447 switch (VT.getSimpleVT().SimpleTy) { 18448 default: return false; 18449 case MVT::i1: 18450 case MVT::i8: 18451 case MVT::i32: 18452 if (Scale < 0) Scale = -Scale; 18453 if (Scale == 1) 18454 return true; 18455 // r + r << imm 18456 return isPowerOf2_32(Scale & ~1); 18457 case MVT::i16: 18458 case MVT::i64: 18459 // r +/- r 18460 if (Scale == 1 || (AM.HasBaseReg && Scale == -1)) 18461 return true; 18462 // r * 2 (this can be lowered to r + r). 18463 if (!AM.HasBaseReg && Scale == 2) 18464 return true; 18465 return false; 18466 18467 case MVT::isVoid: 18468 // Note, we allow "void" uses (basically, uses that aren't loads or 18469 // stores), because arm allows folding a scale into many arithmetic 18470 // operations. This should be made more precise and revisited later. 18471 18472 // Allow r << imm, but the imm has to be a multiple of two. 18473 if (Scale & 1) return false; 18474 return isPowerOf2_32(Scale); 18475 } 18476 } 18477 return true; 18478 } 18479 18480 /// isLegalICmpImmediate - Return true if the specified immediate is legal 18481 /// icmp immediate, that is the target has icmp instructions which can compare 18482 /// a register against the immediate without having to materialize the 18483 /// immediate into a register. 18484 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const { 18485 // Thumb2 and ARM modes can use cmn for negative immediates. 18486 if (!Subtarget->isThumb()) 18487 return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 || 18488 ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1; 18489 if (Subtarget->isThumb2()) 18490 return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 || 18491 ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1; 18492 // Thumb1 doesn't have cmn, and only 8-bit immediates. 18493 return Imm >= 0 && Imm <= 255; 18494 } 18495 18496 /// isLegalAddImmediate - Return true if the specified immediate is a legal add 18497 /// *or sub* immediate, that is the target has add or sub instructions which can 18498 /// add a register with the immediate without having to materialize the 18499 /// immediate into a register. 18500 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const { 18501 // Same encoding for add/sub, just flip the sign. 18502 int64_t AbsImm = std::abs(Imm); 18503 if (!Subtarget->isThumb()) 18504 return ARM_AM::getSOImmVal(AbsImm) != -1; 18505 if (Subtarget->isThumb2()) 18506 return ARM_AM::getT2SOImmVal(AbsImm) != -1; 18507 // Thumb1 only has 8-bit unsigned immediate. 
18508 return AbsImm >= 0 && AbsImm <= 255; 18509 } 18510 18511 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, 18512 bool isSEXTLoad, SDValue &Base, 18513 SDValue &Offset, bool &isInc, 18514 SelectionDAG &DAG) { 18515 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 18516 return false; 18517 18518 if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) { 18519 // AddressingMode 3 18520 Base = Ptr->getOperand(0); 18521 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 18522 int RHSC = (int)RHS->getZExtValue(); 18523 if (RHSC < 0 && RHSC > -256) { 18524 assert(Ptr->getOpcode() == ISD::ADD); 18525 isInc = false; 18526 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 18527 return true; 18528 } 18529 } 18530 isInc = (Ptr->getOpcode() == ISD::ADD); 18531 Offset = Ptr->getOperand(1); 18532 return true; 18533 } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) { 18534 // AddressingMode 2 18535 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 18536 int RHSC = (int)RHS->getZExtValue(); 18537 if (RHSC < 0 && RHSC > -0x1000) { 18538 assert(Ptr->getOpcode() == ISD::ADD); 18539 isInc = false; 18540 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 18541 Base = Ptr->getOperand(0); 18542 return true; 18543 } 18544 } 18545 18546 if (Ptr->getOpcode() == ISD::ADD) { 18547 isInc = true; 18548 ARM_AM::ShiftOpc ShOpcVal= 18549 ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode()); 18550 if (ShOpcVal != ARM_AM::no_shift) { 18551 Base = Ptr->getOperand(1); 18552 Offset = Ptr->getOperand(0); 18553 } else { 18554 Base = Ptr->getOperand(0); 18555 Offset = Ptr->getOperand(1); 18556 } 18557 return true; 18558 } 18559 18560 isInc = (Ptr->getOpcode() == ISD::ADD); 18561 Base = Ptr->getOperand(0); 18562 Offset = Ptr->getOperand(1); 18563 return true; 18564 } 18565 18566 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store. 18567 return false; 18568 } 18569 18570 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, 18571 bool isSEXTLoad, SDValue &Base, 18572 SDValue &Offset, bool &isInc, 18573 SelectionDAG &DAG) { 18574 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 18575 return false; 18576 18577 Base = Ptr->getOperand(0); 18578 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 18579 int RHSC = (int)RHS->getZExtValue(); 18580 if (RHSC < 0 && RHSC > -0x100) { // 8 bits. 18581 assert(Ptr->getOpcode() == ISD::ADD); 18582 isInc = false; 18583 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 18584 return true; 18585 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero. 18586 isInc = Ptr->getOpcode() == ISD::ADD; 18587 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0)); 18588 return true; 18589 } 18590 } 18591 18592 return false; 18593 } 18594 18595 static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment, 18596 bool isSEXTLoad, bool IsMasked, bool isLE, 18597 SDValue &Base, SDValue &Offset, 18598 bool &isInc, SelectionDAG &DAG) { 18599 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 18600 return false; 18601 if (!isa<ConstantSDNode>(Ptr->getOperand(1))) 18602 return false; 18603 18604 // We allow LE non-masked loads to change the type (for example use a vldrb.8 18605 // as opposed to a vldrw.32). This can allow extra addressing modes or 18606 // alignments for what is otherwise an equivalent instruction. 
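  // The offset ranges checked below are the signed 7-bit immediate scaled by
  // the element size, e.g. a v4i32 VLDRW accepts multiples of 4 up to +/-508,
  // whereas reading the same bytes as a VLDRB.8 allows any offset up to
  // +/-127.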
18607 bool CanChangeType = isLE && !IsMasked; 18608 18609 ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1)); 18610 int RHSC = (int)RHS->getZExtValue(); 18611 18612 auto IsInRange = [&](int RHSC, int Limit, int Scale) { 18613 if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) { 18614 assert(Ptr->getOpcode() == ISD::ADD); 18615 isInc = false; 18616 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 18617 return true; 18618 } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) { 18619 isInc = Ptr->getOpcode() == ISD::ADD; 18620 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0)); 18621 return true; 18622 } 18623 return false; 18624 }; 18625 18626 // Try to find a matching instruction based on s/zext, Alignment, Offset and 18627 // (in BE/masked) type. 18628 Base = Ptr->getOperand(0); 18629 if (VT == MVT::v4i16) { 18630 if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2)) 18631 return true; 18632 } else if (VT == MVT::v4i8 || VT == MVT::v8i8) { 18633 if (IsInRange(RHSC, 0x80, 1)) 18634 return true; 18635 } else if (Alignment >= 4 && 18636 (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) && 18637 IsInRange(RHSC, 0x80, 4)) 18638 return true; 18639 else if (Alignment >= 2 && 18640 (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) && 18641 IsInRange(RHSC, 0x80, 2)) 18642 return true; 18643 else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1)) 18644 return true; 18645 return false; 18646 } 18647 18648 /// getPreIndexedAddressParts - returns true by value, base pointer and 18649 /// offset pointer and addressing mode by reference if the node's address 18650 /// can be legally represented as pre-indexed load / store address. 18651 bool 18652 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, 18653 SDValue &Offset, 18654 ISD::MemIndexedMode &AM, 18655 SelectionDAG &DAG) const { 18656 if (Subtarget->isThumb1Only()) 18657 return false; 18658 18659 EVT VT; 18660 SDValue Ptr; 18661 Align Alignment; 18662 bool isSEXTLoad = false; 18663 bool IsMasked = false; 18664 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 18665 Ptr = LD->getBasePtr(); 18666 VT = LD->getMemoryVT(); 18667 Alignment = LD->getAlign(); 18668 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 18669 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 18670 Ptr = ST->getBasePtr(); 18671 VT = ST->getMemoryVT(); 18672 Alignment = ST->getAlign(); 18673 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) { 18674 Ptr = LD->getBasePtr(); 18675 VT = LD->getMemoryVT(); 18676 Alignment = LD->getAlign(); 18677 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 18678 IsMasked = true; 18679 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) { 18680 Ptr = ST->getBasePtr(); 18681 VT = ST->getMemoryVT(); 18682 Alignment = ST->getAlign(); 18683 IsMasked = true; 18684 } else 18685 return false; 18686 18687 bool isInc; 18688 bool isLegal = false; 18689 if (VT.isVector()) 18690 isLegal = Subtarget->hasMVEIntegerOps() && 18691 getMVEIndexedAddressParts( 18692 Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked, 18693 Subtarget->isLittle(), Base, Offset, isInc, DAG); 18694 else { 18695 if (Subtarget->isThumb2()) 18696 isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, 18697 Offset, isInc, DAG); 18698 else 18699 isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, 18700 Offset, isInc, DAG); 18701 } 18702 if (!isLegal) 18703 return false; 18704 18705 AM = isInc ? 
ISD::PRE_INC : ISD::PRE_DEC; 18706 return true; 18707 } 18708 18709 /// getPostIndexedAddressParts - returns true by value, base pointer and 18710 /// offset pointer and addressing mode by reference if this node can be 18711 /// combined with a load / store to form a post-indexed load / store. 18712 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, 18713 SDValue &Base, 18714 SDValue &Offset, 18715 ISD::MemIndexedMode &AM, 18716 SelectionDAG &DAG) const { 18717 EVT VT; 18718 SDValue Ptr; 18719 Align Alignment; 18720 bool isSEXTLoad = false, isNonExt; 18721 bool IsMasked = false; 18722 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 18723 VT = LD->getMemoryVT(); 18724 Ptr = LD->getBasePtr(); 18725 Alignment = LD->getAlign(); 18726 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 18727 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; 18728 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 18729 VT = ST->getMemoryVT(); 18730 Ptr = ST->getBasePtr(); 18731 Alignment = ST->getAlign(); 18732 isNonExt = !ST->isTruncatingStore(); 18733 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) { 18734 VT = LD->getMemoryVT(); 18735 Ptr = LD->getBasePtr(); 18736 Alignment = LD->getAlign(); 18737 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 18738 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; 18739 IsMasked = true; 18740 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) { 18741 VT = ST->getMemoryVT(); 18742 Ptr = ST->getBasePtr(); 18743 Alignment = ST->getAlign(); 18744 isNonExt = !ST->isTruncatingStore(); 18745 IsMasked = true; 18746 } else 18747 return false; 18748 18749 if (Subtarget->isThumb1Only()) { 18750 // Thumb-1 can do a limited post-inc load or store as an updating LDM. It 18751 // must be non-extending/truncating, i32, with an offset of 4. 18752 assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!"); 18753 if (Op->getOpcode() != ISD::ADD || !isNonExt) 18754 return false; 18755 auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 18756 if (!RHS || RHS->getZExtValue() != 4) 18757 return false; 18758 if (Alignment < Align(4)) 18759 return false; 18760 18761 Offset = Op->getOperand(1); 18762 Base = Op->getOperand(0); 18763 AM = ISD::POST_INC; 18764 return true; 18765 } 18766 18767 bool isInc; 18768 bool isLegal = false; 18769 if (VT.isVector()) 18770 isLegal = Subtarget->hasMVEIntegerOps() && 18771 getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked, 18772 Subtarget->isLittle(), Base, Offset, 18773 isInc, DAG); 18774 else { 18775 if (Subtarget->isThumb2()) 18776 isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, 18777 isInc, DAG); 18778 else 18779 isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, 18780 isInc, DAG); 18781 } 18782 if (!isLegal) 18783 return false; 18784 18785 if (Ptr != Base) { 18786 // Swap base ptr and offset to catch more post-index load / store when 18787 // it's legal. In Thumb2 mode, offset must be an immediate. 18788 if (Ptr == Offset && Op->getOpcode() == ISD::ADD && 18789 !Subtarget->isThumb2()) 18790 std::swap(Base, Offset); 18791 18792 // Post-indexed load / store update the base pointer. 18793 if (Ptr != Base) 18794 return false; 18795 } 18796 18797 AM = isInc ? 
ISD::POST_INC : ISD::POST_DEC; 18798 return true; 18799 } 18800 18801 void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, 18802 KnownBits &Known, 18803 const APInt &DemandedElts, 18804 const SelectionDAG &DAG, 18805 unsigned Depth) const { 18806 unsigned BitWidth = Known.getBitWidth(); 18807 Known.resetAll(); 18808 switch (Op.getOpcode()) { 18809 default: break; 18810 case ARMISD::ADDC: 18811 case ARMISD::ADDE: 18812 case ARMISD::SUBC: 18813 case ARMISD::SUBE: 18814 // Special cases when we convert a carry to a boolean. 18815 if (Op.getResNo() == 0) { 18816 SDValue LHS = Op.getOperand(0); 18817 SDValue RHS = Op.getOperand(1); 18818 // (ADDE 0, 0, C) will give us a single bit. 18819 if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) && 18820 isNullConstant(RHS)) { 18821 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); 18822 return; 18823 } 18824 } 18825 break; 18826 case ARMISD::CMOV: { 18827 // Bits are known zero/one if known on the LHS and RHS. 18828 Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1); 18829 if (Known.isUnknown()) 18830 return; 18831 18832 KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1); 18833 Known = KnownBits::commonBits(Known, KnownRHS); 18834 return; 18835 } 18836 case ISD::INTRINSIC_W_CHAIN: { 18837 ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1)); 18838 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue()); 18839 switch (IntID) { 18840 default: return; 18841 case Intrinsic::arm_ldaex: 18842 case Intrinsic::arm_ldrex: { 18843 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT(); 18844 unsigned MemBits = VT.getScalarSizeInBits(); 18845 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); 18846 return; 18847 } 18848 } 18849 } 18850 case ARMISD::BFI: { 18851 // Conservatively, we can recurse down the first operand 18852 // and just mask out all affected bits. 18853 Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); 18854 18855 // The operand to BFI is already a mask suitable for removing the bits it 18856 // sets. 
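    // Known bits of the first operand therefore survive only where the mask is
    // one; the bits covered by the inserted field become unknown.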
18857 ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2)); 18858 const APInt &Mask = CI->getAPIntValue(); 18859 Known.Zero &= Mask; 18860 Known.One &= Mask; 18861 return; 18862 } 18863 case ARMISD::VGETLANEs: 18864 case ARMISD::VGETLANEu: { 18865 const SDValue &SrcSV = Op.getOperand(0); 18866 EVT VecVT = SrcSV.getValueType(); 18867 assert(VecVT.isVector() && "VGETLANE expected a vector type"); 18868 const unsigned NumSrcElts = VecVT.getVectorNumElements(); 18869 ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode()); 18870 assert(Pos->getAPIntValue().ult(NumSrcElts) && 18871 "VGETLANE index out of bounds"); 18872 unsigned Idx = Pos->getZExtValue(); 18873 APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx); 18874 Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1); 18875 18876 EVT VT = Op.getValueType(); 18877 const unsigned DstSz = VT.getScalarSizeInBits(); 18878 const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits(); 18879 (void)SrcSz; 18880 assert(SrcSz == Known.getBitWidth()); 18881 assert(DstSz > SrcSz); 18882 if (Op.getOpcode() == ARMISD::VGETLANEs) 18883 Known = Known.sext(DstSz); 18884 else { 18885 Known = Known.zext(DstSz); 18886 } 18887 assert(DstSz == Known.getBitWidth()); 18888 break; 18889 } 18890 case ARMISD::VMOVrh: { 18891 KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); 18892 assert(KnownOp.getBitWidth() == 16); 18893 Known = KnownOp.zext(32); 18894 break; 18895 } 18896 case ARMISD::CSINC: 18897 case ARMISD::CSINV: 18898 case ARMISD::CSNEG: { 18899 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); 18900 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1); 18901 18902 // The result is either: 18903 // CSINC: KnownOp0 or KnownOp1 + 1 18904 // CSINV: KnownOp0 or ~KnownOp1 18905 // CSNEG: KnownOp0 or KnownOp1 * -1 18906 if (Op.getOpcode() == ARMISD::CSINC) 18907 KnownOp1 = KnownBits::computeForAddSub( 18908 true, false, KnownOp1, KnownBits::makeConstant(APInt(32, 1))); 18909 else if (Op.getOpcode() == ARMISD::CSINV) 18910 std::swap(KnownOp1.Zero, KnownOp1.One); 18911 else if (Op.getOpcode() == ARMISD::CSNEG) 18912 KnownOp1 = KnownBits::mul( 18913 KnownOp1, KnownBits::makeConstant(APInt(32, -1))); 18914 18915 Known = KnownBits::commonBits(KnownOp0, KnownOp1); 18916 break; 18917 } 18918 } 18919 } 18920 18921 bool ARMTargetLowering::targetShrinkDemandedConstant( 18922 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, 18923 TargetLoweringOpt &TLO) const { 18924 // Delay optimization, so we don't have to deal with illegal types, or block 18925 // optimizations. 18926 if (!TLO.LegalOps) 18927 return false; 18928 18929 // Only optimize AND for now. 18930 if (Op.getOpcode() != ISD::AND) 18931 return false; 18932 18933 EVT VT = Op.getValueType(); 18934 18935 // Ignore vectors. 18936 if (VT.isVector()) 18937 return false; 18938 18939 assert(VT == MVT::i32 && "Unexpected integer type"); 18940 18941 // Make sure the RHS really is a constant. 18942 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 18943 if (!C) 18944 return false; 18945 18946 unsigned Mask = C->getZExtValue(); 18947 18948 unsigned Demanded = DemandedBits.getZExtValue(); 18949 unsigned ShrunkMask = Mask & Demanded; 18950 unsigned ExpandedMask = Mask | ~Demanded; 18951 18952 // If the mask is all zeros, let the target-independent code replace the 18953 // result with zero. 18954 if (ShrunkMask == 0) 18955 return false; 18956 18957 // If the mask is all ones, erase the AND. 
(Currently, the target-independent 18958 // code won't do this, so we have to do it explicitly to avoid an infinite 18959 // loop in obscure cases.) 18960 if (ExpandedMask == ~0U) 18961 return TLO.CombineTo(Op, Op.getOperand(0)); 18962 18963 auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool { 18964 return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0; 18965 }; 18966 auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool { 18967 if (NewMask == Mask) 18968 return true; 18969 SDLoc DL(Op); 18970 SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT); 18971 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC); 18972 return TLO.CombineTo(Op, NewOp); 18973 }; 18974 18975 // Prefer uxtb mask. 18976 if (IsLegalMask(0xFF)) 18977 return UseMask(0xFF); 18978 18979 // Prefer uxth mask. 18980 if (IsLegalMask(0xFFFF)) 18981 return UseMask(0xFFFF); 18982 18983 // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2. 18984 // FIXME: Prefer a contiguous sequence of bits for other optimizations. 18985 if (ShrunkMask < 256) 18986 return UseMask(ShrunkMask); 18987 18988 // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2. 18989 // FIXME: Prefer a contiguous sequence of bits for other optimizations. 18990 if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256) 18991 return UseMask(ExpandedMask); 18992 18993 // Potential improvements: 18994 // 18995 // We could try to recognize lsls+lsrs or lsrs+lsls pairs here. 18996 // We could try to prefer Thumb1 immediates which can be lowered to a 18997 // two-instruction sequence. 18998 // We could try to recognize more legal ARM/Thumb2 immediates here. 18999 19000 return false; 19001 } 19002 19003 bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode( 19004 SDValue Op, const APInt &OriginalDemandedBits, 19005 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, 19006 unsigned Depth) const { 19007 unsigned Opc = Op.getOpcode(); 19008 19009 switch (Opc) { 19010 case ARMISD::ASRL: 19011 case ARMISD::LSRL: { 19012 // If this is result 0 and the other result is unused, see if the demand 19013 // bits allow us to shrink this long shift into a standard small shift in 19014 // the opposite direction. 19015 if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) && 19016 isa<ConstantSDNode>(Op->getOperand(2))) { 19017 unsigned ShAmt = Op->getConstantOperandVal(2); 19018 if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf( 19019 APInt::getAllOnesValue(32) << (32 - ShAmt))) 19020 return TLO.CombineTo( 19021 Op, TLO.DAG.getNode( 19022 ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1), 19023 TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32))); 19024 } 19025 break; 19026 } 19027 } 19028 19029 return TargetLowering::SimplifyDemandedBitsForTargetNode( 19030 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth); 19031 } 19032 19033 //===----------------------------------------------------------------------===// 19034 // ARM Inline Assembly Support 19035 //===----------------------------------------------------------------------===// 19036 19037 bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const { 19038 // Looking for "rev" which is V6+. 
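  // e.g. a single-statement asm of the form "rev $0, $1" with an "=l,l"
  // constraint string on an i32 value is replaced with a call to
  // llvm.bswap.i32.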
19039 if (!Subtarget->hasV6Ops()) 19040 return false; 19041 19042 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand()); 19043 std::string AsmStr = IA->getAsmString(); 19044 SmallVector<StringRef, 4> AsmPieces; 19045 SplitString(AsmStr, AsmPieces, ";\n"); 19046 19047 switch (AsmPieces.size()) { 19048 default: return false; 19049 case 1: 19050 AsmStr = std::string(AsmPieces[0]); 19051 AsmPieces.clear(); 19052 SplitString(AsmStr, AsmPieces, " \t,"); 19053 19054 // rev $0, $1 19055 if (AsmPieces.size() == 3 && 19056 AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" && 19057 IA->getConstraintString().compare(0, 4, "=l,l") == 0) { 19058 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 19059 if (Ty && Ty->getBitWidth() == 32) 19060 return IntrinsicLowering::LowerToByteSwap(CI); 19061 } 19062 break; 19063 } 19064 19065 return false; 19066 } 19067 19068 const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const { 19069 // At this point, we have to lower this constraint to something else, so we 19070 // lower it to an "r" or "w". However, by doing this we will force the result 19071 // to be in register, while the X constraint is much more permissive. 19072 // 19073 // Although we are correct (we are free to emit anything, without 19074 // constraints), we might break use cases that would expect us to be more 19075 // efficient and emit something else. 19076 if (!Subtarget->hasVFP2Base()) 19077 return "r"; 19078 if (ConstraintVT.isFloatingPoint()) 19079 return "w"; 19080 if (ConstraintVT.isVector() && Subtarget->hasNEON() && 19081 (ConstraintVT.getSizeInBits() == 64 || 19082 ConstraintVT.getSizeInBits() == 128)) 19083 return "w"; 19084 19085 return "r"; 19086 } 19087 19088 /// getConstraintType - Given a constraint letter, return the type of 19089 /// constraint it is for this target. 19090 ARMTargetLowering::ConstraintType 19091 ARMTargetLowering::getConstraintType(StringRef Constraint) const { 19092 unsigned S = Constraint.size(); 19093 if (S == 1) { 19094 switch (Constraint[0]) { 19095 default: break; 19096 case 'l': return C_RegisterClass; 19097 case 'w': return C_RegisterClass; 19098 case 'h': return C_RegisterClass; 19099 case 'x': return C_RegisterClass; 19100 case 't': return C_RegisterClass; 19101 case 'j': return C_Immediate; // Constant for movw. 19102 // An address with a single base register. Due to the way we 19103 // currently handle addresses it is the same as an 'r' memory constraint. 19104 case 'Q': return C_Memory; 19105 } 19106 } else if (S == 2) { 19107 switch (Constraint[0]) { 19108 default: break; 19109 case 'T': return C_RegisterClass; 19110 // All 'U+' constraints are addresses. 19111 case 'U': return C_Memory; 19112 } 19113 } 19114 return TargetLowering::getConstraintType(Constraint); 19115 } 19116 19117 /// Examine constraint type and operand type and determine a weight value. 19118 /// This object must already have been set up with the operand type 19119 /// and the current alternative constraint selected. 19120 TargetLowering::ConstraintWeight 19121 ARMTargetLowering::getSingleConstraintMatchWeight( 19122 AsmOperandInfo &info, const char *constraint) const { 19123 ConstraintWeight weight = CW_Invalid; 19124 Value *CallOperandVal = info.CallOperandVal; 19125 // If we don't have a value, we can't do a match, 19126 // but allow it at the lowest weight. 19127 if (!CallOperandVal) 19128 return CW_Default; 19129 Type *type = CallOperandVal->getType(); 19130 // Look at the constraint type. 
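// For example, an i32 operand under the 'l' constraint is weighted
// CW_SpecificReg on Thumb (low registers only) but just CW_Register in ARM
// mode, and a floating-point operand under 'w' is CW_Register.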
19131 switch (*constraint) { 19132 default: 19133 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 19134 break; 19135 case 'l': 19136 if (type->isIntegerTy()) { 19137 if (Subtarget->isThumb()) 19138 weight = CW_SpecificReg; 19139 else 19140 weight = CW_Register; 19141 } 19142 break; 19143 case 'w': 19144 if (type->isFloatingPointTy()) 19145 weight = CW_Register; 19146 break; 19147 } 19148 return weight; 19149 } 19150 19151 using RCPair = std::pair<unsigned, const TargetRegisterClass *>; 19152 19153 RCPair ARMTargetLowering::getRegForInlineAsmConstraint( 19154 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { 19155 switch (Constraint.size()) { 19156 case 1: 19157 // GCC ARM Constraint Letters 19158 switch (Constraint[0]) { 19159 case 'l': // Low regs or general regs. 19160 if (Subtarget->isThumb()) 19161 return RCPair(0U, &ARM::tGPRRegClass); 19162 return RCPair(0U, &ARM::GPRRegClass); 19163 case 'h': // High regs or no regs. 19164 if (Subtarget->isThumb()) 19165 return RCPair(0U, &ARM::hGPRRegClass); 19166 break; 19167 case 'r': 19168 if (Subtarget->isThumb1Only()) 19169 return RCPair(0U, &ARM::tGPRRegClass); 19170 return RCPair(0U, &ARM::GPRRegClass); 19171 case 'w': 19172 if (VT == MVT::Other) 19173 break; 19174 if (VT == MVT::f32) 19175 return RCPair(0U, &ARM::SPRRegClass); 19176 if (VT.getSizeInBits() == 64) 19177 return RCPair(0U, &ARM::DPRRegClass); 19178 if (VT.getSizeInBits() == 128) 19179 return RCPair(0U, &ARM::QPRRegClass); 19180 break; 19181 case 'x': 19182 if (VT == MVT::Other) 19183 break; 19184 if (VT == MVT::f32) 19185 return RCPair(0U, &ARM::SPR_8RegClass); 19186 if (VT.getSizeInBits() == 64) 19187 return RCPair(0U, &ARM::DPR_8RegClass); 19188 if (VT.getSizeInBits() == 128) 19189 return RCPair(0U, &ARM::QPR_8RegClass); 19190 break; 19191 case 't': 19192 if (VT == MVT::Other) 19193 break; 19194 if (VT == MVT::f32 || VT == MVT::i32) 19195 return RCPair(0U, &ARM::SPRRegClass); 19196 if (VT.getSizeInBits() == 64) 19197 return RCPair(0U, &ARM::DPR_VFP2RegClass); 19198 if (VT.getSizeInBits() == 128) 19199 return RCPair(0U, &ARM::QPR_VFP2RegClass); 19200 break; 19201 } 19202 break; 19203 19204 case 2: 19205 if (Constraint[0] == 'T') { 19206 switch (Constraint[1]) { 19207 default: 19208 break; 19209 case 'e': 19210 return RCPair(0U, &ARM::tGPREvenRegClass); 19211 case 'o': 19212 return RCPair(0U, &ARM::tGPROddRegClass); 19213 } 19214 } 19215 break; 19216 19217 default: 19218 break; 19219 } 19220 19221 if (StringRef("{cc}").equals_insensitive(Constraint)) 19222 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass); 19223 19224 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 19225 } 19226 19227 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 19228 /// vector. If it is invalid, don't add anything to Ops. 19229 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, 19230 std::string &Constraint, 19231 std::vector<SDValue>&Ops, 19232 SelectionDAG &DAG) const { 19233 SDValue Result; 19234 19235 // Currently only support length 1 constraints. 
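// A hypothetical use of the 'j' (movw immediate) case below:
//   asm("movw %0, %1" : "=r"(x) : "j"(0x1234));
// Values outside 0-65535, or targets without movw, are not accepted here.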
19236 if (Constraint.length() != 1) return; 19237 19238 char ConstraintLetter = Constraint[0]; 19239 switch (ConstraintLetter) { 19240 default: break; 19241 case 'j': 19242 case 'I': case 'J': case 'K': case 'L': 19243 case 'M': case 'N': case 'O': 19244 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); 19245 if (!C) 19246 return; 19247 19248 int64_t CVal64 = C->getSExtValue(); 19249 int CVal = (int) CVal64; 19250 // None of these constraints allow values larger than 32 bits. Check 19251 // that the value fits in an int. 19252 if (CVal != CVal64) 19253 return; 19254 19255 switch (ConstraintLetter) { 19256 case 'j': 19257 // Constant suitable for movw, must be between 0 and 19258 // 65535. 19259 if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps())) 19260 if (CVal >= 0 && CVal <= 65535) 19261 break; 19262 return; 19263 case 'I': 19264 if (Subtarget->isThumb1Only()) { 19265 // This must be a constant between 0 and 255, for ADD 19266 // immediates. 19267 if (CVal >= 0 && CVal <= 255) 19268 break; 19269 } else if (Subtarget->isThumb2()) { 19270 // A constant that can be used as an immediate value in a 19271 // data-processing instruction. 19272 if (ARM_AM::getT2SOImmVal(CVal) != -1) 19273 break; 19274 } else { 19275 // A constant that can be used as an immediate value in a 19276 // data-processing instruction. 19277 if (ARM_AM::getSOImmVal(CVal) != -1) 19278 break; 19279 } 19280 return; 19281 19282 case 'J': 19283 if (Subtarget->isThumb1Only()) { 19284 // This must be a constant between -255 and -1, for negated ADD 19285 // immediates. This can be used in GCC with an "n" modifier that 19286 // prints the negated value, for use with SUB instructions. It is 19287 // not useful otherwise but is implemented for compatibility. 19288 if (CVal >= -255 && CVal <= -1) 19289 break; 19290 } else { 19291 // This must be a constant between -4095 and 4095. It is not clear 19292 // what this constraint is intended for. Implemented for 19293 // compatibility with GCC. 19294 if (CVal >= -4095 && CVal <= 4095) 19295 break; 19296 } 19297 return; 19298 19299 case 'K': 19300 if (Subtarget->isThumb1Only()) { 19301 // A 32-bit value where only one byte has a nonzero value. Exclude 19302 // zero to match GCC. This constraint is used by GCC internally for 19303 // constants that can be loaded with a move/shift combination. 19304 // It is not useful otherwise but is implemented for compatibility. 19305 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal)) 19306 break; 19307 } else if (Subtarget->isThumb2()) { 19308 // A constant whose bitwise inverse can be used as an immediate 19309 // value in a data-processing instruction. This can be used in GCC 19310 // with a "B" modifier that prints the inverted value, for use with 19311 // BIC and MVN instructions. It is not useful otherwise but is 19312 // implemented for compatibility. 19313 if (ARM_AM::getT2SOImmVal(~CVal) != -1) 19314 break; 19315 } else { 19316 // A constant whose bitwise inverse can be used as an immediate 19317 // value in a data-processing instruction. This can be used in GCC 19318 // with a "B" modifier that prints the inverted value, for use with 19319 // BIC and MVN instructions. It is not useful otherwise but is 19320 // implemented for compatibility. 19321 if (ARM_AM::getSOImmVal(~CVal) != -1) 19322 break; 19323 } 19324 return; 19325 19326 case 'L': 19327 if (Subtarget->isThumb1Only()) { 19328 // This must be a constant between -7 and 7, 19329 // for 3-operand ADD/SUB immediate instructions. 
19330 if (CVal >= -7 && CVal < 7) 19331 break; 19332 } else if (Subtarget->isThumb2()) { 19333 // A constant whose negation can be used as an immediate value in a 19334 // data-processing instruction. This can be used in GCC with an "n" 19335 // modifier that prints the negated value, for use with SUB 19336 // instructions. It is not useful otherwise but is implemented for 19337 // compatibility. 19338 if (ARM_AM::getT2SOImmVal(-CVal) != -1) 19339 break; 19340 } else { 19341 // A constant whose negation can be used as an immediate value in a 19342 // data-processing instruction. This can be used in GCC with an "n" 19343 // modifier that prints the negated value, for use with SUB 19344 // instructions. It is not useful otherwise but is implemented for 19345 // compatibility. 19346 if (ARM_AM::getSOImmVal(-CVal) != -1) 19347 break; 19348 } 19349 return; 19350 19351 case 'M': 19352 if (Subtarget->isThumb1Only()) { 19353 // This must be a multiple of 4 between 0 and 1020, for 19354 // ADD sp + immediate. 19355 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0)) 19356 break; 19357 } else { 19358 // A power of two or a constant between 0 and 32. This is used in 19359 // GCC for the shift amount on shifted register operands, but it is 19360 // useful in general for any shift amounts. 19361 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0)) 19362 break; 19363 } 19364 return; 19365 19366 case 'N': 19367 if (Subtarget->isThumb1Only()) { 19368 // This must be a constant between 0 and 31, for shift amounts. 19369 if (CVal >= 0 && CVal <= 31) 19370 break; 19371 } 19372 return; 19373 19374 case 'O': 19375 if (Subtarget->isThumb1Only()) { 19376 // This must be a multiple of 4 between -508 and 508, for 19377 // ADD/SUB sp = sp + immediate. 19378 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0)) 19379 break; 19380 } 19381 return; 19382 } 19383 Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType()); 19384 break; 19385 } 19386 19387 if (Result.getNode()) { 19388 Ops.push_back(Result); 19389 return; 19390 } 19391 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 19392 } 19393 19394 static RTLIB::Libcall getDivRemLibcall( 19395 const SDNode *N, MVT::SimpleValueType SVT) { 19396 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || 19397 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && 19398 "Unhandled Opcode in getDivRemLibcall"); 19399 bool isSigned = N->getOpcode() == ISD::SDIVREM || 19400 N->getOpcode() == ISD::SREM; 19401 RTLIB::Libcall LC; 19402 switch (SVT) { 19403 default: llvm_unreachable("Unexpected request for libcall!"); 19404 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; 19405 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; 19406 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; 19407 case MVT::i64: LC = isSigned ? 
RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break; 19408 } 19409 return LC; 19410 } 19411 19412 static TargetLowering::ArgListTy getDivRemArgList( 19413 const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) { 19414 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || 19415 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && 19416 "Unhandled Opcode in getDivRemArgList"); 19417 bool isSigned = N->getOpcode() == ISD::SDIVREM || 19418 N->getOpcode() == ISD::SREM; 19419 TargetLowering::ArgListTy Args; 19420 TargetLowering::ArgListEntry Entry; 19421 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 19422 EVT ArgVT = N->getOperand(i).getValueType(); 19423 Type *ArgTy = ArgVT.getTypeForEVT(*Context); 19424 Entry.Node = N->getOperand(i); 19425 Entry.Ty = ArgTy; 19426 Entry.IsSExt = isSigned; 19427 Entry.IsZExt = !isSigned; 19428 Args.push_back(Entry); 19429 } 19430 if (Subtarget->isTargetWindows() && Args.size() >= 2) 19431 std::swap(Args[0], Args[1]); 19432 return Args; 19433 } 19434 19435 SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { 19436 assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() || 19437 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() || 19438 Subtarget->isTargetWindows()) && 19439 "Register-based DivRem lowering only"); 19440 unsigned Opcode = Op->getOpcode(); 19441 assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) && 19442 "Invalid opcode for Div/Rem lowering"); 19443 bool isSigned = (Opcode == ISD::SDIVREM); 19444 EVT VT = Op->getValueType(0); 19445 Type *Ty = VT.getTypeForEVT(*DAG.getContext()); 19446 SDLoc dl(Op); 19447 19448 // If the target has hardware divide, use divide + multiply + subtract: 19449 // div = a / b 19450 // rem = a - b * div 19451 // return {div, rem} 19452 // This should be lowered into UDIV/SDIV + MLS later on. 19453 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode() 19454 : Subtarget->hasDivideInARMMode(); 19455 if (hasDivide && Op->getValueType(0).isSimple() && 19456 Op->getSimpleValueType(0) == MVT::i32) { 19457 unsigned DivOpcode = isSigned ? 
ISD::SDIV : ISD::UDIV; 19458 const SDValue Dividend = Op->getOperand(0); 19459 const SDValue Divisor = Op->getOperand(1); 19460 SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor); 19461 SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor); 19462 SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul); 19463 19464 SDValue Values[2] = {Div, Rem}; 19465 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values); 19466 } 19467 19468 RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(), 19469 VT.getSimpleVT().SimpleTy); 19470 SDValue InChain = DAG.getEntryNode(); 19471 19472 TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(), 19473 DAG.getContext(), 19474 Subtarget); 19475 19476 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), 19477 getPointerTy(DAG.getDataLayout())); 19478 19479 Type *RetTy = StructType::get(Ty, Ty); 19480 19481 if (Subtarget->isTargetWindows()) 19482 InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain); 19483 19484 TargetLowering::CallLoweringInfo CLI(DAG); 19485 CLI.setDebugLoc(dl).setChain(InChain) 19486 .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) 19487 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); 19488 19489 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); 19490 return CallInfo.first; 19491 } 19492 19493 // Lowers REM using divmod helpers 19494 // see RTABI section 4.2/4.3 19495 SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const { 19496 // Build return types (div and rem) 19497 std::vector<Type*> RetTyParams; 19498 Type *RetTyElement; 19499 19500 switch (N->getValueType(0).getSimpleVT().SimpleTy) { 19501 default: llvm_unreachable("Unexpected request for libcall!"); 19502 case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break; 19503 case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break; 19504 case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break; 19505 case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break; 19506 } 19507 19508 RetTyParams.push_back(RetTyElement); 19509 RetTyParams.push_back(RetTyElement); 19510 ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams); 19511 Type *RetTy = StructType::get(*DAG.getContext(), ret); 19512 19513 RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT(). 
19514 SimpleTy); 19515 SDValue InChain = DAG.getEntryNode(); 19516 TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(), 19517 Subtarget); 19518 bool isSigned = N->getOpcode() == ISD::SREM; 19519 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), 19520 getPointerTy(DAG.getDataLayout())); 19521 19522 if (Subtarget->isTargetWindows()) 19523 InChain = WinDBZCheckDenominator(DAG, N, InChain); 19524 19525 // Lower call 19526 CallLoweringInfo CLI(DAG); 19527 CLI.setChain(InChain) 19528 .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args)) 19529 .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N)); 19530 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 19531 19532 // Return second (rem) result operand (first contains div) 19533 SDNode *ResNode = CallResult.first.getNode(); 19534 assert(ResNode->getNumOperands() == 2 && "divmod should return two operands"); 19535 return ResNode->getOperand(1); 19536 } 19537 19538 SDValue 19539 ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { 19540 assert(Subtarget->isTargetWindows() && "unsupported target platform"); 19541 SDLoc DL(Op); 19542 19543 // Get the inputs. 19544 SDValue Chain = Op.getOperand(0); 19545 SDValue Size = Op.getOperand(1); 19546 19547 if (DAG.getMachineFunction().getFunction().hasFnAttribute( 19548 "no-stack-arg-probe")) { 19549 MaybeAlign Align = 19550 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue(); 19551 SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); 19552 Chain = SP.getValue(1); 19553 SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size); 19554 if (Align) 19555 SP = 19556 DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0), 19557 DAG.getConstant(-(uint64_t)Align->value(), DL, MVT::i32)); 19558 Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP); 19559 SDValue Ops[2] = { SP, Chain }; 19560 return DAG.getMergeValues(Ops, DL); 19561 } 19562 19563 SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size, 19564 DAG.getConstant(2, DL, MVT::i32)); 19565 19566 SDValue Flag; 19567 Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag); 19568 Flag = Chain.getValue(1); 19569 19570 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 19571 Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag); 19572 19573 SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); 19574 Chain = NewSP.getValue(1); 19575 19576 SDValue Ops[2] = { NewSP, Chain }; 19577 return DAG.getMergeValues(Ops, DL); 19578 } 19579 19580 SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { 19581 bool IsStrict = Op->isStrictFPOpcode(); 19582 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); 19583 const unsigned DstSz = Op.getValueType().getSizeInBits(); 19584 const unsigned SrcSz = SrcVal.getValueType().getSizeInBits(); 19585 assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 && 19586 "Unexpected type for custom-lowering FP_EXTEND"); 19587 19588 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) && 19589 "With both FP DP and 16, any FP conversion is legal!"); 19590 19591 assert(!(DstSz == 32 && Subtarget->hasFP16()) && 19592 "With FP16, 16 to 32 conversion is legal!"); 19593 19594 // Converting from 32 -> 64 is valid if we have FP64. 
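// (e.g. a non-strict fpext from float to double is then left as a single
// FP_EXTEND and selects to VCVT.F64.F32; the strict form is unwrapped below
// into a plain FP_EXTEND merged with its chain, per the FIXME, until strict
// selection patterns exist.)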
19595 if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) { 19596 // FIXME: Remove this when we have strict fp instruction selection patterns 19597 if (IsStrict) { 19598 SDLoc Loc(Op); 19599 SDValue Result = DAG.getNode(ISD::FP_EXTEND, 19600 Loc, Op.getValueType(), SrcVal); 19601 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc); 19602 } 19603 return Op; 19604 } 19605 19606 // Either we are converting from 16 -> 64, without FP16 and/or 19607 // FP.double-precision or without Armv8-fp. So we must do it in two 19608 // steps. 19609 // Or we are converting from 32 -> 64 without fp.double-precision or 16 -> 32 19610 // without FP16. So we must do a function call. 19611 SDLoc Loc(Op); 19612 RTLIB::Libcall LC; 19613 MakeLibCallOptions CallOptions; 19614 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); 19615 for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) { 19616 bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64()); 19617 MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32); 19618 MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64); 19619 if (Supported) { 19620 if (IsStrict) { 19621 SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc, 19622 {DstVT, MVT::Other}, {Chain, SrcVal}); 19623 Chain = SrcVal.getValue(1); 19624 } else { 19625 SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal); 19626 } 19627 } else { 19628 LC = RTLIB::getFPEXT(SrcVT, DstVT); 19629 assert(LC != RTLIB::UNKNOWN_LIBCALL && 19630 "Unexpected type for custom-lowering FP_EXTEND"); 19631 std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions, 19632 Loc, Chain); 19633 } 19634 } 19635 19636 return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal; 19637 } 19638 19639 SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { 19640 bool IsStrict = Op->isStrictFPOpcode(); 19641 19642 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); 19643 EVT SrcVT = SrcVal.getValueType(); 19644 EVT DstVT = Op.getValueType(); 19645 const unsigned DstSz = Op.getValueType().getSizeInBits(); 19646 const unsigned SrcSz = SrcVT.getSizeInBits(); 19647 (void)DstSz; 19648 assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 && 19649 "Unexpected type for custom-lowering FP_ROUND"); 19650 19651 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) && 19652 "With both FP DP and 16, any FP conversion is legal!"); 19653 19654 SDLoc Loc(Op); 19655 19656 // Instruction from 32 -> 16 if hasFP16 is valid 19657 if (SrcSz == 32 && Subtarget->hasFP16()) 19658 return Op; 19659 19660 // Lib call from 32 -> 16 / 64 -> [32, 16] 19661 RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT); 19662 assert(LC != RTLIB::UNKNOWN_LIBCALL && 19663 "Unexpected type for custom-lowering FP_ROUND"); 19664 MakeLibCallOptions CallOptions; 19665 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); 19666 SDValue Result; 19667 std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions, 19668 Loc, Chain); 19669 return IsStrict ? 
DAG.getMergeValues({Result, Chain}, Loc) : Result; 19670 } 19671 19672 void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results, 19673 SelectionDAG &DAG) const { 19674 assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS."); 19675 MVT HalfT = MVT::i32; 19676 SDLoc dl(N); 19677 SDValue Hi, Lo, Tmp; 19678 19679 if (!isOperationLegalOrCustom(ISD::ADDCARRY, HalfT) || 19680 !isOperationLegalOrCustom(ISD::UADDO, HalfT)) 19681 return ; 19682 19683 unsigned OpTypeBits = HalfT.getScalarSizeInBits(); 19684 SDVTList VTList = DAG.getVTList(HalfT, MVT::i1); 19685 19686 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), 19687 DAG.getConstant(0, dl, HalfT)); 19688 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), 19689 DAG.getConstant(1, dl, HalfT)); 19690 19691 Tmp = DAG.getNode(ISD::SRA, dl, HalfT, Hi, 19692 DAG.getConstant(OpTypeBits - 1, dl, 19693 getShiftAmountTy(HalfT, DAG.getDataLayout()))); 19694 Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo); 19695 Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi, 19696 SDValue(Lo.getNode(), 1)); 19697 Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi); 19698 Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo); 19699 19700 Results.push_back(Lo); 19701 Results.push_back(Hi); 19702 } 19703 19704 bool 19705 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { 19706 // The ARM target isn't yet aware of offsets. 19707 return false; 19708 } 19709 19710 bool ARM::isBitFieldInvertedMask(unsigned v) { 19711 if (v == 0xffffffff) 19712 return false; 19713 19714 // there can be 1's on either or both "outsides", all the "inside" 19715 // bits must be 0's 19716 return isShiftedMask_32(~v); 19717 } 19718 19719 /// isFPImmLegal - Returns true if the target can instruction select the 19720 /// specified FP immediate natively. If false, the legalizer will 19721 /// materialize the FP immediate as a load from a constant pool. 19722 bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, 19723 bool ForCodeSize) const { 19724 if (!Subtarget->hasVFP3Base()) 19725 return false; 19726 if (VT == MVT::f16 && Subtarget->hasFullFP16()) 19727 return ARM_AM::getFP16Imm(Imm) != -1; 19728 if (VT == MVT::f32 && Subtarget->hasFullFP16() && 19729 ARM_AM::getFP32FP16Imm(Imm) != -1) 19730 return true; 19731 if (VT == MVT::f32) 19732 return ARM_AM::getFP32Imm(Imm) != -1; 19733 if (VT == MVT::f64 && Subtarget->hasFP64()) 19734 return ARM_AM::getFP64Imm(Imm) != -1; 19735 return false; 19736 } 19737 19738 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as 19739 /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment 19740 /// specified in the intrinsic calls. 19741 bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, 19742 const CallInst &I, 19743 MachineFunction &MF, 19744 unsigned Intrinsic) const { 19745 switch (Intrinsic) { 19746 case Intrinsic::arm_neon_vld1: 19747 case Intrinsic::arm_neon_vld2: 19748 case Intrinsic::arm_neon_vld3: 19749 case Intrinsic::arm_neon_vld4: 19750 case Intrinsic::arm_neon_vld2lane: 19751 case Intrinsic::arm_neon_vld3lane: 19752 case Intrinsic::arm_neon_vld4lane: 19753 case Intrinsic::arm_neon_vld2dup: 19754 case Intrinsic::arm_neon_vld3dup: 19755 case Intrinsic::arm_neon_vld4dup: { 19756 Info.opc = ISD::INTRINSIC_W_CHAIN; 19757 // Conservatively set memVT to the entire set of vectors loaded. 
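// (e.g. an arm.neon.vld2 returning { <4 x i32>, <4 x i32> } covers 256 bits
// in total, so memVT below becomes v4i64.)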
19758 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 19759 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; 19760 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 19761 Info.ptrVal = I.getArgOperand(0); 19762 Info.offset = 0; 19763 Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); 19764 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue(); 19765 // volatile loads with NEON intrinsics not supported 19766 Info.flags = MachineMemOperand::MOLoad; 19767 return true; 19768 } 19769 case Intrinsic::arm_neon_vld1x2: 19770 case Intrinsic::arm_neon_vld1x3: 19771 case Intrinsic::arm_neon_vld1x4: { 19772 Info.opc = ISD::INTRINSIC_W_CHAIN; 19773 // Conservatively set memVT to the entire set of vectors loaded. 19774 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 19775 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; 19776 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 19777 Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); 19778 Info.offset = 0; 19779 Info.align.reset(); 19780 // volatile loads with NEON intrinsics not supported 19781 Info.flags = MachineMemOperand::MOLoad; 19782 return true; 19783 } 19784 case Intrinsic::arm_neon_vst1: 19785 case Intrinsic::arm_neon_vst2: 19786 case Intrinsic::arm_neon_vst3: 19787 case Intrinsic::arm_neon_vst4: 19788 case Intrinsic::arm_neon_vst2lane: 19789 case Intrinsic::arm_neon_vst3lane: 19790 case Intrinsic::arm_neon_vst4lane: { 19791 Info.opc = ISD::INTRINSIC_VOID; 19792 // Conservatively set memVT to the entire set of vectors stored. 19793 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 19794 unsigned NumElts = 0; 19795 for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { 19796 Type *ArgTy = I.getArgOperand(ArgI)->getType(); 19797 if (!ArgTy->isVectorTy()) 19798 break; 19799 NumElts += DL.getTypeSizeInBits(ArgTy) / 64; 19800 } 19801 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 19802 Info.ptrVal = I.getArgOperand(0); 19803 Info.offset = 0; 19804 Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); 19805 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue(); 19806 // volatile stores with NEON intrinsics not supported 19807 Info.flags = MachineMemOperand::MOStore; 19808 return true; 19809 } 19810 case Intrinsic::arm_neon_vst1x2: 19811 case Intrinsic::arm_neon_vst1x3: 19812 case Intrinsic::arm_neon_vst1x4: { 19813 Info.opc = ISD::INTRINSIC_VOID; 19814 // Conservatively set memVT to the entire set of vectors stored. 19815 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 19816 unsigned NumElts = 0; 19817 for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { 19818 Type *ArgTy = I.getArgOperand(ArgI)->getType(); 19819 if (!ArgTy->isVectorTy()) 19820 break; 19821 NumElts += DL.getTypeSizeInBits(ArgTy) / 64; 19822 } 19823 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 19824 Info.ptrVal = I.getArgOperand(0); 19825 Info.offset = 0; 19826 Info.align.reset(); 19827 // volatile stores with NEON intrinsics not supported 19828 Info.flags = MachineMemOperand::MOStore; 19829 return true; 19830 } 19831 case Intrinsic::arm_mve_vld2q: 19832 case Intrinsic::arm_mve_vld4q: { 19833 Info.opc = ISD::INTRINSIC_W_CHAIN; 19834 // Conservatively set memVT to the entire set of vectors loaded. 
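// (e.g. an arm.mve.vld4q of <8 x i16> vectors loads four 128-bit registers,
// so memVT below is v8i64, i.e. 512 bits.)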
19835 Type *VecTy = cast<StructType>(I.getType())->getElementType(1); 19836 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4; 19837 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2); 19838 Info.ptrVal = I.getArgOperand(0); 19839 Info.offset = 0; 19840 Info.align = Align(VecTy->getScalarSizeInBits() / 8); 19841 // volatile loads with MVE intrinsics not supported 19842 Info.flags = MachineMemOperand::MOLoad; 19843 return true; 19844 } 19845 case Intrinsic::arm_mve_vst2q: 19846 case Intrinsic::arm_mve_vst4q: { 19847 Info.opc = ISD::INTRINSIC_VOID; 19848 // Conservatively set memVT to the entire set of vectors stored. 19849 Type *VecTy = I.getArgOperand(1)->getType(); 19850 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4; 19851 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2); 19852 Info.ptrVal = I.getArgOperand(0); 19853 Info.offset = 0; 19854 Info.align = Align(VecTy->getScalarSizeInBits() / 8); 19855 // volatile stores with MVE intrinsics not supported 19856 Info.flags = MachineMemOperand::MOStore; 19857 return true; 19858 } 19859 case Intrinsic::arm_mve_vldr_gather_base: 19860 case Intrinsic::arm_mve_vldr_gather_base_predicated: { 19861 Info.opc = ISD::INTRINSIC_W_CHAIN; 19862 Info.ptrVal = nullptr; 19863 Info.memVT = MVT::getVT(I.getType()); 19864 Info.align = Align(1); 19865 Info.flags |= MachineMemOperand::MOLoad; 19866 return true; 19867 } 19868 case Intrinsic::arm_mve_vldr_gather_base_wb: 19869 case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: { 19870 Info.opc = ISD::INTRINSIC_W_CHAIN; 19871 Info.ptrVal = nullptr; 19872 Info.memVT = MVT::getVT(I.getType()->getContainedType(0)); 19873 Info.align = Align(1); 19874 Info.flags |= MachineMemOperand::MOLoad; 19875 return true; 19876 } 19877 case Intrinsic::arm_mve_vldr_gather_offset: 19878 case Intrinsic::arm_mve_vldr_gather_offset_predicated: { 19879 Info.opc = ISD::INTRINSIC_W_CHAIN; 19880 Info.ptrVal = nullptr; 19881 MVT DataVT = MVT::getVT(I.getType()); 19882 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue(); 19883 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize), 19884 DataVT.getVectorNumElements()); 19885 Info.align = Align(1); 19886 Info.flags |= MachineMemOperand::MOLoad; 19887 return true; 19888 } 19889 case Intrinsic::arm_mve_vstr_scatter_base: 19890 case Intrinsic::arm_mve_vstr_scatter_base_predicated: { 19891 Info.opc = ISD::INTRINSIC_VOID; 19892 Info.ptrVal = nullptr; 19893 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType()); 19894 Info.align = Align(1); 19895 Info.flags |= MachineMemOperand::MOStore; 19896 return true; 19897 } 19898 case Intrinsic::arm_mve_vstr_scatter_base_wb: 19899 case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: { 19900 Info.opc = ISD::INTRINSIC_W_CHAIN; 19901 Info.ptrVal = nullptr; 19902 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType()); 19903 Info.align = Align(1); 19904 Info.flags |= MachineMemOperand::MOStore; 19905 return true; 19906 } 19907 case Intrinsic::arm_mve_vstr_scatter_offset: 19908 case Intrinsic::arm_mve_vstr_scatter_offset_predicated: { 19909 Info.opc = ISD::INTRINSIC_VOID; 19910 Info.ptrVal = nullptr; 19911 MVT DataVT = MVT::getVT(I.getArgOperand(2)->getType()); 19912 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue(); 19913 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize), 19914 DataVT.getVectorNumElements()); 19915 Info.align = Align(1); 19916 Info.flags |= MachineMemOperand::MOStore; 19917 return true; 19918 } 19919 case 
Intrinsic::arm_ldaex: 19920 case Intrinsic::arm_ldrex: { 19921 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 19922 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType()); 19923 Info.opc = ISD::INTRINSIC_W_CHAIN; 19924 Info.memVT = MVT::getVT(PtrTy->getElementType()); 19925 Info.ptrVal = I.getArgOperand(0); 19926 Info.offset = 0; 19927 Info.align = DL.getABITypeAlign(PtrTy->getElementType()); 19928 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; 19929 return true; 19930 } 19931 case Intrinsic::arm_stlex: 19932 case Intrinsic::arm_strex: { 19933 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 19934 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); 19935 Info.opc = ISD::INTRINSIC_W_CHAIN; 19936 Info.memVT = MVT::getVT(PtrTy->getElementType()); 19937 Info.ptrVal = I.getArgOperand(1); 19938 Info.offset = 0; 19939 Info.align = DL.getABITypeAlign(PtrTy->getElementType()); 19940 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; 19941 return true; 19942 } 19943 case Intrinsic::arm_stlexd: 19944 case Intrinsic::arm_strexd: 19945 Info.opc = ISD::INTRINSIC_W_CHAIN; 19946 Info.memVT = MVT::i64; 19947 Info.ptrVal = I.getArgOperand(2); 19948 Info.offset = 0; 19949 Info.align = Align(8); 19950 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; 19951 return true; 19952 19953 case Intrinsic::arm_ldaexd: 19954 case Intrinsic::arm_ldrexd: 19955 Info.opc = ISD::INTRINSIC_W_CHAIN; 19956 Info.memVT = MVT::i64; 19957 Info.ptrVal = I.getArgOperand(0); 19958 Info.offset = 0; 19959 Info.align = Align(8); 19960 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; 19961 return true; 19962 19963 default: 19964 break; 19965 } 19966 19967 return false; 19968 } 19969 19970 /// Returns true if it is beneficial to convert a load of a constant 19971 /// to just the constant itself. 19972 bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, 19973 Type *Ty) const { 19974 assert(Ty->isIntegerTy()); 19975 19976 unsigned Bits = Ty->getPrimitiveSizeInBits(); 19977 if (Bits == 0 || Bits > 32) 19978 return false; 19979 return true; 19980 } 19981 19982 bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, 19983 unsigned Index) const { 19984 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) 19985 return false; 19986 19987 return (Index == 0 || Index == ResVT.getVectorNumElements()); 19988 } 19989 19990 Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder, 19991 ARM_MB::MemBOpt Domain) const { 19992 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 19993 19994 // First, if the target has no DMB, see what fallback we can use. 19995 if (!Subtarget->hasDataBarrier()) { 19996 // Some ARMv6 cpus can support data barriers with an mcr instruction. 19997 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get 19998 // here. 19999 if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) { 20000 Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr); 20001 Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0), 20002 Builder.getInt32(0), Builder.getInt32(7), 20003 Builder.getInt32(10), Builder.getInt32(5)}; 20004 return Builder.CreateCall(MCR, args); 20005 } else { 20006 // Instead of using barriers, atomic accesses on these subtargets use 20007 // libcalls. 
20008 llvm_unreachable("makeDMB on a target so old that it has no barriers"); 20009 } 20010 } else { 20011 Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb); 20012 // Only a full system barrier exists in the M-class architectures. 20013 Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain; 20014 Constant *CDomain = Builder.getInt32(Domain); 20015 return Builder.CreateCall(DMB, CDomain); 20016 } 20017 } 20018 20019 // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html 20020 Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder, 20021 Instruction *Inst, 20022 AtomicOrdering Ord) const { 20023 switch (Ord) { 20024 case AtomicOrdering::NotAtomic: 20025 case AtomicOrdering::Unordered: 20026 llvm_unreachable("Invalid fence: unordered/non-atomic"); 20027 case AtomicOrdering::Monotonic: 20028 case AtomicOrdering::Acquire: 20029 return nullptr; // Nothing to do 20030 case AtomicOrdering::SequentiallyConsistent: 20031 if (!Inst->hasAtomicStore()) 20032 return nullptr; // Nothing to do 20033 LLVM_FALLTHROUGH; 20034 case AtomicOrdering::Release: 20035 case AtomicOrdering::AcquireRelease: 20036 if (Subtarget->preferISHSTBarriers()) 20037 return makeDMB(Builder, ARM_MB::ISHST); 20038 // FIXME: add a comment with a link to documentation justifying this. 20039 else 20040 return makeDMB(Builder, ARM_MB::ISH); 20041 } 20042 llvm_unreachable("Unknown fence ordering in emitLeadingFence"); 20043 } 20044 20045 Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder, 20046 Instruction *Inst, 20047 AtomicOrdering Ord) const { 20048 switch (Ord) { 20049 case AtomicOrdering::NotAtomic: 20050 case AtomicOrdering::Unordered: 20051 llvm_unreachable("Invalid fence: unordered/not-atomic"); 20052 case AtomicOrdering::Monotonic: 20053 case AtomicOrdering::Release: 20054 return nullptr; // Nothing to do 20055 case AtomicOrdering::Acquire: 20056 case AtomicOrdering::AcquireRelease: 20057 case AtomicOrdering::SequentiallyConsistent: 20058 return makeDMB(Builder, ARM_MB::ISH); 20059 } 20060 llvm_unreachable("Unknown fence ordering in emitTrailingFence"); 20061 } 20062 20063 // Loads and stores less than 64-bits are already atomic; ones above that 20064 // are doomed anyway, so defer to the default libcall and blame the OS when 20065 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit 20066 // anything for those. 20067 bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { 20068 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); 20069 return (Size == 64) && !Subtarget->isMClass(); 20070 } 20071 20072 // Loads and stores less than 64-bits are already atomic; ones above that 20073 // are doomed anyway, so defer to the default libcall and blame the OS when 20074 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit 20075 // anything for those. 20076 // FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that 20077 // guarantee, see DDI0406C ARM architecture reference manual, 20078 // sections A8.8.72-74 LDRD) 20079 TargetLowering::AtomicExpansionKind 20080 ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { 20081 unsigned Size = LI->getType()->getPrimitiveSizeInBits(); 20082 return ((Size == 64) && !Subtarget->isMClass()) ? 
AtomicExpansionKind::LLOnly 20083 : AtomicExpansionKind::None; 20084 } 20085 20086 // For the real atomic operations, we have ldrex/strex up to 32 bits, 20087 // and up to 64 bits on the non-M profiles 20088 TargetLowering::AtomicExpansionKind 20089 ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { 20090 if (AI->isFloatingPointOperation()) 20091 return AtomicExpansionKind::CmpXChg; 20092 20093 // At -O0, fast-regalloc cannot cope with the live vregs necessary to 20094 // implement atomicrmw without spilling. If the target address is also on the 20095 // stack and close enough to the spill slot, this can lead to a situation 20096 // where the monitor always gets cleared and the atomic operation can never 20097 // succeed. So at -O0 lower this operation to a CAS loop. 20098 if (getTargetMachine().getOptLevel() == CodeGenOpt::None) 20099 return AtomicExpansionKind::CmpXChg; 20100 20101 unsigned Size = AI->getType()->getPrimitiveSizeInBits(); 20102 bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps(); 20103 return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) 20104 ? AtomicExpansionKind::LLSC 20105 : AtomicExpansionKind::None; 20106 } 20107 20108 // Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32 20109 // bits, and up to 64 bits on the non-M profiles. 20110 TargetLowering::AtomicExpansionKind 20111 ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const { 20112 // At -O0, fast-regalloc cannot cope with the live vregs necessary to 20113 // implement cmpxchg without spilling. If the address being exchanged is also 20114 // on the stack and close enough to the spill slot, this can lead to a 20115 // situation where the monitor always gets cleared and the atomic operation 20116 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead. 20117 unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits(); 20118 bool HasAtomicCmpXchg = 20119 !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps(); 20120 if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg && 20121 Size <= (Subtarget->isMClass() ? 32U : 64U)) 20122 return AtomicExpansionKind::LLSC; 20123 return AtomicExpansionKind::None; 20124 } 20125 20126 bool ARMTargetLowering::shouldInsertFencesForAtomic( 20127 const Instruction *I) const { 20128 return InsertFencesForAtomic; 20129 } 20130 20131 // This has so far only been implemented for MachO. 20132 bool ARMTargetLowering::useLoadStackGuardNode() const { 20133 return Subtarget->isTargetMachO(); 20134 } 20135 20136 void ARMTargetLowering::insertSSPDeclarations(Module &M) const { 20137 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) 20138 return TargetLowering::insertSSPDeclarations(M); 20139 20140 // MSVC CRT has a global variable holding security cookie. 20141 M.getOrInsertGlobal("__security_cookie", 20142 Type::getInt8PtrTy(M.getContext())); 20143 20144 // MSVC CRT has a function to validate security cookie. 20145 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction( 20146 "__security_check_cookie", Type::getVoidTy(M.getContext()), 20147 Type::getInt8PtrTy(M.getContext())); 20148 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) 20149 F->addAttribute(1, Attribute::AttrKind::InReg); 20150 } 20151 20152 Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const { 20153 // MSVC CRT has a global variable holding security cookie. 
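// (The __security_cookie global is normally created by insertSSPDeclarations
// above, so the lookup below is expected to find it on MSVC targets.)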
20154 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) 20155 return M.getGlobalVariable("__security_cookie"); 20156 return TargetLowering::getSDagStackGuard(M); 20157 } 20158 20159 Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const { 20160 // MSVC CRT has a function to validate security cookie. 20161 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) 20162 return M.getFunction("__security_check_cookie"); 20163 return TargetLowering::getSSPStackGuardCheck(M); 20164 } 20165 20166 bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx, 20167 unsigned &Cost) const { 20168 // If we do not have NEON, vector types are not natively supported. 20169 if (!Subtarget->hasNEON()) 20170 return false; 20171 20172 // Floating point values and vector values map to the same register file. 20173 // Therefore, although we could do a store extract of a vector type, this is 20174 // better to leave at float as we have more freedom in the addressing mode for 20175 // those. 20176 if (VectorTy->isFPOrFPVectorTy()) 20177 return false; 20178 20179 // If the index is unknown at compile time, this is very expensive to lower 20180 // and it is not possible to combine the store with the extract. 20181 if (!isa<ConstantInt>(Idx)) 20182 return false; 20183 20184 assert(VectorTy->isVectorTy() && "VectorTy is not a vector type"); 20185 unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedSize(); 20186 // We can do a store + vector extract on any vector that fits perfectly in a D 20187 // or Q register. 20188 if (BitWidth == 64 || BitWidth == 128) { 20189 Cost = 0; 20190 return true; 20191 } 20192 return false; 20193 } 20194 20195 bool ARMTargetLowering::isCheapToSpeculateCttz() const { 20196 return Subtarget->hasV6T2Ops(); 20197 } 20198 20199 bool ARMTargetLowering::isCheapToSpeculateCtlz() const { 20200 return Subtarget->hasV6T2Ops(); 20201 } 20202 20203 bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const { 20204 return !Subtarget->hasMinSize() || Subtarget->isTargetWindows(); 20205 } 20206 20207 Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, 20208 Value *Addr, 20209 AtomicOrdering Ord) const { 20210 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 20211 bool IsAcquire = isAcquireOrStronger(Ord); 20212 20213 // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd 20214 // intrinsic must return {i32, i32} and we have to recombine them into a 20215 // single i64 here. 20216 if (ValueTy->getPrimitiveSizeInBits() == 64) { 20217 Intrinsic::ID Int = 20218 IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd; 20219 Function *Ldrex = Intrinsic::getDeclaration(M, Int); 20220 20221 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); 20222 Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi"); 20223 20224 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); 20225 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); 20226 if (!Subtarget->isLittle()) 20227 std::swap (Lo, Hi); 20228 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64"); 20229 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64"); 20230 return Builder.CreateOr( 20231 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 32)), "val64"); 20232 } 20233 20234 Type *Tys[] = { Addr->getType() }; 20235 Intrinsic::ID Int = IsAcquire ? 
Intrinsic::arm_ldaex : Intrinsic::arm_ldrex; 20236 Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys); 20237 20238 return Builder.CreateTruncOrBitCast(Builder.CreateCall(Ldrex, Addr), ValueTy); 20239 } 20240 20241 void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance( 20242 IRBuilderBase &Builder) const { 20243 if (!Subtarget->hasV7Ops()) 20244 return; 20245 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 20246 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex)); 20247 } 20248 20249 Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder, 20250 Value *Val, Value *Addr, 20251 AtomicOrdering Ord) const { 20252 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 20253 bool IsRelease = isReleaseOrStronger(Ord); 20254 20255 // Since the intrinsics must have legal type, the i64 intrinsics take two 20256 // parameters: "i32, i32". We must marshal Val into the appropriate form 20257 // before the call. 20258 if (Val->getType()->getPrimitiveSizeInBits() == 64) { 20259 Intrinsic::ID Int = 20260 IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd; 20261 Function *Strex = Intrinsic::getDeclaration(M, Int); 20262 Type *Int32Ty = Type::getInt32Ty(M->getContext()); 20263 20264 Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo"); 20265 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi"); 20266 if (!Subtarget->isLittle()) 20267 std::swap(Lo, Hi); 20268 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); 20269 return Builder.CreateCall(Strex, {Lo, Hi, Addr}); 20270 } 20271 20272 Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex; 20273 Type *Tys[] = { Addr->getType() }; 20274 Function *Strex = Intrinsic::getDeclaration(M, Int, Tys); 20275 20276 return Builder.CreateCall( 20277 Strex, {Builder.CreateZExtOrBitCast( 20278 Val, Strex->getFunctionType()->getParamType(0)), 20279 Addr}); 20280 } 20281 20282 20283 bool ARMTargetLowering::alignLoopsWithOptSize() const { 20284 return Subtarget->isMClass(); 20285 } 20286 20287 /// A helper function for determining the number of interleaved accesses we 20288 /// will generate when lowering accesses of the given type. 20289 unsigned 20290 ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy, 20291 const DataLayout &DL) const { 20292 return (DL.getTypeSizeInBits(VecTy) + 127) / 128; 20293 } 20294 20295 bool ARMTargetLowering::isLegalInterleavedAccessType( 20296 unsigned Factor, FixedVectorType *VecTy, Align Alignment, 20297 const DataLayout &DL) const { 20298 20299 unsigned VecSize = DL.getTypeSizeInBits(VecTy); 20300 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); 20301 20302 if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps()) 20303 return false; 20304 20305 // Ensure the vector doesn't have f16 elements. Even though we could do an 20306 // i16 vldN, we can't hold the f16 vectors and will end up converting via 20307 // f32. 20308 if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy()) 20309 return false; 20310 if (Subtarget->hasMVEIntegerOps() && Factor == 3) 20311 return false; 20312 20313 // Ensure the number of vector elements is greater than 1. 20314 if (VecTy->getNumElements() < 2) 20315 return false; 20316 20317 // Ensure the element type is legal. 20318 if (ElSize != 8 && ElSize != 16 && ElSize != 32) 20319 return false; 20320 // And the alignment if high enough under MVE. 
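// (i.e. the MVE check below requires at least element-size alignment:
// 4 bytes for 32-bit lanes, 2 for 16-bit, 1 for 8-bit.)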
20321 if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8) 20322 return false; 20323 20324 // Ensure the total vector size is 64 or a multiple of 128. Types larger than 20325 // 128 will be split into multiple interleaved accesses. 20326 if (Subtarget->hasNEON() && VecSize == 64) 20327 return true; 20328 return VecSize % 128 == 0; 20329 } 20330 20331 unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const { 20332 if (Subtarget->hasNEON()) 20333 return 4; 20334 if (Subtarget->hasMVEIntegerOps()) 20335 return MVEMaxSupportedInterleaveFactor; 20336 return TargetLoweringBase::getMaxSupportedInterleaveFactor(); 20337 } 20338 20339 /// Lower an interleaved load into a vldN intrinsic. 20340 /// 20341 /// E.g. Lower an interleaved load (Factor = 2): 20342 /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4 20343 /// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements 20344 /// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements 20345 /// 20346 /// Into: 20347 /// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4) 20348 /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0 20349 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1 20350 bool ARMTargetLowering::lowerInterleavedLoad( 20351 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, 20352 ArrayRef<unsigned> Indices, unsigned Factor) const { 20353 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && 20354 "Invalid interleave factor"); 20355 assert(!Shuffles.empty() && "Empty shufflevector input"); 20356 assert(Shuffles.size() == Indices.size() && 20357 "Unmatched number of shufflevectors and indices"); 20358 20359 auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType()); 20360 Type *EltTy = VecTy->getElementType(); 20361 20362 const DataLayout &DL = LI->getModule()->getDataLayout(); 20363 Align Alignment = LI->getAlign(); 20364 20365 // Skip if we do not have NEON and skip illegal vector types. We can 20366 // "legalize" wide vector types into multiple interleaved accesses as long as 20367 // the vector types are divisible by 128. 20368 if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL)) 20369 return false; 20370 20371 unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL); 20372 20373 // A pointer vector can not be the return type of the ldN intrinsics. Need to 20374 // load integer vectors first and then convert to pointer vectors. 20375 if (EltTy->isPointerTy()) 20376 VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy); 20377 20378 IRBuilder<> Builder(LI); 20379 20380 // The base address of the load. 20381 Value *BaseAddr = LI->getPointerOperand(); 20382 20383 if (NumLoads > 1) { 20384 // If we're going to generate more than one load, reset the sub-vector type 20385 // to something legal. 20386 VecTy = FixedVectorType::get(VecTy->getElementType(), 20387 VecTy->getNumElements() / NumLoads); 20388 20389 // We will compute the pointer operand of each load from the original base 20390 // address using GEPs. Cast the base address to a pointer to the scalar 20391 // element type. 
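// (Illustrative: with Factor == 2 and <8 x i32> shuffle results, NumLoads is
// 2; VecTy was reset to <4 x i32> above, so the second vld2 below reads from
// BaseAddr + 4 * 2 == 8 elements past the first.)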
20392 BaseAddr = Builder.CreateBitCast( 20393 BaseAddr, 20394 VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace())); 20395 } 20396 20397 assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!"); 20398 20399 auto createLoadIntrinsic = [&](Value *BaseAddr) { 20400 if (Subtarget->hasNEON()) { 20401 Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace()); 20402 Type *Tys[] = {VecTy, Int8Ptr}; 20403 static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2, 20404 Intrinsic::arm_neon_vld3, 20405 Intrinsic::arm_neon_vld4}; 20406 Function *VldnFunc = 20407 Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); 20408 20409 SmallVector<Value *, 2> Ops; 20410 Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); 20411 Ops.push_back(Builder.getInt32(LI->getAlignment())); 20412 20413 return Builder.CreateCall(VldnFunc, Ops, "vldN"); 20414 } else { 20415 assert((Factor == 2 || Factor == 4) && 20416 "expected interleave factor of 2 or 4 for MVE"); 20417 Intrinsic::ID LoadInts = 20418 Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q; 20419 Type *VecEltTy = 20420 VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()); 20421 Type *Tys[] = {VecTy, VecEltTy}; 20422 Function *VldnFunc = 20423 Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys); 20424 20425 SmallVector<Value *, 2> Ops; 20426 Ops.push_back(Builder.CreateBitCast(BaseAddr, VecEltTy)); 20427 return Builder.CreateCall(VldnFunc, Ops, "vldN"); 20428 } 20429 }; 20430 20431 // Holds sub-vectors extracted from the load intrinsic return values. The 20432 // sub-vectors are associated with the shufflevector instructions they will 20433 // replace. 20434 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs; 20435 20436 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) { 20437 // If we're generating more than one load, compute the base address of 20438 // subsequent loads as an offset from the previous. 20439 if (LoadCount > 0) 20440 BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr, 20441 VecTy->getNumElements() * Factor); 20442 20443 CallInst *VldN = createLoadIntrinsic(BaseAddr); 20444 20445 // Replace uses of each shufflevector with the corresponding vector loaded 20446 // by ldN. 20447 for (unsigned i = 0; i < Shuffles.size(); i++) { 20448 ShuffleVectorInst *SV = Shuffles[i]; 20449 unsigned Index = Indices[i]; 20450 20451 Value *SubVec = Builder.CreateExtractValue(VldN, Index); 20452 20453 // Convert the integer vector to pointer vector if the element is pointer. 20454 if (EltTy->isPointerTy()) 20455 SubVec = Builder.CreateIntToPtr( 20456 SubVec, 20457 FixedVectorType::get(SV->getType()->getElementType(), VecTy)); 20458 20459 SubVecs[SV].push_back(SubVec); 20460 } 20461 } 20462 20463 // Replace uses of the shufflevector instructions with the sub-vectors 20464 // returned by the load intrinsic. If a shufflevector instruction is 20465 // associated with more than one sub-vector, those sub-vectors will be 20466 // concatenated into a single wide vector. 20467 for (ShuffleVectorInst *SVI : Shuffles) { 20468 auto &SubVec = SubVecs[SVI]; 20469 auto *WideVec = 20470 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0]; 20471 SVI->replaceAllUsesWith(WideVec); 20472 } 20473 20474 return true; 20475 } 20476 20477 /// Lower an interleaved store into a vstN intrinsic. 20478 /// 20479 /// E.g. 

/// Lower an interleaved store into a vstN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
///                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
///
///      Into:
///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> %v1, <0, 1, 2, 3>
///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> %v1, <4, 5, 6, 7>
///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> %v1, <8, 9, 10, 11>
///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// vst3 instruction in CodeGen.
///
/// Example for a more general valid mask (Factor 3). Lower:
///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr
///
///      Into:
///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> %v1, <4, 5, 6, 7>
///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> %v1, <32, 33, 34, 35>
///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> %v1, <16, 17, 18, 19>
///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  auto *VecTy = cast<FixedVectorType>(SVI->getType());
  assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");

  unsigned LaneLen = VecTy->getNumElements() / Factor;
  Type *EltTy = VecTy->getElementType();
  auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);

  const DataLayout &DL = SI->getModule()->getDataLayout();
  Align Alignment = SI->getAlign();

  // Skip if we do not have NEON/MVE and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
    return false;

  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);

  Value *Op0 = SVI->getOperand(0);
  Value *Op1 = SVI->getOperand(1);
  IRBuilder<> Builder(SI);

  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
  // vectors to integer vectors.
  if (EltTy->isPointerTy()) {
    Type *IntTy = DL.getIntPtrType(EltTy);

    // Convert to the corresponding integer vector.
    auto *IntVecTy =
        FixedVectorType::get(IntTy, cast<FixedVectorType>(Op0->getType()));
    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);

    SubVecTy = FixedVectorType::get(IntTy, LaneLen);
  }

  // The base address of the store.
  Value *BaseAddr = SI->getPointerOperand();

  if (NumStores > 1) {
    // If we're going to generate more than one store, reset the lane length
    // and sub-vector type to something legal.
    LaneLen /= NumStores;
    SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);

    // We will compute the pointer operand of each store from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr,
        SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()));
  }

  assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");

  auto Mask = SVI->getShuffleMask();

  auto createStoreIntrinsic = [&](Value *BaseAddr,
                                  SmallVectorImpl<Value *> &Shuffles) {
    if (Subtarget->hasNEON()) {
      static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
                                                 Intrinsic::arm_neon_vst3,
                                                 Intrinsic::arm_neon_vst4};
      Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
      Type *Tys[] = {Int8Ptr, SubVecTy};

      Function *VstNFunc = Intrinsic::getDeclaration(
          SI->getModule(), StoreInts[Factor - 2], Tys);

      SmallVector<Value *, 6> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
      append_range(Ops, Shuffles);
      Ops.push_back(Builder.getInt32(SI->getAlignment()));
      Builder.CreateCall(VstNFunc, Ops);
    } else {
      assert((Factor == 2 || Factor == 4) &&
             "expected interleave factor of 2 or 4 for MVE");
      Intrinsic::ID StoreInts =
          Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
      Type *EltPtrTy = SubVecTy->getElementType()->getPointerTo(
          SI->getPointerAddressSpace());
      Type *Tys[] = {EltPtrTy, SubVecTy};
      Function *VstNFunc =
          Intrinsic::getDeclaration(SI->getModule(), StoreInts, Tys);

      SmallVector<Value *, 6> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, EltPtrTy));
      append_range(Ops, Shuffles);
      for (unsigned F = 0; F < Factor; F++) {
        Ops.push_back(Builder.getInt32(F));
        Builder.CreateCall(VstNFunc, Ops);
        Ops.pop_back();
      }
    }
  };
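
  // Note that for MVE the lambda above does not emit a single vstN call; it
  // emits one intrinsic call per interleaving stage, with the stage index as
  // the trailing operand. A rough sketch for Factor == 2 (the intrinsic name
  // is illustrative; the actual mangled name is chosen by
  // Intrinsic::getDeclaration):
  //   call void @llvm.arm.mve.vst2q(i32* %base, <4 x i32> %s0, <4 x i32> %s1, i32 0)
  //   call void @llvm.arm.mve.vst2q(i32* %base, <4 x i32> %s0, <4 x i32> %s1, i32 1)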

  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
    // If we're generating more than one store, we compute the base address of
    // subsequent stores as an offset from the previous.
    if (StoreCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
                                            BaseAddr, LaneLen * Factor);

    SmallVector<Value *, 4> Shuffles;

    // Split the shufflevector operands into sub vectors for the new vstN call.
    for (unsigned i = 0; i < Factor; i++) {
      unsigned IdxI = StoreCount * LaneLen * Factor + i;
      if (Mask[IdxI] >= 0) {
        Shuffles.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
      } else {
        unsigned StartMask = 0;
        for (unsigned j = 1; j < LaneLen; j++) {
          unsigned IdxJ = StoreCount * LaneLen * Factor + j;
          if (Mask[IdxJ * Factor + IdxI] >= 0) {
            StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
            break;
          }
        }
        // Note: If all elements in a chunk are undefs, StartMask=0!
        // Note: Filling undef gaps with random elements is ok, since
        // those elements were being written anyway (with undefs).
        // In the case of all undefs we default to using elements from 0.
        // Note: StartMask cannot be negative; it's checked in
        // isReInterleaveMask.
        Shuffles.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
      }
    }

    createStoreIntrinsic(BaseAddr, Shuffles);
  }
  return true;
}

enum HABaseType {
  HA_UNKNOWN = 0,
  HA_FLOAT,
  HA_DOUBLE,
  HA_VECT64,
  HA_VECT128
};

static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
                                   uint64_t &Members) {
  if (auto *ST = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0; i < ST->getNumElements(); ++i) {
      uint64_t SubMembers = 0;
      if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
        return false;
      Members += SubMembers;
    }
  } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
    uint64_t SubMembers = 0;
    if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
      return false;
    Members += SubMembers * AT->getNumElements();
  } else if (Ty->isFloatTy()) {
    if (Base != HA_UNKNOWN && Base != HA_FLOAT)
      return false;
    Members = 1;
    Base = HA_FLOAT;
  } else if (Ty->isDoubleTy()) {
    if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
      return false;
    Members = 1;
    Base = HA_DOUBLE;
  } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
    Members = 1;
    switch (Base) {
    case HA_FLOAT:
    case HA_DOUBLE:
      return false;
    case HA_VECT64:
      return VT->getPrimitiveSizeInBits().getFixedSize() == 64;
    case HA_VECT128:
      return VT->getPrimitiveSizeInBits().getFixedSize() == 128;
    case HA_UNKNOWN:
      switch (VT->getPrimitiveSizeInBits().getFixedSize()) {
      case 64:
        Base = HA_VECT64;
        return true;
      case 128:
        Base = HA_VECT128;
        return true;
      default:
        return false;
      }
    }
  }

  return (Members > 0 && Members <= 4);
}
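
// A couple of illustrative cases for the classification above, following the
// homogeneous-aggregate rules as implemented here: a struct of four floats,
// or a struct of a float followed by a float[3], yields Base == HA_FLOAT with
// Members == 4 and is accepted; a struct mixing float and double members is
// rejected because the base type must be uniform; and any aggregate with more
// than four members fails the final Members check.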

/// Return the correct alignment for the current calling convention.
Align ARMTargetLowering::getABIAlignmentForCallingConv(
    Type *ArgTy, const DataLayout &DL) const {
  const Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
  if (!ArgTy->isVectorTy())
    return ABITypeAlign;

  // Avoid over-aligning vector parameters. It would require realigning the
  // stack and waste space for no real benefit.
  return std::min(ABITypeAlign, DL.getStackAlignment());
}

/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
/// passing according to AAPCS rules.
bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
    Type *Ty, CallingConv::ID CallConv, bool isVarArg,
    const DataLayout &DL) const {
  if (getEffectiveCallingConv(CallConv, isVarArg) !=
      CallingConv::ARM_AAPCS_VFP)
    return false;

  HABaseType Base = HA_UNKNOWN;
  uint64_t Members = 0;
  bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
  LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());

  bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
  return IsHA || IsIntArray;
}

Register ARMTargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  // Platforms which do not use SjLj EH may return values in these registers
  // via the personality function.
  return Subtarget->useSjLjEH() ? Register() : ARM::R0;
}

Register ARMTargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  // Platforms which do not use SjLj EH may return values in these registers
  // via the personality function.
  return Subtarget->useSjLjEH() ? Register() : ARM::R1;
}

void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Update IsSplitCSR in ARMFunctionInfo.
  ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
  AFI->setIsSplitCSR(true);
}

void ARMTargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (ARM::GPRRegClass.contains(*I))
      RC = &ARM::GPRRegClass;
    else if (ARM::DPRRegClass.contains(*I))
      RC = &ARM::DPRRegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    Register NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction().hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}

void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const {
  MF.getFrameInfo().computeMaxCallFrameSize(MF);
  TargetLoweringBase::finalizeLowering(MF);
}